implemented mode wrap_centered on gpu for Images2Neibs

ab5ddd68 · Frederic Bastien · b3203222 · ab5ddd68 · ab5ddd68
--- a/theano/sandbox/neighbours.py
+++ b/theano/sandbox/neighbours.py
@@ -52,8 +52,8 @@ class Images2Neibs(Op):
        fail = sub['fail']
        mode=self.mode
        return """
-        int grid_c; //number of patch in height
-        int grid_d; //number of patch in width
+        int grid_c = -1; //number of patch in height
+        int grid_d = -1; //number of patch in width
        {
        if (%(ten4)s->nd != 4)
        {
@@ -230,8 +230,8 @@ def neibs2images(neibs, neib_shape, original_shape):
 # This is work in progress
 class GpuImages2Neibs(Images2Neibs):
    def __init__(self, mode='valid'):
-        if mode not in ['valid']:
-            raise NotImplementedError("Only the mode valid have been implemented for the op GpuImages2Neibs")
+        if mode not in ['valid', 'wrap_centered']:
+            raise NotImplementedError("Only the mode valid and wrap_centered have been implemented for the op GpuImages2Neibs")
        self.mode = mode

    def make_node(self, ten4, neib_shape, neib_step):
@@ -253,7 +253,8 @@ class GpuImages2Neibs(Images2Neibs):
        return (2,)

    def c_support_code_apply(self, node, nodename):
-        return """
+        if self.mode=="valid":
+            return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_batch,
            const int nb_stack,
@@ -302,11 +303,74 @@ class GpuImages2Neibs(Images2Neibs):
        }

        """ % locals()
+        if self.mode=="wrap_centered":
+            return """
+        static __global__ void k_multi_warp_%(nodename)s(
+            const int nb_batch,
+            const int nb_stack,
+            const int height,
+            const int width,
+            const int c,
+            const int d,
+            const int grid_c,
+            const int grid_d,
+            const int stride0, const int stride1, const int stride2, const int stride3,
+            float * global_ten4,
+            float * global_out
+        )
+        {
+            const int wrap_centered_idx_shift_x = c/2;
+            const int wrap_centered_idx_shift_y = d/2;
+
+            for(int tblock = blockIdx.x;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x){
+                const int b = tblock%%grid_d;
+                int left = tblock/grid_d;
+                const int a = left%%grid_c;
+                left = left/grid_c;
+                const int s = left%%nb_stack;
+                left = left/nb_stack;
+                const int n = left;
+
+                if(n>nb_batch)continue;
+                if(s>nb_stack)continue;
+                if(a>grid_c)continue;
+                if(b>grid_d)continue;
+                            int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
+                            for (int i = 0; i < c; i++)     // loop over c
+                            {
+                                int ten4_2 = i + a * c;
+                                ten4_2 -= wrap_centered_idx_shift_x;
+                                if ( ten4_2 < 0 ) ten4_2 += height;
+                                else if (ten4_2 >= height) ten4_2 -= height;
+
+                                for (int j = threadIdx.x; j < d; j+=blockDim.x)  // loop over d
+                                {
+                                    int ten4_3 = j + b * d;
+                                    ten4_3 -= wrap_centered_idx_shift_y;
+                                    if ( ten4_3 < 0 ) ten4_3 += width;
+                                    else if (ten4_3 >= width) ten4_3 -= width;
+                                    //int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
+                                    //int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n)); 
+                                    int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; 
+
+                                    int z_col = j + d * i;
+                                    int z_idx = z_col + c*d*z_row;
+                                    global_out[z_idx] = global_ten4[ten4_idx];
+                                }
+                            }
+            }
+        }
+
+        """ % locals()


    def c_code(self, node, name, (ten4, neib_shape), (z,), sub):
        fail = sub['fail']
+        mode = self.mode
        return """
+        int grid_c = -1;
+        int grid_d = -1;
+
        {
            if (%(ten4)s->nd != 4)
            {
@@ -339,26 +403,46 @@ class GpuImages2Neibs(Images2Neibs):

            const int c = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0);
            const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
-            
-            //const float * cd = CudaNdarray_DEV_DATA(%(neib_shape)s);
-            //const int c = (int) cd[0];
-            //const int d = (int) cd[1];
-            
-            if ( CudaNdarray_HOST_DIMS(%(ten4)s)[2] %% c != 0)
-            {
-                PyErr_Format(PyExc_TypeError, "neib_shape[0] must divide ten4.shape[2]");
-                %(fail)s;
-            }
-            if ( CudaNdarray_HOST_DIMS(%(ten4)s)[3] %% d != 0)
-            {
-                PyErr_Format(PyExc_TypeError, "neib_shape[1] must divide ten4.shape[3]");
-                %(fail)s;
+            const int step_x = c;//will change when we implement neib_step
+            const int step_y = d;//will change when we implement neib_step
+
+            if ( "%(mode)s" == "wrap_centered") {
+                if (c%%2!=1 || d%%2!=1){
+                    PyErr_Format(PyExc_TypeError, "Images2Neibs: in mode wrap_centered need patch with odd shapes");
+                    %(fail)s;
+                }
+                if ( CudaNdarray_HOST_DIMS(%(ten4)s)[2] < c || CudaNdarray_HOST_DIMS(%(ten4)s)[3] < d)
+                {
+                    PyErr_Format(PyExc_TypeError, "Images2Neibs: in wrap_centered mode, don't support image shapes smaller then the patch shapes: neib_shape=(%%d,%%d), ten4[2:]=[%%d,%%d]",
+                                 c, d, CudaNdarray_HOST_DIMS(%(ten4)s)[2], CudaNdarray_HOST_DIMS(%(ten4)s)[3]);
+                    %(fail)s;
+                }
+                //grid_c = CEIL_INTDIV(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]),step_x)
+                //grid_d = CEIL_INTDIV(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]),step_y)
+                grid_c = ((CudaNdarray_HOST_DIMS(%(ten4)s))[2])/step_x + ((((CudaNdarray_HOST_DIMS(%(ten4)s))[2])%%step_x)? 1:0);
+                grid_d = ((CudaNdarray_HOST_DIMS(%(ten4)s))[3])/step_y + ((((CudaNdarray_HOST_DIMS(%(ten4)s))[3])%%step_y)? 1:0);
+            }else if ( "%(mode)s" == "valid") {            
+                if ( CudaNdarray_HOST_DIMS(%(ten4)s)[2] %% c != 0)
+                {
+                    PyErr_Format(PyExc_TypeError, "neib_shape[0] must divide ten4.shape[2]");
+                    %(fail)s;
+                }
+                if ( CudaNdarray_HOST_DIMS(%(ten4)s)[3] %% d != 0)
+                {
+                    PyErr_Format(PyExc_TypeError, "neib_shape[1] must divide ten4.shape[3]");
+                    %(fail)s;
+                }
+                grid_c = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]-c)/step_x); //number of patch in height
+                grid_d = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]-d)/step_y); //number of patch in width
+            }else{
+                PyErr_Format(PyExc_TypeError, "Images2Neibs: unknow mode '%(mode)s'");
+                 %(fail)s;
            }
-            
+
            // new dimensions for z
            const int z_dim1 = c * d;
-            const int z_dim0 =  CudaNdarray_HOST_DIMS(%(ten4)s)[2] / c
-                                * CudaNdarray_HOST_DIMS(%(ten4)s)[3] / d
+            const int z_dim0 =  grid_c
+                                * grid_d
                                * CudaNdarray_HOST_DIMS(%(ten4)s)[1]
                                * CudaNdarray_HOST_DIMS(%(ten4)s)[0];
            
@@ -400,9 +484,6 @@ class GpuImages2Neibs(Images2Neibs):
            const int c = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 0);
            const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(%(neib_shape)s, 1);
            
-            const int grid_c = height/c;
-            const int grid_d = width/d;
-
            int nb_block;
            if (nb_batch %% 32 == 0)
                nb_block = nb_batch/32;

--- a/theano/sandbox/test_neighbours.py
+++ b/theano/sandbox/test_neighbours.py
@@ -200,17 +200,19 @@ def test_neibs_wrap_centered_step_manual():
    images = shared(numpy.asarray(numpy.arange(numpy.prod(shape)).reshape(shape),dtype='float32'))
    neib_shape = T.as_tensor_variable((3,3))
    neib_step = T.as_tensor_variable((2,2))
+    neib_step = neib_shape
+
    modes = [mode_without_gpu]
    if cuda.cuda_available:
        modes.append(mode_with_gpu)

-    for mode in modes:
+    for mode_idx,mode in enumerate(modes):
        f = function([], images2neibs(images, neib_shape, neib_step, mode="wrap_centered"), mode=mode)
        neibs = f()
        print repr(neibs)
        print neibs.shape
        print images.value
-        expected = numpy.asarray([[24, 20, 21,  4,  0,  1,  9,  5,  6],
+        expected1 = numpy.asarray([[24, 20, 21,  4,  0,  1,  9,  5,  6],
                    [21, 22, 23,  1,  2,  3,  6,  7,  8],
                    [23, 24, 20,  3,  4,  0,  8,  9,  5],
                    [ 9,  5,  6, 14, 10, 11, 19, 15, 16],
@@ -219,13 +221,14 @@ def test_neibs_wrap_centered_step_manual():
                    [19, 15, 16, 24, 20, 21,  4,  0,  1],
                    [16, 17, 18, 21, 22, 23,  1,  2,  3],
                    [18, 19, 15, 23, 24, 20,  3,  4,  0]])
-
-        assert numpy.allclose(neibs[0:9,:],expected)
-        assert numpy.allclose(neibs[9:18,:],expected+25)
-        assert numpy.allclose(neibs[18:27,:],expected+50)
-        assert numpy.allclose(neibs[27:36,:],expected+75)
-        assert numpy.allclose(neibs[36:45,:],expected+100)
-        assert numpy.allclose(neibs[45:,:],expected+125)
+        expected2 = numpy.asarray([[ 24.,  20.,  21.,   4.,   0.,   1.,   9.,   5.,   6.],
+                                   [ 22.,  23.,  24.,   2.,   3.,   4.,   7.,   8.,   9.],
+                                   [ 14.,  10.,  11.,  19.,  15.,  16.,  24.,  20.,  21.],
+                                   [ 12.,  13.,  14.,  17.,  18.,  19.,  22.,  23.,  24.]])
+
+        expected = expected2
+        for i in range(shape[0]*shape[1]):
+            assert numpy.allclose(neibs[i*expected.shape[0]:(i+1)*expected.shape[0],:],expected+25*i), mode_idx

        #g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_without_gpu)