Allow GpuDownsampleFactorMaxGrad to work with more than 512 columns in its outputs.

ebad678c · Frederic Bastien · 577aee4a · ebad678c · ebad678c
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -588,7 +588,7 @@ class GpuDownsampleFactorMaxGrad(Op):
        return Apply(self, [x, z, gz], [x.type()])
    def c_code_cache_version(self):
        #return ()
-        return (3,)
+        return (4,)
    def c_code(self, node, nodename, inp, out, sub):
        x, z, gz = inp
@@ -625,7 +625,8 @@ class GpuDownsampleFactorMaxGrad(Op):
            // make sure we cover every x row when ignore border isset and there's a border present to be ignored
            int needs_extra_z_col = %(ignore_border)s && (CudaNdarray_HOST_DIMS(%(x)s)[2] %% %(ds0)s);
            dim3 grid(CudaNdarray_HOST_DIMS(%(z)s)[0],CudaNdarray_HOST_DIMS(%(z)s)[2] + (needs_extra_z_col ? 1 : 0));
-            dim3 block(CudaNdarray_HOST_DIMS(%(x)s)[3]);
+            dim3 block(std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], 512));
            kDownsampleMaxGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>(
                CudaNdarray_HOST_DIMS(%(z)s)[0],
                CudaNdarray_HOST_DIMS(%(z)s)[1],
@@ -705,32 +706,44 @@ class GpuDownsampleFactorMaxGrad(Op):
            for (i1 = 0; i1 < D1; ++i1) // loop over images (same for z and x)
            {
-                if (%(ignore_border)s && x_col >= ds1 * D3)
+                for(int col_iter = 0; col_iter * blockDim.x <= xD3 ; col_iter++){
-                {
+                    // The check below avoids recomputing x_col/z_col (and the division) when only 1 col_iter is needed
-                    // This happens only if x_col was ignored (via ignore_border)
+                    if(blockDim.x != xD3)
-                    // TODO: if ignore_border is False, this is impossible and we don't even
+                    {
-                    //       need to generate this code.
+                        x_col = threadIdx.x + col_iter * blockDim.x;
+                        z_col = x_col/ds1;
+                    }
-                    my_gz = 0.0f;
+                    if (%(ignore_border)s && x_col >= ds1 * D3)
-                    //any fp number suffices for my_z, so we don't even need to set it to
+                    {
-                    //anything in particular.
+                        // This happens only if x_col was ignored (via ignore_border)
-                }
+                        // TODO: if ignore_border is False, this is impossible and we don't even
-                else
+                        //       need to generate this code.
-                {
-                    // this is effectively:
+                        my_gz = 0.0f;
-                    // my_gz = gz[image_row][image_col][z_row][z_col]
+                        //any fp number suffices for my_z, so we don't even need to set it to
-                    // my_z  = z[image_row][image_col][z_row][z_col]
+                        //anything in particular.
-                    my_gz = gz[i0 * gzS0 + i1 * gzS1 + i2 * gzS2 + z_col*gzS3];
+                    }
-                    my_z =   z[i0 *  zS0 + i1 *  zS1 + i2 *  zS2 + z_col* zS3];
+                    else
-                }
+                    {
+                        // this is effectively:
+                        // my_gz = gz[image_row][image_col][z_row][z_col]
+                        // my_z  = z[image_row][image_col][z_row][z_col]
+                        my_gz = gz[i0 * gzS0 + i1 * gzS1 + i2 * gzS2 + z_col*gzS3];
+                        my_z =   z[i0 *  zS0 + i1 *  zS1 + i2 *  zS2 + z_col* zS3];
+                    }
+                    if(x_col<xD3){
+                        for (int x_row = i2*ds0; (x_row < i2*ds0+ds0) && (x_row < xD2); ++x_row)
+                        {
+                            // this is effectively:
+                            // gx[image_row][image_col][x_row][x_col]
+                            //   = (my_z == x[image_row][image_col][x_row][x_col]) ? my_gz : 0.0f;
+                            gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col]
+                               = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col*xS3]) ? my_gz : 0.0f;
+                        }
+                    //gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col] = -999;
+}
-                for (int x_row = i2*ds0; (x_row < i2*ds0+ds0) && (x_row < xD2); ++x_row)
-                {
-                    // this is effectively:
-                    // gx[image_row][image_col][x_row][x_col]
-                    //   = (my_z == x[image_row][image_col][x_row][x_col]) ? my_gz : 0.0f;
-                    gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col]
-                       = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col*xS3]) ? my_gz : 0.0f;
                }
            }
        }

--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
@@ -12,7 +12,7 @@ if cuda_ndarray.cuda_available == False:
 import theano.sandbox.cuda as tcn
-from theano.tensor.signal.downsample import DownsampleFactorMax
+from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad
 import theano.compile.mode
@@ -163,7 +163,12 @@ def test_downsample():
            (30, 6, 12, 12),
            (30, 2, 24, 24),
            (30, 6, 24, 24),
-            (10, 10, 10, 11)]
+            (10, 10, 10, 11),
+            (1,1,10,1025),
+            (1,1,10,1023),
+            (1,1,1025,10),
+            (1,1,1023,10),
+             ]
    numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)
@@ -171,6 +176,8 @@ def test_downsample():
        for ds in (2, 2), (3,2), (1,1):
            if ds[0] > shp[2]: continue
            if ds[1] > shp[3]: continue
+            #GpuDownsampleFactorMax doesn't support more than 512 columns in the output tensor
+            if float(shp[3])/ds[1]>512: continue
            for ignore_border in (True, False):
                print 'test_downsample', shp, ds, ignore_border
                ds_op = DownsampleFactorMax(ds, ignore_border=ignore_border)
@@ -180,12 +187,16 @@ def test_downsample():
                f2 = pfunc([], ds_op(tensor.as_tensor_variable(a)), mode=mode_without_gpu)
                assert any([isinstance(node.op, tcn.blas.GpuDownsampleFactorMax) for node in
                            f.maker.env.toposort()])
+                assert any([isinstance(node.op, DownsampleFactorMax) for node in
+                            f2.maker.env.toposort()])
                assert numpy.allclose(f(),f2())
                g = pfunc([], tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(),a), mode=mode_with_gpu)
                g2 = pfunc([], tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(),a), mode=mode_without_gpu)
                assert any([isinstance(node.op, tcn.blas.GpuDownsampleFactorMaxGrad)
                            for node in g.maker.env.toposort()])
+                assert any([isinstance(node.op, DownsampleFactorMaxGrad)
+                            for node in g2.maker.env.toposort()])
                assert numpy.allclose(g(),g2())
                #We already check that the gpu version return the same value as the gpu version