提交 a044a12a authored 作者: Yann N. Dauphin's avatar Yann N. Dauphin

Added GpuDownsampleFactorMaxGradGrad

上级 c8f8a276
...@@ -2145,7 +2145,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -2145,7 +2145,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
// this is effectively: // this is effectively:
// gx[image_row][image_col][x_row][x_col] // gx[image_row][image_col][x_row][x_col]
// = (my_z == x[image_row][image_col][ // = (my_z == x[image_row][image_col][
// x_row][x_col]) ? my_gz : 0.0f; // x_row][x _col]) ? my_gz : 0.0f;
gx[i0*gxS0 + i1*gxS1 + x_row*gxS2 + x_col*gxS3] gx[i0*gxS0 + i1*gxS1 + x_row*gxS2 + x_col*gxS3]
= (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 +
x_col*xS3]) ? my_gz : 0.0f; x_col*xS3]) ? my_gz : 0.0f;
...@@ -2156,3 +2156,194 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -2156,3 +2156,194 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
} }
} }
""" % locals() """ % locals()
class GpuDownsampleFactorMaxGradGrad(GpuOp):
    """
    Implement the grad of downsample with max on the gpu.

    This is the second-order gradient (grad of GpuDownsampleFactorMaxGrad):
    given the pooling input ``x``, the pooled output ``z`` and a gradient
    ``gx`` with respect to ``x``, it produces a tensor with the shape/type
    of ``z``.

    :param ds: (row, col) downsampling factors of the original pooling.
    :param ignore_border: whether incomplete border regions were ignored
        in the forward pooling pass.
    """
    def __init__(self, ds, ignore_border):
        # Normalize to a tuple so __eq__/__hash__ are well defined even
        # when callers pass a list.
        self.ds = tuple(ds)
        self.ignore_border = ignore_border

    def __eq__(self, other):
        return (type(self) == type(other) and
                self.ds == other.ds and
                self.ignore_border == other.ignore_border)

    def __hash__(self):
        # Must stay consistent with __eq__: hash over the same fields.
        return hash(type(self)) ^ hash(self.ds) ^ hash(self.ignore_border)

    def __str__(self):
        return '%s{%s,%s}' % (self.__class__.__name__,
                              self.ds,
                              self.ignore_border)

    def make_node(self, x, z, gx):
        # x:  original pooling input
        # z:  pooled output (per-region max)
        # gx: gradient with respect to x
        # The single output has the type (and thus shape pattern) of z.
        # NOTE(review): inputs are not converted to CUDA variables here;
        # the graph optimizer is expected to pass GPU variables already
        # -- confirm against the local optimizer that introduces this op.
        return Apply(self, [x, z, gx], [z.type()])

    # Cache version deliberately disabled while the implementation is in
    # flux; re-enable once the generated code is stable.
    #def c_code_cache_version(self):
    #    return (1,)

    def c_code(self, node, nodename, inp, out, sub):
        """Return C code that allocates the output and launches the kernel
        defined in c_support_code_apply."""
        x, z, gx = inp
        gz, = out
        fail = sub['fail']
        ds0, ds1 = self.ds
        ignore_border = int(self.ignore_border)
        return """
        if (%(x)s->nd != 4
            || %(z)s->nd != 4
            || %(gx)s->nd != 4)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if ((NULL == %(gz)s)
            || (CudaNdarray_HOST_DIMS(%(gz)s)[0] !=
                CudaNdarray_HOST_DIMS(%(z)s)[0])
            || (CudaNdarray_HOST_DIMS(%(gz)s)[1] !=
                CudaNdarray_HOST_DIMS(%(z)s)[1])
            || (CudaNdarray_HOST_DIMS(%(gz)s)[2] !=
                CudaNdarray_HOST_DIMS(%(z)s)[2])
            || (CudaNdarray_HOST_DIMS(%(gz)s)[3] !=
                CudaNdarray_HOST_DIMS(%(z)s)[3]))
        {
            Py_XDECREF(%(gz)s);
            %(gz)s = (CudaNdarray*)CudaNdarray_New();
            if ((NULL == %(gz)s)
                || CudaNdarray_alloc_contiguous(%(gz)s, 4,
                       CudaNdarray_HOST_DIMS(%(z)s)))
            {
                Py_XDECREF(%(gz)s);
                %(gz)s = NULL;
                %(fail)s;
            }
        }
        {
            int needs_extra_z_col = %(ignore_border)s && (CudaNdarray_HOST_DIMS(%(x)s)[2] %% %(ds0)s);
            dim3 grid(std::min(CudaNdarray_HOST_DIMS(%(z)s)[0], 65535),
                      CudaNdarray_HOST_DIMS(%(z)s)[2] + (needs_extra_z_col ? 1 : 0));
            dim3 block(std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], 512));
            kDownsampleMaxGradGrad_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block>>>(
                CudaNdarray_HOST_DIMS(%(z)s)[0],
                CudaNdarray_HOST_DIMS(%(z)s)[1],
                CudaNdarray_HOST_DIMS(%(z)s)[2],
                CudaNdarray_HOST_DIMS(%(z)s)[3],
                CudaNdarray_HOST_DIMS(%(x)s)[2],
                CudaNdarray_HOST_DIMS(%(x)s)[3],
                CudaNdarray_DEV_DATA(%(x)s),
                CudaNdarray_HOST_STRIDES(%(x)s)[0],
                CudaNdarray_HOST_STRIDES(%(x)s)[1],
                CudaNdarray_HOST_STRIDES(%(x)s)[2],
                CudaNdarray_HOST_STRIDES(%(x)s)[3],
                CudaNdarray_DEV_DATA(%(z)s),
                CudaNdarray_HOST_STRIDES(%(z)s)[0],
                CudaNdarray_HOST_STRIDES(%(z)s)[1],
                CudaNdarray_HOST_STRIDES(%(z)s)[2],
                CudaNdarray_HOST_STRIDES(%(z)s)[3],
                CudaNdarray_DEV_DATA(%(gz)s),
                CudaNdarray_HOST_STRIDES(%(gz)s)[0],
                CudaNdarray_HOST_STRIDES(%(gz)s)[1],
                CudaNdarray_HOST_STRIDES(%(gz)s)[2],
                CudaNdarray_HOST_STRIDES(%(gz)s)[3],
                CudaNdarray_DEV_DATA(%(gx)s),
                CudaNdarray_HOST_STRIDES(%(gx)s)[0],
                CudaNdarray_HOST_STRIDES(%(gx)s)[1],
                CudaNdarray_HOST_STRIDES(%(gx)s)[2],
                CudaNdarray_HOST_STRIDES(%(gx)s)[3]);
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
    "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
                    "kDownsampleMaxGradGrad_%(nodename)s",
                    cudaGetErrorString(err),
                    grid.x,
                    grid.y,
                    block.x,
                    block.y,
                    block.z);
                %(fail)s;
            }
        }
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        """Return the CUDA kernel used by c_code.

        The template only interpolates %(nodename)s; the previous version
        also computed an (unused) ignore_border local, which has been
        removed.
        """
        return """
        // ds0 is the downsampling factor in rows, ds1 in columns
        template<int ds0, int ds1>
        __global__ void kDownsampleMaxGradGrad_%(nodename)s(
           int D0, int D1, int D2, int D3, int xD2, int xD3,
           const float * x, int xS0, int xS1, int xS2, int xS3,
           const float * z, int zS0, int zS1, int zS2, int zS3,
           float * gz, int gzS0, int gzS1, int gzS2, int gzS3,
           const float *gx, int gxS0, int gxS1, int gxS2, int gxS3)
        {
            //  D0: number of image rows
            //  D1: number of image cols
            //  D2: number of z rows
            //  D3: number of z cols
            // xD2: number of x rows
            // xD3: number of x cols
            // various .S. variables are strides

            float cur_max, cur_x, my_z, my_gx;
            // Cast threadIdx.x into a signed int, to avoid problems with
            // indexing with negative offsets.
            int tx = threadIdx.x;
            int bdimx = blockDim.x;

            for(int i0 = blockIdx.x;
                i0 < D0;
                i0 += gridDim.x){

                int i1 = 0;                // image col
                // row wrt z and/or gz, ranges from 0 to D2 - 1 OR D2
                // (as needed to cover all x rows)
                int i2 = blockIdx.y;
                int x_col = tx;            // col wrt x, ranges from 0 to xD3 - 1
                int z_col = x_col/ds1;     // z_col corresponding to this x_col

                //TODO: raise occupancy.  Use threadIdx.y to run several
                //      iterations of this i1 loop in parallel

                for (i1 = 0; i1 < D1; ++i1) // loop over images (same for z and x)
                {
                    for(int col_iter = 0;
                        (tx + col_iter * bdimx < xD3) ; col_iter++){

                        //The if inside is to don't do the division if we
                        // need only 1 col_iter

                        if(tx + bdimx < xD3)
                        {
                            x_col = tx + col_iter * bdimx;
                            z_col = x_col/ds1;
                        }

                        my_z = z[i0 * zS0 + i1 * zS1 + i2 * zS2 + z_col* zS3];

                        for (int x_row = i2*ds0;
                              (x_row < i2*ds0+ds0) && (x_row < xD2); ++x_row)
                        {
                            // my_gx = gx[image_row][image_col][x_row][x_col]
                            my_gx = gx[i0*gxS0 + i1*gxS1 + x_row*gxS2 + x_col*gxS3];
                            if (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col*xS3]) {
                              gz[i0 * gzS0 + i1 * gzS1 + i2 * gzS2 + z_col* gzS3] = my_gx;
                            }
                        }
                    }
                }
            }
        }
        """ % locals()
...@@ -35,7 +35,7 @@ from theano.sandbox.cuda.blas import gpu_gemv_no_inplace ...@@ -35,7 +35,7 @@ from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad) GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
...@@ -1623,6 +1623,19 @@ def local_gpu_downsample_factor_max_grad(node): ...@@ -1623,6 +1623,19 @@ def local_gpu_downsample_factor_max_grad(node):
gpu_from_host(gz)))] gpu_from_host(gz)))]
@register_opt()
@local_optimizer([downsample.DownsampleFactorMaxGradGrad])
def local_gpu_downsample_factor_max_grad_grad(node):
    """Move DownsampleFactorMaxGradGrad to the GPU when its first input
    already lives there (i.e. is a HostFromGpu transfer)."""
    if not isinstance(node.op, downsample.DownsampleFactorMaxGradGrad):
        return
    x, z, gx = node.inputs
    # Only rewrite when x is a transfer from the GPU; z and gx are then
    # pushed to the GPU explicitly.
    if not (x.owner and isinstance(x.owner.op, HostFromGpu)):
        return
    gpu_op = GpuDownsampleFactorMaxGradGrad(node.op.ds,
                                            node.op.ignore_border)
    gpu_out = gpu_op(x.owner.inputs[0],
                     gpu_from_host(z),
                     gpu_from_host(gx))
    return [host_from_gpu(gpu_out)]
from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
......
...@@ -2,6 +2,7 @@ import copy ...@@ -2,6 +2,7 @@ import copy
from unittest import TestCase from unittest import TestCase
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import gradient
from theano import tensor from theano import tensor
from theano.tests import unittest_tools from theano.tests import unittest_tools
...@@ -16,7 +17,7 @@ if cuda_ndarray.cuda_available == False: ...@@ -16,7 +17,7 @@ if cuda_ndarray.cuda_available == False:
import theano.sandbox.cuda as tcn import theano.sandbox.cuda as tcn
from theano.tensor.signal.downsample import (DownsampleFactorMax, from theano.tensor.signal.downsample import (DownsampleFactorMax,
DownsampleFactorMaxGrad) DownsampleFactorMaxGrad, DownsampleFactorMaxGradGrad)
from theano.gof.python25 import any from theano.gof.python25 import any
import theano.compile.mode import theano.compile.mode
...@@ -26,12 +27,12 @@ from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace ...@@ -26,12 +27,12 @@ from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu').excluding('cudnn')
mode_without_gpu = theano.compile.mode.get_mode( mode_without_gpu = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu') 'FAST_RUN').excluding('gpu').excluding('cudnn')
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu').excluding('cudnn')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu') mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu').excluding('cudnn')
#The CPU tests already compare C/Py, so we only check C/GPU #The CPU tests already compare C/Py, so we only check C/GPU
mode_with_gpu = copy.copy(mode_with_gpu) mode_with_gpu = copy.copy(mode_with_gpu)
...@@ -308,6 +309,18 @@ def test_downsample(): ...@@ -308,6 +309,18 @@ def test_downsample():
for node in g2.maker.fgraph.toposort()]) for node in g2.maker.fgraph.toposort()])
assert numpy.allclose(g(), g2()), shp assert numpy.allclose(g(), g2()), shp
ggf = gradient.Lop(tensor.grad((ds_op(
tensor.as_tensor_variable(a))**2).sum(), a), a, a)
gg = pfunc([], ggf, mode=mode_with_gpu)
gg2 = pfunc([], ggf, mode=mode_without_gpu)
assert any([isinstance(node.op,
tcn.blas.GpuDownsampleFactorMaxGradGrad)
for node in gg.maker.fgraph.toposort()])
assert any([isinstance(node.op, DownsampleFactorMaxGradGrad)
for node in gg2.maker.fgraph.toposort()])
assert numpy.allclose(gg(), gg2()), shp
# We already check that the gpu version return # We already check that the gpu version return
# the same value as the gpu version for # the same value as the gpu version for
# GpuDownsampleFactorMaxGrad. So no need to call # GpuDownsampleFactorMaxGrad. So no need to call
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论