Commit fe166844
Authored Feb 24, 2015 by Frédéric Bastien
Merge pull request #2316 from ynd/poolgradgrad
Added GpuDownsampleFactorMaxGradGrad
Parents: ab1d2d48, 69f0732b
Showing 3 changed files with 222 additions and 2 deletions:

    theano/sandbox/cuda/blas.py               +189  -0
    theano/sandbox/cuda/opt.py                 +14  -1
    theano/sandbox/cuda/tests/test_blas.py     +19  -1
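For orientation before the diffs: GpuDownsampleFactorMaxGradGrad computes the gradient of DownsampleFactorMaxGrad, i.e. the second-order term needed when differentiating through max-pooling twice (as L-operators do). Below is a minimal usage sketch, not part of the commit, assuming a CUDA-enabled Theano of this vintage; the names x, z, and gx mirror the make_node signature in the diff, and everything else is illustrative.

    # Hypothetical usage sketch (not from the commit); assumes a working CUDA
    # backend so as_cuda_ndarray_variable can insert host->GPU transfers.
    import theano
    from theano import tensor
    from theano.sandbox.cuda.blas import GpuDownsampleFactorMaxGradGrad

    x = tensor.ftensor4('x')    # input of the max-pooling
    z = tensor.ftensor4('z')    # pooled output: max of each ds0 x ds1 window
    gx = tensor.ftensor4('gx')  # a gradient with the shape of x

    # ds=(2, 2) pools 2x2 windows; ignore_border must match the forward op.
    op = GpuDownsampleFactorMaxGradGrad(ds=(2, 2), ignore_border=True)
    gz = op(x, z, gx)  # z-shaped: gx gathered at the argmax of each window
    f = theano.function([x, z, gx], gz)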
theano/sandbox/cuda/blas.py
@@ -2156,3 +2156,192 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
             }
         }
         """ % locals()
+
+
+class GpuDownsampleFactorMaxGradGrad(GpuOp):
+    """
+    Implement the grad of downsample with max on the gpu.
+    """
+    __props__ = ('ds', 'ignore_border')
+
+    def __init__(self, ds, ignore_border):
+        self.ds = tuple(ds)
+        self.ignore_border = ignore_border
+
+    def make_node(self, x, z, gx):
+        x = as_cuda_ndarray_variable(x)
+        z = as_cuda_ndarray_variable(z)
+        gx = as_cuda_ndarray_variable(gx)
+        if x.type.ndim != 4:
+            raise TypeError('x must be 4D tensor')
+        if z.type.ndim != 4:
+            raise TypeError('z must be 4D tensor')
+        if gx.type.ndim != 4:
+            raise TypeError('gx must be 4D tensor')
+        return Apply(self, [x, z, gx], [x.type()])
+
+    def c_code_cache_version(self):
+        return (1,)
+
+    def c_code(self, node, nodename, inp, out, sub):
+        x, z, gx = inp
+        gz, = out
+        fail = sub['fail']
+        ds0, ds1 = self.ds
+        ignore_border = int(self.ignore_border)
+        return """
+        if (%(x)s->nd != 4
+            || %(z)s->nd != 4
+            || %(gx)s->nd != 4)
+        {
+            PyErr_SetString(PyExc_ValueError,
+                            "GpuDownsampleFactorMaxGradGrad: rank error");
+            %(fail)s;
+        }
+        if ((NULL == %(gz)s)
+            || (CudaNdarray_HOST_DIMS(%(gz)s)[0] !=
+                CudaNdarray_HOST_DIMS(%(z)s)[0])
+            || (CudaNdarray_HOST_DIMS(%(gz)s)[1] !=
+                CudaNdarray_HOST_DIMS(%(z)s)[1])
+            || (CudaNdarray_HOST_DIMS(%(gz)s)[2] !=
+                CudaNdarray_HOST_DIMS(%(z)s)[2])
+            || (CudaNdarray_HOST_DIMS(%(gz)s)[3] !=
+                CudaNdarray_HOST_DIMS(%(z)s)[3]))
+        {
+            Py_XDECREF(%(gz)s);
+            %(gz)s = (CudaNdarray*)CudaNdarray_New();
+            if ((NULL == %(gz)s)
+                || CudaNdarray_alloc_contiguous(%(gz)s, 4,
+                                                CudaNdarray_HOST_DIMS(%(z)s)))
+            {
+                Py_XDECREF(%(gz)s);
+                %(gz)s = NULL;
+                %(fail)s;
+            }
+        }
+        {
+            int needs_extra_z_col = %(ignore_border)s &&
+                (CudaNdarray_HOST_DIMS(%(x)s)[2] %% %(ds0)s);
+            dim3 grid(std::min(CudaNdarray_HOST_DIMS(%(z)s)[0], 65535),
+                      CudaNdarray_HOST_DIMS(%(z)s)[2] +
+                          (needs_extra_z_col ? 1 : 0));
+            dim3 block(std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], 512));
+            kDownsampleMaxGradGrad_%(nodename)s<%(ds0)s, %(ds1)s>
+                <<<grid, block>>>(
+                CudaNdarray_HOST_DIMS(%(z)s)[0],
+                CudaNdarray_HOST_DIMS(%(z)s)[1],
+                CudaNdarray_HOST_DIMS(%(z)s)[2],
+                CudaNdarray_HOST_DIMS(%(z)s)[3],
+                CudaNdarray_HOST_DIMS(%(x)s)[2],
+                CudaNdarray_HOST_DIMS(%(x)s)[3],
+                CudaNdarray_DEV_DATA(%(x)s),
+                CudaNdarray_HOST_STRIDES(%(x)s)[0],
+                CudaNdarray_HOST_STRIDES(%(x)s)[1],
+                CudaNdarray_HOST_STRIDES(%(x)s)[2],
+                CudaNdarray_HOST_STRIDES(%(x)s)[3],
+                CudaNdarray_DEV_DATA(%(z)s),
+                CudaNdarray_HOST_STRIDES(%(z)s)[0],
+                CudaNdarray_HOST_STRIDES(%(z)s)[1],
+                CudaNdarray_HOST_STRIDES(%(z)s)[2],
+                CudaNdarray_HOST_STRIDES(%(z)s)[3],
+                CudaNdarray_DEV_DATA(%(gz)s),
+                CudaNdarray_HOST_STRIDES(%(gz)s)[0],
+                CudaNdarray_HOST_STRIDES(%(gz)s)[1],
+                CudaNdarray_HOST_STRIDES(%(gz)s)[2],
+                CudaNdarray_HOST_STRIDES(%(gz)s)[3],
+                CudaNdarray_DEV_DATA(%(gx)s),
+                CudaNdarray_HOST_STRIDES(%(gx)s)[0],
+                CudaNdarray_HOST_STRIDES(%(gx)s)[1],
+                CudaNdarray_HOST_STRIDES(%(gx)s)[2],
+                CudaNdarray_HOST_STRIDES(%(gx)s)[3]);
+            CNDA_THREAD_SYNC;
+            cudaError_t err = cudaGetLastError();
+            if (cudaSuccess != err)
+            {
+                PyErr_Format(PyExc_RuntimeError,
+                             "Cuda error: %%s: %%s."
+                             " (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
+                             "kDownsampleMaxGradGrad_%(nodename)s",
+                             cudaGetErrorString(err),
+                             grid.x, grid.y,
+                             block.x, block.y, block.z);
+                %(fail)s;
+            }
+        }
+        """ % locals()
+
+    def c_support_code_apply(self, node, nodename):
+        return """
+        // ds0 is the downsampling factor in rows, ds1 in columns
+        template<int ds0, int ds1>
+        __global__ void kDownsampleMaxGradGrad_%(nodename)s(
+            int D0, int D1, int D2, int D3, int xD2, int xD3,
+            const float * x, int xS0, int xS1, int xS2, int xS3,
+            const float * z, int zS0, int zS1, int zS2, int zS3,
+            float * gz, int gzS0, int gzS1, int gzS2, int gzS3,
+            const float *gx, int gxS0, int gxS1, int gxS2, int gxS3)
+        {
+            // D0: number of image rows
+            // D1: number of image cols
+            // D2: number of z rows
+            // D3: number of z cols
+            // xD2: number of x rows
+            // xD3: number of x cols
+            // various .S. variables are strides
+            float cur_max, cur_x, my_z, my_gx;
+            // Cast threadIdx.x into a signed int, to avoid problems with
+            // indexing with negative offsets.
+            int tx = threadIdx.x;
+            int bdimx = blockDim.x;
+
+            for (int i0 = blockIdx.x;
+                 i0 < D0;
+                 i0 += gridDim.x){
+                int i1 = 0;            // image col
+                // row wrt z and/or gz, ranges from 0 to D2 - 1 OR D2
+                // (as needed to cover all x rows)
+                int i2 = blockIdx.y;
+                int x_col = tx;        // col wrt x, ranges from 0 to xD3 - 1
+                int z_col = x_col/ds1; // z_col corresponding to this x_col
+                //TODO: raise occupancy. Use threadIdx.y to run several
+                //      iterations of this i1 loop in parallel
+                for (i1 = 0; i1 < D1; ++i1) // loop over images (same for z and x)
+                {
+                    for (int col_iter = 0;
+                         (tx + col_iter * bdimx < xD3); col_iter++){
+                        // The guard below skips recomputing x_col and z_col
+                        // when a single col_iter covers all of xD3.
+                        if (tx + bdimx < xD3)
+                        {
+                            x_col = tx + col_iter * bdimx;
+                            z_col = x_col/ds1;
+                        }
+                        my_z = z[i0 * zS0 + i1 * zS1 + i2 * zS2 + z_col * zS3];
+                        for (int x_row = i2*ds0;
+                             (x_row < i2*ds0 + ds0) && (x_row < xD2); ++x_row)
+                        {
+                            // my_gx = gx[image_row][image_col][x_row][x_col]
+                            my_gx = gx[i0*gxS0 + i1*gxS1 + x_row*gxS2 + x_col*gxS3];
+                            if (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col*xS3]) {
+                                gz[i0 * gzS0 + i1 * gzS1 + i2 * gzS2 + z_col * gzS3] = my_gx;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        """ % locals()
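Before moving to opt.py, here is a plain numpy restatement of what the kernel computes, as I read the CUDA code above. It is a sketch for clarity, not code from the commit: it ignores the needs_extra_z_col border handling and, like the kernel, lets ties between equal maxima within a window overwrite each other.

    # numpy reference for kDownsampleMaxGradGrad (illustrative only)
    import numpy

    def downsample_max_grad_grad_ref(x, z, gx, ds):
        # x:  (b, c, xr, xc)  input of the max-pooling
        # z:  (b, c, zr, zc)  pooled maxima, one per ds0 x ds1 window
        # gx: same shape as x
        ds0, ds1 = ds
        gz = numpy.zeros_like(z)
        b, c, zr, zc = z.shape
        for i0 in range(b):
            for i1 in range(c):
                for i2 in range(zr):
                    for j2 in range(zc):
                        rows = range(i2 * ds0, min((i2 + 1) * ds0, x.shape[2]))
                        cols = range(j2 * ds1, min((j2 + 1) * ds1, x.shape[3]))
                        for r in rows:
                            for co in cols:
                                # at each position that attains the window max,
                                # the kernel writes that position's gx into gz
                                if x[i0, i1, r, co] == z[i0, i1, i2, j2]:
                                    gz[i0, i1, i2, j2] = gx[i0, i1, r, co]
        return gz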
theano/sandbox/cuda/opt.py
@@ -35,7 +35,7 @@ from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
 from theano.sandbox.cuda.blas import gpu_ger_inplace
 from theano.sandbox.cuda.blas import gpu_ger_no_inplace
 from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
-                                      GpuDownsampleFactorMaxGrad)
+                                      GpuDownsampleFactorMaxGrad,
+                                      GpuDownsampleFactorMaxGradGrad)
 from theano.sandbox.cuda.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                                       GpuCrossentropySoftmax1HotWithBiasDx,
@@ -1624,6 +1624,19 @@ def local_gpu_downsample_factor_max_grad(node):
                                      gpu_from_host(gz)))]
+
+
+@register_opt()
+@local_optimizer([downsample.DownsampleFactorMaxGradGrad])
+def local_gpu_downsample_factor_max_grad_grad(node):
+    if isinstance(node.op, downsample.DownsampleFactorMaxGradGrad):
+        x, z, gx = node.inputs
+        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
+            op = GpuDownsampleFactorMaxGradGrad(node.op.ds,
+                                                node.op.ignore_border)
+            return [host_from_gpu(op(x.owner.inputs[0],
+                                     gpu_from_host(z),
+                                     gpu_from_host(gx)))]
+
+
 from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
theano/sandbox/cuda/tests/test_blas.py
@@ -2,6 +2,7 @@ import copy
 from unittest import TestCase

 from theano.compile.pfunc import pfunc
+from theano import gradient
 from theano import tensor
 from theano.tests import unittest_tools
@@ -16,7 +17,7 @@ if cuda_ndarray.cuda_available == False:
 import theano.sandbox.cuda as tcn
 from theano.tensor.signal.downsample import (DownsampleFactorMax,
-                                             DownsampleFactorMaxGrad)
+                                             DownsampleFactorMaxGrad,
+                                             DownsampleFactorMaxGradGrad)
 from theano.gof.python25 import any
 import theano.compile.mode
@@ -308,6 +309,23 @@ def test_downsample():
                     for node in g2.maker.fgraph.toposort()])
         assert numpy.allclose(g(), g2()), shp

+        ggf = gradient.Lop(tensor.grad((ds_op(
+            tensor.as_tensor_variable(a)) ** 2).sum(), a), a, a)
+
+        ref_mode = copy.copy(mode_without_gpu)
+        ref_mode.check_py_code = False
+        gpu_mode = copy.copy(mode_with_gpu)
+        gpu_mode.check_py_code = False
+
+        gg = pfunc([], ggf, mode=gpu_mode)
+        gg2 = pfunc([], ggf, mode=ref_mode)
+        assert any([isinstance(node.op,
+                               tcn.blas.GpuDownsampleFactorMaxGradGrad)
+                    for node in gg.maker.fgraph.toposort()])
+        assert any([isinstance(node.op,
+                               DownsampleFactorMaxGradGrad)
+                    for node in gg2.maker.fgraph.toposort()])
+        assert numpy.allclose(gg(), gg2()), shp
+
         # We already check that the gpu version returns
         # the same value as the cpu version for
         # GpuDownsampleFactorMaxGrad. So no need to call
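A note on why the test uses gradient.Lop: differentiating the cost (pooled ** 2).sum() once inserts a DownsampleFactorMaxGrad node, and taking the L-operator of that gradient differentiates it again, which is exactly where DownsampleFactorMaxGradGrad (and, after optimization, its new GPU counterpart) enters the graph. A minimal standalone sketch under the same assumptions (a Theano of this vintage; shapes and names are illustrative, not from the commit):

    # Illustrative sketch: exercising the grad-of-grad path on the CPU.
    import numpy
    import theano
    from theano import tensor, gradient
    from theano.tensor.signal import downsample

    a = theano.shared(numpy.random.rand(2, 3, 8, 8).astype('float32'), 'a')
    pooled = downsample.max_pool_2d(a, ds=(2, 2), ignore_border=True)
    cost = (pooled ** 2).sum()

    g = tensor.grad(cost, a)     # inserts DownsampleFactorMaxGrad
    ggf = gradient.Lop(g, a, a)  # differentiates g: DownsampleFactorMaxGradGrad

    f = theano.function([], ggf)
    print(f().shape)             # same shape as a: (2, 3, 8, 8)

Compiling the same ggf with a GPU mode, as the test does, is what lets local_gpu_downsample_factor_max_grad_grad swap in GpuDownsampleFactorMaxGradGrad.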