Merge pull request #2556 from lamblin/dnn_pool

Dnn pool support of pad

Merge pull request #2556 from lamblin/dnn_pool
6da4304d · abergeron · bf3641e2 · 1304489a · 6da4304d · 6da4304d
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -521,8 +521,8 @@ class BaseGpuCorrMM(GpuOp):
    def __init__(self, border_mode="valid", subsample=(1, 1), pad=(0, 0)):
        if pad != (0, 0):
            _logger.warning(
-                'do not use pad for BaseGpuCorrMM; please set padding in'
-                'border_mode, see the docstring for more details')
+                'do not use pad for BaseGpuCorrMM; please set padding in '
+                'border_mode parameter, see the docstring for more details')
            if border_mode != "valid":
                raise ValueError("border_mode must be 'valid'")
            border_mode = pad

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -10,10 +10,13 @@ from theano.compat import PY3
 from theano.compile.ops import shape_i
 from theano.configparser import AddConfigVar, EnumStr
 from theano.tensor.nnet import SoftmaxGrad
+from theano.tensor.signal.downsample import (
+    DownsampleFactorMax, DownsampleFactorMaxGrad)
 from theano.tensor.basic import ShapeError
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda import GpuOp
 from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
+                                           host_from_gpu,
                                           gpu_contiguous, HostFromGpu,
                                           cp_on_negative_strides)
 from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
@@ -85,16 +88,19 @@ dnn_available.msg = None

 def c_set_tensor4d(var, desc, err, fail):
    return """
+{
+    int str0, str1, str2, str3;
+    str3 = CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1;
+    str2 = CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3];
+    str1 = CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3];
+    str0 = CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1];
 %(err)s = cudnnSetTensor4dDescriptorEx(
    %(desc)s, CUDNN_DATA_FLOAT,
    CudaNdarray_HOST_DIMS(%(var)s)[0],
    CudaNdarray_HOST_DIMS(%(var)s)[1],
    CudaNdarray_HOST_DIMS(%(var)s)[2],
    CudaNdarray_HOST_DIMS(%(var)s)[3],
-    CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1],
-    CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3],
-    CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3],
-    CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1
+    str0, str1, str2, str3
 );
 if (%(err)s != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
@@ -105,13 +111,12 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
    CudaNdarray_HOST_DIMS(%(var)s)[1],
    CudaNdarray_HOST_DIMS(%(var)s)[2],
    CudaNdarray_HOST_DIMS(%(var)s)[3],
-    CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1],
-    CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3],
-    CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3],
-    CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1
+    str0, str1, str2, str3
    );
    %(fail)s
 }
+}
+
        """ % dict(var=var, err=err, desc=desc, fail=fail)


@@ -659,8 +664,11 @@ class GpuDnnPoolDesc(GpuOp):
    :param ws: windows size
    :param stride: (dx, dy)
    :param mode: 'max' or 'average'
+    :param pad: (padX, padY) padding information.
+        padX is the size of the top and bottom borders (rows),
+        padY is the size of the left and right borders (columns).
    """
-    __props__ = ('ws', 'stride', 'mode')
+    __props__ = ('ws', 'stride', 'mode', 'pad')

    def c_headers(self):
        return ['cudnn.h', 'cudnn_helper.h']
@@ -677,15 +685,27 @@ class GpuDnnPoolDesc(GpuOp):
    def do_constant_folding(self, node):
        return False

-    def __init__(self, ws=(1, 1), stride=(1, 1), mode='max'):
+    def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)):
        assert mode in ('max', 'average')
        self.mode = mode
        assert len(ws) == 2
        self.ws = ws
        assert len(stride) == 2
        self.stride = stride
+        assert len(pad) == 2
+        self.pad = pad
+        if (pad[0] != 0 or pad[1] != 0) and version() < 20:
+            raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
+
+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        if not hasattr(self, 'pad'):  # restored state had no 'pad' entry
+            self.pad = (0, 0)

    def make_node(self):
+        if self.pad != (0, 0) and version() < 20:
+            raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
+
        return Apply(self, [],
                     [CDataType("cudnnPoolingDescriptor_t")()])

@@ -720,7 +740,7 @@ class GpuDnnPoolDesc(GpuOp):
  %(desc)s,
  %(mode_flag)s,
  %(wsX)d, %(wsY)d,
-  0, 0,
+  %(padX)d, %(padY)d,
  %(stridex)d, %(stridey)d
  );
 #endif
@@ -731,11 +751,13 @@ class GpuDnnPoolDesc(GpuOp):
  }
 }
 """ % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'],
-           wsX=self.ws[0], wsY=self.ws[1], stridex=self.stride[0],
-           stridey=self.stride[1])
+           wsX=self.ws[0], wsY=self.ws[1],
+           stridex=self.stride[0], stridey=self.stride[1],
+           padX=self.pad[0], padY=self.pad[1],
+       )

    def c_code_cache_version(self):
-        return (1, version())
+        return (2, version())


 class GpuDnnPool(DnnBase):
@@ -845,8 +867,8 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {

 %(out)s_dims[0] = CudaNdarray_HOST_DIMS(%(input)s)[0];
 %(out)s_dims[1] = CudaNdarray_HOST_DIMS(%(input)s)[1];
-%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] - wsX) / strideX + 1;
-%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] - wsY) / strideY + 1;
+%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] + (vpad*2) - wsX) / strideX + 1;
+%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] + (hpad*2) - wsY) / strideY + 1;

 if (CudaNdarray_prep_output(&%(out)s, 4, %(out)s_dims) != 0)
 {
@@ -904,7 +926,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
        return [[1], [0]]

    def c_code_cache_version(self):
-        return (4, version())
+        return (6, version())


 class GpuDnnPoolGrad(DnnBase):
@@ -1063,8 +1085,29 @@ _handle,
 #endif
 if (err%(name)s != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_RuntimeError,
-               "GpuDnnPoolGrad: error doing operation: %%s",
-               cudnnGetErrorString(err%(name)s));
+               "GpuDnnPoolGrad: error doing operation: %%s. "
+               "input.shape=(%%d, %%d, %%d, %%d) "
+               "input_grad.shape=(%%d, %%d, %%d, %%d) "
+               "output.shape=(%%d, %%d, %%d, %%d) "
+               "output_grad.shape=(%%d, %%d, %%d, %%d)",
+               cudnnGetErrorString(err%(name)s),
+               CudaNdarray_HOST_DIMS(%(input)s)[0],
+               CudaNdarray_HOST_DIMS(%(input)s)[1],
+               CudaNdarray_HOST_DIMS(%(input)s)[2],
+               CudaNdarray_HOST_DIMS(%(input)s)[3],
+               CudaNdarray_HOST_DIMS(%(input_grad)s)[0],
+               CudaNdarray_HOST_DIMS(%(input_grad)s)[1],
+               CudaNdarray_HOST_DIMS(%(input_grad)s)[2],
+               CudaNdarray_HOST_DIMS(%(input_grad)s)[3],
+               CudaNdarray_HOST_DIMS(%(output)s)[0],
+               CudaNdarray_HOST_DIMS(%(output)s)[1],
+               CudaNdarray_HOST_DIMS(%(output)s)[2],
+               CudaNdarray_HOST_DIMS(%(output)s)[3],
+               CudaNdarray_HOST_DIMS(%(output_grad)s)[0],
+               CudaNdarray_HOST_DIMS(%(output_grad)s)[1],
+               CudaNdarray_HOST_DIMS(%(output_grad)s)[2],
+               CudaNdarray_HOST_DIMS(%(output_grad)s)[3]
+               );
  %(fail)s
 }
 """ % dict(output_grad=out_grad, desc=desc,
@@ -1077,13 +1120,13 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
           output_grad_desc="output_grad"+name)

    def c_code_cache_version(self):
-        return (4, version())
+        return (5, version())

    def infer_shape(self, node, shape):
        return [shape[0]]


-def dnn_pool(img, ws, stride=(1, 1), mode='max'):
+def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
    """
    GPU pooling using cuDNN from NVIDIA.

@@ -1094,6 +1137,9 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max'):
    :param ws: subsampling window size
    :param stride: subsampling stride (default: (1, 1))
    :param mode: one of 'max', 'average' (default: 'max')
+    :param pad: (padX, padY) padding information.
+        padX is the size of the top and bottom borders (rows),
+        padY is the size of the left and right borders (columns).

    :warning: The cuDNN library only works with GPU that have a compute
      capability of 3.0 or higer.  This means that older GPU will not
@@ -1101,7 +1147,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max'):
    :note: This Op implements the ignore_border=True of max_pool_2d.
    """
    img = gpu_contiguous(img)
-    desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode)()
+    desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
    return GpuDnnPool()(img, desc)


@@ -1437,6 +1483,23 @@ if True:
            ds = node.op.ds
            return [dnn_pool(gpu_contiguous(img), ds, ds)]

+    @register_opt('cudnn')
+    @local_optimizer([DownsampleFactorMax])
+    def local_pool_dnn_stride(node):
+        if not dnn_available():
+            return
+        if isinstance(node.op, DownsampleFactorMax):
+            if not node.op.ignore_border:  # dnn_pool is ignore_border=True only
+                return
+            img, = node.inputs
+            ds = node.op.ds
+            stride = node.op.st
+            pad = node.op.padding  # forwarded unchanged as dnn_pool's pad
+            if (img.owner and isinstance(img.owner.op, HostFromGpu)):
+                ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]),
+                               ds, stride=stride, pad=pad)
+                return [host_from_gpu(ret)]
+
    @register_opt('cudnn')
    @local_optimizer([GpuDownsampleFactorMaxGrad])
    def local_pool_dnn_grad(node):
@@ -1454,6 +1517,30 @@ if True:
                                     gpu_contiguous(inp_grad),
                                     desc)]

+    @register_opt('cudnn')
+    @local_optimizer([DownsampleFactorMaxGrad])
+    def local_pool_dnn_grad_stride(node):
+        if not dnn_available():
+            return
+        if isinstance(node.op, DownsampleFactorMaxGrad):
+            # cuDNN pooling only implements ignore_border=True.
+            if not node.op.ignore_border:
+                return
+            inp, out, inp_grad = node.inputs
+            ds = node.op.ds
+            st = node.op.st
+            pad = node.op.padding
+            if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
+                (out.owner and isinstance(out.owner.op, HostFromGpu)) or
+                (inp_grad.owner and isinstance(inp_grad.owner.op, HostFromGpu))
+            ):
+                desc = GpuDnnPoolDesc(ws=ds, stride=st, mode="max", pad=pad)()
+                ret = GpuDnnPoolGrad()(gpu_contiguous(inp),
+                                       gpu_contiguous(out),
+                                       gpu_contiguous(inp_grad),
+                                       desc)
+                return [host_from_gpu(ret)]
+
    @register_opt('cudnn')
    @local_optimizer([GpuSoftmax])
    def local_softmax_dnn(node):

--- a/theano/sandbox/cuda/tests/test_dnn.py
+++ b/theano/sandbox/cuda/tests/test_dnn.py
 import logging
-import unittest

 from nose.plugins.skip import SkipTest
 import numpy
@@ -10,11 +9,10 @@ from theano.compat.six import StringIO
 from theano.gof.python25 import any
 import theano.tensor as T
 import theano.tests.unittest_tools as utt
-from theano.sandbox.neighbours import images2neibs, neibs2images
+from theano.sandbox.neighbours import images2neibs
 from theano.tensor.signal.downsample import max_pool_2d
 from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
 import theano.sandbox.cuda.dnn as dnn
-from theano.sandbox.cuda.basic_ops import gpu_contiguous

 # Skip test if cuda_ndarray is not available.
 import theano.sandbox.cuda as cuda
@@ -31,6 +29,7 @@ else:


 def pool_2d_i2n(input, ds=(2, 2), strides=None,
+                pad=(0, 0),
                pool_function=T.max, mode='ignore_borders'):
    if strides is None:
        strides = ds
@@ -40,8 +39,19 @@ def pool_2d_i2n(input, ds=(2, 2), strides=None,
            "strides should be smaller than or equal to ds,"
            " strides=(%d, %d) and ds=(%d, %d)" %
            (strides + ds))
-
    shape = input.shape
+    if pad != (0, 0):
+        assert pool_function is T.max
+        pad_x = pad[0]
+        pad_y = pad[1]
+        a = T.alloc(-numpy.inf, shape[0], shape[1], shape[2] + pad_x*2,
+                    shape[3] + pad_y*2)
+        input = T.set_subtensor(a[:, :,
+                                  pad_x:pad_x+shape[2],
+                                  pad_y:pad_y+shape[3]],
+                                input)
+        shape = input.shape
+
    neibs = images2neibs(input, ds, strides, mode=mode)
    pooled_neibs = pool_function(neibs, axis=1)

@@ -58,33 +68,41 @@ def test_pooling():
        raise SkipTest(cuda.dnn.dnn_available.msg)

    x = T.ftensor4()
+    for func, pad in product((T.max, T.mean),
+                             ((0, 0), (1, 0), (1, 0), (2, 3), (3, 2))):
+        if pad != (0, 0) and cuda.dnn.version() < 20:
+            continue

-    for func in (T.max, T.mean):
-        for ws in (2, 4, 5):
+        for ws in (4, 2, 5):
            for stride in (2, 3):
                if stride > ws:
                    continue
-                if ws == stride and func is T.max:
+                if func is T.max:
                    # We will check that the opt introduced it.
-                    out1 = max_pool_2d(x, (ws, ws), ignore_border=True)
+                    out1 = max_pool_2d(x, (ws, ws),
+                                       st=(stride, stride),
+                                       ignore_border=True,
+                                       padding=pad)
                else:
                    out1 = cuda.dnn.dnn_pool(
                        x, ws=(ws, ws),
                        stride=(stride, stride),
+                        pad=pad,
                        mode='max' if func is T.max else "average")
                out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride),
+                                   pad=pad,
                                   pool_function=func)

                f1 = theano.function([x], out1, mode=mode_with_gpu)
                assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
                            for node in f1.maker.fgraph.apply_nodes])
-                f2 = theano.function([x], out2, mode=mode_with_gpu)
+                f2 = theano.function([x], out2, mode=mode_without_gpu)
                assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
                                for node in f2.maker.fgraph.apply_nodes])
                for shp in [(1, 10, 100, 100),
                            (1, 3, 99, 99),
                            (32, 1, 147, 197),
-                         ]:
+                            ]:
                    data = numpy.random.normal(0, 1, shp).astype("float32")
                    a = f1(data).__array__()

@@ -98,45 +116,50 @@ def test_pooling():
            data = numpy.random.normal(0, 1, shp).astype("float32")*10

            ws = 2
-            strides = 2
+            stride = 2

            # This test the CPU grad + opt + GPU implemtentation
            def fn(x):
-                return max_pool_2d(x, (ws, ws), ignore_border=True)
+                return max_pool_2d(x, (ws, ws), ignore_border=True,
+                                   padding=pad)
            theano.tests.unittest_tools.verify_grad(fn, [data],
                                                    cast_to_output_type=False,
                                                    mode=mode_with_gpu)
            # Confirm that the opt would have inserted it.
-            f = theano.function([x], theano.grad(fn(x).sum(), x),
-                                mode=mode_with_gpu)
+            fg = theano.function([x], theano.grad(fn(x).sum(), x),
+                                 mode=mode_with_gpu)
            assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
-                        for node in f.maker.fgraph.toposort()])
+                        for node in fg.maker.fgraph.toposort()])

            # Test the GPU grad + GPU implementation
            def fn(x):
                dnn_op = cuda.dnn.dnn_pool(
                    x, ws=(ws, ws),
                    stride=(stride, stride),
+                    pad=pad,
                    mode='max' if func is T.max else "average")
                return dnn_op
-            theano.tests.unittest_tools.verify_grad(fn, [data],
-                                                    cast_to_output_type=False,
-                                                    mode=mode_with_gpu)
+            theano.tests.unittest_tools.verify_grad(
+                fn, [data],
+                cast_to_output_type=False,
+                mode=mode_with_gpu)
            # Confirm that we get the good op.
-            f = theano.function([x], theano.grad(fn(x).sum(), x),
-                                mode=mode_with_gpu)
+            fg = theano.function([x], theano.grad(fn(x).sum(), x),
+                                 mode=mode_with_gpu)
            assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
-                        for node in f.maker.fgraph.toposort()])
-            g_out = f(data)
+                        for node in fg.maker.fgraph.toposort()])
+            g_out = fg(data)

            if func is T.max:
                # Compare again the CPU result
-                out = max_pool_2d(x, (ws, ws), ignore_border=True)
-                f = theano.function([x], theano.grad(out.sum(), x),
-                                    mode=mode_without_gpu)
+                out = max_pool_2d(x, (ws, ws),
+                                  padding=pad,
+                                  ignore_border=True)
+                fc = theano.function([x], theano.grad(out.sum(), x),
+                                     mode=mode_without_gpu)
                assert any([isinstance(node.op, DownsampleFactorMaxGrad)
-                            for node in f.maker.fgraph.toposort()])
-                c_out = f(data)
+                            for node in fc.maker.fgraph.toposort()])
+                c_out = fc(data)
                assert numpy.allclose(c_out, g_out)


@@ -165,7 +188,7 @@ def test_pooling_opt():

 def test_dnn_tag():
    """
-    We test that if cudnn isn't avail we crash and that if it is avail, we use it.
+    Test that if cudnn isn't avail we crash and that if it is avail, we use it.
    """
    x = T.ftensor4()
    old = theano.config.on_opt_error
@@ -412,11 +435,11 @@ class TestDnnInferShapes(utt.InferShapeTester):
                mode=params[2]
            )()
            pool_grad = dnn.GpuDnnPoolGrad()(
-                    img,
-                    out,
-                    img_grad,
-                    desc
-                )
+                img,
+                out,
+                img_grad,
+                desc
+            )
            self._compile_and_check(
                [img, img_grad, out],
                [pool_grad],