提交 36437aca，作者：abergeron

Merge pull request #2281 from nouiz/dnn_pool_grad

Re-enable dnn pool grad after fixing it.
......@@ -81,6 +81,16 @@ from theano.updates import Updates, OrderedUpdates
from theano.gradient import Rop, Lop, grad, subgraph_grad
# This need to be before the init of GPU, as it add config variable
# needed during that phase.
import theano.tests
if hasattr(theano.tests, "TheanoNoseTester"):
test = theano.tests.TheanoNoseTester().test
else:
def test():
raise ImportError("The nose module is not installed."
" It is needed for Theano tests.")
if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
import theano.sandbox.cuda
# We can't test the driver during import of theano.sandbox.cuda as
......@@ -185,15 +195,6 @@ def sparse_grad(var):
ret = var.owner.op.__class__(sparse_grad=True)(*var.owner.inputs)
return ret
import theano.tests
if hasattr(theano.tests, "TheanoNoseTester"):
test = theano.tests.TheanoNoseTester().test
else:
def test():
raise ImportError("The nose module is not installed."
" It is needed for Theano tests.")
# This cannot be done in tensor/__init__.py due to a circular dependency -- randomstreams
# depends on raw_random which depends on tensor. As a work-around, we import RandomStreams
# here and inject an instance in tensor.
......
......@@ -126,10 +126,10 @@ def raise_with_op(node, thunk=None, exc_info=None):
for ipt in thunk.inputs]
scalar_values = []
for ipt in thunk.inputs:
if getattr(ipt[0], "size", -1) == 1:
if getattr(ipt[0], "size", -1) <= 5:
scalar_values.append(ipt[0])
else:
scalar_values.append("not scalar")
scalar_values.append("not shown")
else:
shapes = "The thunk don't have an inputs attributes."
strides = "So we can't access the strides of inputs values"
......@@ -137,7 +137,7 @@ def raise_with_op(node, thunk=None, exc_info=None):
detailed_err_msg += ("Inputs shapes: %s" % shapes +
"\nInputs strides: %s" % strides +
"\nInputs scalar values: %s\n" % scalar_values)
"\nInputs values: %s\n" % scalar_values)
else:
hints.append(
"HINT: Use another linker then the c linker to"
......
......@@ -2345,7 +2345,8 @@ class GpuReshape(tensor.Reshape, GpuOp):
shp = shp_new
else:
raise ValueError("total size of new array must be unchanged")
raise ValueError("total size of new array must be unchanged",
x.shape, shp)
out[0] = x.reshape(tuple(shp))
......
......@@ -723,7 +723,8 @@ int wsX, wsY, strideX, strideY;
err%(name)s = cudnnGetPoolingDescriptor(%(desc)s, &mode, &wsX, &wsY, &strideX, &strideY);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %%s",
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnGetPoolingDescriptor operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
......@@ -747,7 +748,8 @@ _handle,
%(output_desc)s, CudaNdarray_DEV_DATA(%(out)s)
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %%s",
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnPoolingForward operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
......@@ -765,7 +767,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
out = self(img, desc)
g_out = GpuDnnPoolGrad()(out, grad, img, desc)
g_out = GpuDnnPoolGrad()(img, out, grad, desc)
return g_out, theano.gradient.DisconnectedType()()
......@@ -774,7 +776,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
return [[1], [0]]
def c_code_cache_version(self):
return (2,)
return (4,)
class GpuDnnPoolGrad(DnnBase):
......@@ -782,13 +784,13 @@ class GpuDnnPoolGrad(DnnBase):
The pooling gradient.
:param inp: the input of the pooling.
:param inp_grad: same size as out, but is the corresponding gradient information.
:param out: the output of the pooling in the forward.
:param inp_grad: same size as out, but is the corresponding gradient information.
:param desc: The pooling descriptor.
"""
__props__ = ()
def make_node(self, inp, inp_grad, out, desc):
def make_node(self, inp, out, inp_grad, desc):
inp = as_cuda_ndarray_variable(inp)
if inp.type.ndim != 4:
raise TypeError('inp must be 4D tensor')
......@@ -805,7 +807,7 @@ class GpuDnnPoolGrad(DnnBase):
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [inp, inp_grad, out, desc],
return Apply(self, [inp, out, inp_grad, desc],
[inp.type()])
def c_support_code_struct(self, node, struct_id):
......@@ -824,22 +826,26 @@ input_grad%(id)d = NULL;
output%(id)d = NULL;
output_grad%(id)d = NULL;
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&input%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(input): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&input_grad%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(input_grad): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&output%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(output): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&output_grad%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError,
"GpuDnnPoolGrad: could not allocate tensor4d descriptor "
"(output_grad): %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
......@@ -854,7 +860,10 @@ if (output_grad%(id)d != NULL) { cudnnDestroyTensor4dDescriptor(output_grad%(id)
""" % dict(id=struct_id)
def c_code(self, node, name, inputs, outputs, sub):
inp, inp_grad, out, desc = inputs
# Here the name out and inp are based on the cudnn definition.
# Not the definition of this class.
# This make it complicated.
out, inp, inp_grad, desc = inputs
out_grad, = outputs
set_in = "\n".join([
......@@ -873,23 +882,27 @@ if (output_grad%(id)d != NULL) { cudnnDestroyTensor4dDescriptor(output_grad%(id)
cudnnStatus_t err%(name)s;
if (!CudaNdarray_is_c_contiguous(%(input)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
PyErr_SetString(PyExc_ValueError,
"GpuDnnPoolGrad: Only contiguous inputs are supported.");
%(fail)s
}
if (!CudaNdarray_is_c_contiguous(%(input_grad)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous input gradients are supported.");
PyErr_SetString(PyExc_ValueError,
"GpuDnnPoolGrad: Only contiguous input gradients are supported.");
%(fail)s
}
if (!CudaNdarray_is_c_contiguous(%(output)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
PyErr_SetString(PyExc_ValueError,
"GpuDnnPoolGrad: Only contiguous outputs are supported.");
%(fail)s
}
%(set_in)s
if (CudaNdarray_prep_output(&%(output_grad)s, 4, CudaNdarray_HOST_DIMS(%(output)s)) != 0)
if (CudaNdarray_prep_output(&%(output_grad)s, 4,
CudaNdarray_HOST_DIMS(%(output)s)) != 0)
{
%(fail)s
}
......@@ -905,7 +918,8 @@ _handle,
%(output_grad_desc)s, CudaNdarray_DEV_DATA(%(output_grad)s)
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %%s",
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPoolGrad: error doing operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
......@@ -919,7 +933,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
output_grad_desc="output_grad"+str(sub['struct_id']))
def c_code_cache_version(self):
return (2,)
return (4,)
def dnn_pool(img, ws, stride=(1, 1), mode='max'):
......@@ -1193,36 +1207,34 @@ if True:
border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint)]
# DISABLED as there is problems in the handling of borders
# @register_opt('cudnn')
@register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMax])
def local_pool_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMax):
if node.op.ignore_border:
if not node.op.ignore_border:
return
img, = node.inputs
ds = node.op.ds
return [dnn_pool(gpu_contiguous(img), ds, ds)]
# DISABLED as there is problems in the handling of borders
# @register_opt('cudnn')
@register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMaxGrad])
def local_pool_dnn_grad(node):
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMaxGrad):
if node.op.ignore_border:
if not node.op.ignore_border:
return
inp, out, inp_grad = node.inputs
ds = node.op.ds
desc = GpuDnnPoolDesc(ws=ds, stride=ds, mode="max")()
return [GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
gpu_contiguous(out), desc)]
desc)]
@register_opt('cudnn')
@local_optimizer([GpuSoftmax])
......
......@@ -11,6 +11,7 @@ import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox.neighbours import images2neibs, neibs2images
from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
# Skip test if cuda_ndarray is not available.
......@@ -57,26 +58,84 @@ def test_pooling():
x = T.ftensor4()
for func in (T.max, T.mean):
for ws in (4, 5):
for ws in (2, 4, 5):
for stride in (2, 3):
out1 = cuda.dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
mode='max' if func is T.max else "average")
if stride > ws:
continue
if ws == stride and func is T.max:
# We will check that the opt introduced it.
out1 = max_pool_2d(x, (ws, ws), ignore_border=True)
else:
out1 = cuda.dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
mode='max' if func is T.max else "average")
out2 = pool_2d_i2n(x, ds=(ws, ws), strides=(stride, stride),
pool_function=func)
f1 = theano.function([x], out1, mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
f2 = theano.function([x], out2, mode=mode_with_gpu)
data = numpy.random.normal(
0, 1, (1, 10, 100, 100)).astype("float32")
a = f1(data).__array__()
b = f2(data).__array__()
assert numpy.allclose(a, b,
atol=numpy.finfo(numpy.float32).eps)
assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f2.maker.fgraph.apply_nodes])
for shp in [(1, 10, 100, 100),
(1, 3, 99, 99),
(32, 1, 147, 197),
]:
data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__()
b = f2(data).__array__()
assert numpy.allclose(a, b,
atol=numpy.finfo(numpy.float32).eps)
# Test the grad
for shp in [(1, 1, 2, 2),
(1, 1, 3, 3)]:
data = numpy.random.normal(0, 1, shp).astype("float32")*10
ws = 2
strides = 2
# This test the CPU grad + opt + GPU implemtentation
def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=True)
theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that the opt would have inserted it.
f = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in f.maker.fgraph.toposort()])
# Test the GPU grad + GPU implementation
def fn(x):
dnn_op = cuda.dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
mode='max' if func is T.max else "average")
return dnn_op
theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that we get the good op.
f = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in f.maker.fgraph.toposort()])
g_out = f(data)
if func is T.max:
# Compare again the CPU result
out = max_pool_2d(x, (ws, ws), ignore_border=True)
f = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu)
assert any([isinstance(node.op, DownsampleFactorMaxGrad)
for node in f.maker.fgraph.toposort()])
c_out = f(data)
assert numpy.allclose(c_out, g_out)
def test_pooling_opt():
......@@ -87,7 +146,7 @@ def test_pooling_opt():
f = theano.function(
[x],
max_pool_2d(x, ds=(2, 2)),
max_pool_2d(x, ds=(2, 2), ignore_border=True),
mode=mode_with_gpu)
assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
......@@ -95,7 +154,7 @@ def test_pooling_opt():
f = theano.function(
[x],
T.grad(max_pool_2d(x, ds=(2, 2)).sum(), x),
T.grad(max_pool_2d(x, ds=(2, 2), ignore_border=True).sum(), x),
mode=mode_with_gpu.including("cudnn"))
assert any([isinstance(n.op, cuda.dnn.GpuDnnPoolGrad)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论