提交 44be801d authored 作者: Frederic's avatar Frederic

[disabled BUGfix,CRASH] related to the cudnn pooling gradient.

bug: The optimization was converting ignore_border=False to ignore_border=True. crash: The optimization that converts GpuDownsampleFactorMaxGrad to GpuDnnPoolGrad was swapping the input parameters. This was caused in part by cudnn using a strange definition of input and output. Now the op uses the same input order as the other pool grad ops. The only difference is that the c_code() uses the names that cudnn uses. There is a comment that explains that.
上级 216a6670
......@@ -737,7 +737,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
out = self(img, desc)
g_out = GpuDnnPoolGrad()(out, grad, img, desc)
g_out = GpuDnnPoolGrad()(img, out, grad, desc)
return g_out, theano.gradient.DisconnectedType()()
......@@ -745,8 +745,8 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
# not connected to desc
return [[1], [0]]
# def c_code_cache_version(self):
# return (3,)
def c_code_cache_version(self):
return (4,)
class GpuDnnPoolGrad(DnnBase):
......@@ -754,13 +754,13 @@ class GpuDnnPoolGrad(DnnBase):
The pooling gradient.
:param inp: the input of the pooling.
:param inp_grad: same size as out, but is the corresponding gradient information.
:param out: the output of the pooling in the forward.
:param inp_grad: same size as out, but is the corresponding gradient information.
:param desc: The pooling descriptor.
"""
__props__ = ()
def make_node(self, inp, inp_grad, out, desc):
def make_node(self, inp, out, inp_grad, desc):
inp = as_cuda_ndarray_variable(inp)
if inp.type.ndim != 4:
raise TypeError('inp must be 4D tensor')
......@@ -777,7 +777,7 @@ class GpuDnnPoolGrad(DnnBase):
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [inp, inp_grad, out, desc],
return Apply(self, [inp, out, inp_grad, desc],
[inp.type()])
def c_support_code_struct(self, node, struct_id):
......@@ -830,7 +830,10 @@ if (output_grad%(id)d != NULL) { cudnnDestroyTensor4dDescriptor(output_grad%(id)
""" % dict(id=struct_id)
def c_code(self, node, name, inputs, outputs, sub):
inp, inp_grad, out, desc = inputs
# Here the name out and inp are based on the cudnn definition.
# Not the definition of this class.
# This make it complicated.
out, inp, inp_grad, desc = inputs
out_grad, = outputs
set_in = "\n".join([
......@@ -900,7 +903,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
output_grad_desc="output_grad"+str(sub['struct_id']))
def c_code_cache_version(self):
return (3,)
return (4,)
def dnn_pool(img, ws, stride=(1, 1), mode='max'):
......@@ -1178,36 +1181,34 @@ if cuda_available:
border_mode=border_mode, subsample=subsample,
direction_hint=direction_hint)]
# DISABLED as there is problems in the handling of borders
# @register_opt('cudnn')
@register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMax])
def local_pool_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMax):
if node.op.ignore_border:
if not node.op.ignore_border:
return
img, = node.inputs
ds = node.op.ds
return [dnn_pool(gpu_contiguous(img), ds, ds)]
# DISABLED as there is problems in the handling of borders
# @register_opt('cudnn')
@register_opt('cudnn')
@local_optimizer([GpuDownsampleFactorMaxGrad])
def local_pool_dnn_grad(node):
if not dnn_available():
return
if isinstance(node.op, GpuDownsampleFactorMaxGrad):
if node.op.ignore_border:
if not node.op.ignore_border:
return
inp, out, inp_grad = node.inputs
ds = node.op.ds
desc = GpuDnnPoolDesc(ws=ds, stride=ds, mode="max")()
return [GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
gpu_contiguous(out), desc)]
desc)]
@register_opt('cudnn')
@local_optimizer([GpuSoftmax])
......
......@@ -11,6 +11,7 @@ import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox.neighbours import images2neibs, neibs2images
from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
# Skip test if cuda_ndarray is not available.
......@@ -61,6 +62,10 @@ def test_pooling():
for stride in (2, 3):
if stride > ws:
continue
if ws == stride and func is T.max:
# We will check that the opt introduced it.
out1 = max_pool_2d(x, (ws, ws), ignore_border=True)
else:
out1 = cuda.dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
......@@ -69,7 +74,11 @@ def test_pooling():
pool_function=func)
f1 = theano.function([x], out1, mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
f2 = theano.function([x], out2, mode=mode_with_gpu)
assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f2.maker.fgraph.apply_nodes])
for shp in [(1, 10, 100, 100),
(1, 3, 99, 99),
(32, 1, 147, 197),
......@@ -81,6 +90,53 @@ def test_pooling():
assert numpy.allclose(a, b,
atol=numpy.finfo(numpy.float32).eps)
# Test the grad
for shp in [(1, 1, 2, 2),
(1, 1, 3, 3)]:
data = numpy.random.normal(0, 1, shp).astype("float32")*10
ws = 2
strides = 2
# This test the CPU grad + opt + GPU implemtentation
def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=True)
theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that the opt would have inserted it.
f = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in f.maker.fgraph.toposort()])
# Test the GPU grad + GPU implementation
def fn(x):
dnn_op = cuda.dnn.dnn_pool(
x, ws=(ws, ws),
stride=(stride, stride),
mode='max' if func is T.max else "average")
return dnn_op
theano.tests.unittest_tools.verify_grad(fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that we get the good op.
f = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in f.maker.fgraph.toposort()])
g_out = f(data)
if func is T.max:
# Compare again the CPU result
out = max_pool_2d(x, (ws, ws), ignore_border=True)
f = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu)
assert any([isinstance(node.op, DownsampleFactorMaxGrad)
for node in f.maker.fgraph.toposort()])
c_out = f(data)
assert numpy.allclose(c_out, g_out)
def test_pooling_opt():
if not cuda.dnn.dnn_available():
......@@ -90,7 +146,7 @@ def test_pooling_opt():
f = theano.function(
[x],
max_pool_2d(x, ds=(2, 2)),
max_pool_2d(x, ds=(2, 2), ignore_border=True),
mode=mode_with_gpu)
assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
......@@ -98,7 +154,7 @@ def test_pooling_opt():
f = theano.function(
[x],
T.grad(max_pool_2d(x, ds=(2, 2)).sum(), x),
T.grad(max_pool_2d(x, ds=(2, 2), ignore_border=True).sum(), x),
mode=mode_with_gpu.including("cudnn"))
assert any([isinstance(n.op, cuda.dnn.GpuDnnPoolGrad)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论