Merge pull request #4256 from harmdevries89/gpupool_newbackend

Gpupool newbackend

Merge pull request #4256 from harmdevries89/gpupool_newbackend
7ddf071c · Frédéric Bastien · 963eac17 · 720355c3 · 7ddf071c · 7ddf071c
--- a/theano/sandbox/cuda/tests/test_dnn.py
+++ b/theano/sandbox/cuda/tests/test_dnn.py
@@ -364,28 +364,32 @@ def test_pooling_with_tensor_vars():
            cast_to_output_type=False,
            mode=mode_with_gpu)
-    out2 = pool_2d_i2n(x, ds=(2, 2), strides=(1, 1),
-                       pad=(0, 0),
-                       pool_function=T.max)
    mode_without_gpu2 = mode_without_gpu.including()
    mode_without_gpu2.check_isfinite = False
-    f1 = theano.function([x], fn(x), mode=mode_with_gpu)
+    f_gpu = theano.function([x], fn(x), mode=mode_with_gpu)
    assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
-                for node in f1.maker.fgraph.apply_nodes])
+                for node in f_gpu.maker.fgraph.apply_nodes])
-    f2 = theano.function([x], out2, mode=mode_without_gpu2)
-    assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
+    i = 1
-                    for node in f2.maker.fgraph.apply_nodes])
    for shp in [(1, 10, 100, 100),
                (1, 3, 99, 99),
-                (32, 1, 147, 197),
+                (32, 1, 147, 197)]:
-                ]:
        data = numpy.random.normal(0, 1, shp).astype("float32")
-        a = f1(data).__array__()
+        out = pool_2d_i2n(x, ds=(i, i), strides=(1, 1),
+                          pad=(0, 0),
-        b = f2(data).__array__()
+                          pool_function=T.max)
+        f_cpu = theano.function([x], out, mode=mode_without_gpu2)
+        assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
+                        for node in f_cpu.maker.fgraph.apply_nodes])
+        # Change the window size dynamically for gpu op
+        ws.set_value(numpy.array([i, i]).astype('int32'))
+        a = f_gpu(data).__array__()
+        b = f_cpu(data).__array__()
        utt.assert_allclose(a, b)
+        i += 1
 def test_old_pool_interface():
@@ -745,6 +749,7 @@ def test_dnn_tag():
 class TestDnnInferShapes(utt.InferShapeTester):
    def setUp(self):
        super(TestDnnInferShapes, self).setUp()
        self.mode = mode_with_gpu

--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
@@ -142,6 +142,7 @@ dnn_available.msg = None
 class DnnBase(COp):
    """
    Creates a handle for cudnn and pulls in the cudnn libraries and headers.
@@ -255,6 +256,7 @@ version.v = None
 class GpuDnnConvDesc(COp):
    """
    This Op builds a convolution descriptor for use in the other convolution
    operations.
@@ -388,6 +390,7 @@ def ensure_dt(val, default, name, dtype):
 class GpuDnnConv(DnnBase):
    """
    The forward convolution.
@@ -555,6 +558,7 @@ class GpuDnnConv(DnnBase):
 class GpuDnnConvGradW(DnnBase):
    """
    The convolution gradient with respect to the weights.
@@ -675,6 +679,7 @@ class GpuDnnConvGradW(DnnBase):
 class GpuDnnConvGradI(DnnBase):
    """
    The convolution gradient with respect to the inputs.
@@ -943,6 +948,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
 class GpuDnnPoolDesc(Op):
    """
    This Op builds a pooling descriptor for use in the other
    pooling operations.
@@ -1061,69 +1067,87 @@ class GpuDnnPoolDesc(Op):
 class GpuDnnPool(DnnBase):
-    """
-    Pooling.
+    """
    Parameters
    ----------
    img
-        The image 4d tensor.
+        The image 4d or 5d tensor.
-    desc
+    Parameters
-        The pooling descriptor.
+    ----------
+    ws : tensor variable
+        Window size.
+    stride : tensor variable
+        (dx, dy) or (dx, dy, dz).
+    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
+        The old deprecated name 'average' corresponds to 'average_inc_pad'.
+    pad : tensor
+        (padX, padY) or (padX, padY, padZ)
    """
-    __props__ = ()
+    __props__ = ('mode',)
-    def __init__(self):
+    def __init__(self, mode='max'):
        DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
+        if mode == 'average':
+            mode = 'average_inc_pad'
+        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
+        self.mode = mode
-    def make_node(self, img, desc):
+    def get_op_params(self):
-        img = as_gpuarray_variable(img, infer_context_name(img))
+        if self.mode == 'max':
+            mode_flag = 'CUDNN_POOLING_MAX'
+        elif self.mode == "average_inc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
+        elif self.mode == "average_exc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
-        if desc.owner is not None:
+        return [('MODE_FLAG', mode_flag)]
-            e_ndim = desc.owner.op.get_ndim() + 2
-            if img.type.ndim != e_ndim:
+    def make_node(self, img, ws, stride, pad):
-                raise TypeError('img must be %dD tensor' % (e_ndim,))
+        ctx_name = infer_context_name(img)
+        img = as_gpuarray_variable(img, ctx_name)
-        if (not isinstance(desc.type, CDataType) or
+        ws = tensor.as_tensor_variable(ws)
-                desc.type.ctype != 'cudnnPoolingDescriptor_t'):
+        stride = tensor.as_tensor_variable(stride)
-            raise TypeError('desc must be cudnnPoolingDescriptor_t')
+        pad = tensor.as_tensor_variable(pad)
+        assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
+        assert ws.type.ndim == 1
-        return Apply(self, [img, desc], [img.type()])
+        return Apply(self, [img, ws, stride, pad], [img.type()])
    def infer_shape(self, node, shape):
-        desc = node.inputs[1].owner.op
+        w = node.inputs[1]
-        w = desc.ws
+        s = node.inputs[2]
-        s = desc.stride
+        p = node.inputs[3]
-        p = desc.pad
        res = [shape[0][0], shape[0][1],
               (shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
               (shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1
               ]
-        if len(w) > 2:
+        if node.inputs[0].ndim == 5:
            res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
        return [res]
    def grad(self, inp, grads):
-        img, desc = inp
+        img, ws, stride, pad = inp
        grad, = grads
        grad = gpu_contiguous(grad)
-        out = self(img, desc)
+        out = self(img, ws, stride, pad)
-        g_out = GpuDnnPoolGrad()(img, out, grad, desc)
+        g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
-        return g_out, theano.gradient.DisconnectedType()()
+        return g_out, theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()()
    def connection_pattern(self, node):
-        # not connected to desc
+        # not connected to parameters
-        return [[1], [0]]
+        return [[1], [0], [0], [0]]
 class GpuDnnPoolGrad(DnnBase):
    """
    The pooling gradient.
@@ -1135,40 +1159,56 @@ class GpuDnnPoolGrad(DnnBase):
        The output of the pooling in the forward.
    out_grad
        Same size as out, but is the corresponding gradient information.
-    desc
+    ws : tensor variable
-        The pooling descriptor.
+        Window size.
+    stride : tensor variable
+        (dx, dy) or (dx, dy, dz).
+    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
+        The old deprecated name 'average' corresponds to 'average_inc_pad'.
+    pad : tensor
+        (padX, padY) or (padX, padY, padZ)
    """
-    __props__ = ()
+    __props__ = ('mode',)
-    def __init__(self):
+    def __init__(self, mode='max'):
        DnnBase.__init__(self, ["dnn_pool_grad.c"],
                         "APPLY_SPECIFIC(dnn_pool_grad)")
+        if mode == 'average':
+            mode = 'average_inc_pad'
+        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
+        self.mode = mode
+    def get_op_params(self):
+        if self.mode == 'max':
+            mode_flag = 'CUDNN_POOLING_MAX'
+        elif self.mode == "average_inc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
+        elif self.mode == "average_exc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
-    def make_node(self, inp, out, out_grad, desc):
+        return [('MODE_FLAG', mode_flag)]
+    def make_node(self, inp, out, out_grad, ws, stride, pad):
        ctx_name = infer_context_name(inp, out, out_grad)
        inp = as_gpuarray_variable(inp, ctx_name)
+        assert (inp.ndim in [4, 5])
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
+        assert (out_grad.ndim in [4, 5])
        out = as_gpuarray_variable(out, ctx_name)
+        assert(out.ndim in [4, 5])
-        if desc.owner is not None:
+        assert (out_grad.ndim == inp.ndim)
-            nd = desc.owner.op.get_ndim() + 2
+        assert (inp.ndim == out.ndim)
-            if inp.type.ndim != nd:
-                raise TypeError('inp must be %dD tensor' % (nd,))
-            if out_grad.type.ndim != nd:
+        ws = tensor.as_tensor_variable(ws)
-                raise TypeError('out_grad must be %dD tensor' % (nd,))
+        stride = tensor.as_tensor_variable(stride)
+        pad = tensor.as_tensor_variable(pad)
+        assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
+        assert ws.type.ndim == 1
-            if out.type.ndim != nd:
+        return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])
-                raise TypeError('out must be %dD tensor' % (nd,))
-        if (not isinstance(desc.type, CDataType) or
-                desc.type.ctype != 'cudnnPoolingDescriptor_t'):
-            raise TypeError('desc must be cudnnPoolingDescriptor_t')
-        return Apply(self, [inp, out, out_grad, desc], [inp.type()])
    def infer_shape(self, node, shape):
        return [shape[0]]
@@ -1206,11 +1246,11 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
    """
    img = gpu_contiguous(img)
-    desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
+    return GpuDnnPool(mode=mode)(img, ws, stride, pad)
-    return GpuDnnPool()(img, desc)
 class GpuDnnSoftmaxBase(DnnBase):
    """
    Op for the cuDNN Softmax.
@@ -1263,6 +1303,7 @@ class GpuDnnSoftmaxBase(DnnBase):
 class GpuDnnSoftmax(GpuDnnSoftmaxBase):
    """
    Op for the cuDNN Softmax.
@@ -1296,6 +1337,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
 class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
    """
    Op for the cuDNN SoftmaxGrad.
@@ -1467,11 +1509,12 @@ def local_pool_dnn_grad_stride(node, ctx_name):
    pad = node.op.padding
    mode = node.op.mode
-    desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
+    return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
-    return GpuDnnPoolGrad()(gpu_contiguous(inp),
+                                     gpu_contiguous(out),
-                            gpu_contiguous(out),
+                                     gpu_contiguous(out_grad),
-                            gpu_contiguous(out_grad),
+                                     ds,
-                            desc)
+                                     st,
+                                     pad)
 @register_opt('cudnn')
@@ -1491,11 +1534,10 @@ def local_avg_pool_dnn_grad_stride(node, ctx_name):
    cg = gpu_contiguous(out_grad)
-    desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
    # We reuse cg because CuDNN does not use the value of the `out`
    # argument but still checks its shape for average pooling. This
    # has been observed in v2 and v3 as far as I know.
-    return GpuDnnPoolGrad()(gpu_contiguous(inp), cg, cg, desc)
+    return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ds, st, pad)
 @register_opt('cudnn')
@@ -1548,6 +1590,7 @@ def local_logsoftmax_to_dnn(node, ctx_name):
 class NoCuDNNRaise(Optimizer):
    def apply(self, fgraph):
        """
        Raise an error if cudnn can't be used.

--- a/theano/sandbox/gpuarray/dnn_pool.c
+++ b/theano/sandbox/gpuarray/dnn_pool.c
@@ -2,12 +2,15 @@
 cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
 #section init_code_struct
 cudnnStatus_t APPLY_SPECIFIC(err);
 APPLY_SPECIFIC(input) = NULL;
 APPLY_SPECIFIC(output) = NULL;
+APPLY_SPECIFIC(pool) = NULL;
 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
@@ -19,16 +22,25 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
 }
+if ((APPLY_SPECIFIC(err) = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
+                "(pool): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));  
+  FAIL;
+}
 #section cleanup_code_struct
 if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
 if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
+if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
 #section support_code_struct
 int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
-                             cudnnPoolingDescriptor_t desc,
+                             PyArrayObject *ws, 
+                             PyArrayObject *stride,
+                             PyArrayObject *pad,
                             PyGpuArrayObject **out,
                             PyGpuContextObject *c) {
  cudnnStatus_t err;
@@ -46,14 +58,21 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
  int w[3];
  int p[3];
  int s[3];
-  int ndims;
+  int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
+  for(int i = 0; i < ndims; i++) {
+     w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
+  }
+  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
-  err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims, w, p, s);
  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError,
+    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
-                 "error doing cudnnGetPoolingDescriptor operation: %s",
-                 cudnnGetErrorString(err));
-    return 1;
  }
  dims[0] = PyGpuArray_DIM(img, 0);
@@ -98,7 +117,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
    cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    err = cudnnPoolingForward(
-      APPLY_SPECIFIC(_handle), desc,
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
      beta,

--- a/theano/sandbox/gpuarray/dnn_pool_grad.c
+++ b/theano/sandbox/gpuarray/dnn_pool_grad.c
@@ -4,6 +4,7 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
+cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
 #section init_code_struct
@@ -11,6 +12,7 @@ APPLY_SPECIFIC(input) = NULL;
 APPLY_SPECIFIC(input_grad) = NULL;
 APPLY_SPECIFIC(output) = NULL;
 APPLY_SPECIFIC(output_grad) = NULL;
+APPLY_SPECIFIC(pool) = NULL;
 {
  cudnnStatus_t err;
@@ -38,6 +40,11 @@ APPLY_SPECIFIC(output_grad) = NULL;
                 cudnnGetErrorString(err));
    FAIL;
  }
+  if ((err = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
+                "(pool): %s", cudnnGetErrorString(err));  
+    FAIL;
+  }
 }
 #section cleanup_code_struct
@@ -46,13 +53,16 @@ if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC
 if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
 if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
 if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
+if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
 #section support_code_struct
 int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyGpuArrayObject *out,
                                  PyGpuArrayObject *out_grad,
-                                  cudnnPoolingDescriptor_t desc,
+                                  PyArrayObject *ws, 
+                                  PyArrayObject *stride,
+                                  PyArrayObject *pad,
                                  PyGpuArrayObject **inp_grad,
                                  PyGpuContextObject *c) {
  cudnnStatus_t err;
@@ -85,6 +95,26 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
    return 1;
  }
+  int w[3];
+  int p[3];
+  int s[3];
+  int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
+  for(int i = 0; i < ndims; i++) {
+     w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
+  }
+  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
+  }
  if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
    return 1;
@@ -118,7 +148,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
    cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    err = cudnnPoolingBackward(
-      APPLY_SPECIFIC(_handle), desc,
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),

--- a/theano/sandbox/gpuarray/tests/test_dnn.py
+++ b/theano/sandbox/gpuarray/tests/test_dnn.py
@@ -275,6 +275,55 @@ def test_pooling():
            utt.assert_allclose(c_out, g_out)
+def test_pooling_with_tensor_vars():
+    if not dnn.dnn_available(test_ctx_name):
+        raise SkipTest(dnn.dnn_available.msg)
+    x = T.ftensor4()
+    ws = theano.shared(numpy.array([2, 2], dtype='int32'))
+    st = theano.shared(numpy.array([1, 1], dtype='int32'))
+    pad = theano.shared(numpy.array([0, 0], dtype='int32'))
+    mode = 'max'
+    def fn(x):
+        dnn_op = dnn.dnn_pool(x,
+                              ws=ws,
+                              stride=st,
+                              pad=pad,
+                              mode=mode)
+        return dnn_op
+    for shp in [(1, 1, 2, 2),
+                (1, 1, 3, 3)]:
+        data = numpy.random.normal(0, 1, shp).astype("float32") * 10
+        theano.tests.unittest_tools.verify_grad(
+            fn, [data],
+            cast_to_output_type=False,
+            mode=mode_with_gpu)
+    out2 = pool_2d_i2n(x, ds=(2, 2), strides=(1, 1),
+                       pad=(0, 0),
+                       pool_function=T.max)
+    mode_without_gpu2 = mode_without_gpu.including()
+    mode_without_gpu2.check_isfinite = False
+    f1 = theano.function([x], fn(x), mode=mode_with_gpu)
+    assert any([isinstance(node.op, dnn.GpuDnnPool)
+                for node in f1.maker.fgraph.apply_nodes])
+    f2 = theano.function([x], out2, mode=mode_without_gpu2)
+    assert not any([isinstance(node.op, dnn.GpuDnnPool)
+                    for node in f2.maker.fgraph.apply_nodes])
+    for shp in [(1, 10, 100, 100),
+                (1, 3, 99, 99),
+                (32, 1, 147, 197),
+                ]:
+        data = numpy.random.normal(0, 1, shp).astype("float32")
+        a = f1(data).__array__()
+        b = f2(data).__array__()
+        utt.assert_allclose(a, b)
 def test_pooling_opt():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
@@ -340,6 +389,7 @@ def test_dnn_tag():
 class TestDnnInferShapes(utt.InferShapeTester):
    def setUp(self):
        super(TestDnnInferShapes, self).setUp()
        self.mode = mode_with_gpu
@@ -525,14 +575,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
            [(1, 1), (2, 2), (3, 3)],
            modes
        ):
-            desc = dnn.GpuDnnPoolDesc(
-                ws=params[0],
-                stride=params[1],
-                mode=params[2]
-            )()
            self._compile_and_check(
                [img],
-                [dnn.GpuDnnPool()(img, desc)],
+                [dnn.GpuDnnPool(mode=params[2])(img, params[0], params[1], (0, 0))],
                [img_val],
                dnn.GpuDnnPool
            )
@@ -561,16 +606,13 @@ class TestDnnInferShapes(utt.InferShapeTester):
            [(1, 1), (2, 2), (3, 3)],
            ['max', 'average_inc_pad']
        ):
-            desc = dnn.GpuDnnPoolDesc(
+            pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
-                ws=params[0],
-                stride=params[1],
-                mode=params[2]
-            )()
-            pool_grad = dnn.GpuDnnPoolGrad()(
                img,
                out,
                img_grad,
-                desc
+                params[0],
+                params[1],
+                (0, 0)
            )
            self._compile_and_check(
                [img, img_grad, out],