Merge pull request #4256 from harmdevries89/gpupool_newbackend

Gpupool newbackend

Merge pull request #4256 from harmdevries89/gpupool_newbackend
7ddf071c · Frédéric Bastien · 963eac17 · 720355c3 · 7ddf071c · 7ddf071c
--- a/theano/sandbox/cuda/tests/test_dnn.py
+++ b/theano/sandbox/cuda/tests/test_dnn.py
@@ -364,28 +364,32 @@ def test_pooling_with_tensor_vars():
            cast_to_output_type=False,
            mode=mode_with_gpu)

-    out2 = pool_2d_i2n(x, ds=(2, 2), strides=(1, 1),
-                       pad=(0, 0),
-                       pool_function=T.max)
-
    mode_without_gpu2 = mode_without_gpu.including()
    mode_without_gpu2.check_isfinite = False

-    f1 = theano.function([x], fn(x), mode=mode_with_gpu)
+    f_gpu = theano.function([x], fn(x), mode=mode_with_gpu)
    assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
-                for node in f1.maker.fgraph.apply_nodes])
-    f2 = theano.function([x], out2, mode=mode_without_gpu2)
-    assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
-                    for node in f2.maker.fgraph.apply_nodes])
+                for node in f_gpu.maker.fgraph.apply_nodes])
+
+    i = 1
    for shp in [(1, 10, 100, 100),
                (1, 3, 99, 99),
-                (32, 1, 147, 197),
-                ]:
+                (32, 1, 147, 197)]:
        data = numpy.random.normal(0, 1, shp).astype("float32")
-        a = f1(data).__array__()
-
-        b = f2(data).__array__()
+        out = pool_2d_i2n(x, ds=(i, i), strides=(1, 1),
+                          pad=(0, 0),
+                          pool_function=T.max)
+
+        f_cpu = theano.function([x], out, mode=mode_without_gpu2)
+        assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
+                        for node in f_cpu.maker.fgraph.apply_nodes])
+
+        # Change the window size dynamically for gpu op
+        ws.set_value(numpy.array([i, i]).astype('int32'))
+        a = f_gpu(data).__array__()
+        b = f_cpu(data).__array__()
        utt.assert_allclose(a, b)
+        i += 1


 def test_old_pool_interface():
@@ -745,6 +749,7 @@ def test_dnn_tag():


 class TestDnnInferShapes(utt.InferShapeTester):
+
    def setUp(self):
        super(TestDnnInferShapes, self).setUp()
        self.mode = mode_with_gpu

--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
@@ -142,6 +142,7 @@ dnn_available.msg = None


 class DnnBase(COp):
+
    """
    Creates a handle for cudnn and pulls in the cudnn libraries and headers.

@@ -255,6 +256,7 @@ version.v = None


 class GpuDnnConvDesc(COp):
+
    """
    This Op builds a convolution descriptor for use in the other convolution
    operations.
@@ -388,6 +390,7 @@ def ensure_dt(val, default, name, dtype):


 class GpuDnnConv(DnnBase):
+
    """
    The forward convolution.

@@ -555,6 +558,7 @@ class GpuDnnConv(DnnBase):


 class GpuDnnConvGradW(DnnBase):
+
    """
    The convolution gradient with respect to the weights.

@@ -675,6 +679,7 @@ class GpuDnnConvGradW(DnnBase):


 class GpuDnnConvGradI(DnnBase):
+
    """
    The convolution gradient with respect to the inputs.

@@ -943,6 +948,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',


 class GpuDnnPoolDesc(Op):
+
    """
    This Op builds a pooling descriptor for use in the other
    pooling operations.
@@ -1061,69 +1067,87 @@ class GpuDnnPoolDesc(Op):


 class GpuDnnPool(DnnBase):
-    """
-    Pooling.

+    """
    Parameters
    ----------
    img
-        The image 4d tensor.
-    desc
-        The pooling descriptor.
-
+        The image 4d or 5d tensor.
+    Parameters
+    ----------
+    ws : tensor variable
+        Window size.
+    stride : tensor variable
+        (dx, dy) or (dx, dy, dz).
+    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
+        The old deprecated name 'average' corresponds to 'average_inc_pad'.
+    pad : tensor
+        (padX, padY) or (padX, padY, padZ)
    """

-    __props__ = ()
+    __props__ = ('mode',)

-    def __init__(self):
+    def __init__(self, mode='max'):
        DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
+        if mode == 'average':
+            mode = 'average_inc_pad'
+        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
+        self.mode = mode

-    def make_node(self, img, desc):
-        img = as_gpuarray_variable(img, infer_context_name(img))
+    def get_op_params(self):
+        if self.mode == 'max':
+            mode_flag = 'CUDNN_POOLING_MAX'
+        elif self.mode == "average_inc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
+        elif self.mode == "average_exc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'

-        if desc.owner is not None:
-            e_ndim = desc.owner.op.get_ndim() + 2
+        return [('MODE_FLAG', mode_flag)]

-            if img.type.ndim != e_ndim:
-                raise TypeError('img must be %dD tensor' % (e_ndim,))
+    def make_node(self, img, ws, stride, pad):
+        ctx_name = infer_context_name(img)
+        img = as_gpuarray_variable(img, ctx_name)

-        if (not isinstance(desc.type, CDataType) or
-                desc.type.ctype != 'cudnnPoolingDescriptor_t'):
-            raise TypeError('desc must be cudnnPoolingDescriptor_t')
+        ws = tensor.as_tensor_variable(ws)
+        stride = tensor.as_tensor_variable(stride)
+        pad = tensor.as_tensor_variable(pad)
+        assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
+        assert ws.type.ndim == 1

-        return Apply(self, [img, desc], [img.type()])
+        return Apply(self, [img, ws, stride, pad], [img.type()])

    def infer_shape(self, node, shape):
-        desc = node.inputs[1].owner.op
-        w = desc.ws
-        s = desc.stride
-        p = desc.pad
+        w = node.inputs[1]
+        s = node.inputs[2]
+        p = node.inputs[3]
+
        res = [shape[0][0], shape[0][1],
               (shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
               (shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1
               ]
-        if len(w) > 2:
+        if node.inputs[0].ndim == 5:
            res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
        return [res]

    def grad(self, inp, grads):
-        img, desc = inp
+        img, ws, stride, pad = inp
        grad, = grads

        grad = gpu_contiguous(grad)

-        out = self(img, desc)
+        out = self(img, ws, stride, pad)

-        g_out = GpuDnnPoolGrad()(img, out, grad, desc)
+        g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)

-        return g_out, theano.gradient.DisconnectedType()()
+        return g_out, theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()()

    def connection_pattern(self, node):
-        # not connected to desc
-        return [[1], [0]]
+        # not connected to parameters
+        return [[1], [0], [0], [0]]


 class GpuDnnPoolGrad(DnnBase):
+
    """
    The pooling gradient.

@@ -1135,40 +1159,56 @@ class GpuDnnPoolGrad(DnnBase):
        The output of the pooling in the forward.
    out_grad
        Same size as out, but is the corresponding gradient information.
-    desc
-        The pooling descriptor.
+    ws : tensor variable
+        Window size.
+    stride : tensor variable
+        (dx, dy) or (dx, dy, dz).
+    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
+        The old deprecated name 'average' corresponds to 'average_inc_pad'.
+    pad : tensor
+        (padX, padY) or (padX, padY, padZ)

    """

-    __props__ = ()
+    __props__ = ('mode',)

-    def __init__(self):
+    def __init__(self, mode='max'):
        DnnBase.__init__(self, ["dnn_pool_grad.c"],
                         "APPLY_SPECIFIC(dnn_pool_grad)")
+        if mode == 'average':
+            mode = 'average_inc_pad'
+        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
+        self.mode = mode
+
+    def get_op_params(self):
+        if self.mode == 'max':
+            mode_flag = 'CUDNN_POOLING_MAX'
+        elif self.mode == "average_inc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
+        elif self.mode == "average_exc_pad":
+            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'

-    def make_node(self, inp, out, out_grad, desc):
+        return [('MODE_FLAG', mode_flag)]
+
+    def make_node(self, inp, out, out_grad, ws, stride, pad):
        ctx_name = infer_context_name(inp, out, out_grad)
        inp = as_gpuarray_variable(inp, ctx_name)
+        assert (inp.ndim in [4, 5])
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
+        assert (out_grad.ndim in [4, 5])
        out = as_gpuarray_variable(out, ctx_name)
+        assert(out.ndim in [4, 5])

-        if desc.owner is not None:
-            nd = desc.owner.op.get_ndim() + 2
-
-            if inp.type.ndim != nd:
-                raise TypeError('inp must be %dD tensor' % (nd,))
+        assert (out_grad.ndim == inp.ndim)
+        assert (inp.ndim == out.ndim)

-            if out_grad.type.ndim != nd:
-                raise TypeError('out_grad must be %dD tensor' % (nd,))
+        ws = tensor.as_tensor_variable(ws)
+        stride = tensor.as_tensor_variable(stride)
+        pad = tensor.as_tensor_variable(pad)
+        assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
+        assert ws.type.ndim == 1

-            if out.type.ndim != nd:
-                raise TypeError('out must be %dD tensor' % (nd,))
-
-        if (not isinstance(desc.type, CDataType) or
-                desc.type.ctype != 'cudnnPoolingDescriptor_t'):
-            raise TypeError('desc must be cudnnPoolingDescriptor_t')
-
-        return Apply(self, [inp, out, out_grad, desc], [inp.type()])
+        return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])

    def infer_shape(self, node, shape):
        return [shape[0]]
@@ -1206,11 +1246,11 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):

    """
    img = gpu_contiguous(img)
-    desc = GpuDnnPoolDesc(ws=ws, stride=stride, mode=mode, pad=pad)()
-    return GpuDnnPool()(img, desc)
+    return GpuDnnPool(mode=mode)(img, ws, stride, pad)


 class GpuDnnSoftmaxBase(DnnBase):
+
    """
    Op for the cuDNN Softmax.

@@ -1263,6 +1303,7 @@ class GpuDnnSoftmaxBase(DnnBase):


 class GpuDnnSoftmax(GpuDnnSoftmaxBase):
+
    """
    Op for the cuDNN Softmax.

@@ -1296,6 +1337,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):


 class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
+
    """
    Op for the cuDNN SoftmaxGrad.

@@ -1467,11 +1509,12 @@ def local_pool_dnn_grad_stride(node, ctx_name):
    pad = node.op.padding
    mode = node.op.mode

-    desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
-    return GpuDnnPoolGrad()(gpu_contiguous(inp),
-                            gpu_contiguous(out),
-                            gpu_contiguous(out_grad),
-                            desc)
+    return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
+                                     gpu_contiguous(out),
+                                     gpu_contiguous(out_grad),
+                                     ds,
+                                     st,
+                                     pad)


 @register_opt('cudnn')
@@ -1491,11 +1534,10 @@ def local_avg_pool_dnn_grad_stride(node, ctx_name):

    cg = gpu_contiguous(out_grad)

-    desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
    # We reuse cg because CuDNN does not use the value of the `out`
    # argument but still checks its shape for average pooling. This
    # has been observed in v2 and v3 as far as I know.
-    return GpuDnnPoolGrad()(gpu_contiguous(inp), cg, cg, desc)
+    return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ds, st, pad)


 @register_opt('cudnn')
@@ -1548,6 +1590,7 @@ def local_logsoftmax_to_dnn(node, ctx_name):


 class NoCuDNNRaise(Optimizer):
+
    def apply(self, fgraph):
        """
        Raise a error if cudnn can't be used.

--- a/theano/sandbox/gpuarray/dnn_pool.c
+++ b/theano/sandbox/gpuarray/dnn_pool.c
@@ -2,12 +2,15 @@

 cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);
+

 #section init_code_struct

 cudnnStatus_t APPLY_SPECIFIC(err);
 APPLY_SPECIFIC(input) = NULL;
 APPLY_SPECIFIC(output) = NULL;
+APPLY_SPECIFIC(pool) = NULL;

 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
@@ -19,16 +22,25 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
 }
+if ((APPLY_SPECIFIC(err) = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
+                "(pool): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));  
+  FAIL;
+}

 #section cleanup_code_struct

 if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
 if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
+if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }
+

 #section support_code_struct

 int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
-                             cudnnPoolingDescriptor_t desc,
+                             PyArrayObject *ws, 
+                             PyArrayObject *stride,
+                             PyArrayObject *pad,
                             PyGpuArrayObject **out,
                             PyGpuContextObject *c) {
  cudnnStatus_t err;
@@ -46,14 +58,21 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
  int w[3];
  int p[3];
  int s[3];
-  int ndims;
+  int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
+
+  for(int i = 0; i < ndims; i++) {
+     w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
+  }
+  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);

-  err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims, w, p, s);
  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError,
-                 "error doing cudnnGetPoolingDescriptor operation: %s",
-                 cudnnGetErrorString(err));
-    return 1;
+    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
  }

  dims[0] = PyGpuArray_DIM(img, 0);
@@ -98,7 +117,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
    cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    err = cudnnPoolingForward(
-      APPLY_SPECIFIC(_handle), desc,
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
      beta,

--- a/theano/sandbox/gpuarray/dnn_pool_grad.c
+++ b/theano/sandbox/gpuarray/dnn_pool_grad.c
@@ -4,6 +4,7 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
 cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
+cudnnPoolingDescriptor_t APPLY_SPECIFIC(pool);

 #section init_code_struct

@@ -11,6 +12,7 @@ APPLY_SPECIFIC(input) = NULL;
 APPLY_SPECIFIC(input_grad) = NULL;
 APPLY_SPECIFIC(output) = NULL;
 APPLY_SPECIFIC(output_grad) = NULL;
+APPLY_SPECIFIC(pool) = NULL;

 {
  cudnnStatus_t err;
@@ -38,6 +40,11 @@ APPLY_SPECIFIC(output_grad) = NULL;
                 cudnnGetErrorString(err));
    FAIL;
  }
+  if ((err = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
+                "(pool): %s", cudnnGetErrorString(err));  
+    FAIL;
+  }
 }

 #section cleanup_code_struct
@@ -46,13 +53,16 @@ if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC
 if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
 if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
 if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
+if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC(pool)); }

 #section support_code_struct

 int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyGpuArrayObject *out,
                                  PyGpuArrayObject *out_grad,
-                                  cudnnPoolingDescriptor_t desc,
+                                  PyArrayObject *ws, 
+                                  PyArrayObject *stride,
+                                  PyArrayObject *pad,
                                  PyGpuArrayObject **inp_grad,
                                  PyGpuContextObject *c) {
  cudnnStatus_t err;
@@ -85,6 +95,26 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
    return 1;
  }

+  int w[3];
+  int p[3];
+  int s[3];
+  int ndims = PyArray_DIM(ws, 0);//PyGpuArray_NDIM(img) - 2;
+
+  for(int i = 0; i < ndims; i++) {
+     w[i] = *((npy_intp*)PyArray_GETPTR1(ws, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     p[i] = *((npy_intp*)PyArray_GETPTR1(pad, i));
+  }
+  for(int i = 0; i < ndims; i++) {
+     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
+  }
+  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
+  }
+
  if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
    return 1;

@@ -118,7 +148,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
    cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

    err = cudnnPoolingBackward(
-      APPLY_SPECIFIC(_handle), desc,
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),

--- a/theano/sandbox/gpuarray/tests/test_dnn.py
+++ b/theano/sandbox/gpuarray/tests/test_dnn.py
@@ -275,6 +275,55 @@ def test_pooling():
            utt.assert_allclose(c_out, g_out)


+def test_pooling_with_tensor_vars():
+    if not dnn.dnn_available(test_ctx_name):
+        raise SkipTest(dnn.dnn_available.msg)
+    x = T.ftensor4()
+    ws = theano.shared(numpy.array([2, 2], dtype='int32'))
+    st = theano.shared(numpy.array([1, 1], dtype='int32'))
+    pad = theano.shared(numpy.array([0, 0], dtype='int32'))
+    mode = 'max'
+
+    def fn(x):
+        dnn_op = dnn.dnn_pool(x,
+                              ws=ws,
+                              stride=st,
+                              pad=pad,
+                              mode=mode)
+        return dnn_op
+
+    for shp in [(1, 1, 2, 2),
+                (1, 1, 3, 3)]:
+        data = numpy.random.normal(0, 1, shp).astype("float32") * 10
+        theano.tests.unittest_tools.verify_grad(
+            fn, [data],
+            cast_to_output_type=False,
+            mode=mode_with_gpu)
+
+    out2 = pool_2d_i2n(x, ds=(2, 2), strides=(1, 1),
+                       pad=(0, 0),
+                       pool_function=T.max)
+
+    mode_without_gpu2 = mode_without_gpu.including()
+    mode_without_gpu2.check_isfinite = False
+
+    f1 = theano.function([x], fn(x), mode=mode_with_gpu)
+    assert any([isinstance(node.op, dnn.GpuDnnPool)
+                for node in f1.maker.fgraph.apply_nodes])
+    f2 = theano.function([x], out2, mode=mode_without_gpu2)
+    assert not any([isinstance(node.op, dnn.GpuDnnPool)
+                    for node in f2.maker.fgraph.apply_nodes])
+    for shp in [(1, 10, 100, 100),
+                (1, 3, 99, 99),
+                (32, 1, 147, 197),
+                ]:
+        data = numpy.random.normal(0, 1, shp).astype("float32")
+        a = f1(data).__array__()
+
+        b = f2(data).__array__()
+        utt.assert_allclose(a, b)
+
+
 def test_pooling_opt():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
@@ -340,6 +389,7 @@ def test_dnn_tag():


 class TestDnnInferShapes(utt.InferShapeTester):
+
    def setUp(self):
        super(TestDnnInferShapes, self).setUp()
        self.mode = mode_with_gpu
@@ -525,14 +575,9 @@ class TestDnnInferShapes(utt.InferShapeTester):
            [(1, 1), (2, 2), (3, 3)],
            modes
        ):
-            desc = dnn.GpuDnnPoolDesc(
-                ws=params[0],
-                stride=params[1],
-                mode=params[2]
-            )()
            self._compile_and_check(
                [img],
-                [dnn.GpuDnnPool()(img, desc)],
+                [dnn.GpuDnnPool(mode=params[2])(img, params[0], params[1], (0, 0))],
                [img_val],
                dnn.GpuDnnPool
            )
@@ -561,16 +606,13 @@ class TestDnnInferShapes(utt.InferShapeTester):
            [(1, 1), (2, 2), (3, 3)],
            ['max', 'average_inc_pad']
        ):
-            desc = dnn.GpuDnnPoolDesc(
-                ws=params[0],
-                stride=params[1],
-                mode=params[2]
-            )()
-            pool_grad = dnn.GpuDnnPoolGrad()(
+            pool_grad = dnn.GpuDnnPoolGrad(mode=params[2])(
                img,
                out,
                img_grad,
-                desc
+                params[0],
+                params[1],
+                (0, 0)
            )
            self._compile_and_check(
                [img, img_grad, out],