Commit 1ed69042 (testgroup/pytensor)
Authored Oct 13, 2016 by Frédéric Bastien; committed by GitHub on Oct 13, 2016
Merge pull request #4884 from gvtulder/f-pool-3d
theano.tensor.signal.Pool with 3D support
Parents: 146ef971, 8318ac85
Showing 11 changed files with 286 additions and 63 deletions (+286 −63)
doc/library/tensor/signal/pool.txt        +1   −0
theano/gpuarray/dnn.py                    +69  −24
theano/gpuarray/opt_util.py               +75  −2
theano/gpuarray/tests/test_dnn.py         +0   −0
theano/sandbox/cuda/dnn.py                +0   −0
theano/sandbox/cuda/opt.py                +59  −30
theano/sandbox/cuda/opt_util.py           +75  −2
theano/sandbox/cuda/tests/test_blas.py    +7   −5
theano/sandbox/cuda/tests/test_dnn.py     +0   −0
theano/tensor/signal/pool.py              +0   −0
theano/tensor/signal/tests/test_pool.py   +0   −0
doc/library/tensor/signal/pool.txt
@@ -13,3 +13,4 @@
 .. autofunction:: theano.tensor.signal.pool.pool_2d
 .. autofunction:: theano.tensor.signal.pool.max_pool_2d_same_size
+.. autofunction:: theano.tensor.signal.pool.pool_3d
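For context, a minimal usage sketch of the newly documented 3D pooling entry point. This is illustrative only: the window is passed positionally and the `ignore_border`/`mode` keywords are assumed to mirror `pool_2d`; the exact signature is in the (collapsed) theano/tensor/signal/pool.py diff below.

    # Hypothetical usage sketch of pool_3d; argument names assumed from pool_2d.
    import numpy
    import theano
    import theano.tensor as T
    from theano.tensor.signal.pool import pool_3d

    x = T.TensorType('float32', (False,) * 5)('x')   # (batch, channel, depth, height, width)
    y = pool_3d(x, (2, 2, 2), ignore_border=True, mode='max')
    f = theano.function([x], y)
    print(f(numpy.zeros((1, 1, 4, 6, 8), dtype='float32')).shape)   # expected (1, 1, 2, 3, 4)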
theano/gpuarray/dnn.py
@@ -38,7 +38,7 @@ from .nnet import GpuSoftmax
 from .opt import (gpu_seqopt, register_opt,
                   op_lifter, register_opt2)
-from .opt_util import alpha_merge, output_merge, inplace_allocempty
+from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
 from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
@@ -1404,7 +1404,7 @@ class GpuDnnPoolGrad(DnnBase):
         return [shape[0]]
 
 
-def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
+def dnn_pool(img, ws, stride=None, mode='max', pad=None):
     """
     GPU pooling using cuDNN from NVIDIA.
 
@@ -1418,13 +1418,13 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
     img
         Images to do the pooling over.
     ws : tuple
-        Subsampling window size.
+        Subsampling window size.  Should have 2 or 3 elements.
     stride : tuple
-        Subsampling stride (default: (1, 1)).
+        Subsampling stride (default: (1, 1) or (1, 1, 1)).
     mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
     pad : tuple
         (padX, padY) or (padX, padY, padZ)
-        default: (0, 0)
+        default: (0, 0) or (0, 0, 0)
 
     .. warning:: The cuDNN library only works with GPU that have a compute
         capability of 3.0 or higer. This means that older GPU will not
@@ -1436,6 +1436,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
     """
     img = gpu_contiguous(img)
+    if stride is None:
+        stride = (1,) * len(ws)
+    if pad is None:
+        pad = (0,) * len(ws)
     if mode == "sum":
         ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad)
         context_name = ret.type.context_name
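The practical effect of the new None defaults above, sketched in plain Python: when `stride` or `pad` is omitted it is expanded to the length of the window, so a single code path serves 2D and 3D pooling. The helper name below is illustrative, not part of the diff.

    # Pure-Python sketch of the default expansion performed inside dnn_pool above.
    def expand_pool_defaults(ws, stride=None, pad=None):
        if stride is None:
            stride = (1,) * len(ws)
        if pad is None:
            pad = (0,) * len(ws)
        return stride, pad

    print(expand_pool_defaults((2, 2)))       # ((1, 1), (0, 0))
    print(expand_pool_defaults((2, 2, 2)))    # ((1, 1, 1), (0, 0, 0))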
@@ -2073,9 +2077,19 @@ def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
     if not op.ignore_border:
         return
     img, ws, stride, pad = inputs
-    img = as_gpuarray_variable(img, ctx_name)
+    nd = op.ndim if op.ndim else (img.ndim - 2)
+    if nd not in (2, 3):
+        return
+    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
     mode = op.mode
-    return dnn_pool(gpu_contiguous(img), ws, stride=stride, pad=pad, mode=mode)
+    # dnn_pool expects exactly 2 non-pooling dimensions
+    if img.ndim == nd + 2:
+        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
+    else:
+        # reshape to 4D or 5D with 2 non-pooling dimensions
+        img_padded = pad_dims(img, 2, nd)
+        ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
+        return unpad_dims(ret_padded, img, 2, nd)
 
 
 @register_opt('cudnn', 'fast_compile')
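A worked example of the `img.ndim == nd + 2` dispatch above, assuming a 2D pooling op applied to a 5D input: cuDNN pooling expects exactly one batch and one channel axis, so the leading axes are merged before pooling and restored afterwards. The NumPy code is only a shape-level sketch of what pad_dims/unpad_dims do symbolically.

    import numpy

    img = numpy.zeros((3, 5, 7, 32, 32))             # two leading batch-like axes
    nd = 2                                           # 2D pooling window
    assert img.ndim != nd + 2                        # 5 != 4, so reshaping is needed
    merged = img.reshape((-1,) + img.shape[2:])      # (15, 7, 32, 32), as pad_dims would give
    pooled = merged[:, :, ::2, ::2]                  # stand-in for the actual pooling op
    restored = pooled.reshape(img.shape[:2] + pooled.shape[1:])
    print(restored.shape)                            # (3, 5, 7, 16, 16), as unpad_dims would give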
@@ -2087,17 +2101,34 @@ def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
     if not op.ignore_border:
         return
     inp, out, out_grad, ws, stride, pad = inputs
-    inp = as_gpuarray_variable(inp, ctx_name)
-    out = as_gpuarray_variable(out, ctx_name)
-    out_grad = as_gpuarray_variable(out_grad, ctx_name)
+    nd = op.ndim if op.ndim else (inp.ndim - 2)
+    if nd not in (2, 3):
+        return
+    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
+    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
+    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
     mode = op.mode
-    return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
-                                     gpu_contiguous(out),
-                                     gpu_contiguous(out_grad),
-                                     ws,
-                                     stride,
-                                     pad)
+    # the GPU ops expect exactly 2 non-pooling dimensions
+    if inp.ndim == nd + 2:
+        return GpuDnnPoolGrad(mode=mode)(inp, out, out_grad,
+                                         ws, stride, pad)
+    else:
+        # reshape to 4D or 5D with 2 non-pooling dimensions
+        inp_padded = pad_dims(inp, 2, nd)
+        out_padded = pad_dims(out, 2, nd)
+        out_grad_padded = pad_dims(out_grad, 2, nd)
+        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
+                                               out_padded,
+                                               out_grad_padded,
+                                               ws, stride, pad)
+        return unpad_dims(ret_padded, inp, 2, nd)
 
 
 @register_opt('cudnn', 'fast_compile')
@@ -2109,16 +2140,30 @@ def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
     if not op.ignore_border:
         return
     inp, out_grad, ws, stride, pad = inputs
-    inp = as_gpuarray_variable(inp, ctx_name)
-    out_grad = as_gpuarray_variable(out_grad, ctx_name)
+    nd = op.ndim if op.ndim else (inp.ndim - 2)
+    if nd not in (2, 3):
+        return
+    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
+    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
     mode = op.mode
-    cg = gpu_contiguous(out_grad)
-    # We reuse cg because cuDNN does not use the value of the `out`
-    # argument but still checks its shape for average pooling. This
-    # has been observed in v2 and v3 as far as I know.
-    return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ws, stride, pad)
+    # the GPU ops expect exactly 2 non-pooling dimensions
+    if inp.ndim == nd + 2:
+        # We reuse out_grad because cuDNN does not use the value of the `out`
+        # argument but still checks its shape for average pooling. This
+        # has been observed in v2 and v3 as far as I know.
+        return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride, pad)
+    else:
+        # reshape to 4D or 5D with 2 non-pooling dimensions
+        inp_padded = pad_dims(inp, 2, nd)
+        out_grad_padded = pad_dims(out_grad, 2, nd)
+        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
+                                               out_grad_padded,
+                                               out_grad_padded,
+                                               ws, stride, pad)
+        return unpad_dims(ret_padded, inp, 2, nd)
 
 
 @register_opt('cudnn', 'fast_compile')
theano/gpuarray/opt_util.py
@@ -3,12 +3,12 @@ from functools import wraps
 import numpy
 
-from theano import scalar as scal, Constant
+from theano import tensor, scalar as scal, Constant
 from theano.gof import local_optimizer
 from theano.tensor import (DimShuffle, get_scalar_constant_value,
                            NotScalarConstantError)
 
-from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, gpu_alloc_empty
+from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, GpuReshape, gpu_alloc_empty
 from .elemwise import GpuDimShuffle, GpuElemwise
 
 _one = scal.constant(numpy.asarray(1.0, dtype='float32'))
@@ -329,3 +329,76 @@ def inplace_allocempty(op, idx):
             return maker(node, inputs)
         return opt
     return wrapper
+
+
+def pad_dims(input, leftdims, rightdims):
+    """Reshapes the input to a (leftdims + rightdims) tensor
+
+    This helper function is used to convert pooling inputs with arbitrary
+    non-pooling dimensions to the correct number of dimensions for the
+    GPU pooling ops.
+
+    This reduces or expands the number of dimensions of the input to
+    exactly `leftdims`, by adding extra dimensions on the left or by
+    combining some existing dimensions on the left of the input.
+
+    Use `unpad_dims` to reshape back to the original dimensions.
+
+    Examples
+    --------
+    Given input of shape (3, 5, 7), ``pad_dims(input, 2, 2)``
+    adds a singleton dimension and reshapes to (1, 3, 5, 7).
+
+    Given that output from pad_dims, ``unpad_dims(output, input, 2, 2)``
+    reshapes back to (3, 5, 7).
+
+    Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 2)``
+    does not reshape and returns output with shape (3, 5, 7, 9).
+
+    Given input of shape (3, 5, 7, 9, 11), ``pad_dims(input, 2, 2)``
+    combines the first two dimensions and reshapes to (15, 7, 9, 11).
+
+    Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 3)``
+    adds a singleton dimension and reshapes to (1, 3, 5, 7, 9).
+    """
+    assert input.ndim >= rightdims
+
+    if input.ndim == (leftdims + rightdims):
+        return input
+
+    # extract image dimensions
+    img_shape = input.shape[-rightdims:]
+
+    non_pool_ndim = input.ndim - rightdims
+    if non_pool_ndim < leftdims:
+        # too few dimensions, pad on the left
+        dummy_dims = tensor.as_tensor([1] * (leftdims - non_pool_ndim))
+        new_shape = tensor.join(0, dummy_dims,
+                                input.shape[:non_pool_ndim],
+                                img_shape)
+    else:
+        # too many dimensions, combine the leading dimensions
+        batched_ndim = non_pool_ndim - leftdims + 1
+        batch_size = tensor.prod(input.shape[:batched_ndim])
+        # convert to a vector for tensor.join
+        batch_size = tensor.shape_padright(batch_size, 1)
+        new_shape = tensor.join(0, batch_size,
+                                input.shape[batched_ndim:non_pool_ndim],
+                                img_shape)
+
+    # store in the required shape
+    new_shape = tensor.cast(new_shape, 'int64')
+    input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
+    return input_ND
+
+
+def unpad_dims(output, input, leftdims, rightdims):
+    """Reshapes the output after pad_dims.
+
+    This reverts the padding by `pad_dims`.
+    """
+    if output.ndim == input.ndim:
+        return output
+
+    # restore the output to the original shape
+    outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
+    return GpuReshape(input.ndim)(output, outshp)
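The docstring examples above can be checked with a small shape-only sketch. This is not the Theano helper itself (which builds symbolic GpuReshape nodes); it only reproduces the shape arithmetic under the same leftdims/rightdims rules.

    import numpy

    def pad_dims_shape(shape, leftdims, rightdims):
        # Shape-level mirror of pad_dims: return the shape it would reshape to.
        assert len(shape) >= rightdims
        if len(shape) == leftdims + rightdims:
            return shape
        img_shape = shape[-rightdims:]
        non_pool = len(shape) - rightdims
        if non_pool < leftdims:
            # too few dimensions: prepend singleton axes
            return (1,) * (leftdims - non_pool) + shape[:non_pool] + img_shape
        # too many dimensions: fold the leading axes into one batch axis
        batched = non_pool - leftdims + 1
        return (int(numpy.prod(shape[:batched])),) + shape[batched:non_pool] + img_shape

    print(pad_dims_shape((3, 5, 7), 2, 2))          # (1, 3, 5, 7)
    print(pad_dims_shape((3, 5, 7, 9, 11), 2, 2))   # (15, 7, 9, 11)
    print(pad_dims_shape((3, 5, 7, 9), 2, 3))       # (1, 3, 5, 7, 9)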
theano/gpuarray/tests/test_dnn.py
(diff collapsed)

theano/sandbox/cuda/dnn.py
(diff collapsed)
theano/sandbox/cuda/opt.py
@@ -40,6 +40,7 @@ from theano.sandbox.cuda.basic_ops import (
     GpuSubtensor, GpuAdvancedSubtensor1,
     GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
     GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
+from theano.sandbox.cuda.opt_util import pad_dims, unpad_dims
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda.blas import (
@@ -1883,15 +1884,12 @@ def local_convtransp3d_gemm(node):
 gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm)
 
 
-def _check_constant_args_pool(ws, stride, pad, node):
+def _check_constant_args_pool(ndim, ws, stride, pad, node):
     """Check if the args of pool are constants. Warns if not."""
     try:
-        ws_w = tensor.get_scalar_constant_value(ws[0])
-        ws_h = tensor.get_scalar_constant_value(ws[1])
-        stride_w = tensor.get_scalar_constant_value(stride[0])
-        stride_h = tensor.get_scalar_constant_value(stride[1])
-        pad_w = tensor.get_scalar_constant_value(pad[0])
-        pad_h = tensor.get_scalar_constant_value(pad[1])
+        ws = tuple(tensor.get_scalar_constant_value(ws[i]) for i in range(ndim))
+        stride = tuple(tensor.get_scalar_constant_value(stride[i]) for i in range(ndim))
+        pad = tuple(tensor.get_scalar_constant_value(pad[i]) for i in range(ndim))
     except tensor.NotScalarConstantError:
         msg = ("Pool with tensor variable for the window size, stride or "
                "padding is only supported in the new GPU backend, so this op "
@@ -1901,65 +1899,96 @@ def _check_constant_args_pool(ws, stride, pad, node):
         elif config.assert_no_cpu_op == "raise":
             raise AssertionError(msg)
         return None
-    ws = (ws_w, ws_h)
-    stride = (stride_w, stride_h)
-    pad = (pad_w, pad_h)
     return ws, stride, pad
 
 
 @register_opt()
 @local_optimizer([pool.Pool])
 def local_gpu_downsample_factor_max(node):
-    if isinstance(node.op, pool.Pool):
-        assert node.op.__props__ == ('ignore_border', 'mode')
+    if (isinstance(node.op, pool.Pool)):
+        assert node.op.__props__ == ('ignore_border', 'mode', 'ndim')
         x, ws, stride, pad = node.inputs
-        ret = _check_constant_args_pool(ws, stride, pad, node)
+        nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
+        ret = _check_constant_args_pool(nd, ws, stride, pad, node)
         if ret is None:
             return
         ws, stride, pad = ret
-        if pad != (0, 0) or node.op.mode != 'max' or stride != ws:
+        if (nd != 2 or max(pad) != 0 or node.op.mode != 'max' or
+                stride != ws):
            return
        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
-            gpu_ds = GpuDownsampleFactorMax(ws, node.op.ignore_border)
-            return [host_from_gpu(gpu_ds(x.owner.inputs[0]))]
+            gpu_ws = GpuDownsampleFactorMax(ws, node.op.ignore_border)
+            if node.inputs[0].ndim == 4:
+                return [host_from_gpu(gpu_ws(x.owner.inputs[0]))]
+            else:
+                input_4D = pad_dims(x.owner.inputs[0], 2, 2)
+                output_4D = gpu_ws(input_4D)
+                output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
+                return [host_from_gpu(output)]
 
 
 @register_opt()
 @local_optimizer([pool.MaxPoolGrad])
 def local_gpu_downsample_factor_max_grad(node):
-    if isinstance(node.op, pool.MaxPoolGrad):
-        assert node.op.__props__ == ('ignore_border', 'mode')
+    if (isinstance(node.op, pool.MaxPoolGrad)):
+        assert node.op.__props__ == ('ignore_border', 'mode', 'ndim')
         x, z, gz, ws, stride, pad = node.inputs
-        ret = _check_constant_args_pool(ws, stride, pad, node)
+        nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
+        ret = _check_constant_args_pool(nd, ws, stride, pad, node)
         if ret is None:
             return
         ws, stride, pad = ret
-        if pad != (0, 0) or node.op.mode != 'max' or stride != ws:
+        if (nd != 2 or max(pad) != 0 or node.op.mode != 'max' or
+                stride != ws):
            return
        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
-            gpu_ds_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border)
-            return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0],
-                                              as_cuda_ndarray_variable(z),
-                                              as_cuda_ndarray_variable(gz)))]
+            gpu_ws_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border)
+            if node.inputs[0].ndim == 4:
+                return [host_from_gpu(gpu_ws_grad(x.owner.inputs[0],
+                                                  as_cuda_ndarray_variable(z),
+                                                  as_cuda_ndarray_variable(gz)))]
+            else:
+                x_4D = pad_dims(x.owner.inputs[0], 2, 2)
+                z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
+                gz_4D = pad_dims(as_cuda_ndarray_variable(gz), 2, 2)
+                output_4D = gpu_ws_grad(x_4D, z_4D, gz_4D)
+                output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
+                return [host_from_gpu(output)]
 
 
 @register_opt()
 @local_optimizer([pool.DownsampleFactorMaxGradGrad])
 def local_gpu_downsample_factor_max_grad_grad(node):
     if isinstance(node.op, pool.DownsampleFactorMaxGradGrad):
-        assert node.op.__props__ == ('ignore_border', 'mode')
+        assert node.op.__props__ == ('ignore_border', 'mode', 'ndim')
         x, z, gx, ws, stride, pad = node.inputs
-        ret = _check_constant_args_pool(ws, stride, pad, node)
+        nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
+        ret = _check_constant_args_pool(nd, ws, stride, pad, node)
        if ret is None:
            return
        ws, stride, pad = ret
-        if pad != (0, 0) or node.op.mode != 'max' or stride != ws:
+        if (nd != 2 or max(pad) != 0 or node.op.mode != 'max' or
+                stride != ws):
            return
        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
            op = GpuDownsampleFactorMaxGradGrad(ws, node.op.ignore_border)
-            return [host_from_gpu(op(x.owner.inputs[0],
-                                     as_cuda_ndarray_variable(z),
-                                     as_cuda_ndarray_variable(gx)))]
+            if node.inputs[0].ndim == 4:
+                return [host_from_gpu(op(x.owner.inputs[0],
+                                         as_cuda_ndarray_variable(z),
+                                         as_cuda_ndarray_variable(gx)))]
+            else:
+                x_4D = pad_dims(x.owner.inputs[0], 2, 2)
+                z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
+                gx_4D = pad_dims(as_cuda_ndarray_variable(gx), 2, 2)
+                output_4D = op(x_4D, z_4D, gx_4D)
+                output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
+                return [host_from_gpu(output)]
 
 
 @register_opt()
theano/sandbox/cuda/opt_util.py
@@ -3,13 +3,13 @@ from functools import wraps
 import numpy
 
-from theano import scalar as scal, Constant
+from theano import tensor, scalar as scal, Constant
 from theano.gof import local_optimizer
 from theano.tensor import (DimShuffle, get_scalar_constant_value,
                            NotScalarConstantError)
 
 from theano.sandbox.cuda.basic_ops import (
-    GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise)
+    GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise, GpuReshape)
 
 _one = scal.constant(numpy.asarray(1.0, dtype='float32'))
@@ -126,3 +126,76 @@ def output_merge(cls, alpha_in, beta_in, out_in):
             return maker(targ, *inputs)
         return opt
     return wrapper
+
+
+def pad_dims(input, leftdims, rightdims):
+    """Reshapes the input to a (leftdims + rightdims) tensor
+
+    This helper function is used to convert pooling inputs with arbitrary
+    non-pooling dimensions to the correct number of dimensions for the
+    GPU pooling ops.
+
+    This reduces or expands the number of dimensions of the input to
+    exactly `leftdims`, by adding extra dimensions on the left or by
+    combining some existing dimensions on the left of the input.
+
+    Use `unpad_dims` to reshape back to the original dimensions.
+
+    Examples
+    --------
+    Given input of shape (3, 5, 7), ``pad_dims(input, 2, 2)``
+    adds a singleton dimension and reshapes to (1, 3, 5, 7).
+
+    Given that output from pad_dims, ``unpad_dims(output, input, 2, 2)``
+    reshapes back to (3, 5, 7).
+
+    Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 2)``
+    does not reshape and returns output with shape (3, 5, 7, 9).
+
+    Given input of shape (3, 5, 7, 9, 11), ``pad_dims(input, 2, 2)``
+    combines the first two dimensions and reshapes to (15, 7, 9, 11).
+
+    Given input of shape (3, 5, 7, 9), ``pad_dims(input, 2, 3)``
+    adds a singleton dimension and reshapes to (1, 3, 5, 7, 9).
+    """
+    assert input.ndim >= rightdims
+
+    if input.ndim == (leftdims + rightdims):
+        return input
+
+    # extract image dimensions
+    img_shape = input.shape[-rightdims:]
+
+    non_pool_ndim = input.ndim - rightdims
+    if non_pool_ndim < leftdims:
+        # too few dimensions, pad on the left
+        dummy_dims = tensor.as_tensor([1] * (leftdims - non_pool_ndim))
+        new_shape = tensor.join(0, dummy_dims,
+                                input.shape[:non_pool_ndim],
+                                img_shape)
+    else:
+        # too many dimensions, combine the leading dimensions
+        batched_ndim = non_pool_ndim - leftdims + 1
+        batch_size = tensor.prod(input.shape[:batched_ndim])
+        # convert to a vector for tensor.join
+        batch_size = tensor.shape_padright(batch_size, 1)
+        new_shape = tensor.join(0, batch_size,
+                                input.shape[batched_ndim:non_pool_ndim],
+                                img_shape)
+
+    # store in the required shape
+    new_shape = tensor.cast(new_shape, 'int64')
+    input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
+    return input_ND
+
+
+def unpad_dims(output, input, leftdims, rightdims):
+    """Reshapes the output after pad_dims.
+
+    This reverts the padding by `pad_dims`.
+    """
+    if output.ndim == input.ndim:
+        return output
+
+    # restore the output to the original shape
+    outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
+    return GpuReshape(input.ndim)(output, outshp)
theano/sandbox/cuda/tests/test_blas.py
@@ -326,7 +326,9 @@ if 0:
 def test_downsample():
-    shps = [(1, 1, 1, 12),
+    shps = [(1, 12),
+            (1, 1, 12),
+            (1, 1, 1, 12),
             (1, 1, 2, 2),
             (1, 1, 1, 1),
             (1, 1, 4, 4),
@@ -359,17 +361,17 @@ def test_downsample():
     for shp in shps:
         for ds in (2, 2), (3, 2), (1, 1):
-            if ds[0] > shp[2]:
+            if ds[0] > shp[-2]:
                 continue
-            if ds[1] > shp[3]:
+            if ds[1] > shp[-1]:
                 continue
             # GpuDownsampleFactorMax doesn't like having more than 512 columns
             # in the output tensor.
-            if float(shp[3]) / ds[1] > 512:
+            if float(shp[-1]) / ds[1] > 512:
                 continue
             for ignore_border in (True, False):
                 # print 'test_downsample', shp, ds, ignore_border
-                ds_op = Pool(ignore_border=ignore_border)
+                ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)
                 a = tcn.shared_constructor(my_rand(*shp), 'a')
                 f = pfunc([], ds_op(tensor.as_tensor_variable(a), ds),
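Following the pattern in the updated test, the CPU Pool op can now be built with an explicit `ndim`, for example a 3D window over the last three axes of a 5D tensor. This is a hypothetical sketch based on the constructor call shown in the test; the 3D behaviour of the CPU op is assumed from the PR title rather than shown in the collapsed pool.py diff above.

    import numpy
    import theano
    from theano import tensor
    from theano.tensor.signal.pool import Pool

    a = theano.shared(numpy.random.rand(1, 1, 4, 6, 8).astype('float32'), 'a')
    ds = (2, 2, 2)                                   # 3D pooling window
    ds_op = Pool(ndim=len(ds), ignore_border=True)   # same constructor pattern as test_downsample
    f = theano.function([], ds_op(tensor.as_tensor_variable(a), ds))
    print(f().shape)                                 # expected (1, 1, 2, 3, 4)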
theano/sandbox/cuda/tests/test_dnn.py
(diff collapsed)

theano/tensor/signal/pool.py
(diff collapsed)

theano/tensor/signal/tests/test_pool.py
(diff collapsed)