theano.tensor.signal.Pool with 3D support.

a16e91f7 · Gijs van Tulder · 172e699c · a16e91f7 · a16e91f7 · a16e91f7
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -35,7 +35,7 @@ from .nnet import GpuSoftmax
 from .opt import (gpu_seqopt, register_opt,
                  op_lifter, register_opt2)

-from .opt_util import alpha_merge, output_merge, inplace_allocempty
+from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims

 from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER

@@ -1253,7 +1253,7 @@ class GpuDnnPoolGrad(DnnBase):
        return [shape[0]]


-def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
+def dnn_pool(img, ws, stride=None, mode='max', pad=None):
    """
    GPU pooling using cuDNN from NVIDIA.

@@ -1267,13 +1267,13 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
    img
        Images to do the pooling over.
    ws : tuple
-        Subsampling window size.
+        Subsampling window size.  Should have 2 or 3 elements.
    stride : tuple
-        Subsampling stride (default: (1, 1)).
+        Subsampling stride (default: (1, 1) or (1, 1, 1)).
    mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
    pad : tuple
        (padX, padY) or (padX, padY, padZ)
-        default: (0, 0)
+        default: (0, 0) or (0, 0, 0)

    .. warning:: The cuDNN library only works with GPU that have a compute
        capability of 3.0 or higer.  This means that older GPU will not
@@ -1285,6 +1285,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):

    """
    img = gpu_contiguous(img)
+    if stride is None:
+        stride = (1,) * len(ws)
+    if pad is None:
+        pad = (0,) * len(ws)
    if mode == "sum":
        ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad)
        context_name = ret.type.context_name
@@ -1868,9 +1872,18 @@ def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
    if not op.ignore_border:
        return
    img, ws, stride, pad = inputs
-    img = as_gpuarray_variable(img, ctx_name)
+    nd = op.ndim if op.ndim else (img.ndim - 2)
+    if nd not in (2, 3):
+        return
+    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    mode = op.mode
-    return dnn_pool(gpu_contiguous(img), ws, stride=stride, pad=pad, mode=mode)
+    if img.ndim == nd + 2:
+        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
+    else:
+        # reshape to 4D or 5D with 2 non-pooling dimensions
+        img_padded = pad_dims(img, 2, nd)
+        ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
+        return unpad_dims(ret_padded, img, 2, nd)


 @register_opt('cudnn', 'fast_compile')
@@ -1882,17 +1895,33 @@ def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    if not op.ignore_border:
        return
    inp, out, out_grad, ws, stride, pad = inputs
-    inp = as_gpuarray_variable(inp, ctx_name)
-    out = as_gpuarray_variable(out, ctx_name)
-    out_grad = as_gpuarray_variable(out_grad, ctx_name)
+    nd = op.ndim if op.ndim else (inp.ndim - 2)
+    if nd not in (2, 3):
+        return
+    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
+    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
+    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode

-    return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
-                                     gpu_contiguous(out),
-                                     gpu_contiguous(out_grad),
+    if inp.ndim == nd + 2:
+        return GpuDnnPoolGrad(mode=mode)(inp,
+                                         out,
+                                         out_grad,
                                         ws,
                                         stride,
                                         pad)
+    else:
+        # reshape to 4D or 5D with 2 non-pooling dimensions
+        inp_padded = pad_dims(inp, 2, nd)
+        out_padded = pad_dims(out, 2, nd)
+        out_grad_padded = pad_dims(out_grad, 2, nd)
+        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
+                                               out_padded,
+                                               out_grad_padded,
+                                               ws,
+                                               stride,
+                                               pad)
+        return unpad_dims(ret_padded, inp, 2, nd)


 @register_opt('cudnn', 'fast_compile')
@@ -1904,16 +1933,28 @@ def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    if not op.ignore_border:
        return
    inp, out_grad, ws, stride, pad = inputs
-    inp = as_gpuarray_variable(inp, ctx_name)
-    out_grad = as_gpuarray_variable(out_grad, ctx_name)
+    nd = op.ndim if op.ndim else (inp.ndim - 2)
+    if nd not in (2, 3):
+        return
+    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
+    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode

-    cg = gpu_contiguous(out_grad)
-
-    # We reuse cg because cuDNN does not use the value of the `out`
+    if inp.ndim == nd + 2:
+        # We reuse out_grad because cuDNN does not use the value of the `out`
        # argument but still checks its shape for average pooling. This
        # has been observed in v2 and v3 as far as I know.
-    return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ws, stride, pad)
+        return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride, pad)
+    else:
+        inp_padded = pad_dims(inp, 2, nd)
+        out_grad_padded = pad_dims(out_grad, 2, nd)
+        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
+                                               out_grad_padded,
+                                               out_grad_padded,
+                                               ws,
+                                               stride,
+                                               pad)
+        return unpad_dims(ret_padded, inp, 2, nd)


 @register_opt('cudnn', 'fast_compile')

--- a/theano/gpuarray/opt_util.py
+++ b/theano/gpuarray/opt_util.py
@@ -3,12 +3,12 @@ from functools import wraps

 import numpy

-from theano import scalar as scal, Constant
+from theano import tensor, scalar as scal, Constant
 from theano.gof import local_optimizer
 from theano.tensor import (DimShuffle, get_scalar_constant_value,
                           NotScalarConstantError)

-from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, gpu_alloc_empty
+from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, GpuReshape, gpu_alloc_empty
 from .elemwise import GpuDimShuffle, GpuElemwise

 _one = scal.constant(numpy.asarray(1.0, dtype='float32'))
@@ -329,3 +329,48 @@ def inplace_allocempty(op, idx):
            return maker(node, inputs)
        return opt
    return wrapper
+
+
+def pad_dims(input, leftdims, rightdims):
+    """Reshape `input` to a tensor with (leftdims + rightdims) dimensions.
+
+    Helper used to give pooling inputs with an arbitrary number of
+    non-pooling dimensions the number of dimensions required by the
+    GPU pooling ops.  The last `rightdims` dimensions are kept as they are.
+
+    All dimensions before them are collapsed into one leading dimension,
+    which is followed by (leftdims - 1) dimensions of size 1.  An input
+    that already has (leftdims + rightdims) dimensions is returned as is.
+    """
+    assert input.ndim >= rightdims
+
+    if input.ndim == (leftdims + rightdims):
+        return input
+
+    # extract image dimensions
+    img_shape = input.shape[-rightdims:]
+
+    # product of all leading dimension sizes, padded to a vector for the join
+    batch_size = tensor.prod(input.shape[:-rightdims])
+    batch_size = tensor.shape_padright(batch_size, 1)
+
+    # store in the required shape, for example as a 4D tensor
+    # with shape: (batch_size,1,height,width)
+    new_shape = tensor.cast(tensor.join(0, batch_size,
+                                        tensor.as_tensor([1] * (leftdims - 1)),
+                                        img_shape), 'int64')
+    input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
+    return input_ND
+
+
+def unpad_dims(output, input, leftdims, rightdims):
+    """Reshape `output` back to `input.ndim` dimensions, reverting `pad_dims`.
+
+    Leading dims are taken from `input`, the last `rightdims` from `output`.
+    """
+    if output.ndim == input.ndim:
+        return output
+
+    # original leading dims + trailing dims of the output; `leftdims` is unused
+    outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
+    return GpuReshape(input.ndim)(output, outshp)
--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -30,7 +30,8 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
 from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
                                      GpuDownsampleFactorMaxGrad)
 from theano.sandbox.cuda.nnet import GpuSoftmax
-from theano.sandbox.cuda.opt_util import alpha_merge, output_merge
+from theano.sandbox.cuda.opt_util import (alpha_merge, output_merge,
+                                          pad_dims, unpad_dims)
 from theano.sandbox.cuda import gpu_seqopt, register_opt

 from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
@@ -1391,20 +1392,23 @@ class GpuDnnPoolDesc(GpuOp):
    def do_constant_folding(self, node):
        return False

-    def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)):
+    def __init__(self, ws=(1, 1), stride=None, mode='max', pad=None):
        if mode == 'average':
            mode = 'average_inc_pad'
        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
        self.mode = mode

+        if stride is None:
+            stride = (1,) * len(ws)
+        if pad is None:
+            pad = (0,) * len(ws)
+
        assert len(ws) == len(stride) and len(stride) == len(pad)
        assert len(ws) in (2, 3)
        self.ws = ws
        self.stride = stride
        self.pad = pad

-        if (pad[0] != 0 or pad[1] != 0) and version() == -1:
-            raise RuntimeError("cuDNN pooling with padding requires cuDNN v2")
        if self.get_ndim() == 3 and version() < (3000, 3000):
            raise RuntimeError("cuDNN 3d pooling requires cuDNN v3")
        if (mode == 'average_exc_pad' and max(pad) > 0 and
@@ -1418,12 +1422,9 @@ class GpuDnnPoolDesc(GpuOp):
    def __setstate__(self, d):
        self.__dict__.update(d)
        if not hasattr(self, 'pad'):
-            self.pad = (0, 0)
+            self.pad = (0,) * self.get_ndim()  # state lacks `pad`: no padding

    def make_node(self):
-        if self.pad != (0, 0) and version() == -1:
-            raise RuntimeError("cuDNN pooling with padding requires cuDNN v2")
-
        node = Apply(self, [],
                     [CDataType("cudnnPoolingDescriptor_t",
                                freefunc="cudnnDestroyPoolingDescriptor")()])
@@ -1444,8 +1445,6 @@ class GpuDnnPoolDesc(GpuOp):
            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
        elif self.mode == "average_exc_pad":
            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
-            if version() == -1:
-                raise Exception("cudnn v1 do not support average_exc_pad")
        else:
            raise NotImplementedError("Unsupported pooling model.")

@@ -1616,8 +1615,6 @@ if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
        elif self.mode == "average_exc_pad":
            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
-            if version() == -1:
-                raise Exception("cudnn v1 do not support average_exc_pad")
        else:
            raise NotImplementedError("Unsupported pooling model.")

@@ -1872,8 +1869,6 @@ if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
        elif self.mode == "average_exc_pad":
            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
-            if version() == -1:
-                raise Exception("cudnn v1 do not support average_exc_pad")
        else:
            raise NotImplementedError("Unsupported pooling model.")

@@ -1976,28 +1971,33 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
        return [shape[0]]


-def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
+def dnn_pool(img, ws, stride=None, mode='max', pad=None):
    """
    GPU pooling using cuDNN from NVIDIA.

-    The memory layout to use is 'bc01', that is 'batch', 'channel',
-    'first dim', 'second dim' in that order.
+    For 2D pooling, the memory layout to use is 'bc01', that is 'batch',
+    'channel', 'first dim', 'second dim' in that order.
+
+    For 3D pooling, the memory layout to use is 'bc012', that is 'batch',
+    'channel', 'first dim', 'second dim', 'third dim'.

    Parameters
    ----------
    img
        Images to do the pooling over.
    ws
-        Subsampling window size.
+        Subsampling window size.  Should have 2 or 3 elements.
    stride
-        Subsampling stride (default: (1, 1)).
-    mode : {'max', 'average_inc_pad', 'average_exc_pad, 'sum'}
-    pad :
-        (pad_h, pad_w) padding information.
+        Subsampling stride (default: (1, 1) or (1, 1, 1)).
+    mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
+    pad
+        Padding: (pad_h, pad_w) for 2D or (pad_h, pad_w, pad_d) for 3D.
        pad_h is the number of zero-valued pixels added to each of the top and
        bottom borders.
        pad_w is the number of zero-valued pixels added to each of the left
        and right borders.
+        pad_d is the number of zero-valued pixels added to each of the front
+        and back borders (3D pooling only).

    .. warning:: The cuDNN library only works with GPU that have a compute
      capability of 3.0 or higer.  This means that older GPU will not
@@ -2009,6 +2009,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):

    """
    img = gpu_contiguous(img)
+    if stride is None:
+        stride = (1,) * len(ws)
+    if pad is None:
+        pad = (0,) * len(ws)
    if mode == "sum":
        ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad)
        window_elem = theano.tensor.prod(ws).astype(ret.dtype)
@@ -2972,10 +2976,21 @@ if True:
            if not node.op.ignore_border:
                return
            img, ws, stride, pad = node.inputs
+            nd = node.op.ndim if node.op.ndim else (img.ndim - 2)
            mode = node.op.mode
+            if nd not in (2, 3):
+                return
            if (img.owner and isinstance(img.owner.op, HostFromGpu)):
+                if img.ndim == nd + 2:
                    ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]),
                                   ws, stride=stride, pad=pad, mode=mode)
+                else:
+                    input = gpu_contiguous(img.owner.inputs[0])
+                    # reshape to 4D or 5D with 2 non-pooling dimensions
+                    input_padded = pad_dims(input, 2, nd)
+                    ret_padded = dnn_pool(input_padded,
+                                          ws, stride=stride, pad=pad, mode=mode)
+                    ret = unpad_dims(ret_padded, input, 2, nd)
                return [host_from_gpu(ret)]

    @register_opt('cudnn')
@@ -3003,17 +3018,30 @@ if True:
            if not node.op.ignore_border:
                return
            inp, out, inp_grad, ws, stride, pad = node.inputs
+            nd = node.op.ndim if node.op.ndim else (inp.ndim - 2)
            mode = node.op.mode
+            if nd not in (2, 3):
+                return

            if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
                (out.owner and isinstance(out.owner.op, HostFromGpu)) or
                (inp_grad.owner and isinstance(inp_grad.owner.op,
                                               HostFromGpu))):
-
+                if inp.ndim == nd + 2:
                    ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
                                                    gpu_contiguous(out),
                                                    gpu_contiguous(inp_grad),
                                                    ws, stride, pad)
+                else:
+                    # reshape to 4D or 5D with 2 non-pooling dimensions
+                    inp_padded = pad_dims(gpu_contiguous(inp), 2, nd)
+                    out_padded = pad_dims(gpu_contiguous(out), 2, nd)
+                    inp_grad_padded = pad_dims(gpu_contiguous(inp_grad), 2, nd)
+                    ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
+                                                           out_padded,
+                                                           inp_grad_padded,
+                                                           ws, stride, pad)
+                    ret = unpad_dims(ret_padded, inp, 2, nd)
                return [host_from_gpu(ret)]

    @register_opt('cudnn')
@@ -3025,16 +3053,28 @@ if True:
            if not node.op.ignore_border:
                return
            inp, inp_grad, ws, stride, pad = node.inputs
+            nd = node.op.ndim if node.op.ndim else (inp.ndim - 2)
            mode = node.op.mode
+            if nd not in (2, 3):
+                return

            if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
                (inp_grad.owner and isinstance(inp_grad.owner.op,
                                               HostFromGpu))):
+                if inp.ndim == nd + 2:
                    contiguous_inp_grad = gpu_contiguous(inp_grad)
                    ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
                                                    contiguous_inp_grad,
                                                    contiguous_inp_grad,
                                                    ws, stride, pad)
+                else:
+                    inp_padded = pad_dims(gpu_contiguous(inp), 2, nd)
+                    inp_grad_padded = pad_dims(gpu_contiguous(inp_grad), 2, nd)
+                    ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
+                                                           inp_grad_padded,
+                                                           inp_grad_padded,
+                                                           ws, stride, pad)
+                    ret = unpad_dims(ret_padded, inp, 2, nd)
                return [host_from_gpu(ret)]

    @register_opt('cudnn')

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -40,6 +40,7 @@ from theano.sandbox.cuda.basic_ops import (
    GpuSubtensor, GpuAdvancedSubtensor1,
    GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
    GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
+from theano.sandbox.cuda.opt_util import pad_dims, unpad_dims

 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda.blas import (
@@ -1891,15 +1892,12 @@ def local_convtransp3d_gemm(node):
 gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm)


-def _check_constant_args_pool(ws, stride, pad, node):
+def _check_constant_args_pool(ndim, ws, stride, pad, node):
    """Check if the args of pool are constants. Warns if not."""
    try:
-        ws_w = tensor.get_scalar_constant_value(ws[0])
-        ws_h = tensor.get_scalar_constant_value(ws[1])
-        stride_w = tensor.get_scalar_constant_value(stride[0])
-        stride_h = tensor.get_scalar_constant_value(stride[1])
-        pad_w = tensor.get_scalar_constant_value(pad[0])
-        pad_h = tensor.get_scalar_constant_value(pad[1])
+        ws = tuple(tensor.get_scalar_constant_value(ws[i]) for i in range(ndim))
+        stride = tuple(tensor.get_scalar_constant_value(stride[i]) for i in range(ndim))
+        pad = tuple(tensor.get_scalar_constant_value(pad[i]) for i in range(ndim))
    except tensor.NotScalarConstantError:
        msg = ("Pool with tensor variable for the window size, stride or "
               "padding is only supported in the new GPU backend, so this op "
@@ -1909,65 +1907,96 @@ def _check_constant_args_pool(ws, stride, pad, node):
        elif config.assert_no_cpu_op == "raise":
            raise AssertionError(msg)
        return None
-    ws = (ws_w, ws_h)
-    stride = (stride_w, stride_h)
-    pad = (pad_w, pad_h)
    return ws, stride, pad


 @register_opt()
 @local_optimizer([pool.Pool])
 def local_gpu_downsample_factor_max(node):
-    if isinstance(node.op, pool.Pool):
-        assert node.op.__props__ == ('ignore_border', 'mode')
+    if (isinstance(node.op, pool.Pool)):
+        assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
        x, ws, stride, pad = node.inputs
-        ret = _check_constant_args_pool(ws, stride, pad, node)
+        nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
+        ret = _check_constant_args_pool(nd, ws, stride, pad, node)
        if ret is None:
            return
        ws, stride, pad = ret
-        if (pad) != (0, 0) or node.op.mode != 'max' or stride != ws:
+        if (nd != 2 or
+                max(pad) != 0 or  # pad is a node input, not an op attribute
+                node.op.mode != 'max' or
+                stride != ws):
            return
        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
-            gpu_ds = GpuDownsampleFactorMax(ws, node.op.ignore_border)
-            return [host_from_gpu(gpu_ds(x.owner.inputs[0]))]
+            gpu_ws = GpuDownsampleFactorMax(ws, node.op.ignore_border)
+            if node.inputs[0].ndim == 4:
+                return [host_from_gpu(gpu_ws(x.owner.inputs[0]))]
+            else:
+                input_4D = pad_dims(x.owner.inputs[0], 2, 2)
+                output_4D = gpu_ws(input_4D)
+                output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
+                return [host_from_gpu(output)]


 @register_opt()
 @local_optimizer([pool.MaxPoolGrad])
 def local_gpu_downsample_factor_max_grad(node):
-    if isinstance(node.op, pool.MaxPoolGrad):
-        assert node.op.__props__ == ('ignore_border', 'mode')
+    if (isinstance(node.op, pool.MaxPoolGrad)):
+        assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
        x, z, gz, ws, stride, pad = node.inputs
-        ret = _check_constant_args_pool(ws, stride, pad, node)
+        nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
+        ret = _check_constant_args_pool(nd, ws, stride, pad, node)
        if ret is None:
            return
        ws, stride, pad = ret
-        if pad != (0, 0) or node.op.mode != 'max' or stride != ws:
+        if (nd != 2 or
+                max(pad) != 0 or  # pad is a node input, not an op attribute
+                node.op.mode != 'max' or
+                stride != ws):
            return
        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
-            gpu_ds_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border)
-            return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0],
+            gpu_ws_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border)
+            if node.inputs[0].ndim == 4:
+                return [host_from_gpu(gpu_ws_grad(x.owner.inputs[0],
                                                  as_cuda_ndarray_variable(z),
                                                  as_cuda_ndarray_variable(gz)))]
+            else:
+                x_4D = pad_dims(x.owner.inputs[0], 2, 2)
+                z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
+                gz_4D = pad_dims(as_cuda_ndarray_variable(gz), 2, 2)
+                output_4D = gpu_ws_grad(x_4D, z_4D, gz_4D)
+                output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
+                return [host_from_gpu(output)]


 @register_opt()
 @local_optimizer([pool.DownsampleFactorMaxGradGrad])
 def local_gpu_downsample_factor_max_grad_grad(node):
    if isinstance(node.op, pool.DownsampleFactorMaxGradGrad):
-        assert node.op.__props__ == ('ignore_border', 'mode')
+        assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
        x, z, gx, ws, stride, pad = node.inputs
-        ret = _check_constant_args_pool(ws, stride, pad, node)
+        nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
+        ret = _check_constant_args_pool(nd, ws, stride, pad, node)
        if ret is None:
            return
        ws, stride, pad = ret
-        if pad != (0, 0) or node.op.mode != 'max' or stride != ws:
+        if (nd != 2 or
+                max(pad) != 0 or  # pad is a node input, not an op attribute
+                node.op.mode != 'max' or
+                stride != ws):
            return
        if (x.owner and isinstance(x.owner.op, HostFromGpu)):
            op = GpuDownsampleFactorMaxGradGrad(ws, node.op.ignore_border)
+            if node.inputs[0].ndim == 4:
                return [host_from_gpu(op(x.owner.inputs[0],
                                         as_cuda_ndarray_variable(z),
                                         as_cuda_ndarray_variable(gx)))]
+            else:
+                x_4D = pad_dims(x.owner.inputs[0], 2, 2)
+                z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
+                gx_4D = pad_dims(as_cuda_ndarray_variable(gx), 2, 2)
+                output_4D = op(x_4D, z_4D, gx_4D)
+                output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
+                return [host_from_gpu(output)]


 @register_opt()

--- a/theano/sandbox/cuda/opt_util.py
+++ b/theano/sandbox/cuda/opt_util.py
@@ -3,13 +3,13 @@ from functools import wraps

 import numpy

-from theano import scalar as scal, Constant
+from theano import tensor, scalar as scal, Constant
 from theano.gof import local_optimizer
 from theano.tensor import (DimShuffle, get_scalar_constant_value,
                           NotScalarConstantError)

 from theano.sandbox.cuda.basic_ops import (
-    GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise)
+    GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise, GpuReshape)

 _one = scal.constant(numpy.asarray(1.0, dtype='float32'))

@@ -126,3 +126,48 @@ def output_merge(cls, alpha_in, beta_in, out_in):
                return maker(targ, *inputs)
        return opt
    return wrapper
+
+
+def pad_dims(input, leftdims, rightdims):
+    """Reshape `input` to a tensor with (leftdims + rightdims) dimensions.
+
+    Helper used to give pooling inputs with an arbitrary number of
+    non-pooling dimensions the number of dimensions required by the
+    GPU pooling ops.  The last `rightdims` dimensions are kept as they are.
+
+    All dimensions before them are collapsed into one leading dimension,
+    which is followed by (leftdims - 1) dimensions of size 1.  An input
+    that already has (leftdims + rightdims) dimensions is returned as is.
+    """
+    assert input.ndim >= rightdims
+
+    if input.ndim == (leftdims + rightdims):
+        return input
+
+    # extract image dimensions
+    img_shape = input.shape[-rightdims:]
+
+    # product of all leading dimension sizes, padded to a vector for the join
+    batch_size = tensor.prod(input.shape[:-rightdims])
+    batch_size = tensor.shape_padright(batch_size, 1)
+
+    # store in the required shape, for example as a 4D tensor
+    # with shape: (batch_size,1,height,width)
+    new_shape = tensor.cast(tensor.join(0, batch_size,
+                                        tensor.as_tensor([1] * (leftdims - 1)),
+                                        img_shape), 'int64')
+    input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
+    return input_ND
+
+
+def unpad_dims(output, input, leftdims, rightdims):
+    """Reshape `output` back to `input.ndim` dimensions, reverting `pad_dims`.
+
+    Leading dims are taken from `input`, the last `rightdims` from `output`.
+    """
+    if output.ndim == input.ndim:
+        return output
+
+    # original leading dims + trailing dims of the output; `leftdims` is unused
+    outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
+    return GpuReshape(input.ndim)(output, outshp)
--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
@@ -326,7 +326,9 @@ if 0:


 def test_downsample():
-    shps = [(1, 1, 1, 12),
+    shps = [(1, 12),
+            (1, 1, 12),
+            (1, 1, 1, 12),
            (1, 1, 2, 2),
            (1, 1, 1, 1),
            (1, 1, 4, 4),
@@ -359,17 +361,17 @@ def test_downsample():

    for shp in shps:
        for ds in (2, 2), (3, 2), (1, 1):
-            if ds[0] > shp[2]:
+            if ds[0] > shp[-2]:
                continue
-            if ds[1] > shp[3]:
+            if ds[1] > shp[-1]:
                continue
            # GpuDownsampleFactorMax doesn't like having more than 512 columns
            # in the output tensor.
-            if float(shp[3]) / ds[1] > 512:
+            if float(shp[-1]) / ds[1] > 512:
                continue
            for ignore_border in (True, False):
                # print 'test_downsample', shp, ds, ignore_border
-                ds_op = Pool(ignore_border=ignore_border)
+                ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)

                a = tcn.shared_constructor(my_rand(*shp), 'a')
                f = pfunc([], ds_op(tensor.as_tensor_variable(a), ds),

--- a/theano/sandbox/cuda/tests/test_dnn.py
+++ b/theano/sandbox/cuda/tests/test_dnn.py
@@ -15,8 +15,8 @@ import theano
 import theano.tensor as T
 import theano.tests.unittest_tools as utt
 from theano.sandbox.neighbours import images2neibs
-from theano.tensor.signal.pool import pool_2d
-from theano.tensor.signal.pool import MaxPoolGrad, AveragePoolGrad
+from theano.tensor.signal.pool import pool_2d, pool_3d
+from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
 import theano.sandbox.cuda.dnn as dnn
 from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
 from theano.sandbox.cuda import float32_shared_constructor as shared
@@ -170,7 +170,7 @@ def test_dnn_conv_inplace():


 def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0),
-             pool_func=T.max, mode='ignore_borders'):
+             pool_function=T.max, mode='ignore_borders'):
    if strides is None:
        strides = ds

@@ -179,13 +179,13 @@ def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0),
    # resahpe to B, C*0, 1, 2 and do the pooling on 1, 2
    first = input.reshape((shape[0], shape[1] * shape[2], shape[3], shape[4]))
    pooled1 = pool_2d_i2n(first, ds=ds[1:], strides=strides[1:], pad=pad[1:],
-                          pool_function=pool_func, mode=mode)
+                          pool_function=pool_function, mode=mode)

    shp1 = pooled1.shape
    # reshape to B, C, 0, 1*2 and do the pooling on 0
    second = pooled1.reshape((shape[0], shape[1], shape[2], shp1[2] * shp1[3]))
    pooled2 = pool_2d_i2n(second, ds=(ds[0], 1), strides=(strides[0], 1),
-                          pad=(pad[0], 0), pool_function=pool_func, mode=mode)
+                          pad=(pad[0], 0), pool_function=pool_function, mode=mode)
    shp2 = pooled2.shape
    return pooled2.reshape((shape[0], shape[1], shp2[2], shp1[2], shp1[3]))

@@ -241,8 +241,6 @@ def test_pooling():
            func = T.max
        else:
            func = T.mean
-        if pad != (0, 0) and cuda.dnn.version() == -1:
-            continue

        if pad != (0, 0) and func is T.mean:
            continue
@@ -418,6 +416,7 @@ def test_pooling3d():
    if not cuda.dnn.dnn_available() or cuda.dnn.version() < (3000, 3000):
        raise SkipTest(cuda.dnn.dnn_available.msg)

+    # We force the FAST_RUN as we don't want the reference to run in DebugMode.
    mode_without_gpu_ref = theano.compile.mode.get_mode(
        'FAST_RUN').excluding('gpu')

@@ -427,8 +426,7 @@ def test_pooling3d():
    else:
        modes = ('max', 'average_inc_pad', 'average_exc_pad')

-    x = T.TensorType(broadcastable=(False, False, False, False, False),
-                     dtype='float32')()
+    x = T.ftensor5()
    for mode, pad in product(modes,
                             ((0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1),
                              (2, 3, 2), (3, 2, 2), (2, 2, 3))):
@@ -436,8 +434,6 @@ def test_pooling3d():
            func = T.max
        else:
            func = T.mean
-        if pad != (0, 0, 0) and cuda.dnn.version() == -1:
-            continue

        if pad != (0, 0, 0) and func is T.mean:
            continue
@@ -449,13 +445,13 @@ def test_pooling3d():
                if pad[0] > stride or pad[1] > stride or pad[2] > stride:
                    # Not implemented
                    continue
-                out1 = cuda.dnn.dnn_pool(x, (ws, ws, ws),
-                                         stride=(stride, stride, stride),
-                                         pad=pad, mode=mode)
-                out2 = pool3d2d(x, ds=(ws, ws, ws),
-                                strides=(stride, stride, stride),
-                                pad=pad, pool_func=func)
-
+                out1 = pool_3d(x, (ws, ws, ws),
+                               st=(stride, stride, stride),
+                               ignore_border=True,
+                               padding=pad, mode=mode)
+                out2 = pool3d2d(x, ds=(ws, ws, ws), strides=(stride, stride, stride),
+                                pad=pad,
+                                pool_function=func)
                f1 = theano.function([x], out1, mode=mode_with_gpu)
                assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
                            for node in f1.maker.fgraph.apply_nodes])
@@ -510,11 +506,17 @@ def test_pooling3d():
            g_out = fg(data)

            # Compare again the CPU result
-            out = pool3d2d(x, (ws, ws, ws),
-                           strides=(stride, stride, stride),
-                           pad=pad, pool_func=func)
+            out = pool_3d(x, (ws, ws, ws),
+                          padding=pad,
+                          ignore_border=True, mode=mode)
            fc = theano.function([x], theano.grad(out.sum(), x),
                                 mode=mode_without_gpu_ref)
+            if mode == 'max':
+                assert any([isinstance(node.op, MaxPoolGrad)
+                            for node in fc.maker.fgraph.toposort()])
+            else:
+                assert any([isinstance(node.op, AveragePoolGrad)
+                            for node in fc.maker.fgraph.toposort()])
            c_out = fc(data)
            utt.assert_allclose(c_out, g_out)

@@ -523,6 +525,7 @@ def test_pooling_opt():
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)

+    # 2D pooling
    x = T.fmatrix()

    f = theano.function(
@@ -535,6 +538,7 @@ def test_pooling_opt():

    f(numpy.zeros((10, 10), dtype='float32'))

+    # gradient of 2D pooling
    f = theano.function(
        [x],
        T.grad(pool_2d(x, ds=(2, 2), mode='average_inc_pad',
@@ -545,6 +549,7 @@ def test_pooling_opt():
                for n in f.maker.fgraph.toposort()])

    f(numpy.zeros((10, 10), dtype='float32'))
+
    # Test sum pooling
    f = theano.function(
        [x],
@@ -557,6 +562,82 @@ def test_pooling_opt():
    data = numpy.random.rand(10, 10).astype('float32')
    f(data)

+    # 3D pooling
+    x = T.ftensor3()
+
+    f = theano.function(
+        [x],
+        pool_3d(x, ds=(2, 2, 2), mode='average_inc_pad', ignore_border=True),
+        mode=mode_with_gpu)
+
+    assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
+                for n in f.maker.fgraph.toposort()])
+
+    f(numpy.zeros((10, 10, 10), dtype='float32'))
+
+    # gradient of 3D pooling
+    f = theano.function(
+        [x],
+        T.grad(pool_3d(x, ds=(2, 2, 2), mode='average_inc_pad',
+                       ignore_border=True).sum(), x),
+        mode=mode_with_gpu.including("cudnn"))
+
+    assert any([isinstance(n.op, cuda.dnn.GpuDnnPoolGrad)
+                for n in f.maker.fgraph.toposort()])
+
+    f(numpy.zeros((10, 10, 10), dtype='float32'))
+
+
+def test_pooling_opt_arbitrary_dimensions():
+    # Check that an input with an arbitrary number of non-pooling dimensions
+    # is pooled on the GPU (cuDNN) with the same result as the CPU Pool op.
+
+    if not cuda.dnn.dnn_available():
+        raise SkipTest(cuda.dnn.dnn_available.msg)
+
+    # 'average_exc_pad' is disabled for versions < 4004
+    if cuda.dnn.version() < (4004, 4004):
+        modes = ('max', 'average_inc_pad')
+    else:
+        modes = ('max', 'average_inc_pad', 'average_exc_pad')
+
+    for n_non_pool_dims in (0, 1, 2, 3):
+        for ws in ((2, 2), (3, 3, 3)):
+            # create input shape: non-pooling dimensions
+            # followed by 2 or 3 pooling dimensions
+            shp = (2,) * n_non_pool_dims + (5,) * len(ws)
+            data = numpy.random.normal(0, 1, shp).astype('float32')
+            input = shared(data)
+
+            for mode in modes:
+                out_pool = Pool(ndim=len(ws), mode=mode, ignore_border=True)(input, ws)
+                out_pool_grad = T.grad(T.sum(out_pool), wrt=input)
+                out = [out_pool, out_pool_grad]
+
+                # run on GPU: both the pooling and its gradient must use cuDNN
+                fg = theano.function([], out, mode=mode_with_gpu)
+                assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
+                           for node in fg.maker.fgraph.toposort()])
+                assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
+                           for node in fg.maker.fgraph.toposort()])
+                res_gpu = fg()
+
+                # run on CPU to obtain the reference result
+                fc = theano.function([], out, mode=mode_without_gpu)
+                assert any([isinstance(node.op, Pool)
+                           for node in fc.maker.fgraph.toposort()])
+                if mode == 'max':
+                    assert any([isinstance(node.op, MaxPoolGrad)
+                               for node in fc.maker.fgraph.toposort()])
+                else:
+                    assert any([isinstance(node.op, AveragePoolGrad)
+                               for node in fc.maker.fgraph.toposort()])
+                res_cpu = fc()
+
+                # compare GPU output and gradient against the CPU reference
+                utt.assert_allclose(res_gpu[0], res_cpu[0])
+                utt.assert_allclose(res_gpu[1], res_cpu[1])
+

 class test_DnnSoftMax(test_nnet.test_SoftMax):
    gpu_op = dnn.GpuDnnSoftmax

--- a/theano/tensor/signal/pool.py
+++ b/theano/tensor/signal/pool.py
--- a/theano/tensor/signal/tests/test_pool.py
+++ b/theano/tensor/signal/tests/test_pool.py