Commit a16e91f7 authored by Gijs van Tulder

theano.tensor.signal.Pool with 3D support.

Parent: 172e699c
......@@ -35,7 +35,7 @@ from .nnet import GpuSoftmax
from .opt import (gpu_seqopt, register_opt,
op_lifter, register_opt2)
from .opt_util import alpha_merge, output_merge, inplace_allocempty
from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
......@@ -1253,7 +1253,7 @@ class GpuDnnPoolGrad(DnnBase):
return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
def dnn_pool(img, ws, stride=None, mode='max', pad=None):
"""
GPU pooling using cuDNN from NVIDIA.
......@@ -1267,13 +1267,13 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
img
Images to do the pooling over.
ws : tuple
Subsampling window size.
Subsampling window size. Should have 2 or 3 elements.
stride : tuple
Subsampling stride (default: (1, 1)).
Subsampling stride (default: (1, 1) or (1, 1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
pad : tuple
(padX, padY) or (padX, padY, padZ)
default: (0, 0)
default: (0, 0) or (0, 0, 0)
.. warning:: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
......@@ -1285,6 +1285,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
"""
img = gpu_contiguous(img)
if stride is None:
stride = (1,) * len(ws)
if pad is None:
pad = (0,) * len(ws)
if mode == "sum":
ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad)
context_name = ret.type.context_name
......@@ -1868,9 +1872,18 @@ def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not op.ignore_border:
return
img, ws, stride, pad = inputs
img = as_gpuarray_variable(img, ctx_name)
nd = op.ndim if op.ndim else (img.ndim - 2)
if nd not in (2, 3):
return
img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
mode = op.mode
return dnn_pool(gpu_contiguous(img), ws, stride=stride, pad=pad, mode=mode)
if img.ndim == nd + 2:
return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
img_padded = pad_dims(img, 2, nd)
ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
return unpad_dims(ret_padded, img, 2, nd)
@register_opt('cudnn', 'fast_compile')
......@@ -1882,17 +1895,33 @@ def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not op.ignore_border:
return
inp, out, out_grad, ws, stride, pad = inputs
inp = as_gpuarray_variable(inp, ctx_name)
out = as_gpuarray_variable(out, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name)
nd = op.ndim if op.ndim else (inp.ndim - 2)
if nd not in (2, 3):
return
inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
mode = op.mode
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(out_grad),
ws,
stride,
pad)
if inp.ndim == nd + 2:
return GpuDnnPoolGrad(mode=mode)(inp,
out,
out_grad,
ws,
stride,
pad)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
inp_padded = pad_dims(inp, 2, nd)
out_padded = pad_dims(out, 2, nd)
out_grad_padded = pad_dims(out_grad, 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
out_padded,
out_grad_padded,
ws,
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
@register_opt('cudnn', 'fast_compile')
......@@ -1904,16 +1933,28 @@ def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not op.ignore_border:
return
inp, out_grad, ws, stride, pad = inputs
inp = as_gpuarray_variable(inp, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name)
nd = op.ndim if op.ndim else (inp.ndim - 2)
if nd not in (2, 3):
return
inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
mode = op.mode
cg = gpu_contiguous(out_grad)
# We reuse cg because cuDNN does not use the value of the `out`
# argument but still checks its shape for average pooling. This
# has been observed in v2 and v3 as far as I know.
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ws, stride, pad)
if inp.ndim == nd + 2:
# We reuse out_grad because cuDNN does not use the value of the `out`
# argument but still checks its shape for average pooling. This
# has been observed in v2 and v3 as far as I know.
return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride, pad)
else:
inp_padded = pad_dims(inp, 2, nd)
out_grad_padded = pad_dims(out_grad, 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
out_grad_padded,
out_grad_padded,
ws,
stride,
pad)
return unpad_dims(ret_padded, inp, 2, nd)
@register_opt('cudnn', 'fast_compile')
......
......@@ -3,12 +3,12 @@ from functools import wraps
import numpy
from theano import scalar as scal, Constant
from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, gpu_alloc_empty
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, GpuReshape, gpu_alloc_empty
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
......@@ -329,3 +329,48 @@ def inplace_allocempty(op, idx):
return maker(node, inputs)
return opt
return wrapper
def pad_dims(input, leftdims, rightdims):
    """Reshapes the input to a (leftdims + rightdims) tensor.

    This helper function is used to convert pooling inputs with arbitrary
    non-pooling dimensions to the correct number of dimensions for the
    GPU pooling ops.

    This reduces or expands the number of leading (non-image) dimensions
    to exactly `leftdims`, by adding extra dimensions on the left or by
    combining some existing dimensions on the left of the input.

    Parameters
    ----------
    input
        Symbolic tensor with at least `rightdims` dimensions.
    leftdims : int
        Number of leading dimensions the result should have (e.g. 2 for
        the usual batch and channel axes of the pooling ops).
    rightdims : int
        Number of trailing (image/pooling) dimensions to keep unchanged.
    """
    assert input.ndim >= rightdims
    # already has the target dimensionality: nothing to do
    if input.ndim == (leftdims + rightdims):
        return input
    # extract image dimensions
    img_shape = input.shape[-rightdims:]
    # collapse all leading dimensions into one "batch" size and store it
    # as a 1-element vector so it can be joined with the other shapes
    batch_size = tensor.prod(input.shape[:-rightdims])
    batch_size = tensor.shape_padright(batch_size, 1)
    # store in the required shape, for example as a 4D tensor
    # with shape: (batch_size, 1, height, width)
    new_shape = tensor.cast(tensor.join(0, batch_size,
                                        tensor.as_tensor([1] * (leftdims - 1)),
                                        img_shape), 'int64')
    input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
    return input_ND
def unpad_dims(output, input, leftdims, rightdims):
    """Reshapes the output after pad_dims.

    This reverts the padding by `pad_dims`: the `rightdims` trailing
    dimensions of `output` are kept, and the leading dimensions are
    restored from the original `input`.
    """
    # nothing was padded: output already matches the caller's shape
    if output.ndim == input.ndim:
        return output
    # restore the output to the original shape
    outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
    return GpuReshape(input.ndim)(output, outshp)
......@@ -11,11 +11,12 @@ from six import StringIO
import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox.neighbours import images2neibs
from theano.tensor.signal.pool import pool_2d
from theano.tensor.signal.pool import MaxPoolGrad, AveragePoolGrad
from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from .. import dnn
from ..basic_ops import GpuAllocEmpty
from ..type import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
from . import test_nnet
......@@ -166,6 +167,27 @@ def pool_2d_i2n(input, ds=(2, 2), strides=None,
return pooled_output
def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0),
             pool_function=T.max, mode='ignore_borders'):
    """Reference 3d pooling built from two 2d pooling passes.

    Test helper: emulates pooling over three trailing dimensions of a 5d
    input (batch, channel, dim0, dim1, dim2) by first pooling over
    (dim1, dim2) with channel and dim0 merged, then pooling over dim0
    with (dim1, dim2) merged. When `strides` is None, non-overlapping
    windows of size `ds` are used.
    """
    if strides is None:
        strides = ds
    shape = input.shape
    # reshape to B, C*0, 1, 2 and do the pooling on 1, 2
    first = input.reshape((shape[0], shape[1] * shape[2], shape[3], shape[4]))
    pooled1 = pool_2d_i2n(first, ds=ds[1:], strides=strides[1:], pad=pad[1:],
                          pool_function=pool_function, mode=mode)
    shp1 = pooled1.shape
    # reshape to B, C, 0, 1*2 and do the pooling on 0
    second = pooled1.reshape((shape[0], shape[1], shape[2], shp1[2] * shp1[3]))
    pooled2 = pool_2d_i2n(second, ds=(ds[0], 1), strides=(strides[0], 1),
                          pad=(pad[0], 0), pool_function=pool_function, mode=mode)
    shp2 = pooled2.shape
    # split the merged axes back into the original 5d layout
    return pooled2.reshape((shape[0], shape[1], shp2[2], shp1[2], shp1[3]))
def test_pooling():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
......@@ -178,7 +200,7 @@ def test_pooling():
x = T.ftensor4()
for mode, pad in product(modes,
((0, 0), (1, 0), (1, 0), (2, 3), (3, 2))):
((0, 0), (1, 0), (0, 1), (2, 3), (3, 2))):
if mode == 'max':
func = T.max
else:
......@@ -278,6 +300,119 @@ def test_pooling():
utt.assert_allclose(c_out, g_out)
def test_pooling_3d():
    """Compare cuDNN 3d pooling (forward and grad) against CPU references."""
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    # 'average_exc_pad' is disabled for versions < 4004
    if dnn.version(raises=False) < 4004:
        modes = ('max', 'average_inc_pad')
    else:
        modes = ('max', 'average_inc_pad', 'average_exc_pad')
    x = T.ftensor5()
    for mode, pad in product(modes,
                             ((0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1),
                              (3, 2, 2), (2, 3, 2), (2, 2, 3))):
        if mode == 'max':
            func = T.max
        else:
            func = T.mean
        # skip: the reference comparison below is not run for padded
        # mean pooling
        if pad != (0, 0, 0) and func is T.mean:
            continue
        for ws in (4, 2, 5):
            for stride in (2, 3):
                if stride > ws:
                    continue
                if pad[0] > stride or pad[1] > stride or pad[2] > stride:
                    # Not implemented
                    continue
                # We will check that the opt introduced it.
                out1 = pool_3d(x, (ws, ws, ws),
                               st=(stride, stride, stride),
                               ignore_border=True,
                               padding=pad, mode=mode)
                out2 = pool3d2d(x, ds=(ws, ws, ws), strides=(stride, stride, stride),
                                pad=pad,
                                pool_function=func)
                mode_without_gpu2 = mode_without_gpu.including()
                mode_without_gpu2.check_isfinite = False
                # f1 must contain the cuDNN pool op; the reference f2 must not
                f1 = theano.function([x], out1, mode=mode_with_gpu)
                assert any([isinstance(node.op, dnn.GpuDnnPool)
                            for node in f1.maker.fgraph.apply_nodes])
                f2 = theano.function([x], out2, mode=mode_without_gpu2)
                assert not any([isinstance(node.op, dnn.GpuDnnPool)
                                for node in f2.maker.fgraph.apply_nodes])
                for shp in [(1, 10, 30, 30, 30),
                            (1, 3, 29, 29, 29),
                            (3, 1, 47, 97, 47),
                            ]:
                    data = numpy.random.normal(0, 1, shp).astype("float32")
                    a = f1(data)
                    b = f2(data)
                    utt.assert_allclose(a, b)
        # Test the grad
        for shp in [(1, 1, 2, 2, 2),
                    (1, 1, 3, 3, 3)]:
            data = numpy.random.normal(0, 1, shp).astype("float32") * 10
            ws = 2
            stride = 2
            if pad[0] > stride or pad[1] > stride or pad[2] > stride:
                # Not implemented
                continue
            # This tests the CPU grad + opt + GPU implementation
            def fn(x):
                return pool_3d(x, (ws, ws, ws), ignore_border=True,
                               padding=pad, mode=mode)
            utt.verify_grad(fn, [data],
                            cast_to_output_type=False,
                            mode=mode_with_gpu)
            # Confirm that the opt would have inserted it.
            fg = theano.function([x], theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
            assert any([isinstance(node.op, dnn.GpuDnnPoolGrad)
                        for node in fg.maker.fgraph.toposort()])
            # Test the GPU grad + GPU implementation
            def fn(x):
                dnn_op = dnn.dnn_pool(
                    x, ws=(ws, ws, ws),
                    stride=(stride, stride, stride),
                    pad=pad,
                    mode=mode)
                return dnn_op
            utt.verify_grad(fn, [data],
                            cast_to_output_type=False,
                            mode=mode_with_gpu)
            # Confirm that we get the good op.
            fg = theano.function([x], theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
            assert any([isinstance(node.op, dnn.GpuDnnPoolGrad)
                        for node in fg.maker.fgraph.toposort()])
            g_out = fg(data)
            # Compare against the CPU result
            out = pool_3d(x, (ws, ws, ws),
                          padding=pad,
                          ignore_border=True, mode=mode)
            fc = theano.function([x], theano.grad(out.sum(), x),
                                 mode=mode_without_gpu)
            if mode == 'max':
                assert any([isinstance(node.op, MaxPoolGrad)
                            for node in fc.maker.fgraph.toposort()])
            else:
                assert any([isinstance(node.op, AveragePoolGrad)
                            for node in fc.maker.fgraph.toposort()])
            c_out = fc(data)
            utt.assert_allclose(c_out, g_out)
def test_pooling_with_tensor_vars():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
......@@ -331,6 +466,7 @@ def test_pooling_opt():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
# 2D pooling
x = T.fmatrix()
f = theano.function(
......@@ -344,6 +480,7 @@ def test_pooling_opt():
f(numpy.zeros((10, 10), dtype='float32'))
# gradient of 2D pooling
f = theano.function(
[x],
T.grad(pool_2d(x, ds=(2, 2), mode='average_inc_pad',
......@@ -362,11 +499,86 @@ def test_pooling_opt():
pool_2d(x, ds=(2, 3), mode='sum',
ignore_border=True),
mode=mode_with_gpu)
data = numpy.random.rand(10, 10).astype('float32')
f(data)
# 3D pooling
x = T.ftensor3()
f = theano.function(
[x],
pool_3d(x, ds=(2, 2, 2), mode='average_inc_pad',
ignore_border=True),
mode=mode_with_gpu)
assert any([isinstance(n.op, dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()])
data = numpy.random.rand(10, 10).astype('float32')
f(data)
f(numpy.zeros((10, 10, 10), dtype='float32'))
# gradient of 3D pooling
f = theano.function(
[x],
T.grad(pool_3d(x, ds=(2, 2, 2), mode='average_inc_pad',
ignore_border=True).sum(),
x),
mode=mode_with_gpu.including("cudnn"))
assert any([isinstance(n.op, dnn.GpuDnnPoolGrad)
for n in f.maker.fgraph.toposort()])
f(numpy.zeros((10, 10, 10), dtype='float32'))
def test_pooling_opt_arbitrary_dimensions():
    """Check that pooling inputs with an arbitrary number of non-pooling
    dimensions are correctly reshaped (pad_dims/unpad_dims) to run on
    the GPU, and that the GPU result matches the CPU result."""
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    # 'average_exc_pad' is disabled for versions < 4004
    if dnn.version(raises=False) < 4004:
        modes = ('max', 'average_inc_pad')
    else:
        modes = ('max', 'average_inc_pad', 'average_exc_pad')
    for n_non_pool_dims in (0, 1, 2, 3):
        for ws in ((2, 2), (3, 3, 3)):
            # create input shape: non-pooling dimensions
            # followed by 2 or 3 pooling dimensions
            shp = (2,) * n_non_pool_dims + (5,) * len(ws)
            data = numpy.random.normal(0, 1, shp).astype('float32')
            input = gpuarray_shared_constructor(data)
            for mode in modes:
                out_pool = Pool(ndim=len(ws), mode=mode, ignore_border=True)(input, ws)
                out_pool_grad = T.grad(T.sum(out_pool), wrt=input)
                out = [out_pool, out_pool_grad]
                # run on GPU
                fg = theano.function([], out, mode=mode_with_gpu)
                assert any([isinstance(node.op, dnn.GpuDnnPool)
                            for node in fg.maker.fgraph.toposort()])
                assert any([isinstance(node.op, dnn.GpuDnnPoolGrad)
                            for node in fg.maker.fgraph.toposort()])
                res_gpu = fg()
                # run on CPU
                fc = theano.function([], out, mode=mode_without_gpu)
                assert any([isinstance(node.op, Pool)
                            for node in fc.maker.fgraph.toposort()])
                if mode == 'max':
                    assert any([isinstance(node.op, MaxPoolGrad)
                                for node in fc.maker.fgraph.toposort()])
                else:
                    assert any([isinstance(node.op, AveragePoolGrad)
                                for node in fc.maker.fgraph.toposort()])
                # BUG FIX: this previously called fg() again, so the GPU
                # result was compared against itself and the CPU path was
                # never checked; the CPU function fc must be evaluated.
                res_cpu = fc()
                # check for similarity
                utt.assert_allclose(res_gpu[0], res_cpu[0])
                utt.assert_allclose(res_gpu[1], res_cpu[1])
def test_dnn_tag():
......@@ -625,6 +837,33 @@ class TestDnnInferShapes(utt.InferShapeTester):
dnn.GpuDnnPool
)
def test_pool_3d(self):
    """Infer-shape check of GpuDnnPool on 5d (3d-pooling) inputs."""
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    img = T.ftensor5('img')
    img_val = numpy.asarray(numpy.random.rand(2, 3, 4, 5, 6),
                            dtype='float32')

    # 'average_exc_pad' is disabled for versions < 4004
    if dnn.version(raises=False) < 4004:
        modes = ['max', 'average_inc_pad']
    else:
        modes = ['max', 'average_inc_pad', 'average_exc_pad']

    sizes = [(1, 1, 1), (2, 2, 2), (3, 3, 3)]
    # every combination of window size, stride and pooling mode
    for ws, stride, mode in product(sizes, sizes, modes):
        pooled = dnn.GpuDnnPool(mode=mode)(img, ws, stride, (0, 0, 0))
        self._compile_and_check([img],
                                [pooled],
                                [img_val],
                                dnn.GpuDnnPool)
def test_pool_grad(self):
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
......@@ -664,6 +903,45 @@ class TestDnnInferShapes(utt.InferShapeTester):
dnn.GpuDnnPoolGrad
)
def test_pool_3d_grad(self):
    """Infer-shape check of GpuDnnPoolGrad for 3d pooling."""
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    img = T.ftensor5('img')
    img_grad = T.ftensor5('img_grad')
    out = T.ftensor5('out')

    def make_val():
        # fresh random 5d value matching the symbolic inputs
        return numpy.asarray(numpy.random.rand(2, 3, 4, 5, 6),
                             dtype='float32')

    img_val = make_val()
    img_grad_val = make_val()
    out_val = make_val()

    sizes = [(1, 1, 1), (2, 2, 2), (3, 3, 3)]
    for ws, stride, mode in product(sizes, sizes,
                                    ['max', 'average_inc_pad']):
        pool_grad = dnn.GpuDnnPoolGrad(mode=mode)(
            img,
            out,
            img_grad,
            ws,
            stride,
            (0, 0, 0)
        )
        self._compile_and_check(
            [img, img_grad, out],
            [pool_grad],
            [img_val, img_grad_val, out_val],
            dnn.GpuDnnPoolGrad
        )
# this has been a problem in the past
def test_dnn_conv_border_mode():
......
......@@ -30,7 +30,8 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax
from theano.sandbox.cuda.opt_util import alpha_merge, output_merge
from theano.sandbox.cuda.opt_util import (alpha_merge, output_merge,
pad_dims, unpad_dims)
from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
......@@ -1391,20 +1392,23 @@ class GpuDnnPoolDesc(GpuOp):
def do_constant_folding(self, node):
return False
def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)):
def __init__(self, ws=(1, 1), stride=None, mode='max', pad=None):
if mode == 'average':
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
if stride is None:
stride = (1,) * len(ws)
if pad is None:
pad = (0,) * len(ws)
assert len(ws) == len(stride) and len(stride) == len(pad)
assert len(ws) in (2, 3)
self.ws = ws
self.stride = stride
self.pad = pad
if (pad[0] != 0 or pad[1] != 0) and version() == -1:
raise RuntimeError("cuDNN pooling with padding requires cuDNN v2")
if self.get_ndim() == 3 and version() < (3000, 3000):
raise RuntimeError("cuDNN 3d pooling requires cuDNN v3")
if (mode == 'average_exc_pad' and max(pad) > 0 and
......@@ -1418,12 +1422,9 @@ class GpuDnnPoolDesc(GpuOp):
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'pad'):
self.pad = (0, 0)
self.pad = (0,) * self.get_ndim()
def make_node(self):
if self.pad != (0, 0) and version() == -1:
raise RuntimeError("cuDNN pooling with padding requires cuDNN v2")
node = Apply(self, [],
[CDataType("cudnnPoolingDescriptor_t",
freefunc="cudnnDestroyPoolingDescriptor")()])
......@@ -1444,8 +1445,6 @@ class GpuDnnPoolDesc(GpuOp):
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else:
raise NotImplementedError("Unsupported pooling model.")
......@@ -1616,8 +1615,6 @@ if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else:
raise NotImplementedError("Unsupported pooling model.")
......@@ -1872,8 +1869,6 @@ if (pool%(name)s != NULL) { cudnnDestroyPoolingDescriptor(pool%(name)s); }
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
elif self.mode == "average_exc_pad":
mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
if version() == -1:
raise Exception("cudnn v1 do not support average_exc_pad")
else:
raise NotImplementedError("Unsupported pooling model.")
......@@ -1976,28 +1971,33 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
return [shape[0]]
def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
def dnn_pool(img, ws, stride=None, mode='max', pad=None):
"""
GPU pooling using cuDNN from NVIDIA.
The memory layout to use is 'bc01', that is 'batch', 'channel',
'first dim', 'second dim' in that order.
For 2D pooling, the memory layout to use is 'bc01', that is 'batch',
'channel', 'first dim', 'second dim' in that order.
For 3D pooling, the memory layout to use is 'bc012', that is 'batch',
'channel', 'first dim', 'second dim', 'third dim'.
Parameters
----------
img
Images to do the pooling over.
ws
Subsampling window size.
Subsampling window size. Should have 2 or 3 elements.
stride
Subsampling stride (default: (1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad, 'sum'}
pad :
(pad_h, pad_w) padding information.
Subsampling stride (default: (1, 1) or (1, 1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
pad
Padding: (pad_h, pad_w) for 2D or (pad_h, pad_w, pad_d) for 3D.
pad_h is the number of zero-valued pixels added to each of the top and
bottom borders.
pad_w is the number of zero-valued pixels added to each of the left
and right borders.
pad_d is the number of zero-valued pixels added to each of the front
and back borders (3D pooling only).
.. warning:: The cuDNN library only works with GPU that have a compute
capability of 3.0 or higer. This means that older GPU will not
......@@ -2009,6 +2009,10 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
"""
img = gpu_contiguous(img)
if stride is None:
stride = (1,) * len(ws)
if pad is None:
pad = (0,) * len(ws)
if mode == "sum":
ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad)
window_elem = theano.tensor.prod(ws).astype(ret.dtype)
......@@ -2972,10 +2976,21 @@ if True:
if not node.op.ignore_border:
return
img, ws, stride, pad = node.inputs
nd = node.op.ndim if node.op.ndim else (img.ndim - 2)
mode = node.op.mode
if nd not in (2, 3):
return
if (img.owner and isinstance(img.owner.op, HostFromGpu)):
ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]),
ws, stride=stride, pad=pad, mode=mode)
if img.ndim == nd + 2:
ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]),
ws, stride=stride, pad=pad, mode=mode)
else:
input = gpu_contiguous(img.owner.inputs[0])
# reshape to 4D or 5D with 2 non-pooling dimensions
input_padded = pad_dims(input, 2, nd)
ret_padded = dnn_pool(input_padded,
ws, stride=stride, pad=pad, mode=mode)
ret = unpad_dims(ret_padded, input, 2, nd)
return [host_from_gpu(ret)]
@register_opt('cudnn')
......@@ -3003,17 +3018,30 @@ if True:
if not node.op.ignore_border:
return
inp, out, inp_grad, ws, stride, pad = node.inputs
nd = node.op.ndim if node.op.ndim else (inp.ndim - 2)
mode = node.op.mode
if nd not in (2, 3):
return
if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
(out.owner and isinstance(out.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op,
HostFromGpu))):
ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
ws, stride, pad)
if inp.ndim == nd + 2:
ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out),
gpu_contiguous(inp_grad),
ws, stride, pad)
else:
# reshape to 4D or 5D with 2 non-pooling dimensions
inp_padded = pad_dims(gpu_contiguous(inp), 2, nd)
out_padded = pad_dims(gpu_contiguous(out), 2, nd)
inp_grad_padded = pad_dims(gpu_contiguous(inp_grad), 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
out_padded,
inp_grad_padded,
ws, stride, pad)
ret = unpad_dims(ret_padded, inp, 2, nd)
return [host_from_gpu(ret)]
@register_opt('cudnn')
......@@ -3025,16 +3053,28 @@ if True:
if not node.op.ignore_border:
return
inp, inp_grad, ws, stride, pad = node.inputs
nd = node.op.ndim if node.op.ndim else (inp.ndim - 2)
mode = node.op.mode
if nd not in (2, 3):
return
if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
(inp_grad.owner and isinstance(inp_grad.owner.op,
HostFromGpu))):
contiguous_inp_grad = gpu_contiguous(inp_grad)
ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
contiguous_inp_grad,
contiguous_inp_grad,
ws, stride, pad)
if inp.ndim == nd + 2:
contiguous_inp_grad = gpu_contiguous(inp_grad)
ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
contiguous_inp_grad,
contiguous_inp_grad,
ws, stride, pad)
else:
inp_padded = pad_dims(gpu_contiguous(inp), 2, nd)
inp_grad_padded = pad_dims(gpu_contiguous(inp_grad), 2, nd)
ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
inp_grad_padded,
inp_grad_padded,
ws, stride, pad)
ret = unpad_dims(ret_padded, inp, 2, nd)
return [host_from_gpu(ret)]
@register_opt('cudnn')
......
......@@ -40,6 +40,7 @@ from theano.sandbox.cuda.basic_ops import (
GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
from theano.sandbox.cuda.opt_util import pad_dims, unpad_dims
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (
......@@ -1891,15 +1892,12 @@ def local_convtransp3d_gemm(node):
gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm)
def _check_constant_args_pool(ws, stride, pad, node):
def _check_constant_args_pool(ndim, ws, stride, pad, node):
"""Check if the args of pool are constants. Warns if not."""
try:
ws_w = tensor.get_scalar_constant_value(ws[0])
ws_h = tensor.get_scalar_constant_value(ws[1])
stride_w = tensor.get_scalar_constant_value(stride[0])
stride_h = tensor.get_scalar_constant_value(stride[1])
pad_w = tensor.get_scalar_constant_value(pad[0])
pad_h = tensor.get_scalar_constant_value(pad[1])
ws = tuple(tensor.get_scalar_constant_value(ws[i]) for i in range(ndim))
stride = tuple(tensor.get_scalar_constant_value(stride[i]) for i in range(ndim))
pad = tuple(tensor.get_scalar_constant_value(pad[i]) for i in range(ndim))
except tensor.NotScalarConstantError:
msg = ("Pool with tensor variable for the window size, stride or "
"padding is only supported in the new GPU backend, so this op "
......@@ -1909,65 +1907,96 @@ def _check_constant_args_pool(ws, stride, pad, node):
elif config.assert_no_cpu_op == "raise":
raise AssertionError(msg)
return None
ws = (ws_w, ws_h)
stride = (stride_w, stride_h)
pad = (pad_w, pad_h)
return ws, stride, pad
@register_opt()
@local_optimizer([pool.Pool])
def local_gpu_downsample_factor_max(node):
if isinstance(node.op, pool.Pool):
assert node.op.__props__ == ('ignore_border', 'mode')
if (isinstance(node.op, pool.Pool)):
assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
x, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node)
nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None:
return
ws, stride, pad = ret
if (pad) != (0, 0) or node.op.mode != 'max' or stride != ws:
if (nd != 2 or
max(node.op.padding) != 0 or
node.op.mode != 'max' or
stride != ws):
return
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds = GpuDownsampleFactorMax(ws, node.op.ignore_border)
return [host_from_gpu(gpu_ds(x.owner.inputs[0]))]
gpu_ws = GpuDownsampleFactorMax(ws, node.op.ignore_border)
if node.inputs[0].ndim == 4:
return [host_from_gpu(gpu_ws(x.owner.inputs[0]))]
else:
input_4D = pad_dims(x.owner.inputs[0], 2, 2)
output_4D = gpu_ws(input_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt()
@local_optimizer([pool.MaxPoolGrad])
def local_gpu_downsample_factor_max_grad(node):
if isinstance(node.op, pool.MaxPoolGrad):
assert node.op.__props__ == ('ignore_border', 'mode')
if (isinstance(node.op, pool.MaxPoolGrad)):
assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
x, z, gz, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node)
nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None:
return
ws, stride, pad = ret
if pad != (0, 0) or node.op.mode != 'max' or stride != ws:
if (nd != 2 or
max(node.op.padding) != 0 or
node.op.mode != 'max' or
stride != ws):
return
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_ds_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border)
return [host_from_gpu(gpu_ds_grad(x.owner.inputs[0],
as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gz)))]
gpu_ws_grad = GpuDownsampleFactorMaxGrad(ws, node.op.ignore_border)
if node.inputs[0].ndim == 4:
return [host_from_gpu(gpu_ws_grad(x.owner.inputs[0],
as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gz)))]
else:
x_4D = pad_dims(x.owner.inputs[0], 2, 2)
z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
gz_4D = pad_dims(as_cuda_ndarray_variable(gz), 2, 2)
output_4D = gpu_ws_grad(x_4D, z_4D, gz_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt()
@local_optimizer([pool.DownsampleFactorMaxGradGrad])
def local_gpu_downsample_factor_max_grad_grad(node):
if isinstance(node.op, pool.DownsampleFactorMaxGradGrad):
assert node.op.__props__ == ('ignore_border', 'mode')
assert node.op.__props__ == ('ndim', 'ignore_border', 'mode')
x, z, gx, ws, stride, pad = node.inputs
ret = _check_constant_args_pool(ws, stride, pad, node)
nd = node.op.ndim if node.op.ndim else (x.ndim - 2)
ret = _check_constant_args_pool(nd, ws, stride, pad, node)
if ret is None:
return
ws, stride, pad = ret
if pad != (0, 0) or node.op.mode != 'max' or stride != ws:
if (nd != 2 or
max(node.op.padding) != 0 or
node.op.mode != 'max' or
stride != ws):
return
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
op = GpuDownsampleFactorMaxGradGrad(ws, node.op.ignore_border)
return [host_from_gpu(op(x.owner.inputs[0],
as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gx)))]
if node.inputs[0].ndim == 4:
return [host_from_gpu(op(x.owner.inputs[0],
as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(gx)))]
else:
x_4D = pad_dims(x.owner.inputs[0], 2, 2)
z_4D = pad_dims(as_cuda_ndarray_variable(z), 2, 2)
gx_4D = pad_dims(as_cuda_ndarray_variable(gx), 2, 2)
output_4D = op(x_4D, z_4D, gx_4D)
output = unpad_dims(output_4D, x.owner.inputs[0], 2, 2)
return [host_from_gpu(output)]
@register_opt()
......
......@@ -3,13 +3,13 @@ from functools import wraps
import numpy
from theano import scalar as scal, Constant
from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from theano.sandbox.cuda.basic_ops import (
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise)
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise, GpuReshape)
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
......@@ -126,3 +126,48 @@ def output_merge(cls, alpha_in, beta_in, out_in):
return maker(targ, *inputs)
return opt
return wrapper
def pad_dims(input, leftdims, rightdims):
    """Reshapes the input to a (leftdims + rightdims) tensor.

    This helper function is used to convert pooling inputs with arbitrary
    non-pooling dimensions to the correct number of dimensions for the
    GPU pooling ops.

    This reduces or expands the number of leading (non-image) dimensions
    to exactly `leftdims`, by adding extra dimensions on the left or by
    combining some existing dimensions on the left of the input.

    Parameters
    ----------
    input
        Symbolic tensor with at least `rightdims` dimensions.
    leftdims : int
        Number of leading dimensions the result should have (e.g. 2 for
        the usual batch and channel axes of the pooling ops).
    rightdims : int
        Number of trailing (image/pooling) dimensions to keep unchanged.
    """
    assert input.ndim >= rightdims
    # already has the target dimensionality: nothing to do
    if input.ndim == (leftdims + rightdims):
        return input
    # extract image dimensions
    img_shape = input.shape[-rightdims:]
    # collapse all leading dimensions into one "batch" size and store it
    # as a 1-element vector so it can be joined with the other shapes
    batch_size = tensor.prod(input.shape[:-rightdims])
    batch_size = tensor.shape_padright(batch_size, 1)
    # store in the required shape, for example as a 4D tensor
    # with shape: (batch_size, 1, height, width)
    new_shape = tensor.cast(tensor.join(0, batch_size,
                                        tensor.as_tensor([1] * (leftdims - 1)),
                                        img_shape), 'int64')
    input_ND = GpuReshape(leftdims + rightdims)(input, new_shape)
    return input_ND
def unpad_dims(output, input, leftdims, rightdims):
    """Reshapes the output after pad_dims.

    This reverts the padding by `pad_dims`: the `rightdims` trailing
    dimensions of `output` are kept, and the leading dimensions are
    restored from the original `input`.
    """
    # nothing was padded: output already matches the caller's shape
    if output.ndim == input.ndim:
        return output
    # restore the output to the original shape
    outshp = tensor.join(0, input.shape[:-rightdims], output.shape[-rightdims:])
    return GpuReshape(input.ndim)(output, outshp)
......@@ -326,7 +326,9 @@ if 0:
def test_downsample():
shps = [(1, 1, 1, 12),
shps = [(1, 12),
(1, 1, 12),
(1, 1, 1, 12),
(1, 1, 2, 2),
(1, 1, 1, 1),
(1, 1, 4, 4),
......@@ -359,17 +361,17 @@ def test_downsample():
for shp in shps:
for ds in (2, 2), (3, 2), (1, 1):
if ds[0] > shp[2]:
if ds[0] > shp[-2]:
continue
if ds[1] > shp[3]:
if ds[1] > shp[-1]:
continue
# GpuDownsampleFactorMax doesn't like having more than 512 columns
# in the output tensor.
if float(shp[3]) / ds[1] > 512:
if float(shp[-1]) / ds[1] > 512:
continue
for ignore_border in (True, False):
# print 'test_downsample', shp, ds, ignore_border
ds_op = Pool(ignore_border=ignore_border)
ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)
a = tcn.shared_constructor(my_rand(*shp), 'a')
f = pfunc([], ds_op(tensor.as_tensor_variable(a), ds),
......
......@@ -15,8 +15,8 @@ import theano
import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox.neighbours import images2neibs
from theano.tensor.signal.pool import pool_2d
from theano.tensor.signal.pool import MaxPoolGrad, AveragePoolGrad
from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
......@@ -170,7 +170,7 @@ def test_dnn_conv_inplace():
def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0),
pool_func=T.max, mode='ignore_borders'):
pool_function=T.max, mode='ignore_borders'):
if strides is None:
strides = ds
......@@ -179,13 +179,13 @@ def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0),
# resahpe to B, C*0, 1, 2 and do the pooling on 1, 2
first = input.reshape((shape[0], shape[1] * shape[2], shape[3], shape[4]))
pooled1 = pool_2d_i2n(first, ds=ds[1:], strides=strides[1:], pad=pad[1:],
pool_function=pool_func, mode=mode)
pool_function=pool_function, mode=mode)
shp1 = pooled1.shape
# reshape to B, C, 0, 1*2 and do the pooling on 0
second = pooled1.reshape((shape[0], shape[1], shape[2], shp1[2] * shp1[3]))
pooled2 = pool_2d_i2n(second, ds=(ds[0], 1), strides=(strides[0], 1),
pad=(pad[0], 0), pool_function=pool_func, mode=mode)
pad=(pad[0], 0), pool_function=pool_function, mode=mode)
shp2 = pooled2.shape
return pooled2.reshape((shape[0], shape[1], shp2[2], shp1[2], shp1[3]))
......@@ -241,8 +241,6 @@ def test_pooling():
func = T.max
else:
func = T.mean
if pad != (0, 0) and cuda.dnn.version() == -1:
continue
if pad != (0, 0) and func is T.mean:
continue
......@@ -418,6 +416,7 @@ def test_pooling3d():
if not cuda.dnn.dnn_available() or cuda.dnn.version() < (3000, 3000):
raise SkipTest(cuda.dnn.dnn_available.msg)
# We force the FAST_RUN as we don't want the reference to run in DebugMode.
mode_without_gpu_ref = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu')
......@@ -427,8 +426,7 @@ def test_pooling3d():
else:
modes = ('max', 'average_inc_pad', 'average_exc_pad')
x = T.TensorType(broadcastable=(False, False, False, False, False),
dtype='float32')()
x = T.ftensor5()
for mode, pad in product(modes,
((0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1),
(2, 3, 2), (3, 2, 2), (2, 2, 3))):
......@@ -436,8 +434,6 @@ def test_pooling3d():
func = T.max
else:
func = T.mean
if pad != (0, 0, 0) and cuda.dnn.version() == -1:
continue
if pad != (0, 0, 0) and func is T.mean:
continue
......@@ -449,13 +445,13 @@ def test_pooling3d():
if pad[0] > stride or pad[1] > stride or pad[2] > stride:
# Not implemented
continue
out1 = cuda.dnn.dnn_pool(x, (ws, ws, ws),
stride=(stride, stride, stride),
pad=pad, mode=mode)
out2 = pool3d2d(x, ds=(ws, ws, ws),
strides=(stride, stride, stride),
pad=pad, pool_func=func)
out1 = pool_3d(x, (ws, ws, ws),
st=(stride, stride, stride),
ignore_border=True,
padding=pad, mode=mode)
out2 = pool3d2d(x, ds=(ws, ws, ws), strides=(stride, stride, stride),
pad=pad,
pool_function=func)
f1 = theano.function([x], out1, mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
......@@ -510,11 +506,17 @@ def test_pooling3d():
g_out = fg(data)
# Compare again the CPU result
out = pool3d2d(x, (ws, ws, ws),
strides=(stride, stride, stride),
pad=pad, pool_func=func)
out = pool_3d(x, (ws, ws, ws),
padding=pad,
ignore_border=True, mode=mode)
fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu_ref)
if mode == 'max':
assert any([isinstance(node.op, MaxPoolGrad)
for node in fc.maker.fgraph.toposort()])
else:
assert any([isinstance(node.op, AveragePoolGrad)
for node in fc.maker.fgraph.toposort()])
c_out = fc(data)
utt.assert_allclose(c_out, g_out)
......@@ -523,6 +525,7 @@ def test_pooling_opt():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
# 2D pooling
x = T.fmatrix()
f = theano.function(
......@@ -535,6 +538,7 @@ def test_pooling_opt():
f(numpy.zeros((10, 10), dtype='float32'))
# gradient of 2D pooling
f = theano.function(
[x],
T.grad(pool_2d(x, ds=(2, 2), mode='average_inc_pad',
......@@ -545,6 +549,7 @@ def test_pooling_opt():
for n in f.maker.fgraph.toposort()])
f(numpy.zeros((10, 10), dtype='float32'))
# Test sum pooling
f = theano.function(
[x],
......@@ -557,6 +562,82 @@ def test_pooling_opt():
data = numpy.random.rand(10, 10).astype('float32')
f(data)
# 3D pooling
x = T.ftensor3()
f = theano.function(
[x],
pool_3d(x, ds=(2, 2, 2), mode='average_inc_pad', ignore_border=True),
mode=mode_with_gpu)
assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()])
f(numpy.zeros((10, 10, 10), dtype='float32'))
# gradient of 3D pooling
f = theano.function(
[x],
T.grad(pool_3d(x, ds=(2, 2, 2), mode='average_inc_pad',
ignore_border=True).sum(), x),
mode=mode_with_gpu.including("cudnn"))
assert any([isinstance(n.op, cuda.dnn.GpuDnnPoolGrad)
for n in f.maker.fgraph.toposort()])
f(numpy.zeros((10, 10, 10), dtype='float32'))
def test_pooling_opt_arbitrary_dimensions():
    # test if input with an arbitrary number of non-pooling dimensions
    # is correctly reshaped to run on the GPU
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    # 'average_exc_pad' is disabled for versions < 4004
    if cuda.dnn.version() < (4004, 4004):
        modes = ('max', 'average_inc_pad')
    else:
        modes = ('max', 'average_inc_pad', 'average_exc_pad')
    for n_non_pool_dims in (0, 1, 2, 3):
        for ws in ((2, 2), (3, 3, 3)):
            # create input shape: non-pooling dimensions
            # followed by 2 or 3 pooling dimensions
            shp = (2,) * n_non_pool_dims + (5,) * len(ws)
            data = numpy.random.normal(0, 1, shp).astype('float32')
            input = shared(data)
            for mode in modes:
                out_pool = Pool(ndim=len(ws), mode=mode, ignore_border=True)(input, ws)
                out_pool_grad = T.grad(T.sum(out_pool), wrt=input)
                out = [out_pool, out_pool_grad]
                # run on GPU and check the cuDNN ops were inserted
                fg = theano.function([], out, mode=mode_with_gpu)
                assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
                            for node in fg.maker.fgraph.toposort()])
                assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
                            for node in fg.maker.fgraph.toposort()])
                res_gpu = fg()
                # run on CPU and check the reference ops are used
                fc = theano.function([], out, mode=mode_without_gpu)
                assert any([isinstance(node.op, Pool)
                            for node in fc.maker.fgraph.toposort()])
                if mode == 'max':
                    assert any([isinstance(node.op, MaxPoolGrad)
                                for node in fc.maker.fgraph.toposort()])
                else:
                    assert any([isinstance(node.op, AveragePoolGrad)
                                for node in fc.maker.fgraph.toposort()])
                # BUG FIX: this previously called fg() again, comparing the
                # GPU result against itself and never exercising the CPU
                # reference path; call the CPU function instead.
                res_cpu = fc()
                # check for similarity between GPU and CPU results
                utt.assert_allclose(res_gpu[0], res_cpu[0])
                utt.assert_allclose(res_gpu[1], res_cpu[1])
class test_DnnSoftMax(test_nnet.test_SoftMax):
gpu_op = dnn.GpuDnnSoftmax
......
......@@ -7,6 +7,7 @@ Pool, DownsampleAvg, DownsampleSoftmax.
from __future__ import absolute_import, print_function, division
# This file should move along with conv.py
import warnings
import itertools
import numpy
from six.moves import xrange
......@@ -84,67 +85,101 @@ def pool_2d(input, ds, ignore_border=None, st=None, padding=(0, 0),
" Otherwise, the convolution will be executed on CPU.",
stacklevel=2)
ignore_border = False
if input.ndim == 4:
op = Pool(ignore_border, mode=mode)
output = op(input, ds, st, padding)
return output
op = Pool(ignore_border, ndim=2, mode=mode)
output = op(input, ds, st, padding)
return output
# extract image dimensions
img_shape = input.shape[-2:]
# count the number of "leading" dimensions, store as dmatrix
batch_size = tensor.prod(input.shape[:-2])
batch_size = tensor.shape_padright(batch_size, 1)
def pool_3d(input, ds, ignore_border=None, st=None, padding=(0, 0, 0),
mode='max'):
"""Downscale the input by a specified factor
# store as 4D tensor with shape: (batch_size,1,height,width)
new_shape = tensor.cast(tensor.join(0, batch_size,
tensor.as_tensor([1]),
img_shape), 'int64')
input_4D = tensor.reshape(input, new_shape, ndim=4)
Takes as input a N-D tensor, where N >= 3. It downscales the input image by
the specified factor, by keeping only the maximum value of non-overlapping
patches of size (ds[0],ds[1],ds[2])
# downsample mini-batch of images
op = Pool(ignore_border, mode=mode)
output = op(input_4D, ds, st, padding)
Parameters
----------
input : N-D theano tensor of input images
Input images. Max pooling will be done over the 3 last dimensions.
ds : tuple of length 3 or theano vector of ints of size 3
Factor by which to downscale (vertical ds, horizontal ds, depth ds).
(2,2,2) will halve the image in each dimension.
ignore_border : bool (default None, will print a warning and set to False)
When True, (5,5,5) input with ds=(2,2,2) will generate a (2,2,2) output.
(3,3,3) otherwise.
st : tuple of three ints or theano vector of ints of size 3
Stride size, which is the number of shifts over rows/cols/slices to get
the next pool region. If st is None, it is considered equal to ds
(no overlap on pooling regions).
padding : tuple of two ints or theano vector of ints of size 3
(pad_h, pad_w, pad_d), pad zeros to extend beyond six borders of the
images, pad_h is the size of the top and bottom margins,
pad_w is the size of the left and right margins, and pad_d is the size
of the front and back margins
mode : {'max', 'sum', 'average_inc_pad', 'average_exc_pad'}
Operation executed on each window. `max` and `sum` always exclude
the padding in the computation. `average` gives you the choice to
include or exclude it.
# restore to original shape
outshp = tensor.join(0, input.shape[:-2], output.shape[-2:])
return tensor.reshape(output, outshp, ndim=input.ndim)
"""
if input.ndim < 3:
raise NotImplementedError('pool_3d requires a dimension >= 3')
if ignore_border is None:
warnings.warn(
"pool_3d() will have the parameter ignore_border"
" default value changed to True (currently"
" False). To have consistent behavior with all Theano"
" version, explicitly add the parameter ignore_border=True."
" On the GPU, using ignore_border=True is needed to use cuDNN."
" When using ignore_border=False and not using cuDNN, the only"
" GPU combination supported is when"
" `ds == st and padding == (0, 0, 0) and mode == 'max'`."
" Otherwise, the convolution will be executed on CPU.",
stacklevel=2)
ignore_border = False
op = Pool(ignore_border, ndim=3, mode=mode)
output = op(input, ds, st, padding)
return output
class Pool(OpenMPOp):
"""
For N-dimensional tensors, consider that the last two dimensions span
images. This Op downsamples these images by taking the max, sum or average
over different patch.
The constructor takes the max, sum or average or different input patches.
This Op downsamples the last N dimensions of the input by taking the max,
sum or average over different patches.
Parameters
----------
ds : list or tuple of two ints
Downsample factor over rows and column.
ds indicates the pool region size.
ds : list or tuple of N ints
Downsample factor over rows, columns etc.
ds indicates the size of the pooling region.
ignore_border : bool
If ds doesn't divide imgshape, do we include an extra row/col
If ds doesn't divide imgshape, do we include an extra row/col/slice
of partial downsampling (False) or ignore it (True).
st : list or tuple of two ints or None
Stride size, which is the number of shifts over rows/cols to get the
st : list or tuple of N ints or None
Stride size, which is the number of shifts over rows/cols/slices to get the
next pool region. If st is None, it is considered equal to ds
(no overlap on pooling regions).
padding: tuple of two ints
(pad_h, pad_w), pad zeros to extend beyond four borders of the images,
pad_h is the size of the top and bottom margins, and pad_w is the size
of the left and right margins.
padding : tuple of N ints or None
For each downsampling dimension, this specifies the number of zeros to
add as padding on both sides. For 2D and (pad_h, pad_w), pad_h specifies the
size of the top and bottom margins, pad_w specifies the size of the left and
right margins. No padding is added if padding is None.
mode : {'max', 'sum', 'average_inc_pad', 'average_exc_pad'}
('average_inc_pad' excludes the padding from the count,
'average_exc_pad' include it)
ndim : int
The number of pooling dimensions N.
If this number is not specified, the default is set to the
(input.ndim - 2), assuming that the first two dimensions of the input
are non-pooling dimensions.
"""
__props__ = ('ignore_border', 'mode')
__props__ = ('ndim', 'ignore_border', 'mode')
@staticmethod
def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
def out_shape(imgshape, ds, ignore_border=False, st=None, padding=None, ndim=None):
"""
Return the shape of the output from this op, for input of given
shape and flags.
......@@ -152,88 +187,79 @@ class Pool(OpenMPOp):
Parameters
----------
imgshape : tuple, list, or similar of integer or scalar Theano variable
The shape of a tensor of images. The last two elements are
The shape of a tensor of images. The last N elements are
interpreted as the number of rows, and the number of cols.
ds : list or tuple of two ints
Downsample factor over rows and columns this parameter indicates
the size of the pooling region.
st : list or tuple of two ints
The stride size. This is the distance between the pooling regions.
If it's set to None, it equals ds.
ds : list or tuple of N ints
Downsample factor over rows and column.
ds indicates the pool region size.
ignore_border : bool
If ds doesn't divide imgshape, do we include an extra row/col of
partial downsampling (False) or ignore it (True).
padding : tuple of two ints
(pad_h, pad_w), pad zeros to extend beyond four borders
of the images, pad_h is the size of the top and bottom margins,
and pad_w is the size of the left and right margins.
If ds doesn't divide imgshape, do we include an extra row/col/slice
of partial downsampling (False) or ignore it (True).
st : list or tuple of N ints or None
Stride size, which is the number of shifts over rows/cols/slices to get the
next pool region. If st is None, it is considered equal to ds
(no overlap on pooling regions).
padding : tuple of N ints or None
For each downsampling dimension, this specifies the number of zeros to
add as padding on both sides. For 2D and (pad_h, pad_w), pad_h specifies the
size of the top and bottom margins, pad_w specifies the size of the left and
right margins. No padding is added if padding is None.
ndim : int
The number of pooling dimensions N.
If this number is not specified, the default is set to the
(input.ndim - 2), assuming that the first two dimensions of the input
are non-pooling dimensions.
Returns
-------
list
The shape of the output from this op, for input of given shape.
This will have the same length as imgshape, but with last two
This will have the same length as imgshape, but with last N
elements reduced as per the downsampling & ignore_border flags.
"""
if len(imgshape) < 2:
raise TypeError('imgshape must have at least two elements '
'(rows, cols)')
if ndim is None:
ndim = len(imgshape) - 2
if len(imgshape) < ndim:
raise TypeError('imgshape must have at least {} dimensions'.format(ndim))
if st is None:
st = ds
r, c = imgshape[-2:]
r = tensor.extract_constant(r)
c = tensor.extract_constant(c)
if padding[0]:
r = r + padding[0] * 2
if padding[1]:
c = c + padding[1] * 2
if ignore_border:
if ds[0] == st[0]:
nr = r // st[0]
else:
out_r = (r - ds[0]) // st[0] + 1
if isinstance(r, theano.Variable):
nr = tensor.maximum(out_r, 0)
if padding is None:
padding = (0,) * ndim
patch_shape = tuple(tensor.extract_constant(imgshape[-ndim + i]) + padding[i] * 2
for i in xrange(ndim))
def compute_out(v, downsample, stride):
if ignore_border:
if downsample == stride:
return v // stride
else:
nr = numpy.maximum(out_r, 0)
if ds[1] == st[1]:
nc = c // st[1]
out = (v - downsample) // stride + 1
if isinstance(out, theano.Variable):
return tensor.maximum(out, 0)
else:
return numpy.maximum(out, 0)
else:
out_c = (c - ds[1]) // st[1] + 1
if isinstance(c, theano.Variable):
nc = tensor.maximum(out_c, 0)
if isinstance(v, theano.Variable):
return tensor.switch(tensor.ge(stride, downsample),
(v - 1) // stride + 1,
tensor.maximum(0, (v - 1 - downsample) //
stride + 1) + 1)
elif stride >= downsample:
return (v - 1) // stride + 1
else:
nc = numpy.maximum(out_c, 0)
else:
if isinstance(r, theano.Variable):
nr = tensor.switch(tensor.ge(st[0], ds[0]),
(r - 1) // st[0] + 1,
tensor.maximum(0, (r - 1 - ds[0]) //
st[0] + 1) + 1)
elif st[0] >= ds[0]:
nr = (r - 1) // st[0] + 1
else:
nr = max(0, (r - 1 - ds[0] + st[0]) // st[0]) + 1
if isinstance(c, theano.Variable):
nc = tensor.switch(tensor.ge(st[1], ds[1]),
(c - 1) // st[1] + 1,
tensor.maximum(0, (c - 1 - ds[1]) //
st[1] + 1) + 1)
elif st[1] >= ds[1]:
nc = (c - 1) // st[1] + 1
else:
nc = max(0, (c - 1 - ds[1] + st[1]) // st[1]) + 1
return max(0, (v - 1 - downsample + stride) // stride) + 1
out_shape = [compute_out(patch_shape[i], ds[i], st[i]) for i in xrange(ndim)]
rval = list(imgshape[:-2]) + [nr, nc]
rval = list(imgshape[:-ndim]) + out_shape
return rval
def __init__(self, ignore_border=False, mode='max', openmp=None):
def __init__(self, ignore_border=False, mode='max', ndim=None, openmp=None):
super(Pool, self).__init__(openmp=openmp)
self.ndim = ndim
self.ignore_border = ignore_border
if mode not in ['max', 'average_inc_pad', 'average_exc_pad', 'sum']:
raise ValueError(
......@@ -244,6 +270,7 @@ class Pool(OpenMPOp):
def prepare_node(self, node, storage_map, compute_map):
if len(node.inputs) == 1:
# Old interface
self.ndim = len(node.op.ds)
self.mode = node.op.mode
ws = theano.tensor.constant(node.op.ds)
st = theano.tensor.constant(node.op.st)
......@@ -270,17 +297,22 @@ class Pool(OpenMPOp):
storage_map[pad] = [None]
compute_map[pad] = [False]
def make_node(self, x, ws, stride=None, pad=(0, 0)):
def make_node(self, x, ws, stride=None, pad=None):
# TODO: consider restricting the dtype?
x = tensor.as_tensor_variable(x)
nd = self.ndim
if nd is None:
nd = x.type.ndim - 2
if stride is None:
stride = ws
if isinstance(pad, (tuple, list)):
if tuple(pad) != (0, 0) and not self.ignore_border:
if pad is None:
pad = (0,) * nd
elif isinstance(pad, (tuple, list)):
if max(pad) != 0 and not self.ignore_border:
raise NotImplementedError(
'padding works only with ignore_border=True')
if isinstance(ws, (tuple, list)):
if pad[0] >= ws[0] or pad[1] >= ws[1]:
if any(pad[i] >= ws[i] for i in range(nd)):
raise NotImplementedError(
'padding_h and padding_w must be smaller than strides')
ws = tensor.as_tensor_variable(ws)
......@@ -289,7 +321,7 @@ class Pool(OpenMPOp):
assert ws.ndim == 1
assert stride.ndim == 1
assert pad.ndim == 1
if x.type.ndim != 4:
if x.type.ndim < nd:
raise TypeError()
if not ws.dtype.startswith('int'):
raise TypeError('Pool downsample parameters must be ints.')
......@@ -298,42 +330,36 @@ class Pool(OpenMPOp):
if not pad.dtype.startswith('int'):
raise TypeError('Padding parameters must be ints.')
# If the input shape are broadcastable we can have 0 in the output shape
broad = x.broadcastable[:2] + (False, False)
broad = x.broadcastable[:-nd] + (False,) * nd
out = tensor.TensorType(x.dtype, broad)
return gof.Apply(self, [x, ws, stride, pad], [out()])
def perform(self, node, inp, out):
x, ws, stride, pad = inp
z, = out
assert ws.shape == stride.shape == pad.shape == (2,)
if len(x.shape) != 4:
nd = self.ndim
if nd is None:
nd = len(x.shape) - 2
assert ws.shape == stride.shape == pad.shape == (nd,)
if len(x.shape) < nd:
raise NotImplementedError(
'Pool requires 4D input for now')
z_shape = self.out_shape(x.shape, ws, self.ignore_border, stride, pad)
'Pool requires input with {} or more dimensions'.format(nd))
z_shape = self.out_shape(x.shape, ws, self.ignore_border, stride, pad, nd)
if not self.ignore_border:
assert z_shape[2] > 0
assert z_shape[3] > 0
assert all(z > 0 for z in z_shape[-nd:])
if (z[0] is None) or (z[0].shape != z_shape):
z[0] = numpy.empty(z_shape, dtype=x.dtype)
zz = z[0]
# number of pooling output rows
pr = zz.shape[-2]
# number of pooling output cols
pc = zz.shape[-1]
ws0, ws1 = ws
st0, st1 = stride
pad_h = pad[0]
pad_w = pad[1]
img_rows = x.shape[-2] + 2 * pad_h
img_cols = x.shape[-1] + 2 * pad_w
# size of pooling output
pool_out_shp = zz.shape[-nd:]
img_shp = tuple(x.shape[-nd + i] + 2 * pad[i] for i in xrange(nd))
inc_pad = self.mode == 'average_inc_pad'
# pad the image
if (pad_h, pad_w) != (0, 0):
y = numpy.zeros(
(x.shape[0], x.shape[1], img_rows, img_cols),
dtype=x.dtype)
y[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)] = x
if max(pad) != 0:
y = numpy.zeros(x.shape[:-nd] + img_shp, dtype=x.dtype)
y[(slice(None),) * (len(x.shape) - nd) +
tuple(slice(pad[i], img_shp[i] - pad[i]) for i in xrange(nd))] = x
else:
y = x
func = numpy.max
......@@ -342,28 +368,30 @@ class Pool(OpenMPOp):
elif self.mode != 'max':
func = numpy.average
for n in xrange(x.shape[0]):
for k in xrange(x.shape[1]):
for r in xrange(pr):
row_st = r * st0
row_end = builtins.min(row_st + ws0, img_rows)
if not inc_pad:
row_st = builtins.max(row_st, pad_h)
row_end = builtins.min(row_end, x.shape[-2] + pad_h)
for c in xrange(pc):
col_st = c * st1
col_end = builtins.min(col_st + ws1, img_cols)
if not inc_pad:
col_st = builtins.max(col_st, pad_w)
col_end = builtins.min(col_end,
x.shape[-1] + pad_w)
zz[n, k, r, c] = func(y[
n, k, row_st:row_end, col_st:col_end])
# precompute the region boundaries for each dimension
region_slices = [[] for i in xrange(nd)]
for i in xrange(nd):
for j in xrange(pool_out_shp[i]):
start = j * stride[i]
end = builtins.min(start + ws[i], img_shp[i])
if not inc_pad:
start = builtins.max(start, pad[i])
end = builtins.min(end, img_shp[i] - pad[i])
region_slices[i].append(slice(start, end))
# iterate over non-pooling dimensions
for k in numpy.ndindex(*x.shape[:-nd]):
zzk = zz[k]
yk = y[k]
# iterate over pooling regions
for r in numpy.ndindex(*pool_out_shp):
zzk[r] = func(
yk[[region_slices[i][r[i]] for i in xrange(nd)]])
def infer_shape(self, node, in_shapes):
ws, stride, pad = [node.inputs[1], node.inputs[2], node.inputs[3]]
shp = self.out_shape(in_shapes[0], ws, self.ignore_border, stride,
pad)
pad, self.ndim)
return [shp]
def grad(self, inp, grads):
......@@ -372,10 +400,12 @@ class Pool(OpenMPOp):
disc = [DisconnectedType()() for i in inp[1:]]
if self.mode == 'max':
maxout = self(x, ws, stride, pad)
return [MaxPoolGrad(ignore_border=self.ignore_border)(
return [MaxPoolGrad(ndim=self.ndim,
ignore_border=self.ignore_border)(
x, maxout, gz, ws=ws, stride=stride, pad=pad)] + disc
else:
return [AveragePoolGrad(ignore_border=self.ignore_border,
return [AveragePoolGrad(ndim=self.ndim,
ignore_border=self.ignore_border,
mode=self.mode)(
x, gz, ws=ws, stride=stride, pad=pad)] + disc
......@@ -392,292 +422,386 @@ class Pool(OpenMPOp):
raise theano.gof.utils.MethodNotDefined()
x, ws, stride, pad = inp
z, = out
nd = self.ndim
if nd is None:
nd = node.inputs[0].ndim - 2
total_ndim = node.inputs[0].ndim
non_pool_ndim = total_ndim - nd
fail = sub['fail']
ignore_border = int(self.ignore_border)
if self.openmp:
omp_parallel = '#pragma omp parallel for private(r_st, r_end, c_st, c_end, collector) schedule(static)'
# run in parallel over each pooling block
omp_parallel = '#pragma omp parallel for private(r_st, r_end, r_idx, i_idx, o_idx, collector) schedule(static)'
else:
omp_parallel = ''
ccode = """
int ws0, ws1, st0, st1, pd0, pd1;
int typenum = PyArray_ObjectType((PyObject*)%(x)s, 0);
int z_r, z_c; // shape of the output
int r, c; // shape of the padded_input
if(PyArray_DIM(%(ws)s, 0)!=2)
if(PyArray_NDIM(%(x)s)!=%(total_ndim)s)
{
PyErr_SetString(PyExc_ValueError, "ws must be a vector of size 2");
PyErr_SetString(PyExc_ValueError, "x must be a %(total_ndim)sD ndarray");
%(fail)s;
}
if(PyArray_DIM(%(stride)s, 0)!=2)
if(PyArray_DIM(%(ws)s, 0)!=%(nd)s)
{
PyErr_SetString(PyExc_ValueError, "stride must be a vector of size 2");
PyErr_SetString(PyExc_ValueError, "ws must be a vector of size %(nd)s");
%(fail)s;
}
if(PyArray_DIM(%(pad)s, 0)!=2)
if(PyArray_DIM(%(stride)s, 0)!=%(nd)s)
{
PyErr_SetString(PyExc_ValueError, "pad must be a vector of size 2");
PyErr_SetString(PyExc_ValueError, "stride must be a vector of size %(nd)s");
%(fail)s;
}
// Getting ws, stride and pad
ws0 = *((npy_intp*)PyArray_GETPTR1(%(ws)s, 0));
ws1 = *((npy_intp*)PyArray_GETPTR1(%(ws)s, 1));
st0 = *((npy_intp*)PyArray_GETPTR1(%(stride)s, 0));
st1 = *((npy_intp*)PyArray_GETPTR1(%(stride)s, 1));
pd0 = *((npy_intp*)PyArray_GETPTR1(%(pad)s, 0));
pd1 = *((npy_intp*)PyArray_GETPTR1(%(pad)s, 1));
if(PyArray_NDIM(%(x)s)!=4)
if(PyArray_DIM(%(pad)s, 0)!=%(nd)s)
{
PyErr_SetString(PyExc_ValueError, "x must be a 4d ndarray");
PyErr_SetString(PyExc_ValueError, "pad must be a vector of size %(nd)s");
%(fail)s;
}
int z[%(nd)s]; // shape of the output
int r[%(nd)s]; // shape of the padded_input
int ws[%(nd)s];
int st[%(nd)s];
int pd[%(nd)s];
int nonzero_padding;
nonzero_padding = 0;
for (int i=0; i<%(nd)s; i++)
{
ws[i] = *((npy_intp*)PyArray_GETPTR1(%(ws)s, i));
st[i] = *((npy_intp*)PyArray_GETPTR1(%(stride)s, i));
pd[i] = *((npy_intp*)PyArray_GETPTR1(%(pad)s, i));
r[i] = PyArray_DIMS(%(x)s)[%(non_pool_ndim)s + i] + 2 * pd[i];
if (pd[i]>0)
nonzero_padding = 1;
}
if (!%(ignore_border)s && nonzero_padding)
{
PyErr_SetString(PyExc_ValueError,
"padding must be zero when ignore border is False");
%(fail)s;
}
r = PyArray_DIMS(%(x)s)[2];
c = PyArray_DIMS(%(x)s)[3];
r += pd0 * 2;
c += pd1 * 2;
if (pd0 != 0 && pd1 != 0 && !%(ignore_border)s)
{
PyErr_SetString(PyExc_ValueError,
"padding must be (0,0) when ignore border is False");
%(fail)s;
}
if (%(ignore_border)s)
{
// '/' in C is different from '/' in python
if (r - ws0 < 0)
for (int i=0; i<%(nd)s; i++)
{
z_r = 0;
}
else
{
z_r = (r - ws0) / st0 + 1;
}
if (c - ws1 < 0)
{
z_c = 0;
}
else
{
z_c = (c - ws1) / st1 + 1;
// '/' in C is different from '/' in python
if (r[i] - ws[i] < 0)
{
z[i] = 0;
}
else
{
z[i] = (r[i] - ws[i]) / st[i] + 1;
}
}
}
else
{
// decide how many rows the output has
if (st0 >= ws0)
for (int i=0; i<%(nd)s; i++)
{
z_r = (r - 1) / st0 + 1;
}
else
{
z_r = std::max(0, (r - 1 - ws0 + st0) / st0) + 1;
// decide how many rows/cols the output has
if (st[i] >= ws[i])
{
z[i] = (r[i] - 1) / st[i] + 1;
}
else
{
z[i] = std::max(0, (r[i] - 1 - ws[i] + st[i]) / st[i]) + 1;
}
assert(z[i] > 0);
}
// decide how many columns the output has
if (st1 >= ws1)
}
// memory allocation of z if necessary
int mem_nec;
mem_nec = 0;
if ((!%(z)s) || *PyArray_DIMS(%(z)s)!=%(total_ndim)s)
{
mem_nec = 1;
}
if (!mem_nec)
{
for (int i=0; i<%(non_pool_ndim)s; i++)
{
z_c = (c - 1) / st1 + 1;
if (PyArray_DIMS(%(z)s)[i] != PyArray_DIMS(%(x)s)[i])
{
mem_nec = 1;
break;
}
}
else
}
if (!mem_nec)
{
for (int i=0; i<%(nd)s; i++)
{
z_c = std::max(0, (c - 1 - ws1 + st0) / st1) + 1;
if (PyArray_DIMS(%(z)s)[%(non_pool_ndim)s + i] != z[i])
{
mem_nec = 1;
break;
}
}
assert(z_r > 0);
assert(z_c > 0);
}
// memory allocation of z if necessary
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(PyArray_DIMS(%(z)s)[0] != PyArray_DIMS(%(x)s)[0])
||(PyArray_DIMS(%(z)s)[1] != PyArray_DIMS(%(x)s)[1])
||(PyArray_DIMS(%(z)s)[2] != z_r)
||(PyArray_DIMS(%(z)s)[3] != z_c)
)
if (mem_nec)
{
if (%(z)s) Py_XDECREF(%(z)s);
npy_intp dims[4] = {0,0,0,0};
dims[0]=PyArray_DIMS(%(x)s)[0];
dims[1]=PyArray_DIMS(%(x)s)[1];
dims[2]=z_r;
dims[3]=z_c;
npy_intp dims[%(total_ndim)s];
for (int i=0; i<%(non_pool_ndim)s; i++)
{
dims[i] = PyArray_DIMS(%(x)s)[i];
}
for (int i=0; i<%(nd)s; i++)
{
dims[%(non_pool_ndim)s + i] = z[i];
}
//TODO: zeros not necessary
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
%(z)s = (PyArrayObject*) PyArray_ZEROS(%(total_ndim)s, dims, typenum,0);
}
// initialize temp var for the value in a region
dtype_%(x)s collector;
int z_prod;
// do not run if any z[i] is zero
z_prod = 1;
for (int i=0; i<%(nd)s; i++)
{
z_prod *= z[i];
}
// used for indexing a pool region inside the input
dtype_%(x)s collector; // temp var for the value in a region
if (z_r && z_c)
if (z_prod)
{
int r_st, r_end, c_st, c_end;
// will be used to hold start and end index of a region
int r_st[%(nd)s];
int r_end[%(nd)s];
// index for iterating over the pooling regions
int r_idx[%(nd)s];
// placeholder for PyArray indexing (output)
long int o_idx[%(total_ndim)s];
// placeholder for PyArray indexing (input)
long int i_idx[%(total_ndim)s];
// loop over non-pooling dimensions
int non_pooling_prod = 1;
for (int i=0; i<%(non_pool_ndim)s; i++)
{
non_pooling_prod *= PyArray_DIMS(%(x)s)[i];
}
%(omp_parallel)s
for(int t = 0; t < PyArray_DIMS(%(x)s)[0] * PyArray_DIMS(%(x)s)[1]; t++){
int b = t %% PyArray_DIMS(%(x)s)[0];
int k = t / PyArray_DIMS(%(x)s)[0];
for(int i=0; i < z_r; i++){
r_st = i * st0;
r_end = r_st + ws0;
// first loop over non-pooling dimensions
for (int t=0; t<non_pooling_prod; t++)
{
// compute the non-pooling index in each dimension
if (%(non_pool_ndim)s!=0)
{
o_idx[0] = t;
i_idx[0] = t;
for (int i=1; i<%(non_pool_ndim)s; i++)
{
o_idx[i] = o_idx[i - 1] / PyArray_DIMS(%(x)s)[i - 1];
o_idx[i - 1] = o_idx[i - 1] %% PyArray_DIMS(%(x)s)[i - 1];
i_idx[i] = o_idx[i];
i_idx[i - 1] = o_idx[i - 1];
}
}
// then loop over each region in each pooling dimension
"""
for i in xrange(nd):
ccode += """
for (r_idx[%(i)s]=0; r_idx[%(i)s] < z[%(i)s]; r_idx[%(i)s]++) {
r_st[%(i)s] = r_idx[%(i)s] * st[%(i)s];
r_end[%(i)s] = r_st[%(i)s] + ws[%(i)s];
// skip the padding
r_st = r_st < pd0 ? pd0 : r_st;
r_end = r_end > (r - pd0) ? r - pd0 : r_end;
r_st[%(i)s] = r_st[%(i)s] < pd[%(i)s] ? pd[%(i)s] : r_st[%(i)s];
r_end[%(i)s] = r_end[%(i)s] > (r[%(i)s] - pd[%(i)s]) ? r[%(i)s] - pd[%(i)s] : r_end[%(i)s];
// from padded_img space to img space
r_st -= pd0;
r_end -= pd0;
r_st[%(i)s] -= pd[%(i)s];
r_end[%(i)s] -= pd[%(i)s];
// handle the case where no padding, ignore border is True
if (%(ignore_border)s)
{
r_end = r_end > r ? r : r_end;
r_end[%(i)s] = r_end[%(i)s] > r[%(i)s] ? r[%(i)s] : r_end[%(i)s];
}
for(int j=0; j<z_c; j++){
c_st = j * st1;
c_end = c_st + ws1;
// skip the padding
c_st = c_st < pd1 ? pd1 : c_st;
c_end = c_end > (c - pd1) ? c - pd1 : c_end;
dtype_%(z)s * z = (
(dtype_%(z)s*)(PyArray_GETPTR4(%(z)s, b, k, i, j)));
// change coordinates from padding_img space into img space
c_st -= pd1;
c_end -= pd1;
// handle the case where no padding, ignore border is True
if (%(ignore_border)s)
{
c_end = c_end > c ? c : c_end;
}
// use the index to find the correct position in the output
o_idx[%(non_pool_ndim)s + %(i)s] = r_idx[%(i)s];
""" % dict(i=i, ignore_border=ignore_border, non_pool_ndim=non_pool_ndim)
ccode += """
// get a pointer to the correct position in the output
dtype_%(z)s * z;
if (%(total_ndim)s == 4)
z = ((dtype_%(z)s*)(PyArray_GETPTR4(%(z)s, o_idx[0], o_idx[1], o_idx[2], o_idx[3])));
else
z = ((dtype_%(z)s*)(PyArray_GetPtr(%(z)s, o_idx)));
"""
if self.mode == 'max':
for i in xrange(nd):
ccode += """
// set the first index of dimension %(i)s
i_idx[%(non_pool_ndim)s + %(i)s] = r_st[%(i)s];
""" % dict(i=i, non_pool_ndim=non_pool_ndim)
ccode += """
// use the first element as the initial value of collector
collector = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,r_st,c_st)))[0];
// go through the pooled region in the unpadded input
for(int m=r_st; m<r_end; m++)
{
for(int n=c_st; n<c_end; n++)
{
dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
collector = (a > collector) ? a : collector;
}
}
z[0] = collector;
// use the first element as the initial value of collector
if (%(total_ndim)s == 4)
collector = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,i_idx[0],i_idx[1],i_idx[2],i_idx[3])))[0];
else
collector = ((dtype_%(x)s*)(PyArray_GetPtr(%(x)s,i_idx)))[0];
"""
for i in xrange(nd):
ccode += """
// go through the pooled region in the unpadded input
for(int m%(i)s=r_st[%(i)s]; m%(i)s<r_end[%(i)s]; m%(i)s++)
{
i_idx[%(non_pool_ndim)s + %(i)s] = m%(i)s;
""" % dict(i=i, non_pool_ndim=non_pool_ndim)
ccode += """
// update maximum
dtype_%(x)s a;
if (%(total_ndim)s == 4)
a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,i_idx[0],i_idx[1],i_idx[2],i_idx[3])))[0];
else
a = ((dtype_%(x)s*)(PyArray_GetPtr(%(x)s,i_idx)))[0];
collector = (a > collector) ? a : collector;
"""
for i in xrange(nd):
ccode += """
} // for loop over region
"""
ccode += """
z[0] = collector;
"""
elif self.mode in ('sum', 'average_exc_pad', 'average_inc_pad'):
ccode += """
// initialize the sum at zero
collector = ((dtype_%(x)s)(0));
// go through the pooled region in the unpadded input
for(int m=r_st; m<r_end; m++)
{
for(int n=c_st; n<c_end; n++)
{
dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
collector += a;
}
}
// initialize the sum at zero
collector = ((dtype_%(x)s)(0));
"""
for i in xrange(nd):
ccode += """
// go through the pooled region in the unpadded input
for(int m%(i)s=r_st[%(i)s]; m%(i)s<r_end[%(i)s]; m%(i)s++)
{
i_idx[%(non_pool_ndim)s + %(i)s] = m%(i)s;
""" % dict(i=i, non_pool_ndim=non_pool_ndim)
ccode += """
// update sum
dtype_%(x)s a;
if (%(total_ndim)s == 4)
a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,i_idx[0],i_idx[1],i_idx[2],i_idx[3])))[0];
else
a = ((dtype_%(x)s*)(PyArray_GetPtr(%(x)s,i_idx)))[0];
collector += a;
"""
for i in xrange(nd):
ccode += """
} // for loop over region
"""
if self.mode == "sum":
ccode += """
z[0] = collector;
z[0] = collector;
"""
elif self.mode == 'average_inc_pad' and self.ignore_border:
# region size = product over all pooling dimensions
region_size = ' * '.join('ws[%d]' % i for i in xrange(nd))
ccode += """
z[0] = collector / (ws0 * ws1);
"""
z[0] = collector / (%(region_size)s);
""" % dict(region_size=region_size)
else:
# region size = number elements of in this region
region_size = ' * '.join('(r_end[%d]-r_st[%d])' % (i, i) for i in xrange(nd))
ccode += """
z[0] = collector / ((r_end-r_st)*(c_end-c_st));
"""
z[0] = collector / (%(region_size)s);
""" % dict(region_size=region_size)
for i in xrange(nd):
ccode += """
} // loop over pooling dimension
"""
ccode += """
}
}
}
}
} // for loop over non-pooling dimensions
} // if z_prod
"""
return ccode % locals()
def c_code_cache_version(self):
    """Return the cache version key for the generated C code.

    Bumped (…, 7, …) because the generated C code changed when the op
    gained N-dimensional pooling support; stale cached modules must be
    recompiled.  ``self.openmp`` is part of the key because it changes
    the emitted ``#pragma omp`` directives.
    """
    # Merge residue left two return statements here; only the bumped
    # version is kept (the old (0, 6, 8, 6, ...) line was dead code).
    return (0, 6, 8, 7, self.openmp)
class PoolGrad(OpenMPOp):
# Op properties used for equality/hashing; 'ndim' is included because two
# PoolGrad instances with different pooling dimensionality are different ops.
# (Merge residue duplicated this assignment; only the ndim-aware one is kept.)
__props__ = ('ndim', 'ignore_border', 'mode')
@staticmethod
def out_shape(imgshape, ds, ignore_border=False, st=None, padding=None, ndim=None):
    """Return the shape of the output from this op, for input of given
    shape and flags.

    Parameters
    ----------
    imgshape : tuple of integers or scalar Theano variables
        the shape of a tensor of images. The last N elements are
        interpreted as the downsampling dimensions.
    ds : tuple of N ints
        downsample factor over rows and columns this parameter
        indicates the size of the pooling region
    ignore_border : bool
        If ds doesn't divide imgshape, do we include an extra row/col/slice
        of partial downsampling (False) or ignore it (True).
    st : list or tuple of N ints or None
        Stride size, which is the number of shifts over rows/cols/slices to get the
        next pool region. If st is None, it is considered equal to ds
        (no overlap on pooling regions).
    padding : tuple of N ints or None
        For each downsampling dimension, this specifies the number of zeros to
        add as padding on both sides. For 2D and (pad_h, pad_w), pad_h specifies the
        size of the top and bottom margins, pad_w specifies the size of the left and
        right margins. No padding is added if padding is None.
    ndim : int
        The number of pooling dimensions N.
        If this number is not specified, the default is set to the
        (input.ndim - 2), assuming that the first two dimensions of the input
        are non-pooling dimensions.

    Returns
    -------
    list
        the shape of the output from this op, for input of given
        shape. This will have the same length as imgshape, but
        with last N elements reduced as per the downsampling &
        ignore_border flags.
    """
    # NOTE: merge residue had interleaved the old 2D-only implementation
    # here; this is the N-dimensional version, which subsumes it.
    if ndim is None:
        ndim = len(imgshape) - 2
    if len(imgshape) < ndim:
        raise TypeError('imgshape must have at least {} dimensions'.format(ndim))
    if st is None:
        st = ds
    if padding is None:
        padding = (0,) * ndim
    # shape of each pooled dimension after padding on both sides
    patch_shape = tuple(tensor.extract_constant(imgshape[-ndim + i]) + padding[i] * 2
                        for i in xrange(ndim))

    def compute_out(v, downsample, stride):
        # output length of one pooled dimension; works both for
        # symbolic (Theano variable) and concrete integer sizes
        if ignore_border:
            out = (v - downsample) // stride + 1
            if isinstance(out, theano.Variable):
                return tensor.maximum(out, 0)
            else:
                return numpy.maximum(out, 0)
        else:
            if isinstance(v, theano.Variable):
                return tensor.switch(tensor.ge(stride, downsample),
                                     (v - 1) // stride + 1,
                                     tensor.maximum(0, (v - 1 - downsample) //
                                                    stride + 1) + 1)
            elif stride >= downsample:
                return (v - 1) // stride + 1
            else:
                return max(0, (v - 1 - downsample) // stride + 1) + 1

    out_shape = [compute_out(patch_shape[i], ds[i], st[i]) for i in xrange(ndim)]

    rval = list(imgshape[:-ndim]) + out_shape
    return rval
def __init__(self, ignore_border, mode='max', openmp=None):
def __init__(self, ignore_border, mode='max', ndim=None, openmp=None):
self.ndim = ndim
self.ignore_border = ignore_border
if mode not in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']:
raise ValueError(
......@@ -689,6 +813,7 @@ class PoolGrad(OpenMPOp):
def prepare_node(self, node, storage_map, compute_map):
if len(node.inputs) < 5: # 5 for AveragePoolGrad, 6 for MaxPoolGrad
# Old interface
self.ndim = len(node.op.ds)
self.mode = node.op.mode
ws = theano.tensor.constant(node.op.ds)
st = theano.tensor.constant(node.op.st)
......@@ -720,26 +845,32 @@ class PoolGrad(OpenMPOp):
class MaxPoolGrad(PoolGrad):
def __init__(self, ignore_border, ndim=None, openmp=None):
    """Gradient of max-pooling.

    Parameters mirror PoolGrad; mode is fixed to 'max'.  ``ndim`` is the
    number of pooling dimensions (None means input.ndim - 2).
    (Merge residue duplicated the old 2D-only __init__; only the
    ndim-aware signature is kept.)
    """
    PoolGrad.__init__(self, ignore_border, mode='max', ndim=ndim, openmp=openmp)
def make_node(self, x, maxout, gz, ws, stride=None, pad=(0, 0)):
def make_node(self, x, maxout, gz, ws, stride=None, pad=None):
# make_node should only be called by the grad function of
# Pool, so these asserts should not fail.
x = tensor.as_tensor_variable(x)
maxout = tensor.as_tensor_variable(maxout)
gz = tensor.as_tensor_variable(gz)
nd = self.ndim
if nd is None:
nd = x.ndim - 2
if stride is None:
stride = ws
if pad is None:
pad = (0,) * nd
ws = tensor.as_tensor_variable(ws)
stride = tensor.as_tensor_variable(stride)
pad = tensor.as_tensor_variable(pad)
assert isinstance(x, Variable) and x.ndim == 4
assert isinstance(maxout, Variable) and maxout.ndim == 4
assert isinstance(gz, Variable) and gz.ndim == 4
assert isinstance(x, Variable) and x.ndim >= nd
assert isinstance(maxout, Variable) and maxout.ndim >= nd
assert isinstance(gz, Variable) and gz.ndim >= nd
assert isinstance(ws, Variable) and ws.ndim == 1
assert isinstance(stride, Variable) and stride.ndim == 1
assert isinstance(pad, Variable) and pad.ndim == 1
assert x.ndim == maxout.ndim == gz.ndim >= nd
if not ws.dtype.startswith('int'):
raise TypeError('Pool downsample parameters must be ints.')
if not stride.dtype.startswith('int'):
......@@ -752,41 +883,51 @@ class MaxPoolGrad(PoolGrad):
assert self.mode == 'max'
x, maxout, gz, ws, stride, pad = inp
gx_stg, = out
assert ws.shape == stride.shape == pad.shape == (2,)
# number of pooling output rows
pr = maxout.shape[-2]
# number of pooling output cols
pc = maxout.shape[-1]
ws0, ws1 = ws
st0, st1 = stride
pad_h = pad[0]
pad_w = pad[1]
img_rows = x.shape[-2] + 2 * pad_h
img_cols = x.shape[-1] + 2 * pad_w
nd = self.ndim
if nd is None:
nd = len(x.shape) - 2
assert ws.shape == stride.shape == pad.shape == (nd,)
if len(x.shape) < nd:
raise NotImplementedError(
'MaxPoolGrad requires input with {} or more dimensions'.format(nd))
pool_out_shp = maxout.shape[-nd:]
img_shp = tuple(x.shape[-nd + i] + 2 * pad[i] for i in xrange(nd))
# pad the image
if (pad_h, pad_w) != (0, 0):
y = numpy.zeros(
(x.shape[0], x.shape[1], img_rows, img_cols),
dtype=x.dtype)
y[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)] = x
if max(pad) != 0:
y = numpy.zeros(x.shape[:-nd] + img_shp, dtype=x.dtype)
y[(slice(None),) * (len(x.shape) - nd) +
tuple(slice(pad[i], img_shp[i] - pad[i]) for i in xrange(nd))] = x
else:
y = x
gx = numpy.zeros_like(y)
for n in xrange(x.shape[0]):
for k in xrange(x.shape[1]):
for r in xrange(pr):
row_st = builtins.max(r * st0, pad_h)
row_end = builtins.min(row_st + ws0, img_rows)
for c in xrange(pc):
col_st = builtins.max(c * st1, pad_w)
col_end = builtins.min(col_st + ws1, img_cols)
for row_ind in xrange(row_st, row_end):
for col_ind in xrange(col_st, col_end):
if (maxout[n, k, r, c] == y[n, k, row_ind, col_ind]):
gx[n, k, row_ind, col_ind] += gz[n, k, r, c]
# precompute the region boundaries for each dimension
region_ranges = [[] for i in xrange(nd)]
for i in xrange(nd):
for j in xrange(pool_out_shp[i]):
start = builtins.max(j * stride[i], pad[i])
end = builtins.min(start + ws[i], img_shp[i])
region_ranges[i].append(xrange(start, end))
# iterate over non-pooling dimensions
for k in numpy.ndindex(*x.shape[:-nd]):
gxk = gx[k]
gzk = gz[k]
yk = y[k]
maxoutk = maxout[k]
# iterate over pooling regions
for r in numpy.ndindex(*pool_out_shp):
maxout_value = maxoutk[r]
# iterate inside region
for c in itertools.product(*[region_ranges[i][r[i]]
for i in xrange(nd)]):
if maxout_value == yk[c]:
gxk[c] += gzk[r]
# unpad the image
gx = gx[:, :, pad_h:(img_rows - pad_h), pad_w:(img_cols - pad_w)]
gx = gx[(slice(None),) * (len(x.shape) - nd) +
tuple(slice(pad[i], img_shp[i] - pad[i]) for i in xrange(nd))]
gx_stg[0] = gx
def grad(self, inp, grads):
......@@ -794,7 +935,8 @@ class MaxPoolGrad(PoolGrad):
ggx, = grads
return ([theano.tensor.zeros_like(x),
theano.tensor.zeros_like(maxout),
DownsampleFactorMaxGradGrad(ignore_border=self.ignore_border)(
DownsampleFactorMaxGradGrad(ndim=self.ndim,
ignore_border=self.ignore_border)(
x, maxout, ggx, ws, stride, pad)] +
[DisconnectedType()() for i in inp[3:]])
......@@ -805,161 +947,253 @@ class MaxPoolGrad(PoolGrad):
assert self.mode == 'max'
x, z, gz, ws, stride, pad = inp
gx, = out
nd = self.ndim
if nd is None:
nd = node.inputs[0].ndim - 2
total_ndim = node.inputs[0].ndim
non_pool_ndim = total_ndim - nd
fail = sub['fail']
ignore_border = int(self.ignore_border)
if self.openmp:
omp_parallel = '#pragma omp parallel for private(r_st, r_end, c_st, c_end, maximum) schedule(static)'
# run in parallel over each pooling block
omp_parallel = '#pragma omp parallel for private(r_st, r_end, r_idx, i_idx, o_idx, maximum) schedule(static)'
else:
omp_parallel = ''
return """
ccode = """
// sanity checks
int x_typenum = PyArray_ObjectType((PyObject*)%(x)s, 0);
int z_typenum = PyArray_ObjectType((PyObject*)%(z)s, 0);
int gz_typenum = PyArray_ObjectType((PyObject*)%(gz)s, 0);
int ws0, ws1, st0, st1, pd0, pd1;
int z_r, z_c;
int r, c; // shape of the padded_input
if ((x_typenum != z_typenum) || (x_typenum != gz_typenum))
{
PyErr_SetString(PyExc_ValueError, "input types must all match");
%(fail)s;
}
if(PyArray_NDIM(%(x)s)!=4)
if(PyArray_NDIM(%(x)s)!=%(total_ndim)s)
{
PyErr_SetString(PyExc_ValueError, "x must be a 4d ndarray");
PyErr_SetString(PyExc_ValueError, "x must be a %(total_ndim)sD ndarray");
%(fail)s;
}
if(PyArray_NDIM(%(z)s)!=4)
if(PyArray_NDIM(%(z)s)!=%(total_ndim)s)
{
PyErr_SetString(PyExc_ValueError, "z must be a 4d ndarray");
PyErr_SetString(PyExc_ValueError, "z must be a %(total_ndim)sD ndarray");
%(fail)s;
}
if(PyArray_NDIM(%(gz)s)!=4)
if(PyArray_NDIM(%(gz)s)!=%(total_ndim)s)
{
PyErr_SetString(PyExc_ValueError, "gz must be a 4d ndarray");
PyErr_SetString(PyExc_ValueError, "gz must be a %(total_ndim)sD ndarray");
%(fail)s;
}
if(PyArray_DIM(%(ws)s, 0)!=2)
if(PyArray_DIM(%(ws)s, 0)!=%(nd)s)
{
PyErr_SetString(PyExc_ValueError, "ws must be a vector of size 2");
PyErr_SetString(PyExc_ValueError, "ws must be a vector of size %(nd)s");
%(fail)s;
}
if(PyArray_DIM(%(stride)s, 0)!=2)
if(PyArray_DIM(%(stride)s, 0)!=%(nd)s)
{
PyErr_SetString(PyExc_ValueError, "stride must be a vector of size 2");
PyErr_SetString(PyExc_ValueError, "stride must be a vector of size %(nd)s");
%(fail)s;
}
if(PyArray_DIM(%(pad)s, 0)!=2)
if(PyArray_DIM(%(pad)s, 0)!=%(nd)s)
{
PyErr_SetString(PyExc_ValueError, "pad must be a vector of size 2");
PyErr_SetString(PyExc_ValueError, "pad must be a vector of size %(nd)s");
%(fail)s;
}
// Getting ws, stride and pad
ws0 = *((npy_intp*)PyArray_GETPTR1(%(ws)s, 0));
ws1 = *((npy_intp*)PyArray_GETPTR1(%(ws)s, 1));
st0 = *((npy_intp*)PyArray_GETPTR1(%(stride)s, 0));
st1 = *((npy_intp*)PyArray_GETPTR1(%(stride)s, 1));
pd0 = *((npy_intp*)PyArray_GETPTR1(%(pad)s, 0));
pd1 = *((npy_intp*)PyArray_GETPTR1(%(pad)s, 1));
z_r = PyArray_DIMS(%(z)s)[2];
z_c = PyArray_DIMS(%(z)s)[3];
r = PyArray_DIMS(%(x)s)[2];
c = PyArray_DIMS(%(x)s)[3];
r += pd0 * 2;
c += pd1 * 2;
// allocating memory for gx
if ((!%(gx)s)
|| !PyArray_ISCONTIGUOUS(%(gx)s)
|| *PyArray_DIMS(%(gx)s)!=4
||(PyArray_DIMS(%(gx)s)[0] != PyArray_DIMS(%(x)s)[0])
||(PyArray_DIMS(%(gx)s)[1] != PyArray_DIMS(%(x)s)[1])
||(PyArray_DIMS(%(gx)s)[2] != PyArray_DIMS(%(x)s)[2])
||(PyArray_DIMS(%(gx)s)[3] != PyArray_DIMS(%(x)s)[3])
)
int z[%(nd)s]; // shape of the output
int r[%(nd)s]; // shape of the padded_input
int ws[%(nd)s];
int st[%(nd)s];
int pd[%(nd)s];
int nonzero_padding;
nonzero_padding = 0;
for (int i=0; i<%(nd)s; i++)
{
ws[i] = *((npy_intp*)PyArray_GETPTR1(%(ws)s, i));
st[i] = *((npy_intp*)PyArray_GETPTR1(%(stride)s, i));
pd[i] = *((npy_intp*)PyArray_GETPTR1(%(pad)s, i));
z[i] = PyArray_DIMS(%(z)s)[%(non_pool_ndim)s + i];
r[i] = PyArray_DIMS(%(x)s)[%(non_pool_ndim)s + i] + 2 * pd[i];
if (pd[i]>0)
nonzero_padding = 1;
}
// allocating memory for output, if necessary
int mem_nec;
mem_nec = 0;
if ((!%(gx)s) || !PyArray_ISCONTIGUOUS(%(gx)s)
|| *PyArray_DIMS(%(gx)s)!=%(total_ndim)s)
{
mem_nec = 1;
}
if (!mem_nec)
{
for (int i=0; i<%(total_ndim)s; i++)
{
if (PyArray_DIMS(%(gx)s)[i] != PyArray_DIMS(%(x)s)[i])
{
mem_nec = 1;
break;
}
}
}
if (mem_nec)
{
Py_XDECREF(%(gx)s);
%(gx)s = (PyArrayObject*) PyArray_ZEROS(4, PyArray_DIMS(%(x)s), x_typenum,0);
%(gx)s = (PyArrayObject*) PyArray_ZEROS(%(total_ndim)s, PyArray_DIMS(%(x)s), x_typenum,0);
}
else {
PyArray_FILLWBYTE(%(gx)s, 0);
}
dtype_%(z)s maximum; // temp var for maximum value in a region
if (z_r && z_c)
int z_prod;
// do not run if any z[i] is zero
z_prod = 1;
for (int i=0; i<%(nd)s; i++)
{
int r_st, r_end, c_st, c_end;
z_prod *= z[i];
}
if (z_prod)
{
// will be used to hold start and end index of a region
int r_st[%(nd)s];
int r_end[%(nd)s];
// index for iterating over the pooling regions
int r_idx[%(nd)s];
// placeholder for PyArray indexing (output)
long int o_idx[%(total_ndim)s];
// placeholder for PyArray indexing (input)
long int i_idx[%(total_ndim)s];
// loop over non-pooling dimensions
int non_pooling_prod = 1;
for (int i=0; i<%(non_pool_ndim)s; i++)
{
non_pooling_prod *= PyArray_DIMS(%(x)s)[i];
}
%(omp_parallel)s
for(int t = 0; t < PyArray_DIMS(%(x)s)[0] * PyArray_DIMS(%(x)s)[1]; t++){
int b = t %% PyArray_DIMS(%(x)s)[0];
int k = t / PyArray_DIMS(%(x)s)[0];
for(int i=0; i < z_r; i++){
r_st = i * st0;
r_end = r_st + ws0;
// first loop over non-pooling dimensions
for (int t=0; t<non_pooling_prod; t++)
{
// compute the non-pooling index in each dimension
if (%(non_pool_ndim)s!=0)
{
o_idx[0] = t;
i_idx[0] = t;
for (int i=1; i<%(non_pool_ndim)s; i++)
{
o_idx[i] = o_idx[i - 1] / PyArray_DIMS(%(x)s)[i - 1];
o_idx[i - 1] =o_idx[i - 1] %% PyArray_DIMS(%(x)s)[i - 1];
i_idx[i] = o_idx[i];
i_idx[i - 1] = o_idx[i - 1];
}
}
// then loop over each region in each pooling dimension
"""
for i in xrange(nd):
ccode += """
for (r_idx[%(i)s]=0; r_idx[%(i)s] < z[%(i)s]; r_idx[%(i)s]++) {
r_st[%(i)s] = r_idx[%(i)s] * st[%(i)s];
r_end[%(i)s] = r_st[%(i)s] + ws[%(i)s];
// skip the padding
r_st = r_st < pd0 ? pd0 : r_st;
r_end = r_end > (r - pd0) ? r - pd0 : r_end;
r_st[%(i)s] = r_st[%(i)s] < pd[%(i)s] ? pd[%(i)s] : r_st[%(i)s];
r_end[%(i)s] = r_end[%(i)s] > (r[%(i)s] - pd[%(i)s]) ? r[%(i)s] - pd[%(i)s] : r_end[%(i)s];
// from padded_img space to img space
r_st -= pd0;
r_end -= pd0;
for(int j=0; j<z_c; j++){
c_st = j * st1;
c_end = c_st + ws1;
// skip the padding
c_st = c_st < pd1 ? pd1 : c_st;
c_end = c_end > (c - pd1) ? c - pd1 : c_end;
// change coordinates from padding_img space into img space
c_st -= pd1;
c_end -= pd1;
r_st[%(i)s] -= pd[%(i)s];
r_end[%(i)s] -= pd[%(i)s];
// use the index to find the correct position in the output
o_idx[%(non_pool_ndim)s + %(i)s] = r_idx[%(i)s];
""" % dict(i=i, non_pool_ndim=non_pool_ndim)
ccode += """
dtype_%(gz)s * gz;
if (%(total_ndim)s == 4)
{
// the maximum value
maximum = ((dtype_%(z)s*)(PyArray_GETPTR4(%(z)s,o_idx[0],o_idx[1],o_idx[2],o_idx[3])))[0];
// the gradient corresponding to this maximum value in z
gz = ((dtype_%(gz)s*)(PyArray_GETPTR4(%(gz)s, o_idx[0],o_idx[1],o_idx[2],o_idx[3])));
}
else
{
// the maximum value
maximum = ((dtype_%(z)s*)(PyArray_GETPTR4(%(z)s,b,k,i,j)))[0];
maximum = ((dtype_%(z)s*)(PyArray_GetPtr(%(z)s,o_idx)))[0];
// the gradient corresponding to this maximum value in z
dtype_%(gz)s * gz = (
(dtype_%(gz)s*)(PyArray_GETPTR4(%(gz)s, b, k, i, j)));
// go through the pooled region in the unpadded input
for(int m=r_st; m<r_end; m++)
gz = ((dtype_%(gz)s*)(PyArray_GetPtr(%(gz)s, o_idx)));
}
"""
for i in xrange(nd):
ccode += """
// go through the pooled region in the unpadded input
for(int m%(i)s=r_st[%(i)s]; m%(i)s<r_end[%(i)s]; m%(i)s++)
{
i_idx[%(non_pool_ndim)s + %(i)s] = m%(i)s;
""" % dict(i=i, non_pool_ndim=non_pool_ndim)
ccode += """
dtype_%(x)s a;
dtype_%(gx)s * gx;
if (%(total_ndim)s == 4)
{
for(int n=c_st; n<c_end; n++)
{
dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
dtype_%(gx)s * gx = (
(dtype_%(gx)s*)(PyArray_GETPTR4(%(gx)s, b, k, m, n)));
if (a == maximum){
gx[0] = gx[0] + gz[0];
}
}
a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,i_idx[0],i_idx[1],i_idx[2],i_idx[3])))[0];
gx = ((dtype_%(gx)s*)(PyArray_GETPTR4(%(gx)s, i_idx[0],i_idx[1],i_idx[2],i_idx[3])));
}
}
}
}
}
""" % locals()
else
{
a = ((dtype_%(x)s*)(PyArray_GetPtr(%(x)s,i_idx)))[0];
gx = ((dtype_%(gx)s*)(PyArray_GetPtr(%(gx)s, i_idx)));
}
if (a == maximum){
gx[0] = gx[0] + gz[0];
}
"""
for i in xrange(nd):
ccode += """
} // for loop over region
"""
for i in xrange(nd):
ccode += """
} // loop over pooling dimension
"""
ccode += """
} // for loop over non-pooling dimensions
} // if z_prod
"""
return ccode % locals()
def c_code_cache_version(self):
    """Return the cache version key for MaxPoolGrad's generated C code.

    Bumped from 9 to 10 for the N-dimensional rewrite of the C code;
    ``self.openmp`` participates because it changes the emitted pragmas.
    """
    # Merge residue left both the old and new return; keep the bump.
    return (0, 10, self.openmp)
class AveragePoolGrad(PoolGrad):
def __init__(self, ignore_border, mode='average_inc_pad', ndim=None):
    """Gradient of sum/average pooling.

    mode must be one of 'sum', 'average_inc_pad', 'average_exc_pad';
    ``ndim`` is the number of pooling dimensions (None: input.ndim - 2).
    (Merge residue duplicated the old 2D-only def/super lines; only the
    ndim-aware version is kept.)
    """
    assert mode in ['sum', 'average_inc_pad', 'average_exc_pad']
    PoolGrad.__init__(self, ignore_border, mode, ndim)
# There is an extra dummy parameter to match the parameter count
# of MaxPoolGrad. They have to keep the same interface because of
# the DownsampleFactorMaxGrad trick to keep old scripts working
# (see downsample.py for details on this).
def make_node(self, x, gz, ws, stride=None, pad=(0, 0), dummy=None):
def make_node(self, x, gz, ws, stride=None, pad=None, dummy=None):
# make_node should only be called by the grad function of
# Pool, so these asserts should not fail.
x = tensor.as_tensor_variable(x)
gz = tensor.as_tensor_variable(gz)
nd = self.ndim
if nd is None:
nd = x.ndim - 2
if stride is None:
stride = ws
if pad is None:
pad = (0,) * nd
ws = tensor.as_tensor_variable(ws)
stride = tensor.as_tensor_variable(stride)
pad = tensor.as_tensor_variable(pad)
assert isinstance(x, Variable) and x.ndim == 4
assert isinstance(gz, Variable) and gz.ndim == 4
assert isinstance(x, Variable) and x.ndim >= nd
assert isinstance(gz, Variable) and gz.ndim >= nd
assert isinstance(ws, Variable) and ws.ndim == 1
assert isinstance(stride, Variable) and stride.ndim == 1
assert x.ndim == gz.ndim >= nd
assert isinstance(pad, Variable) and pad.ndim == 1
if not ws.dtype.startswith('int'):
raise TypeError('Pool downsample parameters must be ints.')
......@@ -972,96 +1206,333 @@ class AveragePoolGrad(PoolGrad):
def perform(self, node, inp, out):
    """Python implementation of the sum/average pooling gradient.

    inp = (x, gz, ws, stride, pad); writes the gradient w.r.t. x into
    out[0].  Supports N pooling dimensions (self.ndim, defaulting to
    x.ndim - 2).  The old interleaved 2D implementation from the merge
    residue has been removed; this is the N-dimensional version.
    """
    x, gz, ws, stride, pad = inp
    gx_stg, = out
    nd = self.ndim
    if nd is None:
        nd = len(x.shape) - 2
    assert ws.shape == stride.shape == pad.shape == (nd,)
    if len(x.shape) < nd:
        raise NotImplementedError(
            'AveragePoolGrad requires input with {} or more dimensions'.format(nd))
    if self.mode == 'average_exc_pad' and max(pad) != 0:
        raise NotImplementedError()
    # tuple() so the comparison with an ndarray .shape (a tuple) below can
    # ever be equal; a list would always compare unequal and force a realloc
    z_shape = tuple(self.out_shape(x.shape, ws, self.ignore_border, stride, pad, nd))
    if (gx_stg[0] is None) or (gx_stg[0].shape != z_shape):
        gx_stg[0] = numpy.empty(z_shape, dtype=x.dtype)
    zz = gx_stg[0]
    # size of pooling output
    pool_out_shp = zz.shape[-nd:]
    img_shp = tuple(x.shape[-nd + i] + 2 * pad[i] for i in xrange(nd))
    inc_pad = self.mode == 'average_inc_pad'
    sum_mode = self.mode == 'sum'

    # initialize the padded output
    gx = numpy.zeros((x.shape[:-nd] + img_shp), dtype=x.dtype)

    # precompute the region boundaries and sizes for each dimension
    region_slices = [[] for i in xrange(nd)]
    region_sizes = [[] for i in xrange(nd)]
    for i in xrange(nd):
        for j in xrange(pool_out_shp[i]):
            if sum_mode or inc_pad:
                start = j * stride[i]
            else:
                # average_exc_pad: regions never extend into the padding
                start = builtins.max(j * stride[i], pad[i])
            end = builtins.min(start + ws[i], img_shp[i])
            region_slices[i].append(slice(start, end))
            region_sizes[i].append(end - start)

    # iterate over non-pooling dimensions
    region_slice = [None] * nd
    for k in numpy.ndindex(*x.shape[:-nd]):
        gzk = gz[k]
        gxk = gx[k]
        # iterate over pooling regions
        for r in numpy.ndindex(*pool_out_shp):
            region_size = 1
            for i in xrange(nd):
                region_slice[i] = region_slices[i][r[i]]
                region_size *= region_sizes[i][r[i]]
            if sum_mode:
                val = gzk[r]
            else:
                # divide by region size
                val = gzk[r] / region_size
            # tuple-index: indexing an ndarray with a *list* of slices is
            # deprecated in numpy (and an error in recent versions)
            gxk[tuple(region_slice)] += val

    # unpad the image
    gx = gx[(slice(None),) * (len(x.shape) - nd) +
            tuple(slice(pad[i], img_shp[i] - pad[i]) for i in xrange(nd))]
    gx_stg[0] = gx
def grad(self, inp, grads):
    """Gradient of AveragePoolGrad: pooling the incoming gradient again.

    Only x (and the gradient itself) carry gradients; ws/stride/pad are
    disconnected.  (Merge residue interleaved the old Pool(...) call with
    the new ndim-aware one; only the latter is kept.)
    """
    x, gz, ws, stride, pad = inp
    ggx, = grads
    return ([theano.tensor.zeros_like(x),
             Pool(ignore_border=self.ignore_border,
                  ndim=self.ndim, mode=self.mode)(ggx, ws, stride, pad)] +
            [DisconnectedType()() for i in inp[2:]])
def connection_pattern(self, node):
    """Gradient flows through x and gz only; ws, stride and pad are
    disconnected inputs."""
    connected = [[1], [1]]
    disconnected = [[0] for _ in range(3)]
    return connected + disconnected
def c_code(self, node, name, inp, out, sub):
    """Generate C code for the sum/average pooling gradient.

    Builds one C source string by concatenating fragments: a fixed
    prologue (sanity checks, shape/stride/pad extraction, output
    allocation), then one nested loop per pooling dimension (emitted
    ``nd`` times), the per-region gradient contribution, the inner
    accumulation loops, and matching closing braces.  Most ``%(...)s``
    placeholders are resolved only by the final ``ccode % locals()``.
    """
    x, gz, ws, stride, pad = inp
    gx, = out
    # nd = number of pooling dimensions; default: all but the first two
    nd = self.ndim
    if nd is None:
        nd = node.inputs[0].ndim - 2
    total_ndim = node.inputs[0].ndim
    non_pool_ndim = total_ndim - nd
    fail = sub['fail']
    # C-level booleans (0/1) selecting the averaging variant
    inc_pad = int(self.mode == 'average_inc_pad')
    sum_mode = int(self.mode == 'sum')
    if self.openmp:
        # run in parallel over each pooling block
        omp_parallel = '#pragma omp parallel for private(r_st, r_end, r_pad_width, r_idx, i_idx, o_idx) schedule(static)'
    else:
        omp_parallel = ''
    # Fixed prologue.  NOTE(review): '*PyArray_DIMS(...) != total_ndim'
    # below compares the first *dimension* to the rank — looks like it was
    # meant to be PyArray_NDIM; confirm before touching the C code.
    ccode = """
        // sanity checks
        int x_typenum = PyArray_ObjectType((PyObject*)%(x)s, 0);
        int gz_typenum = PyArray_ObjectType((PyObject*)%(gz)s, 0);
        if (x_typenum != gz_typenum)
        {
            PyErr_SetString(PyExc_ValueError, "input types must all match");
            %(fail)s;
        }
        if(PyArray_NDIM(%(x)s)!=%(total_ndim)s)
        {
            PyErr_SetString(PyExc_ValueError, "x must be a %(total_ndim)sD ndarray");
            %(fail)s;
        }
        if(PyArray_NDIM(%(gz)s)!=%(total_ndim)s)
        {
            PyErr_SetString(PyExc_ValueError, "gz must be a %(total_ndim)sD ndarray");
            %(fail)s;
        }
        if(PyArray_DIM(%(ws)s, 0)!=%(nd)s)
        {
            PyErr_SetString(PyExc_ValueError, "ws must be a vector of size %(nd)s");
            %(fail)s;
        }
        if(PyArray_DIM(%(stride)s, 0)!=%(nd)s)
        {
            PyErr_SetString(PyExc_ValueError, "stride must be a vector of size %(nd)s");
            %(fail)s;
        }
        if(PyArray_DIM(%(pad)s, 0)!=%(nd)s)
        {
            PyErr_SetString(PyExc_ValueError, "pad must be a vector of size %(nd)s");
            %(fail)s;
        }
        int z[%(nd)s]; // shape of the output
        int r[%(nd)s]; // shape of the padded_input
        int ws[%(nd)s];
        int st[%(nd)s];
        int pd[%(nd)s];
        int nonzero_padding;
        nonzero_padding = 0;
        for (int i=0; i<%(nd)s; i++)
        {
            ws[i] = *((npy_intp*)PyArray_GETPTR1(%(ws)s, i));
            st[i] = *((npy_intp*)PyArray_GETPTR1(%(stride)s, i));
            pd[i] = *((npy_intp*)PyArray_GETPTR1(%(pad)s, i));
            z[i] = PyArray_DIMS(%(gz)s)[%(non_pool_ndim)s + i];
            r[i] = PyArray_DIMS(%(x)s)[%(non_pool_ndim)s + i] + 2 * pd[i];
            if (pd[i]>0)
                nonzero_padding = 1;
        }
        if (!%(inc_pad)s && !%(sum_mode)s && nonzero_padding)
        {
            PyErr_SetString(PyExc_ValueError,
              "padding must be zero for average_exc_pad");
            %(fail)s;
        }
        // allocating memory for output, if necessary
        int mem_nec;
        mem_nec = 0;
        if ((!%(gx)s) || !PyArray_ISCONTIGUOUS(%(gx)s)
            || *PyArray_DIMS(%(gx)s)!=%(total_ndim)s)
        {
            mem_nec = 1;
        }
        if (!mem_nec)
        {
            for (int i=0; i<%(total_ndim)s; i++)
            {
                if (PyArray_DIMS(%(gx)s)[i] != PyArray_DIMS(%(x)s)[i])
                {
                    mem_nec = 1;
                    break;
                }
            }
        }
        if (mem_nec)
        {
          Py_XDECREF(%(gx)s);
          %(gx)s = (PyArrayObject*) PyArray_ZEROS(%(total_ndim)s, PyArray_DIMS(%(x)s), x_typenum,0);
        }
        else {
          PyArray_FILLWBYTE(%(gx)s, 0);
        }
        int z_prod;
        // do not run if any z[i] is zero
        z_prod = 1;
        for (int i=0; i<%(nd)s; i++)
        {
            z_prod *= z[i];
        }
        if (z_prod)
        {
            // will be used to hold start and end index of a region
            int r_st[%(nd)s];
            int r_end[%(nd)s];
            // padded region size
            int r_pad_width[%(nd)s];
            // index for iterating over the pooling regions
            int r_idx[%(nd)s];
            // placeholder for PyArray indexing (output)
            long int o_idx[%(total_ndim)s];
            // placeholder for PyArray indexing (input)
            long int i_idx[%(total_ndim)s];
            // loop over non-pooling dimensions
            int non_pooling_prod = 1;
            for (int i=0; i<%(non_pool_ndim)s; i++)
            {
                non_pooling_prod *= PyArray_DIMS(%(x)s)[i];
            }
            %(omp_parallel)s
            // first loop over non-pooling dimensions
            for (int t=0; t<non_pooling_prod; t++)
            {
                // compute the non-pooling index in each dimension
                if (%(non_pool_ndim)s!=0)
                {
                    o_idx[0] = t;
                    i_idx[0] = t;
                    for (int i=1; i<%(non_pool_ndim)s; i++)
                    {
                        o_idx[i] = o_idx[i - 1] / PyArray_DIMS(%(x)s)[i - 1];
                        o_idx[i - 1] =o_idx[i - 1] %% PyArray_DIMS(%(x)s)[i - 1];
                        i_idx[i] = o_idx[i];
                        i_idx[i - 1] = o_idx[i - 1];
                    }
                }
                // then loop over each region in each pooling dimension
    """
    # one opening loop per pooling dimension; this fragment is interpolated
    # immediately with its own dict (i, sum_mode, inc_pad, non_pool_ndim)
    for i in xrange(nd):
        ccode += """
                for (r_idx[%(i)s]=0; r_idx[%(i)s] < z[%(i)s]; r_idx[%(i)s]++) {
                  r_st[%(i)s] = r_idx[%(i)s] * st[%(i)s];
                  if (!%(sum_mode)s && !%(inc_pad)s && r_st[%(i)s] < pd[%(i)s])
                  {
                    r_st[%(i)s] = pd[%(i)s];
                  }
                  r_end[%(i)s] = r_st[%(i)s] + ws[%(i)s];
                  r_end[%(i)s] = r_end[%(i)s] > r[%(i)s] ? r[%(i)s] : r_end[%(i)s];
                  r_pad_width[%(i)s] = r_end[%(i)s] - r_st[%(i)s];
                  // from padded_img space to img space
                  r_st[%(i)s] = r_st[%(i)s] - pd[%(i)s] > 0 ? r_st[%(i)s] - pd[%(i)s] : 0;
                  r_end[%(i)s] = r_end[%(i)s] > r[%(i)s] - pd[%(i)s] ? r[%(i)s] - 2 * pd[%(i)s] : r_end[%(i)s] - pd[%(i)s];
                  // use the index to find the correct position in the output
                  o_idx[%(non_pool_ndim)s + %(i)s] = r_idx[%(i)s];
        """ % dict(i=i, sum_mode=sum_mode, inc_pad=inc_pad, non_pool_ndim=non_pool_ndim)
    # per-region contribution; %(region_size)s here is resolved only by
    # the final `ccode % locals()` (region_size is assigned just below)
    ccode += """
                  dtype_%(gz)s * gz;
                  dtype_%(gz)s val;
                  if (%(total_ndim)s == 4)
                  {
                    // the gradient for this region
                    gz = ((dtype_%(gz)s*)(PyArray_GETPTR4(%(gz)s, o_idx[0],o_idx[1],o_idx[2],o_idx[3])));
                  }
                  else
                  {
                    // the gradient for this region
                    gz = ((dtype_%(gz)s*)(PyArray_GetPtr(%(gz)s, o_idx)));
                  }
                  // compute the contribution
                  if (%(sum_mode)s)
                  {
                    val = gz[0];
                  }
                  else
                  {
                    val = gz[0] / (%(region_size)s);
                  }
    """
    region_size = ' * '.join('r_pad_width[%d]' % i for i in xrange(nd))
    # inner loops walking the pooled region in the unpadded input
    for i in xrange(nd):
        ccode += """
                  // go through the pooled region in the unpadded input
                  for(int m%(i)s=r_st[%(i)s]; m%(i)s<r_end[%(i)s]; m%(i)s++)
                  {
                    i_idx[%(non_pool_ndim)s + %(i)s] = m%(i)s;
        """ % dict(i=i, non_pool_ndim=non_pool_ndim)
    ccode += """
                    dtype_%(gx)s * gx;
                    if (%(total_ndim)s == 4)
                    {
                      gx = ((dtype_%(gx)s*)(PyArray_GETPTR4(%(gx)s, i_idx[0],i_idx[1],i_idx[2],i_idx[3])));
                    }
                    else
                    {
                      gx = ((dtype_%(gx)s*)(PyArray_GetPtr(%(gx)s, i_idx)));
                    }
                    gx[0] = gx[0] + val;
    """
    # closing braces for the inner region loops
    for i in xrange(nd):
        ccode += """
                  } // for loop over region
        """
    # closing braces for the outer per-dimension loops
    for i in xrange(nd):
        ccode += """
                } // loop over pooling dimension
        """
    ccode += """
            } // for loop over non-pooling dimensions
        } // if z_prod
    """
    # final interpolation fills in x/gz/gx names, dims, region_size, etc.
    return ccode % locals()
def c_code_cache_version(self):
    """Version key for the generated C code cache; includes the openmp
    flag since it changes the emitted code."""
    version = (0, 3, self.openmp)
    return version
class DownsampleFactorMaxGradGrad(OpenMPOp):
# Op properties for equality/hashing; 'ndim' added so ops with different
# pooling dimensionality compare unequal.  (Merge residue duplicated this
# assignment; only the ndim-aware one is kept.)
__props__ = ('ndim', 'ignore_border', 'mode')
def __init__(self, ignore_border, mode='max', ndim=None, openmp=None):
    """Second-order gradient helper for max pooling.

    Only mode='max' is supported (asserted below).  ``ndim`` is the
    number of pooling dimensions (None means input.ndim - 2).
    (Merge residue duplicated the old 2D-only def line; only the
    ndim-aware signature is kept.)
    """
    self.ndim = ndim
    self.ignore_border = ignore_border
    self.mode = mode
    super(DownsampleFactorMaxGradGrad, self).__init__(openmp=openmp)
    assert self.mode == 'max'
def make_node(self, x, maxout, gz, ws, stride=None, pad=(0, 0)):
def make_node(self, x, maxout, gz, ws, stride=None, pad=None):
# make_node should only be called by the grad function of
# MaxPoolGrad, so these asserts should not fail.
x = tensor.as_tensor_variable(x)
maxout = tensor.as_tensor_variable(maxout)
gz = tensor.as_tensor_variable(gz)
assert x.ndim == 4
assert maxout.ndim == 4
assert gz.ndim == 4
nd = self.ndim
if nd is None:
nd = x.ndim - 2
if stride is None:
stride = ws
if isinstance(pad, (tuple, list)):
if tuple(pad) != (0, 0) and not self.ignore_border:
if pad is None:
pad = (0,) * nd
elif isinstance(pad, (tuple, list)):
if max(pad) != 0 and not self.ignore_border:
raise NotImplementedError(
'padding works only with ignore_border=True')
if isinstance(ws, (tuple, list)):
if pad[0] >= ws[0] or pad[1] >= ws[1]:
if any(pad[i] >= ws[i] for i in range(nd)):
raise NotImplementedError(
'padding_h and padding_w must be smaller than strides')
ws = tensor.as_tensor_variable(ws)
......@@ -1070,6 +1541,7 @@ class DownsampleFactorMaxGradGrad(OpenMPOp):
assert ws.ndim == 1
assert stride.ndim == 1
assert pad.ndim == 1
assert x.ndim == maxout.ndim == gz.ndim >= nd
if not ws.dtype.startswith('int'):
raise TypeError('Pool downsample parameters must be ints.')
if not stride.dtype.startswith('int'):
......@@ -1081,49 +1553,56 @@ class DownsampleFactorMaxGradGrad(OpenMPOp):
def perform(self, node, inp, out):
# NOTE(review): this whole method is a rendered diff -- the pre-change
# 2-D implementation and its N-D replacement are interleaved line by
# line and the original indentation has been stripped.  Flagging rather
# than rewriting; the N-D lines appear to be the intended final version.
x, maxout, ggx, ws, stride, pad = inp
z, = out
# NOTE(review): old 2-D assert (next line) and its N-D replacement below
assert ws.shape == stride.shape == pad.shape == (2,)
if len(x.shape) != 4:
nd = self.ndim
if nd is None:
# default: pool over all but the two leading (batch, channel) axes
nd = len(x.shape) - 2
assert ws.shape == stride.shape == pad.shape == (nd,)
if len(x.shape) < nd:
raise NotImplementedError(
'DownsampleFactorMaxGradGrad requires 4D input for now')
'DownsampleFactorMaxGradGrad requires input '
'with {} or more dimensions'.format(nd))
if (z[0] is None) or (z[0].shape != maxout.shape):
z[0] = numpy.zeros(maxout.shape, dtype=x.dtype)
ggz = z[0] # grad wrt maxout_grad has the same shape as maxout
# NOTE(review): old 2-D shape bookkeeping (pr/pc, ws0/ws1, ...) followed
# by its N-D equivalents (pool_out_shp, img_shp)
# number of pooling output rows
pr = ggz.shape[-2]
# number of pooling output cols
pc = ggz.shape[-1]
ws0, ws1 = ws
st0, st1 = stride
pd0, pd1 = pad
img_rows = x.shape[-2] + 2 * pd0
img_cols = x.shape[-1] + 2 * pd1
# size of pooling output
pool_out_shp = ggz.shape[-nd:]
img_shp = tuple(x.shape[-nd + i] + 2 * pad[i] for i in xrange(nd))
# pad the image and its gradients
# NOTE(review): old condition `pd0 != 0 and pd1 != 0` vs. new
# `max(pad) > 0` -- the new test also pads when only one axis is padded
if pd0 != 0 and pd1 != 0:
y_padded = numpy.zeros(
(x.shape[0], x.shape[1], img_rows, img_cols),
dtype=x.dtype) + x.min() - 1
y_padded[:, :, pd0:(img_rows - pd0), pd1:(img_cols - pd1)] = x
ggx_padded = numpy.zeros(
(x.shape[0], x.shape[1], img_rows, img_cols),
dtype=x.dtype)
ggx_padded[:, :, pd0:(img_rows - pd0), pd1:(img_cols - pd1)] = ggx
if max(pad) > 0:
y_padded = numpy.zeros(x.shape[:-nd] + img_shp, dtype=x.dtype)
y_padded[(slice(None),) * (len(x.shape) - nd) +
tuple(slice(pad[i], img_shp[i] - pad[i]) for i in xrange(nd))] = x
ggx_padded = numpy.zeros(x.shape[:-nd] + img_shp, dtype=x.dtype)
ggx_padded[(slice(None),) * (len(x.shape) - nd) +
tuple(slice(pad[i], img_shp[i] - pad[i]) for i in xrange(nd))] = ggx
else:
y_padded = x
ggx_padded = ggx
# NOTE(review): old explicit 4-D loop nest (next block) followed by the
# N-D replacement driven by numpy.ndindex / itertools.product
for n in xrange(x.shape[0]):
for k in xrange(x.shape[1]):
for r in xrange(pr):
row_st = r * st0
row_end = builtins.min(row_st + ws0, img_rows)
for c in xrange(pc):
col_st = c * st1
col_end = builtins.min(col_st + ws1, img_cols)
for row_ind in xrange(row_st, row_end):
for col_ind in xrange(col_st, col_end):
if (maxout[n, k, r, c] == y_padded[n, k, row_ind, col_ind]):
ggz[n, k, r, c] = ggx_padded[n, k, row_ind, col_ind]
# precompute the region boundaries for each dimension
region_ranges = [[] for i in xrange(nd)]
for i in xrange(nd):
for j in xrange(pool_out_shp[i]):
start = j * stride[i]
end = builtins.min(start + ws[i], img_shp[i])
region_ranges[i].append(xrange(start, end))
# iterate over non-pooling dimensions
for k in numpy.ndindex(*x.shape[:-nd]):
ggxk = ggx_padded[k]
ggzk = ggz[k]
yk = y_padded[k]
maxoutk = maxout[k]
# iterate over pooling regions
for r in numpy.ndindex(*pool_out_shp):
# iterate inside region
maxout_value = maxoutk[r]
for c in itertools.product(*[region_ranges[i][r[i]]
for i in xrange(nd)]):
if maxout_value == yk[c]:
ggzk[r] += ggxk[c]
def infer_shape(self, node, in_shapes):
# The grad-of-grad output has the shape of `maxout` (the second input).
return [in_shapes[1]]
......@@ -1133,8 +1612,9 @@ class DownsampleFactorMaxGradGrad(OpenMPOp):
gz, = grads
return [theano.tensor.zeros_like(x),
theano.tensor.zeros_like(maxout),
MaxPoolGrad(ignore_border=self.ignore_border)(x, maxout, gz,
ws, stride, pad),
MaxPoolGrad(ignore_border=self.ignore_border,
ndim=self.ndim)(x, maxout, gz,
ws, stride, pad),
DisconnectedType()(),
DisconnectedType()(),
DisconnectedType()()]
......@@ -1147,106 +1627,182 @@ class DownsampleFactorMaxGradGrad(OpenMPOp):
raise theano.gof.utils.MethodNotDefined()
x, maxout, ggx, ws, stride, pad = inp
z, = out # the grad of grad
nd = self.ndim
if nd is None:
nd = node.inputs[0].ndim - 2
total_ndim = node.inputs[0].ndim
non_pool_ndim = total_ndim - nd
fail = sub['fail']
ignore_border = int(self.ignore_border)
if self.openmp:
omp_parallel = '#pragma omp parallel for private(r_st, r_end, c_st, c_end, maximum) schedule(static)'
# run in parallel over each pooling block
omp_parallel = '#pragma omp parallel for private(r_st, r_end, r_idx, i_idx, o_idx, maximum) schedule(static)'
else:
omp_parallel = ''
return """
int ws0, ws1, st0, st1, pd0, pd1;
ccode = """
int z_typenum = PyArray_ObjectType((PyObject*)%(maxout)s, 0);
int z_r, z_c;
int r, c; // shape of the padded_input
if(PyArray_DIM(%(ws)s, 0)!=2)
int z[%(nd)s]; // shape of the output
int r[%(nd)s]; // shape of the padded_input
int ws[%(nd)s];
int st[%(nd)s];
int pd[%(nd)s];
if(PyArray_DIM(%(ws)s, 0)!=%(nd)s)
{
PyErr_SetString(PyExc_ValueError, "ws must be a vector of size 2");
PyErr_SetString(PyExc_ValueError, "ws must be a vector of size %(nd)s");
%(fail)s;
}
if(PyArray_DIM(%(stride)s, 0)!=2)
if(PyArray_DIM(%(stride)s, 0)!=%(nd)s)
{
PyErr_SetString(PyExc_ValueError, "stride must be a vector of size 2");
PyErr_SetString(PyExc_ValueError, "stride must be a vector of size %(nd)s");
%(fail)s;
}
if(PyArray_DIM(%(pad)s, 0)!=2)
if(PyArray_DIM(%(pad)s, 0)!=%(nd)s)
{
PyErr_SetString(PyExc_ValueError, "pad must be a vector of size 2");
PyErr_SetString(PyExc_ValueError, "pad must be a vector of size %(nd)s");
%(fail)s;
}
// Getting ws, stride and pad
ws0 = *((npy_intp*)PyArray_GETPTR1(%(ws)s, 0));
ws1 = *((npy_intp*)PyArray_GETPTR1(%(ws)s, 1));
st0 = *((npy_intp*)PyArray_GETPTR1(%(stride)s, 0));
st1 = *((npy_intp*)PyArray_GETPTR1(%(stride)s, 1));
pd0 = *((npy_intp*)PyArray_GETPTR1(%(pad)s, 0));
pd1 = *((npy_intp*)PyArray_GETPTR1(%(pad)s, 1));
z_r = PyArray_DIMS(%(maxout)s)[2];
z_c = PyArray_DIMS(%(maxout)s)[3];
r = PyArray_DIMS(%(x)s)[2];
c = PyArray_DIMS(%(x)s)[3];
r += pd0 * 2;
c += pd1 * 2;
// allocating memory for output
if ((!%(z)s)
|| !PyArray_ISCONTIGUOUS(%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(PyArray_DIMS(%(z)s)[0] != PyArray_DIMS(%(maxout)s)[0])
||(PyArray_DIMS(%(z)s)[1] != PyArray_DIMS(%(maxout)s)[1])
||(PyArray_DIMS(%(z)s)[2] != PyArray_DIMS(%(maxout)s)[2])
||(PyArray_DIMS(%(z)s)[3] != PyArray_DIMS(%(maxout)s)[3])
)
for (int i=0; i<%(nd)s; i++)
{
ws[i] = *((npy_intp*)PyArray_GETPTR1(%(ws)s, i));
st[i] = *((npy_intp*)PyArray_GETPTR1(%(stride)s, i));
pd[i] = *((npy_intp*)PyArray_GETPTR1(%(pad)s, i));
z[i] = PyArray_DIMS(%(maxout)s)[%(non_pool_ndim)s + i];
r[i] = PyArray_DIMS(%(x)s)[%(non_pool_ndim)s + i] + 2 * pd[i];
}
// allocating memory for output, if necessary
int mem_nec;
mem_nec = 0;
if ((!%(z)s) || !PyArray_ISCONTIGUOUS(%(z)s)
|| *PyArray_DIMS(%(z)s)!=%(total_ndim)s)
{
mem_nec = 1;
}
if (!mem_nec)
{
for (int i=0; i<%(total_ndim)s; i++)
{
if (PyArray_DIMS(%(z)s)[i] != PyArray_DIMS(%(maxout)s)[i])
{
mem_nec = 1;
break;
}
}
}
if (mem_nec)
{
Py_XDECREF(%(z)s);
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, PyArray_DIMS(%(maxout)s), z_typenum,0);
%(z)s = (PyArrayObject*) PyArray_ZEROS(%(total_ndim)s, PyArray_DIMS(%(maxout)s), z_typenum,0);
}
else {
PyArray_FILLWBYTE(%(z)s, 0);
}
dtype_%(maxout)s maximum; // temp var for maximum value in a region
int r_st, r_end, c_st, c_end;
// will be used to hold start and end index of a region
int r_st[%(nd)s];
int r_end[%(nd)s];
// index for iterating over the pooling regions
int r_idx[%(nd)s];
// placeholder for PyArray indexing (output)
long int o_idx[%(total_ndim)s];
// placeholder for PyArray indexing (input)
long int i_idx[%(total_ndim)s];
// loop over non-pooling dimensions
int non_pooling_prod;
non_pooling_prod = 1;
for (int i=0; i<%(non_pool_ndim)s; i++)
{
non_pooling_prod *= PyArray_DIMS(%(x)s)[i];
}
%(omp_parallel)s
for(int t = 0; t < PyArray_DIMS(%(x)s)[0] * PyArray_DIMS(%(x)s)[1]; t++){
int b = t %% PyArray_DIMS(%(x)s)[0];
int k = t / PyArray_DIMS(%(x)s)[0];
for(int i=0; i < z_r; i++){
r_st = i * st0;
r_end = r_st + ws0;
// first loop over non-pooling dimensions
for (int t=0; t<non_pooling_prod; t++)
{
// compute the non-pooling index in each dimension
if (%(non_pool_ndim)s!=0)
{
o_idx[0] = t;
i_idx[0] = t;
for (int i=1; i<%(non_pool_ndim)s; i++)
{
o_idx[i] = o_idx[i - 1] / PyArray_DIMS(%(x)s)[i - 1];
o_idx[i - 1] = o_idx[i - 1] %% PyArray_DIMS(%(x)s)[i - 1];
i_idx[i] = o_idx[i];
i_idx[i - 1] = o_idx[i - 1];
}
}
// then loop over each region in each pooling dimension
"""
for i in xrange(nd):
ccode += """
for (r_idx[%(i)s]=0; r_idx[%(i)s] < z[%(i)s]; r_idx[%(i)s]++) {
r_st[%(i)s] = r_idx[%(i)s] * st[%(i)s];
r_end[%(i)s] = r_st[%(i)s] + ws[%(i)s];
// skip the padding
r_st = r_st < pd0 ? pd0 : r_st;
r_end = r_end > (r - pd0) ? r - pd0 : r_end;
r_st[%(i)s] = r_st[%(i)s] < pd[%(i)s] ? pd[%(i)s] : r_st[%(i)s];
r_end[%(i)s] = r_end[%(i)s] > (r[%(i)s] - pd[%(i)s]) ? r[%(i)s] - pd[%(i)s] : r_end[%(i)s];
// from padded_img space to img space
r_st -= pd0;
r_end -= pd0;
for(int j=0; j<z_c; j++){
c_st = j * st1;
c_end = c_st + ws1;
// skip the padding
c_st = c_st < pd1 ? pd1 : c_st;
c_end = c_end > (c - pd1) ? c - pd1 : c_end;
// from padding_img space into img space
c_st -= pd1;
c_end -= pd1;
r_st[%(i)s] -= pd[%(i)s];
r_end[%(i)s] -= pd[%(i)s];
// use the index to find the correct position in the output
o_idx[%(non_pool_ndim)s + %(i)s] = r_idx[%(i)s];
""" % dict(i=i, non_pool_ndim=non_pool_ndim)
ccode += """
dtype_%(z)s * z;
if (%(total_ndim)s == 4)
{
// the maximum value
maximum = ((dtype_%(maxout)s*)(PyArray_GETPTR4(%(maxout)s,o_idx[0],o_idx[1],o_idx[2],o_idx[3])))[0];
// z at this position
z = ((dtype_%(z)s*)(PyArray_GETPTR4(%(z)s,o_idx[0],o_idx[1],o_idx[2],o_idx[3])));
}
else
{
// the maximum value
maximum = ((dtype_%(maxout)s*)(PyArray_GETPTR4(%(maxout)s,b,k,i,j)))[0];
maximum = ((dtype_%(maxout)s*)(PyArray_GetPtr(%(maxout)s,o_idx)))[0];
// z at this position
dtype_%(z)s * z = ((dtype_%(z)s*)(PyArray_GETPTR4(%(z)s, b, k, i, j)));
// go through the pooled region in the unpadded input
for(int m=r_st; m<r_end; m++)
z = ((dtype_%(z)s*)(PyArray_GetPtr(%(z)s,o_idx)));
}
"""
for i in xrange(nd):
ccode += """
// go through the pooled region in the unpadded input
for(int m%(i)s=r_st[%(i)s]; m%(i)s<r_end[%(i)s]; m%(i)s++)
{
i_idx[%(non_pool_ndim)s + %(i)s] = m%(i)s;
""" % dict(i=i, non_pool_ndim=non_pool_ndim)
ccode += """
dtype_%(x)s a;
dtype_%(ggx)s * ggx;
if (%(total_ndim)s == 4)
{
for(int n=c_st; n<c_end; n++)
{
dtype_%(x)s a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,b,k,m,n)))[0];
dtype_%(ggx)s * ggx = (
(dtype_%(ggx)s*)(PyArray_GETPTR4(%(ggx)s, b, k, m, n)));
if (a == maximum){
z[0] += ggx[0];
}
}
a = ((dtype_%(x)s*)(PyArray_GETPTR4(%(x)s,i_idx[0],i_idx[1],i_idx[2],i_idx[3])))[0];
ggx = ((dtype_%(ggx)s*)(PyArray_GETPTR4(%(ggx)s,i_idx[0],i_idx[1],i_idx[2],i_idx[3])));
}
}
}
}
""" % locals()
else
{
a = ((dtype_%(x)s*)(PyArray_GetPtr(%(x)s,i_idx)))[0];
ggx = ((dtype_%(ggx)s*)(PyArray_GetPtr(%(ggx)s,i_idx)));
}
if (a == maximum){
z[0] += ggx[0];
}
"""
for i in xrange(nd):
ccode += """
} // for loop over region
"""
for i in xrange(nd):
ccode += """
} // loop over pooling dimension
"""
ccode += """
} // for loop over non-pooling dimensions
"""
return ccode % locals()
def c_code_cache_version(self):
# NOTE(review): rendered diff -- both the pre-change `(0, 3, ...)` and
# post-change `(0, 4, ...)` return lines are present; only the first
# would execute.  The post-change line bumps the cache key for the new
# N-D C implementation.
return (0, 3, self.openmp)
return (0, 4, self.openmp)
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
from nose_parameterized import parameterized
from itertools import product
import os
import unittest
......@@ -14,7 +15,7 @@ import numpy
import theano
import theano.tensor as tensor
from theano.tests import unittest_tools as utt
from theano.tensor.signal.pool import (Pool, pool_2d,
from theano.tensor.signal.pool import (Pool, pool_2d, pool_3d,
MaxPoolGrad, AveragePoolGrad,
max_pool_2d_same_size,
DownsampleFactorMaxGradGrad)
......@@ -57,6 +58,36 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
output_val[k][i, j] = func(patch)
return output_val
@staticmethod
def numpy_max_pool_nd(input, ds, ignore_border=False, mode='max'):
    """Reference N-dimensional pooling implemented in pure numpy.

    Pools over the last ``len(ds)`` axes of `input`.

    Parameters
    ----------
    input : numpy.ndarray
        Array with at least ``len(ds)`` dimensions.
    ds : tuple of int
        Pooling window size, one entry per pooled (trailing) axis.
    ignore_border : bool
        When False, each axis whose size is not a multiple of the window
        gets one extra (partial) output element.
    mode : str
        'max', 'sum', or any other value for average pooling.

    Returns
    -------
    numpy.ndarray
        The pooled output.

    Raises
    ------
    NotImplementedError
        If `input` has fewer than ``len(ds)`` dimensions.
    """
    nd = len(ds)
    if len(input.shape) < nd:
        # Report the required rank; the previous message interpolated the
        # ``ds`` tuple itself where a dimension count was intended.
        raise NotImplementedError('input should have at least %d dims,'
                                  ' shape is %s'
                                  % (nd, str(input.shape)))
    # one extra output element per axis with a partial border window
    si = [0] * nd
    if not ignore_border:
        for i in range(nd):
            if input.shape[-nd + i] % ds[i]:
                si[i] += 1
    out_shp = list(input.shape[:-nd])
    for i in range(nd):
        out_shp.append(input.shape[-nd + i] // ds[i] + si[i])
    output_val = numpy.zeros(out_shp)
    # reduction applied to each pooling region
    func = numpy.max
    if mode == 'sum':
        func = numpy.sum
    elif mode != 'max':
        func = numpy.average
    for nonpool_idx in numpy.ndindex(*input.shape[:-nd]):
        for pool_idx in numpy.ndindex(*output_val.shape[-nd:]):
            patch = input[nonpool_idx][
                tuple(slice(pool_idx[i] * ds[i], (pool_idx[i] + 1) * ds[i])
                      for i in range(nd))]
            output_val[nonpool_idx][pool_idx] = func(patch)
    return output_val
@staticmethod
def numpy_max_pool_2d_stride_padding(
x, ds, ignore_border=True, st=None, padding=(0, 0), mode='max'):
......@@ -111,6 +142,60 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
output_val[k][i, j] = func(patch)
return output_val
@staticmethod
def numpy_max_pool_nd_stride_padding(
        input, ds, ignore_border=True, st=None, padding=None, mode='max'):
    """Reference N-dimensional strided, padded pooling in pure numpy.

    Zero-pads each of the last ``len(ds)`` axes of `input` by
    ``padding[i]`` on both sides, then pools with window `ds` and
    stride `st`.  Only ``ignore_border=True`` is supported.

    Parameters
    ----------
    input : numpy.ndarray
        Array whose trailing ``len(ds)`` axes are pooled.
    ds : tuple of int
        Pooling window size; must satisfy ``ds[i] > padding[i]``.
    ignore_border : bool
        Must be True.
    st : tuple of int
        Pooling stride, same length as `ds`.
    padding : tuple of int
        Per-axis zero padding (default: no padding).
    mode : str
        'max', 'sum', 'average_inc_pad' (regions include the padding) or
        any other value for padding-excluding average.

    Returns
    -------
    numpy.ndarray
        The pooled output.
    """
    assert ignore_border
    nd = len(ds)
    if padding is None:
        padding = (0,) * nd
    if st is None:
        # NOTE(review): a default stride of 0 would divide by zero in the
        # output-shape computation below; callers apparently always pass
        # ``st`` explicitly -- confirm intent.
        st = (0,) * nd
    assert len(padding) == len(ds) == len(st)
    assert all(ds[i] > padding[i] for i in range(nd))

    def pad_img(x):
        # zero-pad the pooled axes, placing x in the center
        y = numpy.zeros(
            x.shape[0:-nd] +
            tuple(x.shape[-nd + i] + padding[i] * 2 for i in range(nd)),
            dtype=x.dtype)
        block = ((slice(None),) * (len(x.shape) - nd) +
                 tuple(slice(padding[i], x.shape[-nd + i] + padding[i])
                       for i in range(nd)))
        y[block] = x
        return y

    pad_img_shp = list(input.shape[:-nd])
    out_shp = list(input.shape[:-nd])
    for i in range(nd):
        padded_size = input.shape[-nd + i] + 2 * padding[i]
        pad_img_shp.append(padded_size)
        out_shp.append((padded_size - ds[i]) // st[i] + 1)
    output_val = numpy.zeros(out_shp)
    padded_input = pad_img(input)
    # reduction applied to each pooling region
    func = numpy.max
    if mode == 'sum':
        func = numpy.sum
    elif mode != 'max':
        func = numpy.average
    inc_pad = mode == 'average_inc_pad'
    for l in numpy.ndindex(*input.shape[:-nd]):
        for r in numpy.ndindex(*output_val.shape[-nd:]):
            region = []
            for i in range(nd):
                r_st = r[i] * st[i]
                r_end = builtins.min(r_st + ds[i], pad_img_shp[-nd + i])
                if not inc_pad:
                    # clip the region so it excludes the padding
                    r_st = builtins.max(r_st, padding[i])
                    r_end = builtins.min(r_end,
                                         input.shape[-nd + i] + padding[i])
                region.append(slice(r_st, r_end))
            # index with a tuple: modern numpy rejects a *list* of slices
            # as a multi-axis index
            patch = padded_input[l][tuple(region)]
            output_val[l][r] = func(patch)
    return output_val
@staticmethod
def numpy_max_pool_2d_stride(input, ds, ignore_border=False, st=None,
mode='max'):
......@@ -174,84 +259,167 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
output_val[k][i, j] = func(patch)
return output_val
@staticmethod
def numpy_max_pool_nd_stride(input, ds, ignore_border=False, st=None,
                             mode='max'):
    """Reference N-dimensional strided pooling in pure numpy.

    Pools over the last ``len(ds)`` axes of `input` using stride `st`
    for the pooling regions; when `st` is not given the stride equals
    the window size `ds`.

    Parameters
    ----------
    input : numpy.ndarray
        Array whose trailing ``len(ds)`` axes are pooled.
    ds : tuple of int
        Pooling window size.
    ignore_border : bool
        When False, a trailing partial window adds one output element
        per axis.
    st : tuple of int, optional
        Pooling stride (default: `ds`).
    mode : str
        'max', 'sum', or any other value for average pooling.

    Returns
    -------
    numpy.ndarray
        The pooled output.
    """
    nd = len(ds)
    if st is None:
        st = ds
    assert len(st) == len(ds)

    out_shp = list(input.shape[:-nd])
    for i in range(nd):
        out = 0
        if input.shape[-nd + i] - ds[i] >= 0:
            out = (input.shape[-nd + i] - ds[i]) // st[i] + 1
        if not ignore_border:
            # account for a final partial window past the last full one
            if out > 0:
                if input.shape[-nd + i] - ((out - 1) * st[i] + ds[i]) > 0:
                    if input.shape[-nd + i] - out * st[i] > 0:
                        out += 1
            else:
                if input.shape[-nd + i] > 0:
                    out += 1
        out_shp.append(out)

    # reduction applied to each pooling region
    func = numpy.max
    if mode == 'sum':
        func = numpy.sum
    elif mode != 'max':
        func = numpy.average

    output_val = numpy.zeros(out_shp)
    for l in numpy.ndindex(*input.shape[:-nd]):
        for r in numpy.ndindex(*output_val.shape[-nd:]):
            region = []
            for i in range(nd):
                r_st = r[i] * st[i]
                r_end = builtins.min(r_st + ds[i], input.shape[-nd + i])
                region.append(slice(r_st, r_end))
            # index with a tuple: modern numpy rejects a *list* of slices
            # as a multi-axis index
            patch = input[l][tuple(region)]
            output_val[l][r] = func(patch)
    return output_val
def test_DownsampleFactorMax(self):
rng = numpy.random.RandomState(utt.fetch_seed())
# generate random images
maxpoolshps = ((1, 1), (2, 2), (3, 3), (2, 3))
imval = rng.rand(4, 2, 16, 16)
images = tensor.dtensor4()
for maxpoolshp, ignore_border, mode in product(maxpoolshps,
[True, False],
['max',
'sum',
'average_inc_pad',
'average_exc_pad']):
# print 'maxpoolshp =', maxpoolshp
# print 'ignore_border =', ignore_border
# Pure Numpy computation
numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
ignore_border,
mode=mode)
# maxpool, input size
examples = (
((2,), (16,)),
((2,), (4, 16,)),
((2,), (4, 2, 16,)),
((1, 1), (4, 2, 16, 16)),
((2, 2), (4, 2, 16, 16)),
((3, 3), (4, 2, 16, 16)),
((3, 2), (4, 2, 16, 16)),
((3, 2, 2), (3, 2, 16, 16, 16)),
((2, 3, 2), (3, 2, 16, 16, 16)),
((2, 2, 3), (3, 2, 16, 16, 16)),
((2, 2, 3, 2), (3, 2, 6, 6, 6, 5)),
)
for example, ignore_border, mode in product(examples,
[True, False],
['max',
'sum',
'average_inc_pad',
'average_exc_pad']):
(maxpoolshp, inputsize) = example
imval = rng.rand(*inputsize)
images = theano.shared(imval)
# Pure Numpy computation
numpy_output_val = self.numpy_max_pool_nd(imval, maxpoolshp,
ignore_border,
mode=mode)
# The pool_2d or pool_3d helper methods
if len(maxpoolshp) == 2:
output = pool_2d(images, maxpoolshp, ignore_border,
mode=mode)
f = function([images, ], [output, ])
output_val = f(imval)
f = function([], [output, ])
output_val = f()
utt.assert_allclose(output_val, numpy_output_val)
# Pool op
maxpool_op = Pool(ignore_border=ignore_border,
mode=mode)(images, maxpoolshp)
output_shape = Pool.out_shape(imval.shape, maxpoolshp,
ignore_border=ignore_border)
utt.assert_allclose(numpy.asarray(output_shape), numpy_output_val.shape)
f = function([images], maxpool_op)
output_val = f(imval)
elif len(maxpoolshp) == 3:
output = pool_3d(images, maxpoolshp, ignore_border,
mode=mode)
f = function([], [output, ])
output_val = f()
utt.assert_allclose(output_val, numpy_output_val)
# Pool op
maxpool_op = Pool(ndim=len(maxpoolshp),
ignore_border=ignore_border,
mode=mode)(images, maxpoolshp)
output_shape = Pool.out_shape(imval.shape, maxpoolshp,
ndim=len(maxpoolshp),
ignore_border=ignore_border)
utt.assert_allclose(numpy.asarray(output_shape), numpy_output_val.shape)
f = function([], maxpool_op)
output_val = f()
utt.assert_allclose(output_val, numpy_output_val)
def test_DownsampleFactorMaxStride(self):
rng = numpy.random.RandomState(utt.fetch_seed())
maxpoolshps = ((1, 1), (3, 3), (5, 3), (16, 16))
stridesizes = ((1, 1), (3, 3), (5, 7),)
# generate random images
imval = rng.rand(4, 10, 16, 16)
# The same for each mode
outputshps = (
(4, 10, 16, 16), (4, 10, 6, 6), (4, 10, 4, 3),
(4, 10, 16, 16), (4, 10, 6, 6), (4, 10, 4, 3),
(4, 10, 14, 14), (4, 10, 5, 5), (4, 10, 3, 2),
(4, 10, 14, 14), (4, 10, 6, 6), (4, 10, 4, 3),
(4, 10, 12, 14), (4, 10, 4, 5), (4, 10, 3, 2),
(4, 10, 12, 14), (4, 10, 5, 6), (4, 10, 4, 3),
(4, 10, 1, 1), (4, 10, 1, 1), (4, 10, 1, 1),
(4, 10, 1, 1), (4, 10, 1, 1), (4, 10, 1, 1),)
images = tensor.dtensor4()
indx = 0
for mode, maxpoolshp, ignore_border in product(['max',
'sum',
'average_inc_pad',
'average_exc_pad'],
maxpoolshps,
[True, False]):
for stride in stridesizes:
outputshp = outputshps[indx % len(outputshps)]
indx += 1
# Pool op
numpy_output_val = \
self.numpy_max_pool_2d_stride(imval, maxpoolshp,
ignore_border, stride,
mode)
assert numpy_output_val.shape == outputshp, (
"outshape is %s, calculated shape is %s"
% (outputshp, numpy_output_val.shape))
maxpool_op = \
Pool(ignore_border=ignore_border, mode=mode)(
images, maxpoolshp, stride)
f = function([images], maxpool_op)
output_val = f(imval)
utt.assert_allclose(output_val, numpy_output_val)
# maxpool, stride, ignore_border, input, output sizes
examples = (
((1, 1), (1, 1), True, (4, 10, 16, 16), (4, 10, 16, 16)),
((1, 1), (3, 3), True, (4, 10, 16, 16), (4, 10, 6, 6)),
((1, 1), (5, 7), True, (4, 10, 16, 16), (4, 10, 4, 3)),
((1, 1), (1, 1), False, (4, 10, 16, 16), (4, 10, 16, 16)),
((1, 1), (3, 3), False, (4, 10, 16, 16), (4, 10, 6, 6)),
((1, 1), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
((3, 3), (1, 1), True, (4, 10, 16, 16), (4, 10, 14, 14)),
((3, 3), (3, 3), True, (4, 10, 16, 16), (4, 10, 5, 5)),
((3, 3), (5, 7), True, (4, 10, 16, 16), (4, 10, 3, 2)),
((3, 3), (1, 1), False, (4, 10, 16, 16), (4, 10, 14, 14)),
((3, 3), (3, 3), False, (4, 10, 16, 16), (4, 10, 6, 6)),
((3, 3), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
((5, 3), (1, 1), True, (4, 10, 16, 16), (4, 10, 12, 14)),
((5, 3), (3, 3), True, (4, 10, 16, 16), (4, 10, 4, 5)),
((5, 3), (5, 7), True, (4, 10, 16, 16), (4, 10, 3, 2)),
((5, 3), (1, 1), False, (4, 10, 16, 16), (4, 10, 12, 14)),
((5, 3), (3, 3), False, (4, 10, 16, 16), (4, 10, 5, 6)),
((5, 3), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
((16, 16), (1, 1), True, (4, 10, 16, 16), (4, 10, 1, 1)),
((16, 16), (3, 3), True, (4, 10, 16, 16), (4, 10, 1, 1)),
((16, 16), (5, 7), True, (4, 10, 16, 16), (4, 10, 1, 1)),
((16, 16), (1, 1), False, (4, 10, 16, 16), (4, 10, 1, 1)),
((16, 16), (3, 3), False, (4, 10, 16, 16), (4, 10, 1, 1)),
((16, 16), (5, 7), False, (4, 10, 16, 16), (4, 10, 1, 1)),
((3,), (5,), True, (16,), (3,)),
((3,), (5,), True, (2, 16,), (2, 3,)),
((5,), (3,), True, (2, 3, 16,), (2, 3, 4,)),
((5, 1, 3), (3, 3, 3), True, (2, 16, 16, 16), (2, 4, 6, 5)),
((5, 1, 3), (3, 3, 3), True, (4, 2, 16, 16, 16), (4, 2, 4, 6, 5)),
)
for example, mode in product(examples, ['max',
'sum',
'average_inc_pad',
'average_exc_pad']):
(maxpoolshp, stride, ignore_border, inputshp, outputshp) = example
# generate random images
imval = rng.rand(*inputshp)
images = theano.shared(imval)
# Pool op
numpy_output_val = \
self.numpy_max_pool_nd_stride(imval, maxpoolshp,
ignore_border, stride,
mode)
assert numpy_output_val.shape == outputshp, (
"outshape is %s, calculated shape is %s"
% (outputshp, numpy_output_val.shape))
maxpool_op = \
Pool(ndim=len(maxpoolshp),
ignore_border=ignore_border,
mode=mode)(images, maxpoolshp, stride)
f = function([], maxpool_op)
output_val = f()
utt.assert_allclose(output_val, numpy_output_val)
def test_DownsampleFactorMaxStrideExtra(self):
rng = numpy.random.RandomState(utt.fetch_seed())
......@@ -287,7 +455,8 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
"outshape is %s, calculated shape is %s"
% (outputshp, numpy_output_val.shape))
maxpool_op = \
Pool(ignore_border=ignore_border, mode=mode)(
Pool(ignore_border=ignore_border,
ndim=len(maxpoolshp), mode=mode)(
images, maxpoolshp, stride)
f = function([images], maxpool_op)
output_val = f(imval)
......@@ -296,319 +465,351 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
def test_DownsampleFactorMaxPaddingStride(self):
ignore_border = True # padding does not support ignore_border=False
rng = numpy.random.RandomState(utt.fetch_seed())
maxpoolsizes = [(3, 3), (4, 4), (3, 4), (4, 3), (2, 2)]
stridesizes = [(2, 2), (2, 2), (1, 1), (1, 2), (2, 2)]
paddingsizes = [(2, 2), (1, 2), (2, 1), (0, 0), (1, 1)]
imgsizes = [(5, 5), (5, 5), (5, 6), (6, 5), (5, 5)]
m = 4 # minibatch
c = 2 # channel size
images = tensor.dtensor4()
for indx, mode in product(numpy.arange(len(maxpoolsizes)),
['max', 'sum', 'average_inc_pad',
'average_exc_pad']):
imgsize = imgsizes[indx]
imval = rng.rand(m, c, imgsize[0], imgsize[1]) - 0.5
stridesize = stridesizes[indx]
maxpoolsize = maxpoolsizes[indx]
paddingsize = paddingsizes[indx]
numpy_output_val = self.numpy_max_pool_2d_stride_padding(
imval, maxpoolsize, ignore_border,
# maxpool, stride, padding, input sizes
examples = (
((3,), (2,), (2,), (5,)),
((3,), (2,), (2,), (4, 5)),
((3,), (2,), (2,), (4, 2, 5)),
((3,), (2,), (2,), (4, 2, 5, 5)),
((3, 3), (2, 2), (2, 2), (4, 2, 5, 5)),
((4, 4), (2, 2), (1, 2), (4, 2, 5, 5)),
((3, 4), (1, 1), (2, 1), (4, 2, 5, 6)),
((4, 3), (1, 2), (0, 0), (4, 2, 6, 5)),
((2, 2), (2, 2), (1, 1), (4, 2, 5, 5)),
((4, 3, 2), (1, 2, 2), (0, 2, 1), (4, 6, 6, 5)),
((4, 3, 2), (1, 2, 2), (0, 2, 1), (4, 2, 6, 5, 5)),
)
for example, mode in product(examples,
['max', 'sum', 'average_inc_pad',
'average_exc_pad']):
(maxpoolshp, stridesize, paddingsize, inputsize) = example
imval = rng.rand(*inputsize) - 0.5
images = theano.shared(imval)
numpy_output_val = self.numpy_max_pool_nd_stride_padding(
imval, maxpoolshp, ignore_border,
stridesize, paddingsize, mode)
maxpool_op = Pool(ignore_border=ignore_border, mode=mode)(
images, maxpoolsize, stridesize, paddingsize)
f = function([images], maxpool_op)
output_val = f(imval)
maxpool_op = Pool(
ndim=len(maxpoolshp),
ignore_border=ignore_border,
mode=mode
)(images, maxpoolshp, stridesize, paddingsize)
f = function([], maxpool_op)
output_val = f()
utt.assert_allclose(output_val, numpy_output_val)
def test_DownsampleFactorMaxPaddingStride_grad(self):
# NOTE(review): rendered diff -- the pre-change fixture tuples / loop and
# the post-change `examples`-driven loop are interleaved; indentation has
# been stripped.  Flagged rather than rewritten.
rng = numpy.random.RandomState(utt.fetch_seed())
# NOTE(review): old parallel fixture tuples (replaced by `examples` below)
imgsizes = ((10, 10), (10, 5), (5, 5))
maxpoolsizes = ((5, 3), (3, 5), (3, 3))
stridesizes = ((3, 2), (2, 3), (3, 3))
paddingsizes = ((2, 2), (2, 1), (2, 2))
# maxpool, stride, padding, input sizes
examples = (
((10,), (5,), (3,), (2,)),
((10,), (5,), (3,), (2, 2)),
((10,), (5,), (3,), (1, 1, 2)),
((10, 10), (5, 3), (3, 2), (1, 1, 2, 2)),
((10, 5), (3, 5), (2, 3), (1, 1, 2, 1)),
((5, 5), (3, 3), (3, 3), (1, 1, 2, 2)),
((5, 5, 5), (3, 3, 3), (3, 3, 3), (1, 1, 2, 2, 2)),
)
# average_inc_pad and average_exc_pad do not
# support grad with padding
for mode in ['max', 'sum']:
# NOTE(review): old index-based loop body (replaced by the `examples`
# unpacking below)
for i in range(len(imgsizes)):
imgsize = imgsizes[i]
imval = rng.rand(1, 1, imgsize[0], imgsize[1]) * 10.0
maxpoolsize = maxpoolsizes[i]
stridesize = stridesizes[i]
paddingsize = paddingsizes[i]
for example in examples:
(maxpoolshp, stridesize, paddingsize, inputsize) = example
imval = rng.rand(*inputsize) * 10.0
def mp(input):
# NOTE(review): old 2-line Pool call followed by the new ndim-aware call
return Pool(ignore_border=True, mode=mode)(
input, maxpoolsize, stridesize, paddingsize)
return Pool(
ndim=len(maxpoolshp),
ignore_border=True,
mode=mode,
)(input, maxpoolshp, stridesize, paddingsize)
utt.verify_grad(mp, [imval], rng=rng)
def test_DownsampleFactorMax_grad(self):
rng = numpy.random.RandomState(utt.fetch_seed())
maxpoolshps = ((1, 1), (3, 2), (2, 3))
imval = rng.rand(2, 3, 3, 4) * 10.0
# more variance means numeric gradient will be more accurate
for maxpoolshp, ignore_border, mode in product(maxpoolshps,
[True, False],
['max',
'sum',
'average_inc_pad',
'average_exc_pad']):
# maxpool, input sizes
examples = (
((2,), (3,)),
((2,), (2, 3)),
((2,), (2, 3, 3)),
((1, 1), (2, 3, 3, 4)),
((3, 2), (2, 3, 3, 4)),
((2, 3), (2, 3, 3, 4)),
((1, 1, 1), (2, 3, 3)),
((3, 2, 2), (2, 3, 3, 4)),
((2, 3, 2), (2, 3, 3, 4, 4)),
((2, 2, 3), (2, 3, 3, 4, 4)),
)
for example, ignore_border, mode in product(examples,
[True, False],
['max',
'sum',
'average_inc_pad',
'average_exc_pad']):
(maxpoolshp, inputsize) = example
imval = rng.rand(*inputsize) * 10.0
# more variance means numeric gradient will be more accurate
def mp(input):
return Pool(ignore_border=ignore_border, mode=mode)(
input, maxpoolshp)
return Pool(ndim=len(maxpoolshp),
ignore_border=ignore_border,
mode=mode)(input, maxpoolshp)
utt.verify_grad(mp, [imval], rng=rng)
def test_DownsampleFactorMax_grad_st(self):
# pool, stride, input sizes
pool_grad_st_examples = (
((1,), (1,), (16,)),
((1,), (3,), (1, 16)),
((1,), (5,), (1, 2, 16)),
((2,), (1,), (16,)),
((2,), (3,), (1, 16)),
((2,), (5,), (1, 2, 16)),
((1, 1), (1, 1), (1, 2, 16, 16)),
((1, 1), (3, 3), (1, 2, 16, 16)),
((1, 1), (5, 7), (1, 2, 16, 16)),
((3, 3), (1, 1), (1, 2, 16, 16)),
((3, 3), (3, 3), (1, 2, 16, 16)),
((3, 3), (5, 7), (1, 2, 16, 16)),
((5, 3), (1, 1), (1, 2, 16, 16)),
((5, 3), (3, 3), (1, 2, 16, 16)),
((5, 3), (5, 7), (1, 2, 16, 16)),
((5, 1, 2), (1, 1, 1), (16, 3, 16)),
((5, 1, 2), (3, 1, 2), (1, 16, 3, 16)),
((5, 1, 2), (5, 1, 4), (1, 2, 16, 3, 16)),
((5, 3), (3, 2), (1, 2, 16, 16)),
((5, 3), (7, 5), (1, 2, 16, 16)),
((5, 3), (10, 6), (1, 2, 16, 16)),
((5, 5), (1, 1), (1, 2, 8, 5)),
((3, 2), (2, 3), (1, 2, 8, 5)),
((7, 7), (10, 10), (1, 2, 8, 5)),
((9, 9), (1, 1), (1, 2, 8, 5)),
)
@parameterized.expand(product(pool_grad_st_examples,
[True, False],
['max',
'sum',
'average_inc_pad',
'average_exc_pad']),
testcase_func_name=utt.custom_name_func)
def test_DownsampleFactorMax_grad_st(self, example, ignore_border, mode):
# checks the gradient for the case that stride is used
rng = numpy.random.RandomState(utt.fetch_seed())
maxpoolshps = ((1, 1), (3, 3), (5, 3))
stridesizes = ((1, 1), (3, 3), (5, 7))
imval = rng.rand(1, 2, 16, 16)
for maxpoolshp, ignore_border, mode, stride in product(maxpoolshps,
[True, False],
['max',
'sum',
'average_inc_pad',
'average_exc_pad'],
stridesizes):
def mp(input):
return Pool(ignore_border=ignore_border, mode=mode)(
input, maxpoolshp, stride)
utt.verify_grad(mp, [imval], rng=rng)
def test_DownsampleFactorMax_grad_st_extra(self):
# checks the gradient for the case
# that stride is used for extra examples
rng = numpy.random.RandomState(utt.fetch_seed())
maxpoolshps = ((5, 3), (5, 3), (5, 3), (5, 5), (3, 2), (7, 7), (9, 9))
stridesizes = ((3, 2), (7, 5), (10, 6), (1, 1),
(2, 3), (10, 10), (1, 1))
imvsizs = ((16, 16), (16, 16), (16, 16), (8, 5),
(8, 5), (8, 5), (8, 5))
(maxpoolshp, stridesize, inputsize) = example
imval = rng.rand(*inputsize)
for mode in ['max', 'sum', 'average_inc_pad', 'average_exc_pad']:
for indx in numpy.arange(len(maxpoolshps)):
imvsize = imvsizs[indx]
imval = rng.rand(1, 2, imvsize[0], imvsize[1])
stride = stridesizes[indx]
maxpoolshp = maxpoolshps[indx]
for ignore_border in [True, False]:
def mp(input):
return Pool(ignore_border=ignore_border, mode=mode)(
input, maxpoolshp, stride)
utt.verify_grad(mp, [imval], rng=rng)
def mp(input):
return Pool(ndim=len(maxpoolshp),
ignore_border=ignore_border,
mode=mode)(input, maxpoolshp, stridesize)
utt.verify_grad(mp, [imval], rng=rng)
def test_DownsampleFactorMaxGrad_grad(self):
rng = numpy.random.RandomState(utt.fetch_seed())
maxpoolshps = ((1, 1), (3, 2), (2, 3))
imval = rng.rand(2, 3, 3, 4) * 10.0
# more variance means numeric gradient will be more accurate
for maxpoolshp in maxpoolshps:
# maxpool, input sizes
examples = (
((2,), (2,)),
((2,), (2, 3)),
((1, 1), (2, 3, 3, 4)),
((3, 2), (2, 3, 3, 4)),
((2, 3), (2, 3, 3, 4)),
((1, 1, 1), (2, 3, 3, 4)),
((3, 2, 2), (2, 3, 3, 4)),
((2, 3, 2), (2, 3, 3, 4)),
((2, 2, 3), (2, 3, 3, 4)),
((2, 2, 3), (2, 1, 3, 3, 4)),
)
for (maxpoolshp, inputsize) in examples:
imval = rng.rand(*inputsize) * 10.0
# more variance means numeric gradient will be more accurate
for ignore_border in [True, False]:
# print 'maxpoolshp =', maxpoolshp
# print 'ignore_border =', ignore_border
# The shape of the gradient will be the shape of the output
grad_shape = Pool.out_shape(
imval.shape, maxpoolshp, ignore_border=ignore_border)
imval.shape, maxpoolshp, ndim=len(maxpoolshp), ignore_border=ignore_border)
grad_val = rng.rand(*grad_shape) * 10.0
def mp(input, grad):
out = Pool(ignore_border=ignore_border)(input, maxpoolshp)
grad_op = MaxPoolGrad(ignore_border=ignore_border)
out = Pool(
ndim=len(maxpoolshp),
ignore_border=ignore_border)(input, maxpoolshp)
grad_op = MaxPoolGrad(
ndim=len(maxpoolshp),
ignore_border=ignore_border)
return grad_op(input, out, grad, maxpoolshp)
utt.verify_grad(mp, [imval, grad_val], rng=rng)
def test_AveragePoolGrad_grad(self):
rng = numpy.random.RandomState(utt.fetch_seed())
avgpoolshps = ((1, 1), (3, 2), (2, 3))
imval = rng.rand(2, 3, 3, 4) * 10.0
# more variance means numeric gradient will be more accurate
for avgpoolshp in avgpoolshps:
# avgpool, input sizes
examples = (
((2,), (2,)),
((2,), (2, 3)),
((1, 1), (2, 3, 3, 4)),
((3, 2), (2, 3, 3, 4)),
((2, 3), (2, 3, 3, 4)),
((1, 1, 1), (2, 3, 3, 4)),
((3, 2, 2), (2, 3, 3, 4)),
((2, 3, 2), (2, 3, 3, 4)),
((2, 2, 3), (2, 3, 3, 4)),
((2, 2, 3), (2, 1, 3, 3, 4)),
)
for (avgpoolshp, inputsize) in examples:
imval = rng.rand(*inputsize) * 10.0
# more variance means numeric gradient will be more accurate
for ignore_border in [True, False]:
for mode in ['sum', 'average_inc_pad', 'average_exc_pad']:
# print 'maxpoolshp =', maxpoolshp
# print 'ignore_border =', ignore_border
# The shape of the gradient will be the shape of the output
grad_shape = Pool.out_shape(
imval.shape, avgpoolshp, ignore_border=ignore_border)
imval.shape, avgpoolshp, ndim=len(avgpoolshp),
ignore_border=ignore_border)
grad_val = rng.rand(*grad_shape) * 10.0
def mp(input, grad):
grad_op = AveragePoolGrad(ignore_border=ignore_border,
mode=mode)
grad_op = AveragePoolGrad(
ndim=len(avgpoolshp),
ignore_border=ignore_border, mode=mode)
return grad_op(input, grad, avgpoolshp)
utt.verify_grad(mp, [imval, grad_val], rng=rng)
def test_DownsampleFactorMaxGrad_grad_st(self):
@parameterized.expand(product(pool_grad_st_examples,
[True, False]),
testcase_func_name=utt.custom_name_func)
def test_DownsampleFactorMaxGrad_grad_st(self, example, ignore_border):
# checks the gradient of the gradient for
# the case that stride is used
rng = numpy.random.RandomState(utt.fetch_seed())
maxpoolshps = ((1, 1), (3, 3), (5, 3))
stridesizes = ((1, 1), (3, 3), (5, 7))
imval = rng.rand(1, 2, 16, 16)
(maxpoolshp, stride, inputsize) = example
imval = rng.rand(*inputsize)
grad_shape = Pool.out_shape(
imval.shape, maxpoolshp, ndim=len(maxpoolshp),
ignore_border=ignore_border, st=stride)
for maxpoolshp in maxpoolshps:
for ignore_border in [True, False]:
for stride in stridesizes:
grad_shape = Pool.out_shape(
imval.shape, maxpoolshp,
ignore_border=ignore_border, st=stride)
grad_val = rng.rand(*grad_shape)
# skip the grad verification when the output is empty
if numpy.prod(grad_shape) != 0:
grad_val = rng.rand(*grad_shape)
def mp(input, grad):
out = Pool(ignore_border=ignore_border)(
input, maxpoolshp, stride)
grad_op = MaxPoolGrad(ignore_border=ignore_border)
return grad_op(input, out, grad, maxpoolshp, stride)
def mp(input, grad):
out = Pool(
ndim=len(maxpoolshp),
ignore_border=ignore_border)(input, maxpoolshp, stride)
grad_op = MaxPoolGrad(
ndim=len(maxpoolshp),
ignore_border=ignore_border)
return grad_op(input, out, grad, maxpoolshp, stride)
utt.verify_grad(mp, [imval, grad_val], rng=rng)
utt.verify_grad(mp, [imval, grad_val], rng=rng)
def test_AveragePoolGrad_grad_st(self):
@parameterized.expand(product(pool_grad_st_examples,
[True, False],
['sum',
'average_inc_pad',
'average_exc_pad']),
testcase_func_name=utt.custom_name_func)
def test_AveragePoolGrad_grad_st(self, example, ignore_border, mode):
# checks the gradient of the gradient for
# the case that stride is used
rng = numpy.random.RandomState(utt.fetch_seed())
avgpoolshps = ((1, 1), (3, 3), (5, 3))
stridesizes = ((1, 1), (3, 3), (5, 7))
imval = rng.rand(1, 2, 16, 16)
for avgpoolshp in avgpoolshps:
for ignore_border in [True, False]:
for mode in ['sum', 'average_inc_pad', 'average_exc_pad']:
for stride in stridesizes:
grad_shape = Pool.out_shape(
imval.shape, avgpoolshp,
ignore_border=ignore_border, st=stride)
grad_val = rng.rand(*grad_shape)
def mp(input, grad):
grad_op = AveragePoolGrad(
ignore_border=ignore_border, mode=mode)
return grad_op(input, grad, avgpoolshp, stride)
utt.verify_grad(mp, [imval, grad_val], rng=rng)
def test_DownsampleFactorMaxGrad_grad_st_extra(self):
# checks the gradient of the gradient for the case that
# stride is used for extra examples
rng = numpy.random.RandomState(utt.fetch_seed())
maxpoolshps = ((5, 3), (5, 3), (5, 3), (5, 5), (3, 2), (7, 7), (9, 9))
stridesizes = ((3, 2), (7, 5), (10, 6), (1, 1),
(2, 3), (10, 10), (1, 1))
imvsizs = ((16, 16), (16, 16), (16, 16), (8, 5),
(8, 5), (8, 5), (8, 5))
for indx in numpy.arange(len(maxpoolshps)):
imvsize = imvsizs[indx]
imval = rng.rand(1, 2, imvsize[0], imvsize[1])
stride = stridesizes[indx]
maxpoolshp = maxpoolshps[indx]
for ignore_border in [True, False]:
grad_shape = Pool.out_shape(
imval.shape, maxpoolshp,
ignore_border=ignore_border, st=stride)
grad_val = rng.rand(*grad_shape)
def mp(input, grad):
out = Pool(ignore_border=ignore_border)(input, maxpoolshp,
stride)
grad_op = MaxPoolGrad(ignore_border=ignore_border)
return grad_op(input, out, grad, maxpoolshp, stride)
# skip the grad verification when the output is empty
if numpy.prod(grad_shape) == 0:
continue
utt.verify_grad(mp, [imval, grad_val], rng=rng)
def test_AveragePoolGrad_grad_st_extra(self):
# checks the gradient of the gradient for the case that
# stride is used for extra examples
rng = numpy.random.RandomState(utt.fetch_seed())
avgpoolshps = ((5, 3), (5, 3), (5, 3), (5, 5), (3, 2), (7, 7), (9, 9))
stridesizes = ((3, 2), (7, 5), (10, 6), (1, 1),
(2, 3), (10, 10), (1, 1))
imvsizs = ((16, 16), (16, 16), (16, 16), (8, 5),
(8, 5), (8, 5), (8, 5))
(avgpoolshp, stride, inputsize) = example
imval = rng.rand(*inputsize)
grad_shape = Pool.out_shape(
imval.shape, avgpoolshp,
ndim=len(avgpoolshp),
ignore_border=ignore_border, st=stride)
for indx in numpy.arange(len(avgpoolshps)):
imvsize = imvsizs[indx]
imval = rng.rand(1, 2, imvsize[0], imvsize[1])
stride = stridesizes[indx]
avgpoolshp = avgpoolshps[indx]
for ignore_border in [True, False]:
for mode in ['sum', 'average_inc_pad', 'average_exc_pad']:
grad_shape = Pool.out_shape(
imval.shape, avgpoolshp,
ignore_border=ignore_border, st=stride)
grad_val = rng.rand(*grad_shape)
# skip the grad verification when the output is empty
if numpy.prod(grad_shape) != 0:
grad_val = rng.rand(*grad_shape)
def mp(input, grad):
grad_op = AveragePoolGrad(ignore_border=ignore_border,
mode=mode)
return grad_op(input, grad, avgpoolshp, stride)
def mp(input, grad):
grad_op = AveragePoolGrad(
ndim=len(avgpoolshp),
ignore_border=ignore_border,
mode=mode)
return grad_op(input, grad, avgpoolshp, stride)
# skip the grad verification when the output is empty
if numpy.prod(grad_shape) == 0:
continue
utt.verify_grad(mp, [imval, grad_val], rng=rng)
utt.verify_grad(mp, [imval, grad_val], rng=rng)
def test_DownsampleFactorMaxPaddingStride_grad_grad(self):
rng = numpy.random.RandomState(utt.fetch_seed())
imgsizes = ((10, 10), (10, 5), (5, 5))
maxpoolsizes = ((5, 3), (3, 5), (3, 3))
stridesizes = ((3, 2), (2, 3), (3, 3))
paddingsizes = ((2, 2), (2, 1), (2, 2))
for i in range(len(imgsizes)):
imgsize = imgsizes[i]
imval = rng.rand(1, 1, imgsize[0], imgsize[1]) * 10.0
maxpoolsize = maxpoolsizes[i]
stridesize = stridesizes[i]
paddingsize = paddingsizes[i]
# maxpool, stride, padding, input sizes
examples = (
((3,), (2,), (2,), (10,)),
((3,), (2,), (2,), (2, 10,)),
((3,), (2,), (2,), (2, 1, 10,)),
((5, 3), (3, 2), (2, 2), (1, 1, 10, 10)),
((5, 3), (3, 2), (2, 2), (1, 1, 10, 10)),
((3, 5), (2, 3), (2, 1), (1, 1, 10, 5)),
((3, 3), (3, 3), (2, 2), (1, 1, 5, 5)),
((5, 3, 3), (3, 2, 2), (2, 2, 2), (1, 1, 10, 5, 5)),
((3, 5, 3), (2, 3, 2), (2, 1, 2), (1, 1, 5, 10, 5)),
((3, 3, 5), (2, 2, 3), (2, 2, 1), (1, 1, 5, 5, 10)),
)
for (maxpoolshp, stridesize, paddingsize, inputsize) in examples:
imval = rng.rand(*inputsize) * 10.0
grad_shape = Pool.out_shape(imval.shape,
maxpoolsize, st=stridesize,
maxpoolshp,
ndim=len(maxpoolshp),
st=stridesize,
ignore_border=True,
padding=paddingsize)
grad_val = rng.rand(*grad_shape) * 10.0
def mp(input, grad):
out = Pool(ignore_border=True)(input, maxpoolsize, stridesize,
paddingsize)
grad_op = MaxPoolGrad(ignore_border=True)
return grad_op(input, out, grad, maxpoolsize, stridesize,
paddingsize)
out = Pool(
ndim=len(maxpoolshp),
ignore_border=True,
)(input, maxpoolshp, stridesize, paddingsize)
grad_op = MaxPoolGrad(ndim=len(maxpoolshp),
ignore_border=True)
return grad_op(input, out, grad, maxpoolshp, stridesize, paddingsize)
utt.verify_grad(mp, [imval, grad_val], rng=rng)
def test_AveragePoolPaddingStride_grad_grad(self):
rng = numpy.random.RandomState(utt.fetch_seed())
imgsizes = ((10, 10), (10, 5), (5, 5))
avgpoolsizes = ((5, 3), (3, 5), (3, 3))
stridesizes = ((3, 2), (2, 3), (3, 3))
paddingsizes = ((2, 2), (2, 1), (2, 2))
for i in range(len(imgsizes)):
imgsize = imgsizes[i]
imval = rng.rand(1, 1, imgsize[0], imgsize[1]) * 10.0
avgpoolsize = avgpoolsizes[i]
stridesize = stridesizes[i]
paddingsize = paddingsizes[i]
# avgpool, stride, padding, input sizes
examples = (
((3,), (2,), (2,), (10,)),
((3,), (2,), (2,), (2, 10,)),
((3,), (2,), (2,), (2, 1, 10,)),
((5, 3), (3, 2), (2, 2), (1, 1, 10, 10)),
((5, 3), (3, 2), (2, 2), (1, 1, 10, 10)),
((3, 5), (2, 3), (2, 1), (1, 1, 10, 5)),
((3, 3), (3, 3), (2, 2), (1, 1, 5, 5)),
((5, 3, 3), (3, 2, 2), (2, 2, 2), (1, 1, 10, 5, 5)),
((3, 5, 3), (2, 3, 2), (2, 1, 2), (1, 1, 5, 10, 5)),
((3, 3, 5), (2, 2, 3), (2, 2, 1), (1, 1, 5, 5, 10)),
)
for (avgpoolshp, stridesize, paddingsize, inputsize) in examples:
imval = rng.rand(*inputsize) * 10.0
# 'average_exc_pad' with non-zero padding is not implemented
for mode in ['sum', 'average_inc_pad']:
grad_shape = Pool.out_shape(imval.shape,
avgpoolsize, st=stridesize,
ignore_border=True, padding=paddingsize)
avgpoolshp,
ndim=len(avgpoolshp),
st=stridesize,
ignore_border=True,
padding=paddingsize)
grad_val = rng.rand(*grad_shape) * 10.0
def mp(input, grad):
grad_op = AveragePoolGrad(ignore_border=True, mode=mode)
return grad_op(input, grad, avgpoolsize, stridesize, paddingsize)
grad_op = AveragePoolGrad(ndim=len(avgpoolshp),
ignore_border=True,
mode=mode)
return grad_op(input, grad, avgpoolshp, stridesize, paddingsize)
utt.verify_grad(mp, [imval, grad_val], rng=rng)
def test_DownsampleFactorMax_hessian(self):
......@@ -630,26 +831,31 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
def test_DownsampleFactorMaxGradGrad_grad(self):
rng = numpy.random.RandomState(utt.fetch_seed())
imgsizes = ((10, 10), (10, 5), (5, 5))
maxpoolsizes = ((5, 3), (3, 5), (3, 3))
stridesizes = ((3, 2), (2, 3), (3, 3))
paddingsizes = ((2, 2), (2, 1), (2, 2))
for i in range(len(imgsizes)):
imgsize = imgsizes[i]
imval1 = rng.rand(1, 1, imgsize[0], imgsize[1]) * 10.0
imval2 = rng.rand(1, 1, imgsize[0], imgsize[1]) * 10.0
maxpoolsize = maxpoolsizes[i]
stridesize = stridesizes[i]
paddingsize = paddingsizes[i]
# maxpool, stride, padding, input sizes
examples = (
((3,), (2,), (2,), (10,)),
((3,), (2,), (2,), (2, 10,)),
((3,), (2,), (2,), (2, 1, 10,)),
((5, 3), (3, 2), (2, 2), (1, 1, 10, 10)),
((5, 3), (3, 2), (2, 2), (1, 1, 10, 10)),
((3, 5), (2, 3), (2, 1), (1, 1, 10, 5)),
((3, 3), (3, 3), (2, 2), (1, 1, 5, 5)),
((5, 3, 3), (3, 2, 2), (2, 2, 2), (1, 1, 10, 5, 5)),
((3, 5, 3), (2, 3, 2), (2, 1, 2), (1, 1, 5, 10, 5)),
((3, 3, 5), (2, 2, 3), (2, 2, 1), (1, 1, 5, 5, 10)),
)
for (maxpoolshp, stridesize, paddingsize, inputsize) in examples:
imval1 = rng.rand(*inputsize) * 10.0
imval2 = rng.rand(*inputsize) * 10.0
def mp(input1, input2):
op1 = Pool(ignore_border=True)
pooled_out = op1(input1, maxpoolsize, stride=stridesize,
pad=paddingsize)
op2 = DownsampleFactorMaxGradGrad(ignore_border=True)
out = op2(input1, pooled_out, input2, ws=maxpoolsize,
stride=stridesize, pad=paddingsize)
op1 = Pool(ndim=len(maxpoolshp), ignore_border=True)
pooled_out = op1(input1, maxpoolshp, stridesize, paddingsize)
op2 = DownsampleFactorMaxGradGrad(
ndim=len(maxpoolshp),
ignore_border=True)
out = op2(input1, pooled_out, input2, maxpoolshp, stridesize, paddingsize)
return out
utt.verify_grad(mp, [imval1, imval2], rng=rng)
......@@ -679,6 +885,32 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
mode=mode)
utt.verify_grad(mp, [imval], rng=rng)
def test_max_pool_3d_3D(self):
rng = numpy.random.RandomState(utt.fetch_seed())
maxpoolshps = ((1, 1, 1), (3, 2, 1))
imval = rng.rand(4, 5, 6)
images = tensor.dtensor3()
for maxpoolshp, ignore_border, mode in product(maxpoolshps,
[True, False],
['max', 'sum',
'average_inc_pad',
'average_exc_pad']):
# print 'maxpoolshp =', maxpoolshp
# print 'ignore_border =', ignore_border
numpy_output_val = self.numpy_max_pool_nd(imval, maxpoolshp,
ignore_border,
mode=mode)
output = pool_3d(images, maxpoolshp, ignore_border,
mode=mode)
output_val = function([images], output)(imval)
utt.assert_allclose(output_val, numpy_output_val)
def mp(input):
return pool_3d(input, maxpoolshp, ignore_border,
mode=mode)
utt.verify_grad(mp, [imval], rng=rng)
def test_max_pool_2d_2D_same_size(self):
rng = numpy.random.RandomState(utt.fetch_seed())
test_input_array = numpy.array([[[
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论