Commit 146ef971 authored by Frédéric Bastien, committed by GitHub

Merge pull request #4862 from gvtulder/f-abstractconv3d

Adding an AbstractConv3d interface
......@@ -31,13 +31,18 @@
that will be replaced by an actual convolution implementation during
the optimization phase.
As of October 2016 (version 0.9.0dev3), there is also a conv3d interface that provides
a similar operation for 3D convolution. :func:`nnet.conv3d <theano.tensor.nnet.conv3d>`
defines the abstract Theano graph convolution operation
:func:`nnet.abstract_conv.AbstractConv3d <theano.tensor.nnet.abstract_conv.AbstractConv3d>`.
Since the abstract Op does not have any implementation, it will prevent
computations in the un-optimized graph, and cause problems with DebugMode,
test values, and when compiling with optimizer=None.
By default, if :ref:`cuDNN <libdoc_cuda_dnn>`
is available, we will use it, otherwise we will fall back to using the
gemm version (slower then cuDNN in most cases and uses more memory).
gemm version (slower than cuDNN in most cases and uses more memory).
Both cuDNN and the gemm version can be disabled using the Theano flags
``optimizer_excluding=conv_dnn`` and ``optimizer_excluding=conv_gemm``,
......@@ -51,9 +56,9 @@
option. Disabling the gemm version is only useful if cuDNN is unavailable
and you run out of GPU memory.
There are two other implementations: An FFT-based convolution integrated
into Theano, and an implementation by Alex Krizhevsky available via
Pylearn2. See the documentation below on how to use them.
There are two other implementations of 2D convolution: An FFT-based
convolution integrated into Theano, and an implementation by Alex Krizhevsky
available via Pylearn2. See the documentation below on how to use them.
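As background on the FFT-based option: by the convolution theorem, a full convolution can be computed by zero-padding both signals, multiplying their spectra, and transforming back. A minimal 1D NumPy sketch of the idea follows; Theano's actual fftconv implementation works on batched, multi-channel GPU arrays, so this is only an illustration of the principle.

```python
import numpy as np

def fft_conv_full(x, k):
    """Full 1D convolution via the FFT (convolution theorem).

    Zero-pad both signals to the full output length, multiply their
    spectra, and transform back.
    """
    n = len(x) + len(k) - 1
    X = np.fft.rfft(x, n)
    K = np.fft.rfft(k, n)
    return np.fft.irfft(X * K, n)

x = np.array([1.0, 2.0, 3.0])
k = np.array([0.0, 1.0, 0.5])
# Matches the direct 'full' convolution:
# np.convolve(x, k) -> [0., 1., 2.5, 4., 1.5]
```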
Old conv2d interface is still accessible through :func:`nnet.conv.conv2d <theano.tensor.nnet.conv.conv2d>`.
......@@ -146,8 +151,8 @@ TODO: Give examples on how to use these things! They are pretty complicated.
``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment.
- :func:`dnn_conv <theano.sandbox.cuda.dnn.dnn_conv>` GPU-only
convolution using NVIDIA's cuDNN library. This requires that you have
cuDNN installed and available, which in turn requires CUDA 6.5 and a GPU
with compute capability 3.0 or more.
cuDNN 4.0 or newer installed and available, which in turn requires CUDA 7.0
and a GPU with compute capability 3.0 or more.
If cuDNN is available, by default, Theano will replace all nnet.conv2d
operations with dnn_conv. To explicitly disable it, set
......@@ -190,12 +195,31 @@ TODO: Give examples on how to use these things! They are pretty complicated.
please see the warning about a bug in CUDA 5.0 to 6.0
in :func:`GpuCorrMM <theano.sandbox.cuda.blas.GpuCorrMM>`.
- :func:`Corr3dMM <theano.tensor.nnet.corr3d.Corr3dMM>`
This is a CPU-only 3d correlation implementation based on
the 2d version (:func:`CorrMM <theano.tensor.nnet.corr.CorrMM>`).
It does not flip the kernel. As it provides a gradient, you can use it as a
replacement for nnet.conv3d. For convolutions done on CPU,
nnet.conv3d will be replaced by Corr3dMM. To explicitly disable it, set
``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment.
- :func:`dnn_conv3d <theano.sandbox.cuda.dnn.dnn_conv3d>` GPU-only
convolution using NVIDIA's cuDNN library. This requires that you have
cuDNN 4.0 or newer installed and available, which in turn requires CUDA 7.0
and a GPU with compute capability 3.0 or more.
If cuDNN is available, by default, Theano will replace all nnet.conv3d
operations with dnn_conv3d. To explicitly disable it, set
``THEANO_FLAGS=optimizer_excluding=conv_dnn`` in your environment.
As dnn_conv3d has a gradient defined, you can also use it manually.
- :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`
Another conv3d implementation that uses the conv2d with data reshaping.
It is faster in some cases than conv3d, and works on the GPU.
It flips the kernel.
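To make the flip convention concrete: 'conv' mode flips the kernel along all three spatial axes before sliding it, while 'cross' correlation does not. The following is a naive single-channel NumPy sketch of the 'valid' case, purely illustrative and not Theano code:

```python
import numpy as np

def corr3d_valid(img, kern, conv_mode='cross'):
    """Naive single-channel 3D 'valid' correlation/convolution.

    conv_mode='conv' flips the kernel along all three spatial axes;
    'cross' leaves it as-is.
    """
    if conv_mode == 'conv':
        kern = kern[::-1, ::-1, ::-1]
    d, h, w = (np.array(img.shape) - np.array(kern.shape)) + 1
    out = np.empty((d, h, w))
    for i in range(d):
        for j in range(h):
            for l in range(w):
                out[i, j, l] = np.sum(
                    img[i:i + kern.shape[0],
                        j:j + kern.shape[1],
                        l:l + kern.shape[2]] * kern)
    return out
```

With a kernel that is 1 at a single corner, 'cross' selects one corner of each window and 'conv' the opposite one, which makes the flip visible directly.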
.. autofunction:: theano.tensor.nnet.conv2d
.. autofunction:: theano.tensor.nnet.conv3d
.. autofunction:: theano.sandbox.cuda.fftconv.conv2d_fft
.. autofunction:: theano.tensor.nnet.Conv3D.conv3D
.. autofunction:: theano.sandbox.cuda.fftconv.conv3d_fft
......
......@@ -53,7 +53,7 @@ PLATFORMS = ["Windows", "Linux", "Solaris", "Mac OS-X", "Unix"]
MAJOR = 0
MINOR = 9
MICRO = 0
SUFFIX = "dev2" # Should be blank except for rc's, betas, etc.
SUFFIX = "dev3" # Should be blank except for rc's, betas, etc.
ISRELEASED = False
VERSION = '%d.%d.%d%s' % (MAJOR, MINOR, MICRO, SUFFIX)
......
......@@ -19,6 +19,9 @@ from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs,
get_conv_output_shape)
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
......@@ -962,6 +965,122 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
conv_mode='conv', direction_hint=None,
algo='none', precision=None):
"""
GPU convolution using cuDNN from NVIDIA.
The memory layout to use is 'bc012', that is 'batch', 'channel',
'first dim', 'second dim', 'third dim' in that order.
Parameters
----------
img
Images to do the convolution over.
kerns
Convolution filters.
border_mode
One of 'valid', 'full', 'half'; additionally, the padding size
could be directly specified by an integer or a triple of integers.
subsample
Perform subsampling of the output (default: (1, 1, 1)).
conv_mode
Perform convolution (kernels flipped) or cross-correlation.
One of 'conv', 'cross' (default: 'conv').
direction_hint
Used by graph optimizers to change algorithm choice.
By default, GpuDnnConv will be used to carry out the convolution.
If border_mode is 'valid', subsample is (1, 1, 1) and direction_hint is
'bprop weights', it will use GpuDnnConvGradW.
If border_mode is 'full', subsample is (1, 1, 1) and direction_hint is
*not* 'forward!', it will use GpuDnnConvGradI.
This parameter is used internally by graph optimizers and may be
removed at any time without a deprecation period. You have been warned.
algo : Convolution implementation to use. Only 'none' is implemented
for conv3d. Default is the value of :attr:`config.dnn.conv.algo_fwd`.
precision : {'as_input_f32', 'as_input', 'float16', 'float32', 'float64'}
Description of the dtype in which the computation of the convolution
should be done. Possible values are 'as_input', 'float16', 'float32'
and 'float64'. Default is the value of
:attr:`config.dnn.conv.precision`.
.. warning:: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
"""
# Establish dtype in which to perform the computation of the convolution
if precision is None:
precision = theano.config.dnn.conv.precision
if precision == 'as_input' or precision == 'as_input_f32':
nprec = theano.scalar.upcast(img.dtype, kerns.dtype)
if nprec == 'float16' and precision == 'as_input_f32':
precision = 'float32'
else:
precision = nprec
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
ctx_name = infer_context_name(img, kerns)
if (border_mode == 'valid' and subsample == (1, 1, 1) and
direction_hint == 'bprop weights'):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for.
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
if conv_mode == 'conv':
# We need to flip manually. These 'kerns' are not the kernels
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
shape4 = shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(
shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3, shape4)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode='cross', precision=precision)(out.shape)
conv = gpu_dnn_conv_gradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
elif (border_mode == 'full' and subsample == (1, 1, 1) and
direction_hint != 'forward!'):
# Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution.
# We just need to set up a suitable 'fake' valid convolution.
img = gpu_contiguous(img)  # cuDNN v2 rc3 needs contiguous data
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
shape4 = shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
shape2, shape3, shape4)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode=conv_mode, precision=precision)(kerns.shape)
return gpu_dnn_conv_gradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding.
# gpu_contiguous will return a copy
# if the img contains negative strides
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode, precision=precision)(kerns.shape)
desc_op = desc.owner.op
# We can use Shape_i and bypass the infer_shape here as this is on
# the input of node and it will always be present.
ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
out_shp = get_conv_output_shape(ishape, kshape,
desc_op.border_mode,
desc_op.subsample)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
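The standard case above sizes the output buffer with get_conv_output_shape. Per spatial dimension, the rule reduces to the familiar formula below; this is a sketch using an explicit integer padding, whereas the real helper also accepts the 'valid'/'full'/'half' border modes.

```python
def conv_out_len(in_len, k_len, stride=1, pad=0, dil=1):
    """Output length of one convolved dimension:
    out = (in + 2*pad - ((k - 1)*dil + 1)) // stride + 1

    (k - 1)*dil + 1 is the implicit size of the dilated kernel.
    """
    return (in_len + 2 * pad - ((k_len - 1) * dil + 1)) // stride + 1
```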
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
subsample=(1, 1), conv_mode='conv'):
ctx_name = infer_context_name(img, topgrad)
......@@ -976,6 +1095,20 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
return gpu_dnn_conv_gradW()(img, topgrad, out, desc)
def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
subsample=(1, 1, 1), conv_mode='conv'):
ctx_name = infer_context_name(img, topgrad)
img = as_gpuarray_variable(img, ctx_name)
topgrad = as_gpuarray_variable(topgrad, ctx_name)
img = gpu_contiguous(img)
topgrad = gpu_contiguous(topgrad)
kerns_shp = as_tensor_variable(kerns_shp)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*kerns_shp)
return gpu_dnn_conv_gradW()(img, topgrad, out, desc)
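The 'fake convolution' trick in dnn_conv3d's bprop-weights branch rests on a classical identity: the weight gradient of a valid correlation is itself a valid correlation of the input with the upstream gradient. A 1D NumPy illustration of the identity (illustrative only, not Theano code):

```python
import numpy as np

x = np.array([1., 2., 3., 4., 5.])
w = np.array([1., -1., 2.])
g = np.ones(len(x) - len(w) + 1)  # upstream gradient of a 'valid' output

# Forward: y = np.correlate(x, w, 'valid'); loss L = sum(y * g).
# The weight gradient dL/dw is itself a valid correlation of the
# input with the upstream gradient -- the identity exploited by
# dispatching to GpuDnnConvGradW on a 'fake' convolution.
grad_w = np.correlate(x, g, 'valid')
```

For g all ones, dL/dw[j] = sum_i x[i + j], i.e. sliding-window sums of the input.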
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
subsample=(1, 1), conv_mode='conv'):
ctx_name = infer_context_name(kerns, topgrad)
......@@ -990,6 +1123,20 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc)
def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid',
subsample=(1, 1, 1), conv_mode='conv'):
ctx_name = infer_context_name(kerns, topgrad)
kerns = as_gpuarray_variable(kerns, ctx_name)
topgrad = as_gpuarray_variable(topgrad, ctx_name)
kerns = gpu_contiguous(kerns)
topgrad = gpu_contiguous(topgrad)
img_shp = as_tensor_variable(img_shp)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape)
out = gpu_alloc_empty(ctx_name, kerns.dtype)(*img_shp)
return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc)
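Symmetrically, dnn_conv3d's full-mode branch can dispatch to GpuDnnConvGradI because the input gradient of a valid correlation is a full convolution of the upstream gradient with the kernel. In 1D NumPy (illustrative only, not Theano code):

```python
import numpy as np

x = np.array([1., 2., 3., 4., 5.])
w = np.array([1., -1., 2.])
g = np.array([1., 0., -1.])  # upstream gradient, length len(x)-len(w)+1

# For y = np.correlate(x, w, 'valid') and L = sum(y * g), the input
# gradient dL/dx is the full convolution of g with w -- which is why
# a full-mode forward pass can be computed as the backward pass
# (wrt. inputs) of a valid convolution.
grad_x = np.convolve(g, w, 'full')
```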
class GpuDnnPoolDesc(Op):
"""
......@@ -1775,31 +1922,85 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
return [rval]
@register_opt2([AbstractConv3d, AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(op, (AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs))):
return
if (op.filter_dilation != (1, 1, 1)):
return None
inp1 = inputs[0]
inp2 = inputs[1]
if not dnn_available(inp1.type.context_name):
raise_no_cudnn()
if op.filter_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
if isinstance(op, AbstractConv3d):
rval = dnn_conv3d(inp1, inp2,
border_mode=op.border_mode,
subsample=op.subsample,
direction_hint='forward!',
conv_mode=conv_mode)
elif isinstance(op, AbstractConv3d_gradWeights):
shape = (inp2.shape[1], inp1.shape[1],
inputs[2][0], inputs[2][1], inputs[2][2])
rval = dnn_gradweight3d(inp1, inp2, shape,
border_mode=op.border_mode,
subsample=op.subsample,
conv_mode=conv_mode)
elif isinstance(op, AbstractConv3d_gradInputs):
shape = (inp2.shape[0], inp1.shape[1],
inputs[2][0], inputs[2][1], inputs[2][2])
rval = dnn_gradinput3d(inp1, inp2, shape,
border_mode=op.border_mode,
subsample=op.subsample,
conv_mode=conv_mode)
return [rval]
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d])
@local_optimizer([AbstractConv2d, AbstractConv3d])
def local_abstractconv_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
if isinstance(node.op, AbstractConv2d):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d_gradWeights])
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
if isinstance(node.op, AbstractConv2d_gradWeights):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradWeights):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d_gradInputs])
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
def local_abstractconv_gi_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
if isinstance(node.op, AbstractConv2d_gradInputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradInputs):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@inplace_allocempty(GpuDnnConv, 2)
......
......@@ -24,7 +24,10 @@ from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs)
from theano.tests.breakpoint import PdbBreakpoint
......@@ -1297,18 +1300,24 @@ def local_inplace_sparseblockouter(node):
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@op_lifter([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_gpua_abstractconv2d(op, context_name, inputs, outputs):
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs])
def local_gpua_abstractconv(op, context_name, inputs, outputs):
if isinstance(outputs[0].type, GpuArrayType):
# Don't handle this node here, it's already on the GPU.
return
return local_gpua_lift_abstractconv2d_graph(op, context_name, inputs, outputs)
return local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs)
@register_opt2([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_gpua_lift_abstractconv2d_graph(op, context_name, inputs, outputs):
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs):
inps = list(inputs)
inps[0] = as_gpuarray_variable(inputs[0],
context_name=context_name)
......
......@@ -47,6 +47,39 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
filter_flip=flip, target_op=GpuDnnConvGradI)
class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod
def setup_class(cls):
test_abstract_conv.BaseTestConv3d.setup_class()
cls.shared = staticmethod(gpuarray_shared_constructor)
# provide_shape is not used by the cuDNN implementation
cls.provide_shape = [False]
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
if not dnn_available(test_ctx_name):
raise SkipTest(dnn_available.msg)
mode = mode_with_gpu
if fd != (1, 1, 1):
raise SkipTest("Doesn't have CUDNN implementation")
o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConv)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradW)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
def setUp(self):
self.input = gpu_ftensor4()
......
......@@ -779,6 +779,201 @@ def test_dnn_conv_grad():
utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def get_conv3d_test_cases():
# Every element of test_shapes follows the format
# [input_shape, filter_shape, subsample]
test_shapes = [[(128, 3, 5, 5, 5), (64, 3, 1, 2, 4), (1, 1, 1)],
[(8, 4, 20, 12, 15), (5, 4, 6, 12, 4), (2, 2, 2)],
[(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 3, 3)],
[(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 2, 1)],
[(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 2, 1)],
# Test with 1x1x1 filters
[(8, 1, 10, 10, 10), (10, 1, 1, 1, 1), (1, 1, 1)],
# Test with dimensions larger than 1024 (thread block dim)
[(1025, 1, 2, 3, 4), (5, 1, 1, 2, 3), (1, 1, 1)],
[(8, 1, 2, 3, 4), (1025, 1, 1, 2, 3), (1, 1, 1)],
[(8, 1025, 2, 3, 4), (5, 1025, 1, 1, 2), (1, 1, 1)],
[(8, 1, 1030, 3, 4), (5, 1, 1025, 1, 1), (1, 1, 1)],
[(8, 1, 2, 1030, 4), (5, 1, 2, 1025, 1), (1, 1, 1)],
[(8, 1, 2, 3, 1030), (5, 1, 1, 2, 1025), (1, 1, 1)],
# The equivalent of this caused a crash with conv2d
[(1, 1, 1, 44800, 1), (6, 1, 1, 1, 1), (1, 1, 1)]]
# With border mode 'full', test with kernel bigger than image in some/all
# dimensions
test_shapes_full = [[(6, 2, 2, 2, 2), (4, 2, 3, 1, 1), (1, 1, 1)],
[(6, 2, 2, 2, 2), (4, 2, 1, 3, 1), (1, 1, 1)],
[(6, 2, 2, 2, 2), (4, 2, 1, 1, 3), (1, 1, 1)],
[(6, 2, 2, 2, 2), (4, 2, 5, 5, 5), (1, 1, 1)]]
border_modes = ['valid', 'full', 'half', (1, 2, 3), (3, 2, 1), 1, 2]
conv_modes = ['conv', 'cross']
itt = chain(product(test_shapes, border_modes, conv_modes),
product(test_shapes_full, ['full'], conv_modes))
return itt
def test_conv3d_fwd():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
def run_conv3d_fwd(inputs_shape, filters_shape, subsample,
border_mode, conv_mode):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
# Scale down the input values to prevent very large absolute errors
# due to float rounding
inputs_val /= 10
filters_val /= 10
inputs = theano.shared(inputs_val)
filters = theano.shared(filters_val)
bias = theano.shared(numpy.zeros(filters_shape[0]).astype('float32'))
# Compile a theano function for the cuDNN implementation
conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)
f = theano.function([], conv, mode=mode_with_gpu)
# If conv_mode is 'conv' the reference implementation should use
# filters flipped along the width, height and time axes
if conv_mode == 'conv':
flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
else:
flipped_filters = filters
# If border mode is anything but 'valid', the reference implementation
# should operate on padded inputs
if border_mode == 'valid':
padded_inputs = inputs
else:
if border_mode == 'full':
pad_per_dim = [filters_shape[i] - 1 for i in range(2, 5)]
elif border_mode == 'half':
pad_per_dim = [filters_shape[i] // 2 for i in range(2, 5)]
else:
if isinstance(border_mode, int):
pad_per_dim = [border_mode] * 3
else:
pad_per_dim = border_mode
pad_before_after = ([(0, 0), (0, 0)] +
[(p, p) for p in pad_per_dim])
padded_inputs_val = numpy.pad(inputs_val, pad_before_after,
'constant')
padded_inputs = theano.shared(padded_inputs_val)
# Compile a theano function for the reference implementation
conv_ref = theano.tensor.nnet.conv3D(
V=padded_inputs.dimshuffle(0, 2, 3, 4, 1),
W=flipped_filters.dimshuffle(0, 2, 3, 4, 1),
b=bias, d=subsample)
f_ref = theano.function([], conv_ref.dimshuffle(0, 4, 1, 2, 3), mode="FAST_RUN")
# Compare the results of the two implementations
res_ref = f_ref()
res = f()
utt.assert_allclose(res_ref, res)
test_cases = get_conv3d_test_cases()
for (i_shape, f_shape, subsample), border_mode, conv_mode in test_cases:
yield (run_conv3d_fwd, i_shape, f_shape, subsample, border_mode,
conv_mode)
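The border-mode handling inside run_conv3d_fwd (and run_conv3d_bwd below) reduces to a per-axis padding rule, sketched here; filter_shape follows the (out_channels, in_channels, d0, d1, d2) layout used by these tests.

```python
def pad_per_dim3(border_mode, filter_shape):
    """Padding added on each of the three spatial axes.

    'valid': no padding; 'full': k - 1; 'half': k // 2;
    an int pads all axes equally; a 3-sequence pads per axis.
    """
    if border_mode == 'valid':
        return [0, 0, 0]
    if border_mode == 'full':
        return [filter_shape[i] - 1 for i in range(2, 5)]
    if border_mode == 'half':
        return [filter_shape[i] // 2 for i in range(2, 5)]
    if isinstance(border_mode, int):
        return [border_mode] * 3
    return list(border_mode)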
def test_conv3d_bwd():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
def run_conv3d_bwd(inputs_shape, filters_shape, subsample,
border_mode, conv_mode):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
inputs = theano.shared(inputs_val)
filters = theano.shared(filters_val)
bias = theano.shared(numpy.zeros(filters_shape[0]).astype('float32'))
# Compile a theano function for the cuDNN implementation
conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)
grad_i, grad_w = theano.tensor.grad(conv.sum(), [inputs, filters])
f = theano.function([], [grad_i, grad_w], mode=mode_with_gpu)
# If conv_mode is 'conv' the reference implementation should use
# filters flipped along the width, height and time axes
if conv_mode == 'conv':
flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
else:
flipped_filters = filters
# If border mode is anything but 'valid', the reference implementation
# should operate on padded inputs
if border_mode == 'valid':
padded_inputs = inputs
else:
if border_mode == 'full':
pad_per_dim = [filters_shape[i] - 1 for i in range(2, 5)]
elif border_mode == 'half':
pad_per_dim = [filters_shape[i] // 2 for i in range(2, 5)]
else:
if isinstance(border_mode, int):
pad_per_dim = [border_mode] * 3
else:
pad_per_dim = border_mode
pad_before_after = ([(0, 0), (0, 0)] +
[(p, p) for p in pad_per_dim])
padded_inputs_val = numpy.pad(inputs_val, pad_before_after,
'constant')
padded_inputs = theano.shared(padded_inputs_val)
# Compile a theano function for the reference implementation
conv_ref = theano.tensor.nnet.conv3D(
V=padded_inputs.dimshuffle(0, 2, 3, 4, 1),
W=flipped_filters.dimshuffle(0, 2, 3, 4, 1),
b=bias, d=subsample)
(grad_padded_i_ref,
grad_w_ref) = theano.tensor.grad(conv_ref.sum(),
[padded_inputs, filters])
# Recover grad_i_ref from grad_padded_i_ref
if border_mode == 'valid':
grad_i_ref = grad_padded_i_ref
else:
shp = grad_padded_i_ref.shape
grad_i_ref = grad_padded_i_ref[
:, :,
pad_per_dim[0]:shp[2] - pad_per_dim[0],
pad_per_dim[1]:shp[3] - pad_per_dim[1],
pad_per_dim[2]:shp[4] - pad_per_dim[2]]
f_ref = theano.function([], [grad_i_ref, grad_w_ref], mode="FAST_RUN")
# Compare the results of the two implementations
res_ref = f_ref()
res = f()
# For large inputs and some seeds, rtol must be
# raised for the test to pass.
utt.assert_allclose(res_ref[0], res[0], rtol=2e-5)
utt.assert_allclose(res_ref[1], res[1], rtol=2e-5)
test_cases = get_conv3d_test_cases()
for (i_shape, f_shape, subsample), border_mode, conv_mode in test_cases:
yield (run_conv3d_bwd, i_shape, f_shape, subsample, border_mode,
conv_mode)
def test_version():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
......
......@@ -865,8 +865,8 @@ class BaseGpuCorrMM(GpuOp):
__props__ = ('border_mode', 'subsample', 'filter_dilation')
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), pad=(0, 0)):
if pad != (0, 0):
filter_dilation=(1, 1), pad=None):
if pad is not None:
_logger.warning(
'do not use pad for BaseGpuCorrMM; please set padding in '
'border_mode parameter, see the docstring for more details')
......@@ -1216,7 +1216,7 @@ class GpuCorrMM(BaseGpuCorrMM):
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
pad=(0, 0)):
pad=None):
super(GpuCorrMM, self).__init__(border_mode, subsample,
filter_dilation, pad)
......@@ -1267,7 +1267,7 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
pad=(0, 0)):
pad=None):
super(GpuCorrMM_gradWeights, self).__init__(border_mode,
subsample,
filter_dilation,
......@@ -1338,7 +1338,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
pad=(0, 0)):
pad=None):
super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation, pad)
......@@ -1396,29 +1396,64 @@ class BaseGpuCorr3dMM(GpuOp):
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly.
Parameters
----------
border_mode : {'valid', 'full', 'half'}
Additionally, the padding size could be directly specified by an integer
or a tuple of three integers
subsample
Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
pad
*deprecated*; always use border_mode instead.
"""
__props__ = ('border_mode', 'subsample', 'pad')
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation')
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
pad=(0, 0, 0)):
if border_mode != "valid":
raise ValueError("border_mode must be 'valid'")
filter_dilation=(1, 1, 1),
pad=None):
if pad is not None:
_logger.warning(
'do not use pad for BaseGpuCorr3dMM; please set padding in '
'border_mode parameter, see the docstring for more details')
if border_mode != "valid":
raise ValueError("border_mode must be 'valid' if pad is given")
border_mode = pad
if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode, border_mode)
if isinstance(border_mode, tuple):
pad_h, pad_w, pad_d = map(int, border_mode)
border_mode = (pad_h, pad_w, pad_d)
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full', 'half')):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of three'
' integers'.format(border_mode))
self.border_mode = border_mode
if len(subsample) != 3:
raise ValueError("subsample must have three elements")
self.subsample = subsample
if (pad not in ("half", "full")) and (len(pad) != 3):
raise ValueError("pad must be 'half', 'full', or have three elements")
self.pad = pad
if len(filter_dilation) != 3:
raise ValueError("filter_dilation must have three elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
@property
def pad(self):
if self.border_mode != 'valid':
return self.border_mode
return (0, 0, 0)
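The border_mode validation in BaseGpuCorr3dMM.__init__ can be summarized as the following sketch, a plain-Python restatement for illustration rather than the actual Op code:

```python
def normalize_border_mode3(border_mode):
    """Normalize a 3D border_mode: an int becomes a symmetric
    3-tuple; tuples must be three non-negative ints; otherwise
    only 'valid', 'full' and 'half' are accepted.
    """
    if isinstance(border_mode, int):
        border_mode = (border_mode,) * 3
    if isinstance(border_mode, tuple):
        if len(border_mode) != 3 or min(border_mode) < 0:
            raise ValueError('invalid border_mode %r' % (border_mode,))
        return tuple(map(int, border_mode))
    if border_mode not in ('valid', 'full', 'half'):
        raise ValueError('invalid border_mode %r' % (border_mode,))
    return border_mode
```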
def __str__(self):
return '%s{%s, %s, pad=%r}' % (
return '%s{%s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample),
self.pad)
str(self.filter_dilation))
def flops(self, inp, outp):
""" Useful with the hack in profiling to print the MFlops"""
......@@ -1440,7 +1475,7 @@ class BaseGpuCorr3dMM(GpuOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 23)
return (0, 25)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -1503,15 +1538,17 @@ class BaseGpuCorr3dMM(GpuOp):
Ignored otherwise.
"""
if self.border_mode != "valid":
raise ValueError("mode must be 'valid'")
dH, dW, dD = self.subsample
if self.pad == "half":
dilH, dilW, dilD = self.filter_dilation
if self.border_mode == "half":
padH = padW = padD = -1
elif self.pad == "full":
elif self.border_mode == "full":
padH = padW = padD = -2
elif isinstance(self.border_mode, tuple):
padH, padW, padD = self.border_mode
else:
padH, padW, padD = self.pad
assert self.border_mode == "valid"
padH = padW = padD = 0
if direction == "forward":
direction = 0
out = top
......@@ -1556,6 +1593,9 @@ class BaseGpuCorr3dMM(GpuOp):
int dH = %(dH)s;
int dW = %(dW)s;
int dD = %(dD)s;
int dilH = %(dilH)s;
int dilW = %(dilW)s;
int dilD = %(dilD)s;
int padH = %(padH)s;
int padW = %(padW)s;
int padD = %(padD)s;
......@@ -1585,12 +1625,12 @@ class BaseGpuCorr3dMM(GpuOp):
else if (padH == -2)
{
// vertical full padding, we can infer the kernel height
kH = 2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kH = (2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1;
}
else
{
// explicit padding, we can infer the kernel height
kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1;
}
if ((dW != 1) || (padW == -1))
{
......@@ -1598,11 +1638,11 @@ class BaseGpuCorr3dMM(GpuOp):
}
else if (padW == -2)
{
kW = 2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
kW = (2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
else
{
kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
kW = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
if ((dD != 1) || (padD == -1))
{
......@@ -1610,22 +1650,27 @@ class BaseGpuCorr3dMM(GpuOp):
}
else if (padD == -2)
{
kD = 2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD;
kD = (2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
}
else
{
kD = CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD;
kD = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
}
}
// Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1;
int dil_kD = (kD - 1) * dilD + 1;
// Auto-padding if requested
if (padH == -1)
{ // vertical half padding
padH = kH / 2;
padH = dil_kH / 2;
}
else if (padH == -2)
{ // vertical full padding
padH = kH - 1;
padH = dil_kH - 1;
}
else if (padH < 0)
{
......@@ -1633,10 +1678,10 @@ class BaseGpuCorr3dMM(GpuOp):
%(fail)s
}
if (padW == -1) { // horizontal half padding
padW = kW / 2;
padW = dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = kW - 1;
padW = dil_kW - 1;
}
else if (padW < 0)
{
......@@ -1645,11 +1690,11 @@ class BaseGpuCorr3dMM(GpuOp):
}
if (padD == -1)
{ // horizontal half padding
padD = kD / 2;
padD = dil_kD / 2;
}
else if (padD == -2)
{ // horizontal full padding
padD = kD - 1;
padD = dil_kD - 1;
}
else if (padD < 0)
{
......@@ -1662,16 +1707,16 @@ class BaseGpuCorr3dMM(GpuOp):
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width, depth)
// height and width: top = (bottom + 2*pad - weight) / sample + 1
// height, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - CudaNdarray_HOST_DIMS(weights)[4]) / dD + 1;
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - ((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - ((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - ((CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1;
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width, depth)
// height, width and depth: weights = bottom + 2*pad - (top-1) * sample
// height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = CudaNdarray_HOST_DIMS(top)[1];
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1];
out_dim[2] = kH; // already inferred further above
......@@ -1680,12 +1725,12 @@ class BaseGpuCorr3dMM(GpuOp):
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width, depth)
// height, width and depth: bottom = (top-1) * sample + weights - 2*pad
// height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + CudaNdarray_HOST_DIMS(weights)[4] - 2*padD;
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + (CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
......@@ -1716,7 +1761,8 @@ class BaseGpuCorr3dMM(GpuOp):
}
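The three shape formulas in the switch above are mutually consistent: the forward, backprop-wrt-inputs, and backprop-wrt-weights cases each solve the same relation for a different unknown. A quick sketch (hypothetical function names) that round-trips them:

```python
def top_size(bottom, k, dil, pad, stride):
    # forward: top = (bottom + 2*pad - ((k - 1)*dil + 1)) / stride + 1
    return (bottom + 2*pad - ((k - 1)*dil + 1)) // stride + 1

def bottom_size(top, k, dil, pad, stride):
    # backprop wrt. inputs: bottom = (top - 1)*stride + (k - 1)*dil + 1 - 2*pad
    return (top - 1)*stride + (k - 1)*dil + 1 - 2*pad

def kernel_size(bottom, top, dil, pad, stride):
    # backprop wrt. weights: k = (bottom + 2*pad - (top - 1)*stride - 1) / dil + 1
    return (bottom + 2*pad - (top - 1)*stride - 1) // dil + 1
```

Starting from bottom=11, k=3, dil=2, pad=1, stride=2, the forward pass gives top=5, and the two backprop formulas recover 11 and 3 from it.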
// Call CUDA code
out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dD, padH, padW, padD);
out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dD,
dilH, dilW, dilD, padH, padW, padD);
if (out2==NULL){
%(fail)s
}
......@@ -1731,22 +1777,28 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
Parameters
----------
border_mode
Currently supports "valid" only; "full" can be simulated by setting
`pad="full"` (at the cost of performance), or by using
`GpuCorrMM_gradInputs`.
The width of a border of implicit zeros to pad the
input with. Must be a tuple with 3 elements giving the width of
the padding on each side, or a single integer to pad the same
on all sides, or a string shortcut setting the padding at runtime:
``'valid'`` for ``(0, 0, 0)`` (valid convolution, no padding), ``'full'``
for ``(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)``
(full convolution), ``'half'`` for ``(kernel_rows // 2,
kernel_columns // 2, kernel_depth // 2)`` (same convolution for
odd-sized kernels). Note that the three widths are each
applied twice, once per side (left and right, top and bottom, front
and back).
subsample
The subsample operation applied to each output image. Should be a tuple
with 3 elements. `(sv, sh, sl)` is equivalent to
`GpuCorr3dMM(...)(...)[:,:,::sv, ::sh, ::sl]`, but faster.
Set to `(1, 1, 1)` to disable subsampling.
filter_dilation
The filter dilation operation applied to each input image.
Should be a tuple with 3 elements.
Set to `(1, 1, 1)` to disable filter dilation.
pad
The width of a border of implicit zeros to pad the input image with.
Should be a tuple with 3 elements giving the numbers of rows and columns
to pad on each side, or "half" to set the padding
to `(kernel_rows // 2, kernel_columns // 2, kernel_depth // 2)`,
or "full" to set the padding
to `(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)` at runtime.
Set to `(0, 0, 0)` to disable padding.
Deprecated alias for `border_mode`.
Notes
-----
......@@ -1765,8 +1817,10 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
batchsize or number of filters) may also work around the CUBLAS bug.
"""
def __init__(self, border_mode="valid", subsample=(1, 1, 1), pad=(0, 0, 0)):
super(GpuCorr3dMM, self).__init__(border_mode, subsample, pad)
def __init__(self, border_mode="valid", subsample=(1, 1, 1),
filter_dilation=(1, 1, 1), pad=None):
super(GpuCorr3dMM, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, img, kern):
img = as_cuda_ndarray_variable(img)
......@@ -1792,14 +1846,12 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
top = gpu_contiguous(top)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
self.subsample,
self.pad)(weights,
top,
bottom.shape[-3:])
self.filter_dilation)(
weights, top, bottom.shape[-3:])
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.pad)(bottom,
top,
weights.shape[-3:])
self.filter_dilation)(
bottom, top, weights.shape[-3:])
return d_bottom, d_weights
......@@ -1815,8 +1867,10 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
pad=(0, 0, 0)):
super(GpuCorr3dMM_gradWeights, self).__init__(border_mode, subsample, pad)
filter_dilation=(1, 1, 1),
pad=None):
super(GpuCorr3dMM_gradWeights, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, img, topgrad, shape=None):
img = as_cuda_ndarray_variable(img)
......@@ -1828,10 +1882,14 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
raise TypeError('img must be 5D tensor')
if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) or self.pad == "half":
if self.subsample != (1, 1, 1) or self.border_mode == "half":
if shape is None:
raise ValueError('shape must be given if subsample != (1, 1, 1), or pad == "half"')
raise ValueError('shape must be given if subsample != (1, 1, 1)'
' or border_mode == "half"')
height_width_depth = [shape[0], shape[1], shape[2]]
assert shape[0].ndim == 0
assert shape[1].ndim == 0
assert shape[2].ndim == 0
else:
height_width_depth = []
......@@ -1850,9 +1908,13 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
bottom, top = inp[:2]
weights, = grads
weights = gpu_contiguous(weights)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)(weights, top, bottom.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)(
bottom, weights)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation)(weights,
top,
bottom.shape[-3:])
d_top = GpuCorr3dMM(
self.border_mode, self.subsample, self.filter_dilation)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
return (d_bottom, d_top) + d_height_width_depth
......@@ -1875,8 +1937,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
pad=(0, 0, 0)):
super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample, pad)
filter_dilation=(1, 1, 1),
pad=None):
super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, kern, topgrad, shape=None):
kern = as_cuda_ndarray_variable(kern)
......@@ -1888,6 +1952,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
if self.subsample != (1, 1, 1) and shape is None:
raise ValueError('shape must be given if subsample != (1, 1, 1)')
height_width_depth = [shape[0], shape[1], shape[2]] if self.subsample != (1, 1, 1) else []
if height_width_depth:
assert shape[0].ndim == 0
assert shape[1].ndim == 0
assert shape[2].ndim == 0
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False, False]
......@@ -1906,12 +1974,12 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
bottom = gpu_contiguous(bottom)
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.pad)(bottom,
top,
weights.shape[-3:])
self.filter_dilation)(bottom,
top,
weights.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode,
self.subsample,
self.pad)(bottom, weights)
self.filter_dilation)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
* 3 if len(inp) == 5 else ()
return (d_weights, d_top) + d_height_width_depth
......
......@@ -52,6 +52,54 @@ inline int GET_BLOCKS(const int N) {
// (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// CUDA kernel for the case of dilation
__global__ void dilated_im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_col) {
CUDA_KERNEL_LOOP(index, n) {
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
for (int i = 0; i < kernel_h; ++i)
{
int h_im = h_offset + i * dilation_h;
for (int j = 0; j < kernel_w; ++j)
{
int w_im = w_offset + j * dilation_w;
for (int k = 0; k < kernel_d; ++k)
{
int d_im = d_offset + k * dilation_d;
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * dilation_h * (width * depth) +
j * dilation_w * depth +
k * dilation_d] : 0;
data_col_ptr += height_col * width_col * depth_col;
}
}
}
}
}
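The gather performed by `dilated_im3d2col_kernel` can be mirrored by a single-channel NumPy sketch, useful for checking the indexing (loop structure follows the kernel; this is a reference implementation, not the CUDA code):

```python
import numpy as np

def dilated_im3d2col(im, kernel, dilation, pad, stride):
    # Gather dilated 3D patches of a single-channel volume into columns.
    kh, kw, kd = kernel
    dh, dw, dd = dilation
    ph, pw, pd = pad
    sh, sw, sd = stride
    H, W, D = im.shape
    # output grid size uses the dilated kernel extent (k - 1)*dil + 1
    Ho = (H + 2*ph - ((kh - 1)*dh + 1)) // sh + 1
    Wo = (W + 2*pw - ((kw - 1)*dw + 1)) // sw + 1
    Do = (D + 2*pd - ((kd - 1)*dd + 1)) // sd + 1
    col = np.zeros((kh*kw*kd, Ho*Wo*Do), dtype=im.dtype)
    for hc in range(Ho):
        for wc in range(Wo):
            for dc in range(Do):
                out_idx = hc*Wo*Do + wc*Do + dc
                row = 0
                for i in range(kh):
                    for j in range(kw):
                        for k in range(kd):
                            h = hc*sh - ph + i*dh
                            w = wc*sw - pw + j*dw
                            d = dc*sd - pd + k*dd
                            if 0 <= h < H and 0 <= w < W and 0 <= d < D:
                                col[row, out_idx] = im[h, w, d]
                            # out-of-range taps read as implicit zero padding
                            row += 1
    return col
```

With kernel (2, 2, 2) and dilation (2, 2, 2) on a 3x3x3 volume, the single patch samples the eight corners of the cube.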
__global__ void im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
......@@ -62,41 +110,35 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
{
CUDA_KERNEL_LOOP(index, n)
{
int d_out = index % depth_col;
int w_index = index / depth_col;
int w_out = w_index % width_col;
int h_index = w_index / width_col;
int h_out = h_index % height_col;
int channel_in = h_index / height_col;
//channel_in = 1;
int channel_out = channel_in * kernel_h * kernel_w * kernel_d;
int h_in = h_out * stride_h - pad_h;
int w_in = w_out * stride_w - pad_w;
int d_in = d_out * stride_d - pad_d;
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += channel_out * (height_col * width_col * depth_col) +
h_out * (width_col * depth_col) + w_out * depth_col + d_out;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += channel_in * (height * width * depth) +
h_in * (width * depth) + w_in * depth + d_in;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
for (int i = 0; i < kernel_h; ++i)
{
int h = h_in + i;
int h_im = h_offset + i;
for (int j = 0; j < kernel_w; ++j)
{
int w = w_in + j;
int w_im = w_offset + j;
for (int k = 0; k < kernel_d; ++k)
{
int d = d_in + k;
*data_col_ptr = (h >= 0 && w >= 0 && d >= 0 &&
h < height && w < width && d < depth) ?
data_im_ptr[i * (width * depth) + j *depth + k] : 0;
int d_im = d_offset + k;
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * (width * depth) + j * depth + k] : 0;
data_col_ptr += height_col * width_col * depth_col;
}
}
......@@ -107,31 +149,105 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
void im3d2col(const float* data_im, const int channels,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
float* data_col)
{
// We are going to launch channels * height_col * width_col * depth_col kernels, each
// kernel responsible for copying a single-channel grid.
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
int dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
int num_kernels = channels * height_col * width_col * depth_col;
im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
dilated_im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
}
else{
im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
}
}
// CUDA kernel for the case of dilation
__global__ void dilated_col2im3d_kernel(
const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_im)
{
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
int kernel_extent_d = (kernel_d - 1) * dilation_d + 1;
// compute the start and end of the output
const int d_col_start = (d_im < kernel_extent_d) ? 0 : (d_im - kernel_extent_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
int d_k = (d_im - d_col * stride_d);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0 && d_k % dilation_d == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
d_k /= dilation_d;
int data_col_index = c_im * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col +
h_k * kernel_w * kernel_d * height_col * width_col * depth_col +
w_k * kernel_d * height_col * width_col * depth_col +
d_k * height_col * width_col * depth_col +
h_col * width_col * depth_col +
w_col * depth_col +
d_col;
val += data_col[data_col_index];
}
}
}
}
data_im[index] = val;
}
}
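`dilated_col2im3d_kernel` is the adjoint of the gather above: each column entry is scatter-added back to the input voxel it was read from, which is why one thread per bottom voxel suffices and no atomics are needed. A single-channel NumPy sketch (reference only):

```python
import numpy as np

def dilated_col2im3d(col, im_shape, kernel, dilation, pad, stride):
    # Scatter-add columns back into a single-channel volume.
    kh, kw, kd = kernel
    dh, dw, dd = dilation
    ph, pw, pd = pad
    sh, sw, sd = stride
    H, W, D = im_shape
    Ho = (H + 2*ph - ((kh - 1)*dh + 1)) // sh + 1
    Wo = (W + 2*pw - ((kw - 1)*dw + 1)) // sw + 1
    Do = (D + 2*pd - ((kd - 1)*dd + 1)) // sd + 1
    im = np.zeros(im_shape, dtype=col.dtype)
    for hc in range(Ho):
        for wc in range(Wo):
            for dc in range(Do):
                out_idx = hc*Wo*Do + wc*Do + dc
                row = 0
                for i in range(kh):
                    for j in range(kw):
                        for k in range(kd):
                            h = hc*sh - ph + i*dh
                            w = wc*sw - pw + j*dw
                            d = dc*sd - pd + k*dd
                            if 0 <= h < H and 0 <= w < W and 0 <= d < D:
                                im[h, w, d] += col[row, out_idx]
                            row += 1
    return im
```

Scattering a column matrix of ones counts how many kernel taps touch each voxel: with a 2x2x2 kernel on a 3x3x3 volume (no padding, stride 1), the center voxel is visited by all eight patches, a corner by exactly one.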
__global__ void col2im3d_kernel(const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int patch_h, const int patch_w, const int patch_d,
const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
......@@ -140,59 +256,78 @@ __global__ void col2im3d_kernel(const int n, const float* data_col,
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
int d = index % depth + pad_d;
int w_index = index / depth;
int w = w_index % width + pad_w;
int h_index = w_index / width;
int h = h_index % height + pad_h;
int c = h_index / height;
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
// compute the start and end of the output
int d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1;
int d_col_end = min(d / stride_d + 1, depth_col);
int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, height_col);
const int d_col_start = (d_im < kernel_d) ? 0 : (d_im - kernel_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
int offset =
(c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col;
(c_im * kernel_h * kernel_w * kernel_d + h_im * kernel_w * kernel_d +
w_im * kernel_d + d_im) * height_col * width_col * depth_col;
int coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
int coeff_h_col = (1 - stride_h * kernel_w * kernel_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w * kernel_d * height_col * width_col) * depth_col;
int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
for (int d_col = d_col_start; d_col < d_col_end; ++d_col)
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
}
}
}
data_im[index] = val;
data_im[index] = val;
}
}
void col2im3d(const float* data_col, const int channels,
const int height, const int width, const int depth,
const int patch_h, const int patch_w, const int patch_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
float* data_im)
{
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
int dil_patch_h = (patch_h - 1) * dilation_h + 1;
int dil_patch_w = (patch_w - 1) * dilation_w + 1;
int dil_patch_d = (patch_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
int num_kernels = channels * height * width * depth;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
dilated_col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
}
else{
col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
}
}
......@@ -210,6 +345,9 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
const int dH = 1,
const int dW = 1,
const int dD = 1,
const int dilH = 1,
const int dilW = 1,
const int dilD = 1,
const int padH = 0,
const int padW = 0,
const int padD = 0)
......@@ -286,10 +424,14 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
"GpuCorr3dMM images and kernel must have the same stack size\n");
return 0;
}
// implicit dilated filter
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
const int dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const int topHeight = int((bottomHeight + 2*padH - kH) / dH) + 1;
const int topWidth = int((bottomWidth + 2*padW - kW) / dW) + 1;
const int topDepth = int((bottomDepth + 2*padD - kD) / dD) + 1;
const int topHeight = int((bottomHeight + 2*padH - dil_kH) / dH) + 1;
const int topWidth = int((bottomWidth + 2*padW - dil_kW) / dW) + 1;
const int topDepth = int((bottomDepth + 2*padD - dil_kD) / dD) + 1;
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
......@@ -345,6 +487,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD,
col->devdata);
......@@ -392,6 +535,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
im3d2col(bottom->devdata + n * bottom_stride, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD,
col->devdata);
......@@ -461,6 +605,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
col2im3d(col->devdata, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD, bottom->devdata + n * bottom_stride);
cudaError_t err = cudaGetLastError();
......
......@@ -37,7 +37,10 @@ from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs)
def c_define_tensor_desc(desc):
......@@ -1232,7 +1235,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
:warning: dnn_conv3d only works with cuDNN library 3.0
"""
if border_mode == (0, 0):
if border_mode == (0, 0, 0):
border_mode = 'valid'
# Establish dtype in which to perform the computation of the convolution
......@@ -1319,6 +1322,32 @@ def dnn_gradweight(img, topgrad,
return GpuDnnConvGradW()(img, topgrad, out, desc)
def dnn_gradweight3d(img, topgrad,
kerns_shp,
border_mode='valid', subsample=(1, 1, 1),
conv_mode='conv'):
"""
GPU convolution gradient with respect to weight using cuDNN from NVIDIA.
The memory layout to use is 'bct01', that is 'batch', 'channel', 'time',
'first dim', 'second dim' in that order.
FIXME parameters doc
:warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
"""
img = gpu_contiguous(img)
topgrad = gpu_contiguous(topgrad)
kerns_shp = theano.tensor.as_tensor_variable(kerns_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns_shp)
out = gpu_alloc_empty(*kerns_shp)
return GpuDnnConv3dGradW()(img, topgrad, out, desc)
def dnn_gradinput(kerns, topgrad,
img_shp,
border_mode='valid', subsample=(1, 1),
......@@ -1346,6 +1375,33 @@ def dnn_gradinput(kerns, topgrad,
return GpuDnnConvGradI()(kerns, topgrad, out, desc)
def dnn_gradinput3d(kerns, topgrad,
img_shp,
border_mode='valid', subsample=(1, 1, 1),
conv_mode='conv'):
"""
GPU convolution gradient with respect to input using cuDNN from NVIDIA.
The memory layout to use is 'bct01', that is 'batch', 'channel', 'time',
'first dim', 'second dim' in that order.
FIXME parameters doc
:warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
"""
kerns = gpu_contiguous(kerns)
topgrad = gpu_contiguous(topgrad)
img_shp = theano.tensor.as_tensor_variable(img_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img_shp, kerns.shape)
out = gpu_alloc_empty(*img_shp)
return GpuDnnConv3dGradI()(kerns, topgrad, out, desc)
class GpuDnnPoolDesc(GpuOp):
"""
This Op builds a pooling descriptor for use in the other pooling operations.
......@@ -3176,3 +3232,53 @@ def local_abstractconv_cudnn(node):
subsample=node.op.subsample,
conv_mode=conv_mode)
return [rval]
@local_optimizer([AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs])
def local_abstractconv3d_cudnn(node):
if (not isinstance(node.op, (AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs))):
return None
if (node.op.filter_dilation != (1, 1, 1)):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(inp1.type, CudaNdarrayType) or
not isinstance(inp2.type, CudaNdarrayType)):
return None
if not dnn_available():
return None
if node.op.filter_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
if (isinstance(node.op, AbstractConv3d)):
rval = dnn_conv3d(inp1, inp2,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
direction_hint='forward',
conv_mode=conv_mode)
return [rval]
if (isinstance(node.op, AbstractConv3d_gradWeights)):
shape = (inp2.shape[1], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1], node.inputs[2][2])
rval = dnn_gradweight3d(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
conv_mode=conv_mode)
return [rval]
if (isinstance(node.op, AbstractConv3d_gradInputs)):
shape = (inp2.shape[0], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1], node.inputs[2][2])
rval = dnn_gradinput3d(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
conv_mode=conv_mode)
return [rval]
......@@ -87,10 +87,13 @@ from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D
from theano.tests.breakpoint import PdbBreakpoint
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv2d,
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs)
from theano.tensor.opt import register_specialize_device
......@@ -2726,8 +2729,11 @@ optdb.register('local_inplace_gpu_sparse_block_outer',
@local_optimizer([gpu_from_host,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_conv2d_gpu_conv(node):
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs])
def local_conv_gpu_conv(node):
"""
gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
......@@ -2736,7 +2742,7 @@ def local_conv2d_gpu_conv(node):
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op,
BaseAbstractConv2d):
BaseAbstractConv):
conv = host_input.owner.op
inps = list(host_input.owner.inputs)
......@@ -2749,7 +2755,7 @@ def local_conv2d_gpu_conv(node):
out.tag.values_eq_approx = values_eq_approx_high_tol
return [out]
if isinstance(node.op, BaseAbstractConv2d):
if isinstance(node.op, BaseAbstractConv):
# conv(host_from_gpu) -> host_from_gpu(gpu_conv)
inp1 = node.inputs[0]
inp2 = node.inputs[1]
......@@ -2779,7 +2785,7 @@ def local_conv2d_gpu_conv(node):
return [tensor.as_tensor_variable(out)]
else:
return [out]
register_opt()(local_conv2d_gpu_conv)
register_opt()(local_conv_gpu_conv)
# Corrmm opt
......@@ -2849,6 +2855,76 @@ def local_abstractconv_gemm(node):
return [rval]
# Corrmm opt
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm(node):
if not isinstance(node.op, AbstractConv3d):
return None
img, kern = node.inputs
if (not isinstance(img.type, CudaNdarrayType) or
not isinstance(kern.type, CudaNdarrayType)):
return None
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
if ((border_mode == 'full') and (subsample == (1, 1, 1))):
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3, 4)
# call GpuCorr3dMM_gradInputs
rval = GpuCorr3dMM_gradInputs('valid',
subsample,
filter_dilation)(
gpu_contiguous(kern), gpu_contiguous(img))
else:
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
# By default use GpuCorr3dMM
rval = GpuCorr3dMM(border_mode,
subsample,
filter_dilation)(gpu_contiguous(img),
gpu_contiguous(kern))
# call GpuCorr3dMM_gradWeights if good
# (the latter is faster if
# batchsize * kernelHeight * kernelWidth * kernelDepth
# is larger than
# inputChannels * outputHeight * outputWidth * outputDepth.
# GpuConv does not always store information on the batchsize and
# channels, though, so we only use what information we have.)
if ((subsample == (1, 1, 1)) and (filter_dilation == (1, 1, 1)) and
(node.op.imshp is not None) and
(None not in node.op.imshp[-3:]) and
(node.op.kshp is not None) and
(None not in node.op.kshp) and
border_mode != "half"):
# we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1] * node.op.kshp[2]
prod2 = ((node.op.imshp[-3] - node.op.kshp[0] + 1) *
(node.op.imshp[-2] - node.op.kshp[1] + 1) *
(node.op.imshp[-1] - node.op.kshp[2] + 1))
if (None not in node.op.imshp[:1]):
# we also know batchsize and input channels
prod1 *= node.op.imshp[0]
prod2 *= node.op.imshp[1]
# compare to decide
if prod1 > prod2:
# (we need to wrap the result in as_cuda_ndarray_variable,
# because we are not allowed to replace a CudaNdarray with
# a DimShuffle instance in a graph optimization)
rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
GpuCorr3dMM_gradWeights(border_mode,
subsample,
filter_dilation)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4))
).dimshuffle(1, 0, 2, 3, 4))
return [rval]
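The `prod1`/`prod2` comparison above decides whether the forward convolution should be rewritten as `GpuCorr3dMM_gradWeights`. The heuristic can be sketched in isolation as plain Python (the helper name `prefer_gradweights_path` is hypothetical, not part of Theano, and takes the spatial kernel shape directly):

```python
# Standalone sketch of the cost heuristic used in local_abstractconv3d_gemm:
# the gradWeights path is preferred when batchsize * kernel volume exceeds
# input channels * output volume.  `prefer_gradweights_path` is a
# hypothetical helper, not Theano API.
def prefer_gradweights_path(imshp, kshp):
    # imshp = (batch, channels, depth, rows, cols); kshp = (kd, kh, kw)
    prod1 = kshp[0] * kshp[1] * kshp[2]           # kernel volume
    prod2 = ((imshp[-3] - kshp[0] + 1) *
             (imshp[-2] - kshp[1] + 1) *
             (imshp[-1] - kshp[2] + 1))           # 'valid' output volume
    if None not in imshp[:2]:
        prod1 *= imshp[0]                         # batch size
        prod2 *= imshp[1]                         # input channels
    return prod1 > prod2

# A large batch with big kernels favours the gradWeights path:
assert prefer_gradweights_path((64, 3, 16, 16, 16), (8, 8, 8)) is True
# A single image with many channels and small kernels does not:
assert prefer_gradweights_path((1, 64, 32, 32, 32), (3, 3, 3)) is False
```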
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweight_gemm(node):
if not isinstance(node.op, AbstractConv2d_gradWeights):
......@@ -2869,6 +2945,26 @@ def local_abstractconv_gradweight_gemm(node):
return [rval]
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweight_gemm(node):
if not isinstance(node.op, AbstractConv3d_gradWeights):
return None
img, topgrad, shape = node.inputs
if not isinstance(img.type, CudaNdarrayType) or \
not isinstance(topgrad.type, CudaNdarrayType):
return None
rval = GpuCorr3dMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape)
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1, ::-1]
rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
rval = as_cuda_ndarray_variable(rval)
return [rval]
@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node):
if not isinstance(node.op, AbstractConv2d_gradInputs):
......@@ -2887,6 +2983,26 @@ def local_abstractconv_gradinputs_gemm(node):
gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval]
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node):
if not isinstance(node.op, AbstractConv3d_gradInputs):
return None
kern, topgrad, shape = node.inputs
if not isinstance(kern.type, CudaNdarrayType) or \
not isinstance(topgrad.type, CudaNdarrayType):
return None
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
rval = GpuCorr3dMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval]
# Register GPU convolution implementation
# They are tried in a specific order so we can control
# which ones take precedence over others.
......@@ -2899,18 +3015,36 @@ conv_groupopt.register('local_abstractconv_dnn',
dnn.local_abstractconv_cudnn, 20,
'conv_dnn',
'gpu', 'fast_compile', 'fast_run', 'cudnn')
conv_groupopt.register('local_abstractconv3d_dnn',
dnn.local_abstractconv3d_cudnn, 20,
'conv_dnn',
'gpu', 'fast_compile', 'fast_run', 'cudnn')
# The GEMM-based convolution comes last to catch all remaining cases.
# It can be disabled by excluding 'conv_gemm'.
conv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gemm', local_abstractconv3d_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradweight_gemm',
local_abstractconv_gradweight_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gradweight_gemm',
local_abstractconv3d_gradweight_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradinputs_gemm',
local_abstractconv_gradinputs_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
......@@ -7,9 +7,11 @@ from theano.sandbox.cuda import float32_shared_constructor as gpu_shared
from theano.sandbox.cuda.dnn import (
dnn_available,
GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI)
GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI,
GpuDnnConv3d, GpuDnnConv3dGradW, GpuDnnConv3dGradI)
from theano.sandbox.cuda.blas import (
GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs)
GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs,
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda
......@@ -56,6 +58,40 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
filter_dilation=fd)
class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod
def setup_class(cls):
test_abstract_conv.BaseTestConv3d.setup_class()
# provide_shape is not used by the cuDNN implementation
cls.provide_shape = [False]
cls.shared = staticmethod(gpu_shared)
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
if fd != (1, 1, 1):
raise SkipTest("No dilation implementation for cuDNN ConvOp.")
if not dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
mode = mode_with_gpu
o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConv3d,
filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConv3dGradW,
filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConv3dGradI,
filter_dilation=fd)
class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
@classmethod
def setup_class(cls):
......@@ -89,6 +125,39 @@ class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
filter_dilation=fd)
class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod
def setup_class(cls):
test_abstract_conv.BaseTestConv3d.setup_class()
cls.shared = staticmethod(gpu_shared)
cls.mode = mode_with_gpu.excluding('cudnn')
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
mode = self.mode
o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=(GpuCorr3dMM,
GpuCorr3dMM_gradWeights,
GpuCorr3dMM_gradInputs),
filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorr3dMM_gradWeights,
filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorr3dMM_gradInputs,
filter_dilation=fd)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
def setUp(self):
self.input = cuda.ftensor4()
......
from __future__ import absolute_import, print_function, division
import unittest
import numpy
from six.moves import xrange
try:
from scipy import ndimage
except ImportError:
ndimage = None
import theano
from theano.tests import unittest_tools as utt
......@@ -21,31 +26,127 @@ else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
# python reference implementation of a 3D convolution
# see also: theano.tensor.nnet.tests.test_conv3d2d
# expects: (batch, 0, channels, 1, 2)
def pyconv3d(signals, filters, border_mode='valid', dilation=(1, 1, 1)):
Ns, Ts, C, Hs, Ws = signals.shape
Nf, Tf, C, Hf, Wf = filters.shape
Tdil, Hdil, Wdil = dilation
Tfdil = (Tf - 1) * Tdil + 1
Hfdil = (Hf - 1) * Hdil + 1
Wfdil = (Wf - 1) * Wdil + 1
# if border_mode is not 'valid', the signals need zero-padding
if border_mode == 'full':
Tpad = Tfdil - 1
Hpad = Hfdil - 1
Wpad = Wfdil - 1
elif border_mode == 'half':
Tpad = Tfdil // 2
Hpad = Hfdil // 2
Wpad = Wfdil // 2
elif isinstance(border_mode, tuple):
Tpad, Hpad, Wpad = map(int, border_mode)
else:
Tpad = 0
Hpad = 0
Wpad = 0
if Tpad > 0 or Hpad > 0 or Wpad > 0:
# zero-pad signals
signals_padded = numpy.zeros((Ns, Ts + 2 * Tpad, C,
Hs + 2 * Hpad, Ws + 2 * Wpad), 'float32')
signals_padded[:, Tpad:(Ts + Tpad), :, Hpad:(Hs + Hpad),
Wpad:(Ws + Wpad)] = signals
Ns, Ts, C, Hs, Ws = signals_padded.shape
signals = signals_padded
Tfdil2 = Tfdil // 2
Hfdil2 = Hfdil // 2
Wfdil2 = Wfdil // 2
dilated_filters = numpy.zeros((Nf, Tfdil, C, Hfdil, Wfdil), dtype=filters.dtype)
dilated_filters[:, ::Tdil, :, ::Hdil, ::Wdil] = filters
# perform valid convolution on the padded signals
rval = numpy.zeros((Ns, Ts - Tfdil + 1, Nf, Hs - Hfdil + 1, Ws - Wfdil + 1))
for ns in xrange(Ns):
for nf in xrange(Nf):
for c in xrange(C):
s_i = signals[ns, :, c, :, :]
f_i = dilated_filters[nf, :, c, :, :]
r_i = rval[ns, :, nf, :, :]
# scipy.signal.convolve performs valid convolution,
# but is quite slow. scipy.ndimage.convolve is faster,
# but only supports 'same' convolution.
# origin must be -1 for even filters, 0 for odd filters
o_i = ndimage.convolve(s_i, f_i, mode='constant', cval=1,
origin=(f_i.shape[0] % 2 - 1,
f_i.shape[1] % 2 - 1,
f_i.shape[2] % 2 - 1))
# crop to get the result of 'valid' convolution
o_i = o_i[Tfdil2:(r_i.shape[0] + Tfdil2),
Hfdil2:(r_i.shape[1] + Hfdil2),
Wfdil2:(r_i.shape[2] + Wfdil2)]
# the result should be equal to 'valid' convolution
# utt.assert_allclose(o_i, signal.convolve(s_i, f_i, mode='valid'))
r_i += o_i
return rval
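The kernel-dilation step used by `pyconv3d` (zeros inserted between filter taps) can be shown on its own with NumPy; `dilate_kernel_3d` below is an illustrative helper, not part of the test file:

```python
# Minimal NumPy sketch of the filter-dilation step used in pyconv3d:
# a dilated kernel is the original kernel with (dilation - 1) zeros
# inserted between taps along each spatial axis.
import numpy

def dilate_kernel_3d(kern, dilation):
    # kern layout matches pyconv3d filters: (Nf, Tf, C, Hf, Wf)
    Nf, Tf, C, Hf, Wf = kern.shape
    Tdil, Hdil, Wdil = dilation
    dil_shape = (Nf, (Tf - 1) * Tdil + 1, C,
                 (Hf - 1) * Hdil + 1, (Wf - 1) * Wdil + 1)
    dilated = numpy.zeros(dil_shape, dtype=kern.dtype)
    dilated[:, ::Tdil, :, ::Hdil, ::Wdil] = kern
    return dilated

k = numpy.ones((1, 2, 1, 2, 2), dtype='float32')
dk = dilate_kernel_3d(k, (2, 1, 3))
# sizes grow to (Tf-1)*Tdil+1 etc.: (2-1)*2+1=3, (2-1)*1+1=2, (2-1)*3+1=4
assert dk.shape == (1, 3, 1, 2, 4)
# zero insertion preserves the sum of the taps
assert dk.sum() == k.sum()
```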
class TestCorr3DMM(unittest.TestCase):
def run_conv_valid(self, inputs_shape, filters_shape,
border_mode='valid',
filter_dilation=(1, 1, 1),
subsample=(1, 1, 1),
verify_grad=False):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
inputs = shared(inputs_val)
filters = shared(filters_val)
bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))
if filter_dilation == (1, 1, 1) and border_mode in ('valid', (0, 0, 0)):
conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters,
b=bias, d=subsample)
f_ref = theano.function([], conv_ref)
res_ref = f_ref()
elif subsample == (1, 1, 1):
if ndimage is None:
raise SkipTest('This test needs SciPy.')
# input = b012c
# pyconv3d wants = b0c12 = (0, 1, 4, 2, 3)
# pyconv3d outputs = b0c12 = (0, 1, 3, 4, 2)
res_ref = pyconv3d(signals=inputs_val.transpose(0, 1, 4, 2, 3),
filters=filters_val.transpose(0, 1, 4, 2, 3)[:, ::-1, :, ::-1, ::-1],
dilation=filter_dilation,
border_mode=border_mode).transpose(0, 1, 3, 4, 2)
else:
raise SkipTest('No reference implementation that combines '
'border_mode and subsampling.')
conv = GpuCorr3dMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample)(
inputs.dimshuffle(0, 4, 1, 2, 3),
filters.dimshuffle(0, 4, 1, 2, 3))
conv = conv.dimshuffle(0, 2, 3, 4, 1)
f = theano.function([], conv, mode=mode_with_gpu)
res = f()
utt.assert_allclose(res_ref, res)
if verify_grad:
utt.verify_grad(GpuCorr3dMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample),
[inputs_val.transpose(0, 4, 1, 2, 3),
filters_val.transpose(0, 4, 1, 2, 3)])
def test_valid(self):
self.run_conv_valid(inputs_shape=(16, 20, 12, 16, 1),
filters_shape=(10, 6, 12, 4, 1))
......@@ -68,6 +169,50 @@ class TestCorr3DMM(unittest.TestCase):
filters_shape=(10, 6, 12, 4, 1),
subsample=(1, 2, 3))
def test_border_mode(self):
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode='valid')
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode='half')
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode='full')
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode=(0, 0, 0))
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode=(1, 2, 3))
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode=(3, 2, 1))
def test_filter_dilation(self):
inputs_shape = [16, 20, 12, 15, 1]
filters_shape = [10, 6, 5, 4, 1]
for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
for border_mode in ['valid', 'half', 'full']:
self.run_conv_valid(inputs_shape=inputs_shape,
filters_shape=filters_shape,
filter_dilation=filter_dilation,
border_mode=border_mode)
def test_verify_gradients(self):
# use a small example to check the gradients
inputs_shape = [2, 7, 9, 6, 1]
filters_shape = [1, 3, 3, 2, 1]
for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
for border_mode in ['valid', 'half', 'full', (2, 1, 3)]:
self.run_conv_valid(inputs_shape=inputs_shape,
filters_shape=filters_shape,
filter_dilation=filter_dilation,
border_mode=border_mode,
verify_grad=True)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
subsample=(1, 1, 1)):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
......
......@@ -32,6 +32,7 @@ from .bn import batch_normalization
import warnings
from .abstract_conv import conv2d as abstract_conv2d
from .abstract_conv import conv3d
def conv2d(input, filters, input_shape=None, filter_shape=None,
......
......@@ -20,7 +20,7 @@ import numpy
import numpy as np
try:
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary, convolve
from scipy.signal.sigtools import _convolve2d
imported_scipy_signal = True
except ImportError:
......@@ -163,6 +163,105 @@ def conv2d(input,
return conv_op(input, filters)
def conv3d(input,
filters,
input_shape=None,
filter_shape=None,
border_mode='valid',
subsample=(1, 1, 1),
filter_flip=True,
filter_dilation=(1, 1, 1)):
"""
This function will build the symbolic graph for convolving a mini-batch of a
stack of 3D inputs with a set of 3D filters. The implementation is modelled
after Convolutional Neural Networks (CNN).
Parameters
----------
input: symbolic 5D tensor
Mini-batch of feature map stacks, of shape
(batch size, input channels, input depth, input rows, input columns).
See the optional parameter ``input_shape``.
filters: symbolic 5D tensor
Set of filters used in CNN layer of shape
(output channels, input channels, filter depth, filter rows, filter columns).
See the optional parameter ``filter_shape``.
input_shape: None, tuple/list of len 5 of int or Constant variable
The shape of the input parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
element is not known at compile time.
filter_shape: None, tuple/list of len 5 of int or Constant variable
The shape of the filters parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
element is not known at compile time.
border_mode: str, int or tuple of three int
Either of the following:
``'valid'``: apply filter wherever it completely overlaps with the
input. Generates output of shape: input shape - filter shape + 1
``'full'``: apply filter wherever it partly overlaps with the input.
Generates output of shape: input shape + filter shape - 1
``'half'``: pad input with a symmetric border of ``filter // 2``,
then perform a valid convolution. For filters with an odd
number of slices, rows and columns, this leads to the output
shape being equal to the input shape.
``int``: pad input with a symmetric border of zeros of the given
width, then perform a valid convolution.
``(int1, int2, int3)``
pad input with a symmetric border of ``int1``, ``int2`` and
``int3`` columns, then perform a valid convolution.
subsample: tuple of len 3
Factor by which to subsample the output.
Also called strides elsewhere.
filter_flip: bool
If ``True``, will flip the filter x, y and z dimensions before
sliding them over the input. This operation is normally
referred to as a convolution, and this is the default. If
``False``, the filters are not flipped and the operation is
referred to as a cross-correlation.
filter_dilation: tuple of len 3
Factor by which to subsample (stride) the input.
Also called dilation elsewhere.
Returns
-------
Symbolic 5D tensor
Set of feature maps generated by convolutional layer. Tensor is
of shape (batch size, output channels, output depth,
output rows, output columns)
Notes
-----
If cuDNN is available, it will be used on the
GPU. Otherwise, the *Corr3dMM* convolution (caffe-style
convolution) will be used.

This is only supported in Theano 0.8 or the development
version until it is released.
"""
input = as_tensor_variable(input)
filters = as_tensor_variable(filters)
conv_op = AbstractConv3d(imshp=input_shape,
kshp=filter_shape,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
return conv_op(input, filters)
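The output-shape rules listed in the `border_mode` documentation above can be sketched as plain Python. This mirrors what Theano's `get_conv_output_shape` computes for each dimension, but is an illustrative re-derivation, not the Theano function itself:

```python
# Sketch of the conv3d output-shape rules from the docstring: for each
# dimension, out = (in + 2*pad - dilated_kernel) // stride + 1, where the
# padding depends on border_mode.
def conv3d_output_shape(in_size, k_size, border_mode, stride, dilation):
    out = []
    for i, k, s, d in zip(in_size, k_size, stride, dilation):
        kd = (k - 1) * d + 1                   # dilated kernel size
        if border_mode == 'valid':
            pad = 0
        elif border_mode == 'full':
            pad = kd - 1
        elif border_mode == 'half':
            pad = kd // 2
        else:                                  # tuple of three ints
            pad = border_mode[len(out)]
        out.append((i + 2 * pad - kd) // s + 1)
    return tuple(out)

# 'valid' shrinks by kernel - 1; 'full' grows by kernel - 1:
assert conv3d_output_shape((8, 9, 10), (3, 3, 3), 'valid',
                           (1, 1, 1), (1, 1, 1)) == (6, 7, 8)
assert conv3d_output_shape((8, 9, 10), (3, 3, 3), 'full',
                           (1, 1, 1), (1, 1, 1)) == (10, 11, 12)
# 'half' with odd kernels and unit stride preserves the input shape:
assert conv3d_output_shape((8, 9, 10), (3, 3, 3), 'half',
                           (1, 1, 1), (1, 1, 1)) == (8, 9, 10)
```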
def conv2d_grad_wrt_inputs(output_grad,
filters,
input_shape,
......@@ -298,6 +397,141 @@ def conv2d_grad_wrt_inputs(output_grad,
return grad_input_op(filters, output_grad, input_shape[-2:])
def conv3d_grad_wrt_inputs(output_grad,
filters,
input_shape,
filter_shape=None,
border_mode='valid',
subsample=(1, 1, 1),
filter_flip=True,
filter_dilation=(1, 1, 1)):
"""Compute conv output gradient w.r.t its inputs
This function builds the symbolic graph for getting the
gradient of the output of a convolution (namely output_grad)
w.r.t the input of the convolution, given a set of 3D filters
used by the convolution, such that the output_grad is upsampled
to the input_shape.
Parameters
----------
output_grad : symbolic 5D tensor
mini-batch of feature map stacks, of shape (batch size,
output channels, output depth, output rows, output columns).
This is the tensor that will be upsampled or the output gradient
of the convolution whose gradient will be taken with respect to
the input of the convolution.
filters : symbolic 5D tensor
set of filters used in CNN layer of shape (output channels,
input channels, filter depth, filter rows, filter columns).
See the optional parameter ``filter_shape``.
input_shape : [None/int/Constant] * 2 + [Tensor/int/Constant] * 3
The shape of the input (upsampled) parameter.
A tuple/list of len 5, with the first two dimensions
being None or int or Constant and the last three dimensions being
Tensor or int or Constant.
Not Optional, since given the output_grad shape
and the subsample values, multiple input_shape may be
plausible.
filter_shape : None or [None/int/Constant] * 5
The shape of the filters parameter. None or a tuple/list of len 5.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that
this element is not known at compile time.
border_mode : str, int or tuple of three int
Either of the following:
``'valid'``
apply filter wherever it completely overlaps with the
input. Generates output of shape: input shape - filter
shape + 1
``'full'``
apply filter wherever it partly overlaps with the input.
Generates output of shape: input shape + filter shape - 1
``'half'``
pad input with a symmetric border of ``filter // 2``,
then perform a valid convolution. For filters with an odd
number of slices, rows and columns, this leads to the output
shape being equal to the input shape. It is known as 'same'
elsewhere.
``int``
pad input with a symmetric border of zeros of the given
width, then perform a valid convolution.
``(int1, int2, int3)``
pad input with a symmetric border of ``int1``, ``int2`` and
``int3`` columns, then perform a valid convolution.
subsample : tuple of len 3
The subsampling used in the forward pass. Also called strides
elsewhere.
filter_flip : bool
If ``True``, will flip the filter x, y and z dimensions before
sliding them over the input. This operation is normally
referred to as a convolution, and this is the default. If
``False``, the filters are not flipped and the operation is
referred to as a cross-correlation.
filter_dilation : tuple of len 3
The filter dilation used in the forward pass.
Also known as input striding.
Returns
-------
symbolic 5D tensor
set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output depth,
output rows, output columns)
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, the *Corr3dMM* convolution (caffe-style
convolution) will be used.

:note: This is only supported in Theano 0.8 or the development
version until it is released.
"""
filters = as_tensor_variable(filters)
output_grad = as_tensor_variable(output_grad)
# checking the type of input_shape
for dim in [0, 1]:
assert isinstance(input_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
for dim in [2, 3, 4]:
assert isinstance(input_shape[dim], (theano.tensor.TensorVariable,
theano.tensor.TensorConstant,
integer_types))
# checking the type of filter_shape
if filter_shape is not None:
for dim in [0, 1, 2, 3, 4]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
# setting the last three dimensions of input_shape to None, if
# the type of these dimensions is TensorVariable.
numerical_input_shape = list(input_shape)
for dim in [2, 3, 4]:
if isinstance(input_shape[dim], theano.tensor.TensorVariable):
numerical_input_shape[dim] = None
grad_input_op = AbstractConv3d_gradInputs(imshp=numerical_input_shape,
kshp=filter_shape,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
return grad_input_op(filters, output_grad, input_shape[-3:])
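The docstring notes that `input_shape` is required because, given `output_grad` and the subsampling, several input shapes may be plausible. The ambiguity is easy to see from the 'valid' shape relation, sketched here in plain Python (illustrative helper, not Theano API):

```python
# Why input_shape is "Not Optional": with stride s > 1, the 'valid'
# relation out = (in - kd) // s + 1 (kd = dilated kernel size) maps
# several input sizes to the same output size.
def valid_out_size(in_size, k, s, d):
    kd = (k - 1) * d + 1
    return (in_size - kd) // s + 1

# With a 3-tap kernel and stride 2, inputs of size 9 and 10 both
# produce an output of size 4, so the gradient op cannot infer
# the input size on its own:
assert valid_out_size(9, 3, 2, 1) == 4
assert valid_out_size(10, 3, 2, 1) == 4
```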
def conv2d_grad_wrt_weights(input,
output_grad,
filter_shape,
......@@ -425,6 +659,132 @@ def conv2d_grad_wrt_weights(input,
return gradWeight_op(input, output_grad, filter_shape[-2:])
def conv3d_grad_wrt_weights(input,
output_grad,
filter_shape,
input_shape=None,
border_mode='valid',
subsample=(1, 1, 1),
filter_flip=True,
filter_dilation=(1, 1, 1)):
"""Compute conv output gradient w.r.t its weights
This function will build the symbolic graph for getting the
gradient of the output of a convolution (output_grad) w.r.t its weights.
Parameters
----------
input : symbolic 5D tensor
mini-batch of feature map stacks, of shape (batch size, input
channels, input depth, input rows, input columns). This is the input
of the convolution in the forward pass.
output_grad : symbolic 5D tensor
mini-batch of feature map stacks, of shape (batch size,
output channels, output depth, output rows, output columns).
This is the gradient of the output of convolution.
filter_shape : [None/int/Constant] * 2 + [Tensor/int/Constant] * 3
The shape of the filter parameter. A tuple/list of len 5, with the
first two dimensions being None or int or Constant and the last three
dimensions being Tensor or int or Constant.
Not Optional, since given the output_grad shape and
the input_shape, multiple filter_shape may be plausible.
input_shape : None or [None/int/Constant] * 5
The shape of the input parameter. None or a tuple/list of len 5.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify
that this element is not known at compile time.
border_mode : str, int or tuple of three ints
Either of the following:
``'valid'``
apply filter wherever it completely overlaps with the
input. Generates output of shape: input shape - filter
shape + 1
``'full'``
apply filter wherever it partly overlaps with the input.
Generates output of shape: input shape + filter shape - 1
``'half'``
pad input with a symmetric border of ``filter size // 2``
in each convolution dimension, then perform a
valid convolution. For filters with an odd number of slices,
rows and columns, this leads to the output shape being equal to
the input shape. It is known as 'same' elsewhere.
``int``
pad input with a symmetric border of zeros of the given
width, then perform a valid convolution.
``(int1, int2, int3)``
pad input with a symmetric border of ``int1``, ``int2`` and
``int3``, then perform a valid convolution.
subsample : tuple of len 3
The subsampling used in the forward pass of the convolutional
operation. Also called strides elsewhere.
filter_flip : bool
If ``True``, will flip the filters before sliding them over the
input. This operation is normally referred to as a convolution,
and this is the default. If ``False``, the filters are not
flipped and the operation is referred to as a cross-correlation.
filter_dilation : tuple of len 3
The filter dilation used in the forward pass.
Also known as input striding.
Returns
-------
symbolic 5D tensor
set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output depth, output
rows, output columns)
Notes
-----
:note: If cuDNN is available, it will be used on the
GPU. Otherwise, the *Corr3dMM* convolution (caffe-style
convolution) will be used.

:note: This is only supported in Theano 0.8 or the development
version until it is released.
"""
input = as_tensor_variable(input)
output_grad = as_tensor_variable(output_grad)
# checking the type of filter_shape
for dim in [0, 1]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
for dim in [2, 3, 4]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorVariable,
theano.tensor.TensorConstant,
integer_types))
# checking the type of input_shape
if input_shape is not None:
for dim in [0, 1, 2, 3, 4]:
assert isinstance(input_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
# setting the last three dimensions of filter_shape to None, if
# the type of these dimensions is TensorVariable.
numerical_filter_shape = list(filter_shape)
for dim in [2, 3, 4]:
if isinstance(filter_shape[dim], theano.tensor.TensorVariable):
numerical_filter_shape[dim] = None
gradWeight_op = AbstractConv3d_gradWeights(imshp=input_shape,
kshp=numerical_filter_shape,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
return gradWeight_op(input, output_grad, filter_shape[-3:])
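Symmetrically to the gradient w.r.t. inputs, `filter_shape` is required here because a strided forward pass admits several kernel sizes for the same input/output pair. A plain-Python illustration (hypothetical helper, not Theano API):

```python
# Why filter_shape is "Not Optional": with stride s > 1, the 'valid'
# relation out = (in - kd) // s + 1 admits several dilated kernel
# sizes kd for the same (in, out) pair.
def compatible_kernel_sizes(in_size, out_size, s):
    return [kd for kd in range(1, in_size + 1)
            if (in_size - kd) // s + 1 == out_size]

# in=10, out=4, stride=2: dilated kernels of size 3 or 4 both fit,
# so the shape must be given explicitly:
assert compatible_kernel_sizes(10, 4, 2) == [3, 4]
# With stride 1 the kernel size is uniquely determined:
assert compatible_kernel_sizes(10, 8, 1) == [3]
```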
def bilinear_kernel_2D(ratio, normalize=True):
"""Compute 2D kernel for bilinear upsampling
......@@ -608,45 +968,46 @@ def bilinear_upsampling(input,
row * ratio, col * ratio))
class BaseAbstractConv(Op):
"""Base class for AbstractConv
Define an abstract convolution op that will be replaced with the
appropriate implementation
Parameters
----------
convdim: The number of convolution dimensions (2 or 3).
imshp: None, tuple/list of len ``(2 + convdim)`` of int or Constant variable
The shape of the input parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
element is not known at compile time.
imshp is defined w.r.t the forward conv.
kshp: None, tuple/list of len ``(2 + convdim)`` of int or Constant variable
The shape of the filters parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
element is not known at compile time.
kshp is defined w.r.t the forward conv.
border_mode: str, int or tuple of ``convdim`` ints
Either of the following:
``'valid'``: apply filter wherever it completely overlaps with the
input. Generates output of shape: input shape - filter shape + 1
``'full'``: apply filter wherever it partly overlaps with the input.
Generates output of shape: input shape + filter shape - 1
``'half'``: pad input with a symmetric border of ``filter size // 2``
in each convolution dimension, then perform a valid convolution.
For filters with an odd filter size, this leads to the output
shape being equal to the input shape.
``int``: pad input with a symmetric border of zeros of the given
width, then perform a valid convolution.
``(int1, int2)``: (for 2D) pad input with a symmetric border of ``int1``,
``int2``, then perform a valid convolution.
``(int1, int2, int3)``: (for 3D) pad input with a symmetric border of
``int1``, ``int2`` and ``int3``, then perform a valid convolution.
subsample: tuple of len ``convdim``
Factor by which to subsample the output.
Also called strides elsewhere.
......@@ -657,34 +1018,46 @@ class BaseAbstractConv2d(Op):
are not flipped and the operation is referred to as a
cross-correlation.
filter_dilation: tuple of len ``convdim``
Factor by which to subsample (stride) the input.
Also called dilation factor.
"""
check_broadcast = False
__props__ = ('convdim', 'border_mode', 'subsample', 'filter_flip',
'imshp', 'kshp', 'filter_dilation')
def __init__(self, convdim,
imshp=None, kshp=None, border_mode="valid",
subsample=None, filter_flip=True, filter_dilation=None):
self.convdim = convdim
if convdim not in (2, 3):
raise ValueError(
'convolution dimension {} is not supported'.format(convdim))
if subsample is None:
subsample = (1,) * convdim
if filter_dilation is None:
filter_dilation = (1,) * convdim
if isinstance(border_mode, integer_types):
border_mode = (border_mode,) * convdim
if isinstance(border_mode, tuple):
if len(border_mode) != convdim:
raise ValueError(
'border mode must have exactly {} values, '
'but was {}'.format(convdim, border_mode))
border_mode = tuple(map(int, border_mode))
if border_mode == (0,) * convdim:
border_mode = 'valid'
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full', 'half')):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of {}'
' integers'.format(border_mode, convdim))
self.imshp = tuple(imshp) if imshp else (None,) * (2 + convdim)
for imshp_i in self.imshp:
if imshp_i is not None:
# Components of imshp should be constant or ints
......@@ -696,7 +1069,7 @@ class BaseAbstractConv2d(Op):
ValueError("imshp should be None or a tuple of "
"constant int values"),
sys.exc_info()[2])
self.kshp = tuple(kshp) if kshp else (None,) * (2 + convdim)
for kshp_i in self.kshp:
if kshp_i is not None:
# Components of kshp should be constant or ints
......@@ -711,36 +1084,41 @@ class BaseAbstractConv2d(Op):
self.border_mode = border_mode
self.filter_flip = filter_flip
if len(subsample) != convdim:
raise ValueError("subsample must have {} elements".format(convdim))
self.subsample = tuple(subsample)
if len(filter_dilation) != convdim:
raise ValueError("filter_dilation must have {} elements".format(convdim))
self.filter_dilation = tuple(filter_dilation)
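The `border_mode` normalization performed in `__init__` above (an int is broadcast to `convdim` values, and an all-zero tuple collapses to `'valid'`) can be exercised on its own. `normalize_border_mode` is an illustrative standalone helper, not Theano API:

```python
# Standalone sketch of the border_mode normalization in
# BaseAbstractConv.__init__ (hypothetical helper, not Theano API).
def normalize_border_mode(border_mode, convdim):
    if isinstance(border_mode, int):
        border_mode = (border_mode,) * convdim
    if isinstance(border_mode, tuple):
        if len(border_mode) != convdim:
            raise ValueError('border mode must have exactly {} values, '
                             'but was {}'.format(convdim, border_mode))
        border_mode = tuple(map(int, border_mode))
    if border_mode == (0,) * convdim:
        border_mode = 'valid'
    if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
            border_mode in ('valid', 'full', 'half')):
        raise ValueError('invalid border_mode {}'.format(border_mode))
    return border_mode

assert normalize_border_mode(1, 3) == (1, 1, 1)      # int is broadcast
assert normalize_border_mode((0, 0, 0), 3) == 'valid'  # zero padding
assert normalize_border_mode('half', 2) == 'half'    # strings pass through
```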
def do_constant_folding(self, node):
# Disable constant folding since there is no implementation.
# This may change in the future.
return False
def flops(self, inp, outp):
""" Useful with the hack in profiling to print the MFlops"""
if self.convdim == 2:
# if the output shape is correct, then this gives the correct
# flops for any direction, sampling, padding, and border mode
inputs, filters = inp
outputs, = outp
assert inputs[1] == filters[1]
# nb mul and add by output pixel
flops = filters[2] * filters[3] * 2
# nb flops by output image
flops *= outputs[2] * outputs[3]
# nb patch multiplied
flops *= inputs[1] * filters[0] * inputs[0]
return flops
else:
# TODO implement for convdim == 3
raise NotImplementedError(
'flops not implemented for convdim={}'.format(self.convdim))
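The 2D flops count implemented above can be sanity-checked with a standalone sketch; the function name and the shapes below are illustrative, not part of Theano:

```python
def conv2d_flops(input_shape, filter_shape, output_shape):
    # Mirrors the counting above: one multiply and one add per kernel tap,
    # per output pixel, per (input channel, output channel, batch) triple.
    batch, in_ch, _, _ = input_shape
    out_ch, f_in_ch, kh, kw = filter_shape
    _, _, oh, ow = output_shape
    assert in_ch == f_in_ch
    flops = kh * kw * 2              # mul and add per output pixel
    flops *= oh * ow                 # per output image
    flops *= in_ch * out_ch * batch  # number of patches multiplied
    return flops

# A 3x3 filter producing a single output pixel needs 9 multiplies + 9 adds.
assert conv2d_flops((1, 1, 3, 3), (1, 1, 3, 3), (1, 1, 1, 1)) == 18
```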
def conv(self, img, kern, mode="valid", dilation=1):
"""
Basic slow Python 2D or 3D convolution for DebugMode
"""
if not imported_scipy_signal:
......@@ -751,48 +1129,70 @@ class BaseAbstractConv2d(Op):
raise ValueError(
'invalid mode {}, which must be either '
'"valid" or "full"'.format(mode))
if isinstance(dilation, integer_types):
dilation = (dilation,) * self.convdim
if len(dilation) != self.convdim:
raise ValueError(
'invalid dilation {}, expected {} values'.format(dilation,
self.convdim))
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, [1] * self.convdim, dilation)
out = numpy.zeros(out_shape, dtype=img.dtype)
dil_kern_shp = kern.shape[:-self.convdim] + tuple(
(kern.shape[-self.convdim + i] - 1) * dilation[i] + 1
for i in range(self.convdim))
dilated_kern = numpy.zeros(dil_kern_shp, dtype=kern.dtype)
dilated_kern[(slice(None), slice(None)) +
tuple(slice(None, None, dilation[i]) for i in range(self.convdim))
] = kern
if self.convdim == 2:
val = _valfrommode(mode)
bval = _bvalfromboundary('fill')
with warnings.catch_warnings():
warnings.simplefilter('ignore', numpy.ComplexWarning)
for b in xrange(img.shape[0]):
for n in xrange(kern.shape[0]):
for im0 in xrange(img.shape[1]):
# some cast generates a warning here
out[b, n, ...] += _convolve2d(img[b, im0, ...],
dilated_kern[n, im0, ...],
1, val, bval, 0)
elif self.convdim == 3:
for b in xrange(img.shape[0]):
for n in xrange(kern.shape[0]):
for im0 in xrange(img.shape[1]):
out[b, n, ...] += convolve(img[b, im0, ...],
dilated_kern[n, im0, ...],
mode)
else:
raise NotImplementedError('only 2D and 3D convolution are implemented')
return out
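The kernel-dilation step used by `conv` (zeros inserted between neighbouring taps before the actual correlation) can be sketched in plain NumPy; `dilate_kernel` is an illustrative helper, not a Theano API:

```python
import numpy

def dilate_kernel(kern, dilation):
    # Insert (dilation - 1) zeros between neighbouring taps along each of
    # the trailing spatial axes, matching the dil_kern_shp computation above.
    convdim = len(dilation)
    shp = kern.shape[:-convdim] + tuple(
        (kern.shape[-convdim + i] - 1) * dilation[i] + 1
        for i in range(convdim))
    out = numpy.zeros(shp, dtype=kern.dtype)
    out[(slice(None),) * (kern.ndim - convdim) +
        tuple(slice(None, None, d) for d in dilation)] = kern
    return out

k = numpy.arange(1, 5).reshape(2, 2)   # [[1, 2], [3, 4]]
print(dilate_kernel(k, (2, 2)))
# [[1 0 2]
#  [0 0 0]
#  [3 0 4]]
```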
class AbstractConv(BaseAbstractConv):
""" Abstract Op for the forward convolution.
Refer to :func:`BaseAbstractConv <theano.tensor.nnet.abstract_conv.BaseAbstractConv>`
for more detailed documentation.
"""
def __init__(self,
convdim,
imshp=None,
kshp=None,
border_mode="valid",
subsample=None,
filter_flip=True,
filter_dilation=None):
super(AbstractConv, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
def make_node(self, img, kern):
# Make sure both inputs are Variables with the same Type
......@@ -804,14 +1204,13 @@ class AbstractConv2d(BaseAbstractConv2d):
broadcastable=kern.broadcastable)
kern = ktype.filter_variable(kern)
if img.type.ndim != 2 + self.convdim:
raise TypeError('img must be %dD tensor' % (2 + self.convdim))
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
broadcastable = [img.broadcastable[0],
kern.broadcastable[0],
kern.broadcastable[0]] + ([False] * self.convdim)
output = img.type.clone(broadcastable=broadcastable)()
return Apply(self, [img, kern], [output])
......@@ -819,8 +1218,8 @@ class AbstractConv2d(BaseAbstractConv2d):
img, kern = inp
img = numpy.asarray(img)
kern = numpy.asarray(kern)
dil_kernshp = tuple((kern.shape[2 + i] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim))
o, = out_
mode = self.border_mode
......@@ -828,25 +1227,30 @@ class AbstractConv2d(BaseAbstractConv2d):
mode in ('valid', 'full', 'half')):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of'
' integers'.format(mode))
if mode == "full":
mode = tuple(dil_kernshp[i] - 1 for i in range(self.convdim))
elif mode == "half":
mode = tuple(dil_kernshp[i] // 2 for i in range(self.convdim))
if isinstance(mode, tuple):
pad = tuple(int(mode[i]) for i in range(self.convdim))
mode = "valid"
new_img = numpy.zeros((img.shape[0], img.shape[1]) +
tuple(img.shape[i + 2] + 2 * pad[i]
for i in range(self.convdim)),
dtype=img.dtype)
new_img[(slice(None), slice(None)) +
tuple(slice(pad[i], img.shape[i + 2] + pad[i])
for i in range(self.convdim))] = img
img = new_img
if not self.filter_flip:
kern = kern[(slice(None), slice(None)) + (slice(None, None, -1),) * self.convdim]
conv_out = self.conv(img, kern, mode="valid", dilation=self.filter_dilation)
conv_out = conv_out[(slice(None), slice(None)) +
tuple(slice(None, None, self.subsample[i])
for i in range(self.convdim))]
o[0] = node.outputs[0].type.filter(conv_out)
......@@ -861,6 +1265,42 @@ class AbstractConv2d(BaseAbstractConv2d):
rval += self.make_node(inputs[0], eval_points[1]).outputs[0]
return [rval]
def infer_shape(self, node, input_shapes):
imshp = input_shapes[0]
kshp = input_shapes[1]
# replace symbolic shapes with known constant shapes
if self.imshp is not None:
imshp = [imshp[i] if self.imshp[i] is None else self.imshp[i]
for i in range(2 + self.convdim)]
if self.kshp is not None:
kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
for i in range(2 + self.convdim)]
res = get_conv_output_shape(imshp, kshp, self.border_mode,
self.subsample, self.filter_dilation)
return [res]
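`infer_shape` defers to `get_conv_output_shape`; per spatial dimension the arithmetic reduces to the standard formula below. This is a simplified re-derivation for illustration, not the library function itself:

```python
def conv_output_length(image, kernel, border_mode, stride=1, dilation=1):
    # Length of one spatial output dimension for the given border mode,
    # subsampling stride, and filter dilation (a sketch, not Theano's code).
    dil_kernel = (kernel - 1) * dilation + 1
    if border_mode == 'valid':
        pad = 0
    elif border_mode == 'full':
        pad = dil_kernel - 1
    elif border_mode == 'half':
        pad = dil_kernel // 2
    else:
        pad = border_mode  # explicit integer padding
    return (image + 2 * pad - dil_kernel) // stride + 1

assert conv_output_length(7, 3, 'valid') == 5
assert conv_output_length(7, 3, 'full') == 9
assert conv_output_length(7, 3, 'half') == 7   # odd kernels preserve size
assert conv_output_length(7, 3, 'valid', stride=2) == 3
assert conv_output_length(7, 3, 'valid', dilation=2) == 3
```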
class AbstractConv2d(AbstractConv):
""" Abstract Op for the forward convolution.
Refer to :func:`BaseAbstractConv <theano.tensor.nnet.abstract_conv.BaseAbstractConv>`
for more detailed documentation.
"""
def __init__(self,
imshp=None,
kshp=None,
border_mode="valid",
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1)):
super(AbstractConv2d, self).__init__(convdim=2,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
def grad(self, inp, grads):
bottom, weights = inp
top, = grads
......@@ -889,25 +1329,59 @@ class AbstractConv2d(BaseAbstractConv2d):
d_weights = weights.type.filter_variable(d_weights)
return d_bottom, d_weights
class AbstractConv3d(AbstractConv):
""" Abstract Op for the forward convolution.
Refer to :func:`BaseAbstractConv <theano.tensor.nnet.abstract_conv.BaseAbstractConv>`
for more detailed documentation.
"""
def __init__(self,
imshp=None,
kshp=None,
border_mode="valid",
subsample=(1, 1, 1),
filter_flip=True,
filter_dilation=(1, 1, 1)):
super(AbstractConv3d, self).__init__(convdim=3,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
def grad(self, inp, grads):
bottom, weights = inp
top, = grads
d_bottom = AbstractConv3d_gradInputs(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation)(
weights, top, bottom.shape[-3:])
d_weights = AbstractConv3d_gradWeights(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation)(
bottom, top, weights.shape[-3:])
# Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable.
# Also make sure that the gradient lives on the same device as
# the corresponding input.
d_bottom = patternbroadcast(d_bottom, bottom.broadcastable)
d_bottom = bottom.type.filter_variable(d_bottom)
d_weights = patternbroadcast(d_weights, weights.broadcastable)
d_weights = weights.type.filter_variable(d_weights)
return d_bottom, d_weights
class AbstractConv_gradWeights(BaseAbstractConv):
"""Gradient wrt. filters for `AbstractConv`.
Refer to :func:`BaseAbstractConv <theano.tensor.nnet.abstract_conv.BaseAbstractConv>`
for more detailed documentation.
:note: You will not want to use this directly, but rely on
......@@ -916,17 +1390,19 @@ class AbstractConv2d_gradWeights(BaseAbstractConv2d):
"""
def __init__(self,
convdim,
imshp=None,
kshp=None,
border_mode="valid",
subsample=None,
filter_flip=True,
filter_dilation=None):
super(AbstractConv_gradWeights, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
# Update shape/height_width
def make_node(self, img, topgrad, shape):
......@@ -939,15 +1415,14 @@ class AbstractConv2d_gradWeights(BaseAbstractConv2d):
broadcastable=topgrad.broadcastable)
topgrad = gtype.filter_variable(topgrad)
if img.type.ndim != 2 + self.convdim:
raise TypeError('img must be %dD tensor' % (2 + self.convdim))
if topgrad.type.ndim != 2 + self.convdim:
raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim))
shape = as_tensor_variable(shape)
broadcastable = [topgrad.broadcastable[1],
img.broadcastable[1],
img.broadcastable[1]] + ([False] * self.convdim)
output = img.type.clone(broadcastable=broadcastable)()
return Apply(self, [img, topgrad, shape], [output])
......@@ -963,45 +1438,97 @@ class AbstractConv2d_gradWeights(BaseAbstractConv2d):
mode in ('valid', 'full', 'half')):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of'
' integers'.format(mode))
dil_shape = tuple((shape[i] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim))
if mode == "full":
mode = tuple(dil_shape[i] - 1 for i in range(self.convdim))
elif mode == "half":
mode = tuple(dil_shape[i] // 2 for i in range(self.convdim))
if isinstance(mode, tuple):
pad = tuple(int(mode[i]) for i in range(self.convdim))
mode = "valid"
new_img = numpy.zeros((img.shape[0], img.shape[1]) +
tuple(img.shape[i + 2] + 2 * pad[i]
for i in range(self.convdim)),
dtype=img.dtype)
new_img[(slice(None), slice(None)) +
tuple(slice(pad[i], img.shape[i + 2] + pad[i])
for i in range(self.convdim))] = img
img = new_img
if any(self.subsample[i] > 1 for i in range(self.convdim)):
new_shape = ((topgrad.shape[0], topgrad.shape[1]) +
tuple(img.shape[i + 2] - dil_shape[i] + 1
for i in range(self.convdim)))
new_topgrad = numpy.zeros((new_shape), dtype=topgrad.dtype)
new_topgrad[(slice(None), slice(None)) +
tuple(slice(None, None, self.subsample[i])
for i in range(self.convdim))] = topgrad
topgrad = new_topgrad
axes_order = (1, 0) + tuple(range(2, self.convdim + 2))
flip_filters = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
topgrad = topgrad.transpose(axes_order)[flip_filters]
img = img.transpose(axes_order)
kern = self.conv(img, topgrad, mode="valid")
if any(self.filter_dilation[i] > 1 for i in range(self.convdim)):
kern = kern[(slice(None), slice(None)) +
tuple(slice(None, None, self.filter_dilation[i])
for i in range(self.convdim))]
if self.filter_flip:
kern = kern.transpose(axes_order)[flip_filters]
else:
kern = kern.transpose(axes_order)
o[0] = node.outputs[0].type.filter(kern)
def connection_pattern(self, node):
return [[1], [1], [0]] # no connection to height, width
def infer_shape(self, node, input_shapes):
# We use self.kshp (that was passed when creating the Op) if possible,
# or fall back to the `shape` input of the node.
# TODO: when there is no subsampling, try to infer the kernel shape
# from the shapes of inputs.
imshp = input_shapes[0]
topshp = input_shapes[1]
kshp = self.kshp[:] if self.kshp is not None else [None] * (2 + self.convdim)
fallback_kshp = ([topshp[1], imshp[1]] +
[node.inputs[2][i] for i in range(self.convdim)])
kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
for i in range(2 + self.convdim)]
return [kshp]
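When any `subsample` entry exceeds 1, `perform` above first scatters the strided `topgrad` back onto the dense grid before running the valid correlation. The scatter can be sketched as follows; `unstride` is an illustrative name and the shapes are made up:

```python
import numpy

def unstride(topgrad, subsample, full_shape):
    # Zero-fill the dense grid, then write each gradient value back at its
    # strided position; all other entries stay zero (a sketch of the step
    # done in perform() before the valid correlation).
    out = numpy.zeros(full_shape, dtype=topgrad.dtype)
    out[(slice(None), slice(None)) +
        tuple(slice(None, None, s) for s in subsample)] = topgrad
    return out

g = numpy.ones((1, 1, 2, 2))
print(unstride(g, (2, 2), (1, 1, 3, 3))[0, 0])
# [[1. 0. 1.]
#  [0. 0. 0.]
#  [1. 0. 1.]]
```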
class AbstractConv2d_gradWeights(AbstractConv_gradWeights):
"""Gradient wrt. filters for `AbstractConv2d`.
Refer to :func:`BaseAbstractConv <theano.tensor.nnet.abstract_conv.BaseAbstractConv>`
for more detailed documentation.
:note: You will not want to use this directly, but rely on
Theano's automatic differentiation or graph optimization to
use it as needed.
"""
def __init__(self,
imshp=None,
kshp=None,
border_mode="valid",
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1)):
super(AbstractConv2d_gradWeights, self).__init__(convdim=2,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
def grad(self, inp, grads):
bottom, top = inp[:2]
weights, = grads
......@@ -1031,26 +1558,64 @@ class AbstractConv2d_gradWeights(BaseAbstractConv2d):
d_height_width = (theano.gradient.DisconnectedType()(),)
return (d_bottom, d_top) + d_height_width
class AbstractConv3d_gradWeights(AbstractConv_gradWeights):
"""Gradient wrt. filters for `AbstractConv3d`.
Refer to :func:`BaseAbstractConv <theano.tensor.nnet.abstract_conv.BaseAbstractConv>`
for more detailed documentation.
:note: You will not want to use this directly, but rely on
Theano's automatic differentiation or graph optimization to
use it as needed.
"""
def __init__(self,
imshp=None,
kshp=None,
border_mode="valid",
subsample=(1, 1, 1),
filter_flip=True,
filter_dilation=(1, 1, 1)):
super(AbstractConv3d_gradWeights, self).__init__(convdim=3,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
def grad(self, inp, grads):
bottom, top = inp[:2]
weights, = grads
d_bottom = AbstractConv3d_gradInputs(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation)(weights,
top,
bottom.shape[-3:])
d_top = AbstractConv3d(self.imshp,
self.kshp,
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable.
# Also make sure that the gradient lives on the same device as
# the corresponding input.
d_bottom = patternbroadcast(d_bottom, bottom.broadcastable)
d_bottom = bottom.type.filter_variable(d_bottom)
d_top = patternbroadcast(d_top, top.broadcastable)
d_top = top.type.filter_variable(d_top)
d_depth_height_width = (theano.gradient.DisconnectedType()(),)
return (d_bottom, d_top) + d_depth_height_width
class AbstractConv_gradInputs(BaseAbstractConv):
"""Gradient wrt. inputs for `AbstractConv`.
Refer to :func:`BaseAbstractConv <theano.tensor.nnet.abstract_conv.BaseAbstractConv>`
for more detailed documentation.
:note: You will not want to use this directly, but rely on
......@@ -1060,17 +1625,19 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
"""
def __init__(self,
convdim,
imshp=None,
kshp=None,
border_mode="valid",
subsample=None,
filter_flip=True,
filter_dilation=None):
super(AbstractConv_gradInputs, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
# Update shape/height_width
def make_node(self, kern, topgrad, shape):
......@@ -1083,15 +1650,14 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
broadcastable=topgrad.broadcastable)
topgrad = gtype.filter_variable(topgrad)
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
if topgrad.type.ndim != 2 + self.convdim:
raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim))
shape = as_tensor_variable(shape)
broadcastable = [topgrad.type.broadcastable[0],
kern.type.broadcastable[1],
kern.type.broadcastable[1]] + ([False] * self.convdim)
output = kern.type.clone(broadcastable=broadcastable)()
return Apply(self, [kern, topgrad, shape], [output])
......@@ -1106,35 +1672,86 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
mode in ('valid', 'full', 'half')):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of'
' integers'.format(mode))
dil_kernshp = tuple((kern.shape[i + 2] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim))
pad = (0,) * self.convdim
if mode == "full":
pad = tuple(dil_kernshp[i] - 1 for i in range(self.convdim))
elif mode == "half":
pad = tuple(dil_kernshp[i] // 2 for i in range(self.convdim))
elif isinstance(mode, tuple):
pad = tuple(mode[i] for i in range(self.convdim))
if any(self.subsample[i] > 1 for i in range(self.convdim)):
new_shape = ((topgrad.shape[0], topgrad.shape[1]) +
tuple(shape[i] + 2 * pad[i] - dil_kernshp[i] + 1
for i in range(self.convdim)))
new_topgrad = numpy.zeros((new_shape), dtype=topgrad.dtype)
new_topgrad[(slice(None), slice(None)) +
tuple(slice(None, None, self.subsample[i])
for i in range(self.convdim))] = topgrad
topgrad = new_topgrad
axes_order = (1, 0) + tuple(range(2, self.convdim + 2))
flip_filters = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
kern = kern.transpose(axes_order)
if self.filter_flip:
topgrad = topgrad[flip_filters]
img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation)
if self.filter_flip:
img = img[flip_filters]
if any(p > 0 for p in pad):
img = img[(slice(None), slice(None)) +
tuple(slice(pad[i], img.shape[i + 2] - pad[i])
for i in range(self.convdim))]
o[0] = node.outputs[0].type.filter(img)
def connection_pattern(self, node):
return [[1], [1], [0]] # no connection to height, width
def infer_shape(self, node, input_shapes):
# We use self.imshp (that was passed when creating the Op) if possible,
# or fall back to the `shape` input of the node.
# TODO: when there is no subsampling, try to infer the image shape
# from the shapes of inputs.
kshp = input_shapes[0]
topshp = input_shapes[1]
imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim)
fallback_imshp = ([topshp[0], kshp[1]] +
[node.inputs[2][i] for i in range(self.convdim)])
imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i]
for i in range(2 + self.convdim)]
return [imshp]
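The explicit `shape` input to this Op exists because a strided convolution is not shape-invertible: different input lengths can yield the same output length, so the gradient cannot recover the input size on its own. A quick numeric check (sketch; function name is illustrative):

```python
def valid_output_length(image, kernel, stride):
    # 'valid' convolution output length for one spatial dimension.
    return (image - kernel) // stride + 1

# Inputs of length 7 and 8 both yield a length-3 output with a size-3
# kernel and stride 2, so the input size must be passed in explicitly.
assert valid_output_length(7, 3, 2) == 3
assert valid_output_length(8, 3, 2) == 3
```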
class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
"""Gradient wrt. inputs for `AbstractConv2d`.
Refer to :func:`BaseAbstractConv <theano.tensor.nnet.abstract_conv.BaseAbstractConv>`
for more detailed documentation.
:note: You will not want to use this directly, but rely on
Theano's automatic differentiation or graph optimization to
use it as needed.
"""
def __init__(self,
imshp=None,
kshp=None,
border_mode="valid",
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1)):
super(AbstractConv2d_gradInputs, self).__init__(convdim=2,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
def grad(self, inp, grads):
weights, top = inp[:2]
bottom, = grads
......@@ -1162,19 +1779,55 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
d_height_width = (theano.gradient.DisconnectedType()(),)
return (d_weights, d_top) + d_height_width
class AbstractConv3d_gradInputs(AbstractConv_gradInputs):
"""Gradient wrt. inputs for `AbstractConv3d`.
Refer to :func:`BaseAbstractConv <theano.tensor.nnet.abstract_conv.BaseAbstractConv>`
for more detailed documentation.
:note: You will not want to use this directly, but rely on
Theano's automatic differentiation or graph optimization to
use it as needed.
"""
def __init__(self,
imshp=None,
kshp=None,
border_mode="valid",
subsample=(1, 1, 1),
filter_flip=True,
filter_dilation=(1, 1, 1)):
super(AbstractConv3d_gradInputs, self).__init__(convdim=3,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation)
def grad(self, inp, grads):
weights, top = inp[:2]
bottom, = grads
d_weights = AbstractConv3d_gradWeights(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation)(bottom, top,
weights.shape[-3:])
d_top = AbstractConv3d(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable.
# Also make sure that the gradient lives on the same device as
# the corresponding input.
d_weights = patternbroadcast(d_weights, weights.broadcastable)
d_weights = weights.type.filter_variable(d_weights)
d_top = patternbroadcast(d_top, top.broadcastable)
d_top = top.type.filter_variable(d_top)
d_depth_height_width = (theano.gradient.DisconnectedType()(),)
return (d_weights, d_top) + d_depth_height_width
from __future__ import absolute_import, print_function, division
import os
import logging
from six import integer_types
import theano
from theano import Apply
from theano import gof
from theano.tensor import as_tensor_variable, TensorType
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor import blas_headers
from theano.tensor.blas import ldflags, blas_header_version
_logger = logging.getLogger(__name__)
class BaseCorr3dMM(gof.OpenMPOp):
"""
Base class for `Corr3dMM`, `Corr3dMM_gradWeights` and
`Corr3dMM_gradInputs`. Cannot be used directly.
Parameters
----------
border_mode : {'valid', 'full', 'half'}
Additionally, the padding size can be directly specified by an integer
or a tuple of three integers.
subsample
Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation
Perform dilated correlation (default: (1, 1, 1))
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation')
def __init__(self, border_mode="valid", subsample=(1, 1, 1),
filter_dilation=(1, 1, 1), openmp=None):
super(BaseCorr3dMM, self).__init__(openmp=openmp)
if isinstance(border_mode, integer_types):
if border_mode < 0:
raise ValueError(
'invalid border_mode {}, which must be a '
'non-negative integer'.format(border_mode))
border_mode = (border_mode, border_mode, border_mode)
if isinstance(border_mode, tuple):
if len(border_mode) != 3 or min(border_mode) < 0:
raise ValueError(
'invalid border_mode {}, which must be a tuple of '
'three non-negative integers'.format(border_mode))
pad_h, pad_w, pad_d = map(int, border_mode)
border_mode = (pad_h, pad_w, pad_d)
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full', 'half')):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of three'
' integers'.format(border_mode))
self.border_mode = border_mode
if len(subsample) != 3:
raise ValueError("subsample must have three elements")
if len(filter_dilation) != 3:
raise ValueError("filter_dilation must have three elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
if not theano.config.blas.ldflags:
raise NotImplementedError("C code for corrMM* classes needs a BLAS library.")
else:
if 'openblas' in theano.config.blas.ldflags:
self.blas_type = 'openblas'
elif 'mkl' in theano.config.blas.ldflags:
self.blas_type = 'mkl'
else:
self.blas_type = ''
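The `border_mode` validation above normalizes an integer to a symmetric 3-tuple and rejects malformed values; the standalone sketch below mirrors that logic (`normalize_border_mode` is an illustrative name, not part of the module):

```python
def normalize_border_mode(border_mode):
    # Sketch of the validation above: ints become symmetric 3-tuples,
    # valid mode strings pass through, anything else is rejected.
    if isinstance(border_mode, int):
        if border_mode < 0:
            raise ValueError('border_mode must be non-negative')
        return (border_mode,) * 3
    if isinstance(border_mode, tuple):
        if len(border_mode) != 3 or min(border_mode) < 0:
            raise ValueError('expected three non-negative integers')
        return tuple(map(int, border_mode))
    if border_mode in ('valid', 'full', 'half'):
        return border_mode
    raise ValueError('invalid border_mode {}'.format(border_mode))

assert normalize_border_mode(1) == (1, 1, 1)
assert normalize_border_mode('half') == 'half'
```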
@property
def pad(self):
if self.border_mode != 'valid':
return self.border_mode
return (0, 0, 0)
def __str__(self):
return '%s{%s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample),
str(self.filter_dilation))
@staticmethod
def as_common_dtype(in1, in2):
"""
Upcast input variables if necessary.
"""
dtype = theano.scalar.upcast(in1.dtype, in2.dtype)
return in1.astype(dtype), in2.astype(dtype)
def c_support_code(self):
ccodes = blas_headers.blas_header_text()
if self.blas_type == 'openblas':
ccodes += blas_headers.openblas_threads_text()
elif self.blas_type == 'mkl':
ccodes += blas_headers.mkl_threads_text()
return ccodes
def c_libraries(self):
return ldflags()
def c_compile_args(self):
compile_args = ldflags(libs=False, flags=True)
compile_args += super(BaseCorr3dMM, self).c_compile_args()
return compile_args
def c_lib_dirs(self):
return ldflags(libs=False, libs_dir=True)
def c_header_dirs(self):
return ldflags(libs=False, include_dir=True)
def c_headers(self):
headers = ['<stdio.h>']
headers += super(BaseCorr3dMM, self).c_headers()
return headers
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (1, self.openmp, blas_header_version())
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
# these files
sub = {}
dtype = str(node.__dict__['inputs'][0].dtype)
assert dtype in ('float32', 'float64')
if dtype == 'float32':
sub['gemm'] = 'sgemm_'
sub['float_type'] = 'npy_float'
sub['float_typenum'] = 'NPY_FLOAT'
sub['n_bytes'] = 4
sub['c_float_type'] = 'float'
else:
sub['gemm'] = 'dgemm_'
sub['float_type'] = 'npy_double'
sub['float_typenum'] = 'NPY_DOUBLE'
sub['n_bytes'] = 8
sub['c_float_type'] = 'double'
if self.openmp:
sub['omp_flags'] = '#pragma omp parallel for schedule(static)'
sub['omp_get_max_threads'] = 'omp_get_max_threads()'
sub['omp_get_thread_num'] = 'omp_get_thread_num()'
if self.blas_type == 'openblas':
sub['blas_set_num_threads'] = 'openblas_set_num_threads'
sub['blas_get_num_threads'] = 'openblas_get_num_threads()'
elif self.blas_type == 'mkl':
sub['blas_set_num_threads'] = 'mkl_set_num_threads'
sub['blas_get_num_threads'] = 'mkl_get_max_threads()'
else:
sub['blas_set_num_threads'] = ''
sub['blas_get_num_threads'] = '0'
else:
sub['omp_flags'] = ''
sub['omp_get_max_threads'] = '1'
sub['omp_get_thread_num'] = '0'
sub['blas_set_num_threads'] = ''
sub['blas_get_num_threads'] = '0'
files = ['corr3d_gemm.c']
codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files]
return ''.join(codes) % sub
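The C sources are assembled with Python percent-substitution over the `sub` dictionary built above, which is why `corr3d_gemm.c` doubles every literal percent sign (the modulo operator is written `%%`). A minimal sketch of the mechanism, with illustrative template text:

```python
# Keys mirror the substitution dictionary built in c_support_code_apply;
# '%%' in the template collapses to a single '%' after substitution.
template = "%(float_type)s r = a %% b;  // uses %(gemm)s"
sub = {'float_type': 'npy_float', 'gemm': 'sgemm_'}
assert template % sub == "npy_float r = a % b;  // uses sgemm_"
```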
def c_code_helper(self, bottom, weights, top, direction, sub,
height=None, width=None, depth=None):
"""
This generates the C code for Corr3dMM (direction="forward"),
Corr3dMM_gradWeights (direction="backprop weights"), and
Corr3dMM_gradInputs (direction="backprop inputs").
Depending on the direction, one of bottom, weights, top will
receive the output, while the other two serve as inputs.
:param bottom: Variable name of the input images in the forward pass,
or the gradient of the input images in backprop wrt. inputs
:param weights: Variable name of the filters in the forward pass,
or the gradient of the filters in backprop wrt. weights
:param top: Variable name of the output images / feature maps in the
forward pass, or the gradient of the outputs in the backprop passes
:param direction: "forward" to correlate bottom with weights and store
results in top,
"backprop weights" to do a valid convolution of bottom with top
(swapping the first two dimensions) and store results in weights,
and "backprop inputs" to do a full convolution of top with weights
(swapping the first two dimensions) and store results in bottom.
:param sub: Dictionary of substitutions used when generating the
C code.
:param height: If self.subsample[0] != 1, a variable giving the height
of the filters for direction="backprop weights" or the height of
the input images for direction="backprop inputs".
If self.border_mode == 'half', a variable giving the height of the
filters for direction="backprop weights". Ignored otherwise.
:param width: If self.subsample[1] != 1, a variable giving the width
of the filters for direction="backprop weights" or the width of the
input images for direction="backprop inputs".
If self.border_mode == 'half', a variable giving the width of the
filters for direction="backprop weights". Ignored otherwise.
:param depth: If self.subsample[2] != 1, a variable giving the depth
of the filters for direction="backprop weights" or the depth of the
input images for direction="backprop inputs".
If self.border_mode == 'half', a variable giving the depth of the
filters for direction="backprop weights". Ignored otherwise.
"""
dH, dW, dD = self.subsample
dilH, dilW, dilD = self.filter_dilation
if self.border_mode == "half":
padH = padW = padD = -1
elif self.border_mode == "full":
padH = padW = padD = -2
elif isinstance(self.border_mode, tuple):
padH, padW, padD = self.border_mode
else:
assert self.border_mode == "valid"
padH = padW = padD = 0
if direction == "forward":
direction = 0
out = top
elif direction == "backprop weights":
direction = 1
out = weights
elif direction == "backprop inputs":
direction = 2
out = bottom
else:
raise ValueError("direction must be one of 'forward', "
"'backprop weights', 'backprop inputs'")
# When subsampling, we cannot unambiguously infer the height, width and
# depth of bottom and weights from top, so we require them to be given.
# Similarly, when border_mode="half", we cannot infer the weight size.
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
if not height:
raise ValueError("height must be given for backprop with vertical subsampling or border_mode='half'")
height = '(*(npy_int64 *)(PyArray_DATA(%s)))' % height
else:
height = '-1'
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
if not width:
raise ValueError("width must be given for backprop with horizontal subsampling or border_mode='half'")
width = '(*(npy_int64 *)(PyArray_DATA(%s)))' % width
else:
width = '-1'
if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
if not depth:
raise ValueError("depth must be given for backprop with depth subsampling or border_mode='half'")
depth = '(*(npy_int64 *)(PyArray_DATA(%s)))' % depth
else:
depth = '-1'
sub = sub.copy()
sub.update(locals())
return """
// Mandatory args
int direction = %(direction)s; // forward, bprop weights, bprop inputs
// Optional args
int dH = %(dH)s;
int dW = %(dW)s;
int dD = %(dD)s;
int dilH = %(dilH)s;
int dilW = %(dilW)s;
int dilD = %(dilD)s;
int padH = %(padH)s;
int padW = %(padW)s;
int padD = %(padD)s;
PyArrayObject * bottom = %(bottom)s;
PyArrayObject * weights = %(weights)s;
PyArrayObject * top = %(top)s;
PyArrayObject * out2 = NULL;
// Obtain or infer kernel height, width and depth
// (we need to know it early to be able to handle auto-padding)
int kH, kW, kD;
if (direction != 1) {
// weight is an input variable, we can just read its shape
kH = PyArray_DIMS(weights)[2];
kW = PyArray_DIMS(weights)[3];
kD = PyArray_DIMS(weights)[4];
}
else {
if ((dH != 1) || (padH == -1)) {
// vertical subsampling or half padding, kernel height is specified
kH = %(height)s;
}
else if (padH == -2) {
// vertical full padding, we can infer the kernel height
kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1;
}
else {
// explicit padding, we can infer the kernel height
kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
}
if ((dW != 1) || (padW == -1)) {
kW = %(width)s;
}
else if (padW == -2) {
kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
else {
kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
if ((dD != 1) || (padD == -1)) {
kD = %(depth)s;
}
else if (padD == -2) {
kD = (2 - PyArray_DIMS(bottom)[4] + (PyArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
}
else {
kD = (PyArray_DIMS(bottom)[4] + 2*padD - (PyArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
}
}
// Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1;
int dil_kD = (kD - 1) * dilD + 1;
// Auto-padding if requested
if (padH == -1) { // vertical half padding
padH = dil_kH / 2;
}
else if (padH == -2) { // vertical full padding
padH = dil_kH - 1;
}
else if (padH < 0) {
PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padH must be >= -2");
%(fail)s
}
if (padW == -1) { // horizontal half padding
padW = dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = dil_kW - 1;
}
else if (padW < 0) {
PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padW must be >= -2");
%(fail)s
}
if (padD == -1) { // depth half padding
padD = dil_kD / 2;
}
else if (padD == -2) { // depth full padding
padD = dil_kD - 1;
}
else if (padD < 0) {
PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padD must be >= -2");
%(fail)s
}
// Infer output shape
npy_intp out_dim[5];
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width, depth)
// height, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
out_dim[4] = (npy_intp)((PyArray_DIMS(bottom)[4] + 2*padD - ((PyArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1);
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width, depth)
// height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1];
out_dim[2] = (npy_intp)kH; // already inferred further above
out_dim[3] = (npy_intp)kW; // how convenient
out_dim[4] = (npy_intp)kD;
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width, depth)
// height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
out_dim[4] = (npy_intp)((dD != 1) ? %(depth)s : (PyArray_DIMS(top)[4] - 1) * dD + (PyArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD);
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: direction must be 0, 1, or 2\\n");
%(fail)s
}
// Prepare output array
int typenum;
if ( !(%(out)s
&& PyArray_NDIM(%(out)s)==5
&& PyArray_IS_C_CONTIGUOUS(%(out)s)
&& PyArray_DIMS(%(out)s)[0]==out_dim[0]
&& PyArray_DIMS(%(out)s)[1]==out_dim[1]
&& PyArray_DIMS(%(out)s)[2]==out_dim[2]
&& PyArray_DIMS(%(out)s)[3]==out_dim[3]
&& PyArray_DIMS(%(out)s)[4]==out_dim[4]))
{
Py_XDECREF(%(out)s);
if (direction != 1) {
typenum = PyArray_TYPE(weights);
}
else {
typenum = PyArray_TYPE(bottom);
}
// Use PyArray_ZEROS so the output starts zero-initialized;
// PyArray_EMPTY would be faster but leaves uninitialized memory.
%(out)s = (PyArrayObject*)PyArray_ZEROS(5,
out_dim,
typenum,
0);
if (NULL == %(out)s)
{
PyErr_Format(PyExc_RuntimeError,
"BaseCorr3dMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld x %%lld",
(long long)out_dim[0], (long long)out_dim[1],
(long long)out_dim[2], (long long)out_dim[3], (long long)out_dim[4]);
%(fail)s
}
}
// Call corr3dMM code
out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction,
dH, dW, dD, dilH, dilW, dilD, padH, padW, padD);
if (out2==NULL){
%(fail)s
}
assert (out2 == %(out)s);
""" % sub
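A pure-Python sketch of how the `-1` ('half') and `-2` ('full') padding sentinels above are resolved once the dilated kernel size is known; `resolve_pad` is an illustrative helper, not part of the Op:

```python
def resolve_pad(pad, k, dil=1):
    # implicit dilated kernel size, as in the generated C code
    dil_k = (k - 1) * dil + 1
    if pad == -1:        # 'half' padding
        return dil_k // 2
    if pad == -2:        # 'full' padding
        return dil_k - 1
    if pad < 0:
        raise ValueError("pad must be >= -2")
    return pad
```

For an odd kernel, 'half' padding preserves the input size, e.g. `resolve_pad(-1, 3)` gives 1 pixel of padding per side.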
class Corr3dMM(BaseCorr3dMM):
"""
CPU correlation implementation using Matrix Multiplication.
Parameters
----------
border_mode
The width of a border of implicit zeros to pad the
input with. Must be a tuple with 3 elements giving the width of
the padding on each side, or a single integer to pad the same
on all sides, or a string shortcut setting the padding at runtime:
``'valid'`` for ``(0, 0, 0)`` (valid convolution, no padding), ``'full'``
for ``(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)``
(full convolution), ``'half'`` for ``(kernel_rows // 2,
kernel_columns // 2, kernel_depth // 2)`` (same convolution for
odd-sized kernels). Note that the three widths are each
applied twice, once per side (left and right, top and bottom, front
and back).
subsample
The subsample operation applied to each output image. Should be a tuple
with 3 elements. Set to `(1, 1, 1)` to disable subsampling.
filter_dilation
The filter dilation operation applied to each input image.
Should be a tuple with 3 elements.
Set to `(1, 1, 1)` to disable filter dilation.
"""
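The three border modes in the docstring can be checked against the forward shape formula used throughout this file; `out_dim` is an illustrative helper for a single spatial dimension:

```python
def out_dim(in_dim, k, stride=1, dil=1, pad=0):
    # forward pass: top = (bottom + 2*pad - ((k-1)*dil + 1)) // stride + 1
    return (in_dim + 2 * pad - ((k - 1) * dil + 1)) // stride + 1

assert out_dim(9, 3) == 7              # 'valid': no padding
assert out_dim(9, 3, pad=3 // 2) == 9  # 'half': same size for odd kernels
assert out_dim(9, 3, pad=3 - 1) == 11  # 'full' convolution
```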
def make_node(self, img, kern):
img = as_tensor_variable(img)
kern = as_tensor_variable(kern)
img, kern = self.as_common_dtype(img, kern)
if img.type.ndim != 5:
raise TypeError('img must be 5D tensor')
if kern.type.ndim != 5:
raise TypeError('kern must be 5D tensor')
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False, False]
dtype = img.type.dtype
return Apply(self, [img, kern], [TensorType(dtype, broadcastable)()])
def infer_shape(self, node, input_shape):
imshp = input_shape[0]
kshp = input_shape[1]
res = get_conv_output_shape(
imshp,
kshp,
self.border_mode,
self.subsample,
self.filter_dilation)
return [res]
def c_code(self, node, nodename, inp, out_, sub):
bottom, weights = inp
top, = out_
direction = "forward"
return super(Corr3dMM, self).c_code_helper(bottom, weights, top, direction, sub)
def grad(self, inp, grads):
bottom, weights = inp
top, = grads
d_bottom = Corr3dMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation)(weights, top,
bottom.shape[-3:])
d_weights = Corr3dMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation)(bottom, top,
weights.shape[-3:])
return d_bottom, d_weights
class Corr3dMM_gradWeights(BaseCorr3dMM):
"""
Gradient wrt. filters for `Corr3dMM`.
Notes
-----
You will not want to use this directly, but rely on
Theano's automatic differentiation or graph optimization to
use it as needed.
"""
def make_node(self, img, topgrad, shape=None):
img = as_tensor_variable(img)
topgrad = as_tensor_variable(topgrad)
img, topgrad = self.as_common_dtype(img, topgrad)
if img.type.ndim != 5:
raise TypeError('img must be 5D tensor')
if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) or self.border_mode == "half":
if shape is None:
raise ValueError('shape must be given if subsample != (1, 1, 1)'
' or border_mode == "half"')
height_width_depth = [as_tensor_variable(shape[0]).astype('int64'),
as_tensor_variable(shape[1]).astype('int64'),
as_tensor_variable(shape[2]).astype('int64')]
else:
height_width_depth = []
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False, False]
dtype = img.type.dtype
return Apply(self, [img, topgrad] + height_width_depth,
[TensorType(dtype, broadcastable)()])
def infer_shape(self, node, input_shape):
if self.border_mode == "half":
padH = padW = padD = -1
elif self.border_mode == "full":
padH = padW = padD = -2
elif isinstance(self.border_mode, tuple):
padH, padW, padD = self.border_mode
else:
assert self.border_mode == "valid"
padH = padW = padD = 0
dH, dW, dD = self.subsample
imshp = input_shape[0]
topshp = input_shape[1]
ssize, imshp = imshp[1], list(imshp[2:])
nkern, topshp = topshp[1], list(topshp[2:])
height_width_depth = node.inputs[-3:]
if ((dH != 1) or (padH == -1)):
# vertical subsampling or half padding, kernel height is specified
kH = height_width_depth[0]
elif padH == -2:
# vertical full padding, we can infer the kernel height
kH = 2 - imshp[0] + (topshp[0] - 1) * dH
else:
# explicit padding, we can infer the kernel height
kH = imshp[0] + 2 * padH - (topshp[0] - 1) * dH
if ((dW != 1) or (padW == -1)):
kW = height_width_depth[1]
elif (padW == -2):
kW = 2 - imshp[1] + (topshp[1] - 1) * dW
else:
kW = imshp[1] + 2 * padW - (topshp[1] - 1) * dW
if ((dD != 1) or (padD == -1)):
kD = height_width_depth[2]
elif (padD == -2):
kD = 2 - imshp[2] + (topshp[2] - 1) * dD
else:
kD = imshp[2] + 2 * padD - (topshp[2] - 1) * dD
return [(nkern, ssize, kH, kW, kD)]
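The kernel-shape arithmetic above inverts the forward shape formula; a hedged sketch for one dimension in the dilation-1, explicit-padding case (`infer_kernel_dim` is an illustrative name):

```python
def infer_kernel_dim(in_dim, top_dim, stride=1, pad=0):
    # top = (in + 2*pad - k) // stride + 1
    #   =>  k = in + 2*pad - (top - 1) * stride
    return in_dim + 2 * pad - (top_dim - 1) * stride
```

With subsampling, the floor division makes this inverse ambiguous, which is why the kernel shape must be supplied when `subsample != (1, 1, 1)`.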
def c_code(self, node, nodename, inp, out_, sub):
bottom, top = inp[:2]
height, width, depth = inp[2:] or (None, None, None)
weights, = out_
direction = "backprop weights"
return super(Corr3dMM_gradWeights,
self).c_code_helper(bottom, weights, top, direction,
sub, height, width, depth)
def grad(self, inp, grads):
bottom, top = inp[:2]
weights, = grads
d_bottom = Corr3dMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation)(weights, top,
bottom.shape[-3:])
d_top = Corr3dMM(self.border_mode,
self.subsample,
self.filter_dilation)(bottom, weights)
d_height_width_depth = ((theano.gradient.DisconnectedType()(),) * 3
if len(inp) == 5 else ())
return (d_bottom, d_top) + d_height_width_depth
def connection_pattern(self, node):
if node.nin == 2:
return [[1], [1]]
else:
return [[1], [1], [0], [0], [0]] # no connection to height, width, depth
class Corr3dMM_gradInputs(BaseCorr3dMM):
"""
Gradient wrt. inputs for `Corr3dMM`.
Notes
-----
You will not want to use this directly, but rely on
Theano's automatic differentiation or graph optimization to
use it as needed.
"""
def make_node(self, kern, topgrad, shape=None):
kern = as_tensor_variable(kern)
topgrad = as_tensor_variable(topgrad)
kern, topgrad = self.as_common_dtype(kern, topgrad)
if kern.type.ndim != 5:
raise TypeError('kern must be 5D tensor')
if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) and shape is None:
raise ValueError('shape must be given if subsample != (1, 1, 1)')
if self.subsample != (1, 1, 1):
height_width_depth = [as_tensor_variable(shape[0]).astype('int64'),
as_tensor_variable(shape[1]).astype('int64'),
as_tensor_variable(shape[2]).astype('int64')]
else:
height_width_depth = []
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False, False]
dtype = kern.type.dtype
return Apply(self, [kern, topgrad] + height_width_depth,
[TensorType(dtype, broadcastable)()])
def infer_shape(self, node, input_shape):
if self.border_mode == "half":
padH = padW = padD = -1
elif self.border_mode == "full":
padH = padW = padD = -2
elif isinstance(self.border_mode, tuple):
padH, padW, padD = self.border_mode
else:
assert self.border_mode == "valid"
padH = padW = padD = 0
dH, dW, dD = self.subsample
kshp = input_shape[0]
topshp = input_shape[1]
ssize, kshp = kshp[1], list(kshp[2:])
bsize, topshp = topshp[0], list(topshp[2:])
height_width_depth = node.inputs[-3:]
if padH == -1:
padH = kshp[0] // 2
elif padH == -2:
padH = kshp[0] - 1
elif padH < -2:
raise ValueError('Corr3dMM_gradInputs: border_mode must be >= 0.')
if padW == -1:
padW = kshp[1] // 2
elif padW == -2:
padW = kshp[1] - 1
elif padW < -2:
raise ValueError('Corr3dMM_gradInputs: border_mode must be >= 0.')
if padD == -1:
padD = kshp[2] // 2
elif padD == -2:
padD = kshp[2] - 1
elif padD < -2:
raise ValueError('Corr3dMM_gradInputs: border_mode must be >= 0.')
if dH != 1:
out_shp0 = height_width_depth[0]
else:
out_shp0 = (topshp[0] - 1) * dH + kshp[0] - 2 * padH
if dW != 1:
out_shp1 = height_width_depth[1]
else:
out_shp1 = (topshp[1] - 1) * dW + kshp[1] - 2 * padW
if dD != 1:
out_shp2 = height_width_depth[2]
else:
out_shp2 = (topshp[2] - 1) * dD + kshp[2] - 2 * padD
out_shp = (out_shp0, out_shp1, out_shp2)
return [(bsize, ssize) + out_shp]
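Likewise, the branches above reconstruct the input size from the output size; a small illustrative sketch for one dimension (dilation 1):

```python
def infer_input_dim(top_dim, k, stride=1, pad=0):
    # bottom = (top - 1) * stride + k - 2 * pad
    return (top_dim - 1) * stride + k - 2 * pad
```

When `stride != 1`, several input sizes map to the same output size, hence the explicit `height_width_depth` inputs.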
def c_code(self, node, nodename, inp, out_, sub):
weights, top = inp[:2]
height, width, depth = inp[2:] or (None, None, None)
bottom, = out_
direction = "backprop inputs"
return super(Corr3dMM_gradInputs,
self).c_code_helper(bottom, weights, top, direction, sub,
height, width, depth)
def grad(self, inp, grads):
weights, top = inp[:2]
bottom, = grads
d_weights = Corr3dMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation)(bottom,
top,
weights.shape[-3:])
d_top = Corr3dMM(self.border_mode,
self.subsample,
self.filter_dilation)(bottom, weights)
d_height_width_depth = ((theano.gradient.DisconnectedType()(),) * 3
if len(inp) == 5 else ())
return (d_weights, d_top) + d_height_width_depth
def connection_pattern(self, node):
if node.nin == 2:
return [[1], [1]]
else:
return [[1], [1], [0], [0], [0]] # no connection to height, width, depth
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cpp)
// Loops for fast unfold + copy
void im3d2col(const %(float_type)s* data_im, const int channels,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
%(float_type)s* data_col) {
// Implicit dilated kernel size
int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
int dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
int channels_col = channels * kernel_h * kernel_w * kernel_d;
for (int c = 0; c < channels_col; ++c) {
int d_offset = c %% kernel_d;
int w_offset = (c / kernel_d) %% kernel_w;
int h_offset = (c / kernel_w / kernel_d) %% kernel_h;
int c_im = c / kernel_h / kernel_w / kernel_d;
for (int h = 0; h < height_col; ++h) {
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
for (int w = 0; w < width_col; ++w) {
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
for (int d = 0; d < depth_col; ++d) {
int d_pad = d * stride_d - pad_d + d_offset * dilation_d;
if (h_pad >= 0 && h_pad < height
&& w_pad >= 0 && w_pad < width
&& d_pad >= 0 && d_pad < depth)
data_col[(npy_intp)((c * height_col + h) * width_col + w) * depth_col + d] =
data_im[(npy_intp)((c_im * height + h_pad) * width + w_pad) * depth + d_pad];
else
data_col[(npy_intp)((c * height_col + h) * width_col + w) * depth_col + d] = 0.;
}
}
}
}
}
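The index arithmetic in the loop above packs a channel and three kernel offsets into a single column-row index `c`; a pure-Python sketch showing the decomposition and that it round-trips:

```python
def col_offsets(c, kernel_h, kernel_w, kernel_d):
    # mirrors the modulo arithmetic at the top of im3d2col's outer loop
    d_off = c % kernel_d
    w_off = (c // kernel_d) % kernel_w
    h_off = (c // (kernel_d * kernel_w)) % kernel_h
    c_im = c // (kernel_d * kernel_w * kernel_h)
    return c_im, h_off, w_off, d_off

c_im, h, w, d = col_offsets(5, 2, 2, 2)
assert ((c_im * 2 + h) * 2 + w) * 2 + d == 5  # round-trips
```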
// Unlike the Caffe and Theano GPU versions, the data_im array is set to zero
// before the col2im call rather than doing it here. So, the result is just
// accumulated into data_im.
void col2im3d(const %(float_type)s* data_col, const int channels,
const int height, const int width, const int depth,
const int patch_h, const int patch_w, const int patch_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
%(float_type)s* data_im) {
// Implicit dilated patch
int dil_patch_h = (patch_h - 1) * dilation_h + 1;
int dil_patch_w = (patch_w - 1) * dilation_w + 1;
int dil_patch_d = (patch_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
int num_kernels = channels * height * width * depth;
int channels_col = channels * patch_h * patch_w * patch_d;
for (int c = 0; c < channels_col; ++c) {
int d_offset = c %% patch_d;
int w_offset = (c / patch_d) %% patch_w;
int h_offset = (c / patch_w / patch_d) %% patch_h;
int c_im = c / patch_h / patch_w / patch_d;
for (int h = 0; h < height_col; ++h) {
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
for (int w = 0; w < width_col; ++w) {
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
for (int d = 0; d < depth_col; ++d) {
int d_pad = d * stride_d - pad_d + d_offset * dilation_d;
if (h_pad >= 0 && h_pad < height
&& w_pad >= 0 && w_pad < width
&& d_pad >= 0 && d_pad < depth)
data_im[(npy_intp)((c_im * height + h_pad) * width + w_pad) * depth + d_pad] +=
data_col[(npy_intp)((c * height_col + h) * width_col + w) * depth_col + d];
}
}
}
}
}
// Theano op code
// GPU version authors: Arjun Jain, Frederic Bastien, Jan Schlueter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// CPU version author: Jesse Livezey
// CPU version adapted from GPU version
PyArrayObject* corr3dMM(PyArrayObject* bottom,
PyArrayObject* weight,
PyArrayObject* top,
const int direction,
const int dH = 1,
const int dW = 1,
const int dD = 1,
const int dilH = 1,
const int dilW = 1,
const int dilD = 1,
const int padH = 0,
const int padW = 0,
const int padD = 0)
{
if (PyArray_NDIM(bottom) != 5)
{
PyErr_SetString(PyExc_ValueError, "Corr3dMM requires a 5D bottom");
return NULL;
}
if (PyArray_TYPE(bottom) != %(float_typenum)s)
{
PyErr_SetString(PyExc_ValueError, "Corr3dMM received bottom with wrong type.");
return NULL;
}
if (PyArray_NDIM(weight) != 5)
{
PyErr_SetString(PyExc_ValueError, "Corr3dMM requires a 5D weight");
return NULL;
}
if (PyArray_TYPE(weight) != %(float_typenum)s)
{
PyErr_SetString(PyExc_ValueError, "Corr3dMM received weight with wrong type.");
return NULL;
}
if (PyArray_NDIM(top) != 5)
{
PyErr_SetString(PyExc_ValueError, "Corr3dMM requires a 5D top");
return NULL;
}
if (PyArray_TYPE(top) != %(float_typenum)s)
{
PyErr_SetString(PyExc_ValueError, "Corr3dMM received top with wrong type.");
return NULL;
}
// Ensure data is contiguous
bottom = PyArray_GETCONTIGUOUS(bottom);
weight = PyArray_GETCONTIGUOUS(weight);
top = PyArray_GETCONTIGUOUS(top);
// Extract some shape information for later and check shape consistency
// bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
const int batchSize = PyArray_DIMS(bottom)[0];
const int nChannels = PyArray_DIMS(bottom)[1];
const int bottomHeight = PyArray_DIMS(bottom)[2];
const int bottomWidth = PyArray_DIMS(bottom)[3];
const int bottomDepth = PyArray_DIMS(bottom)[4];
// weights: (nFilters, nChannels, rows, columns, slices)
const int nFilters = PyArray_DIMS(weight)[0];
const int kH = PyArray_DIMS(weight)[2];
const int kW = PyArray_DIMS(weight)[3];
const int kD = PyArray_DIMS(weight)[4];
if (nChannels != PyArray_DIMS(weight)[1]) {
PyErr_SetString(PyExc_ValueError,
"Corr3dMM images and kernel must have the same stack size\n");
return NULL;
}
// implicit dilated filter
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
const int dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
const int topDepth = (bottomDepth + 2*padD - dil_kD) / dD + 1;
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
topWidth != PyArray_DIMS(top)[3] ||
topDepth != PyArray_DIMS(top)[4]) {
PyErr_Format(PyExc_ValueError,
"Corr3dMM shape inconsistency:\n"
" bottom shape: %%d %%d %%d %%d %%d\n"
" weight shape: %%d %%d %%d %%d %%d\n"
" top shape: %%ld %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d %%d)\n",
batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
nFilters, nChannels, kH, kW, kD,
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3], PyArray_DIMS(top)[4],
batchSize, nFilters, topHeight, topWidth, topDepth);
return NULL;
}
// Create temporary columns
int max_threads = %(omp_get_max_threads)s;
if (batchSize < max_threads) {
max_threads = batchSize;
}
npy_intp col_dim[3];
col_dim[0] = (npy_intp)max_threads;
col_dim[1] = (npy_intp)(nChannels * kW * kH * kD);
col_dim[2] = (npy_intp)(topHeight * topWidth * topDepth);
// Use PyArray_ZEROS so the buffer starts zero-initialized;
// PyArray_EMPTY would be faster but leaves uninitialized memory.
PyArrayObject* col = (PyArrayObject*)PyArray_ZEROS(3,
col_dim,
PyArray_TYPE(top),
0);
if (NULL == col) {
PyErr_Format(PyExc_RuntimeError,
"Corr3dMM failed to allocate working memory of"
" %%ld x %%ld x %%ld\n",
col_dim[0], col_dim[1], col_dim[2]);
return NULL;
}
// Define some useful variables
const int bottom_stride = PyArray_STRIDES(bottom)[0]/%(n_bytes)f;
const int top_stride = PyArray_STRIDES(top)[0]/%(n_bytes)f;
const int K_ = col_dim[1];
const int N_ = col_dim[2];
const int col_stride = (K_ * N_);
const int M_ = nFilters;
const %(c_float_type)s one = 1.0;
const %(c_float_type)s zero = 0.0;
char NTrans = 'N';
char Trans = 'T';
PyArrayObject *output;
if (direction == 0) { // forward pass
output = top;
// valid correlation: im3d2col, then gemm
// Iterate over batch
int blas_threads_saved = %(blas_get_num_threads)s;
// Always force gemm to a single thread when OpenMP is enabled, for best and stable performance.
%(blas_set_num_threads)s(1);
%(omp_flags)s
for (int n = 0; n < batchSize; ++n) {
int tid = %(omp_get_thread_num)s;
// First, im3d2col
im3d2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
// Second, gemm
%(gemm)s(&NTrans, &NTrans,
&N_, &M_, &K_,
&one,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight), &K_,
&zero,
(%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_);
}
// Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved);
}
else if (direction == 1) { // backprop wrt. weights
output = weight;
npy_intp weight_dim[2];
weight_dim[0] = (npy_intp)max_threads;
weight_dim[1] = (npy_intp)(M_ * K_);
PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2,
weight_dim, PyArray_TYPE(weight), 0);
if (NULL == local_weight)
{
PyErr_Format(PyExc_RuntimeError,
"Corr3dMM failed to allocate weight memory of %%ld x %%ld\n",
weight_dim[0], weight_dim[1]);
return NULL;
}
// valid convolution: im2col, then gemm
// Iterate over batch
int blas_threads_saved = %(blas_get_num_threads)s;
// Always force gemm to a single thread when OpenMP is enabled, for best and stable performance.
%(blas_set_num_threads)s(1);
// OMP for batch-level parallelization
%(omp_flags)s
for (int n = 0; n < batchSize; ++n) {
int tid = %(omp_get_thread_num)s;
// First, im2col
im3d2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
// Second, gemm
// Note that we accumulate into each thread's local_weight buffer. We do
// so by setting beta = 0 for the first iteration and beta = 1 for
// subsequent ones. (This is faster than zeroing the buffer before the loop.)
%(gemm)s(&Trans, &NTrans,
&K_, &M_, &N_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride, &N_,
(%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_,
(n == 0) ? &zero : &one,
(%(float_type)s*)PyArray_DATA(local_weight) +
tid * weight_dim[1], &K_);
}
// Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved);
// aggregate the per-thread partial weights
memset((%(float_type)s*)PyArray_DATA(weight), 0, M_ * K_*sizeof(%(float_type)s));
/*
* Put index "j" into outer loop to get the
* correct result when openmp is used.
*/
%(omp_flags)s
for(int j = 0; j < weight_dim[1]; ++j){
for(int i = 0; i < max_threads; ++i){
((%(float_type)s*)PyArray_DATA(weight))[j] +=
*((%(float_type)s*)PyArray_DATA(local_weight) +
i * weight_dim[1] + j);
}
}
Py_DECREF(local_weight);
}
else if (direction == 2) { // backprop wrt. inputs
output = bottom;
// bottom is set to zero here rather than inside of col2im
PyArray_FILLWBYTE(bottom, 0);
// full convolution: gemm, then col2im3d
// Iterate over batch
int blas_threads_saved = %(blas_get_num_threads)s;
// Always force gemm to a single thread when OpenMP is enabled, for best and stable performance.
%(blas_set_num_threads)s(1);
%(omp_flags)s
for (int n = 0; n < batchSize; ++n) {
// gemm into columns
int tid = %(omp_get_thread_num)s;
%(gemm)s(&NTrans, &Trans,
&N_, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight), &K_,
&zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride, &N_);
// col2im back to the data
col2im3d((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD,
(%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride);
}
// Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved);
}
// Free temporary columns
Py_DECREF(col);
// decref from contiguous check
Py_DECREF(bottom);
Py_DECREF(weight);
Py_DECREF(top);
// Note that we don't change the refcount of the output matrix here. Output
// (re)allocation and refcounting is done in BaseCorr3dMM.c_code_helper();
// in here output is just aliased to one of bottom, weights, or top.
return output;
}
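All three directions above reduce the 5D problem to one (M, K) x (K, N) matrix product per batch item. A hedged NumPy sketch of the forward case, with illustrative sizes:

```python
import numpy as np

nFilters, nChannels, kH, kW, kD = 2, 3, 2, 2, 2
topH, topW, topD = 4, 4, 4
K = nChannels * kH * kW * kD         # one row per (channel, kernel offset)
N = topH * topW * topD               # one column per output voxel
W_mat = np.random.rand(nFilters, K)  # filters flattened to (M, K)
col = np.random.rand(K, N)           # what im3d2col produces
top = W_mat @ col                    # the role of the sgemm_/dgemm_ call
assert top.shape == (nFilters, N)
```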
......@@ -10,6 +10,8 @@ from theano.gof.opt import copy_stack_trace
from theano.tensor.nnet.corr import (
CorrMM, CorrMM_gradInputs, CorrMM_gradWeights)
from theano.tensor.nnet.corr3d import (
Corr3dMM, Corr3dMM_gradInputs, Corr3dMM_gradWeights)
from theano.tensor.nnet.blocksparse import (
SparseBlockGemv,
SparseBlockOuter,
......@@ -18,6 +20,9 @@ from theano.tensor.nnet.blocksparse import (
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tensor.nnet.abstract_conv import (AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs)
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.opt import register_specialize_device
from theano.tensor import TensorType
......@@ -25,6 +30,7 @@ from theano.tensor import opt
# Cpu implementation
from theano.tensor.nnet.conv import conv2d, ConvOp
from theano.tensor.nnet.Conv3D import conv3D
from theano.tensor.nnet.ConvGrad3D import convGrad3D
from theano.tensor.nnet.ConvTransp3D import convTransp3D
......@@ -86,6 +92,28 @@ def local_abstractconv_gemm(node):
return [rval]
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
return
if not isinstance(node.op, AbstractConv3d):
return None
img, kern = node.inputs
if not isinstance(img.type, TensorType) or \
not isinstance(kern.type, TensorType):
return None
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
rval = Corr3dMM(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(img, kern)
copy_stack_trace(node.outputs[0], rval)
return [rval]
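Corr3dMM implements cross-correlation, which is why the optimizer above mirrors the kernel on all three spatial axes when the abstract Op requests a true convolution (`filter_flip=True`). The identity being relied on can be checked with naive single-channel NumPy loops (hypothetical helper names, not Theano code):

```python
import numpy as np

def corr3d_valid(img, kern):
    # naive 'valid' cross-correlation on one 3D volume
    o = tuple(i - k + 1 for i, k in zip(img.shape, kern.shape))
    out = np.zeros(o)
    for z in range(o[0]):
        for y in range(o[1]):
            for x in range(o[2]):
                out[z, y, x] = (img[z:z + kern.shape[0],
                                    y:y + kern.shape[1],
                                    x:x + kern.shape[2]] * kern).sum()
    return out

def conv3d_valid(img, kern):
    # true 'valid' convolution, written with explicitly reversed kernel indices
    kz, ky, kx = kern.shape
    o = tuple(i - k + 1 for i, k in zip(img.shape, kern.shape))
    out = np.zeros(o)
    for z in range(o[0]):
        for y in range(o[1]):
            for x in range(o[2]):
                s = 0.0
                for a in range(kz):
                    for b in range(ky):
                        for c in range(kx):
                            s += kern[a, b, c] * img[z + kz - 1 - a,
                                                     y + ky - 1 - b,
                                                     x + kx - 1 - c]
                out[z, y, x] = s
    return out

rng = np.random.RandomState(0)
img, kern = rng.rand(4, 5, 3), rng.rand(2, 3, 2)
# convolution == correlation with the kernel flipped on all three axes
assert np.allclose(conv3d_valid(img, kern),
                   corr3d_valid(img, kern[::-1, ::-1, ::-1]))
```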
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweight_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
......@@ -111,6 +139,31 @@ def local_abstractconv_gradweight_gemm(node):
return [rval]
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweight_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
return
if not isinstance(node.op, AbstractConv3d_gradWeights):
return None
img, topgrad, shape = node.inputs
if not isinstance(img.type, TensorType) or \
not isinstance(topgrad.type, TensorType):
return None
rval = Corr3dMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(img, topgrad, shape)
copy_stack_trace(node.outputs[0], rval)
# need to flip the kernel if necessary
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1, ::-1]
rval = theano.tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
copy_stack_trace(node.outputs[0], rval)
return [rval]
@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
......@@ -134,6 +187,29 @@ def local_abstractconv_gradinputs_gemm(node):
return [rval]
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags:
return
if not isinstance(node.op, AbstractConv3d_gradInputs):
return None
kern, topgrad, shape = node.inputs
if not isinstance(kern.type, TensorType) or \
not isinstance(topgrad.type, TensorType):
return None
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
rval = Corr3dMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(kern, topgrad,
shape)
copy_stack_trace(node.outputs[0], rval)
return [rval]
@local_optimizer([AbstractConv2d])
def local_conv2d_cpu(node):
......@@ -159,6 +235,37 @@ def local_conv2d_cpu(node):
return [rval]
@local_optimizer([AbstractConv3d])
def local_conv3d_cpu(node):
if not isinstance(node.op, AbstractConv3d):
return None
img, kern = node.inputs
if ((not isinstance(img.type, TensorType) or
not isinstance(kern.type, TensorType))):
return None
if node.op.border_mode not in ['valid', (0, 0, 0)]:
return None
if node.op.filter_dilation != (1, 1, 1):
return None
bias = theano.tensor.zeros_like(kern[:, 0, 0, 0, 0])
# need to flip the kernel if necessary (conv3D does not flip)
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
# conv3D expects shape (batch, row, column, time, channel)
img = img.dimshuffle(0, 2, 3, 4, 1)
kern = kern.dimshuffle(0, 2, 3, 4, 1)
rval = conv3D(img, kern, bias, node.op.subsample)
copy_stack_trace(node.outputs[0], rval)
rval = rval.dimshuffle(0, 4, 1, 2, 3)
return [rval]
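`local_conv3d_cpu` has to bridge two layouts: the abstract interface uses `(batch, channel, row, column, time)` while the legacy `conv3D` op wants `(batch, row, column, time, channel)`. The pair of dimshuffles is an exact round trip, as this NumPy transpose check with arbitrary small shapes shows:

```python
import numpy as np

x = np.arange(2 * 3 * 4 * 5 * 6).reshape(2, 3, 4, 5, 6)  # (b, c, r, col, t)
to_conv3d = x.transpose(0, 2, 3, 4, 1)     # dimshuffle(0, 2, 3, 4, 1): channels last
back = to_conv3d.transpose(0, 4, 1, 2, 3)  # dimshuffle(0, 4, 1, 2, 3): channels back
assert to_conv3d.shape == (2, 4, 5, 6, 3)
assert (back == x).all()
```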
@local_optimizer([AbstractConv2d_gradWeights])
def local_conv2d_gradweight_cpu(node):
if not isinstance(node.op, AbstractConv2d_gradWeights):
......@@ -277,6 +384,39 @@ def local_conv2d_gradweight_cpu(node):
return [res]
@local_optimizer([AbstractConv3d_gradWeights])
def local_conv3d_gradweight_cpu(node):
if not isinstance(node.op, AbstractConv3d_gradWeights):
return None
img, topgrad, shape = node.inputs
if ((not isinstance(img.type, TensorType) or
not isinstance(topgrad.type, TensorType))):
return None
if node.op.border_mode not in ['valid', (0, 0, 0)]:
return None
if node.op.filter_dilation != (1, 1, 1):
return None
# conv3D expects shape (batch, row, column, time, channel)
img = img.dimshuffle(0, 2, 3, 4, 1)
topgrad = topgrad.dimshuffle(0, 2, 3, 4, 1)
W_shape = (topgrad.shape[4], shape[0], shape[1], shape[2], img.shape[4])
rval = convGrad3D(img, node.op.subsample, W_shape, topgrad)
copy_stack_trace(node.outputs[0], rval)
rval = rval.dimshuffle(0, 4, 1, 2, 3)
# need to flip the kernel if necessary (conv3D does not flip)
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1, ::-1]
rval = theano.tensor.patternbroadcast(rval,
node.outputs[0].broadcastable)
return [rval]
@local_optimizer([AbstractConv2d_gradInputs])
def local_conv2d_gradinputs_cpu(node):
if not isinstance(node.op, AbstractConv2d_gradInputs):
......@@ -366,6 +506,38 @@ def local_conv2d_gradinputs_cpu(node):
return [din]
@local_optimizer([AbstractConv3d_gradInputs])
def local_conv3d_gradinputs_cpu(node):
if not isinstance(node.op, AbstractConv3d_gradInputs):
return None
kern, topgrad, shape = node.inputs
if ((not isinstance(kern.type, TensorType) or
not isinstance(topgrad.type, TensorType))):
return None
if node.op.border_mode not in ['valid', (0, 0, 0)]:
return None
if node.op.filter_dilation != (1, 1, 1):
return None
# need to flip the kernel if necessary (conv3D does not flip)
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
# conv3D expects shape (batch, row, column, time, channel)
kern = kern.dimshuffle(0, 2, 3, 4, 1)
topgrad = topgrad.dimshuffle(0, 2, 3, 4, 1)
bias = theano.tensor.zeros_like(kern[0, 0, 0, 0, :])
rval = convTransp3D(kern, bias, node.op.subsample, topgrad, shape)
copy_stack_trace(node.outputs[0], rval)
rval = rval.dimshuffle(0, 4, 1, 2, 3)
rval = theano.tensor.patternbroadcast(rval,
node.outputs[0].broadcastable)
return [rval]
# Register CPU optimizations
conv_groupopt = theano.gof.optdb.LocalGroupDB()
conv_groupopt.__name__ = "conv_opts"
......@@ -381,6 +553,14 @@ conv_groupopt.register('local_abstractconv_gradweight_gemm',
conv_groupopt.register('local_abstractconv_gradinputs_gemm',
local_abstractconv_gradinputs_gemm, 30,
'conv_gemm', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gemm', local_abstractconv3d_gemm, 30,
'conv_gemm', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gradweight_gemm',
local_abstractconv3d_gradweight_gemm, 30,
'conv_gemm', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm', 'fast_compile', 'fast_run')
# Legacy convolution
conv_groupopt.register('local_conv2d_cpu', local_conv2d_cpu, 40,
'fast_compile', 'fast_run')
......@@ -390,16 +570,30 @@ conv_groupopt.register('local_conv2d_gradweight_cpu',
conv_groupopt.register('local_conv2d_gradinputs_cpu',
local_conv2d_gradinputs_cpu, 40,
'fast_compile', 'fast_run')
conv_groupopt.register('local_conv3d_cpu', local_conv3d_cpu, 40,
'fast_compile', 'fast_run')
conv_groupopt.register('local_conv3d_gradweight_cpu',
local_conv3d_gradweight_cpu, 40,
'fast_compile', 'fast_run')
conv_groupopt.register('local_conv3d_gradinputs_cpu',
local_conv3d_gradinputs_cpu, 40,
'fast_compile', 'fast_run')
# Verify that no AbstractConv Ops are present in the graph
@local_optimizer([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs])
def local_abstractconv_check(node):
if isinstance(node.op, (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)):
AbstractConv2d_gradInputs,
AbstractConv3d,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs)):
raise AssertionError(
'%s Theano optimization failed: there is no implementation '
'available supporting the requested options. Did you exclude '
......
......@@ -9,7 +9,7 @@ import theano
from theano import tensor
from theano.gof.opt import check_stack_trace
from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr, abstract_conv as conv
from theano.tensor.nnet import corr, corr3d, abstract_conv as conv
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.nnet.abstract_conv import AbstractConv2d
from theano.tensor.nnet.abstract_conv import AbstractConv2d_gradInputs
......@@ -20,13 +20,16 @@ from theano.tensor.nnet.abstract_conv import bilinear_upsampling
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.corr import (CorrMM, CorrMM_gradWeights,
CorrMM_gradInputs)
from theano.tensor.nnet.corr3d import (Corr3dMM, Corr3dMM_gradWeights,
Corr3dMM_gradInputs)
from theano.tensor.nnet.Conv3D import Conv3D
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
def conv_corr(inputs, filters, border_mode="valid",
subsample=(1, 1), conv_mode='conv',
filter_dilation=(1, 1)):
def conv2d_corr(inputs, filters, border_mode="valid",
subsample=(1, 1), conv_mode='conv',
filter_dilation=(1, 1)):
if conv_mode == 'conv':
filters = filters[:, :, ::-1, ::-1]
return corr.CorrMM(border_mode,
......@@ -34,9 +37,9 @@ def conv_corr(inputs, filters, border_mode="valid",
filter_dilation)(inputs, filters)
def conv_corr_gw(inputs, topgrad, filters_shape,
border_mode="valid", subsample=(1, 1),
conv_mode='conv', filter_dilation=(1, 1)):
def conv2d_corr_gw(inputs, topgrad, filters_shape,
border_mode="valid", subsample=(1, 1),
conv_mode='conv', filter_dilation=(1, 1)):
rval = corr.CorrMM_gradWeights(border_mode,
subsample,
filter_dilation)(inputs, topgrad,
......@@ -46,9 +49,9 @@ def conv_corr_gw(inputs, topgrad, filters_shape,
return rval
def conv_corr_gi(filters, topgrad, inputs_shape,
border_mode="valid", subsample=(1, 1),
conv_mode='conv', filter_dilation=(1, 1)):
def conv2d_corr_gi(filters, topgrad, inputs_shape,
border_mode="valid", subsample=(1, 1),
conv_mode='conv', filter_dilation=(1, 1)):
if conv_mode == 'conv':
filters = filters[:, :, ::-1, ::-1]
return corr.CorrMM_gradInputs(border_mode,
......@@ -58,6 +61,40 @@ def conv_corr_gi(filters, topgrad, inputs_shape,
inputs_shape[2:])
def conv3d_corr(inputs, filters, border_mode="valid",
subsample=(1, 1, 1), conv_mode='conv',
filter_dilation=(1, 1, 1)):
if conv_mode == 'conv':
filters = filters[:, :, ::-1, ::-1, ::-1]
return corr3d.Corr3dMM(border_mode,
subsample,
filter_dilation)(inputs, filters)
def conv3d_corr_gw(inputs, topgrad, filters_shape,
border_mode="valid", subsample=(1, 1, 1),
conv_mode='conv', filter_dilation=(1, 1, 1)):
rval = corr3d.Corr3dMM_gradWeights(border_mode,
subsample,
filter_dilation)(inputs, topgrad,
filters_shape[2:])
if conv_mode == 'conv':
rval = rval[:, :, ::-1, ::-1, ::-1]
return rval
def conv3d_corr_gi(filters, topgrad, inputs_shape,
border_mode="valid", subsample=(1, 1, 1),
conv_mode='conv', filter_dilation=(1, 1, 1)):
if conv_mode == 'conv':
filters = filters[:, :, ::-1, ::-1, ::-1]
return corr3d.Corr3dMM_gradInputs(border_mode,
subsample,
filter_dilation)(filters,
topgrad,
inputs_shape[2:])
class TestGetConvOutShape(unittest.TestCase):
def test_basic(self):
image_shape, kernel_shape = (3, 2, 12, 9), (4, 2, 5, 6)
......@@ -77,35 +114,36 @@ class TestGetConvOutShape(unittest.TestCase):
self.assertTrue(test3_params == (3, 4, 20, 7))
self.assertTrue(test4_params == (3, 4, 6, 4))
def test_basic_3d(self):
image_shape, kernel_shape = (3, 2, 12, 9, 7), (4, 2, 5, 6, 4)
sub_sample = (1, 2, 1)
filter_dilation = (2, 1, 1)
test1_params = get_conv_output_shape(
image_shape, kernel_shape, 'valid', sub_sample, filter_dilation)
test2_params = get_conv_output_shape(
image_shape, kernel_shape, 'half', sub_sample, filter_dilation)
test3_params = get_conv_output_shape(
image_shape, kernel_shape, 'full', sub_sample, filter_dilation)
test4_params = get_conv_output_shape(
image_shape, kernel_shape, (1, 2, 3), sub_sample, filter_dilation)
self.assertTrue(test1_params == (3, 4, 4, 2, 4))
self.assertTrue(test2_params == (3, 4, 12, 5, 8))
self.assertTrue(test3_params == (3, 4, 20, 7, 10))
self.assertTrue(test4_params == (3, 4, 6, 4, 10))
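The expected shapes in `test_basic_3d` follow the standard rule `out = (in + 2*pad - dilated_kernel) // stride + 1` per spatial axis, with `dilated_kernel = (kernel - 1) * dilation + 1`; `'valid'` is zero padding. A sketch reproducing the test values (hypothetical helper, not `get_conv_output_shape` itself):

```python
def out_shape(image_shape, kernel_shape, pads, strides, dilations):
    spatial = []
    for i, k, p, s, d in zip(image_shape[2:], kernel_shape[2:],
                             pads, strides, dilations):
        dk = (k - 1) * d + 1                  # dilated kernel extent
        spatial.append((i + 2 * p - dk) // s + 1)
    return (image_shape[0], kernel_shape[0]) + tuple(spatial)

# 'valid' border mode is zero padding on every axis:
assert out_shape((3, 2, 12, 9, 7), (4, 2, 5, 6, 4),
                 (0, 0, 0), (1, 2, 1), (2, 1, 1)) == (3, 4, 4, 2, 4)
# explicit per-axis padding (1, 2, 3):
assert out_shape((3, 2, 12, 9, 7), (4, 2, 5, 6, 4),
                 (1, 2, 3), (1, 2, 1), (2, 1, 1)) == (3, 4, 6, 4, 10)
```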
class BaseTestConv2d:
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == '':
raise SkipTest("BLAS required for reference")
cls.inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7),
(6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
cls.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
(1, 1, 2, 3), (4, 1, 1, 3), (4, 5, 3, 2)]
cls.subsamples = [(1, 1), (2, 2), (2, 4)]
cls.filters_dilations = [(1, 1), (1, 2), (2, 1)]
cls.border_modes = ["valid", "half", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
cls.filter_flip = [True, False]
cls.provide_shape = [True, False]
cls.shared = staticmethod(theano.compile.shared)
class BaseTestConv(object):
def get_output_shape(self, inputs_shape, filters_shape,
subsample, border_mode, filter_dilation):
dil_filters = ((filters_shape[2] - 1) * filter_dilation[0] + 1,
(filters_shape[3] - 1) * filter_dilation[1] + 1)
dil_filters = tuple((s - 1) * d + 1 for s, d in zip(filters_shape[2:],
filter_dilation))
if border_mode == "valid":
border_mode = (0, 0)
border_mode = (0,) * (len(inputs_shape) - 2)
if border_mode == "half":
border_mode = (dil_filters[0] // 2,
dil_filters[1] // 2)
border_mode = tuple(d // 2 for d in dil_filters)
if border_mode == "full":
border_mode = (dil_filters[0] - 1,
dil_filters[1] - 1)
border_mode = tuple(d - 1 for d in dil_filters)
batch_size = inputs_shape[0]
num_filters = filters_shape[0]
return ((batch_size, num_filters,) +
......@@ -116,14 +154,24 @@ class BaseTestConv2d:
subsample, border_mode,
filter_dilation)))
def run_fwd(self, inputs_shape, filters_shape, ref=conv_corr,
subsample=(1, 1), verify_grad=True, mode=None,
def run_fwd(self, inputs_shape, filters_shape,
conv_fn, conv_op, ref,
subsample=None, verify_grad=True, mode=None,
border_mode='valid', filter_flip=True,
provide_shape=False, target_op=None,
check_trace=False, filter_dilation=(1, 1)):
check_trace=False, filter_dilation=None):
if subsample is None:
subsample = (1,) * (len(inputs_shape) - 2)
if filter_dilation is None:
filter_dilation = (1,) * (len(inputs_shape) - 2)
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
# scale down values to prevent rounding errors
inputs_val /= 10
filters_val /= 10
inputs = self.shared(inputs_val)
filters = self.shared(filters_val)
......@@ -143,13 +191,13 @@ class BaseTestConv2d:
subsample=subsample,
conv_mode=conv_mode,
filter_dilation=filter_dilation)
c = conv.conv2d(inputs, filters,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
input_shape=imshp,
filter_shape=kshp,
filter_dilation=filter_dilation)
c = conv_fn(inputs, filters,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
input_shape=imshp,
filter_shape=kshp,
filter_dilation=filter_dilation)
f_ref = theano.function([], c_ref, mode='FAST_RUN')
f = theano.function([], c, mode=mode)
......@@ -164,19 +212,24 @@ class BaseTestConv2d:
res = numpy.array(f())
utt.assert_allclose(res_ref, res)
if verify_grad:
utt.verify_grad(conv.AbstractConv2d(border_mode=border_mode,
imshp=imshp, kshp=kshp,
subsample=subsample,
filter_dilation=filter_dilation),
utt.verify_grad(conv_op(border_mode=border_mode,
imshp=imshp, kshp=kshp,
subsample=subsample,
filter_dilation=filter_dilation),
[inputs_val, filters_val],
mode=mode)
def run_gradweight(self, inputs_shape, filters_shape, output_shape,
ref=conv_corr_gw, subsample=(1, 1),
gradWeights_fn, ref, subsample=None,
filter_flip=True, verify_grad=True, mode=None,
border_mode='valid', provide_shape=False,
target_op=None, check_trace=False,
filter_dilation=(1, 1)):
filter_dilation=None):
if subsample is None:
subsample = (1,) * (len(inputs_shape) - 2)
if filter_dilation is None:
filter_dilation = (1,) * (len(inputs_shape) - 2)
inputs_val = numpy.random.random(inputs_shape).astype('float32')
output_val = numpy.random.random(output_shape).astype('float32')
......@@ -193,12 +246,12 @@ class BaseTestConv2d:
conv_mode = 'conv'
else:
conv_mode = 'cross'
c = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
filter_flip=filter_flip,
subsample=subsample,
imshp=imshp, kshp=kshp,
filter_dilation=filter_dilation)
c = c(inputs, output, filters_shape[-2:])
c = gradWeights_fn(border_mode=border_mode,
filter_flip=filter_flip,
subsample=subsample,
imshp=imshp, kshp=kshp,
filter_dilation=filter_dilation)
c = c(inputs, output, filters_shape[2:])
c_ref = ref(inputs, output,
filters_shape,
border_mode=border_mode,
......@@ -218,22 +271,28 @@ class BaseTestConv2d:
res = numpy.array(f())
utt.assert_allclose(res_ref, res)
def abstract_conv2d_gradweight(inputs_val, output_val):
conv_op = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
return conv_op(inputs_val, output_val, filters_shape[-2:])
def abstract_conv_gradweight(inputs_val, output_val):
conv_op = gradWeights_fn(border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
return conv_op(inputs_val, output_val, filters_shape[2:])
if verify_grad:
utt.verify_grad(abstract_conv2d_gradweight,
utt.verify_grad(abstract_conv_gradweight,
[inputs_val, output_val],
mode=mode, eps=1)
def run_gradinput(self, inputs_shape, filters_shape, output_shape,
ref=conv_corr_gi, subsample=(1, 1), filter_flip=True,
gradInputs_fn, ref,
subsample=None, filter_flip=True,
verify_grad=True, mode=None, border_mode='valid',
provide_shape=False, target_op=None,
check_trace=False, filter_dilation=(1, 1)):
check_trace=False, filter_dilation=None):
if subsample is None:
subsample = (1,) * (len(inputs_shape) - 2)
if filter_dilation is None:
filter_dilation = (1,) * (len(inputs_shape) - 2)
output_val = numpy.random.random(output_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
output = self.shared(output_val)
......@@ -249,12 +308,12 @@ class BaseTestConv2d:
conv_mode = 'conv'
else:
conv_mode = 'cross'
c = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
imshp=imshp, kshp=kshp,
filter_dilation=filter_dilation)
c = c(filters, output, inputs_shape[-2:])
c = gradInputs_fn(border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
imshp=imshp, kshp=kshp,
filter_dilation=filter_dilation)
c = c(filters, output, inputs_shape[2:])
c_ref = ref(filters, output, inputs_shape,
border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode, filter_dilation=filter_dilation)
......@@ -271,24 +330,24 @@ class BaseTestConv2d:
res = numpy.array(f())
utt.assert_allclose(res_ref, res)
def abstract_conv2d_gradinputs(filters_val, output_val):
conv_op = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
return conv_op(filters_val, output_val, inputs_shape[-2:])
def abstract_conv_gradinputs(filters_val, output_val):
conv_op = gradInputs_fn(border_mode=border_mode,
subsample=subsample,
filter_dilation=filter_dilation)
return conv_op(filters_val, output_val, inputs_shape[2:])
if verify_grad:
utt.verify_grad(abstract_conv2d_gradinputs,
utt.verify_grad(abstract_conv_gradinputs,
[filters_val, output_val],
mode=mode, eps=1)
def test_all(self):
if type(self) is BaseTestConv2d:
if type(self) is BaseTestConv:
raise SkipTest("base class")
ds = [1, 1]
db = (0, 0)
dflip = True in self.filter_flip
dprovide_shape = True in self.provide_shape
ds = self.default_subsamples
db = self.default_border_mode
dflip = self.default_filter_flip
dprovide_shape = self.default_provide_shape
for (i, f) in zip(self.inputs_shapes, self.filters_shapes):
for provide_shape in self.provide_shape:
yield (self.tcase, i, f, ds, db, dflip, provide_shape)
......@@ -301,6 +360,57 @@ class BaseTestConv2d:
yield (self.tcase, i, f, ds, db, flip, dprovide_shape)
class BaseTestConv2d(BaseTestConv):
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == '':
raise SkipTest("BLAS required for reference")
cls.inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7),
(6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
cls.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
(1, 1, 2, 3), (4, 1, 1, 3), (4, 5, 3, 2)]
cls.subsamples = [(1, 1), (2, 2), (2, 4)]
cls.default_subsamples = (1, 1)
cls.filters_dilations = [(1, 1), (1, 2), (2, 1)]
cls.border_modes = ["valid", "half", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
cls.default_border_mode = (0, 0)
cls.filter_flip = [True, False]
cls.default_filter_flip = True
cls.provide_shape = [True, False]
cls.default_provide_shape = True
cls.shared = staticmethod(theano.compile.shared)
def run_fwd(self, inputs_shape, filters_shape,
conv_fn=conv.conv2d, conv_op=conv.AbstractConv2d,
ref=conv2d_corr, **kwargs):
super(BaseTestConv2d, self).run_fwd(
inputs_shape=inputs_shape,
filters_shape=filters_shape,
conv_fn=conv_fn,
conv_op=conv_op,
ref=ref, **kwargs)
def run_gradweight(self, inputs_shape, filters_shape, output_shape,
gradWeights_fn=conv.AbstractConv2d_gradWeights,
ref=conv2d_corr_gw, **kwargs):
super(BaseTestConv2d, self).run_gradweight(
inputs_shape=inputs_shape,
filters_shape=filters_shape,
output_shape=output_shape,
gradWeights_fn=gradWeights_fn,
ref=ref, **kwargs)
def run_gradinput(self, inputs_shape, filters_shape, output_shape,
gradInputs_fn=conv.AbstractConv2d_gradInputs,
ref=conv2d_corr_gi, **kwargs):
super(BaseTestConv2d, self).run_gradinput(
inputs_shape=inputs_shape,
filters_shape=filters_shape,
output_shape=output_shape,
gradInputs_fn=gradInputs_fn,
ref=ref, **kwargs)
class TestCorrConv2d(BaseTestConv2d):
@classmethod
def setup_class(cls):
......@@ -483,6 +593,187 @@ class TestCpuConv2d(BaseTestConv2d):
filter_dilation=fd)
class BaseTestConv3d(BaseTestConv):
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == '':
raise SkipTest("BLAS required for reference")
cls.inputs_shapes = [(2, 1, 6, 6, 6), (2, 2, 7, 5, 6)]
cls.filters_shapes = [(3, 1, 2, 2, 2), (1, 2, 2, 3, 1)]
cls.subsamples = [(1, 1, 1), (2, 2, 2), (1, 2, 3)]
cls.default_subsamples = (1, 1, 1)
cls.filters_dilations = [(1, 1, 1), (1, 2, 1), (2, 1, 2)]
cls.border_modes = ["valid", "half", "full", (0, 0, 0), (2, 2, 3)]
cls.default_border_mode = (0, 0, 0)
cls.filter_flip = [True, False]
cls.default_filter_flip = True
cls.provide_shape = [True, False]
cls.default_provide_shape = True
cls.shared = staticmethod(theano.compile.shared)
def run_fwd(self, inputs_shape, filters_shape,
conv_fn=conv.conv3d, conv_op=conv.AbstractConv3d,
ref=conv3d_corr, **kwargs):
super(BaseTestConv3d, self).run_fwd(
inputs_shape=inputs_shape,
filters_shape=filters_shape,
conv_fn=conv_fn,
conv_op=conv_op,
ref=ref, **kwargs)
def run_gradweight(self, inputs_shape, filters_shape, output_shape,
gradWeights_fn=conv.AbstractConv3d_gradWeights,
ref=conv3d_corr_gw, **kwargs):
super(BaseTestConv3d, self).run_gradweight(
inputs_shape=inputs_shape,
filters_shape=filters_shape,
output_shape=output_shape,
gradWeights_fn=gradWeights_fn,
ref=ref, **kwargs)
def run_gradinput(self, inputs_shape, filters_shape, output_shape,
gradInputs_fn=conv.AbstractConv3d_gradInputs,
ref=conv3d_corr_gi, **kwargs):
super(BaseTestConv3d, self).run_gradinput(
inputs_shape=inputs_shape,
filters_shape=filters_shape,
output_shape=output_shape,
gradInputs_fn=gradInputs_fn,
ref=ref, **kwargs)
class TestCorrConv3d(BaseTestConv3d):
@classmethod
def setup_class(cls):
if theano.config.blas.ldflags == "":
raise SkipTest()
BaseTestConv3d.setup_class()
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
o = self.get_output_shape(i, f, s, b, fd)
if (not theano.config.blas.ldflags or
not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"):
raise SkipTest("Need blas to test conv3d")
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, provide_shape=provide_shape,
border_mode=b, filter_flip=flip,
target_op=Corr3dMM, check_trace=True,
filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=True,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=Corr3dMM_gradWeights,
check_trace=True, filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=True,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=Corr3dMM_gradInputs,
check_trace=True, filter_dilation=fd)
class TestCpuConv3d(BaseTestConv3d):
@classmethod
def setup(cls):
BaseTestConv3d.setup_class()
cls.mode = theano.compile.mode.get_default_mode().excluding('conv_gemm')
cls.opt_err = theano.config.on_opt_error
theano.config.on_opt_error = 'ignore'
@classmethod
def tearDown(cls):
theano.config.on_opt_error = cls.opt_err
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
if fd != (1, 1, 1):
raise SkipTest("No dilation implementation for basic cpu Conv3D.")
mode = self.mode
o = self.get_output_shape(i, f, s, b, fd)
fwd_OK = True
gradweight_OK = True
gradinput_OK = True
if b not in ((0, 0, 0), 'valid'):
fwd_OK = False
gradweight_OK = False
gradinput_OK = False
if fwd_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv3d")
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
mode=mode, provide_shape=provide_shape,
border_mode=b, filter_flip=flip, target_op=Conv3D,
check_trace=True, filter_dilation=fd)
else:
assert_raises(AssertionError,
self.run_fwd,
inputs_shape=i,
filters_shape=f,
subsample=s,
verify_grad=False,
mode=mode,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True,
filter_dilation=fd)
if gradweight_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv3d")
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=ConvGrad3D,
check_trace=True,
filter_dilation=fd)
else:
assert_raises(AssertionError,
self.run_gradweight,
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=False,
mode=mode,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True,
filter_dilation=fd)
if gradinput_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv3d")
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=ConvTransp3D,
check_trace=True,
filter_dilation=fd)
else:
assert_raises(AssertionError,
self.run_gradinput,
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=False,
mode=mode,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True,
filter_dilation=fd)
def test_constant_shapes():
# Check that the `imshp` and `kshp` parameters of the AbstractConv Ops
# are rejected if not constant or None
......
......@@ -10,7 +10,6 @@ import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr, conv
from theano.tensor.basic import _allclose
class TestCorr2D(utt.InferShapeTester):
......@@ -132,7 +131,7 @@ class TestCorr2D(utt.InferShapeTester):
icol:icol + dil_fil_shape2d[1]:filter_dilation[1]] * filter2d[::-1, ::-1]
).sum()
self.assertTrue(_allclose(theano_output, ref_output))
utt.assert_allclose(theano_output, ref_output)
# TEST GRADIENT
if verify_grad:
......
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
from nose.plugins.attrib import attr
from nose.tools import assert_equals
import numpy
from six import integer_types
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr3d, conv
class TestCorr3D(utt.InferShapeTester):
if theano.config.mode == "FAST_COMPILE":
mode = theano.compile.get_mode("FAST_RUN")
else:
mode = None
dtype = theano.config.floatX
def setUp(self):
super(TestCorr3D, self).setUp()
self.input = T.tensor5('input', dtype=self.dtype)
self.input.name = 'default_V'
self.filters = T.tensor5('filters', dtype=self.dtype)
self.filters.name = 'default_filters'
if not conv.imported_scipy_signal and theano.config.cxx == "":
raise SkipTest("Corr3dMM tests need SciPy or a c++ compiler")
if not theano.config.blas.ldflags:
raise SkipTest("Corr3dMM tests need a BLAS")
def validate(self, image_shape, filter_shape,
border_mode='valid', subsample=(1, 1, 1),
input=None, filters=None, verify_grad=True,
non_contiguous=False, filter_dilation=(1, 1, 1)):
"""
:param image_shape: The constant shape info passed to corr3dMM.
:param filter_shape: The constant shape info passed to corr3dMM.
"""
N_image_shape = [T.get_scalar_constant_value(T.as_tensor_variable(x))
for x in image_shape]
N_filter_shape = [T.get_scalar_constant_value(T.as_tensor_variable(x))
for x in filter_shape]
if input is None:
input = self.input
if filters is None:
filters = self.filters
# THEANO IMPLEMENTATION
# we create a symbolic function so that verify_grad can work
def sym_Corr3dMM(input, filters):
# define theano graph and function
input.name = 'input'
filters.name = 'filters'
rval = corr3d.Corr3dMM(border_mode, subsample,
filter_dilation)(input, filters)
rval.name = 'corr_output'
return rval
output = sym_Corr3dMM(input, filters)
output.name = 'Corr3dMM()(%s,%s)' % (input.name, filters.name)
theano_corr = theano.function([input, filters], output, mode=self.mode)
# initialize input and compute result
image_data = numpy.random.random(N_image_shape).astype(self.dtype)
filter_data = numpy.random.random(N_filter_shape).astype(self.dtype)
image_data /= 10
filter_data /= 10
if non_contiguous:
image_data = numpy.transpose(image_data, axes=(0, 1, 4, 3, 2))
image_data = image_data.copy()
image_data = numpy.transpose(image_data, axes=(0, 1, 4, 3, 2))
filter_data = numpy.transpose(filter_data, axes=(0, 1, 4, 3, 2))
filter_data = filter_data.copy()
filter_data = numpy.transpose(filter_data, axes=(0, 1, 4, 3, 2))
assert not image_data.flags['CONTIGUOUS']
assert not filter_data.flags['CONTIGUOUS']
theano_output = theano_corr(image_data, filter_data)
# REFERENCE IMPLEMENTATION
# Testing correlation, not convolution. Reverse filters.
filter_data_corr = numpy.array(filter_data[:, :, ::-1, ::-1, ::-1],
copy=True,
order='C')
orig_image_data = image_data
img_shape3d = numpy.array(N_image_shape[-3:])
fil_shape3d = numpy.array(N_filter_shape[-3:])
dil_shape3d = numpy.array(filter_dilation)
dil_fil_shape3d = (fil_shape3d - 1) * dil_shape3d + 1
subsample3d = numpy.array(subsample)
if border_mode == 'full':
padHWD = (dil_fil_shape3d - 1)
elif border_mode == 'valid':
padHWD = numpy.array([0, 0, 0])
elif border_mode == 'half':
padHWD = numpy.floor(dil_fil_shape3d / 2).astype('int32')
elif isinstance(border_mode, tuple):
padHWD = numpy.array(border_mode)
elif isinstance(border_mode, integer_types):
padHWD = numpy.array([border_mode, border_mode, border_mode])
else:
raise NotImplementedError('Unsupported border_mode {}'.format(border_mode))
out_shape3d = numpy.floor((img_shape3d + 2 * (padHWD) - dil_fil_shape3d) / subsample3d) + 1
# avoid numpy deprecation
out_shape3d = out_shape3d.astype('int32')
out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape3d)
ref_output = numpy.zeros(out_shape)
image_data2 = numpy.zeros((N_image_shape[0], N_image_shape[1],
N_image_shape[2] + 2 * padHWD[0],
N_image_shape[3] + 2 * padHWD[1],
N_image_shape[4] + 2 * padHWD[2]))
image_data2[:, :,
padHWD[0]:padHWD[0] + N_image_shape[2],
padHWD[1]:padHWD[1] + N_image_shape[3],
padHWD[2]:padHWD[2] + N_image_shape[4]] = image_data
image_data = image_data2
N_image_shape = image_data.shape
for bb in range(N_image_shape[0]):
for nn in range(N_filter_shape[0]):
for im0 in range(N_image_shape[1]):
filter3d = filter_data_corr[nn, im0, :, :, :]
image3d = image_data[bb, im0, :, :, :]
for row in range(ref_output.shape[2]):
irow = row * subsample[0] # image row
for col in range(ref_output.shape[3]):
icol = col * subsample[1] # image col
for slc in range(ref_output.shape[4]):
islc = slc * subsample[2] # image slice
ref_output[bb, nn, row, col, slc] += (image3d[
irow:irow + dil_fil_shape3d[0]:filter_dilation[0],
icol:icol + dil_fil_shape3d[1]:filter_dilation[1],
islc:islc + dil_fil_shape3d[2]:filter_dilation[2]
] * filter3d[::-1, ::-1, ::-1]
).sum()
utt.assert_allclose(theano_output, ref_output)
# TEST GRADIENT
if verify_grad:
utt.verify_grad(sym_Corr3dMM, [orig_image_data, filter_data],
mode=self.mode)
@attr('slow')
def test_basic(self):
"""
Tests that basic correlations work for odd and even
dimensions of image and filter shapes, as well as rectangular
images and filters.
"""
border_modes = ['valid', 'full', 'half', (1, 1, 1),
(2, 1, 1), (1, 2, 1), (1, 1, 2),
(3, 3, 3), 1]
img_shapes = [(2, 2, 3, 3, 3), (3, 2, 8, 8, 8), (3, 2, 7, 5, 5), (3, 2, 7, 5, 5),
(1, 2, 8, 8, 8), (1, 2, 7, 5, 5)]
fil_shapes = [(2, 2, 2, 2, 2), (1, 2, 5, 5, 5), (2, 2, 2, 3, 2), (2, 2, 3, 2, 2),
(1, 2, 5, 5, 5), (1, 2, 2, 3, 3)]
for border_mode in border_modes:
for img, fil in zip(img_shapes, fil_shapes):
self.validate(img, fil, border_mode, verify_grad=False)
        # Very slow with 'full' or 'half'
self.validate((1, 10, 213, 129, 129), (46, 10, 212, 1, 1), 'valid', verify_grad=False)
def test_img_kernel_same_shape(self):
self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), 'full')
self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), 'valid')
self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), 'half')
self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), (1, 1, 1))
self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), 1)
@attr('slow')
def test_subsample(self):
"""
Tests correlation where subsampling != (1,1,1)
"""
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'valid', subsample=(2, 2, 2))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'valid', subsample=(2, 1, 1))
self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 'valid', subsample=(3, 3, 3))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'full', subsample=(2, 2, 2))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'full', subsample=(2, 1, 1))
self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 'full', subsample=(3, 3, 3))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'half', subsample=(2, 2, 2))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'half', subsample=(2, 1, 1))
self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 'half', subsample=(3, 3, 3))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (1, 1, 1), subsample=(2, 2, 2))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (2, 1, 1), subsample=(2, 1, 1))
self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), (1, 2, 2), subsample=(3, 3, 3))
self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 1, subsample=(3, 3, 3))
def test_filter_dilation(self):
"""
Tests correlation where filter dilation != (1,1,1)
"""
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'valid', filter_dilation=(2, 2, 2))
self.validate((3, 2, 14, 10, 10), (2, 2, 2, 3, 3), 'valid', filter_dilation=(3, 1, 1))
self.validate((1, 1, 14, 14, 14), (1, 1, 3, 3, 3), 'valid', filter_dilation=(2, 3, 3))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'full', filter_dilation=(2, 2, 2))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'full', filter_dilation=(3, 1, 1))
self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 'full', filter_dilation=(2, 3, 3))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'half', filter_dilation=(2, 2, 2))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'half', filter_dilation=(3, 1, 1))
self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 'half', filter_dilation=(2, 3, 3))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (1, 1, 1), filter_dilation=(2, 2, 2))
self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (2, 1, 1), filter_dilation=(2, 1, 1))
self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), (1, 2, 1), filter_dilation=(1, 2, 1))
self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), (1, 1, 2), filter_dilation=(1, 1, 2))
self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 1, subsample=(3, 3, 3), filter_dilation=(2, 2, 2))
@attr('slow')
def test_shape_Constant_tensor(self):
"""
Tests correlation where the {image,filter}_shape is a Constant tensor.
"""
as_t = T.as_tensor_variable
border_modes = ['valid', 'full', 'half', (1, 1, 1), (2, 1, 1),
(1, 2, 1), (1, 1, 2), (3, 3, 3), 1]
for border_mode in border_modes:
self.validate((as_t(3), as_t(2), as_t(7), as_t(5), as_t(5)),
(5, 2, 2, 3, 3), border_mode)
self.validate(as_t([3, 2, 7, 5, 5]), (5, 2, 2, 3, 3), border_mode)
self.validate(as_t((3, 2, 7, 5, 5)), (5, 2, 2, 3, 3), border_mode)
self.validate((3, 2, 7, 5, 5), (as_t(5), as_t(2), as_t(2),
as_t(3), as_t(3)), 'valid')
self.validate((3, 2, 7, 5, 5), as_t([5, 2, 2, 3, 3]), border_mode)
self.validate(as_t([3, 2, 7, 5, 5]), as_t([5, 2, 2, 3, 3]), border_mode)
def test_invalid_filter_shape(self):
"""
Tests scenario where filter_shape[1] != input_shape[1]
"""
self.assertRaises(ValueError, self.validate,
(3, 2, 8, 8, 8), (4, 3, 5, 5, 8),
'valid')
def test_full_mode(self):
"""
Tests basic correlation in full mode and case where filter
is larger than the input image.
"""
self.validate((3, 2, 5, 5, 5), (4, 2, 8, 8, 8), 'full')
def f():
self.validate((3, 2, 5, 5, 5), (4, 2, 8, 8, 8), 'valid')
self.assertRaises(Exception, f)
def test_wrong_input(self):
"""
Make sure errors are raised when image and kernel are not 5D tensors
"""
self.assertRaises(Exception, self.validate, (3, 2, 8, 8, 8), (4, 2, 5, 5, 5),
'valid', input=T.dmatrix())
self.assertRaises(Exception, self.validate, (3, 2, 8, 8, 8), (4, 2, 5, 5, 5),
'valid', filters=T.dvector())
self.assertRaises(Exception, self.validate, (3, 2, 8, 8, 8), (4, 2, 5, 5, 5),
'valid', input=T.dtensor3())
self.assertRaises(Exception, self.validate, (3, 2, 8, 8, 8), (4, 2, 5, 5, 5),
'valid', input=T.dtensor4())
def test_dtype_upcast(self):
"""
Checks dtype upcast for Corr3dMM methods.
"""
def rand(shape, dtype='float64'):
r = numpy.asarray(numpy.random.rand(*shape), dtype=dtype)
return r * 2 - 1
ops = [corr3d.Corr3dMM, corr3d.Corr3dMM_gradWeights, corr3d.Corr3dMM_gradInputs]
a_shapes = [[4, 5, 6, 3, 3], [1, 5, 6, 3, 3], [1, 5, 6, 3, 3]]
b_shapes = [[7, 5, 3, 2, 2], [1, 5, 3, 1, 1], [7, 1, 3, 1, 1]]
dtypes = ['float32', 'float64']
for op, a_shape, b_shape in zip(ops, a_shapes, b_shapes):
for a_dtype in dtypes:
for b_dtype in dtypes:
c_dtype = theano.scalar.upcast(a_dtype, b_dtype)
a_tens = T.tensor5(dtype=a_dtype)
b_tens = T.tensor5(dtype=b_dtype)
a_tens_val = rand(a_shape, dtype=a_dtype)
b_tens_val = rand(b_shape, dtype=b_dtype)
c_tens = op()(a_tens, b_tens)
f = theano.function([a_tens, b_tens], c_tens, mode=self.mode)
assert_equals(f(a_tens_val, b_tens_val).dtype, c_dtype)
@attr('slow')
def test_infer_shape_forward(self):
if theano.config.mode == "FAST_COMPILE":
raise SkipTest("Corr3dMM doesn't work in FAST_COMPILE")
def rand(*shape):
r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
return r * 2 - 1
corr3dMM = corr3d.Corr3dMM
adtens = T.dtensor5()
bdtens = T.dtensor5()
aivec_vals = [[4, 5, 6, 3, 3], [6, 2, 8, 3, 3], [3, 6, 7, 5, 5],
[3, 6, 7, 5, 5], [5, 2, 4, 3, 3]]
bivec_vals = [[7, 5, 3, 2, 2], [4, 2, 5, 3, 3], [5, 6, 3, 2, 2],
[5, 6, 2, 3, 3], [6, 2, 4, 3, 3]]
modes = ['valid', 'full', 'half', (1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2), 1]
subsamples = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)]
for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
for mode in modes:
for subsample in subsamples:
# Corr3dMM
cdtens = corr3dMM(border_mode=mode, subsample=subsample)(adtens, bdtens)
self._compile_and_check([adtens, bdtens],
[cdtens],
[adtens_val, bdtens_val], corr3dMM,
warn=False)
@attr('slow')
def test_infer_shape_gradW(self):
if theano.config.mode == "FAST_COMPILE":
raise SkipTest("Corr3dMM doesn't work in FAST_COMPILE")
def rand(*shape):
r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
return r * 2 - 1
corr3dMM = corr3d.Corr3dMM
gradW = corr3d.Corr3dMM_gradWeights
adtens = T.dtensor5()
bdtens = T.dtensor5()
aivec_vals = [[1, 5, 6, 3, 3], [8, 2, 7, 3, 3], [1, 6, 9, 4, 4],
[9, 6, 8, 5, 5], [9, 1, 6, 8, 8]]
bivec_vals = [[7, 5, 3, 1, 1], [4, 2, 5, 3, 3], [12, 6, 3, 2, 2],
[5, 6, 1, 3, 3], [11, 1, 3, 3, 3]]
modes = ['valid', 'full', 'half', (1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2), 1]
subsamples = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)]
for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
for mode in modes:
for subsample in subsamples:
# Corr3dMM
cdtens = corr3dMM(border_mode=mode, subsample=subsample)(adtens, bdtens)
f = theano.function([adtens, bdtens], cdtens)
cdtens_val = f(adtens_val, bdtens_val)
# Corr3dMM_gradWeights
shape = (theano.shared(bivec_val[2]), theano.shared(bivec_val[3]),
theano.shared(bivec_val[4]))
bdtens_g = gradW(border_mode=mode,
subsample=subsample)(adtens, cdtens, shape=shape)
self._compile_and_check([adtens, cdtens],
[bdtens_g],
[adtens_val, cdtens_val], gradW,
warn=False)
@attr('slow')
def test_infer_shape_gradI(self):
if theano.config.mode == "FAST_COMPILE":
raise SkipTest("Corr3dMM doesn't work in FAST_COMPILE")
def rand(*shape):
r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
return r * 2 - 1
corr3dMM = corr3d.Corr3dMM
gradI = corr3d.Corr3dMM_gradInputs
adtens = T.dtensor5()
bdtens = T.dtensor5()
aivec_vals = [[1, 5, 6, 3, 3], [8, 2, 7, 3, 3], [1, 6, 9, 4, 4],
[9, 6, 8, 5, 5], [9, 1, 6, 8, 8]]
bivec_vals = [[7, 5, 3, 1, 1], [4, 2, 5, 3, 3], [12, 6, 3, 2, 2],
[5, 6, 1, 3, 3], [7, 1, 3, 4, 4]]
modes = ['valid', 'full', 'half', (1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2), 1]
subsamples = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)]
for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
adtens_val = rand(*aivec_val)
bdtens_val = rand(*bivec_val)
for mode in modes:
for subsample in subsamples:
# Corr3dMM
cdtens = corr3dMM(border_mode=mode, subsample=subsample)(adtens, bdtens)
f = theano.function([adtens, bdtens], cdtens)
cdtens_val = f(adtens_val, bdtens_val)
# Corr3dMM_gradInputs
shape = (theano.shared(aivec_val[2]), theano.shared(aivec_val[3]),
theano.shared(aivec_val[4]))
adtens_g = gradI(border_mode=mode,
subsample=subsample)(bdtens, cdtens, shape=shape)
self._compile_and_check([bdtens, cdtens],
[adtens_g],
[bdtens_val, cdtens_val], gradI,
warn=False)
def test_non_contiguous(self):
self.validate((2, 2, 3, 3, 3), (2, 2, 2, 2, 2), 'valid', non_contiguous=True)
self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), 'valid', non_contiguous=True)
self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), 'valid', non_contiguous=True)
self.validate((3, 2, 7, 5, 5), (5, 2, 3, 2, 2), 'valid', non_contiguous=True)
self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), 'full', non_contiguous=True)
self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), 'full', non_contiguous=True)
self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), 'half', non_contiguous=True)
self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), 'half', non_contiguous=True)
self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), (1, 1, 1), non_contiguous=True)
self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), (1, 1, 2), non_contiguous=True)
self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), (1, 2, 1), non_contiguous=True)
self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), (2, 1, 1), non_contiguous=True)
self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), 2, non_contiguous=True)
if __name__ == '__main__':
t = TestCorr3D('setUp')
t.setUp()
t.test_infer_shape_forward()
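The reference implementation in `validate` derives the output spatial shape as `floor((img + 2*pad - dilated_filter) / subsample) + 1` per axis, with the padding determined by `border_mode`. A minimal standalone sketch of that calculation (the helper name `corr3d_out_shape` is hypothetical, for illustration only, not part of the tested module):

```python
def corr3d_out_shape(img_shape, fil_shape, border_mode='valid',
                     subsample=(1, 1, 1), filter_dilation=(1, 1, 1)):
    """Output spatial shape of a 3D correlation (illustrative sketch)."""
    # effective (dilated) filter extent along each axis
    dil = [(f - 1) * d + 1 for f, d in zip(fil_shape, filter_dilation)]
    if border_mode == 'valid':
        pad = [0, 0, 0]
    elif border_mode == 'full':
        pad = [k - 1 for k in dil]
    elif border_mode == 'half':
        pad = [k // 2 for k in dil]
    elif isinstance(border_mode, int):
        pad = [border_mode] * 3
    else:
        pad = list(border_mode)
    # floor((img + 2*pad - dilated_filter) / subsample) + 1, per axis
    return tuple((i + 2 * p - k) // s + 1
                 for i, p, k, s in zip(img_shape, pad, dil, subsample))
```

For example, a (7, 5, 5) image with a (2, 3, 3) filter in 'valid' mode gives (6, 3, 3), matching the shapes exercised by `test_subsample` and `test_filter_dilation` above.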