Commit 146ef971, authored by: Frédéric Bastien, committed by: GitHub

Merge pull request #4862 from gvtulder/f-abstractconv3d

Adding an AbstractConv3d interface
@@ -31,13 +31,18 @@
that will be replaced by an actual convolution implementation during
the optimization phase.
As of October 2016 (version 0.9.0dev3), there is also a conv3d interface that provides
a similar operation for 3D convolution. :func:`nnet.conv3d <theano.tensor.nnet.conv3d>`
defines the abstract Theano graph convolution operation
:func:`nnet.abstract_conv.AbstractConv3d <theano.tensor.nnet.abstract_conv.AbstractConv3d>`.
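The operation that this abstract Op stands for can be sketched in plain NumPy. This is a minimal 'valid'-mode reference with kernel flipping; the helper name ``conv3d_ref`` is our own illustration, not part of Theano:

```python
import numpy as np

def conv3d_ref(img, kern):
    """Naive 3D 'valid' convolution.

    img:  (batch, channels, d1, d2, d3)
    kern: (nkern, channels, k1, k2, k3)
    True convolution flips the kernel along every spatial axis.
    """
    b, c, d1, d2, d3 = img.shape
    n, _, k1, k2, k3 = kern.shape
    flipped = kern[:, :, ::-1, ::-1, ::-1]
    out = np.zeros((b, n, d1 - k1 + 1, d2 - k2 + 1, d3 - k3 + 1))
    for i in range(out.shape[2]):
        for j in range(out.shape[3]):
            for k in range(out.shape[4]):
                patch = img[:, :, i:i + k1, j:j + k2, k:k + k3]
                # contract over channels and the kernel volume
                out[:, :, i, j, k] = np.einsum('bcxyz,ncxyz->bn',
                                               patch, flipped)
    return out

# 1D-like check: input [0, 1, 2] convolved with kernel [1, 2] gives [1, 4]
img = np.arange(3, dtype='float64').reshape(1, 1, 3, 1, 1)
kern = np.array([1., 2.]).reshape(1, 1, 2, 1, 1)
print(conv3d_ref(img, kern).ravel())  # → [1. 4.]
```

The optimized implementations below compute the same quantity, just without the Python loops.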
Since the abstract Op does not have any implementation, it will prevent
computations in the un-optimized graph, and cause problems with DebugMode,
test values, and when compiling with optimizer=None.

By default, if :ref:`cuDNN <libdoc_cuda_dnn>` is available, we will use it;
otherwise we will fall back to using the gemm version (slower than cuDNN in
most cases and uses more memory). Either the cuDNN or the gemm version can be
disabled using the Theano flags
``optimizer_excluding=conv_dnn`` and ``optimizer_excluding=conv_gemm``,
@@ -51,9 +56,9 @@
option. Disabling the gemm version is only useful if cuDNN is unavailable
and you run out of GPU memory.

There are two other implementations of 2D convolution: an FFT-based
convolution integrated into Theano, and an implementation by Alex Krizhevsky
available via Pylearn2. See the documentation below on how to use them.

The old conv2d interface is still accessible through :func:`nnet.conv.conv2d <theano.tensor.nnet.conv.conv2d>`.
@@ -146,8 +151,8 @@ TODO: Give examples on how to use these things! They are pretty complicated.
``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment.

- :func:`dnn_conv <theano.sandbox.cuda.dnn.dnn_conv>` GPU-only
convolution using NVIDIA's cuDNN library. This requires that you have
cuDNN 4.0 or newer installed and available, which in turn requires CUDA 7.0
and a GPU with compute capability 3.0 or more.

If cuDNN is available, by default, Theano will replace all nnet.conv2d
operations with dnn_conv. To explicitly disable it, set
@@ -190,12 +195,31 @@ TODO: Give examples on how to use these things! They are pretty complicated.
please see the warning about a bug in CUDA 5.0 to 6.0
in :func:`GpuCorrMM <theano.sandbox.cuda.blas.GpuCorrMM>`.
- :func:`Corr3dMM <theano.tensor.nnet.corr3d.Corr3dMM>`
This is a CPU-only 3d correlation implementation based on
the 2d version (:func:`CorrMM <theano.tensor.nnet.corr.CorrMM>`).
It does not flip the kernel. As it provides a gradient, you can use it as a
replacement for nnet.conv3d. For convolutions done on CPU,
nnet.conv3d will be replaced by Corr3dMM. To explicitly disable it, set
``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment.
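The distinction between correlation (no kernel flip, as in Corr3dMM) and true convolution is just a reversal of the kernel along every spatial axis. A one-dimensional NumPy sketch of that equivalence (our own illustration, not Theano code):

```python
import numpy as np

def correlate1d(x, w):
    """'valid' cross-correlation: slide w over x without flipping it."""
    n = len(x) - len(w) + 1
    return np.array([np.dot(x[i:i + len(w)], w) for i in range(n)])

x = np.array([1., 2., 3., 4.])
w = np.array([1., 0., -1.])

# True convolution equals correlation with the kernel reversed.
assert np.allclose(np.convolve(x, w, 'valid'), correlate1d(x, w[::-1]))
print(np.convolve(x, w, 'valid'))  # → [2. 2.]
```

In 3D the same relation holds with ``kern[:, :, ::-1, ::-1, ::-1]``, which is exactly the flip applied around Corr3dMM when ``filter_flip`` is set.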
- :func:`dnn_conv3d <theano.sandbox.cuda.dnn.dnn_conv3d>` GPU-only
convolution using NVIDIA's cuDNN library. This requires that you have
cuDNN 4.0 or newer installed and available, which in turn requires CUDA 7.0
and a GPU with compute capability 3.0 or more.
If cuDNN is available, by default, Theano will replace all nnet.conv3d
operations with dnn_conv3d. To explicitly disable it, set
``THEANO_FLAGS=optimizer_excluding=conv_dnn`` in your environment.
As dnn_conv3d has a gradient defined, you can also use it manually.
- :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`
Another conv3d implementation that uses conv2d with data reshaping.
It is faster in some cases than conv3d, and works on the GPU.
It flips the kernel.
.. autofunction:: theano.tensor.nnet.conv2d
.. autofunction:: theano.tensor.nnet.conv3d
.. autofunction:: theano.sandbox.cuda.fftconv.conv2d_fft
.. autofunction:: theano.tensor.nnet.Conv3D.conv3D
.. autofunction:: theano.sandbox.cuda.fftconv.conv3d_fft
......
@@ -53,7 +53,7 @@ PLATFORMS = ["Windows", "Linux", "Solaris", "Mac OS-X", "Unix"]
MAJOR = 0
MINOR = 9
MICRO = 0
SUFFIX = "dev3"  # Should be blank except for rc's, betas, etc.
ISRELEASED = False
VERSION = '%d.%d.%d%s' % (MAJOR, MINOR, MICRO, SUFFIX)
......
Diff is collapsed.
@@ -24,7 +24,10 @@ from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
                                              AbstractConv2d_gradWeights,
                                              AbstractConv2d_gradInputs,
                                              AbstractConv3d,
                                              AbstractConv3d_gradWeights,
                                              AbstractConv3d_gradInputs)
from theano.tests.breakpoint import PdbBreakpoint
@@ -1297,18 +1300,24 @@ def local_inplace_sparseblockouter(node):
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@op_lifter([AbstractConv2d,
            AbstractConv2d_gradWeights,
            AbstractConv2d_gradInputs,
            AbstractConv3d,
            AbstractConv3d_gradWeights,
            AbstractConv3d_gradInputs])
def local_gpua_abstractconv(op, context_name, inputs, outputs):
    if isinstance(outputs[0].type, GpuArrayType):
        # Don't handle this node here, it's already on the GPU.
        return
    return local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs)


@register_opt2([AbstractConv2d,
                AbstractConv2d_gradWeights,
                AbstractConv2d_gradInputs,
                AbstractConv3d,
                AbstractConv3d_gradWeights,
                AbstractConv3d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_gpua_lift_abstractconv_graph(op, context_name, inputs, outputs):
    inps = list(inputs)
    inps[0] = as_gpuarray_variable(inputs[0],
                                   context_name=context_name)
......
@@ -47,6 +47,39 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
                           filter_flip=flip, target_op=GpuDnnConvGradI)
class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
    @classmethod
    def setup_class(cls):
        test_abstract_conv.BaseTestConv3d.setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # provide_shape is not used by the cuDNN implementation
        cls.provide_shape = [False]

    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
        if not dnn_available(test_ctx_name):
            raise SkipTest(dnn_available.msg)
        mode = mode_with_gpu
        if fd != (1, 1, 1):
            raise SkipTest("Doesn't have CUDNN implementation")
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                     verify_grad=True, mode=mode,
                     provide_shape=provide_shape, border_mode=b,
                     filter_flip=flip, target_op=GpuDnnConv)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s,
                            verify_grad=True, mode=mode,
                            provide_shape=provide_shape, border_mode=b,
                            filter_flip=flip, target_op=GpuDnnConvGradW)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode,
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip, target_op=GpuDnnConvGradI)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
    def setUp(self):
        self.input = gpu_ftensor4()
......
@@ -779,6 +779,201 @@ def test_dnn_conv_grad():
    utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def get_conv3d_test_cases():
    # Every element of test_shapes follows the format
    # [input_shape, filter_shape, subsample]
    test_shapes = [[(128, 3, 5, 5, 5), (64, 3, 1, 2, 4), (1, 1, 1)],
                   [(8, 4, 20, 12, 15), (5, 4, 6, 12, 4), (2, 2, 2)],
                   [(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 3, 3)],
                   [(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 2, 1)],
                   [(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 2, 1)],
                   # Test with 1x1x1 filters
                   [(8, 1, 10, 10, 10), (10, 1, 1, 1, 1), (1, 1, 1)],
                   # Test with dimensions larger than 1024 (thread block dim)
                   [(1025, 1, 2, 3, 4), (5, 1, 1, 2, 3), (1, 1, 1)],
                   [(8, 1, 2, 3, 4), (1025, 1, 1, 2, 3), (1, 1, 1)],
                   [(8, 1025, 2, 3, 4), (5, 1025, 1, 1, 2), (1, 1, 1)],
                   [(8, 1, 1030, 3, 4), (5, 1, 1025, 1, 1), (1, 1, 1)],
                   [(8, 1, 2, 1030, 4), (5, 1, 2, 1025, 1), (1, 1, 1)],
                   [(8, 1, 2, 3, 1030), (5, 1, 1, 2, 1025), (1, 1, 1)],
                   # The equivalent of this caused a crash with conv2d
                   [(1, 1, 1, 44800, 1), (6, 1, 1, 1, 1), (1, 1, 1)]]

    # With border mode 'full', test with kernel bigger than image in some/all
    # dimensions
    test_shapes_full = [[(6, 2, 2, 2, 2), (4, 2, 3, 1, 1), (1, 1, 1)],
                        [(6, 2, 2, 2, 2), (4, 2, 1, 3, 1), (1, 1, 1)],
                        [(6, 2, 2, 2, 2), (4, 2, 1, 1, 3), (1, 1, 1)],
                        [(6, 2, 2, 2, 2), (4, 2, 5, 5, 5), (1, 1, 1)]]
    border_modes = ['valid', 'full', 'half', (1, 2, 3), (3, 2, 1), 1, 2]
    conv_modes = ['conv', 'cross']

    itt = chain(product(test_shapes, border_modes, conv_modes),
                product(test_shapes_full, ['full'], conv_modes))
    return itt
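For the shapes above, the expected spatial output size follows the usual formula out = (in + 2*pad - filter) // stride + 1. A small helper (our own, for illustration) applied to two of the test cases:

```python
def conv3d_out_shape(in_shape, f_shape, subsample, pad=(0, 0, 0)):
    """Spatial output shape of a 3D convolution.

    in_shape: (batch, channels, d1, d2, d3)
    f_shape:  (nkern, channels, k1, k2, k3)
    """
    return tuple((i + 2 * p - f) // s + 1
                 for i, f, s, p in zip(in_shape[2:], f_shape[2:],
                                       subsample, pad))

# First 'valid' test case: input (128, 3, 5, 5, 5), filters (64, 3, 1, 2, 4)
print(conv3d_out_shape((128, 3, 5, 5, 5), (64, 3, 1, 2, 4), (1, 1, 1)))
# → (5, 4, 2)

# Strided case: input (8, 4, 20, 12, 15), filters (5, 4, 6, 12, 4), stride 2
print(conv3d_out_shape((8, 4, 20, 12, 15), (5, 4, 6, 12, 4), (2, 2, 2)))
# → (8, 1, 6)
```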
def test_conv3d_fwd():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    def run_conv3d_fwd(inputs_shape, filters_shape, subsample,
                       border_mode, conv_mode):

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        # Scale down the input values to prevent very large absolute errors
        # due to float rounding
        inputs_val /= 10
        filters_val /= 10

        inputs = theano.shared(inputs_val)
        filters = theano.shared(filters_val)
        bias = theano.shared(numpy.zeros(filters_shape[0]).astype('float32'))

        # Compile a theano function for the cuDNN implementation
        conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
                              border_mode=border_mode, subsample=subsample,
                              conv_mode=conv_mode)
        f = theano.function([], conv, mode=mode_with_gpu)

        # If conv_mode is 'conv' the reference implementation should use
        # filters flipped according to the width, height and time axis
        if conv_mode == 'conv':
            flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
        else:
            flipped_filters = filters

        # If border mode is anything but 'valid', the reference implementation
        # should operate on padded inputs
        if border_mode == 'valid':
            padded_inputs = inputs
        else:
            if border_mode == 'full':
                pad_per_dim = [filters_shape[i] - 1 for i in range(2, 5)]
            elif border_mode == 'half':
                pad_per_dim = [filters_shape[i] // 2 for i in range(2, 5)]
            else:
                if isinstance(border_mode, int):
                    pad_per_dim = [border_mode] * 3
                else:
                    pad_per_dim = border_mode

            pad_before_after = ([(0, 0), (0, 0)] +
                                [(p, p) for p in pad_per_dim])
            padded_inputs_val = numpy.pad(inputs_val, pad_before_after,
                                          'constant')
            padded_inputs = theano.shared(padded_inputs_val)

        # Compile a theano function for the reference implementation
        conv_ref = theano.tensor.nnet.conv3D(
            V=padded_inputs.dimshuffle(0, 2, 3, 4, 1),
            W=flipped_filters.dimshuffle(0, 2, 3, 4, 1),
            b=bias, d=subsample)
        f_ref = theano.function([], conv_ref.dimshuffle(0, 4, 1, 2, 3),
                                mode="FAST_RUN")

        # Compare the results of the two implementations
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    test_cases = get_conv3d_test_cases()
    for (i_shape, f_shape, subsample), border_mode, conv_mode in test_cases:
        yield (run_conv3d_fwd, i_shape, f_shape, subsample, border_mode,
               conv_mode)
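The branch on ``border_mode`` in the reference path above can be isolated into a small pure-Python helper (the name ``pad_for_border_mode`` is ours, mirroring the logic rather than any Theano API):

```python
def pad_for_border_mode(filter_shape, border_mode):
    """Padding per spatial dim for a 5D filter shape (n, c, k1, k2, k3)."""
    if border_mode == 'valid':
        return [0, 0, 0]
    if border_mode == 'full':
        # 'full' pads by the full kernel extent minus one
        return [filter_shape[i] - 1 for i in range(2, 5)]
    if border_mode == 'half':
        # 'half' pads by half the kernel extent ("same" for odd kernels)
        return [filter_shape[i] // 2 for i in range(2, 5)]
    if isinstance(border_mode, int):
        return [border_mode] * 3
    return list(border_mode)

print(pad_for_border_mode((5, 4, 6, 12, 4), 'full'))  # → [5, 11, 3]
print(pad_for_border_mode((5, 4, 6, 12, 4), 'half'))  # → [3, 6, 2]
```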
def test_conv3d_bwd():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    def run_conv3d_bwd(inputs_shape, filters_shape, subsample,
                       border_mode, conv_mode):

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = theano.shared(inputs_val)
        filters = theano.shared(filters_val)
        bias = theano.shared(numpy.zeros(filters_shape[0]).astype('float32'))

        # Compile a theano function for the cuDNN implementation
        conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
                              border_mode=border_mode, subsample=subsample,
                              conv_mode=conv_mode)

        grad_i, grad_w = theano.tensor.grad(conv.sum(), [inputs, filters])

        f = theano.function([], [grad_i, grad_w], mode=mode_with_gpu)

        # If conv_mode is 'conv' the reference implementation should use
        # filters flipped according to the width, height and time axis
        if conv_mode == 'conv':
            flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
        else:
            flipped_filters = filters

        # If border mode is anything but 'valid', the reference implementation
        # should operate on padded inputs
        if border_mode == 'valid':
            padded_inputs = inputs
        else:
            if border_mode == 'full':
                pad_per_dim = [filters_shape[i] - 1 for i in range(2, 5)]
            elif border_mode == 'half':
                pad_per_dim = [filters_shape[i] // 2 for i in range(2, 5)]
            else:
                if isinstance(border_mode, int):
                    pad_per_dim = [border_mode] * 3
                else:
                    pad_per_dim = border_mode

            pad_before_after = ([(0, 0), (0, 0)] +
                                [(p, p) for p in pad_per_dim])
            padded_inputs_val = numpy.pad(inputs_val, pad_before_after,
                                          'constant')
            padded_inputs = theano.shared(padded_inputs_val)

        # Compile a theano function for the reference implementation
        conv_ref = theano.tensor.nnet.conv3D(
            V=padded_inputs.dimshuffle(0, 2, 3, 4, 1),
            W=flipped_filters.dimshuffle(0, 2, 3, 4, 1),
            b=bias, d=subsample)
        (grad_padded_i_ref,
         grad_w_ref) = theano.tensor.grad(conv_ref.sum(),
                                          [padded_inputs, filters])

        # Recover grad_i_ref from grad_padded_i_ref
        if border_mode == 'valid':
            grad_i_ref = grad_padded_i_ref
        else:
            shp = grad_padded_i_ref.shape
            grad_i_ref = grad_padded_i_ref[
                :, :,
                pad_per_dim[0]:shp[2] - pad_per_dim[0],
                pad_per_dim[1]:shp[3] - pad_per_dim[1],
                pad_per_dim[2]:shp[4] - pad_per_dim[2]]

        f_ref = theano.function([], [grad_i_ref, grad_w_ref], mode="FAST_RUN")

        # Compare the results of the two implementations.
        # rtol is raised so the test passes with more seeds and big sizes.
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref[0], res[0], rtol=2e-5)
        utt.assert_allclose(res_ref[1], res[1], rtol=2e-5)

    test_cases = get_conv3d_test_cases()
    for (i_shape, f_shape, subsample), border_mode, conv_mode in test_cases:
        yield (run_conv3d_bwd, i_shape, f_shape, subsample, border_mode,
               conv_mode)
def test_version():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
......
@@ -37,7 +37,10 @@ from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
                                              AbstractConv2d_gradWeights,
                                              AbstractConv2d_gradInputs,
                                              AbstractConv3d,
                                              AbstractConv3d_gradWeights,
                                              AbstractConv3d_gradInputs)


def c_define_tensor_desc(desc):
@@ -1232,7 +1235,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
    :warning: dnn_conv3d only works with cuDNN library 3.0

    """
    if border_mode == (0, 0, 0):
        border_mode = 'valid'

    # Establish dtype in which to perform the computation of the convolution
@@ -1319,6 +1322,32 @@ def dnn_gradweight(img, topgrad,
    return GpuDnnConvGradW()(img, topgrad, out, desc)
def dnn_gradweight3d(img, topgrad,
                     kerns_shp,
                     border_mode='valid', subsample=(1, 1, 1),
                     conv_mode='conv'):
    """
    GPU convolution gradient with respect to weight using cuDNN from NVIDIA.

    The memory layout to use is 'bct01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    FIXME parameters doc

    :warning: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.
    """
    img = gpu_contiguous(img)
    topgrad = gpu_contiguous(topgrad)
    kerns_shp = theano.tensor.as_tensor_variable(kerns_shp)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img.shape, kerns_shp)
    out = gpu_alloc_empty(*kerns_shp)
    return GpuDnnConv3dGradW()(img, topgrad, out, desc)
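The quantity this Op computes on the GPU can be checked against a tiny hand-derived example in one dimension (our own illustration, not Theano code): for a 'valid' convolution, the gradient of the summed output with respect to each kernel tap is a sum of shifted input windows.

```python
import numpy as np

# 1D 'valid' convolution of x with a kernel w of length k:
#   out[i] = sum_j x[i + (k - 1 - j)] * w[j]   (true convolution flips w)
# so the gradient of sum(out) wrt w[j] is a sum of shifted inputs.
x = np.array([1., 2., 3., 4.])
k = 2                      # kernel length
n_out = len(x) - k + 1     # 3 output positions

grad_w = np.array([sum(x[i + (k - 1 - j)] for i in range(n_out))
                   for j in range(k)])
print(grad_w)  # → [9. 6.]

# Cross-check by linearity: grad_w[j] == sum(conv(x, e_j)) for unit kernels.
assert np.sum(np.convolve(x, [1., 0.], 'valid')) == grad_w[0]
assert np.sum(np.convolve(x, [0., 1.], 'valid')) == grad_w[1]
```

dnn_gradweight3d performs the same contraction over three spatial dimensions at once.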
def dnn_gradinput(kerns, topgrad,
                  img_shp,
                  border_mode='valid', subsample=(1, 1),
@@ -1346,6 +1375,33 @@ def dnn_gradinput(kerns, topgrad,
    return GpuDnnConvGradI()(kerns, topgrad, out, desc)
def dnn_gradinput3d(kerns, topgrad,
                    img_shp,
                    border_mode='valid', subsample=(1, 1, 1),
                    conv_mode='conv'):
    """
    GPU convolution gradient with respect to input using cuDNN from NVIDIA.

    The memory layout to use is 'bct01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    FIXME parameters doc

    :warning: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.
    """
    kerns = gpu_contiguous(kerns)
    topgrad = gpu_contiguous(topgrad)
    img_shp = theano.tensor.as_tensor_variable(img_shp)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img_shp, kerns.shape)
    out = gpu_alloc_empty(*img_shp)
    return GpuDnnConv3dGradI()(kerns, topgrad, out, desc)
class GpuDnnPoolDesc(GpuOp):
    """
    This Op builds a pooling descriptor for use in the other pooling operations.
@@ -3176,3 +3232,53 @@ def local_abstractconv_cudnn(node):
                          subsample=node.op.subsample,
                          conv_mode=conv_mode)
        return [rval]
@local_optimizer([AbstractConv3d,
                  AbstractConv3d_gradWeights,
                  AbstractConv3d_gradInputs])
def local_abstractconv3d_cudnn(node):
    if (not isinstance(node.op, (AbstractConv3d,
                                 AbstractConv3d_gradWeights,
                                 AbstractConv3d_gradInputs))):
        return None
    if (node.op.filter_dilation != (1, 1, 1)):
        return None

    inp1 = node.inputs[0]
    inp2 = node.inputs[1]

    if (not isinstance(inp1.type, CudaNdarrayType) or
            not isinstance(inp2.type, CudaNdarrayType)):
        return None

    if not dnn_available():
        return None

    if node.op.filter_flip:
        conv_mode = 'conv'
    else:
        conv_mode = 'cross'

    if (isinstance(node.op, AbstractConv3d)):
        rval = dnn_conv3d(inp1, inp2,
                          border_mode=node.op.border_mode,
                          subsample=node.op.subsample,
                          direction_hint='forward',
                          conv_mode=conv_mode)
        return [rval]
    if (isinstance(node.op, AbstractConv3d_gradWeights)):
        shape = (inp2.shape[1], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1], node.inputs[2][2])
        rval = dnn_gradweight3d(inp1, inp2, shape,
                                border_mode=node.op.border_mode,
                                subsample=node.op.subsample,
                                conv_mode=conv_mode)
        return [rval]
    if (isinstance(node.op, AbstractConv3d_gradInputs)):
        shape = (inp2.shape[0], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1], node.inputs[2][2])
        rval = dnn_gradinput3d(inp1, inp2, shape,
                               border_mode=node.op.border_mode,
                               subsample=node.op.subsample,
                               conv_mode=conv_mode)
        return [rval]
@@ -87,10 +87,13 @@ from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D
from theano.tests.breakpoint import PdbBreakpoint
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
                                              AbstractConv2d,
                                              AbstractConv2d_gradWeights,
                                              AbstractConv2d_gradInputs,
                                              AbstractConv3d,
                                              AbstractConv3d_gradWeights,
                                              AbstractConv3d_gradInputs)
from theano.tensor.opt import register_specialize_device
@@ -2726,8 +2729,11 @@ optdb.register('local_inplace_gpu_sparse_block_outer',
@local_optimizer([gpu_from_host,
                  AbstractConv2d,
                  AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs,
                  AbstractConv3d,
                  AbstractConv3d_gradWeights,
                  AbstractConv3d_gradInputs])
def local_conv_gpu_conv(node):
    """
    gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
@@ -2736,7 +2742,7 @@ def local_conv2d_gpu_conv(node):
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op,
                                           BaseAbstractConv):
            conv = host_input.owner.op
            inps = list(host_input.owner.inputs)
@@ -2749,7 +2755,7 @@ def local_conv2d_gpu_conv(node):
            out.tag.values_eq_approx = values_eq_approx_high_tol
            return [out]
    if isinstance(node.op, BaseAbstractConv):
        # conv(host_from_gpu) -> host_from_gpu(gpu_conv)
        inp1 = node.inputs[0]
        inp2 = node.inputs[1]
@@ -2779,7 +2785,7 @@ def local_conv2d_gpu_conv(node):
            return [tensor.as_tensor_variable(out)]
        else:
            return [out]
register_opt()(local_conv_gpu_conv)


# Corrmm opt
@@ -2849,6 +2855,76 @@ def local_abstractconv_gemm(node):
    return [rval]
# Corrmm opt
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm(node):
    if not isinstance(node.op, AbstractConv3d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, CudaNdarrayType) or
            not isinstance(kern.type, CudaNdarrayType)):
        return None

    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    if ((border_mode == 'full') and (subsample == (1, 1, 1))):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
        kern = kern.dimshuffle(1, 0, 2, 3, 4)
        # call GpuCorr3dMM_gradInputs
        rval = GpuCorr3dMM_gradInputs('valid',
                                      subsample,
                                      filter_dilation)(
            gpu_contiguous(kern), gpu_contiguous(img))
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1, ::-1]
        # By default use GpuCorr3dMM
        rval = GpuCorr3dMM(border_mode,
                           subsample,
                           filter_dilation)(gpu_contiguous(img),
                                            gpu_contiguous(kern))
        # call GpuCorr3dMM_gradWeights if good
        # (the latter is faster if
        # batchsize * kernelHeight * kernelWidth * kernelDepth
        # is larger than
        # inputChannels * outputHeight * outputWidth * outputDepth.
        # GpuConv does not always store information on the batchsize and
        # channels, though, so we only use what information we have.)
        if ((subsample == (1, 1, 1)) and (filter_dilation == (1, 1, 1)) and
                (node.op.imshp is not None) and
                (None not in node.op.imshp[-3:]) and
                (node.op.kshp is not None) and
                (None not in node.op.kshp) and
                border_mode != "half"):
            # we know the kernel and output size
            prod1 = node.op.kshp[0] * node.op.kshp[1] * node.op.kshp[2]
            prod2 = ((node.op.imshp[-3] - node.op.kshp[0] + 1) *
                     (node.op.imshp[-2] - node.op.kshp[1] + 1) *
                     (node.op.imshp[-1] - node.op.kshp[2] + 1))
            if (None not in node.op.imshp[:1]):
                # we also know batchsize and input channels
                prod1 *= node.op.imshp[0]
                prod2 *= node.op.imshp[1]
            # compare to decide
            if prod1 > prod2:
                # (we need to wrap the result in as_cuda_ndarray_variable,
                # because we are not allowed to replace a CudaNdarray with
                # a DimShuffle instance in a graph optimization)
                rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
                    GpuCorr3dMM_gradWeights(border_mode,
                                            subsample,
                                            filter_dilation)(
                        gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
                        gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4))
                    ).dimshuffle(1, 0, 2, 3, 4))
    return [rval]
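The prod1/prod2 comparison above can be exercised in isolation. This is a sketch under the assumption that ``kshp`` holds the three spatial kernel dims and ``imshp`` the full 5D input shape, as the optimizer uses them; the function name is ours:

```python
def prefer_gradweights_path(imshp, kshp):
    """Return True when the GpuCorr3dMM_gradWeights formulation is expected
    to win: batchsize * kernel volume > input channels * output volume."""
    prod1 = kshp[0] * kshp[1] * kshp[2]            # kernel volume
    prod2 = ((imshp[-3] - kshp[0] + 1) *           # 'valid' output volume
             (imshp[-2] - kshp[1] + 1) *
             (imshp[-1] - kshp[2] + 1))
    prod1 *= imshp[0]   # times batch size
    prod2 *= imshp[1]   # times input channels
    return prod1 > prod2

# A large batch with small kernels favours the gradWeights formulation
print(prefer_gradweights_path((128, 4, 10, 10, 10), (3, 3, 3)))  # → True
print(prefer_gradweights_path((2, 4, 10, 10, 10), (3, 3, 3)))    # → False
```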
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweight_gemm(node):
    if not isinstance(node.op, AbstractConv2d_gradWeights):
@@ -2869,6 +2945,26 @@ def local_abstractconv_gradweight_gemm(node):
    return [rval]
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweight_gemm(node):
    if not isinstance(node.op, AbstractConv3d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not isinstance(img.type, CudaNdarrayType) or \
            not isinstance(topgrad.type, CudaNdarrayType):
        return None

    rval = GpuCorr3dMM_gradWeights(border_mode=node.op.border_mode,
                                   subsample=node.op.subsample,
                                   filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    if node.op.filter_flip:
        rval = rval[:, :, ::-1, ::-1, ::-1]
    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    rval = as_cuda_ndarray_variable(rval)
    return [rval]
@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node):
    if not isinstance(node.op, AbstractConv2d_gradInputs):
@@ -2887,6 +2983,26 @@ def local_abstractconv_gradinputs_gemm(node):
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node):
    if not isinstance(node.op, AbstractConv3d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not isinstance(kern.type, CudaNdarrayType) or \
            not isinstance(topgrad.type, CudaNdarrayType):
        return None

    if node.op.filter_flip:
        kern = kern[:, :, ::-1, ::-1, ::-1]

    rval = GpuCorr3dMM_gradInputs(border_mode=node.op.border_mode,
                                  subsample=node.op.subsample,
                                  filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]
# Register GPU convolution implementation
# They are tried in a specific order so we can control
# which ones take precedence over others.
...@@ -2899,18 +3015,36 @@ conv_groupopt.register('local_abstractconv_dnn', ...@@ -2899,18 +3015,36 @@ conv_groupopt.register('local_abstractconv_dnn',
dnn.local_abstractconv_cudnn, 20, dnn.local_abstractconv_cudnn, 20,
'conv_dnn', 'conv_dnn',
'gpu', 'fast_compile', 'fast_run', 'cudnn') 'gpu', 'fast_compile', 'fast_run', 'cudnn')
conv_groupopt.register('local_abstractconv3d_dnn',
dnn.local_abstractconv3d_cudnn, 20,
'conv_dnn',
'gpu', 'fast_compile', 'fast_run', 'cudnn')
# The GEMM-based convolution comes last to catch all remaining cases. # The GEMM-based convolution comes last to catch all remaining cases.
# It can be disabled by excluding 'conv_gemm'. # It can be disabled by excluding 'conv_gemm'.
conv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30, conv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
'conv_gemm', 'conv_gemm',
'gpu', 'fast_compile', 'fast_run') 'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gemm', local_abstractconv3d_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradweight_gemm', conv_groupopt.register('local_abstractconv_gradweight_gemm',
local_abstractconv_gradweight_gemm, 30, local_abstractconv_gradweight_gemm, 30,
'conv_gemm', 'conv_gemm',
'gpu', 'fast_compile', 'fast_run') 'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gradweight_gemm',
local_abstractconv3d_gradweight_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradinputs_gemm',
local_abstractconv_gradinputs_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
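The registrations above assign ordered positions (20 for the cuDNN entries, 30 for the gemm fallbacks) so that cuDNN is tried first and gemm catches whatever remains, and either group can be skipped via `optimizer_excluding=conv_dnn` / `optimizer_excluding=conv_gemm`. The following pure-Python sketch illustrates that precedence mechanism only; the `register`/`pick` helpers are hypothetical and not Theano's actual registry API:

```python
# Hypothetical sketch of an ordered optimizer registry: lower positions are
# tried first, and entries carrying an excluded tag are skipped, mirroring
# the effect of optimizer_excluding=conv_dnn / conv_gemm.
registry = []

def register(name, position, *tags):
    registry.append((position, name, set(tags)))

register('local_abstractconv_dnn', 20, 'conv_dnn')
register('local_abstractconv_gemm', 30, 'conv_gemm')

def pick(excluded=()):
    # Return the first applicable implementation, in position order.
    for position, name, tags in sorted(registry):
        if not tags & set(excluded):
            return name
    return None

# cuDNN wins by default; excluding 'conv_dnn' falls back to the gemm version.
```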
...@@ -7,9 +7,11 @@ from theano.sandbox.cuda import float32_shared_constructor as gpu_shared
from theano.sandbox.cuda.dnn import (
dnn_available,
GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI,
GpuDnnConv3d, GpuDnnConv3dGradW, GpuDnnConv3dGradI)
from theano.sandbox.cuda.blas import (
GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs,
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda
...@@ -56,6 +58,40 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
filter_dilation=fd)
class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod
def setup_class(cls):
test_abstract_conv.BaseTestConv3d.setup_class()
# provide_shape is not used by the cuDNN implementation
cls.provide_shape = [False]
cls.shared = staticmethod(gpu_shared)
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
if fd != (1, 1, 1):
raise SkipTest("No dilation implementation for cuDNN ConvOp.")
if not dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
mode = mode_with_gpu
o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConv3d,
filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConv3dGradW,
filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConv3dGradI,
filter_dilation=fd)
class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
@classmethod
def setup_class(cls):
...@@ -89,6 +125,39 @@ class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
filter_dilation=fd)
class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
@classmethod
def setup_class(cls):
test_abstract_conv.BaseTestConv3d.setup_class()
cls.shared = staticmethod(gpu_shared)
cls.mode = mode_with_gpu.excluding('cudnn')
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
mode = self.mode
o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=(GpuCorr3dMM,
GpuCorr3dMM_gradWeights,
GpuCorr3dMM_gradInputs),
filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorr3dMM_gradWeights,
filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorr3dMM_gradInputs,
filter_dilation=fd)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
def setUp(self):
self.input = cuda.ftensor4()
......
from __future__ import absolute_import, print_function, division
import unittest
import numpy
from six.moves import xrange
try:
from scipy import ndimage
except ImportError:
ndimage = None
import theano
from theano.tests import unittest_tools as utt
...@@ -21,31 +26,127 @@ else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
# python reference implementation of a 3D convolution
# see also: theano.tensor.nnet.tests.test_conv3d2d
# expects: (batch, 0, channels, 1, 2)
def pyconv3d(signals, filters, border_mode='valid', dilation=(1, 1, 1)):
Ns, Ts, C, Hs, Ws = signals.shape
Nf, Tf, C, Hf, Wf = filters.shape
Tdil, Hdil, Wdil = dilation
Tfdil = (Tf - 1) * Tdil + 1
Hfdil = (Hf - 1) * Hdil + 1
Wfdil = (Wf - 1) * Wdil + 1
# if border_mode is not 'valid', the signals need zero-padding
if border_mode == 'full':
Tpad = Tfdil - 1
Hpad = Hfdil - 1
Wpad = Wfdil - 1
elif border_mode == 'half':
Tpad = Tfdil // 2
Hpad = Hfdil // 2
Wpad = Wfdil // 2
elif isinstance(border_mode, tuple):
Tpad, Hpad, Wpad = map(int, border_mode)
else:
Tpad = 0
Hpad = 0
Wpad = 0
if Tpad > 0 or Hpad > 0 or Wpad > 0:
# zero-pad signals
signals_padded = numpy.zeros((Ns, Ts + 2 * Tpad, C,
Hs + 2 * Hpad, Ws + 2 * Wpad), 'float32')
signals_padded[:, Tpad:(Ts + Tpad), :, Hpad:(Hs + Hpad),
Wpad:(Ws + Wpad)] = signals
Ns, Ts, C, Hs, Ws = signals_padded.shape
signals = signals_padded
Tfdil2 = Tfdil // 2
Hfdil2 = Hfdil // 2
Wfdil2 = Wfdil // 2
dilated_filters = numpy.zeros((Nf, Tfdil, C, Hfdil, Wfdil), dtype=filters.dtype)
dilated_filters[:, ::Tdil, :, ::Hdil, ::Wdil] = filters
# perform valid convolution on the padded signals
rval = numpy.zeros((Ns, Ts - Tfdil + 1, Nf, Hs - Hfdil + 1, Ws - Wfdil + 1))
for ns in xrange(Ns):
for nf in xrange(Nf):
for c in xrange(C):
s_i = signals[ns, :, c, :, :]
f_i = dilated_filters[nf, :, c, :, :]
r_i = rval[ns, :, nf, :, :]
# scipy.signal.convolve performs valid convolution,
# but is quite slow. scipy.ndimage.convolve is faster
# but only supports 'same' convolution.
# origin must be -1 for even filters, 0 for odd filters
o_i = ndimage.convolve(s_i, f_i, mode='constant', cval=1,
origin=(f_i.shape[0] % 2 - 1,
f_i.shape[1] % 2 - 1,
f_i.shape[2] % 2 - 1))
# crop to get the result of 'valid' convolution
o_i = o_i[Tfdil2:(r_i.shape[0] + Tfdil2),
Hfdil2:(r_i.shape[1] + Hfdil2),
Wfdil2:(r_i.shape[2] + Wfdil2)]
# the result should be equal to 'valid' convolution
# utt.assert_allclose(o_i, signal.convolve(s_i, f_i, mode='valid'))
r_i += o_i
return rval
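pyconv3d builds its dilated filter by zero-insertion, giving an effective size of (Tf - 1) * Tdil + 1 per dimension, and handles 'full' border mode by zero-padding the signal before a valid convolution. A 1D NumPy sketch of both steps (the variable names are illustrative only, not part of the Theano API):

```python
import numpy as np

# Zero-insertion dilation: a length-Tf filter dilated by Tdil has
# effective length (Tf - 1) * Tdil + 1, as computed in pyconv3d.
Tf, Tdil = 3, 2
f = np.array([1., 2., 3.])
Tfdil = (Tf - 1) * Tdil + 1
dilated = np.zeros(Tfdil)
dilated[::Tdil] = f          # zeros inserted between the original taps

# 'full' border mode as padding + valid convolution: pad the signal by
# Tfdil - 1 on each side, then a valid convolution over the padded signal
# reproduces the 'full' result.
s = np.array([1., 2., 3., 4.])
pad = Tfdil - 1
s_padded = np.pad(s, pad)
full_via_valid = np.convolve(s_padded, dilated, 'valid')
assert np.allclose(full_via_valid, np.convolve(s, dilated, 'full'))
```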
class TestCorr3DMM(unittest.TestCase): class TestCorr3DMM(unittest.TestCase):
def run_conv_valid(self, inputs_shape, filters_shape, def run_conv_valid(self, inputs_shape, filters_shape,
border_mode='valid',
filter_dilation=(1, 1, 1),
subsample=(1, 1, 1),
verify_grad=False):
inputs_val = numpy.random.random(inputs_shape).astype('float32') inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32') filters_val = numpy.random.random(filters_shape).astype('float32')
inputs = shared(inputs_val) inputs = shared(inputs_val)
filters = shared(filters_val) filters = shared(filters_val)
bias = shared(numpy.zeros(filters_shape[0]).astype('float32')) bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))
if filter_dilation == (1, 1, 1) and border_mode in ('valid', (0, 0, 0)):
conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters,
b=bias, d=subsample)
f_ref = theano.function([], conv_ref)
res_ref = f_ref()
elif subsample == (1, 1, 1):
if ndimage is None:
raise SkipTest('This test needs SciPy.')
# input = b012c
# pyconv3d wants = b0c12 = (0, 1, 4, 2, 3)
# pyconv3d outputs = b0c12 = (0, 1, 3, 4, 2)
res_ref = pyconv3d(signals=inputs_val.transpose(0, 1, 4, 2, 3),
filters=filters_val.transpose(0, 1, 4, 2, 3)[:, ::-1, :, ::-1, ::-1],
dilation=filter_dilation,
border_mode=border_mode).transpose(0, 1, 3, 4, 2)
else:
raise SkipTest('No reference implementation that combines '
'border_mode and subsampling.')
conv = GpuCorr3dMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample)(
inputs.dimshuffle(0, 4, 1, 2, 3),
filters.dimshuffle(0, 4, 1, 2, 3))
conv = conv.dimshuffle(0, 2, 3, 4, 1)
f = theano.function([], conv, mode=mode_with_gpu)
res = f()
utt.assert_allclose(res_ref, res)
if verify_grad:
utt.verify_grad(GpuCorr3dMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample),
[inputs_val.transpose(0, 4, 1, 2, 3),
filters_val.transpose(0, 4, 1, 2, 3)])
def test_valid(self):
self.run_conv_valid(inputs_shape=(16, 20, 12, 16, 1),
filters_shape=(10, 6, 12, 4, 1))
...@@ -68,6 +169,50 @@ class TestCorr3DMM(unittest.TestCase):
filters_shape=(10, 6, 12, 4, 1),
subsample=(1, 2, 3))
def test_border_mode(self):
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode='valid')
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode='half')
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode='full')
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode=(0, 0, 0))
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode=(1, 2, 3))
self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
filters_shape=(10, 6, 12, 4, 1),
border_mode=(3, 2, 1))
def test_filter_dilation(self):
inputs_shape = [16, 20, 12, 15, 1]
filters_shape = [10, 6, 5, 4, 1]
for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
for border_mode in ['valid', 'half', 'full']:
self.run_conv_valid(inputs_shape=inputs_shape,
filters_shape=filters_shape,
filter_dilation=filter_dilation,
border_mode=border_mode)
def test_verify_gradients(self):
# use a small example to check the gradients
inputs_shape = [2, 7, 9, 6, 1]
filters_shape = [1, 3, 3, 2, 1]
for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
for border_mode in ['valid', 'half', 'full', (2, 1, 3)]:
self.run_conv_valid(inputs_shape=inputs_shape,
filters_shape=filters_shape,
filter_dilation=filter_dilation,
border_mode=border_mode,
verify_grad=True)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
subsample=(1, 1, 1)):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
......
...@@ -32,6 +32,7 @@ from .bn import batch_normalization
import warnings
from .abstract_conv import conv2d as abstract_conv2d
from .abstract_conv import conv3d
def conv2d(input, filters, input_shape=None, filter_shape=None,
......