Commit 401a4dbe authored by Gijs van Tulder

CPU implementation for Corr3DMM and gradients.

The new Corr3dMM ops are also used to optimise AbstractConv. The code is similar to the 2D version in corr_gemm.c.
Parent 2681cd70
...@@ -234,7 +234,7 @@ def conv3d(input, filters, input_shape=None, filter_shape=None, ...@@ -234,7 +234,7 @@ def conv3d(input, filters, input_shape=None, filter_shape=None,
Notes Notes
----- -----
If cuDNN is available, it will be used on the If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution". "caffe style convolution".
This is only supported in Theano 0.8 or the development This is only supported in Theano 0.8 or the development
......
...@@ -417,7 +417,7 @@ def conv3d_grad_wrt_inputs(output_grad, ...@@ -417,7 +417,7 @@ def conv3d_grad_wrt_inputs(output_grad,
----- -----
:note: If cuDNN is available, it will be used on the :note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution". "caffe style convolution".
:note: This is only supported in Theano 0.8 or the development :note: This is only supported in Theano 0.8 or the development
...@@ -670,7 +670,7 @@ def conv3d_grad_wrt_weights(input, ...@@ -670,7 +670,7 @@ def conv3d_grad_wrt_weights(input,
----- -----
:note: If cuDNN is available, it will be used on the :note: If cuDNN is available, it will be used on the
GPU. Otherwise, it is the *CorrMM* convolution that will be used GPU. Otherwise, it is the *Corr3dMM* convolution that will be used
"caffe style convolution". "caffe style convolution".
:note: This is only supported in Theano 0.8 or the development :note: This is only supported in Theano 0.8 or the development
......
from __future__ import absolute_import, print_function, division
import os
import logging
from six import integer_types
import theano
from theano import Apply
from theano import gof
from theano.tensor import as_tensor_variable, TensorType
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor import blas_headers
from theano.tensor.blas import ldflags, blas_header_version
_logger = logging.getLogger(__name__)
class BaseCorr3dMM(gof.OpenMPOp):
    """
    Base class for `Corr3dMM`, `Corr3dMM_gradWeights` and
    `Corr3dMM_gradInputs`. Cannot be used directly.

    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size could be directly specified by an
        integer or a tuple of three integers
    subsample
        Perform subsampling of the output (default: (1, 1, 1)).
    filter_dilation
        Perform dilated correlation (default: (1, 1, 1))

    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_dilation')

    def __init__(self, border_mode="valid", subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1), openmp=None):
        super(BaseCorr3dMM, self).__init__(openmp=openmp)
        # Normalize border_mode: an int or 3-tuple of non-negative ints
        # becomes an explicit (pad_h, pad_w, pad_d) tuple; otherwise it must
        # be one of the string shortcuts 'valid', 'full', 'half'.
        if isinstance(border_mode, integer_types):
            if border_mode < 0:
                raise ValueError(
                    'invalid border_mode {}, which must be a '
                    'non-negative integer'.format(border_mode))
            border_mode = (border_mode, border_mode, border_mode)
        if isinstance(border_mode, tuple):
            if len(border_mode) != 3 or min(border_mode) < 0:
                raise ValueError(
                    'invalid border_mode {}, which must be a tuple of '
                    'three non-negative integers'.format(border_mode))
            pad_h, pad_w, pad_d = map(int, border_mode)
            border_mode = (pad_h, pad_w, pad_d)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full', 'half')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer or a tuple of three'
                ' integers'.format(border_mode))
        self.border_mode = border_mode
        if len(subsample) != 3:
            raise ValueError("subsample must have three elements")
        if len(filter_dilation) != 3:
            raise ValueError("filter_dilation must have three elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        # The generated C code calls BLAS gemm; remember which BLAS flavour
        # is linked so the support code can control its thread count.
        if not theano.config.blas.ldflags:
            raise NotImplementedError("C code for corrMM* classes need a blas library.")
        else:
            if 'openblas' in theano.config.blas.ldflags:
                self.blas_type = 'openblas'
            elif 'mkl' in theano.config.blas.ldflags:
                self.blas_type = 'mkl'
            else:
                self.blas_type = ''

    @property
    def pad(self):
        """Explicit padding tuple; (0, 0, 0) for border_mode='valid'."""
        if self.border_mode != 'valid':
            return self.border_mode
        return (0, 0, 0)

    def __str__(self):
        return '%s{%s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            str(self.filter_dilation))

    @staticmethod
    def as_common_dtype(in1, in2):
        """
        Upcast input variables if neccesary.
        """
        dtype = theano.scalar.upcast(in1.dtype, in2.dtype)
        return in1.astype(dtype), in2.astype(dtype)

    def c_support_code(self):
        # BLAS prototypes, plus thread-control helpers for the linked BLAS.
        ccodes = blas_headers.blas_header_text()
        if self.blas_type == 'openblas':
            ccodes += blas_headers.openblas_threads_text()
        elif self.blas_type == 'mkl':
            ccodes += blas_headers.mkl_threads_text()
        return ccodes

    def c_libraries(self):
        return ldflags()

    def c_compile_args(self):
        compile_args = ldflags(libs=False, flags=True)
        compile_args += super(BaseCorr3dMM, self).c_compile_args()
        return compile_args

    def c_lib_dirs(self):
        return ldflags(libs=False, libs_dir=True)

    def c_header_dirs(self):
        return ldflags(libs=False, include_dir=True)

    def c_headers(self):
        headers = ['<stdio.h>']
        headers += super(BaseCorr3dMM, self).c_headers()
        return headers

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        # or the code templates returned by c_code_helper
        return (2, self.openmp, blas_header_version())

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
        # these files
        sub = {}
        dtype = str(node.inputs[0].dtype)
        assert dtype in ('float32', 'float64')
        if dtype == 'float32':
            sub['gemm'] = 'sgemm_'
            sub['float_type'] = 'npy_float'
            sub['float_typenum'] = 'NPY_FLOAT'
            sub['n_bytes'] = 4
            sub['c_float_type'] = 'float'
        else:
            sub['gemm'] = 'dgemm_'
            sub['float_type'] = 'npy_double'
            sub['float_typenum'] = 'NPY_DOUBLE'
            sub['n_bytes'] = 8
            sub['c_float_type'] = 'double'
        # OpenMP / BLAS threading knobs used by corr3d_gemm.c
        if self.openmp:
            sub['omp_flags'] = '#pragma omp parallel for schedule(static)'
            sub['omp_get_max_threads'] = 'omp_get_max_threads()'
            sub['omp_get_thread_num'] = 'omp_get_thread_num()'
            if self.blas_type == 'openblas':
                sub['blas_set_num_threads'] = 'openblas_set_num_threads'
                sub['blas_get_num_threads'] = 'openblas_get_num_threads()'
            elif self.blas_type == 'mkl':
                sub['blas_set_num_threads'] = 'mkl_set_num_threads'
                sub['blas_get_num_threads'] = 'mkl_get_max_threads()'
            else:
                sub['blas_set_num_threads'] = ''
                sub['blas_get_num_threads'] = '0'
        else:
            sub['omp_flags'] = ''
            sub['omp_get_max_threads'] = '1'
            sub['omp_get_thread_num'] = '0'
            sub['blas_set_num_threads'] = ''
            sub['blas_get_num_threads'] = '0'
        files = ['corr3d_gemm.c']
        codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
                 for f in files]
        final_code = ''
        for code in codes:
            final_code += code
        return final_code % sub

    def c_code_helper(self, bottom, weights, top, direction, sub,
                      height=None, width=None, depth=None):
        """
        This generates the C code for Corr3dMM (direction="forward"),
        Corr3dMM_gradWeights (direction="backprop weights"), and
        Corr3dMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        :param bottom: Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        :param weights: Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        :param top: Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        :param direction: "forward" to correlate bottom with weights and store
            results in top,
            "backprop weights" to do a valid convolution of bottom with top
            (swapping the first two dimensions) and store results in weights,
            and "backprop inputs" to do a full convolution of top with weights
            (swapping the first two dimensions) and store results in bottom.
        :param sub: Dictionary of substitutions useable to help generating the
            C code.
        :param height: If self.subsample[0] != 1, a variable giving the height
            of the filters for direction="backprop weights" or the height of
            the input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the height of the
            filters for direction="backprop weights". Ignored otherwise.
        :param width: If self.subsample[1] != 1, a variable giving the width
            of the filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the width of the
            filters for direction="backprop weights". Ignored otherwise.
        :param depth: If self.subsample[2] != 1, a variable giving the depth
            of the filters for direction="backprop weights" or the depth of the
            input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the depth of the
            filters for direction="backprop weights". Ignored otherwise.
        """
        dH, dW, dD = self.subsample
        dilH, dilW, dilD = self.filter_dilation
        # Encode the string border modes as negative pad values; the C code
        # resolves -1 (half) and -2 (full) once the kernel shape is known.
        if self.border_mode == "half":
            padH = padW = padD = -1
        elif self.border_mode == "full":
            padH = padW = padD = -2
        elif isinstance(self.border_mode, tuple):
            padH, padW, padD = self.border_mode
        else:
            assert self.border_mode == "valid"
            padH = padW = padD = 0
        if direction == "forward":
            direction = 0
            out = top
        elif direction == "backprop weights":
            direction = 1
            out = weights
        elif direction == "backprop inputs":
            direction = 2
            out = bottom
        else:
            raise ValueError("direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")
        # When subsampling, we cannot unambiguously infer the height and width
        # of bottom and weights from top, so we require them to be given.
        # Similarly, when border_mode="half", we cannot infer the weight size.
        if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
            if not height:
                raise ValueError("height must be given for backprop with vertical sampling or border_mode='half'")
            height = '(*(npy_int64 *)(PyArray_DATA(%s)))' % height
        else:
            height = '-1'
        if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
            if not width:
                raise ValueError("width must be given for backprop with horizontal sampling or border_mode='half'")
            width = '(*(npy_int64 *)(PyArray_DATA(%s)))' % width
        else:
            width = '-1'
        if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
            if not depth:
                raise ValueError("depth must be given for backprop with depth sampling or border_mode='half'")
            depth = '(*(npy_int64 *)(PyArray_DATA(%s)))' % depth
        else:
            depth = '-1'
        sub = sub.copy()
        sub.update(locals())

        return """
    // Mandatory args
    int direction = %(direction)s;  // forward, bprop weights, bprop inputs

    // Optional args
    int dH = %(dH)s;
    int dW = %(dW)s;
    int dD = %(dD)s;
    int dilH = %(dilH)s;
    int dilW = %(dilW)s;
    int dilD = %(dilD)s;
    int padH = %(padH)s;
    int padW = %(padW)s;
    int padD = %(padD)s;

    PyArrayObject * bottom = %(bottom)s;
    PyArrayObject * weights = %(weights)s;
    PyArrayObject * top = %(top)s;
    PyArrayObject * out2 = NULL;

    // Obtain or infer kernel width, height and depth
    // (we need to know it early to be able to handle auto-padding)
    int kH, kW, kD;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
        kH = PyArray_DIMS(weights)[2];
        kW = PyArray_DIMS(weights)[3];
        kD = PyArray_DIMS(weights)[4];
    }
    else {
        if ((dH != 1) || (padH == -1)) {
            // vertical subsampling or half padding, kernel height is specified
            kH = %(height)s;
        }
        else if (padH == -2) {
            // vertical full padding, we can infer the kernel height
            kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1;
        }
        else {
            // explicit padding, we can infer the kernel height
            kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
        }
        if ((dW != 1) || (padW == -1)) {
            kW = %(width)s;
        }
        else if (padW == -2) {
            kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        else {
            kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        if ((dD != 1) || (padD == -1)) {
            kD = %(depth)s;
        }
        else if (padD == -2) {
            kD = (2 - PyArray_DIMS(bottom)[4] + (PyArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }
        else {
            kD = (PyArray_DIMS(bottom)[4] + 2*padD - (PyArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }
    }

    // Implicit dilated kernel size
    int dil_kH = (kH - 1) * dilH + 1;
    int dil_kW = (kW - 1) * dilW + 1;
    int dil_kD = (kD - 1) * dilD + 1;

    // Auto-padding if requested
    if (padH == -1) {  // vertical half padding
        padH = dil_kH / 2;
    }
    else if (padH == -2) {  // vertical full padding
        padH = dil_kH - 1;
    }
    else if (padH < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padH must be >= -2");
        %(fail)s
    }
    if (padW == -1) {  // horizontal half padding
        padW = dil_kW / 2;
    }
    else if (padW == -2) {  // horizontal full padding
        padW = dil_kW - 1;
    }
    else if (padW < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padW must be >= -2");
        %(fail)s
    }
    if (padD == -1) {  // depth half padding
        padD = dil_kD / 2;
    }
    else if (padD == -2) {  // depth full padding
        padD = dil_kD - 1;
    }
    else if (padD < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padD must be >= -2");
        %(fail)s
    }

    // Infer output shape
    npy_intp out_dim[5];
    switch(direction) {
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width, depth)
        // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
        out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
        out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
        out_dim[4] = (npy_intp)((PyArray_DIMS(bottom)[4] + 2*padD - ((PyArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1);
        break;
    case 1:  // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width, depth)
        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
        out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1];
        out_dim[2] = (npy_intp)kH;  // already inferred further above
        out_dim[3] = (npy_intp)kW;  // how convenient
        out_dim[4] = (npy_intp)kD;
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width, depth)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
        out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
        out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
        out_dim[4] = (npy_intp)((dD != 1) ? %(depth)s : (PyArray_DIMS(top)[4] - 1) * dD + (PyArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD);
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: direction must be 0, 1, or 2\\n");
        %(fail)s
    }

    // Prepare output array: reuse the existing 5D buffer when its shape
    // already matches, otherwise allocate a fresh zeroed array.
    // (All outputs of these ops are 5D, so the ndim check must be 5;
    // checking 4 would both reallocate on every call and read DIMS[4]
    // out of bounds on a genuine 4D array.)
    int typenum;
    if ( !(%(out)s
           && PyArray_NDIM(%(out)s)==5
           && PyArray_IS_C_CONTIGUOUS(%(out)s)
           && PyArray_DIMS(%(out)s)[0]==out_dim[0]
           && PyArray_DIMS(%(out)s)[1]==out_dim[1]
           && PyArray_DIMS(%(out)s)[2]==out_dim[2]
           && PyArray_DIMS(%(out)s)[3]==out_dim[3]
           && PyArray_DIMS(%(out)s)[4]==out_dim[4]))
    {
        Py_XDECREF(%(out)s);
        if (direction != 1) {
            typenum = PyArray_TYPE(weights);
        }
        else {
            typenum = PyArray_TYPE(bottom);
        }
        //Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
        %(out)s = (PyArrayObject*)PyArray_ZEROS(5,
                                                out_dim,
                                                typenum,
                                                0);
        if (NULL == %(out)s)
        {
            PyErr_Format(PyExc_RuntimeError,
                         "BaseCorr3dMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld x %%lld",
                         (long long)out_dim[0], (long long)out_dim[1],
                         (long long)out_dim[2], (long long)out_dim[3], (long long)out_dim[4]);
            %(fail)s
        }
    }

    // Call corr3dMM code
    out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction,
                    dH, dW, dD, dilH, dilW, dilD, padH, padW, padD);
    if (out2==NULL){
       %(fail)s
    }
    assert (out2 == %(out)s);
""" % sub
class Corr3dMM(BaseCorr3dMM):
    """
    CPU 3D correlation implemented via matrix multiplication (im2col + gemm).

    Parameters
    ----------
    border_mode
        Width of the implicit zero padding added to the input: a tuple of
        three integers (one per spatial axis, applied on both sides of that
        axis), a single integer used for all three axes, or one of the
        string shortcuts ``'valid'`` (no padding), ``'full'``
        (``kernel - 1`` on each axis) or ``'half'`` (``kernel // 2`` on each
        axis, giving a same-size output for odd kernels).
    subsample
        Output subsampling (stride) as a 3-tuple; `(1, 1, 1)` disables it.
    filter_dilation
        Filter dilation as a 3-tuple; `(1, 1, 1)` disables it.
    """

    def make_node(self, img, kern):
        """Create the Apply node for a forward 3D correlation of 5D inputs."""
        img = as_tensor_variable(img)
        kern = as_tensor_variable(kern)
        # Both operands must share a dtype for the gemm-based C code.
        img, kern = self.as_common_dtype(img, kern)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        # Output: (batch, filters, h, w, d); only the first two axes can
        # inherit broadcastability from the inputs.
        out_bcast = [img.type.broadcastable[0],
                     kern.type.broadcastable[0],
                     False, False, False]
        out_var = TensorType(img.type.dtype, out_bcast)()
        return Apply(self, [img, kern], [out_var])

    def infer_shape(self, node, input_shape):
        """Delegate output-shape inference to the shared conv helper."""
        imshp, kshp = input_shape
        out_shape = get_conv_output_shape(imshp,
                                          kshp,
                                          self.border_mode,
                                          self.subsample,
                                          self.filter_dilation)
        return [out_shape]

    def c_code(self, node, nodename, inp, out_, sub):
        # Forward pass: bottom and weights are inputs, top is the output.
        bottom, weights = inp
        top, = out_
        return super(Corr3dMM, self).c_code_helper(bottom, weights, top,
                                                   "forward", sub)

    def grad(self, inp, grads):
        """Gradients wrt. the image and the filters via the two backprop ops."""
        bottom, weights = inp
        top, = grads
        grad_inputs_op = Corr3dMM_gradInputs(self.border_mode,
                                             self.subsample,
                                             self.filter_dilation)
        grad_weights_op = Corr3dMM_gradWeights(self.border_mode,
                                               self.subsample,
                                               self.filter_dilation)
        d_bottom = grad_inputs_op(weights, top, bottom.shape[-3:])
        d_weights = grad_weights_op(bottom, top, weights.shape[-3:])
        return d_bottom, d_weights
class Corr3dMM_gradWeights(BaseCorr3dMM):
    """
    Gradient wrt. filters for `Corr3dMM`.

    Notes
    -----
    You will not want to use this directly, but rely on
    Theano's automatic differentiation or graph optimization to
    use it as needed.

    """

    def make_node(self, img, topgrad, shape=None):
        """Create the Apply node; `shape` gives the filter spatial shape
        when it cannot be inferred (subsampling or half padding)."""
        img = as_tensor_variable(img)
        topgrad = as_tensor_variable(topgrad)
        img, topgrad = self.as_common_dtype(img, topgrad)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if self.subsample != (1, 1, 1) or self.border_mode == "half":
            if shape is None:
                raise ValueError('shape must be given if subsample != (1, 1, 1)'
                                 ' or border_mode == "half"')
            # Filter height/width/depth become extra scalar inputs.
            height_width_depth = [as_tensor_variable(shape[0]).astype('int64'),
                                  as_tensor_variable(shape[1]).astype('int64'),
                                  as_tensor_variable(shape[2]).astype('int64')]
        else:
            height_width_depth = []
        # Output is the filter gradient: (nFilters, nChannels, kH, kW, kD).
        broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
                         False, False, False]
        dtype = img.type.dtype
        return Apply(self, [img, topgrad] + height_width_depth,
                     [TensorType(dtype, broadcastable)()])

    def infer_shape(self, node, input_shape):
        """Infer the filter-gradient shape, mirroring the formulas used by
        the generated C code (including filter dilation)."""
        if self.border_mode == "half":
            padH = padW = padD = -1
        elif self.border_mode == "full":
            padH = padW = padD = -2
        elif isinstance(self.border_mode, tuple):
            padH, padW, padD = self.border_mode
        else:
            assert self.border_mode == "valid"
            padH = padW = padD = 0
        dH, dW, dD = self.subsample
        # The C code divides the inferred kernel extent by the dilation
        # (see c_code_helper); infer_shape must agree with it.
        dilH, dilW, dilD = self.filter_dilation
        imshp = input_shape[0]
        topshp = input_shape[1]
        ssize, imshp = imshp[1], list(imshp[2:])
        nkern, topshp = topshp[1], list(topshp[2:])
        height_width_depth = node.inputs[-3:]
        if ((dH != 1) or (padH == -1)):
            # vertical subsampling or half padding, kernel height is specified
            kH = height_width_depth[0]
        elif padH == -2:
            # vertical full padding, we can infer the kernel height
            kH = (2 - imshp[0] + (topshp[0] - 1) * dH - 1) // dilH + 1
        else:
            # explicit padding, we can infer the kernel height
            kH = (imshp[0] + 2 * padH - (topshp[0] - 1) * dH - 1) // dilH + 1
        if ((dW != 1) or (padW == -1)):
            kW = height_width_depth[1]
        elif (padW == -2):
            kW = (2 - imshp[1] + (topshp[1] - 1) * dW - 1) // dilW + 1
        else:
            kW = (imshp[1] + 2 * padW - (topshp[1] - 1) * dW - 1) // dilW + 1
        if ((dD != 1) or (padD == -1)):
            kD = height_width_depth[2]
        elif (padD == -2):
            kD = (2 - imshp[2] + (topshp[2] - 1) * dD - 1) // dilD + 1
        else:
            kD = (imshp[2] + 2 * padD - (topshp[2] - 1) * dD - 1) // dilD + 1
        return [(nkern, ssize, kH, kW, kD)]

    def c_code(self, node, nodename, inp, out_, sub):
        # Backprop wrt. weights: bottom and top are inputs, weights is output.
        bottom, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        weights, = out_
        direction = "backprop weights"
        return super(Corr3dMM_gradWeights,
                     self).c_code_helper(bottom, weights, top, direction,
                                         sub, height, width, depth)

    def grad(self, inp, grads):
        """Gradients of this op's output wrt. its image and topgrad inputs;
        the optional shape inputs are disconnected."""
        bottom, top = inp[:2]
        weights, = grads
        d_bottom = Corr3dMM_gradInputs(self.border_mode,
                                       self.subsample,
                                       self.filter_dilation)(weights, top,
                                                             bottom.shape[-3:])
        d_top = Corr3dMM(self.border_mode,
                         self.subsample,
                         self.filter_dilation)(bottom, weights)
        d_height_width_depth = ((theano.gradient.DisconnectedType()(),) * 3
                                if len(inp) == 5 else ())
        return (d_bottom, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0], [0]]  # no connection to height, width, depth
class Corr3dMM_gradInputs(BaseCorr3dMM):
    """
    Gradient wrt. inputs for `Corr3dMM`.

    Notes
    -----
    You will not want to use this directly, but rely on
    Theano's automatic differentiation or graph optimization to
    use it as needed.

    """

    def make_node(self, kern, topgrad, shape=None):
        """Create the Apply node; `shape` gives the input spatial shape,
        required when subsampling makes it ambiguous."""
        kern = as_tensor_variable(kern)
        topgrad = as_tensor_variable(topgrad)
        kern, topgrad = self.as_common_dtype(kern, topgrad)
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if self.subsample != (1, 1, 1) and shape is None:
            raise ValueError('shape must be given if subsample != (1, 1, 1)')
        if self.subsample != (1, 1, 1):
            # Input height/width/depth become extra scalar inputs.
            height_width_depth = [as_tensor_variable(shape[0]).astype('int64'),
                                  as_tensor_variable(shape[1]).astype('int64'),
                                  as_tensor_variable(shape[2]).astype('int64')]
        else:
            height_width_depth = []
        # Output is the input gradient: (batch, nChannels, h, w, d).
        broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
                         False, False, False]
        dtype = kern.type.dtype
        return Apply(self, [kern, topgrad] + height_width_depth,
                     [TensorType(dtype, broadcastable)()])

    def infer_shape(self, node, input_shape):
        """Infer the input-gradient shape, mirroring the formulas used by
        the generated C code (including filter dilation)."""
        if self.border_mode == "half":
            padH = padW = padD = -1
        elif self.border_mode == "full":
            padH = padW = padD = -2
        elif isinstance(self.border_mode, tuple):
            padH, padW, padD = self.border_mode
        else:
            assert self.border_mode == "valid"
            padH = padW = padD = 0
        dH, dW, dD = self.subsample
        dilH, dilW, dilD = self.filter_dilation
        kshp = input_shape[0]
        topshp = input_shape[1]
        ssize, kshp = kshp[1], list(kshp[2:])
        bsize, topshp = topshp[0], list(topshp[2:])
        height_width_depth = node.inputs[-3:]
        # Implicit dilated kernel size, as computed by the C code
        # (dil_kH = (kH - 1) * dilH + 1).
        dil_kH = (kshp[0] - 1) * dilH + 1
        dil_kW = (kshp[1] - 1) * dilW + 1
        dil_kD = (kshp[2] - 1) * dilD + 1
        # Resolve half (-1) and full (-2) padding from the dilated kernel.
        if padH == -1:
            padH = dil_kH // 2
        elif padH == -2:
            padH = dil_kH - 1
        elif padH < -2:
            raise ValueError('Corr3dMM_gradInputs: border_mode must be >= 0.')
        if padW == -1:
            padW = dil_kW // 2
        elif padW == -2:
            padW = dil_kW - 1
        elif padW < -2:
            raise ValueError('Corr3dMM_gradInputs: border_mode must be >= 0.')
        if padD == -1:
            padD = dil_kD // 2
        elif padD == -2:
            padD = dil_kD - 1
        elif padD < -2:
            raise ValueError('Corr3dMM_gradInputs: border_mode must be >= 0.')
        # bottom = (top - 1) * sample + dilated_kernel - 2 * pad; with
        # subsampling the size is ambiguous and given explicitly instead.
        if dH != 1:
            out_shp0 = height_width_depth[0]
        else:
            out_shp0 = (topshp[0] - 1) * dH + dil_kH - 2 * padH
        if dW != 1:
            out_shp1 = height_width_depth[1]
        else:
            out_shp1 = (topshp[1] - 1) * dW + dil_kW - 2 * padW
        if dD != 1:
            out_shp2 = height_width_depth[2]
        else:
            out_shp2 = (topshp[2] - 1) * dD + dil_kD - 2 * padD
        out_shp = (out_shp0, out_shp1, out_shp2)
        return [(bsize, ssize) + out_shp]

    def c_code(self, node, nodename, inp, out_, sub):
        # Backprop wrt. inputs: weights and top are inputs, bottom is output.
        weights, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        bottom, = out_
        direction = "backprop inputs"
        return super(Corr3dMM_gradInputs,
                     self).c_code_helper(bottom, weights, top, direction, sub,
                                         height, width, depth)

    def grad(self, inp, grads):
        """Gradients of this op's output wrt. its kernel and topgrad inputs;
        the optional shape inputs are disconnected."""
        weights, top = inp[:2]
        bottom, = grads
        d_weights = Corr3dMM_gradWeights(self.border_mode,
                                         self.subsample,
                                         self.filter_dilation)(bottom,
                                                               top,
                                                               weights.shape[-3:])
        d_top = Corr3dMM(self.border_mode,
                         self.subsample,
                         self.filter_dilation)(bottom, weights)
        d_height_width_depth = ((theano.gradient.DisconnectedType()(),) * 3
                                if len(inp) == 5 else ())
        return (d_weights, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0], [0]]  # no connection to height, width, depth
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cpp)
// Loops for fast unfold + copy
// Unfold a single 3D image (channels x height x width x depth) into a
// column matrix data_col of shape
//   (channels * kernel_h * kernel_w * kernel_d) x
//   (height_col * width_col * depth_col),
// where each column holds one receptive field, so the correlation can be
// done as a single gemm. Positions that fall into the implicit zero
// padding are written as 0.
// NOTE: this is a template consumed by Python %-substitution:
// %(float_type)s is replaced with the element type and %% is a literal
// '%' (the C modulo operator).
void im3d2col(const %(float_type)s* data_im, const int channels,
              const int height, const int width, const int depth,
              const int kernel_h, const int kernel_w, const int kernel_d,
              const int dilation_h, const int dilation_w, const int dilation_d,
              const int pad_h, const int pad_w, const int pad_d,
              const int stride_h, const int stride_w, const int stride_d,
              %(float_type)s* data_col) {
  // Implicit dilated kernel size
  int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  int dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
  // Output (column) spatial extents for the given stride/pad/dilation.
  int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  int depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
  int channels_col = channels * kernel_h * kernel_w * kernel_d;
  for (int c = 0; c < channels_col; ++c) {
    // Decompose the column-channel index into (channel, kh, kw, kd) offsets.
    int d_offset = c %% kernel_d;
    int w_offset = (c / kernel_d) %% kernel_w;
    int h_offset = (c / kernel_w / kernel_d) %% kernel_h;
    int c_im = c / kernel_h / kernel_w / kernel_d;
    for (int h = 0; h < height_col; ++h) {
      // Source coordinate in the padded image for this output position.
      int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
      for (int w = 0; w < width_col; ++w) {
        int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
        for (int d = 0; d < depth_col; ++d) {
          int d_pad = d * stride_d - pad_d + d_offset * dilation_d;
          if (h_pad >= 0 && h_pad < height
              && w_pad >= 0 && w_pad < width
              && d_pad >= 0 && d_pad < depth)
            // Inside the image: copy the pixel.
            data_col[(npy_intp)((c * height_col + h) * width_col + w) * depth_col + d] =
              data_im[(npy_intp)((c_im * height + h_pad) * width + w_pad) * depth + d_pad];
          else
            // Inside the zero padding.
            data_col[(npy_intp)((c * height_col + h) * width_col + w) * depth_col + d] = 0.;
        }
      }
    }
  }
}
// Unlike the Caffe and Theano GPU verions, the data_im array is set to zero
// before the col2im call rather than doing it here. So, the result is just
// accumulated into data_im.
// Inverse of im3d2col: scatter-add the column matrix data_col back into
// the image buffer data_im, accumulating overlapping contributions.
// Positions that map into the implicit zero padding are skipped.
// NOTE: this is a template consumed by Python %-substitution:
// %(float_type)s is replaced with the element type and %% is a literal
// '%' (the C modulo operator).
void col2im3d(const %(float_type)s* data_col, const int channels,
              const int height, const int width, const int depth,
              const int patch_h, const int patch_w, const int patch_d,
              const int dilation_h, const int dilation_w, const int dilation_d,
              const int pad_h, const int pad_w, const int pad_d,
              const int stride_h, const int stride_w, const int stride_d,
              %(float_type)s* data_im) {
  // Implicit dilated patch
  int dil_patch_h = (patch_h - 1) * dilation_h + 1;
  int dil_patch_w = (patch_w - 1) * dilation_w + 1;
  int dil_patch_d = (patch_d - 1) * dilation_d + 1;
  // Column spatial extents for the given stride/pad/dilation.
  int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  int depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
  int channels_col = channels * patch_h * patch_w * patch_d;
  for (int c = 0; c < channels_col; ++c) {
    // Decompose the column-channel index into (channel, kh, kw, kd) offsets.
    int d_offset = c %% patch_d;
    int w_offset = (c / patch_d) %% patch_w;
    int h_offset = (c / patch_w / patch_d) %% patch_h;
    int c_im = c / patch_h / patch_w / patch_d;
    for (int h = 0; h < height_col; ++h) {
      // Target coordinate in the padded image for this column position.
      int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
      for (int w = 0; w < width_col; ++w) {
        int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
        for (int d = 0; d < depth_col; ++d) {
          int d_pad = d * stride_d - pad_d + d_offset * dilation_d;
          if (h_pad >= 0 && h_pad < height
              && w_pad >= 0 && w_pad < width
              && d_pad >= 0 && d_pad < depth)
            // Accumulate; data_im is zeroed by the caller (see note above).
            data_im[(npy_intp)((c_im * height + h_pad) * width + w_pad) * depth + d_pad] +=
              data_col[(npy_intp)((c * height_col + h) * width_col + w) * depth_col + d];
        }
      }
    }
  }
}
// Theano op code
// GPU version authors: Arjun Jain, Frederic Bastien, Jan Schlueter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// CPU version author: Jesse Livezey
// CPU version adapted from GPU version
//
// Compute a 3D correlation (or one of its gradients) as im3d2col + GEMM.
//   direction == 0: forward pass          -- result written into top
//   direction == 1: gradient wrt weights  -- result written into weight
//   direction == 2: gradient wrt inputs   -- result written into bottom
// dH, dW, dD are the subsampling strides; dilH, dilW, dilD the filter
// dilations; padH, padW, padD the zero-paddings, one per spatial axis.
// Returns an alias of the array that was written to, or NULL with a
// Python exception set on error. The refcount of the returned array is
// not changed here; output (re)allocation and refcounting are done by
// BaseCorr3dMM.c_code_helper().
PyArrayObject* corr3dMM(PyArrayObject* bottom,
                        PyArrayObject* weight,
                        PyArrayObject* top,
                        const int direction,
                        const int dH = 1,
                        const int dW = 1,
                        const int dD = 1,
                        const int dilH = 1,
                        const int dilW = 1,
                        const int dilD = 1,
                        const int padH = 0,
                        const int padW = 0,
                        const int padD = 0)
{
    // Validate ranks and dtypes before taking any new references, so
    // these early returns need no cleanup.
    if (PyArray_NDIM(bottom) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM requires bottom of 5D");
        return NULL;
    }
    if (PyArray_TYPE(bottom) != %(float_typenum)s)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM received bottom with wrong type.");
        return NULL;
    }
    if (PyArray_NDIM(weight) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM requires weight of 5D");
        return NULL;
    }
    if (PyArray_TYPE(weight) != %(float_typenum)s)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM received weight with wrong type.");
        return NULL;
    }
    if (PyArray_NDIM(top) != 5)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM requires top of 5D");
        return NULL;
    }
    if (PyArray_TYPE(top) != %(float_typenum)s)
    {
        PyErr_SetString(PyExc_ValueError, "Corr3dMM received top with wrong type.");
        return NULL;
    }

    // Ensure data is contiguous. Each call returns a NEW reference that
    // must be released on every exit path from here on.
    bottom = PyArray_GETCONTIGUOUS(bottom);
    weight = PyArray_GETCONTIGUOUS(weight);
    top = PyArray_GETCONTIGUOUS(top);

    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
    const int batchSize = PyArray_DIMS(bottom)[0];
    const int nChannels = PyArray_DIMS(bottom)[1];
    const int bottomHeight = PyArray_DIMS(bottom)[2];
    const int bottomWidth = PyArray_DIMS(bottom)[3];
    const int bottomDepth = PyArray_DIMS(bottom)[4];
    // weights: (nFilters, nChannels, rows, columns, slices)
    const int nFilters = PyArray_DIMS(weight)[0];
    const int kH = PyArray_DIMS(weight)[2];
    const int kW = PyArray_DIMS(weight)[3];
    const int kD = PyArray_DIMS(weight)[4];
    if (nChannels != PyArray_DIMS(weight)[1]) {
        PyErr_SetString(PyExc_ValueError,
                "Corr3dMM images and kernel must have the same stack size\n");
        // release the references acquired by PyArray_GETCONTIGUOUS
        Py_DECREF(bottom);
        Py_DECREF(weight);
        Py_DECREF(top);
        return NULL;
    }
    // implicit dilated filter
    const int dil_kH = (kH - 1) * dilH + 1;
    const int dil_kW = (kW - 1) * dilW + 1;
    const int dil_kD = (kD - 1) * dilD + 1;
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
    const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
    const int topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
    const int topDepth  = (bottomDepth + 2*padD - dil_kD) / dD + 1;
    if (batchSize != PyArray_DIMS(top)[0] ||
            nFilters != PyArray_DIMS(top)[1] ||
            topHeight != PyArray_DIMS(top)[2] ||
            topWidth != PyArray_DIMS(top)[3] ||
            topDepth != PyArray_DIMS(top)[4]) {
        PyErr_Format(PyExc_ValueError,
                "Corr3dMM shape inconsistency:\n"
                "  bottom shape: %%d %%d %%d %%d %%d\n"
                "  weight shape: %%d %%d %%d %%d %%d\n"
                "  top shape: %%ld %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d %%d)\n",
                batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                nFilters, nChannels, kH, kW, kD,
                PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
                PyArray_DIMS(top)[2], PyArray_DIMS(top)[3], PyArray_DIMS(top)[4],
                batchSize, nFilters, topHeight, topWidth, topDepth);
        // release the references acquired by PyArray_GETCONTIGUOUS
        Py_DECREF(bottom);
        Py_DECREF(weight);
        Py_DECREF(top);
        return NULL;
    }

    // Create temporary columns: one im3d2col buffer per OpenMP thread,
    // but never more buffers than batch items.
    int max_threads = %(omp_get_max_threads)s;
    if (batchSize < max_threads) {
        max_threads = batchSize;
    }
    npy_intp col_dim[3];
    col_dim[0] = (npy_intp)max_threads;
    col_dim[1] = (npy_intp)(nChannels * kW * kH * kD);
    col_dim[2] = (npy_intp)(topHeight * topWidth * topDepth);
    // Zero-initialised so every element is well-defined before use.
    PyArrayObject* col = (PyArrayObject*)PyArray_ZEROS(3,
            col_dim,
            PyArray_TYPE(top),
            0);
    if (NULL == col) {
        PyErr_Format(PyExc_RuntimeError,
                "Corr3dMM failed to allocate working memory of"
                " %%ld x %%ld x %%ld\n",
                col_dim[0], col_dim[1], col_dim[2]);
        // release the references acquired by PyArray_GETCONTIGUOUS
        Py_DECREF(bottom);
        Py_DECREF(weight);
        Py_DECREF(top);
        return NULL;
    }

    // Define some useful variables
    const int bottom_stride = PyArray_STRIDES(bottom)[0]/%(n_bytes)f;
    const int top_stride = PyArray_STRIDES(top)[0]/%(n_bytes)f;
    const int K_ = col_dim[1];
    const int N_ = col_dim[2];
    const int col_stride = (K_ * N_);
    const int M_ = nFilters;
    const %(c_float_type)s one = 1.0;
    const %(c_float_type)s zero = 0.0;
    char NTrans = 'N';
    char Trans = 'T';
    PyArrayObject *output;
    if (direction == 0) {  // forward pass
        output = top;
        // valid correlation: im3d2col, then gemm
        // Iterate over batch
        int blas_threads_saved = %(blas_get_num_threads)s;
        // Always force gemm to one thread when OpenMP is enabled, for best
        // and stable performance (parallelism comes from the batch loop).
        %(blas_set_num_threads)s(1);
        %(omp_flags)s
        for (int n = 0; n < batchSize; ++n) {
            int tid = %(omp_get_thread_num)s;
            // First, im3d2col
            im3d2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD,
                     (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
            // Second, gemm
            %(gemm)s(&NTrans, &NTrans,
                     &N_, &M_, &K_,
                     &one,
                     (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride, &N_,
                     (%(float_type)s*)PyArray_DATA(weight), &K_,
                     &zero,
                     (%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_);
        }
        // Restore to previous blas threads
        %(blas_set_num_threads)s(blas_threads_saved);
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        // Per-thread accumulation buffers, reduced into weight afterwards.
        npy_intp weight_dim[2];
        weight_dim[0] = (npy_intp)max_threads;
        weight_dim[1] = (npy_intp)(M_ * K_);
        PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2,
                weight_dim, PyArray_TYPE(weight), 0);
        if (NULL == local_weight)
        {
            PyErr_Format(PyExc_RuntimeError,
                    "Corr3dMM failed to allocate weight memory of %%ld x %%ld\n",
                    weight_dim[0], weight_dim[1]);
            // release the working buffer and the contiguous references
            Py_DECREF(col);
            Py_DECREF(bottom);
            Py_DECREF(weight);
            Py_DECREF(top);
            return NULL;
        }
        // valid convolution: im3d2col, then gemm
        // Iterate over batch
        int blas_threads_saved = %(blas_get_num_threads)s;
        // Always force gemm to one thread when OpenMP is enabled, for best
        // and stable performance (parallelism comes from the batch loop).
        %(blas_set_num_threads)s(1);
        // OMP for batch-level parallelization
        %(omp_flags)s
        for (int n = 0; n < batchSize; ++n) {
            int tid = %(omp_get_thread_num)s;
            // First, im3d2col
            im3d2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD,
                     (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            %(gemm)s(&Trans, &NTrans,
                     &K_, &M_, &N_,
                     &one,
                     (%(float_type)s*)PyArray_DATA(col) + tid * col_stride, &N_,
                     (%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_,
                     (n == 0) ? &zero : &one,
                     (%(float_type)s*)PyArray_DATA(local_weight) +
                     tid * weight_dim[1], &K_);
        }
        // Restore to previous blas threads
        %(blas_set_num_threads)s(blas_threads_saved);
        // aggregate the per-thread partial weight gradients
        memset((%(float_type)s*)PyArray_DATA(weight), 0, M_ * K_*sizeof(%(float_type)s));
        /*
         * Put index "j" into the outer loop to get the
         * correct result when openmp is used (each j is owned
         * by exactly one thread, so no write races).
         */
        %(omp_flags)s
        for(int j = 0; j < weight_dim[1]; ++j){
            for(int i = 0; i < max_threads; ++i){
                ((%(float_type)s*)PyArray_DATA(weight))[j] +=
                    *((%(float_type)s*)PyArray_DATA(local_weight) +
                      i * weight_dim[1] + j);
            }
        }
        Py_DECREF(local_weight);
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        // bottom is set to zero here rather than inside of col2im
        PyArray_FILLWBYTE(bottom, 0);
        // full convolution: gemm, then col2im3d
        // Iterate over batch
        int blas_threads_saved = %(blas_get_num_threads)s;
        // Always force gemm to one thread when OpenMP is enabled, for best
        // and stable performance (parallelism comes from the batch loop).
        %(blas_set_num_threads)s(1);
        %(omp_flags)s
        for (int n = 0; n < batchSize; ++n) {
            // gemm into columns
            int tid = %(omp_get_thread_num)s;
            %(gemm)s(&NTrans, &Trans,
                     &N_, &K_, &M_,
                     &one,
                     (%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_,
                     (%(float_type)s*)PyArray_DATA(weight), &K_,
                     &zero,
                     (%(float_type)s*)PyArray_DATA(col) + tid * col_stride, &N_);
            // col2im back to the data
            col2im3d((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD, padH, padW, padD, dH, dW, dD,
                     (%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride);
        }
        // Restore to previous blas threads
        %(blas_set_num_threads)s(blas_threads_saved);
    }
    else {
        // defensive: previously an invalid direction returned an
        // uninitialized pointer; now it is reported as an error
        PyErr_SetString(PyExc_ValueError,
                "Corr3dMM: direction must be 0, 1 or 2\n");
        output = NULL;
    }
    // Free temporary columns
    Py_DECREF(col);
    // decref from contiguous check
    Py_DECREF(bottom);
    Py_DECREF(weight);
    Py_DECREF(top);
    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseCorr3dMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
...@@ -10,6 +10,8 @@ from theano.gof.opt import copy_stack_trace ...@@ -10,6 +10,8 @@ from theano.gof.opt import copy_stack_trace
from theano.tensor.nnet.corr import ( from theano.tensor.nnet.corr import (
CorrMM, CorrMM_gradInputs, CorrMM_gradWeights) CorrMM, CorrMM_gradInputs, CorrMM_gradWeights)
from theano.tensor.nnet.corr3d import (
Corr3dMM, Corr3dMM_gradInputs, Corr3dMM_gradWeights)
from theano.tensor.nnet.blocksparse import ( from theano.tensor.nnet.blocksparse import (
SparseBlockGemv, SparseBlockGemv,
SparseBlockOuter, SparseBlockOuter,
...@@ -90,6 +92,28 @@ def local_abstractconv_gemm(node): ...@@ -90,6 +92,28 @@ def local_abstractconv_gemm(node):
return [rval] return [rval]
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm(node):
    """Replace an AbstractConv3d node with the BLAS-based Corr3dMM op."""
    # The GEMM-based op needs a C++ compiler and a BLAS library.
    if theano.config.cxx == "" or not theano.config.blas.ldflags:
        return
    op = node.op
    if not isinstance(op, AbstractConv3d):
        return None
    img, kern = node.inputs
    if not (isinstance(img.type, TensorType) and
            isinstance(kern.type, TensorType)):
        return None
    if op.filter_flip:
        # Corr3dMM performs correlation, so flip the kernel on each
        # spatial axis to obtain a convolution.
        kern = kern[:, :, ::-1, ::-1, ::-1]
    rval = Corr3dMM(border_mode=op.border_mode,
                    subsample=op.subsample,
                    filter_dilation=op.filter_dilation)(img, kern)
    copy_stack_trace(node.outputs[0], rval)
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights]) @local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweight_gemm(node): def local_abstractconv_gradweight_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags: if theano.config.cxx == "" or not theano.config.blas.ldflags:
...@@ -115,6 +139,31 @@ def local_abstractconv_gradweight_gemm(node): ...@@ -115,6 +139,31 @@ def local_abstractconv_gradweight_gemm(node):
return [rval] return [rval]
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweight_gemm(node):
    """Replace AbstractConv3d_gradWeights with Corr3dMM_gradWeights."""
    # The GEMM-based op needs a C++ compiler and a BLAS library.
    if theano.config.cxx == "" or not theano.config.blas.ldflags:
        return
    op = node.op
    if not isinstance(op, AbstractConv3d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not (isinstance(img.type, TensorType) and
            isinstance(topgrad.type, TensorType)):
        return None
    rval = Corr3dMM_gradWeights(border_mode=op.border_mode,
                                subsample=op.subsample,
                                filter_dilation=op.filter_dilation)(img, topgrad,
                                                                    shape)
    copy_stack_trace(node.outputs[0], rval)
    if op.filter_flip:
        # Corr3dMM computed a correlation gradient; flip it back so it
        # matches the convolution kernel layout expected by callers.
        rval = rval[:, :, ::-1, ::-1, ::-1]
    rval = theano.tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    copy_stack_trace(node.outputs[0], rval)
    return [rval]
@local_optimizer([AbstractConv2d_gradInputs]) @local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node): def local_abstractconv_gradinputs_gemm(node):
if theano.config.cxx == "" or not theano.config.blas.ldflags: if theano.config.cxx == "" or not theano.config.blas.ldflags:
...@@ -138,6 +187,29 @@ def local_abstractconv_gradinputs_gemm(node): ...@@ -138,6 +187,29 @@ def local_abstractconv_gradinputs_gemm(node):
return [rval] return [rval]
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node):
    """Replace AbstractConv3d_gradInputs with Corr3dMM_gradInputs."""
    # The GEMM-based op needs a C++ compiler and a BLAS library.
    if theano.config.cxx == "" or not theano.config.blas.ldflags:
        return
    op = node.op
    if not isinstance(op, AbstractConv3d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not (isinstance(kern.type, TensorType) and
            isinstance(topgrad.type, TensorType)):
        return None
    if op.filter_flip:
        # Corr3dMM performs correlation, so flip the kernel on each
        # spatial axis to obtain a convolution.
        kern = kern[:, :, ::-1, ::-1, ::-1]
    rval = Corr3dMM_gradInputs(border_mode=op.border_mode,
                               subsample=op.subsample,
                               filter_dilation=op.filter_dilation)(kern, topgrad,
                                                                   shape)
    copy_stack_trace(node.outputs[0], rval)
    return [rval]
@local_optimizer([AbstractConv2d]) @local_optimizer([AbstractConv2d])
def local_conv2d_cpu(node): def local_conv2d_cpu(node):
...@@ -481,6 +553,14 @@ conv_groupopt.register('local_abstractconv_gradweight_gemm', ...@@ -481,6 +553,14 @@ conv_groupopt.register('local_abstractconv_gradweight_gemm',
conv_groupopt.register('local_abstractconv_gradinputs_gemm', conv_groupopt.register('local_abstractconv_gradinputs_gemm',
local_abstractconv_gradinputs_gemm, 30, local_abstractconv_gradinputs_gemm, 30,
'conv_gemm', 'fast_compile', 'fast_run') 'conv_gemm', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gemm', local_abstractconv3d_gemm, 30,
'conv_gemm', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gradweight_gemm',
local_abstractconv3d_gradweight_gemm, 30,
'conv_gemm', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm', 'fast_compile', 'fast_run')
# Legacy convolution # Legacy convolution
conv_groupopt.register('local_conv2d_cpu', local_conv2d_cpu, 40, conv_groupopt.register('local_conv2d_cpu', local_conv2d_cpu, 40,
'fast_compile', 'fast_run') 'fast_compile', 'fast_run')
......
...@@ -20,6 +20,8 @@ from theano.tensor.nnet.abstract_conv import bilinear_upsampling ...@@ -20,6 +20,8 @@ from theano.tensor.nnet.abstract_conv import bilinear_upsampling
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.corr import (CorrMM, CorrMM_gradWeights, from theano.tensor.nnet.corr import (CorrMM, CorrMM_gradWeights,
CorrMM_gradInputs) CorrMM_gradInputs)
from theano.tensor.nnet.corr3d import (Corr3dMM, Corr3dMM_gradWeights,
Corr3dMM_gradInputs)
from theano.tensor.nnet.Conv3D import Conv3D from theano.tensor.nnet.Conv3D import Conv3D
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.tensor.nnet.ConvTransp3D import ConvTransp3D from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
...@@ -734,11 +736,9 @@ class TestCorrConv3d(BaseTestConv3d): ...@@ -734,11 +736,9 @@ class TestCorrConv3d(BaseTestConv3d):
BaseTestConv3d.setup_class() BaseTestConv3d.setup_class()
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)): def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
if b not in ((0, 0, 0), 'valid'):
raise SkipTest("Only border_mode valid is implemented for basic cpu Conv3D.")
if fd != (1, 1, 1):
raise SkipTest("No dilation implementation for basic cpu Conv3D.")
o = self.get_output_shape(i, f, s, b, fd) o = self.get_output_shape(i, f, s, b, fd)
if fd != (1, 1, 1):
raise SkipTest("No reference implementation for 3D dilation.")
if (not theano.config.blas.ldflags or if (not theano.config.blas.ldflags or
not theano.config.cxx or not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"): theano.config.mode == "FAST_COMPILE"):
...@@ -746,17 +746,17 @@ class TestCorrConv3d(BaseTestConv3d): ...@@ -746,17 +746,17 @@ class TestCorrConv3d(BaseTestConv3d):
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s, self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, provide_shape=provide_shape, verify_grad=True, provide_shape=provide_shape,
border_mode=b, filter_flip=flip, border_mode=b, filter_flip=flip,
target_op=Conv3D, check_trace=True, target_op=Corr3dMM, check_trace=True,
filter_dilation=fd) filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f, self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=True, output_shape=o, subsample=s, verify_grad=True,
provide_shape=provide_shape, border_mode=b, provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=ConvGrad3D, filter_flip=flip, target_op=Corr3dMM_gradWeights,
check_trace=True, filter_dilation=fd) check_trace=True, filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f, self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=True, output_shape=o, subsample=s, verify_grad=True,
provide_shape=provide_shape, border_mode=b, provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=ConvTransp3D, filter_flip=flip, target_op=Corr3dMM_gradInputs,
check_trace=True, filter_dilation=fd) check_trace=True, filter_dilation=fd)
...@@ -764,7 +764,6 @@ class TestCpuConv3d(BaseTestConv3d): ...@@ -764,7 +764,6 @@ class TestCpuConv3d(BaseTestConv3d):
@classmethod @classmethod
def setup(cls): def setup(cls):
BaseTestConv3d.setup_class() BaseTestConv3d.setup_class()
# TODO check how conv_gemm works for conv3d
cls.mode = theano.compile.mode.get_default_mode().excluding('conv_gemm') cls.mode = theano.compile.mode.get_default_mode().excluding('conv_gemm')
cls.opt_err = theano.config.on_opt_error cls.opt_err = theano.config.on_opt_error
theano.config.on_opt_error = 'ignore' theano.config.on_opt_error = 'ignore'
......
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
from nose.plugins.attrib import attr
from nose.tools import assert_equals
import numpy
from six import integer_types
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr3d, conv
class TestCorr3D(utt.InferShapeTester):
    """Tests for the CPU ``Corr3dMM`` ops and their gradients.

    Each correlation is checked against a naive numpy reference
    implementation built inside :meth:`validate`; shape inference is
    exercised through ``InferShapeTester._compile_and_check``.
    """
    # FAST_COMPILE does not compile the C ops, so force FAST_RUN there.
    if theano.config.mode == "FAST_COMPILE":
        mode = theano.compile.get_mode("FAST_RUN")
    else:
        mode = None
    dtype = theano.config.floatX
    def setUp(self):
        """Create the symbolic 5D inputs shared by all tests and skip
        when the required build tools are unavailable."""
        super(TestCorr3D, self).setUp()
        self.input = T.tensor5('input', dtype=self.dtype)
        self.input.name = 'default_V'
        self.filters = T.tensor5('filters', dtype=self.dtype)
        self.filters.name = 'default_filters'
        if not conv.imported_scipy_signal and theano.config.cxx == "":
            raise SkipTest("Corr3dMM tests need SciPy or a c++ compiler")
        if not theano.config.blas.ldflags:
            raise SkipTest("Corr3dMM tests need a BLAS")
    def validate(self, image_shape, filter_shape,
                 border_mode='valid', subsample=(1, 1, 1),
                 input=None, filters=None, verify_grad=True,
                 non_contiguous=False, filter_dilation=(1, 1, 1)):
        """
        Run Corr3dMM and compare it against a naive numpy reference.

        :param image_shape: The constant shape info passed to corr3dMM.
        :param filter_shape: The constant shape info passed to corr3dMM.
        :param border_mode: 'valid', 'full', 'half', an int, or a 3-tuple
            of per-axis paddings.
        :param subsample: strides along the three spatial axes.
        :param verify_grad: also check gradients with ``utt.verify_grad``.
        :param non_contiguous: feed non-C-contiguous inputs to the op.
        :param filter_dilation: filter dilation along the spatial axes.
        """
        N_image_shape = [T.get_scalar_constant_value(T.as_tensor_variable(x))
                         for x in image_shape]
        N_filter_shape = [T.get_scalar_constant_value(T.as_tensor_variable(x))
                          for x in filter_shape]
        if input is None:
            input = self.input
        if filters is None:
            filters = self.filters
        # THEANO IMPLEMENTATION
        # we create a symbolic function so that verify_grad can work
        def sym_Corr3dMM(input, filters):
            # define theano graph and function
            input.name = 'input'
            filters.name = 'filters'
            rval = corr3d.Corr3dMM(border_mode, subsample,
                                   filter_dilation)(input, filters)
            rval.name = 'corr_output'
            return rval
        output = sym_Corr3dMM(input, filters)
        output.name = 'Corr3dMM()(%s,%s)' % (input.name, filters.name)
        theano_corr = theano.function([input, filters], output, mode=self.mode)
        # initialize input and compute result
        image_data = numpy.random.random(N_image_shape).astype(self.dtype)
        filter_data = numpy.random.random(N_filter_shape).astype(self.dtype)
        image_data /= 10
        filter_data /= 10
        if non_contiguous:
            # A double transpose keeps the values but produces
            # non-C-contiguous arrays.
            image_data = numpy.transpose(image_data, axes=(0, 1, 4, 3, 2))
            image_data = image_data.copy()
            image_data = numpy.transpose(image_data, axes=(0, 1, 4, 3, 2))
            filter_data = numpy.transpose(filter_data, axes=(0, 1, 4, 3, 2))
            filter_data = filter_data.copy()
            filter_data = numpy.transpose(filter_data, axes=(0, 1, 4, 3, 2))
            assert not image_data.flags['CONTIGUOUS']
            assert not filter_data.flags['CONTIGUOUS']
        theano_output = theano_corr(image_data, filter_data)
        # REFERENCE IMPLEMENTATION
        # Testing correlation, not convolution. Reverse filters.
        filter_data_corr = numpy.array(filter_data[:, :, ::-1, ::-1, ::-1],
                                       copy=True,
                                       order='C')
        orig_image_data = image_data
        img_shape3d = numpy.array(N_image_shape[-3:])
        fil_shape3d = numpy.array(N_filter_shape[-3:])
        dil_shape3d = numpy.array(filter_dilation)
        # effective kernel extent once dilation is applied
        dil_fil_shape3d = (fil_shape3d - 1) * dil_shape3d + 1
        subsample3d = numpy.array(subsample)
        # translate border_mode into a per-axis zero-padding amount
        if border_mode == 'full':
            padHWD = (dil_fil_shape3d - 1)
        elif border_mode == 'valid':
            padHWD = numpy.array([0, 0, 0])
        elif border_mode == 'half':
            padHWD = numpy.floor(dil_fil_shape3d / 2).astype('int32')
        elif isinstance(border_mode, tuple):
            padHWD = numpy.array(border_mode)
        elif isinstance(border_mode, integer_types):
            padHWD = numpy.array([border_mode, border_mode, border_mode])
        else:
            raise NotImplementedError('Unsupported border_mode {}'.format(border_mode))
        out_shape3d = numpy.floor((img_shape3d + 2 * (padHWD) - dil_fil_shape3d) / subsample3d) + 1
        # avoid numpy deprecation
        out_shape3d = out_shape3d.astype('int32')
        out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape3d)
        ref_output = numpy.zeros(out_shape)
        # loop over output feature maps
        ref_output.fill(0)
        # zero-pad the input so the loops below can index freely
        image_data2 = numpy.zeros((N_image_shape[0], N_image_shape[1],
                                   N_image_shape[2] + 2 * padHWD[0],
                                   N_image_shape[3] + 2 * padHWD[1],
                                   N_image_shape[4] + 2 * padHWD[2]))
        image_data2[:, :,
                    padHWD[0]:padHWD[0] + N_image_shape[2],
                    padHWD[1]:padHWD[1] + N_image_shape[3],
                    padHWD[2]:padHWD[2] + N_image_shape[4]] = image_data
        image_data = image_data2
        N_image_shape = image_data.shape
        # naive 6-level correlation loop; filter3d is re-flipped inside
        # the sum, so the net effect is correlation with the original
        # filter values
        for bb in range(N_image_shape[0]):
            for nn in range(N_filter_shape[0]):
                for im0 in range(N_image_shape[1]):
                    filter3d = filter_data_corr[nn, im0, :, :, :]
                    image3d = image_data[bb, im0, :, :, :]
                    for row in range(ref_output.shape[2]):
                        irow = row * subsample[0]  # image row
                        for col in range(ref_output.shape[3]):
                            icol = col * subsample[1]  # image col
                            for slc in range(ref_output.shape[4]):
                                islc = slc * subsample[2]  # image slice
                                ref_output[bb, nn, row, col, slc] += (image3d[
                                    irow:irow + dil_fil_shape3d[0]:filter_dilation[0],
                                    icol:icol + dil_fil_shape3d[1]:filter_dilation[1],
                                    islc:islc + dil_fil_shape3d[2]:filter_dilation[2]
                                ] * filter3d[::-1, ::-1, ::-1]
                                ).sum()
        utt.assert_allclose(theano_output, ref_output)
        # TEST GRADIENT
        if verify_grad:
            utt.verify_grad(sym_Corr3dMM, [orig_image_data, filter_data],
                            mode=self.mode)
    @attr('slow')
    def test_basic(self):
        """
        Tests that basic correlations work for odd and even
        dimensions of image and filter shapes, as well as rectangular
        images and filters.
        """
        border_modes = ['valid', 'full', 'half', (1, 1, 1),
                        (2, 1, 1), (1, 2, 1), (1, 1, 2),
                        (3, 3, 3), 1]
        img_shapes = [(2, 2, 3, 3, 3), (3, 2, 8, 8, 8), (3, 2, 7, 5, 5), (3, 2, 7, 5, 5),
                      (1, 2, 8, 8, 8), (1, 2, 7, 5, 5)]
        fil_shapes = [(2, 2, 2, 2, 2), (1, 2, 5, 5, 5), (2, 2, 2, 3, 2), (2, 2, 3, 2, 2),
                      (1, 2, 5, 5, 5), (1, 2, 2, 3, 3)]
        for border_mode in border_modes:
            for img, fil in zip(img_shapes, fil_shapes):
                self.validate(img, fil, border_mode, verify_grad=False)
        # Very slow on with 'full' or 'half'
        self.validate((1, 10, 213, 129, 129), (46, 10, 212, 1, 1), 'valid', verify_grad=False)
    def test_img_kernel_same_shape(self):
        """Correlation where the image and the kernel have equal shapes."""
        self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), 'full')
        self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), 'valid')
        self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), 'half')
        self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), (1, 1, 1))
        self.validate((3, 2, 3, 3, 3), (1, 2, 3, 3, 3), 1)
    @attr('slow')
    def test_subsample(self):
        """
        Tests correlation where subsampling != (1,1,1)
        """
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'valid', subsample=(2, 2, 2))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'valid', subsample=(2, 1, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 'valid', subsample=(3, 3, 3))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'full', subsample=(2, 2, 2))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'full', subsample=(2, 1, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 'full', subsample=(3, 3, 3))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'half', subsample=(2, 2, 2))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'half', subsample=(2, 1, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 'half', subsample=(3, 3, 3))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (1, 1, 1), subsample=(2, 2, 2))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (2, 1, 1), subsample=(2, 1, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), (1, 2, 2), subsample=(3, 3, 3))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 1, subsample=(3, 3, 3))
    def test_filter_dilation(self):
        """
        Tests correlation where filter dilation != (1,1,1)
        """
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'valid', filter_dilation=(2, 2, 2))
        self.validate((3, 2, 14, 10, 10), (2, 2, 2, 3, 3), 'valid', filter_dilation=(3, 1, 1))
        self.validate((1, 1, 14, 14, 14), (1, 1, 3, 3, 3), 'valid', filter_dilation=(2, 3, 3))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'full', filter_dilation=(2, 2, 2))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'full', filter_dilation=(3, 1, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 'full', filter_dilation=(2, 3, 3))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'half', filter_dilation=(2, 2, 2))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), 'half', filter_dilation=(3, 1, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 'half', filter_dilation=(2, 3, 3))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (1, 1, 1), filter_dilation=(2, 2, 2))
        self.validate((3, 2, 7, 5, 5), (2, 2, 2, 3, 3), (2, 1, 1), filter_dilation=(2, 1, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), (1, 2, 1), filter_dilation=(1, 2, 1))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), (1, 1, 2), filter_dilation=(1, 1, 2))
        self.validate((1, 1, 6, 6, 6), (1, 1, 3, 3, 3), 1, subsample=(3, 3, 3), filter_dilation=(2, 2, 2))
    @attr('slow')
    def test_shape_Constant_tensor(self):
        """
        Tests correlation where the {image,filter}_shape is a Constant tensor.
        """
        as_t = T.as_tensor_variable
        border_modes = ['valid', 'full', 'half', (1, 1, 1), (2, 1, 1),
                        (1, 2, 1), (1, 1, 2), (3, 3, 3), 1]
        for border_mode in border_modes:
            self.validate((as_t(3), as_t(2), as_t(7), as_t(5), as_t(5)),
                          (5, 2, 2, 3, 3), border_mode)
            self.validate(as_t([3, 2, 7, 5, 5]), (5, 2, 2, 3, 3), border_mode)
            self.validate(as_t((3, 2, 7, 5, 5)), (5, 2, 2, 3, 3), border_mode)
            self.validate((3, 2, 7, 5, 5), (as_t(5), as_t(2), as_t(2),
                          as_t(3), as_t(3)), 'valid')
            self.validate((3, 2, 7, 5, 5), as_t([5, 2, 2, 3, 3]), border_mode)
            self.validate(as_t([3, 2, 7, 5, 5]), as_t([5, 2, 2, 3, 3]), border_mode)
    def test_invalid_filter_shape(self):
        """
        Tests scenario where filter_shape[1] != input_shape[1]
        """
        self.assertRaises(ValueError, self.validate,
                          (3, 2, 8, 8, 8), (4, 3, 5, 5, 8),
                          'valid')
    def test_full_mode(self):
        """
        Tests basic correlation in full mode and case where filter
        is larger than the input image.
        """
        self.validate((3, 2, 5, 5, 5), (4, 2, 8, 8, 8), 'full')
        def f():
            # 'valid' mode must reject a filter larger than the image
            self.validate((3, 2, 5, 5, 5), (4, 2, 8, 8, 8), 'valid')
        self.assertRaises(Exception, f)
    def test_wrong_input(self):
        """
        Make sure errors are raised when image and kernel are not 5D tensors
        """
        self.assertRaises(Exception, self.validate, (3, 2, 8, 8, 8), (4, 2, 5, 5, 5),
                          'valid', input=T.dmatrix())
        self.assertRaises(Exception, self.validate, (3, 2, 8, 8, 8), (4, 2, 5, 5, 5),
                          'valid', filters=T.dvector())
        self.assertRaises(Exception, self.validate, (3, 2, 8, 8, 8), (4, 2, 5, 5, 5),
                          'valid', input=T.dtensor3())
        self.assertRaises(Exception, self.validate, (3, 2, 8, 8, 8), (4, 2, 5, 5, 5),
                          'valid', input=T.dtensor4())
    def test_dtype_upcast(self):
        """
        Checks dtype upcast for Corr3dMM methods.
        """
        def rand(shape, dtype='float64'):
            # uniform values in [-1, 1)
            r = numpy.asarray(numpy.random.rand(*shape), dtype=dtype)
            return r * 2 - 1
        ops = [corr3d.Corr3dMM, corr3d.Corr3dMM_gradWeights, corr3d.Corr3dMM_gradInputs]
        a_shapes = [[4, 5, 6, 3, 3], [1, 5, 6, 3, 3], [1, 5, 6, 3, 3]]
        b_shapes = [[7, 5, 3, 2, 2], [1, 5, 3, 1, 1], [7, 1, 3, 1, 1]]
        dtypes = ['float32', 'float64']
        for op, a_shape, b_shape in zip(ops, a_shapes, b_shapes):
            for a_dtype in dtypes:
                for b_dtype in dtypes:
                    c_dtype = theano.scalar.upcast(a_dtype, b_dtype)
                    a_tens = T.tensor5(dtype=a_dtype)
                    b_tens = T.tensor5(dtype=b_dtype)
                    a_tens_val = rand(a_shape, dtype=a_dtype)
                    b_tens_val = rand(b_shape, dtype=b_dtype)
                    c_tens = op()(a_tens, b_tens)
                    f = theano.function([a_tens, b_tens], c_tens, mode=self.mode)
                    assert_equals(f(a_tens_val, b_tens_val).dtype, c_dtype)
    @attr('slow')
    def test_infer_shape_forward(self):
        """Shape inference for the forward Corr3dMM op."""
        if theano.config.mode == "FAST_COMPILE":
            raise SkipTest("Corr3dMM don't work in FAST_COMPILE")
        def rand(*shape):
            r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
            return r * 2 - 1
        corr3dMM = corr3d.Corr3dMM
        adtens = T.dtensor5()
        bdtens = T.dtensor5()
        aivec_vals = [[4, 5, 6, 3, 3], [6, 2, 8, 3, 3], [3, 6, 7, 5, 5],
                      [3, 6, 7, 5, 5], [5, 2, 4, 3, 3]]
        bivec_vals = [[7, 5, 3, 2, 2], [4, 2, 5, 3, 3], [5, 6, 3, 2, 2],
                      [5, 6, 2, 3, 3], [6, 2, 4, 3, 3]]
        modes = ['valid', 'full', 'half', (1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2), 1]
        subsamples = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)]
        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # Corr3dMM
                    cdtens = corr3dMM(border_mode=mode, subsample=subsample)(adtens, bdtens)
                    self._compile_and_check([adtens, bdtens],
                                            [cdtens],
                                            [adtens_val, bdtens_val], corr3dMM,
                                            warn=False)
    @attr('slow')
    def test_infer_shape_gradW(self):
        """Shape inference for Corr3dMM_gradWeights."""
        if theano.config.mode == "FAST_COMPILE":
            raise SkipTest("Corr3dMM don't work in FAST_COMPILE")
        def rand(*shape):
            r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
            return r * 2 - 1
        corr3dMM = corr3d.Corr3dMM
        gradW = corr3d.Corr3dMM_gradWeights
        adtens = T.dtensor5()
        bdtens = T.dtensor5()
        aivec_vals = [[1, 5, 6, 3, 3], [8, 2, 7, 3, 3], [1, 6, 9, 4, 4],
                      [9, 6, 8, 5, 5], [9, 1, 6, 8, 8]]
        bivec_vals = [[7, 5, 3, 1, 1], [4, 2, 5, 3, 3], [12, 6, 3, 2, 2],
                      [5, 6, 1, 3, 3], [11, 1, 3, 3, 3]]
        modes = ['valid', 'full', 'half', (1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2), 1]
        subsamples = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)]
        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # Corr3dMM: compute a forward output to feed the gradient op
                    cdtens = corr3dMM(border_mode=mode, subsample=subsample)(adtens, bdtens)
                    f = theano.function([adtens, bdtens], cdtens)
                    cdtens_val = f(adtens_val, bdtens_val)
                    # Corr3dMM_gradWeights
                    shape = (theano.shared(bivec_val[2]), theano.shared(bivec_val[3]),
                             theano.shared(bivec_val[4]))
                    bdtens_g = gradW(border_mode=mode,
                                     subsample=subsample)(adtens, cdtens, shape=shape)
                    self._compile_and_check([adtens, cdtens],
                                            [bdtens_g],
                                            [adtens_val, cdtens_val], gradW,
                                            warn=False)
    @attr('slow')
    def test_infer_shape_gradI(self):
        """Shape inference for Corr3dMM_gradInputs."""
        if theano.config.mode == "FAST_COMPILE":
            raise SkipTest("Corr3dMM don't work in FAST_COMPILE")
        def rand(*shape):
            r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
            return r * 2 - 1
        corr3dMM = corr3d.Corr3dMM
        gradI = corr3d.Corr3dMM_gradInputs
        adtens = T.dtensor5()
        bdtens = T.dtensor5()
        aivec_vals = [[1, 5, 6, 3, 3], [8, 2, 7, 3, 3], [1, 6, 9, 4, 4],
                      [9, 6, 8, 5, 5], [9, 1, 6, 8, 8]]
        bivec_vals = [[7, 5, 3, 1, 1], [4, 2, 5, 3, 3], [12, 6, 3, 2, 2],
                      [5, 6, 1, 3, 3], [7, 1, 3, 4, 4]]
        modes = ['valid', 'full', 'half', (1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2), 1]
        subsamples = [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)]
        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # Corr3dMM: compute a forward output to feed the gradient op
                    cdtens = corr3dMM(border_mode=mode, subsample=subsample)(adtens, bdtens)
                    f = theano.function([adtens, bdtens], cdtens)
                    cdtens_val = f(adtens_val, bdtens_val)
                    # Corr3dMM_gradInputs
                    shape = (theano.shared(aivec_val[2]), theano.shared(aivec_val[3]),
                             theano.shared(aivec_val[4]))
                    adtens_g = gradI(border_mode=mode,
                                     subsample=subsample)(bdtens, cdtens, shape=shape)
                    self._compile_and_check([bdtens, cdtens],
                                            [adtens_g],
                                            [bdtens_val, cdtens_val], gradI,
                                            warn=False)
    def test_non_contiguous(self):
        """Correlation with non-C-contiguous input and filter arrays."""
        self.validate((2, 2, 3, 3, 3), (2, 2, 2, 2, 2), 'valid', non_contiguous=True)
        self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), 'valid', non_contiguous=True)
        self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), 'valid', non_contiguous=True)
        self.validate((3, 2, 7, 5, 5), (5, 2, 3, 2, 2), 'valid', non_contiguous=True)
        self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), 'full', non_contiguous=True)
        self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), 'full', non_contiguous=True)
        self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), 'half', non_contiguous=True)
        self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), 'half', non_contiguous=True)
        self.validate((3, 2, 8, 8, 8), (4, 2, 5, 5, 5), (1, 1, 1), non_contiguous=True)
        self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), (1, 1, 2), non_contiguous=True)
        self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), (1, 2, 1), non_contiguous=True)
        self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), (2, 1, 1), non_contiguous=True)
        self.validate((3, 2, 7, 5, 5), (5, 2, 2, 3, 3), 2, non_contiguous=True)
if __name__ == '__main__':
    # Quick smoke test when the file is executed directly.
    t = TestCorr3D('setUp')
    t.setUp()
    # Bug fix: the class defines no ``test_infer_shape`` method (it was
    # split into forward/gradW/gradI variants), so the old call raised
    # AttributeError. Run the forward shape-inference test instead.
    t.test_infer_shape_forward()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论