提交 0c599015 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #3285 from JesseLivezey/cpu_corr3

[WIP] CpuCorrMM closes #3026 - redux
...@@ -120,6 +120,13 @@ TODO: Give examples on how to use these things! They are pretty complicated.
    available. To explicitly disable the graph optimizer, set
    ``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment.
    If using it, please see the warning about a bug in CUDA 5.0 to 6.0 below.
- :func:`CorrMM <theano.tensor.nnet.corr.CorrMM>`
This is a CPU-only 2d correlation implementation taken from
`caffe <https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cpp>`_
and also used by Torch. It does not flip the kernel. As it provides a gradient,
you can use it as a replacement for nnet.conv2d. There is currently no
optimization to move this to GPU. This will be added when the new convolution
interface is finished.
- :func:`dnn_conv <theano.sandbox.cuda.dnn.dnn_conv>` GPU-only
    convolution using NVIDIA's cuDNN library. This requires that you have
    cuDNN installed and available, which in turn requires CUDA 6.5 and a GPU
......
import os
import logging
import theano
from theano import Apply
from theano import gof
from theano.tensor import as_tensor_variable, TensorType
from theano.tensor.blas_headers import blas_header_text
from theano.tensor.blas import ldflags
_logger = logging.getLogger(__name__)
class BaseCorrMM(gof.Op):
    """
    Base class for `CorrMM`, `CorrMM_gradWeights` and
    `CorrMM_gradInputs`. Cannot be used directly.

    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size could be directly specified by an
        integer or a pair of integers.
    subsample
        Perform subsampling of the output (default: (1, 1)).
    """
    # The C implementation does not use broadcasting information.
    check_broadcast = False
    __props__ = ('border_mode', 'subsample')

    def __init__(self, border_mode="valid", subsample=(1, 1)):
        # A single integer means the same padding on both dimensions.
        if isinstance(border_mode, int):
            if border_mode < 0:
                raise ValueError(
                    'invalid border_mode {}, which must be a '
                    'non-negative integer'.format(border_mode))
            border_mode = (border_mode, border_mode)
        if isinstance(border_mode, tuple):
            if len(border_mode) != 2 or border_mode[0] < 0 or border_mode[1] < 0:
                raise ValueError(
                    'invalid border_mode {}, which must be a '
                    'pair of non-negative integers'.format(border_mode))
            # Normalize to a pair of plain ints so __props__ hashing is stable.
            pad_h, pad_w = map(int, border_mode)
            border_mode = (pad_h, pad_w)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full', 'half')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer or a pair of'
                ' integers'.format(border_mode))
        self.border_mode = border_mode
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        self.subsample = subsample

    @property
    def pad(self):
        """Explicit padding: the border_mode pair, or (0, 0) for 'valid'."""
        if self.border_mode != 'valid':
            return self.border_mode
        return (0, 0)

    def __str__(self):
        return '%s{%s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample))

    def c_support_code(self):
        return blas_header_text()

    def c_libraries(self):
        return ldflags()

    def c_compile_args(self):
        return ldflags(libs=False, flags=True)

    def c_lib_dirs(self):
        return ldflags(libs=False, libs_dir=True)

    def c_header_dirs(self):
        return ldflags(libs=False, include_dir=True)

    def c_headers(self):
        return ['<stdio.h>']

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (1, 0)

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
        # these files
        sub = {}
        # Use the public `inputs` attribute rather than reaching into
        # node.__dict__ (they resolve to the same list).
        dtype = str(node.inputs[0].dtype)
        assert dtype in ('float32', 'float64')
        # Select the BLAS routine and C types matching the input dtype.
        if dtype == 'float32':
            sub['gemm'] = 'sgemm_'
            sub['float_type'] = 'npy_float'
            sub['float_typenum'] = 'NPY_FLOAT'
            sub['n_bytes'] = 4
            sub['c_float_type'] = 'float'
        else:
            sub['gemm'] = 'dgemm_'
            sub['float_type'] = 'npy_double'
            sub['float_typenum'] = 'NPY_DOUBLE'
            sub['n_bytes'] = 8
            sub['c_float_type'] = 'double'
        # Load the C template(s) shipped next to this module and fill in
        # the dtype-dependent placeholders.
        files = ['corr_gemm.c']
        codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
                 for f in files]
        final_code = ''
        for code in codes:
            final_code += code
        return final_code % sub

    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
        """
        This generates the C code for CorrMM (direction="forward"),
        CorrMM_gradWeights (direction="backprop weights"), and
        CorrMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        :param bottom: Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        :param weights: Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        :param top: Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        :param direction: "forward" to correlate bottom with weights and store
            results in top,
            "backprop weights" to do a valid convolution of bottom with top
            (swapping the first two dimensions) and store results in weights,
            and "backprop inputs" to do a full convolution of top with weights
            (swapping the first two dimensions) and store results in bottom.
        :param sub: Dictionary of substitutions useable to help generating the
            C code.
        :param height: If self.subsample[0] != 1, a variable giving the height
            of the filters for direction="backprop weights" or the height of
            the input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the height of the
            filters for direction="backprop weights". Ignored otherwise.
        :param width: If self.subsample[1] != 1, a variable giving the width
            of the filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the width of the
            filters for direction="backprop weights". Ignored otherwise.
        """
        dH, dW = self.subsample
        # Encode border_mode into padding markers the C code understands:
        # -1 means "half", -2 means "full", >= 0 is an explicit amount.
        if self.border_mode == "half":
            padH = padW = -1
        elif self.border_mode == "full":
            padH = padW = -2
        elif isinstance(self.border_mode, tuple):
            padH, padW = self.border_mode
        else:
            assert self.border_mode == "valid"
            padH = padW = 0
        # Map the direction string to the integer code used in C, and
        # pick which variable receives the result.
        if direction == "forward":
            direction = 0
            out = top
        elif direction == "backprop weights":
            direction = 1
            out = weights
        elif direction == "backprop inputs":
            direction = 2
            out = bottom
        else:
            raise ValueError("direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")
        # When subsampling, we cannot unambiguously infer the height and width
        # of bottom and weights from top, so we require them to be given.
        # Similarly, when border_mode="half", we cannot infer the weight size.
        if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
            if not height:
                raise ValueError("height must be given for backprop with vertical sampling or border_mode='half'")
            height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
        else:
            height = 'NULL'
        if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
            if not width:
                raise ValueError("width must be given for backprop with horizontal sampling or border_mode='half'")
            width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
        else:
            width = 'NULL'
        sub = sub.copy()
        sub.update(locals())

        return """
    // Mandatory args
    int direction = %(direction)s;  // forward, bprop weights, bprop inputs

    // Optional args
    int dH = %(dH)s;
    int dW = %(dW)s;
    int padH = %(padH)s;
    int padW = %(padW)s;

    PyArrayObject * bottom = %(bottom)s;
    PyArrayObject * weights = %(weights)s;
    PyArrayObject * top = %(top)s;
    PyArrayObject * out2 = NULL;

    // Obtain or infer kernel width and height
    // (we need to know it early to be able to handle auto-padding)
    int kH, kW;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
        kH = PyArray_DIMS(weights)[2];
        kW = PyArray_DIMS(weights)[3];
    }
    else {
        if ((dH != 1) || (padH == -1)) {
            // vertical subsampling or half padding, kernel height is specified
            kH = %(height)s;
        }
        else if (padH == -2) {
            // vertical full padding, we can infer the kernel height
            kH = 2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH;
        }
        else {
            // explicit padding, we can infer the kernel height
            kH = PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH;
        }
        if ((dW != 1) || (padW == -1)) {
            kW = %(width)s;
        }
        else if (padW == -2) {
            kW = 2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW;
        }
        else {
            kW = PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW;
        }
    }

    // Auto-padding if requested
    if (padH == -1) {  // vertical half padding
        padH = kH / 2;
    }
    else if (padH == -2) {  // vertical full padding
        padH = kH - 1;
    }
    else if (padH < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padH must be >= -2");
        %(fail)s
    }
    if (padW == -1) {  // horizontal half padding
        padW = kW / 2;
    }
    else if (padW == -2) {  // horizontal full padding
        padW = kW - 1;
    }
    else if (padW < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padW must be >= -2");
        %(fail)s
    }

    // Infer output shape
    npy_intp out_dim[4];
    switch(direction) {
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width)
        // height and width: top = (bottom + 2*pad - weight) / sample + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
        out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - PyArray_DIMS(weights)[2]) / dH + 1);
        out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - PyArray_DIMS(weights)[3]) / dW + 1);
        break;
    case 1:  // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width)
        // height and width: weights = bottom + 2*pad - (top - 1) * sample
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
        out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1];
        out_dim[2] = (npy_intp)kH;  // already inferred further above
        out_dim[3] = (npy_intp)kW;  // how convenient
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width)
        // height and width: bottom = (top - 1) * sample + weights - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
        out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + PyArray_DIMS(weights)[2] - 2*padH);
        out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + PyArray_DIMS(weights)[3] - 2*padW);
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n");
        %(fail)s
    }

    // Prepare output array
    int typenum;
    if ( !(%(out)s
           && PyArray_NDIM(%(out)s)==4
           && PyArray_IS_C_CONTIGUOUS(%(out)s)
           && PyArray_DIMS(%(out)s)[0]==out_dim[0]
           && PyArray_DIMS(%(out)s)[1]==out_dim[1]
           && PyArray_DIMS(%(out)s)[2]==out_dim[2]
           && PyArray_DIMS(%(out)s)[3]==out_dim[3]))
    {
        Py_XDECREF(%(out)s);
        if (direction != 1) {
            typenum = PyArray_TYPE(weights);
        }
        else {
            typenum = PyArray_TYPE(bottom);
        }
        %(out)s = (PyArrayObject*)PyArray_EMPTY(4,
                                                out_dim,
                                                typenum,
                                                0);
        if (NULL == %(out)s)
        {
            PyErr_Format(PyExc_RuntimeError,
                         "BaseCorrMM: Failed to allocate output of %%d x %%d x %%d x %%d",
                         out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
            %(fail)s
        }
    }

    // Call corrMM code
    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, padH, padW);
    if (out2==NULL){
        %(fail)s
    }
    assert (out2 == %(out)s);
""" % sub
class CorrMM(BaseCorrMM):
    """
    CPU correlation implementation using Matrix Multiplication.

    Parameters
    ----------
    border_mode
        Implicit zero padding of the input: a pair of non-negative
        integers giving the number of rows and columns of padding, a
        single integer used for both dimensions, or one of the string
        shortcuts ``'valid'`` (no padding), ``'full'``
        (``kernel_rows - 1, kernel_columns - 1``) or ``'half'``
        (``kernel_rows // 2, kernel_columns // 2``, a "same"
        convolution for odd-sized kernels). Each amount is applied on
        both sides of the corresponding dimension.
    subsample
        Output subsampling as a pair ``(sv, sh)``; equivalent to
        ``CorrMM(...)(...)[:, :, ::sv, ::sh]`` but computed directly.
        Use ``(1, 1)`` to disable subsampling.
    """

    def __init__(self, border_mode="valid", subsample=(1, 1)):
        super(CorrMM, self).__init__(border_mode, subsample)

    def make_node(self, img, kern):
        # Coerce to Theano variables and check rank.
        img = as_tensor_variable(img)
        kern = as_tensor_variable(kern)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        # Output inherits batch/filter broadcastability; spatial axes
        # are never broadcastable.
        out_bcast = [img.type.broadcastable[0],
                     kern.type.broadcastable[0],
                     False, False]
        out_type = TensorType(img.type.dtype, out_bcast)
        return Apply(self, [img, kern], [out_type()])

    def infer_shape(self, node, input_shape):
        # Translate border_mode into padding markers (-1: half, -2: full).
        mode = self.border_mode
        if mode == "half":
            pad_h = pad_w = -1
        elif mode == "full":
            pad_h = pad_w = -2
        elif isinstance(mode, tuple):
            pad_h, pad_w = mode
        else:
            assert mode == "valid"
            pad_h = pad_w = 0
        d_h, d_w = self.subsample
        bsize = input_shape[0][0]
        imshp = list(input_shape[0][2:])
        nkern = input_shape[1][0]
        kshp = list(input_shape[1][2:])
        k_h, k_w = kshp
        # Resolve the markers now that the kernel size is known.
        if pad_h == -1:
            pad_h = k_h // 2
        elif pad_h == -2:
            pad_h = k_h - 1
        elif pad_h < 0:
            raise ValueError("CorrMM: border_mode must be >= 0")
        if pad_w == -1:
            pad_w = k_w // 2
        elif pad_w == -2:
            pad_w = k_w - 1
        elif pad_w < 0:
            raise ValueError("CorrMM: border_mode must be >= 0")
        # top = (bottom + 2*pad - kernel) // stride + 1
        out_r = (imshp[0] + 2 * pad_h - kshp[0]) // d_h + 1
        out_c = (imshp[1] + 2 * pad_w - kshp[1]) // d_w + 1
        return [(bsize, nkern, out_r, out_c)]

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        top, = out_
        return super(CorrMM, self).c_code_helper(bottom, weights, top,
                                                 "forward", sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
        # Gradients are computed by the companion backprop ops.
        grad_inputs_op = CorrMM_gradInputs(self.border_mode, self.subsample)
        grad_weights_op = CorrMM_gradWeights(self.border_mode, self.subsample)
        d_bottom = grad_inputs_op(weights, top, bottom.shape[-2:])
        d_weights = grad_weights_op(bottom, top, weights.shape[-2:])
        return d_bottom, d_weights
class CorrMM_gradWeights(BaseCorrMM):
    """
    Gradient wrt. filters for `CorrMM`.

    Notes
    -----
    Not intended for direct use; Theano's automatic differentiation
    and graph optimizations insert it where needed.
    """

    def __init__(self, border_mode="valid", subsample=(1, 1)):
        super(CorrMM_gradWeights, self).__init__(border_mode, subsample)

    def make_node(self, img, topgrad, shape=None):
        img = as_tensor_variable(img)
        topgrad = as_tensor_variable(topgrad)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        # With subsampling or half padding the kernel shape cannot be
        # inferred from img/topgrad, so it must be passed explicitly.
        needs_shape = self.subsample != (1, 1) or self.border_mode == "half"
        if needs_shape and shape is None:
            raise ValueError('shape must be given if subsample != (1, 1)'
                             ' or border_mode == "half"')
        height_width = [shape[0], shape[1]] if needs_shape else []
        # Output is the filter gradient: (num_filters, num_channels, kH, kW).
        out_bcast = [topgrad.type.broadcastable[1],
                     img.type.broadcastable[1],
                     False, False]
        out_type = TensorType(img.type.dtype, out_bcast)
        return Apply(self, [img, topgrad] + height_width, [out_type()])

    def infer_shape(self, node, input_shape):
        # Translate border_mode into padding markers (-1: half, -2: full).
        mode = self.border_mode
        if mode == "half":
            pad_h = pad_w = -1
        elif mode == "full":
            pad_h = pad_w = -2
        elif isinstance(mode, tuple):
            pad_h, pad_w = mode
        else:
            assert mode == "valid"
            pad_h = pad_w = 0
        d_h, d_w = self.subsample
        nchannels = input_shape[0][1]
        imshp = list(input_shape[0][2:])
        nkern = input_shape[1][1]
        topshp = list(input_shape[1][2:])
        height_width = node.inputs[-2:]
        # Kernel height: given explicitly, or inferred from the shapes.
        if d_h != 1 or pad_h == -1:
            k_h = height_width[0]
        elif pad_h == -2:
            k_h = 2 - imshp[0] + (topshp[0] - 1) * d_h
        else:
            k_h = imshp[0] + 2 * pad_h - (topshp[0] - 1) * d_h
        # Same for the kernel width.
        if d_w != 1 or pad_w == -1:
            k_w = height_width[1]
        elif pad_w == -2:
            k_w = 2 - imshp[1] + (topshp[1] - 1) * d_w
        else:
            k_w = imshp[1] + 2 * pad_w - (topshp[1] - 1) * d_w
        return [(nkern, nchannels, k_h, k_w)]

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp[:2]
        height, width = inp[2:] if len(inp) > 2 else (None, None)
        weights, = out_
        return super(CorrMM_gradWeights, self).c_code_helper(
            bottom, weights, top, "backprop weights", sub, height, width)

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        weights, = grads
        d_bottom = CorrMM_gradInputs(self.border_mode, self.subsample)(
            weights, top, bottom.shape[-2:])
        d_top = CorrMM(self.border_mode, self.subsample)(bottom, weights)
        if len(inp) == 4:
            # The height/width inputs carry no gradient.
            return (d_bottom, d_top,
                    theano.gradient.DisconnectedType()(),
                    theano.gradient.DisconnectedType()())
        return (d_bottom, d_top)

    def connection_pattern(self, node):
        # height/width inputs (when present) are disconnected.
        if node.nin == 2:
            return [[1], [1]]
        return [[1], [1], [0], [0]]
class CorrMM_gradInputs(BaseCorrMM):
    """
    Gradient wrt. inputs for `CorrMM`.

    Notes
    -----
    Not intended for direct use; Theano's automatic differentiation
    and graph optimizations insert it where needed.
    """

    def __init__(self, border_mode="valid", subsample=(1, 1)):
        super(CorrMM_gradInputs, self).__init__(border_mode, subsample)

    def make_node(self, kern, topgrad, shape=None):
        kern = as_tensor_variable(kern)
        topgrad = as_tensor_variable(topgrad)
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        # With subsampling the input image shape cannot be inferred
        # from kern/topgrad, so it must be passed explicitly.
        needs_shape = self.subsample != (1, 1)
        if needs_shape and shape is None:
            raise ValueError('shape must be given if subsample != (1, 1)')
        height_width = [shape[0], shape[1]] if needs_shape else []
        # Output is the image gradient: (batchsize, num_channels, H, W).
        out_bcast = [topgrad.type.broadcastable[0],
                     kern.type.broadcastable[1],
                     False, False]
        out_type = TensorType(kern.type.dtype, out_bcast)
        return Apply(self, [kern, topgrad] + height_width, [out_type()])

    def infer_shape(self, node, input_shape):
        # Translate border_mode into padding markers (-1: half, -2: full).
        mode = self.border_mode
        if mode == "half":
            pad_h = pad_w = -1
        elif mode == "full":
            pad_h = pad_w = -2
        elif isinstance(mode, tuple):
            pad_h, pad_w = mode
        else:
            assert mode == "valid"
            pad_h = pad_w = 0
        d_h, d_w = self.subsample
        nchannels = input_shape[0][1]
        kshp = list(input_shape[0][2:])
        bsize = input_shape[1][0]
        topshp = list(input_shape[1][2:])
        height_width = node.inputs[-2:]
        # Resolve the markers using the (known) kernel shape.
        if pad_h == -1:
            pad_h = kshp[0] // 2
        elif pad_h == -2:
            pad_h = kshp[0] - 1
        elif pad_h < -2:
            raise ValueError('CorrMM_gradInputs: border_mode must be >= 0.')
        if pad_w == -1:
            pad_w = kshp[1] // 2
        elif pad_w == -2:
            pad_w = kshp[1] - 1
        elif pad_w < -2:
            raise ValueError('CorrMM_gradInputs: border_mode must be >= 0.')
        # bottom = (top - 1) * stride + kernel - 2*pad, unless given.
        if d_h != 1:
            out_r = height_width[0]
        else:
            out_r = (topshp[0] - 1) * d_h + kshp[0] - 2 * pad_h
        if d_w != 1:
            out_c = height_width[1]
        else:
            out_c = (topshp[1] - 1) * d_w + kshp[1] - 2 * pad_w
        return [(bsize, nchannels, out_r, out_c)]

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp[:2]
        height, width = inp[2:] if len(inp) > 2 else (None, None)
        bottom, = out_
        return super(CorrMM_gradInputs, self).c_code_helper(
            bottom, weights, top, "backprop inputs", sub, height, width)

    def grad(self, inp, grads):
        weights, top = inp[:2]
        bottom, = grads
        d_weights = CorrMM_gradWeights(self.border_mode, self.subsample)(
            bottom, top, weights.shape[-2:])
        d_top = CorrMM(self.border_mode, self.subsample)(bottom, weights)
        if len(inp) == 4:
            # The height/width inputs carry no gradient.
            return (d_weights, d_top,
                    theano.gradient.DisconnectedType()(),
                    theano.gradient.DisconnectedType()())
        return (d_weights, d_top)

    def connection_pattern(self, node):
        # height/width inputs (when present) are disconnected.
        if node.nin == 2:
            return [[1], [1]]
        return [[1], [1], [0], [0]]
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cpp)
// Loops for fast unfold + copy
//
// Unfolds the (channels, height, width) image data_im into the
// (channels * kernel_h * kernel_w) x (height_col * width_col) matrix
// data_col: one column per output position, with positions that fall
// into the zero-padding area filled with 0.
// This file is a Python template; occurrences of "(float_type)s" after
// a percent sign and doubled percent signs are placeholders resolved by
// BaseCorrMM.c_support_code_apply before compilation.
void im2col(const %(float_type)s* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    %(float_type)s* data_col) {
  // Spatial extent of the output feature map.
  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
  int channels_col = channels * kernel_h * kernel_w;
  for (int c = 0; c < channels_col; ++c) {
    // Decompose the row index into (channel, kernel row, kernel column).
    int w_offset = c %% kernel_w;
    int h_offset = (c / kernel_w) %% kernel_h;
    int c_im = c / kernel_h / kernel_w;
    for (int h = 0; h < height_col; ++h) {
      for (int w = 0; w < width_col; ++w) {
        int h_pad = h * stride_h - pad_h + h_offset;
        int w_pad = w * stride_w - pad_w + w_offset;
        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
          data_col[(c * height_col + h) * width_col + w] =
            data_im[(c_im * height + h_pad) * width + w_pad];
        else
          data_col[(c * height_col + h) * width_col + w] = 0.;
      }
    }
  }
}
// Unlike the Caffe and Theano GPU versions, the data_im array is set to zero
// before the col2im call rather than doing it here. So, the result is just
// accumulated into data_im.
//
// Inverse of im2col: folds the column matrix data_col back into the
// (channels, height, width) image data_im, summing contributions of
// overlapping patches. Entries that came from the padding area are
// simply skipped.
void col2im(const %(float_type)s* data_col, const int channels,
    const int height, const int width, const int patch_h, const int patch_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, %(float_type)s* data_im) {
  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
  int num_kernels = channels * height * width;
  int channels_col = channels * patch_h * patch_w;
  for (int c = 0; c < channels_col; ++c) {
    // Decompose the row index into (channel, patch row, patch column).
    int w_offset = c %% patch_w;
    int h_offset = (c / patch_w) %% patch_h;
    int c_im = c / patch_h / patch_w;
    for (int h = 0; h < height_col; ++h) {
      for (int w = 0; w < width_col; ++w) {
        int h_pad = h * stride_h - pad_h + h_offset;
        int w_pad = w * stride_w - pad_w + w_offset;
        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
          data_im[(c_im * height + h_pad) * width + w_pad] +=
            data_col[(c * height_col + h) * width_col + w];
      }
    }
  }
}
// Theano op code
// GPU version authors: Arjun Jain, Frederic Bastien, Jan Schlueter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// CPU version author: Jesse Livezey
// CPU version adapted from GPU version
//
// Runs one of three GEMM-based 2D correlation passes, selected by
// direction: 0 writes into top (forward), 1 accumulates into weight
// (gradient wrt. weights), 2 writes into bottom (gradient wrt. inputs).
// Returns the array that received the result, which is an alias of one
// of the three arguments (no new reference is created for it), or NULL
// with a Python exception set on error.
PyArrayObject* corrMM(PyArrayObject* bottom,
                      PyArrayObject* weight,
                      PyArrayObject* top,
                      const int direction,
                      const int dH = 1,
                      const int dW = 1,
                      const int padH = 0,
                      const int padW = 0)
{
    // Validate rank and dtype of all three arrays up front.
    if (PyArray_NDIM(bottom) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "CorrMM requires bottom of 4D");
        return NULL;
    }
    if (PyArray_TYPE(bottom) != %(float_typenum)s)
    {
        PyErr_SetString(PyExc_ValueError, "CorrMM received bottom with wrong type.");
        return NULL;
    }
    if (PyArray_NDIM(weight) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "CorrMM requires weight of 4D");
        return NULL;
    }
    if (PyArray_TYPE(weight) != %(float_typenum)s)
    {
        PyErr_SetString(PyExc_ValueError, "CorrMM received weight with wrong type.");
        return NULL;
    }
    if (PyArray_NDIM(top) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "CorrMM requires top of 4D");
        return NULL;
    }
    if (PyArray_TYPE(top) != %(float_typenum)s)
    {
        PyErr_SetString(PyExc_ValueError, "CorrMM received top with wrong type.");
        return NULL;
    }

    // Ensure data is contiguous
    // (each call returns a new reference, released at the end)
    bottom = PyArray_GETCONTIGUOUS(bottom);
    weight = PyArray_GETCONTIGUOUS(weight);
    top = PyArray_GETCONTIGUOUS(top);

    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
    const int batchSize = PyArray_DIMS(bottom)[0];
    const int nChannels = PyArray_DIMS(bottom)[1];
    const int bottomHeight = PyArray_DIMS(bottom)[2];
    const int bottomWidth = PyArray_DIMS(bottom)[3];
    // weights: (nFilters, nChannels, rows, columns)
    const int nFilters = PyArray_DIMS(weight)[0];
    const int kH = PyArray_DIMS(weight)[2];
    const int kW = PyArray_DIMS(weight)[3];
    if (nChannels != PyArray_DIMS(weight)[1]) {
        PyErr_SetString(PyExc_ValueError,
                "CorrMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // top: (batchSize, nFilters, topHeight, topWidth)
    const int topHeight = (bottomHeight + 2*padH - kH) / dH + 1;
    const int topWidth  = (bottomWidth + 2*padW - kW) / dW + 1;
    if (batchSize != PyArray_DIMS(top)[0] ||
            nFilters != PyArray_DIMS(top)[1] ||
            topHeight != PyArray_DIMS(top)[2] ||
            topWidth != PyArray_DIMS(top)[3]) {
        PyErr_Format(PyExc_ValueError,
                "CorrMM shape inconsistency:\n"
                "  bottom shape: %%d %%d %%d %%d\n"
                "  weight shape: %%d %%d %%d %%d\n"
                "  top shape: %%d %%d %%d %%d (expected %%d %%d %%d %%d)\n",
                batchSize, nChannels, bottomHeight, bottomWidth,
                nFilters, nChannels, kH, kW,
                PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
                PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
                batchSize, nFilters, topHeight, topWidth);
        return NULL;
    }

    // Create temporary columns
    // (the im2col workspace, shared by all three directions)
    npy_intp col_dim[2];
    col_dim[0] = (npy_intp)(nChannels * kW * kH);
    col_dim[1] = (npy_intp)(topHeight * topWidth);
    PyArrayObject* col = (PyArrayObject*)PyArray_EMPTY(2,
            col_dim,
            PyArray_TYPE(top),
            0);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                "CorrMM failed to allocate working memory of %%d x %%d\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }

    // Define some useful variables
    // Per-sample strides in elements, not bytes.
    // NOTE(review): n_bytes is substituted as a floating-point literal,
    // so this divides as a double and truncates back to int; looks
    // intentional but worth confirming.
    const int bottom_stride = PyArray_STRIDES(bottom)[0]/%(n_bytes)f;
    const int top_stride = PyArray_STRIDES(top)[0]/%(n_bytes)f;
    // GEMM dimensions: K_ rows of col, N_ output positions, M_ filters.
    const int K_ = col_dim[0];
    const int N_ = col_dim[1];
    const int M_ = nFilters;
    const %(c_float_type)s one = 1.0;
    const %(c_float_type)s zero = 0.0;
    char NTrans = 'N';
    char Trans = 'T';
    PyArrayObject *output;

    if (direction == 0) {  // forward pass
        output = top;
        // valid correlation: im2col, then gemm
        // Iterate over batch
        for (int n = 0; n < batchSize; n++) {
            // First, im2col
            im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight,
                   bottomWidth, kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
            // Second, gemm
            %(gemm)s(&NTrans, &NTrans,
                   &N_, &M_, &K_,
                   &one,
                   (%(float_type)s*)PyArray_DATA(col), &N_,
                   (%(float_type)s*)PyArray_DATA(weight), &K_,
                   &zero,
                   (%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_);
        }
        /*
        // Original caffe code for comparison
        // Note that this code was translated from the Theano GPU code,
        // not the Caffe CPU code.
        // https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
        // Note that this is for grouped convolution; we can ignore groups here,
        // but the group-related offsets help explain what M_, N_ and K_ are
        int weight_offset = M_ * K_;
        int col_offset = K_ * N_;
        int top_offset = M_ * N_;
        for (int n = 0; n < num_; ++n) {
          // First, im2col
          im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
              width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
              col_data);
          // Second, innerproduct with groups
          for (int g = 0; g < group_; ++g) {
            caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
              (Dtype)1., weight + weight_offset * g, col_data + col_offset * g,
              (Dtype)0., top_data + (*top)[i]->offset(n) + top_offset * g);
            == (see https://github.com/BVLC/caffe/blob/master/src/caffe/util/math_functions.cu#L16)
            cublasSgemm(CUBLAS_OP_N, CUBLAS_OP_N,
              N_, M_, K_,
              1.,
              col_data + col_offset * g, N_,
              weight + weight_offset * g, K_,
              0.,
              top_data + (*top)[i]->offset(n) + top_offset * g, N_);
          }
        }
        */
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        // valid convolution: im2col, then gemm
        // Iterate over batch
        for (int n = 0; n < batchSize; n++) {
            // First, im2col
            im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight,
                   bottomWidth, kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            %(gemm)s(&Trans, &NTrans,
                   &K_, &M_, &N_,
                   &one,
                   (%(float_type)s*)PyArray_DATA(col), &N_,
                   (%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_,
                   (n == 0) ? &zero : &one,
                   (%(float_type)s*)PyArray_DATA(weight), &K_);
        }
        /*
        // Original caffe code for comparison
        // Note that this code was translated from the Theano GPU code,
        // not the Caffe CPU code.
        // https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
        // Note that this is for grouped convolution; we can ignore groups
        for (int n = 0; n < num_; ++n) {
          // Since we saved memory in the forward pass by not storing all col
          // data, we will need to recompute them.
          im2col_gpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
                     width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
                     stride_h_, stride_w_, col_data);
          // gradient w.r.t. weight. Note that we will accumulate diffs.
          for (int g = 0; g < group_; ++g) {
            caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
                (Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
                col_data + col_offset * g, (Dtype)1.,
                weight_diff + weight_offset * g);
            == (see https://github.com/BVLC/caffe/blob/master/src/caffe/util/math_functions.cu#L16)
            cublasSgemm(CUBLAS_OP_T, CUBLAS_OP_N, K_, M_, N_,
                1.0,
                col_data + col_offset * g, N_,
                top_diff + top[i]->offset(n) + top_offset * g, N_,
                1.0,
                weight_diff + weight_offset * g, K_);
          }
        }
        */
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        // bottom is set to zero here rather than inside of col2im
        PyArray_FILLWBYTE(bottom, 0);
        // full convolution: gemm, then col2im
        // Iterate over batch
        for (int n = 0; n < batchSize; n++) {
            // gemm into columns
            %(gemm)s(&NTrans, &Trans,
                   &N_, &K_, &M_,
                   &one,
                   (%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_,
                   (%(float_type)s*)PyArray_DATA(weight), &K_,
                   &zero,
                   (%(float_type)s*)PyArray_DATA(col), &N_);
            // col2im back to the data
            col2im((%(float_type)s*)PyArray_DATA(col), nChannels, bottomHeight, bottomWidth,
                   kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride);
        }
        /*
        // Original caffe code for comparison
        // Note that this code was translated from the Theano GPU code,
        // not the Caffe CPU code.
        // https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
        for (int n = 0; n < num_; ++n) {
          // gradient w.r.t. bottom data, if necessary
          if (propagate_down[i]) {
            for (int g = 0; g < group_; ++g) {
              caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
                  (Dtype)1., weight + weight_offset * g,
                  top_diff + top[i]->offset(n) + top_offset * g,
                  (Dtype)0., col_diff + col_offset * g);
              == (see https://github.com/BVLC/caffe/blob/master/src/caffe/util/math_functions.cu#L16)
              cublasSgemm(CUBLAS_OP_N, CUBLAS_OP_T, N_, K_, M_,
                  1.,
                  top_diff + top[i]->offset(n) + top_offset * g, N_,
                  weight + weight_offset * g, K_,
                  0.,
                  col_diff + col_offset * g, N_);
            }
            // col2im back to the data
            col2im_gpu(col_diff, channels_, height_, width_,
                kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
                bottom_diff + (*bottom)[i]->offset(n));
          }
        }
        */
    }

    // Free temporary columns
    Py_DECREF(col);
    // decref from contiguous check
    Py_DECREF(bottom);
    Py_DECREF(weight);
    Py_DECREF(top);

    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseCorrMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
from nose.plugins.skip import SkipTest
from nose.plugins.attrib import attr
import numpy
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr, conv
from theano.tensor.basic import _allclose
class TestCorr2D(utt.InferShapeTester):
    """
    Tests for the CPU CorrMM op family (theano.tensor.nnet.corr).

    Each functional test compares the output of ``corr.CorrMM`` against a
    pure-NumPy reference correlation computed in ``validate``; the
    ``test_infer_shape_*`` methods check shape inference for the forward op
    and both gradient ops.
    """
    # CorrMM is a C-only op: even when the suite runs under FAST_COMPILE,
    # compile these graphs with FAST_RUN so the C code is actually used.
    if theano.config.mode == "FAST_COMPILE":
        mode = theano.compile.get_mode("FAST_RUN")
    else:
        mode = None
    # All test data is created in the configured floatX dtype.
    dtype = theano.config.floatX

    def setUp(self):
        """Create the symbolic 4D input/filter variables shared by all tests."""
        super(TestCorr2D, self).setUp()
        self.input = T.tensor4('input', dtype=self.dtype)
        self.input.name = 'default_V'
        self.filters = T.tensor4('filters', dtype=self.dtype)
        self.filters.name = 'default_filters'
        # NOTE(review): the reference implementation in validate() is pure
        # NumPy, so the SciPy half of this condition looks vestigial; the
        # C++ compiler is the real requirement for CorrMM — confirm.
        if not conv.imported_scipy_signal and theano.config.cxx == "":
            raise SkipTest("CorrMM tests need SciPy or a c++ compiler")

    def validate(self, image_shape, filter_shape,
                 border_mode='valid', subsample=(1, 1),
                 input=None, filters=None,
                 verify_grad=True, non_contiguous=False):
        """
        Compare CorrMM against a NumPy reference correlation.

        :param image_shape: The constant shape info passed to corrMM.
        :param filter_shape: The constant shape info passed to corrMM.
        :param border_mode: 'valid', 'full', 'half', an int, or an (h, w)
            tuple of paddings, forwarded to CorrMM.
        :param subsample: (row, col) stride pair forwarded to CorrMM.
        :param input: optional symbolic input to use instead of self.input.
        :param filters: optional symbolic filters instead of self.filters.
        :param verify_grad: if True, also run utt.verify_grad on the op.
        :param non_contiguous: if True, feed non-C-contiguous arrays.
        """
        N_image_shape = [T.get_scalar_constant_value(T.as_tensor_variable(x))
                         for x in image_shape]
        N_filter_shape = [T.get_scalar_constant_value(T.as_tensor_variable(x))
                          for x in filter_shape]
        if input is None:
            input = self.input
        if filters is None:
            filters = self.filters

        # THEANO IMPLEMENTATION
        # we create a symbolic function so that verify_grad can work
        def sym_CorrMM(input, filters):
            # define theano graph and function
            input.name = 'input'
            filters.name = 'filters'
            rval = corr.CorrMM(border_mode, subsample)(input, filters)
            rval.name = 'corr_output'
            return rval

        output = sym_CorrMM(input, filters)
        output.name = 'CorrMM()(%s,%s)' % (input.name, filters.name)
        theano_corr = theano.function([input, filters], output, mode=self.mode)

        # initialize input and compute result
        image_data = numpy.random.random(N_image_shape).astype(self.dtype)
        filter_data = numpy.random.random(N_filter_shape).astype(self.dtype)
        if non_contiguous:
            # transpose -> copy -> transpose back produces arrays with the
            # original shape but non-C-contiguous memory layout
            image_data = numpy.transpose(image_data, axes=(0, 1, 3, 2))
            image_data = image_data.copy()
            image_data = numpy.transpose(image_data, axes=(0, 1, 3, 2))
            filter_data = numpy.transpose(filter_data, axes=(0, 1, 3, 2))
            filter_data = filter_data.copy()
            filter_data = numpy.transpose(filter_data, axes=(0, 1, 3, 2))
            assert not image_data.flags['CONTIGUOUS']
            assert not filter_data.flags['CONTIGUOUS']

        theano_output = theano_corr(image_data, filter_data)

        # REFERENCE IMPLEMENTATION
        # Testing correlation, not convolution. Reverse filters.
        filter_data_corr = numpy.array(filter_data[:, :, ::-1, ::-1],
                                       copy=True,
                                       order='C')
        # keep the unpadded image around for the gradient check below
        orig_image_data = image_data
        img_shape2d = numpy.array(N_image_shape[-2:])
        fil_shape2d = numpy.array(N_filter_shape[-2:])
        subsample2d = numpy.array(subsample)
        if border_mode == 'full':
            padHW = (fil_shape2d - 1)
        elif border_mode == 'valid':
            padHW = numpy.array([0, 0])
        elif border_mode == 'half':
            # NOTE(review): numpy.floor returns a float array here; the
            # slice indices built from padHW below rely on old NumPy
            # accepting non-integer indices — confirm on the target NumPy.
            padHW = numpy.floor(fil_shape2d / 2)
        elif isinstance(border_mode, tuple):
            padHW = numpy.array(border_mode)
        elif isinstance(border_mode, int):
            padHW = numpy.array([border_mode, border_mode])
        else:
            raise NotImplementedError('Unsupported border_mode {}'.format(border_mode))
        # standard output-size formula: floor((in + 2*pad - filter)/stride) + 1
        out_shape2d = numpy.floor((img_shape2d + 2 * (padHW) - fil_shape2d) / subsample2d) + 1
        # out_shape is (batch, output channels, out rows, out cols)
        out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape2d)
        ref_output = numpy.zeros(out_shape)

        # loop over output feature maps
        ref_output.fill(0)
        # zero-pad the image so every border_mode reduces to a 'valid' loop
        image_data2 = numpy.zeros((N_image_shape[0], N_image_shape[1],
                                   N_image_shape[2] + 2 * padHW[0],
                                   N_image_shape[3] + 2 * padHW[1]))
        image_data2[:, :, padHW[0]:padHW[0] + N_image_shape[2],
                    padHW[1]:padHW[1] + N_image_shape[3]] = image_data
        image_data = image_data2
        N_image_shape = image_data.shape
        for bb in range(N_image_shape[0]):
            for nn in range(N_filter_shape[0]):
                for im0 in range(N_image_shape[1]):
                    filter2d = filter_data_corr[nn, im0, :, :]
                    image2d = image_data[bb, im0, :, :]
                    for row in range(ref_output.shape[2]):
                        irow = row * subsample[0]  # image row
                        for col in range(ref_output.shape[3]):
                            icol = col * subsample[1]  # image col
                            # filter2d was flipped once above, so flipping it
                            # back here yields a correlation with filter_data
                            ref_output[bb, nn, row, col] += (image2d[
                                irow:irow + N_filter_shape[2],
                                icol:icol + N_filter_shape[3]] * filter2d[::-1, ::-1]
                            ).sum()

        self.assertTrue(_allclose(theano_output, ref_output))

        # TEST GRADIENT
        if verify_grad:
            utt.verify_grad(sym_CorrMM, [orig_image_data, filter_data])

    @attr('slow')
    def test_basic(self):
        """
        Tests that basic correlations work for odd and even
        dimensions of image and filter shapes, as well as rectangular
        images and filters.
        """
        border_modes = ['valid', 'full', 'half', (1, 1), (2, 1), (1, 2),
                        (3, 3), 1]
        img_shapes = [(2, 2, 3, 3), (3, 2, 8, 8), (3, 2, 7, 5), (3, 2, 7, 5),
                      (3, 2, 8, 8), (3, 2, 7, 5)]
        fil_shapes = [(2, 2, 2, 2), (4, 2, 5, 5), (5, 2, 2, 3), (5, 2, 3, 2),
                      (4, 2, 5, 5), (5, 2, 2, 3)]
        for border_mode in border_modes:
            for img, fil in zip(img_shapes, fil_shapes):
                self.validate(img, fil, border_mode, verify_grad=False)
        # Very slow on with 'full' or 'half'
        self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid', verify_grad=False)

    def test_img_kernel_same_shape(self):
        """Correlation where the kernel is exactly as large as the image."""
        self.validate((3, 2, 3, 3), (4, 2, 3, 3), 'full')
        self.validate((3, 2, 3, 3), (4, 2, 3, 3), 'valid')
        self.validate((3, 2, 3, 3), (4, 2, 3, 3), 'half')
        self.validate((3, 2, 3, 3), (4, 2, 3, 3), (1, 1))
        self.validate((3, 2, 3, 3), (4, 2, 3, 3), 1)

    @attr('slow')
    def test_subsample(self):
        """
        Tests correlation where subsampling != (1,1)
        """
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'valid', subsample=(2, 2))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'valid', subsample=(2, 1))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), 'valid', subsample=(3, 3))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'full', subsample=(2, 2))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'full', subsample=(2, 1))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), 'full', subsample=(3, 3))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'half', subsample=(2, 2))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'half', subsample=(2, 1))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), 'half', subsample=(3, 3))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (1, 1), subsample=(2, 2))
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (2, 1), subsample=(2, 1))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), (1, 2), subsample=(3, 3))
        self.validate((1, 1, 6, 6), (1, 1, 3, 3), 1, subsample=(3, 3))

    @attr('slow')
    def test_shape_Constant_tensor(self):
        """
        Tests correlation where the {image,filter}_shape is a Constant tensor.
        """
        as_t = T.as_tensor_variable
        border_modes = ['valid', 'full', 'half', (1, 1), (2, 1), (1, 2), (3, 3), 1]
        for border_mode in border_modes:
            self.validate((as_t(3), as_t(2), as_t(7), as_t(5)),
                          (5, 2, 2, 3), border_mode)
            self.validate(as_t([3, 2, 7, 5]), (5, 2, 2, 3), border_mode)
            self.validate(as_t((3, 2, 7, 5)), (5, 2, 2, 3), border_mode)
            self.validate((3, 2, 7, 5), (as_t(5), as_t(2), as_t(2),
                          as_t(3)), 'valid')
            self.validate((3, 2, 7, 5), as_t([5, 2, 2, 3]), border_mode)
            self.validate(as_t([3, 2, 7, 5]), as_t([5, 2, 2, 3]), border_mode)

    def test_invalid_filter_shape(self):
        """
        Tests scenario where filter_shape[1] != input_shape[1]
        """
        self.assertRaises(ValueError, self.validate,
                          (3, 2, 8, 8), (4, 3, 5, 5),
                          'valid')

    def test_full_mode(self):
        """
        Tests basic correlation in full mode and case where filter
        is larger than the input image.
        """
        self.validate((3, 2, 5, 5), (4, 2, 8, 8), 'full')

        def f():
            # a filter larger than the image is invalid in 'valid' mode
            self.validate((3, 2, 5, 5), (4, 2, 8, 8), 'valid')
        self.assertRaises(Exception, f)

    def test_wrong_input(self):
        """
        Make sure errors are raised when image and kernel are not 4D tensors
        """
        self.assertRaises(Exception, self.validate, (3, 2, 8, 8), (4, 2, 5, 5),
                          'valid', input=T.dmatrix())
        self.assertRaises(Exception, self.validate, (3, 2, 8, 8), (4, 2, 5, 5),
                          'valid', filters=T.dvector())
        self.assertRaises(Exception, self.validate, (3, 2, 8, 8), (4, 2, 5, 5),
                          'valid', input=T.dtensor3())

    @attr('slow')
    def test_infer_shape_forward(self):
        """Check shape inference of the forward CorrMM op."""
        def rand(*shape):
            # uniform values in [-1, 1)
            r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
            return r * 2 - 1
        corrMM = corr.CorrMM
        adtens = T.dtensor4()
        bdtens = T.dtensor4()
        aivec_vals = [[4, 5, 6, 3], [6, 2, 8, 3], [3, 6, 7, 5],
                      [3, 6, 7, 5], [5, 2, 4, 3]]
        bivec_vals = [[7, 5, 3, 2], [4, 2, 5, 3], [5, 6, 3, 2],
                      [5, 6, 2, 3], [6, 2, 4, 3]]
        modes = ['valid', 'full', 'half', (1, 1), (2, 1), (1, 2), 1]
        subsamples = [(1, 1), (2, 1), (1, 2)]
        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # CorrMM
                    cdtens = corrMM(border_mode=mode, subsample=subsample)(adtens, bdtens)
                    self._compile_and_check([adtens, bdtens],
                                            [cdtens],
                                            [adtens_val, bdtens_val], corrMM,
                                            warn=False)

    @attr('slow')
    def test_infer_shape_gradW(self):
        """Check shape inference of CorrMM_gradWeights."""
        def rand(*shape):
            # uniform values in [-1, 1)
            r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
            return r * 2 - 1
        corrMM = corr.CorrMM
        gradW = corr.CorrMM_gradWeights
        adtens = T.dtensor4()
        bdtens = T.dtensor4()
        aivec_vals = [[1, 5, 6, 3], [8, 2, 7, 3], [1, 6, 9, 4],
                      [9, 6, 8, 5], [9, 1, 6, 8]]
        bivec_vals = [[7, 5, 3, 1], [4, 2, 5, 3], [12, 6, 3, 2],
                      [5, 6, 1, 3], [11, 1, 3, 3]]
        modes = ['valid', 'full', 'half', (1, 1), (2, 1), (1, 2), 1]
        subsamples = [(1, 1), (2, 1), (1, 2)]
        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # CorrMM: run the forward op to get a consistent top value
                    cdtens = corrMM(border_mode=mode, subsample=subsample)(adtens, bdtens)
                    f = theano.function([adtens, bdtens], cdtens)
                    cdtens_val = f(adtens_val, bdtens_val)
                    # CorrMM_gradWeights needs the kernel height/width as
                    # an explicit shape argument
                    shape = (theano.shared(bivec_val[2]), theano.shared(bivec_val[3]))
                    bdtens_g = gradW(border_mode=mode,
                                     subsample=subsample)(adtens, cdtens, shape=shape)
                    self._compile_and_check([adtens, cdtens],
                                            [bdtens_g],
                                            [adtens_val, cdtens_val], gradW,
                                            warn=False)

    @attr('slow')
    def test_infer_shape_gradI(self):
        """Check shape inference of CorrMM_gradInputs."""
        def rand(*shape):
            # uniform values in [-1, 1)
            r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
            return r * 2 - 1
        corrMM = corr.CorrMM
        gradI = corr.CorrMM_gradInputs
        adtens = T.dtensor4()
        bdtens = T.dtensor4()
        aivec_vals = [[1, 5, 6, 3], [8, 2, 7, 3], [1, 6, 9, 4],
                      [9, 6, 8, 5], [9, 1, 6, 8]]
        bivec_vals = [[7, 5, 3, 1], [4, 2, 5, 3], [12, 6, 3, 2],
                      [5, 6, 1, 3], [7, 1, 3, 4]]
        modes = ['valid', 'full', 'half', (1, 1), (2, 1), (1, 2), 1]
        subsamples = [(1, 1), (2, 1), (1, 2)]
        for aivec_val, bivec_val in zip(aivec_vals, bivec_vals):
            adtens_val = rand(*aivec_val)
            bdtens_val = rand(*bivec_val)
            for mode in modes:
                for subsample in subsamples:
                    # CorrMM: run the forward op to get a consistent top value
                    cdtens = corrMM(border_mode=mode, subsample=subsample)(adtens, bdtens)
                    f = theano.function([adtens, bdtens], cdtens)
                    cdtens_val = f(adtens_val, bdtens_val)
                    # CorrMM_gradInputs needs the input height/width as
                    # an explicit shape argument
                    shape = (theano.shared(aivec_val[2]), theano.shared(aivec_val[3]))
                    adtens_g = gradI(border_mode=mode,
                                     subsample=subsample)(bdtens, cdtens, shape=shape)
                    self._compile_and_check([bdtens, cdtens],
                                            [adtens_g],
                                            [bdtens_val, cdtens_val], gradI,
                                            warn=False)

    def test_non_contiguous(self):
        """Exercise the op's handling of non-C-contiguous inputs."""
        self.validate((2, 2, 3, 3), (2, 2, 2, 2), 'valid', non_contiguous=True)
        self.validate((3, 2, 8, 8), (4, 2, 5, 5), 'valid', non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'valid', non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 3, 2), 'valid', non_contiguous=True)
        self.validate((3, 2, 8, 8), (4, 2, 5, 5), 'full', non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'full', non_contiguous=True)
        self.validate((3, 2, 8, 8), (4, 2, 5, 5), 'half', non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'half', non_contiguous=True)
        self.validate((3, 2, 8, 8), (4, 2, 5, 5), (1, 1), non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (1, 2), non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (2, 1), non_contiguous=True)
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 2, non_contiguous=True)
if __name__ == '__main__':
    # Run the shape-inference tests when executed as a script.
    # Bug fix: the original called t.test_infer_shape(), a method that does
    # not exist on TestCorr2D (it defines test_infer_shape_forward,
    # test_infer_shape_gradW and test_infer_shape_gradI), so running this
    # file directly raised AttributeError.
    t = TestCorr2D('setUp')
    t.setUp()
    t.test_infer_shape_forward()
    t.test_infer_shape_gradW()
    t.test_infer_shape_gradI()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论