Commit 691be8f5 authored by Gijs van Tulder

Add GpuCorrMM and GpuCorr3dMM to gpuarray backend.

Parent 146ef971
from __future__ import absolute_import, print_function, division
import os.path
from six import integer_types
import theano
from theano import Apply, config, Op
from theano.compile import optdb
from theano.gof import LocalOptGroup
from theano.tensor.basic import as_tensor_variable
from theano.tensor.opt import in2out
from .basic_ops import (GpuArrayType, CGpuKernelBase,
                        as_gpuarray_variable, gpu_contiguous,
                        infer_context_name)
from .opt_util import inplace_allocempty
try:
gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
    """
    Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
    `GpuCorrMM_gradInputs`. Cannot be used directly.

    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size could be directly specified by an integer
        or a pair of integers
    subsample
        Perform subsampling of the output (default: (1, 1)).
    filter_dilation
        Perform subsampling of the input, also known as dilation (default: (1, 1)).

    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_dilation')

    def __init__(self, border_mode="valid", subsample=(1, 1),
                 filter_dilation=(1, 1)):
        # A single integer means the same symmetric padding on both axes.
        if isinstance(border_mode, integer_types):
            border_mode = (border_mode, border_mode)
        if isinstance(border_mode, tuple):
            pad_h, pad_w = map(int, border_mode)
            border_mode = (pad_h, pad_w)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full', 'half')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer or a pair of'
                ' integers'.format(border_mode))
        self.border_mode = border_mode
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        if len(filter_dilation) != 2:
            raise ValueError("filter_dilation must have two elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        # The GEMM-based correlation kernel lives in this support-code file.
        CGpuKernelBase.__init__(self, ['corr_gemm.c'])

    @property
    def pad(self):
        # Explicit padding pair as given, or (0, 0) for 'valid' mode.
        if self.border_mode != 'valid':
            return self.border_mode
        return (0, 0)

    def __str__(self):
        return '%s{%s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            str(self.filter_dilation))

    def flops(self, inp, outp):
        """
        Useful with the hack in profilemode to print the MFlops.
        """
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp
        outputs, = outp
        assert inputs[1] == filters[1]
        # nb mul and add by output pixel
        flops = filters[2] * filters[3] * 2
        # nb flops by output image
        flops *= outputs[2] * outputs[3]
        # nb patch multiplied
        flops *= inputs[1] * filters[0] * inputs[0]
        return flops

    def get_params(self, node):
        # Run on the GPU context of the first input.
        return node.inputs[0].type.context

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (0, 1)

    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
        """
        This generates the C code for GpuCorrMM (direction="forward"),
        GpuCorrMM_gradWeights (direction="backprop weights"), and
        GpuCorrMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        Parameters
        ----------
        bottom
            Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        weights
            Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        top
            Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        direction : {'forward', 'backprop weights', 'backprop inputs'}
            "forward" to correlate bottom with weights and store results in top,
            "backprop weights" to do a valid convolution of bottom with top
            (swapping the first two dimensions) and store results in weights,
            and "backprop inputs" to do a full convolution of top with weights
            (swapping the first two dimensions) and store results in bottom.
        sub
            Dictionary of substitutions useable to help generating the C code.
        height
            If self.subsample[0] != 1, a variable giving the height of the
            filters for direction="backprop weights" or the height of the input
            images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the height of the
            filters for direction="backprop weights".
            Ignored otherwise.
        width
            If self.subsample[1] != 1, a variable giving the width of the
            filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the width of the
            filters for direction="backprop weights".
            Ignored otherwise.
        """
        dH, dW = self.subsample
        dilH, dilW = self.filter_dilation
        # Encode the border mode as sentinels understood by the C code:
        # -1 = half padding, -2 = full padding, >= 0 = explicit padding.
        if self.border_mode == "half":
            padH = padW = -1
        elif self.border_mode == "full":
            padH = padW = -2
        elif isinstance(self.border_mode, tuple):
            padH, padW = self.border_mode
        else:
            assert self.border_mode == "valid"
            padH = padW = 0
        if direction == "forward":
            direction = 0
            out = top
        elif direction == "backprop weights":
            direction = 1
            out = weights
        elif direction == "backprop inputs":
            direction = 2
            out = bottom
        else:
            raise ValueError("direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")
        # When subsampling, we cannot unambiguously infer the height and width
        # of bottom and weights from top, so we require them to be given.
        # Similarly, when pad="half", we cannot infer the weight size.
        if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
            if not height:
                raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
            height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
        else:
            height = '0'
        if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
            if not width:
                raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
            width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
        else:
            width = '0'
        sync = ""
        if config.gpuarray.sync:
            # BUG FIX: the template below references %(fail)s, but `fail` was
            # not a local variable, so interpolating with `% locals()` raised
            # a KeyError whenever config.gpuarray.sync was enabled. Bind it
            # from `sub` first; the later sub.update(locals()) writes the
            # same value back, so nothing else changes.
            fail = sub['fail']
            sync = """
            int err = GpuArray_sync(&%(out)s->ga);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "BaseGpuCorrMM error: gpuarray sync failed.");
                %(fail)s;
            }
            """ % locals()
        sub = sub.copy()
        sub.update(locals())

        return """
    // Mandatory args
    int direction = %(direction)s; // forward, bprop weights, bprop inputs

    // Optional args
    size_t dH = %(dH)s;
    size_t dW = %(dW)s;
    size_t dilH = %(dilH)s;
    size_t dilW = %(dilW)s;
    int padH = %(padH)s;
    int padW = %(padW)s;

    PyGpuArrayObject * bottom = %(bottom)s;
    PyGpuArrayObject * weights = %(weights)s;
    PyGpuArrayObject * top = %(top)s;
    PyGpuArrayObject * out2 = NULL;

    // Obtain or infer kernel width and height
    // (we need to know it early to be able to handle auto-padding)
    size_t kH, kW;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
        kH = PyGpuArray_DIMS(weights)[2];
        kW = PyGpuArray_DIMS(weights)[3];
    }
    else {
        if ((dH != 1) || (padH == -1)) {
            // vertical subsampling or half padding, kernel height is specified
            kH = %(height)s;
        }
        else if (padH == -2) {
            // vertical full padding, we can infer the kernel height
            kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
        }
        else {
            // explicit padding, we can infer the kernel height
            kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
        }
        if ((dW != 1) || (padW == -1)) {
            kW = %(width)s;
        }
        else if (padW == -2) {
            kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        else {
            kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
    }

    // Implicit dilated kernel size
    size_t dil_kH = (kH - 1) * dilH + 1;
    size_t dil_kW = (kW - 1) * dilW + 1;

    // Auto-padding if requested
    if (padH == -1) { // vertical half padding
        padH = dil_kH / 2;
    }
    else if (padH == -2) { // vertical full padding
        padH = dil_kH - 1;
    }
    else if (padH < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2");
        %(fail)s
    }
    if (padW == -1) { // horizontal half padding
        padW = dil_kW / 2;
    }
    else if (padW == -2) { // horizontal full padding
        padW = dil_kW - 1;
    }
    else if (padW < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2");
        %(fail)s
    }

    // Infer output shape and type
    size_t out_dim[4];
    int out_typecode;
    PyGpuContextObject *out_context;
    switch(direction) {
    case 0: // forward pass
        // output is top: (batchsize, num_filters, height, width)
        // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = PyGpuArray_DIMS(bottom)[0];
        out_dim[1] = PyGpuArray_DIMS(weights)[0];
        out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
        out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
        out_typecode = bottom->ga.typecode;
        out_context = bottom->context;
        break;
    case 1: // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width)
        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = PyGpuArray_DIMS(top)[1];
        out_dim[1] = PyGpuArray_DIMS(bottom)[1];
        out_dim[2] = kH; // already inferred further above
        out_dim[3] = kW; // how convenient
        out_typecode = top->ga.typecode;
        out_context = top->context;
        break;
    case 2: // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = PyGpuArray_DIMS(top)[0];
        out_dim[1] = PyGpuArray_DIMS(weights)[1];
        out_dim[2] = (dH != 1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
        out_dim[3] = (dW != 1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
        out_typecode = top->ga.typecode;
        out_context = top->context;
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
        %(fail)s
    }

    // Prepare output array
    if (theano_prep_output(&%(out)s, 4, out_dim, out_typecode, GA_C_ORDER, out_context) != 0)
    {
        PyErr_Format(PyExc_RuntimeError,
                     "BaseGpuCorrMM: Failed to allocate output of %%ld x %%ld x %%ld x %%ld",
                     out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
        %(fail)s
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
        PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
        %(fail)s
    }

    // Call GPU code
    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW);
    if (out2==NULL){
        %(fail)s
    }
    assert (out2 == %(out)s);
    %(sync)s
""" % sub
class GpuCorrMM(BaseGpuCorrMM):
    """
    GPU correlation implementation using Matrix Multiplication.

    Parameters
    ----------
    border_mode
        Implicit zero-padding of the input. Either a pair of integers
        giving the number of rows and columns padded on each side, a
        single integer used for all sides, or one of the shortcuts
        ``'valid'`` (``(0, 0)``, no padding), ``'full'``
        (``(kernel_rows - 1, kernel_columns - 1)``) or ``'half'``
        (``(kernel_rows // 2, kernel_columns // 2)``, a "same"
        convolution for odd-sized kernels). Each width is applied on
        both sides of its dimension.
    subsample
        Pair of output strides. ``(sv, sh)`` is equivalent to slicing
        the full output with ``[:, :, ::sv, ::sh]``, but faster.
        Use ``(1, 1)`` to disable subsampling.
    filter_dilation
        Pair of filter dilation factors applied to each input image.
        Use ``(1, 1)`` to disable dilation.

    Notes
    -----
    The op requires inputs, filters and outputs to be C-contiguous;
    wrap arguments with :func:`gpu_contiguous
    <theano.gpuarray.basic_ops.gpu_contiguous>` if needed.

    Either enable the Theano flag `optimizer_including=conv_gemm` to
    replace convolutions with `GpuCorrMM` automatically, or call
    `GpuCorrMM(subsample=...)(image, filters)` directly as a
    replacement for :func:`conv2d <theano.tensor.nnet.conv.conv2d>`.
    Note that it computes a correlation -- for a true convolution,
    flip the filters as `filters[:,:,::-1,::-1]`.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 filter_dilation=(1, 1)):
        super(GpuCorrMM, self).__init__(border_mode, subsample,
                                        filter_dilation)

    def make_node(self, img, kern):
        # Move both inputs to a common GPU context, then validate rank.
        ctx_name = infer_context_name(img, kern)
        img = as_gpuarray_variable(img, ctx_name)
        kern = as_gpuarray_variable(kern, ctx_name)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        # Output: (batch, num_filters, rows, cols); only the first two
        # axes can inherit broadcastability from the inputs.
        out_bcast = [img.type.broadcastable[0],
                     kern.type.broadcastable[0],
                     False, False]
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [img, kern], [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        top, = out_
        # Forward pass: correlate bottom with weights, write into top.
        return super(GpuCorrMM, self).c_code_helper(
            bottom, weights, top, "forward", sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
        top = gpu_contiguous(top)
        grad_inputs_op = GpuCorrMM_gradInputs(self.border_mode,
                                              self.subsample,
                                              self.filter_dilation)
        grad_weights_op = GpuCorrMM_gradWeights(self.border_mode,
                                                self.subsample,
                                                self.filter_dilation)
        d_bottom = grad_inputs_op(weights, top, bottom.shape[-2:])
        d_weights = grad_weights_op(bottom, top, weights.shape[-2:])
        return d_bottom, d_weights
class GpuCorrMM_gradWeights(BaseGpuCorrMM):
    """
    Gradient wrt. filters for `GpuCorrMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Theano's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 filter_dilation=(1, 1)):
        super(GpuCorrMM_gradWeights, self).__init__(border_mode,
                                                    subsample,
                                                    filter_dilation)

    def make_node(self, img, topgrad, shape=None):
        ctx_name = infer_context_name(img, topgrad)
        img = as_gpuarray_variable(img, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        # The filter shape cannot be inferred from img/topgrad when
        # subsampling or using half padding, so it must be supplied as
        # two scalar inputs.
        shape_required = (self.subsample != (1, 1) or
                          self.border_mode == "half")
        if shape_required:
            if shape is None:
                raise ValueError('shape must be given if subsample != (1, 1)'
                                 ' or border_mode == "half"')
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            height_width = [shape[0], shape[1]]
        else:
            height_width = []
        # Output: (num_filters, num_channels, rows, cols).
        out_bcast = [topgrad.type.broadcastable[1],
                     img.type.broadcastable[1],
                     False, False]
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [img, topgrad] + height_width, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp[:2]
        height, width = inp[2:] or (None, None)
        weights, = out_
        return super(GpuCorrMM_gradWeights, self).c_code_helper(
            bottom, weights, top, "backprop weights", sub, height, width)

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        weights, = grads
        weights = gpu_contiguous(weights)
        d_bottom = GpuCorrMM_gradInputs(self.border_mode,
                                        self.subsample,
                                        self.filter_dilation)(
            weights, top, bottom.shape[-2:])
        d_top = GpuCorrMM(self.border_mode,
                          self.subsample,
                          self.filter_dilation)(bottom, weights)
        if len(inp) == 4:
            # The height/width scalar inputs carry no gradient.
            d_height_width = (theano.gradient.DisconnectedType()(),) * 2
        else:
            d_height_width = ()
        return (d_bottom, d_top) + d_height_width

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        # no connection to height, width
        return [[1], [1], [0], [0]]
class GpuCorrMM_gradInputs(BaseGpuCorrMM):
    """
    Gradient wrt. inputs for `GpuCorrMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Theano's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 filter_dilation=(1, 1)):
        super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
                                                   filter_dilation)

    def make_node(self, kern, topgrad, shape=None):
        ctx_name = infer_context_name(kern, topgrad)
        kern = as_gpuarray_variable(kern, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        # The input image shape is ambiguous only when subsampling
        # (the kernel is an input here, so half padding needs no shape).
        shape_required = self.subsample != (1, 1)
        if shape_required and shape is None:
            raise ValueError('shape must be given if subsample != (1, 1)')
        if shape_required:
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
            height_width = [shape[0], shape[1]]
        else:
            height_width = []
        # Output: (batch, num_channels, rows, cols).
        out_bcast = [topgrad.type.broadcastable[0],
                     kern.type.broadcastable[1],
                     False, False]
        out_type = GpuArrayType(dtype=topgrad.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [kern, topgrad] + height_width, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp[:2]
        height, width = inp[2:] or (None, None)
        bottom, = out_
        return super(GpuCorrMM_gradInputs, self).c_code_helper(
            bottom, weights, top, "backprop inputs", sub, height, width)

    def grad(self, inp, grads):
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
        d_weights = GpuCorrMM_gradWeights(self.border_mode,
                                          self.subsample,
                                          self.filter_dilation)(
            bottom, top, weights.shape[-2:])
        d_top = GpuCorrMM(self.border_mode,
                          self.subsample,
                          self.filter_dilation)(bottom, weights)
        if len(inp) == 4:
            # The height/width scalar inputs carry no gradient.
            d_height_width = (theano.gradient.DisconnectedType()(),) * 2
        else:
            d_height_width = ()
        return (d_weights, d_top) + d_height_width

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        # no connection to height, width
        return [[1], [1], [0], [0]]
class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
    """
    Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
    `GpuCorr3dMM_gradInputs`. Cannot be used directly.

    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size could be directly specified by an integer
        or a triple of integers
    subsample
        Perform subsampling of the output (default: (1, 1, 1)).
    filter_dilation
        Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).

    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_dilation')

    def __init__(self, border_mode="valid", subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1)):
        # A single integer means the same symmetric padding on all three axes.
        if isinstance(border_mode, integer_types):
            border_mode = (border_mode, border_mode, border_mode)
        if isinstance(border_mode, tuple):
            pad_h, pad_w, pad_d = map(int, border_mode)
            border_mode = (pad_h, pad_w, pad_d)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full', 'half')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer or a tuple of'
                ' three integers'.format(border_mode))
        self.border_mode = border_mode
        if len(subsample) != 3:
            raise ValueError("subsample must have three elements")
        if len(filter_dilation) != 3:
            raise ValueError("filter_dilation must have three elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        # The GEMM-based 3D correlation kernel lives in this support file.
        CGpuKernelBase.__init__(self, ['corr3d_gemm.c'])

    @property
    def pad(self):
        # Explicit padding triple as given, or (0, 0, 0) for 'valid' mode.
        if self.border_mode != 'valid':
            return self.border_mode
        return (0, 0, 0)

    def __str__(self):
        return '%s{%s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            str(self.filter_dilation))

    def flops(self, inp, outp):
        """
        Useful with the hack in profilemode to print the MFlops.
        """
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp
        outputs, = outp
        assert inputs[1] == filters[1]
        # nb mul and add by output pixel
        flops = filters[2] * filters[3] * filters[4] * 2
        # nb flops by output image
        flops *= outputs[2] * outputs[3] * outputs[4]
        # nb patch multiplied
        flops *= inputs[1] * filters[0] * inputs[0]
        return flops

    def get_params(self, node):
        # Run on the GPU context of the first input.
        return node.inputs[0].type.context

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (0, 1)

    def c_code_helper(self, bottom, weights, top, direction, sub,
                      height=None, width=None, depth=None):
        """
        This generates the C code for GpuCorr3dMM (direction="forward"),
        GpuCorr3dMM_gradWeights (direction="backprop weights"), and
        GpuCorr3dMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        Parameters
        ----------
        bottom
            Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        weights
            Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        top
            Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        direction : {'forward', 'backprop weights', 'backprop inputs'}
            "forward" to correlate bottom with weights and store results in top,
            "backprop weights" to do a valid convolution of bottom with top
            (swapping the first two dimensions) and store results in weights,
            and "backprop inputs" to do a full convolution of top with weights
            (swapping the first two dimensions) and store results in bottom.
        sub
            Dictionary of substitutions useable to help generating the C code.
        height
            If self.subsample[0] != 1, a variable giving the height of the
            filters for direction="backprop weights" or the height of the input
            images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the height of the
            filters for direction="backprop weights".
            Ignored otherwise.
        width
            If self.subsample[1] != 1, a variable giving the width of the
            filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the width of the
            filters for direction="backprop weights".
            Ignored otherwise.
        depth
            If self.subsample[2] != 1, a variable giving the depth of the
            filters for direction="backprop weights" or the depth of the
            input images for direction="backprop inputs".
            If self.border_mode == 'half', a variable giving the depth of the
            filters for direction="backprop weights".
            Ignored otherwise.
        """
        dH, dW, dD = self.subsample
        dilH, dilW, dilD = self.filter_dilation
        # Encode the border mode as sentinels understood by the C code:
        # -1 = half padding, -2 = full padding, >= 0 = explicit padding.
        if self.border_mode == "half":
            padH = padW = padD = -1
        elif self.border_mode == "full":
            padH = padW = padD = -2
        elif isinstance(self.border_mode, tuple):
            padH, padW, padD = self.border_mode
        else:
            assert self.border_mode == "valid"
            padH = padW = padD = 0
        if direction == "forward":
            direction = 0
            out = top
        elif direction == "backprop weights":
            direction = 1
            out = weights
        elif direction == "backprop inputs":
            direction = 2
            out = bottom
        else:
            raise ValueError("direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")
        # When subsampling, we cannot unambiguously infer the height, width
        # and depth of bottom and weights from top, so we require them to be
        # given. Similarly, when pad="half", we cannot infer the weight size.
        if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
            if not height:
                raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
            height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
        else:
            height = '0'
        if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
            if not width:
                raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
            width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
        else:
            width = '0'
        if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
            if not depth:
                # BUG FIX: the message previously said "horizontal sampling",
                # copied from the width branch; this branch is about depth.
                raise ValueError("depth must be given for backprop with depth sampling or pad='half'")
            depth = '(*(npy_int*)(PyArray_DATA(%s)))' % depth
        else:
            depth = '0'
        sync = ""
        if config.gpuarray.sync:
            # BUG FIX: the template below references %(fail)s, but `fail` was
            # not a local variable, so interpolating with `% locals()` raised
            # a KeyError whenever config.gpuarray.sync was enabled. Bind it
            # from `sub` first; the later sub.update(locals()) writes the
            # same value back, so nothing else changes.
            fail = sub['fail']
            sync = """
            int err = GpuArray_sync(&%(out)s->ga);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "BaseGpuCorr3dMM error: gpuarray sync failed.");
                %(fail)s;
            }
            """ % locals()
        sub = sub.copy()
        sub.update(locals())

        return """
    // Mandatory args
    int direction = %(direction)s; // forward, bprop weights, bprop inputs

    // Optional args
    size_t dH = %(dH)s;
    size_t dW = %(dW)s;
    size_t dD = %(dD)s;
    size_t dilH = %(dilH)s;
    size_t dilW = %(dilW)s;
    size_t dilD = %(dilD)s;
    int padH = %(padH)s;
    int padW = %(padW)s;
    int padD = %(padD)s;

    PyGpuArrayObject * bottom = %(bottom)s;
    PyGpuArrayObject * weights = %(weights)s;
    PyGpuArrayObject * top = %(top)s;
    PyGpuArrayObject * out2 = NULL;

    // Obtain or infer kernel height, width and depth
    // (we need to know it early to be able to handle auto-padding)
    size_t kH, kW, kD;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
        kH = PyGpuArray_DIMS(weights)[2];
        kW = PyGpuArray_DIMS(weights)[3];
        kD = PyGpuArray_DIMS(weights)[4];
    }
    else {
        if ((dH != 1) || (padH == -1)) {
            // vertical subsampling or half padding, kernel height is specified
            kH = %(height)s;
        }
        else if (padH == -2) {
            // vertical full padding, we can infer the kernel height
            kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
        }
        else {
            // explicit padding, we can infer the kernel height
            kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
        }
        if ((dW != 1) || (padW == -1)) {
            kW = %(width)s;
        }
        else if (padW == -2) {
            kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        else {
            kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        if ((dD != 1) || (padD == -1)) {
            kD = %(depth)s;
        }
        else if (padD == -2) {
            kD = (2 - PyGpuArray_DIMS(bottom)[4] + (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }
        else {
            kD = (PyGpuArray_DIMS(bottom)[4] + 2*padD - (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }
    }

    // Implicit dilated kernel size
    size_t dil_kH = (kH - 1) * dilH + 1;
    size_t dil_kW = (kW - 1) * dilW + 1;
    size_t dil_kD = (kD - 1) * dilD + 1;

    // Auto-padding if requested
    if (padH == -1) { // vertical half padding
        padH = dil_kH / 2;
    }
    else if (padH == -2) { // vertical full padding
        padH = dil_kH - 1;
    }
    else if (padH < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padH must be >= -2");
        %(fail)s
    }
    if (padW == -1) { // horizontal half padding
        padW = dil_kW / 2;
    }
    else if (padW == -2) { // horizontal full padding
        padW = dil_kW - 1;
    }
    else if (padW < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padW must be >= -2");
        %(fail)s
    }
    if (padD == -1) { // depth half padding
        padD = dil_kD / 2;
    }
    else if (padD == -2) { // depth full padding
        padD = dil_kD - 1;
    }
    else if (padD < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padD must be >= -2");
        %(fail)s
    }

    // Infer output shape and type
    size_t out_dim[5];
    int out_typecode;
    PyGpuContextObject *out_context;
    switch(direction) {
    case 0: // forward pass
        // output is top: (batchsize, num_filters, height, width, depth)
        // height, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = PyGpuArray_DIMS(bottom)[0];
        out_dim[1] = PyGpuArray_DIMS(weights)[0];
        out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
        out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
        out_dim[4] = (PyGpuArray_DIMS(bottom)[4] + 2*padD - ((PyGpuArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1;
        out_typecode = bottom->ga.typecode;
        out_context = bottom->context;
        break;
    case 1: // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width, depth)
        // height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = PyGpuArray_DIMS(top)[1];
        out_dim[1] = PyGpuArray_DIMS(bottom)[1];
        out_dim[2] = kH; // already inferred further above
        out_dim[3] = kW; // how convenient
        out_dim[4] = kD;
        out_typecode = top->ga.typecode;
        out_context = top->context;
        break;
    case 2: // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width, depth)
        // height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = PyGpuArray_DIMS(top)[0];
        out_dim[1] = PyGpuArray_DIMS(weights)[1];
        out_dim[2] = (dH != 1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
        out_dim[3] = (dW != 1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
        out_dim[4] = (dD != 1) ? %(depth)s : (PyGpuArray_DIMS(top)[4] - 1) * dD + (PyGpuArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
        out_typecode = top->ga.typecode;
        out_context = top->context;
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
        %(fail)s
    }

    // Prepare output array
    // (BUG FIX: the error message below previously named BaseGpuCorrMM.)
    if (theano_prep_output(&%(out)s, 5, out_dim, out_typecode, GA_C_ORDER, out_context) != 0)
    {
        PyErr_Format(PyExc_RuntimeError,
                     "BaseGpuCorr3dMM: Failed to allocate output of %%ld x %%ld x %%ld x %%ld x %%ld",
                     out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
        %(fail)s
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
        PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
        %(fail)s
    }

    // Call GPU code
    out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction,
                    dH, dW, dD, dilH, dilW, dilD, padH, padW, padD);
    if (out2==NULL){
        %(fail)s
    }
    assert (out2 == %(out)s);
    %(sync)s
""" % sub
class GpuCorr3dMM(BaseGpuCorr3dMM):
    """
    GPU 3D correlation implementation using Matrix Multiplication.

    Parameters
    ----------
    border_mode
        Implicit zero-padding of the input. Either a triple of integers
        giving the padding width on each side, a single integer used for
        all sides, or one of the shortcuts ``'valid'`` (``(0, 0, 0)``,
        no padding), ``'full'`` (``(kernel_rows - 1, kernel_columns - 1,
        kernel_depth - 1)``) or ``'half'`` (``(kernel_rows // 2,
        kernel_columns // 2, kernel_depth // 2)``, a "same" convolution
        for odd-sized kernels). Each width is applied on both sides of
        its dimension (left and right, top and bottom, front and back).
    subsample
        Triple of output strides. ``(sv, sh, sl)`` is equivalent to
        slicing the full output with ``[:, :, ::sv, ::sh, ::sl]``, but
        faster. Use ``(1, 1, 1)`` to disable subsampling.
    filter_dilation
        Triple of filter dilation factors applied to each input image.
        Use ``(1, 1, 1)`` to disable dilation.

    Notes
    -----
    The op requires inputs, filters and outputs to be C-contiguous;
    wrap arguments with :func:`gpu_contiguous
    <theano.gpuarray.basic_ops.gpu_contiguous>` if needed.

    Either enable the Theano flag `optimizer_including=conv_gemm` to
    replace convolutions with `GpuCorr3dMM` automatically, or call
    `GpuCorr3dMM(subsample=...)(image, filters)` directly. Note that
    it computes a correlation -- for a true convolution, flip the
    filters as `filters[:,:,::-1,::-1,::-1]`.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1)):
        super(GpuCorr3dMM, self).__init__(border_mode, subsample,
                                          filter_dilation)

    def make_node(self, img, kern):
        # Move both inputs to a common GPU context, then validate rank.
        ctx_name = infer_context_name(img, kern)
        img = as_gpuarray_variable(img, ctx_name)
        kern = as_gpuarray_variable(kern, ctx_name)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        # Output: (batch, num_filters, rows, cols, slices); only the
        # first two axes can inherit broadcastability from the inputs.
        out_bcast = [img.type.broadcastable[0],
                     kern.type.broadcastable[0],
                     False, False, False]
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [img, kern], [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        top, = out_
        # Forward pass: correlate bottom with weights, write into top.
        return super(GpuCorr3dMM, self).c_code_helper(
            bottom, weights, top, "forward", sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
        top = gpu_contiguous(top)
        grad_inputs_op = GpuCorr3dMM_gradInputs(self.border_mode,
                                                self.subsample,
                                                self.filter_dilation)
        grad_weights_op = GpuCorr3dMM_gradWeights(self.border_mode,
                                                  self.subsample,
                                                  self.filter_dilation)
        d_bottom = grad_inputs_op(weights, top, bottom.shape[-3:])
        d_weights = grad_weights_op(bottom, top, weights.shape[-3:])
        return d_bottom, d_weights
class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
    """
    Gradient wrt. filters for `GpuCorr3dMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Theano's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1)):
        super(GpuCorr3dMM_gradWeights, self).__init__(
            border_mode, subsample, filter_dilation)

    def make_node(self, img, topgrad, shape=None):
        """Build the Apply node for the filter gradient.

        `shape` (three scalar variables giving the kernel size) is
        mandatory whenever it cannot be inferred from `img` and `topgrad`,
        i.e. for strided correlation or "half" padding.
        """
        ctx_name = infer_context_name(img, topgrad)
        img = as_gpuarray_variable(img, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if self.subsample != (1, 1, 1) or self.border_mode == "half":
            if shape is None:
                raise ValueError('shape must be given if subsample != (1, 1, 1)'
                                 ' or border_mode == "half"')
            height_width_depth = [shape[0], shape[1], shape[2]]
            for dim in height_width_depth:
                assert dim.ndim == 0
        else:
            height_width_depth = []
        out_bcast = [topgrad.type.broadcastable[1],
                     img.type.broadcastable[1]] + [False] * 3
        out_type = GpuArrayType(dtype=img.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [img, topgrad] + height_width_depth, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        """Generate C code for the weight-gradient pass via the helper."""
        bottom, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        weights, = out_
        return super(GpuCorr3dMM_gradWeights, self).c_code_helper(
            bottom, weights, top, "backprop weights", sub,
            height, width, depth)

    def grad(self, inp, grads):
        """Gradients wrt. `img` and `topgrad`; shape inputs (if any) are
        disconnected."""
        bottom, top = inp[:2]
        weights, = grads
        weights = gpu_contiguous(weights)
        grad_inputs_op = GpuCorr3dMM_gradInputs(
            self.border_mode, self.subsample, self.filter_dilation)
        forward_op = GpuCorr3dMM(
            self.border_mode, self.subsample, self.filter_dilation)
        d_bottom = grad_inputs_op(weights, top, bottom.shape[-3:])
        d_top = forward_op(bottom, weights)
        if len(inp) == 5:
            d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3
        else:
            d_height_width_depth = ()
        return (d_bottom, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            # no connection to height, width, depth
            return [[1], [1], [0], [0], [0]]
class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
    """
    Gradient wrt. inputs for `GpuCorr3dMM`.

    Notes
    -----
    You will not want to use this directly, but rely on Theano's automatic
    differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1)):
        super(GpuCorr3dMM_gradInputs, self).__init__(
            border_mode, subsample, filter_dilation)

    def make_node(self, kern, topgrad, shape=None):
        """Build the Apply node for the input gradient.

        `shape` (three scalar variables giving the image size) is
        mandatory for strided correlation, where the input shape cannot
        be recovered from `kern` and `topgrad` alone.
        """
        ctx_name = infer_context_name(kern, topgrad)
        kern = as_gpuarray_variable(kern, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if self.subsample != (1, 1, 1):
            if shape is None:
                raise ValueError('shape must be given if subsample != (1, 1, 1)')
            height_width_depth = [shape[0], shape[1], shape[2]]
            for dim in height_width_depth:
                assert dim.ndim == 0
        else:
            height_width_depth = []
        out_bcast = [topgrad.type.broadcastable[0],
                     kern.type.broadcastable[1]] + [False] * 3
        out_type = GpuArrayType(dtype=topgrad.dtype,
                                context_name=ctx_name,
                                broadcastable=out_bcast)
        return Apply(self, [kern, topgrad] + height_width_depth, [out_type()])

    def c_code(self, node, nodename, inp, out_, sub):
        """Generate C code for the input-gradient pass via the helper."""
        weights, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        bottom, = out_
        return super(GpuCorr3dMM_gradInputs, self).c_code_helper(
            bottom, weights, top, "backprop inputs", sub,
            height, width, depth)

    def grad(self, inp, grads):
        """Gradients wrt. `kern` and `topgrad`; shape inputs (if any) are
        disconnected."""
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
        grad_weights_op = GpuCorr3dMM_gradWeights(
            self.border_mode, self.subsample, self.filter_dilation)
        forward_op = GpuCorr3dMM(
            self.border_mode, self.subsample, self.filter_dilation)
        d_weights = grad_weights_op(bottom, top, weights.shape[-3:])
        d_top = forward_op(bottom, weights)
        if len(inp) == 5:
            d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3
        else:
            d_height_width_depth = ()
        return (d_weights, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            # no connection to height, width, depth
            return [[1], [1], [0], [0], [0]]
@inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs):
    """Rewrite a GpuGemv node to its in-place variant when the output
    buffer can safely be reused (decided by `inplace_allocempty`)."""
    replacement = gpugemv_inplace(*inputs)
    return [replacement]
......
#section kernels
#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, * :
// TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// GPU kernel for the case of dilation
// Unfold one (channels, height, width, depth) image into the column matrix
// of shape (channels*kernel_h*kernel_w*kernel_d) x
// (height_col*width_col*depth_col), honouring stride, padding and dilation.
// One logical thread per output column position, grid-stride looped over n.
KERNEL void dilated_im3d2col_kernel(const ga_size n,
                                    GLOBAL_MEM const DTYPE_i0 * data_im,
                                    const ga_size data_im_offset,
                                    const ga_size height, const ga_size width, const ga_size depth,
                                    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                                    const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_d,
                                    const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                                    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                                    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                                    GLOBAL_MEM DTYPE_i0 * data_col) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    // Decompose the flat index into (channel, h_col, w_col, d_col).
    const ga_size w_index = index / depth_col;
    const ga_size h_index = w_index / width_col;
    const ga_size d_col = index % depth_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = w_index % width_col;
    const ga_size c_im = h_index / height_col;
    const ga_size c_col = c_im * kernel_h * kernel_w * kernel_d;
    // Corner of the receptive field.  NOTE(review): ga_size appears to be
    // unsigned, so a position inside the padding wraps to a huge value and
    // is rejected by the `< height` style bounds tests; the `>= 0` tests
    // below are then vacuous -- confirm ga_size signedness.
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    const ga_size d_offset = d_col * stride_d - pad_d;
    DTYPE_i0 * data_col_ptr = data_col;
    data_col_ptr += c_col * (height_col * width_col * depth_col) +
      h_col * (width_col * depth_col) + w_col * depth_col + d_col;
    const DTYPE_i0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += c_im * (height * width * depth) +
      h_offset * (width * depth) + w_offset * depth + d_offset;
    // Copy one kernel-sized (dilated) patch; padded positions are zero.
    for (ga_size i = 0; i < kernel_h; ++i) {
      ga_size h_im = h_offset + i * dilation_h;
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size w_im = w_offset + j * dilation_w;
        for (ga_size k = 0; k < kernel_d; ++k) {
          ga_size d_im = d_offset + k * dilation_d;
          *data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
                           h_im < height && w_im < width && d_im < depth) ?
                          data_im_ptr[i * dilation_h * (width * depth) +
                                      j * dilation_w * depth +
                                      k * dilation_d] : 0;
          // Successive kernel taps are height_col*width_col*depth_col apart
          // in the column buffer (row-major over kernel taps).
          data_col_ptr += height_col * width_col * depth_col;
        }
      }
    }
  }
}
#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, * :
// Non-dilated variant of dilated_im3d2col_kernel (dilation == 1 fast path):
// unfolds one image into the im2col column matrix with stride and padding.
KERNEL void im3d2col_kernel(const ga_size n,
                            GLOBAL_MEM const DTYPE_i0 * data_im,
                            const ga_size data_im_offset,
                            const ga_size height, const ga_size width, const ga_size depth,
                            const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                            const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                            const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                            const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                            GLOBAL_MEM DTYPE_i0 * data_col) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    // Decompose the flat index into (channel, h_col, w_col, d_col).
    const ga_size w_index = index / depth_col;
    const ga_size h_index = w_index / width_col;
    const ga_size d_col = index % depth_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = w_index % width_col;
    const ga_size c_im = h_index / height_col;
    const ga_size c_col = c_im * kernel_h * kernel_w * kernel_d;
    // Receptive-field corner; may "underflow" into padding (see note in the
    // dilated kernel about unsigned wrap-around of ga_size).
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    const ga_size d_offset = d_col * stride_d - pad_d;
    DTYPE_i0 * data_col_ptr = data_col;
    data_col_ptr += c_col * (height_col * width_col * depth_col) +
      h_col * (width_col * depth_col) + w_col * depth_col + d_col;
    const DTYPE_i0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += c_im * (height * width * depth) +
      h_offset * (width * depth) + w_offset * depth + d_offset;
    // Copy one kernel-sized patch; out-of-bounds (padded) taps read as 0.
    for (ga_size i = 0; i < kernel_h; ++i) {
      ga_size h_im = h_offset + i;
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size w_im = w_offset + j;
        for (ga_size k = 0; k < kernel_d; ++k) {
          ga_size d_im = d_offset + k;
          *data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
                           h_im < height && w_im < width && d_im < depth) ?
                          data_im_ptr[i * (width * depth) + j * depth + k] : 0;
          data_col_ptr += height_col * width_col * depth_col;
        }
      }
    }
  }
}
// GPU kernel for the case of dilation
#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// Fold the column matrix back into one image (inverse of im3d2col) with
// dilation support.  One logical thread per *image* element; it sums every
// column entry that the element contributed to, so no atomics are needed.
KERNEL void dilated_col2im3d_kernel(const ga_size n,
                                    GLOBAL_MEM const DTYPE_i0 * data_col,
                                    const ga_size height, const ga_size width, const ga_size depth,
                                    const ga_size channels,
                                    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                                    const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_d,
                                    const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                                    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                                    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                                    GLOBAL_MEM DTYPE_i0 * data_im,
                                    const ga_size data_im_offset) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_i0 val = 0;
    // Position in the padded image that this thread accumulates.
    const ga_size d_im = index % depth + pad_d;
    const ga_size w_index = index / depth;
    const ga_size w_im = w_index % width + pad_w;
    const ga_size h_index = w_index / width;
    const ga_size h_im = h_index % height + pad_h;
    const ga_size c_im = h_index / height;
    // Effective (dilated) kernel footprint.
    ga_size kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    ga_size kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    ga_size kernel_extent_d = (kernel_d - 1) * dilation_d + 1;
    // compute the start and end of the output
    const ga_size d_col_start =
      (d_im < kernel_extent_d) ? 0 : (d_im - kernel_extent_d) / stride_d + 1;
    const ga_size d_col_end = min(d_im / stride_d + 1, depth_col);
    const ga_size w_col_start =
      (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (ga_size d_col = d_col_start; d_col < d_col_end; ++d_col) {
      for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
          // Offset of this image element inside the candidate patch; it is
          // a real kernel tap only when divisible by the dilation.
          ga_size h_k = (h_im - h_col * stride_h);
          ga_size w_k = (w_im - w_col * stride_w);
          ga_size d_k = (d_im - d_col * stride_d);
          if (h_k % dilation_h == 0 && w_k % dilation_w == 0 && d_k % dilation_d == 0) {
            h_k /= dilation_h;
            w_k /= dilation_w;
            d_k /= dilation_d;
            ga_size data_col_index = c_im * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col +
              h_k * kernel_w * kernel_d * height_col * width_col * depth_col +
              w_k * kernel_d * height_col * width_col * depth_col +
              d_k * height_col * width_col * depth_col +
              h_col * width_col * depth_col +
              w_col * depth_col +
              d_col;
            val += data_col[data_col_index];
          }
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
// Non-dilated col2im fold (dilation == 1 fast path): one thread per image
// element sums all column entries it contributed to; no atomics required.
KERNEL void col2im3d_kernel(const ga_size n,
                            GLOBAL_MEM const DTYPE_i0 * data_col,
                            const ga_size height, const ga_size width, const ga_size depth,
                            const ga_size channels,
                            const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
                            const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
                            const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
                            const ga_size height_col, const ga_size width_col, const ga_size depth_col,
                            GLOBAL_MEM DTYPE_i0 * data_im,
                            const ga_size data_im_offset) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_i0 val = 0;
    // Position in the padded image that this thread accumulates.
    const ga_size d_im = index % depth + pad_d;
    const ga_size w_index = index / depth;
    const ga_size w_im = w_index % width + pad_w;
    const ga_size h_index = w_index / width;
    const ga_size h_im = h_index % height + pad_h;
    const ga_size c_im = h_index / height;
    // compute the start and end of the output
    const ga_size d_col_start = (d_im < kernel_d) ? 0 : (d_im - kernel_d) / stride_d + 1;
    const ga_size d_col_end = min(d_im / stride_d + 1, depth_col);
    const ga_size w_col_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // Precomputed base offset and per-axis coefficients so the inner loop
    // indexes data_col with one multiply-add per axis (Caffe's scheme).
    ga_size offset =
      (c_im * kernel_h * kernel_w * kernel_d + h_im * kernel_w * kernel_d +
       w_im * kernel_d + d_im) * height_col * width_col * depth_col;
    ga_size coeff_h_col = (1 - stride_h * kernel_w * kernel_d * height_col) * width_col * depth_col;
    ga_size coeff_w_col = (1 - stride_w * kernel_d * height_col * width_col) * depth_col;
    ga_size coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
    for (ga_size d_col = d_col_start; d_col < d_col_end; ++d_col) {
      for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#section support_code_struct
// Host-side launcher: unfold one image (at data_im + data_im_offset) into
// the preallocated column buffer data_col.  Dispatches to the dilated or
// the plain kernel.  Returns a gpuarray error code; on failure a Python
// RuntimeError is already set.
int im3d2col(const size_t max_threads_dim,
             gpudata * data_im, const size_t data_im_offset, const size_t channels,
             const size_t height, const size_t width, const size_t depth,
             const size_t kernel_h, const size_t kernel_w, const size_t kernel_d,
             const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
             const size_t pad_h, const size_t pad_w, const size_t pad_d,
             const size_t stride_h, const size_t stride_w, const size_t stride_d,
             gpudata * data_col) {
  // We are going to launch channels * height_col * width_col * depth_col
  // kernels, each kernel responsible for copying a single-channel grid.
  // Output spatial extents follow the standard convolution shape formula
  // with the dilated ("effective") kernel size.
  size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  size_t dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
  size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
  size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  size_t depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
  size_t num_kernels = channels * height_col * width_col * depth_col;
  size_t threads_per_block = max_threads_dim;
  // Ceiling division; the kernels grid-stride loop, so any grid size works.
  size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
  int err;
  GpuKernel *kernel;
  if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
    err = dilated_im3d2col_kernel_call(
      1, &threads_per_block, &n_blocks, 0,
      num_kernels, data_im, data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d,
      pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col,
      width_col, depth_col, data_col);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_im3d2col_kernel: %s.",
                   GpuKernel_error(&k_dilated_im3d2col_kernel, err));
    }
  }
  else{
    err = im3d2col_kernel_call(
      1, &threads_per_block, &n_blocks, 0,
      num_kernels, data_im, data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, height_col, width_col, depth_col,
      data_col);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: im3d2col_kernel: %s.",
                   GpuKernel_error(&k_im3d2col_kernel, err));
    }
  }
  return err;
}
// Host-side launcher for the inverse fold: accumulate the column buffer
// data_col back into one image (at data_im + data_im_offset).  Dispatches
// to the dilated or the plain kernel.  Returns a gpuarray error code; on
// failure a Python RuntimeError is already set.
int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t channels,
             const size_t height, const size_t width, const size_t depth,
             const size_t patch_h, const size_t patch_w, const size_t patch_d,
             const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
             const size_t pad_h, const size_t pad_w, const size_t pad_d,
             const size_t stride_h, const size_t stride_w, const size_t stride_d,
             gpudata * data_im, const size_t data_im_offset) {
  size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
  size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
  size_t dil_patch_d = (patch_d - 1) * dilation_d + 1;
  size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
  size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  size_t depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
  // One thread per *image* element (not per column element).
  size_t num_kernels = channels * height * width * depth;
  size_t threads_per_block = max_threads_dim;
  size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.
  int err;
  if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
    err = dilated_col2im3d_kernel_call(
      1, &threads_per_block, &n_blocks, 0,
      num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
      patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, height_col, width_col, depth_col,
      data_im, data_im_offset);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_col2im3d_kernel: %s.",
                   GpuKernel_error(&k_dilated_col2im3d_kernel, err));
    }
  }
  else{
    err = col2im3d_kernel_call(
      1, &threads_per_block, &n_blocks, 0,
      num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
      patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d,
      height_col, width_col, depth_col, data_im, data_im_offset);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: col2im3d_kernel: %s.",
                   GpuKernel_error(&k_col2im3d_kernel, err));
    }
  }
  return err;
}
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
// Compute a 3D correlation (or one of its gradients) as im2col + GEMM.
//
// direction: 0 = forward (bottom, weight -> top),
//            1 = gradient wrt. weights (bottom, top -> weight),
//            2 = gradient wrt. inputs (weight, top -> bottom).
// dH/dW/dD: strides; dilH/dilW/dilD: filter dilation; padH/padW/padD: padding.
//
// All three arrays must be 5D and C-contiguous; the array selected by
// `direction` is overwritten and returned (borrowed, refcount unchanged).
// Returns NULL with a Python exception set on error.
PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
                           PyGpuArrayObject *const weight,
                           PyGpuArrayObject *const top,
                           const size_t direction,
                           const size_t dH = 1,
                           const size_t dW = 1,
                           const size_t dD = 1,
                           const size_t dilH = 1,
                           const size_t dilW = 1,
                           const size_t dilD = 1,
                           const size_t padH = 0,
                           const size_t padW = 0,
                           const size_t padD = 0)
{
  if (PyGpuArray_NDIM(bottom) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires bottom to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(bottom)[0],
                 PyGpuArray_STRIDES(bottom)[1],
                 PyGpuArray_STRIDES(bottom)[2],
                 PyGpuArray_STRIDES(bottom)[3],
                 PyGpuArray_STRIDES(bottom)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(weight) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires weight to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(weight)[0],
                 PyGpuArray_STRIDES(weight)[1],
                 PyGpuArray_STRIDES(weight)[2],
                 PyGpuArray_STRIDES(weight)[3],
                 PyGpuArray_STRIDES(weight)[4]);
    return NULL;
  }
  if (PyGpuArray_NDIM(top) != 5)
  {
    PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
    return NULL;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
  {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM requires top to be C-contiguous, "
                 "but strides are: %ld %ld %ld %ld %ld\n",
                 PyGpuArray_STRIDES(top)[0],
                 PyGpuArray_STRIDES(top)[1],
                 PyGpuArray_STRIDES(top)[2],
                 PyGpuArray_STRIDES(top)[3],
                 PyGpuArray_STRIDES(top)[4]);
    return NULL;
  }
  // Extract some shape information for later and check shape consistency
  // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
  const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
  const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
  const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
  const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
  const size_t bottomDepth = PyGpuArray_DIMS(bottom)[4];
  // weights: (nFilters, nChannels, rows, columns, slices)
  const size_t nFilters = PyGpuArray_DIMS(weight)[0];
  const size_t kH = PyGpuArray_DIMS(weight)[2];
  const size_t kW = PyGpuArray_DIMS(weight)[3];
  const size_t kD = PyGpuArray_DIMS(weight)[4];
  if (nChannels != PyGpuArray_DIMS(weight)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuCorr3dMM images and kernel must have the same stack size\n");
    return NULL;
  }
  // implicit dilated filter
  const size_t dil_kH = (kH - 1) * dilH + 1;
  const size_t dil_kW = (kW - 1) * dilW + 1;
  const size_t dil_kD = (kD - 1) * dilD + 1;
  // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
  const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
  const size_t topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
  const size_t topDepth = (bottomDepth + 2*padD - dil_kD) / dD + 1;
  if (batchSize != PyGpuArray_DIMS(top)[0] ||
      nFilters != PyGpuArray_DIMS(top)[1] ||
      topHeight != PyGpuArray_DIMS(top)[2] ||
      topWidth != PyGpuArray_DIMS(top)[3] ||
      topDepth != PyGpuArray_DIMS(top)[4]) {
    PyErr_Format(PyExc_ValueError,
                 "GpuCorr3dMM shape inconsistency:\n"
                 "  bottom shape: %ld %ld %ld %ld %ld\n"
                 "  weight shape: %ld %ld %ld %ld %ld\n"
                 "  top shape: %ld %ld %ld %ld %ld (expected %ld %ld %ld %ld %ld)\n",
                 batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                 nFilters, nChannels, kH, kW, kD,
                 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], PyGpuArray_DIMS(top)[4],
                 batchSize, nFilters, topHeight, topWidth, topDepth);
    return NULL;
  }
  int err = gpublas_setup(bottom->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return NULL;
  }
  // Get the max threads per blocks
  size_t max_threads_dim;
  err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
  if (err != GA_NO_ERROR){
    PyErr_Format(PyExc_RuntimeError,
                 "Could not fetch max_threads_dim.");
    return NULL;
  }
  // Create temporary columns: one im2col matrix, reused for every batch item.
  size_t col_dim[2];
  col_dim[0] = nChannels * kW * kH * kD;
  col_dim[1] = topHeight * topWidth * topDepth;
  PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                         bottom->ga.typecode,
                                                         GA_C_ORDER,
                                                         bottom->context,
                                                         Py_None);
  if (NULL == col)
  {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuCorr3dMM failed to allocate working memory of %ld x %ld\n",
                 col_dim[0], col_dim[1]);
    return NULL;
  }
  // Define some useful variables
  // Per-batch-item element strides (byte strides divided by element size).
  const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
  const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
  // GEMM dimensions: M_ x K_ (weights) times K_ x N_ (columns).
  const size_t K_ = col_dim[0];
  const size_t N_ = col_dim[1];
  const size_t M_ = nFilters;
  // NOTE(review): the constants and the gpublas_sgemm calls below are
  // single precision; presumably only float32 arrays reach this code
  // path -- confirm against the callers' dtype handling.
  const DTYPE_INPUT_0 one = 1.0f;
  const DTYPE_INPUT_0 zero = 0.0f;
  // NOTE(review): if direction is not 0, 1 or 2, `output` stays
  // uninitialized and is returned below; callers appear to only pass
  // 0-2 -- confirm.
  PyGpuArrayObject *output;
  if (direction == 0) {  // forward pass
    output = top;
    // valid correlation: im3d2col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(max_threads_dim,
                     bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                     bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      // (cb_fortran: column-major convention, so operands appear swapped.)
      err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                          N_, M_, K_, one,
                          col->ga.data, 0, N_,
                          weight->ga.data, 0, K_,
                          zero,
                          top->ga.data, n * top_stride, N_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  else if (direction == 1) {  // backprop wrt. weights
    output = weight;
    // valid convolution: im3col, then gemm
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // First, im3d2col
      err = im3d2col(max_threads_dim,
                     bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                     bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
                     padH, padW, padD, dH, dW, dD, col->ga.data);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
      // Second, gemm
      // Note that we accumulate into weight. We do so by setting beta = 0
      // for the first iteration and beta = 1 for subsequent ones. (This
      // is faster than setting weight to all zeros before the loop.)
      err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                          K_, M_, N_, one,
                          col->ga.data, 0, N_,
                          top->ga.data, n * top_stride, N_,
                          (n == 0) ? zero : one,
                          weight->ga.data, 0, K_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  else if (direction == 2) {  // backprop wrt. inputs
    output = bottom;
    // full convolution: gemm, then col2im3d
    // Iterate over batch
    for (size_t n = 0; n < batchSize; n++) {
      // gemm into columns
      err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                          N_, K_, M_, one,
                          top->ga.data, n * top_stride, N_,
                          weight->ga.data, 0, K_,
                          zero,
                          col->ga.data, 0, N_);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM encountered an error running sgemm.\n");
        Py_DECREF(col);
        return NULL;
      }
      // col2im3d back to the data
      err = col2im3d(max_threads_dim,
                     col->ga.data, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
                     dH, dW, dD, bottom->ga.data, n * bottom_stride);
      if (err != GA_NO_ERROR) {
        Py_DECREF(col);
        return NULL;
      }
    }
  }
  // Free temporary columns
  Py_DECREF(col);
  // Note that we don't change the refcount of the output matrix here. Output
  // (re)allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
  // in here output is just aliased to one of bottom, weights, or top.
  return output;
}
#section kernels
#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, * :
// TODO check kernel flags
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// GPU kernel for the case of dilation
// 2D analogue of dilated_im3d2col_kernel: unfold one (channels, height,
// width) image into the im2col column matrix with stride, padding and
// dilation.  One logical thread per column position, grid-stride looped.
KERNEL void dilated_im2col_kernel(const ga_size n,
                                  GLOBAL_MEM const DTYPE_i0 * data_im,
                                  const ga_size data_im_offset,
                                  const ga_size height, const ga_size width,
                                  const ga_size kernel_h, const ga_size kernel_w,
                                  const ga_size dilation_h, const ga_size dilation_w,
                                  const ga_size pad_h, const ga_size pad_w,
                                  const ga_size stride_h, const ga_size stride_w,
                                  const ga_size height_col, const ga_size width_col,
                                  GLOBAL_MEM DTYPE_i0 * data_col) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    // Decompose the flat index into (channel, h_col, w_col).
    const ga_size h_index = index / width_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = index % width_col;
    const ga_size c_im = h_index / height_col;
    const ga_size c_col = c_im * kernel_h * kernel_w;
    // Receptive-field corner; may land in the padding (unsigned wrap of
    // ga_size is caught by the `< height`/`< width` tests below).
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    DTYPE_i0 * data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const DTYPE_i0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    // Copy one kernel-sized (dilated) patch; padded taps read as 0.
    for (ga_size i = 0; i < kernel_h; ++i) {
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size h_im = h_offset + i * dilation_h;
        ga_size w_im = w_offset + j * dilation_w;
        *data_col_ptr =
          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
          data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, * :
// Non-dilated 2D im2col (dilation == 1 fast path): unfold one image into
// the column matrix with stride and padding.
KERNEL void im2col_kernel(const ga_size n,
                          GLOBAL_MEM const DTYPE_i0 * data_im,
                          const ga_size data_im_offset,
                          const ga_size height, const ga_size width,
                          const ga_size kernel_h, const ga_size kernel_w,
                          const ga_size pad_h, const ga_size pad_w,
                          const ga_size stride_h, const ga_size stride_w,
                          const ga_size height_col, const ga_size width_col,
                          GLOBAL_MEM DTYPE_i0 * data_col) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    // Decompose the flat index into (channel, h_col, w_col).
    const ga_size h_index = index / width_col;
    const ga_size h_col = h_index % height_col;
    const ga_size w_col = index % width_col;
    const ga_size c_im = h_index / height_col;
    const ga_size c_col = c_im * kernel_h * kernel_w;
    const ga_size h_offset = h_col * stride_h - pad_h;
    const ga_size w_offset = w_col * stride_w - pad_w;
    DTYPE_i0 * data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const DTYPE_i0 * data_im_ptr = data_im + data_im_offset;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    // Copy one kernel-sized patch; out-of-bounds (padded) taps read as 0.
    for (ga_size i = 0; i < kernel_h; ++i) {
      for (ga_size j = 0; j < kernel_w; ++j) {
        ga_size h_im = h_offset + i ;
        ga_size w_im = w_offset + j ;
        *data_col_ptr =
          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
          data_im_ptr[i * width + j] : 0;
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
// GPU kernel for the case of dilation
#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
/*
 * dilated_col2im_kernel: inverse of dilated_im2col_kernel -- accumulates
 * the column matrix back into the image.  One thread owns exactly one
 * image element (n = channels * height * width) and gathers every column
 * entry that maps onto it; this makes the sum race-free without atomics.
 */
KERNEL void dilated_col2im_kernel(const ga_size n,
                                  GLOBAL_MEM const DTYPE_i0 * data_col,
                                  const ga_size height, const ga_size width, const ga_size channels,
                                  const ga_size kernel_h, const ga_size kernel_w,
                                  const ga_size dilation_h, const ga_size dilation_w,
                                  const ga_size pad_h, const ga_size pad_w,
                                  const ga_size stride_h, const ga_size stride_w,
                                  const ga_size height_col, const ga_size width_col,
                                  GLOBAL_MEM DTYPE_i0 * data_im,
                                  const ga_size data_im_offset) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_i0 val = 0;
    // coordinates of this image element in the *padded* image
    const ga_size w_im = index % width + pad_w;
    const ga_size h_im = (index / width) % height + pad_h;
    const ga_size c_im = index / (width * height);
    // effective kernel extent once dilation is applied
    ga_size kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    ga_size kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    // compute the range of output positions whose receptive field
    // contains (h_im, w_im)
    const ga_size w_col_start =
      (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (ga_size h_col = h_col_start; h_col < h_col_end; h_col += 1) {
      for (ga_size w_col = w_col_start; w_col < w_col_end; w_col += 1) {
        // offset of (h_im, w_im) inside this output position's patch
        ga_size h_k = (h_im - h_col * stride_h);
        ga_size w_k = (w_im - w_col * stride_w);
        // only dilation-aligned offsets correspond to a kernel tap
        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
          h_k /= dilation_h;
          w_k /= dilation_w;
          ga_size data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
                                    height_col + h_col) * width_col + w_col;
          val += data_col[data_col_index];
        }
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, *, size :
/*
 * col2im_kernel: accumulate the column matrix back into the image for the
 * dilation == 1 case.  One thread per image element (gather, no atomics).
 */
KERNEL void col2im_kernel(const ga_size n,
                          GLOBAL_MEM const DTYPE_i0 * data_col,
                          const ga_size height, const ga_size width, const ga_size channels,
                          const ga_size kernel_h, const ga_size kernel_w,
                          const ga_size pad_h, const ga_size pad_w,
                          const ga_size stride_h, const ga_size stride_w,
                          const ga_size height_col, const ga_size width_col,
                          GLOBAL_MEM DTYPE_i0 * data_im,
                          const ga_size data_im_offset) {
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
    DTYPE_i0 val = 0;
    // coordinates of this image element in the *padded* image
    const ga_size w_im = index % width + pad_w;
    const ga_size h_im = (index / width) % height + pad_h;
    const ga_size c_im = index / (width * height);
    // compute the range of output positions whose receptive field
    // contains (h_im, w_im)
    const ga_size w_col_start =
      (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
    const ga_size w_col_end = min(w_im / stride_w + 1, width_col);
    const ga_size h_col_start =
      (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
    const ga_size h_col_end = min(h_im / stride_h + 1, height_col);
    // equivalent implementation, no dilation: precompute a base offset
    // and per-step coefficients so the inner loop walks data_col with
    // two multiply-adds instead of recomputing the full 5-term index
    ga_size offset =
      (c_im * kernel_h * kernel_w + h_im * kernel_w + w_im) * height_col * width_col;
    ga_size coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
    ga_size coeff_w_col = (1 - stride_w * height_col * width_col);
    for (ga_size h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (ga_size w_col = w_col_start; w_col < w_col_end; ++w_col) {
        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[data_im_offset + index] = val;
  }
}
#section support_code_struct
/*
 * im2col: host-side dispatcher that launches the appropriate im2col GPU
 * kernel (plain or dilated) over the whole image.
 *
 * data_im + data_im_offset points at one image of shape
 * (channels, height, width); data_col receives the column matrix of shape
 * (channels * kernel_h * kernel_w, height_col * width_col).
 *
 * Returns GA_NO_ERROR on success; on failure, sets a Python RuntimeError
 * and returns the gpuarray error code.
 *
 * Fix: removed the unused local `GpuKernel *kernel;`.
 */
int im2col(const size_t max_threads_dim,
           gpudata * data_im, const size_t data_im_offset, const size_t channels,
           const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
           const size_t dilation_h, const size_t dilation_w,
           const size_t pad_h, const size_t pad_w,
           const size_t stride_h, const size_t stride_w,
           gpudata * data_col) {
  // We are going to launch channels * height_col * width_col kernels, each
  // kernel responsible for copying a single-channel grid.
  size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
  size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  size_t num_kernels = channels * height_col * width_col;
  size_t threads_per_block = max_threads_dim;
  size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
  int err;
  if (dilation_h != 1 || dilation_w != 1) {
    err = dilated_im2col_kernel_call(
        1, &threads_per_block, &n_blocks, 0,
        num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
        dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
        width_col, data_col);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_im2col_kernel: %s.",
                   GpuKernel_error(&k_dilated_im2col_kernel, err));
    }
  }
  else {
    // dilation of 1: use the cheaper specialized kernel
    err = im2col_kernel_call(
        1, &threads_per_block, &n_blocks, 0,
        num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
        pad_h, pad_w, stride_h, stride_w, height_col,
        width_col, data_col);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: im2col_kernel: %s.",
                   GpuKernel_error(&k_im2col_kernel, err));
    }
  }
  return err;
}
/*
 * col2im: host-side dispatcher that launches the appropriate col2im GPU
 * kernel (plain or dilated), scattering the column matrix in data_col
 * back into one image at data_im + data_im_offset.
 *
 * Returns GA_NO_ERROR on success; on failure, sets a Python RuntimeError
 * and returns the gpuarray error code.
 */
int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channels,
           const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
           const size_t dilation_h, const size_t dilation_w,
           const size_t pad_h, const size_t pad_w, const size_t stride_h,
           const size_t stride_w, gpudata * data_im, const size_t data_im_offset) {
  /* effective patch extent once dilation is applied */
  const size_t eff_patch_h = (patch_h - 1) * dilation_h + 1;
  const size_t eff_patch_w = (patch_w - 1) * dilation_w + 1;
  const size_t out_h = (height + 2 * pad_h - eff_patch_h) / stride_h + 1;
  const size_t out_w = (width + 2 * pad_w - eff_patch_w) / stride_w + 1;
  /* To avoid involving atomic operations, we launch one thread per
     bottom (image) element and have the kernel sum over the top
     (column) entries that map onto it. */
  const size_t total_threads = channels * height * width;
  size_t block_size = max_threads_dim;
  size_t grid_size = (total_threads + block_size - 1) / block_size;
  int rc;
  if (dilation_h == 1 && dilation_w == 1) {
    rc = col2im_kernel_call(
        1, &block_size, &grid_size, 0,
        total_threads, data_col, height, width, channels, patch_h, patch_w,
        pad_h, pad_w, stride_h, stride_w,
        out_h, out_w, data_im, data_im_offset);
    if (rc != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: col2im_kernel: %s.",
                   GpuKernel_error(&k_col2im_kernel, rc));
    }
  } else {
    rc = dilated_col2im_kernel_call(
        1, &block_size, &grid_size, 0,
        total_threads, data_col, height, width, channels, patch_h, patch_w,
        dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
        out_h, out_w, data_im, data_im_offset);
    if (rc != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError,
                   "gpuarray error: dilated_col2im_kernel: %s.",
                   GpuKernel_error(&k_dilated_col2im_kernel, rc));
    }
  }
  return rc;
}
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
//
// corrMM: 2D correlation implemented as im2col/col2im plus GEMM.
//
// `direction` selects which of the three tensors is computed from the
// other two; that tensor is returned (aliased, refcount unchanged):
//   direction == 0: forward pass -- compute top from bottom and weight
//   direction == 1: backprop wrt. weights -- accumulate weight over the batch
//   direction == 2: backprop wrt. inputs -- compute bottom from weight and top
// dH/dW: subsampling (stride); dilH/dilW: filter dilation; padH/padW: padding.
//
// All three arrays must be 4D and C-contiguous.  On error, returns NULL
// with a Python exception set.
// NOTE(review): always calls gpublas_sgemm while the scalars are typed
// DTYPE_INPUT_0 -- presumably this path only supports float32; confirm.
// NOTE(review): a `direction` value other than 0/1/2 would return an
// uninitialized pointer; callers must only pass 0, 1 or 2.
PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                         PyGpuArrayObject *const weight,
                         PyGpuArrayObject *const top,
                         const size_t direction,
                         const size_t dH = 1,
                         const size_t dW = 1,
                         const size_t dilH = 1,
                         const size_t dilW = 1,
                         const size_t padH = 0,
                         const size_t padW = 0)
{
    // Validate rank and contiguity of all three operands
    if (PyGpuArray_NDIM(bottom) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires bottom of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires bottom to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(bottom)[0],
                PyGpuArray_STRIDES(bottom)[1],
                PyGpuArray_STRIDES(bottom)[2],
                PyGpuArray_STRIDES(bottom)[3]);
        return NULL;
    }
    if (PyGpuArray_NDIM(weight) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires weight to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(weight)[0],
                PyGpuArray_STRIDES(weight)[1],
                PyGpuArray_STRIDES(weight)[2],
                PyGpuArray_STRIDES(weight)[3]);
        return NULL;
    }
    if (PyGpuArray_NDIM(top) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires top of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires top to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(top)[0],
                PyGpuArray_STRIDES(top)[1],
                PyGpuArray_STRIDES(top)[2],
                PyGpuArray_STRIDES(top)[3]);
        return NULL;
    }
    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
    const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
    const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
    const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
    const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
    // weights: (nFilters, nChannels, rows, columns)
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
    const size_t kH = PyGpuArray_DIMS(weight)[2];
    const size_t kW = PyGpuArray_DIMS(weight)[3];
    if (nChannels != PyGpuArray_DIMS(weight)[1]) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorrMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // implicit dilated filter
    const size_t dil_kH = (kH - 1) * dilH + 1;
    const size_t dil_kW = (kW - 1) * dilW + 1;
    // top: (batchSize, nFilters, topHeight, topWidth)
    const size_t topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
    const size_t topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
    if (batchSize != PyGpuArray_DIMS(top)[0] ||
            nFilters != PyGpuArray_DIMS(top)[1] ||
            topHeight != PyGpuArray_DIMS(top)[2] ||
            topWidth != PyGpuArray_DIMS(top)[3]) {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM shape inconsistency:\n"
                "  bottom shape: %ld %ld %ld %ld\n"
                "  weight shape: %ld %ld %ld %ld\n"
                "  top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
                batchSize, nChannels, bottomHeight, bottomWidth,
                nFilters, nChannels, kH, kW,
                PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                batchSize, nFilters, topHeight, topWidth);
        return NULL;
    }
    int err = gpublas_setup(bottom->context->ctx);
    if (err != GA_NO_ERROR) {
        PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
        return NULL;
    }
    // Get the max threads per blocks
    size_t max_threads_dim;
    err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
    if (err != GA_NO_ERROR){
        PyErr_Format(PyExc_RuntimeError,
                     "Could not fetch max_threads_dim.");
        return NULL;
    }
    // Create temporary columns: one im2col matrix, reused for every image
    // in the batch
    size_t col_dim[2];
    col_dim[0] = nChannels * kW * kH;
    col_dim[1] = topHeight * topWidth;
    PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                           bottom->ga.typecode,
                                                           GA_C_ORDER,
                                                           bottom->context,
                                                           Py_None);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorrMM failed to allocate working memory of %ld x %ld\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }
    // Define some useful variables
    // per-image element offsets into the batched bottom/top arrays
    const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
    const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
    // GEMM dims: weight is (M_ x K_), col is (K_ x N_), top[n] is (M_ x N_)
    const size_t K_ = col_dim[0];
    const size_t N_ = col_dim[1];
    const size_t M_ = nFilters;
    const DTYPE_INPUT_0 one = 1.0f;
    const DTYPE_INPUT_0 zero = 0.0f;
    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
        output = top;
        // valid correlation: im2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
            err = im2col(max_threads_dim,
                         bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH, padW, dH, dW, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm: top[n] = weight . col
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                N_, M_, K_, one,
                                col->ga.data, 0, N_,
                                weight->ga.data, 0, K_,
                                zero,
                                top->ga.data, n * top_stride, N_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        // valid convolution: im2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
            err = im2col(max_threads_dim,
                         bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH, padW, dH, dW, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                                K_, M_, N_, one,
                                col->ga.data, 0, N_,
                                top->ga.data, n * top_stride, N_,
                                (n == 0) ? zero : one,
                                weight->ga.data, 0, K_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        // full convolution: gemm, then col2im
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // gemm into columns: col = weight^T . top[n]
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                                N_, K_, M_, one,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
                                zero,
                                col->ga.data, 0, N_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM encountered an error running sgemm.\n");
                Py_DECREF(col);
                return NULL;
            }
            // col2im back to the data
            err = col2im(max_threads_dim,
                         col->ga.data, nChannels, bottomHeight, bottomWidth,
                         kH, kW, dilH, dilW, padH, padW,
                         dH, dW, bottom->ga.data, n * bottom_stride);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    // Free temporary columns
    Py_DECREF(col);
    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseGpuCorrMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
......@@ -1877,8 +1877,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
return result
@register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(op, (AbstractConv2d,
AbstractConv2d_gradWeights,
......@@ -1922,8 +1920,6 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
return [rval]
@register_opt2([AbstractConv3d, AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(op, (AbstractConv3d,
AbstractConv3d_gradWeights,
......@@ -1967,7 +1963,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
return [rval]
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d, AbstractConv3d])
def local_abstractconv_cudnn(node):
ctx = infer_context_name(*node.inputs)
......@@ -1979,7 +1974,6 @@ def local_abstractconv_cudnn(node):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(node):
ctx = infer_context_name(*node.inputs)
......@@ -1991,7 +1985,6 @@ def local_abstractconv_gw_cudnn(node):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
def local_abstractconv_gi_cudnn(node):
ctx = infer_context_name(*node.inputs)
......
......@@ -22,7 +22,8 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs,
AbstractConv3d,
......@@ -43,7 +44,9 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace,
gpugemmbatch_no_inplace,
gpugemv_no_inplace, gpugemv_inplace)
gpugemv_no_inplace, gpugemv_inplace,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter,
gpu_sparse_block_outer,
gpu_sparse_block_outer_inplace,
......@@ -1296,6 +1299,287 @@ def local_inplace_sparseblockouter(node):
return [GpuSparseBlockOuter(inplace=True)(*node.inputs)]
# Move to Gpu optimization
@local_optimizer([GpuFromHost,
                  AbstractConv2d,
                  AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs,
                  AbstractConv3d,
                  AbstractConv3d_gradWeights,
                  AbstractConv3d_gradInputs])
def local_conv_gpu_conv(node):
    """
    Move abstract convolutions across GPU transfers:

    gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
    AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)

    Returns a one-element replacement list, or None when nothing applies.
    """
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op,
                                           BaseAbstractConv):
            conv = host_input.owner.op
            inps = list(host_input.owner.inputs)
            ctx = infer_context_name(*inps)
            # move image and kernel to the GPU; any extra shape inputs stay
            inps[0] = as_gpuarray_variable(inps[0], context_name=ctx)
            inps[1] = as_gpuarray_variable(inps[1], context_name=ctx)
            out = conv(*inps)
            # out is on the GPU because both inputs are.
            out = theano.tensor.patternbroadcast(out,
                                                 node.outputs[0].broadcastable)
            return [out]

    if isinstance(node.op, BaseAbstractConv):
        # conv(host_from_gpu) -> host_from_gpu(gpu_conv)
        inp1 = node.inputs[0]
        inp2 = node.inputs[1]
        if ((isinstance(inp1.type, GpuArrayType) and
             isinstance(inp2.type, GpuArrayType))):
            # Both inputs are already directly on the GPU, nothing to do
            return

        inp1_on_gpu = (isinstance(inp1.type, GpuArrayType) or
                       (inp1.owner and isinstance(inp1.owner.op, HostFromGpu)))
        inp2_on_gpu = (isinstance(inp2.type, GpuArrayType) or
                       (inp2.owner and isinstance(inp2.owner.op, HostFromGpu)))

        if inp1_on_gpu or inp2_on_gpu:
            conv = node.op
            inps = list(node.inputs)
            ctx = infer_context_name(*inps)
            inps[0] = as_gpuarray_variable(inps[0], context_name=ctx)
            inps[1] = as_gpuarray_variable(inps[1], context_name=ctx)
            out = conv(*inps)
            # out is on the GPU because both inputs are.
            out = theano.tensor.patternbroadcast(
                out,
                node.outputs[0].broadcastable)
            # If the original output was on CPU, we have to transfer it
            if isinstance(node.outputs[0].type, tensor.TensorType):
                return [tensor.as_tensor_variable(out)]
            else:
                return [out]


register_opt()(local_conv_gpu_conv)
# CorrMM opt
@local_optimizer([AbstractConv2d])
def local_abstractconv_gemm(node):
    """Lower an AbstractConv2d whose inputs are on the GPU to a
    GpuCorrMM-based op.

    Full convolution with unit subsampling is rewritten as
    GpuCorrMM_gradInputs on a transposed kernel; otherwise GpuCorrMM is
    used, optionally replaced by GpuCorrMM_gradWeights when a shape-based
    cost heuristic predicts it is faster.

    Fix: the heuristic reads both ``imshp[0]`` and ``imshp[1]`` but
    previously only checked ``imshp[:1]`` for None, which could multiply
    by None when the number of input channels is unknown; check
    ``imshp[:2]`` instead.
    """
    if not isinstance(node.op, AbstractConv2d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, GpuArrayType) or
            not isinstance(kern.type, GpuArrayType)):
        return None
    ctx = infer_context_name(img, kern)

    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    if ((border_mode == 'full') and (subsample == (1, 1))):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
        kern = kern.dimshuffle(1, 0, 2, 3)
        # call GpuCorrMM_gradInputs
        rval = GpuCorrMM_gradInputs('valid',
                                    subsample,
                                    filter_dilation)(
            gpu_contiguous(kern), gpu_contiguous(img))
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # By default use GpuCorrMM
        rval = GpuCorrMM(border_mode,
                         subsample,
                         filter_dilation)(gpu_contiguous(img),
                                          gpu_contiguous(kern))

        # call GpuCorrMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth
        # is larger than inputChannels * outputHeight * outputWidth.
        # GpuConv does not always store information on the batchsize and
        # channels, though, so we only use what information we have.)
        if ((subsample == (1, 1)) and (filter_dilation == (1, 1)) and
                (node.op.imshp is not None) and
                (None not in node.op.imshp[-2:]) and
                (node.op.kshp is not None) and
                (None not in node.op.kshp) and
                border_mode != "half"):
            # we know the kernel and output size
            prod1 = node.op.kshp[0] * node.op.kshp[1]
            prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
                     (node.op.imshp[-1] - node.op.kshp[1] + 1))
            if (None not in node.op.imshp[:2]):
                # we also know batchsize and input channels
                prod1 *= node.op.imshp[0]
                prod2 *= node.op.imshp[1]
            # compare to decide
            if prod1 > prod2:
                rval = GpuCorrMM_gradWeights(border_mode,
                                             subsample,
                                             filter_dilation)(
                    gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
                    gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
                # (we need to wrap the result in as_gpuarray_variable,
                # because we are not allowed to replace a GpuArray with
                # a DimShuffle instance in a graph optimization)
                rval = as_gpuarray_variable(
                    rval.dimshuffle(1, 0, 2, 3),
                    context_name=ctx)
    return [rval]
@local_optimizer([AbstractConv3d])
def local_abstractconv3d_gemm(node):
    """Lower an AbstractConv3d whose inputs are on the GPU to a
    GpuCorr3dMM-based op.

    Mirrors ``local_abstractconv_gemm`` for the 3D case: full convolution
    with unit subsampling becomes GpuCorr3dMM_gradInputs; otherwise
    GpuCorr3dMM, optionally replaced by GpuCorr3dMM_gradWeights when the
    shape heuristic predicts it is faster.

    Fix: the heuristic reads both ``imshp[0]`` and ``imshp[1]`` but
    previously only checked ``imshp[:1]`` for None; check ``imshp[:2]``.
    """
    if not isinstance(node.op, AbstractConv3d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, GpuArrayType) or
            not isinstance(kern.type, GpuArrayType)):
        return None
    ctx = infer_context_name(img, kern)

    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    if ((border_mode == 'full') and (subsample == (1, 1, 1))):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
        kern = kern.dimshuffle(1, 0, 2, 3, 4)
        # call GpuCorr3dMM_gradInputs
        rval = GpuCorr3dMM_gradInputs('valid',
                                      subsample,
                                      filter_dilation)(
            gpu_contiguous(kern), gpu_contiguous(img))
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1, ::-1]
        # By default use GpuCorr3dMM
        rval = GpuCorr3dMM(border_mode,
                           subsample,
                           filter_dilation)(gpu_contiguous(img),
                                            gpu_contiguous(kern))

        # call GpuCorr3dMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth * kernelDepth
        # is larger than inputChannels * outputHeight * outputWidth * outputDepth.
        # GpuConv does not always store information on the batchsize and
        # channels, though, so we only use what information we have.)
        if ((subsample == (1, 1, 1)) and (filter_dilation == (1, 1, 1)) and
                (node.op.imshp is not None) and
                (None not in node.op.imshp[-3:]) and
                (node.op.kshp is not None) and
                (None not in node.op.kshp) and
                border_mode != "half"):
            # we know the kernel and output size
            prod1 = node.op.kshp[0] * node.op.kshp[1] * node.op.kshp[2]
            prod2 = ((node.op.imshp[-3] - node.op.kshp[0] + 1) *
                     (node.op.imshp[-2] - node.op.kshp[1] + 1) *
                     (node.op.imshp[-1] - node.op.kshp[2] + 1))
            if (None not in node.op.imshp[:2]):
                # we also know batchsize and input channels
                prod1 *= node.op.imshp[0]
                prod2 *= node.op.imshp[1]
            # compare to decide
            if prod1 > prod2:
                rval = GpuCorr3dMM_gradWeights(border_mode,
                                               subsample,
                                               filter_dilation)(
                    gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)),
                    gpu_contiguous(kern.dimshuffle(1, 0, 2, 3, 4)))
                # (we need to wrap the result in as_gpuarray_variable,
                # because we are not allowed to replace a GpuArray with
                # a DimShuffle instance in a graph optimization)
                rval = as_gpuarray_variable(
                    rval.dimshuffle(1, 0, 2, 3, 4),
                    context_name=ctx)
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweights_gemm(node):
    """Lower AbstractConv2d_gradWeights on the GPU to
    GpuCorrMM_gradWeights (flipping the result when filter_flip is set).
    """
    if not isinstance(node.op, AbstractConv2d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not isinstance(img.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    ctx = infer_context_name(img, topgrad)

    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
                                 subsample=node.op.subsample,
                                 filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    if node.op.filter_flip:
        rval = rval[:, :, ::-1, ::-1]
    # restore broadcast pattern and make sure the result is a GPU variable
    # (the flip above is a Subtensor, not a GPU op)
    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    rval = as_gpuarray_variable(rval, context_name=ctx)
    return [rval]
@local_optimizer([AbstractConv3d_gradWeights])
def local_abstractconv3d_gradweights_gemm(node):
    """Lower AbstractConv3d_gradWeights on the GPU to
    GpuCorr3dMM_gradWeights (flipping the result when filter_flip is set).
    """
    if not isinstance(node.op, AbstractConv3d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    if not isinstance(img.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    ctx = infer_context_name(img, topgrad)

    rval = GpuCorr3dMM_gradWeights(border_mode=node.op.border_mode,
                                   subsample=node.op.subsample,
                                   filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    if node.op.filter_flip:
        rval = rval[:, :, ::-1, ::-1, ::-1]
    # restore broadcast pattern and make sure the result is a GPU variable
    # (the flip above is a Subtensor, not a GPU op)
    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    rval = as_gpuarray_variable(rval, context_name=ctx)
    return [rval]
@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node):
    """Lower AbstractConv2d_gradInputs on the GPU to
    GpuCorrMM_gradInputs (flipping the kernel first when filter_flip is
    set).
    """
    if not isinstance(node.op, AbstractConv2d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not isinstance(kern.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    if node.op.filter_flip:
        kern = kern[:, :, ::-1, ::-1]

    rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
                                subsample=node.op.subsample,
                                filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]
@local_optimizer([AbstractConv3d_gradInputs])
def local_abstractconv3d_gradinputs_gemm(node):
    """Lower AbstractConv3d_gradInputs on the GPU to
    GpuCorr3dMM_gradInputs (flipping the kernel first when filter_flip is
    set).
    """
    if not isinstance(node.op, AbstractConv3d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    if not isinstance(kern.type, GpuArrayType) or \
            not isinstance(topgrad.type, GpuArrayType):
        return None
    if node.op.filter_flip:
        kern = kern[:, :, ::-1, ::-1, ::-1]

    rval = GpuCorr3dMM_gradInputs(border_mode=node.op.border_mode,
                                  subsample=node.op.subsample,
                                  filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@op_lifter([AbstractConv2d,
......@@ -1493,3 +1777,53 @@ optdb.register('gpua_scanOp_make_inplace',
'gpuarray',
'inplace',
'scan')
# Register GPU convolution implementations.
# They are tried in a specific order (lower position number first) so we
# can control which ones take precedence over others.
abstractconv_groupopt = theano.gof.optdb.LocalGroupDB()
abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
register_opt()(abstractconv_groupopt)

# cuDNN is first (position 20), but only applies if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from .dnn import (local_abstractconv_cudnn, local_abstractconv_gw_cudnn,
                  local_abstractconv_gi_cudnn)  # noqa: 402
abstractconv_groupopt.register('local_abstractconv_dnn',
                               local_abstractconv_cudnn, 20,
                               'conv_dnn',
                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
abstractconv_groupopt.register('local_abstractconv_gw_dnn',
                               local_abstractconv_gw_cudnn, 20,
                               'conv_dnn',
                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
abstractconv_groupopt.register('local_abstractconv_gi_dnn',
                               local_abstractconv_gi_cudnn, 20,
                               'conv_dnn',
                               'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
# The GEMM-based convolution comes last (position 30) to catch all
# remaining cases.
# It can be disabled by excluding 'conv_gemm'.
abstractconv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv3d_gemm', local_abstractconv3d_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv_gradweights_gemm',
                               local_abstractconv_gradweights_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv3d_gradweights_gemm',
                               local_abstractconv3d_gradweights_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv_gradinputs',
                               local_abstractconv_gradinputs_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
                               local_abstractconv3d_gradinputs_gemm, 30,
                               'conv_gemm',
                               'gpuarray', 'fast_compile', 'fast_run')
......@@ -7,6 +7,9 @@ import numpy
from theano.tensor.nnet.tests import test_abstract_conv
from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..dnn import dnn_available, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI
from ..blas import (
GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs,
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from .config import mode_with_gpu, test_ctx_name
from pygpu import gpuarray
......@@ -80,6 +83,72 @@ class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
filter_flip=flip, target_op=GpuDnnConvGradI)
class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
    """Run the generic AbstractConv2d test cases against the
    GpuCorrMM ops (cuDNN excluded from the compilation mode)."""

    @classmethod
    def setup_class(cls):
        test_abstract_conv.BaseTestConv2d.setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # exclude cudnn so the conv_gemm optimizations are exercised
        cls.mode = mode_with_gpu.excluding('cudnn')

    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        # i/f: input/filter shapes, s: subsample, b: border_mode,
        # fd: filter dilation
        mode = self.mode
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(inputs_shape=i, filters_shape=f,
                     subsample=s, verify_grad=True, mode=mode,
                     provide_shape=provide_shape, border_mode=b,
                     filter_flip=flip, target_op=(GpuCorrMM,
                                                  GpuCorrMM_gradWeights,
                                                  GpuCorrMM_gradInputs),
                     filter_dilation=fd)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s,
                            verify_grad=True, mode=mode,
                            provide_shape=provide_shape, border_mode=b,
                            filter_flip=flip,
                            target_op=GpuCorrMM_gradWeights,
                            filter_dilation=fd)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode,
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip,
                           target_op=GpuCorrMM_gradInputs,
                           filter_dilation=fd)
class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
    """Run the generic AbstractConv3d test cases against the
    GpuCorr3dMM ops (cuDNN excluded from the compilation mode)."""

    @classmethod
    def setup_class(cls):
        test_abstract_conv.BaseTestConv3d.setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # exclude cudnn so the conv_gemm optimizations are exercised
        cls.mode = mode_with_gpu.excluding('cudnn')

    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
        # i/f: input/filter shapes, s: subsample, b: border_mode,
        # fd: filter dilation
        mode = self.mode
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(inputs_shape=i, filters_shape=f,
                     subsample=s, verify_grad=True, mode=mode,
                     provide_shape=provide_shape, border_mode=b,
                     filter_flip=flip, target_op=(GpuCorr3dMM,
                                                  GpuCorr3dMM_gradWeights,
                                                  GpuCorr3dMM_gradInputs),
                     filter_dilation=fd)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s,
                            verify_grad=True, mode=mode,
                            provide_shape=provide_shape, border_mode=b,
                            filter_flip=flip,
                            target_op=GpuCorr3dMM_gradWeights,
                            filter_dilation=fd)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode,
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip,
                           target_op=GpuCorr3dMM_gradInputs,
                           filter_dilation=fd)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
def setUp(self):
self.input = gpu_ftensor4()
......
from __future__ import absolute_import, print_function, division
import unittest
import numpy
import theano
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs
from ..type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu
class TestCorrMM(unittest.TestCase):
    """Check the gpuarray GpuCorrMM ops against the CPU CorrMM reference.

    Forward convolution, gradient w.r.t. weights and gradient w.r.t.
    inputs are each computed on random float32 data with both the GPU
    and the CPU op and compared numerically.
    """

    def run_conv_valid(self, inputs_shape, filters_shape,
                       border_mode='valid',
                       filter_dilation=(1, 1),
                       subsample=(1, 1),
                       verify_grad=False):
        """Compare GpuCorrMM with CorrMM on one forward configuration.

        Shapes are given channel-last, (batch, row, col, channel), and
        converted here to the channel-first layout the ops expect.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        conv_ref = CorrMM(border_mode=border_mode,
                          filter_dilation=filter_dilation,
                          subsample=subsample)(inputs, filters)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        conv = GpuCorrMM(border_mode=border_mode,
                         filter_dilation=filter_dilation,
                         subsample=subsample)(inputs, filters)
        f = theano.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(GpuCorrMM(border_mode=border_mode,
                                      filter_dilation=filter_dilation,
                                      subsample=subsample),
                            [inputs_val, filters_val])

    def test_valid(self):
        # The original ran the (2, 2) and (3, 3) cases twice each;
        # run each distinct subsampling exactly once.
        for subsample in [(1, 1), (2, 2), (3, 3), (3, 2), (1, 2)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
                                filters_shape=(10, 6, 12, 1),
                                subsample=subsample)

    def test_border_mode(self):
        # Cover the named modes plus explicit integer padding pairs.
        for border_mode in ['valid', 'half', 'full', (0, 0), (1, 2), (3, 2)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
                                filters_shape=(10, 6, 12, 1),
                                border_mode=border_mode)

    def test_filter_dilation(self):
        inputs_shape = [16, 20, 12, 1]
        filters_shape = [10, 6, 5, 1]
        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ['valid', 'half', 'full']:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode)

    def test_verify_gradients(self):
        # use a small example to check the gradients
        inputs_shape = [2, 7, 9, 1]
        filters_shape = [1, 3, 3, 1]
        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ['valid', 'half', 'full', (2, 1)]:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode,
                                    verify_grad=True)

    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1)):
        """Compare GpuCorrMM_gradWeights with the CPU reference op."""
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        # Spatial filter shape; passed explicitly when subsampling makes
        # the weight shape ambiguous.
        shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))
        if subsample == (1, 1):
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
        else:
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradweight(self):
        self.run_gradweight(inputs_shape=(16, 10, 12, 1),
                            filters_shape=(10, 6, 12, 1),
                            dCdH_shape=(16, 5, 1, 10),
                            subsample=(1, 1))
        self.run_gradweight(inputs_shape=(16, 20, 10, 1),
                            filters_shape=(10, 6, 4, 1),
                            dCdH_shape=(16, 8, 4, 10),
                            subsample=(2, 2))
        self.run_gradweight(inputs_shape=(16, 20, 10, 1),
                            filters_shape=(10, 6, 3, 1),
                            dCdH_shape=(16, 5, 3, 10),
                            subsample=(3, 3))
        self.run_gradweight(inputs_shape=(16, 20, 12, 1),
                            filters_shape=(10, 6, 12, 1),
                            dCdH_shape=(16, 8, 1, 10),
                            subsample=(2, 1))

    def run_gradinput(self, inputs_shape, filters_shape,
                      subsample=(1, 1)):
        """Compare GpuCorrMM_gradInputs with the CPU reference op.

        ``inputs_shape`` here is the shape of the top gradient (the op's
        ``topgrad`` argument), not of the original forward inputs.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # Reconstruct the forward input's spatial size; needed when
        # subsampling makes it ambiguous.
        bottom_height = (inputs_shape[2] - 1) * subsample[0] + filters_shape[2]
        bottom_width = (inputs_shape[3] - 1) * subsample[1] + filters_shape[3]
        bottom_shape = gpuarray_shared_constructor(
            numpy.array([bottom_height, bottom_width]))
        if subsample == (1, 1):
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
        else:
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradinput(self):
        for subsample in [(1, 1), (2, 2), (3, 3), (3, 1)]:
            self.run_gradinput(inputs_shape=(16, 15, 12, 10),
                               filters_shape=(10, 6, 12, 1),
                               subsample=subsample)
from __future__ import absolute_import, print_function, division
import unittest
import numpy
import theano
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr3d import Corr3dMM, Corr3dMM_gradWeights, Corr3dMM_gradInputs
from ..type import gpuarray_shared_constructor
from ..blas import GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu
class TestCorr3dMM(unittest.TestCase):
    """Check the gpuarray GpuCorr3dMM ops against the CPU Corr3dMM reference.

    Forward convolution, gradient w.r.t. weights and gradient w.r.t.
    inputs are each computed on random float32 data with both the GPU
    and the CPU op and compared numerically.
    """

    def run_conv_valid(self, inputs_shape, filters_shape,
                       border_mode='valid',
                       filter_dilation=(1, 1, 1),
                       subsample=(1, 1, 1),
                       verify_grad=False):
        """Compare GpuCorr3dMM with Corr3dMM on one forward configuration.

        Shapes are given channel-last, (batch, d0, d1, d2, channel), and
        converted here to the channel-first layout the ops expect.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        conv_ref = Corr3dMM(border_mode=border_mode,
                            filter_dilation=filter_dilation,
                            subsample=subsample)(inputs, filters)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        conv = GpuCorr3dMM(border_mode=border_mode,
                           filter_dilation=filter_dilation,
                           subsample=subsample)(inputs, filters)
        f = theano.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(GpuCorr3dMM(border_mode=border_mode,
                                        filter_dilation=filter_dilation,
                                        subsample=subsample),
                            [inputs_val, filters_val])

    def test_valid(self):
        self.run_conv_valid(inputs_shape=(16, 20, 12, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1))
        # The original ran the (2, 2, 2) and (3, 3, 3) cases twice each;
        # run each distinct subsampling exactly once.
        for subsample in [(2, 2, 2), (3, 3, 3), (3, 2, 1), (1, 2, 3)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
                                filters_shape=(10, 6, 12, 4, 1),
                                subsample=subsample)

    def test_border_mode(self):
        # Cover the named modes plus explicit integer padding triples.
        for border_mode in ['valid', 'half', 'full',
                            (0, 0, 0), (1, 2, 3), (3, 2, 1)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
                                filters_shape=(10, 6, 12, 4, 1),
                                border_mode=border_mode)

    def test_filter_dilation(self):
        inputs_shape = [16, 20, 12, 15, 1]
        filters_shape = [10, 6, 5, 4, 1]
        for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
            for border_mode in ['valid', 'half', 'full']:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode)

    def test_verify_gradients(self):
        # use a small example to check the gradients
        inputs_shape = [2, 7, 9, 6, 1]
        filters_shape = [1, 3, 3, 2, 1]
        for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
            for border_mode in ['valid', 'half', 'full', (2, 1, 3)]:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode,
                                    verify_grad=True)

    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1, 1)):
        """Compare GpuCorr3dMM_gradWeights with the CPU reference op."""
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        # Spatial filter shape; passed explicitly when subsampling makes
        # the weight shape ambiguous.
        shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))
        if subsample == (1, 1, 1):
            conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
        else:
            conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradweight(self):
        self.run_gradweight(inputs_shape=(16, 10, 12, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 5, 1, 13, 10),
                            subsample=(1, 1, 1))
        self.run_gradweight(inputs_shape=(16, 20, 10, 16, 1),
                            filters_shape=(10, 6, 4, 4, 1),
                            dCdH_shape=(16, 8, 4, 7, 10),
                            subsample=(2, 2, 2))
        self.run_gradweight(inputs_shape=(16, 20, 10, 16, 1),
                            filters_shape=(10, 6, 3, 4, 1),
                            dCdH_shape=(16, 5, 3, 5, 10),
                            subsample=(3, 3, 3))
        self.run_gradweight(inputs_shape=(16, 20, 12, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 8, 1, 5, 10),
                            subsample=(2, 1, 3))

    def run_gradinput(self, inputs_shape, filters_shape,
                      subsample=(1, 1, 1)):
        """Compare GpuCorr3dMM_gradInputs with the CPU reference op.

        ``inputs_shape`` here is the shape of the top gradient (the op's
        ``topgrad`` argument), not of the original forward inputs.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)
        # Reconstruct the forward input's spatial size; needed when
        # subsampling makes it ambiguous.
        bottom_height = (inputs_shape[2] - 1) * subsample[0] + filters_shape[2]
        bottom_width = (inputs_shape[3] - 1) * subsample[1] + filters_shape[3]
        bottom_depth = (inputs_shape[4] - 1) * subsample[2] + filters_shape[4]
        bottom_shape = gpuarray_shared_constructor(
            numpy.array([bottom_height, bottom_width, bottom_depth]))
        if subsample == (1, 1, 1):
            conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
        else:
            conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradinput(self):
        for subsample in [(1, 1, 1), (2, 2, 2), (3, 3, 3), (3, 1, 2)]:
            self.run_gradinput(inputs_shape=(16, 15, 12, 12, 10),
                               filters_shape=(10, 6, 12, 4, 1),
                               subsample=subsample)
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论