Commit 4acf0a40, authored by Nicolas Ballas

Add 3d correlation based on matrix multiplication

Parent: c022347b
......@@ -960,6 +960,503 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
return [[1], [1], [0], [0]] # no connection to height, width
class BaseGpuCorr3dMM(GpuOp):
    """Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
    `GpuCorr3dMM_gradInputs`. Cannot be used directly.

    Holds the parameters shared by the forward and backprop 3d correlation
    Ops (border mode, subsampling, padding) and generates the C code that
    dispatches to the ``corr3dMM`` CUDA routine defined in corr3d_gemm.cu.
    """
    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 pad=(0, 0, 0)):
        if border_mode != "valid":
            raise ValueError("border_mode must be 'valid'")
        self.border_mode = border_mode
        if len(subsample) != 3:
            raise ValueError("subsample must have three elements")
        self.subsample = subsample
        if (pad not in ("half", "full")) and (len(pad) != 3):
            raise ValueError("pad must be 'half', 'full', or have three elements")
        self.pad = pad

    def __eq__(self, other):
        return type(self) == type(other) \
            and self.border_mode == other.border_mode \
            and self.subsample == other.subsample \
            and self.pad == other.pad

    def __hash__(self):
        # don't use hash(self.version) as hash(-1)==-2 and
        # hash(-2)==-2 in python!
        return hash(type(self)) \
            ^ hash(self.border_mode) \
            ^ hash(self.subsample) \
            ^ hash(self.pad)

    def __str__(self):
        return '%s{%s, %s, pad=%r}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            self.pad)

    def flops(self, inp, outp):
        """ Useful with the hack in profilemode to print the MFlops"""
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp
        outputs, = outp
        assert inputs[1] == filters[1]
        # nb mul and add by output voxel: the filter spans all three
        # spatial dimensions (rows x columns x depth), so filters[4]
        # must be included (it was missing in the 2d-derived version)
        flops = filters[2] * filters[3] * filters[4] * 2
        # nb flops by output image: all three spatial output dimensions
        flops *= outputs[2] * outputs[3] * outputs[4]
        # nb patch multiplied
        flops *= inputs[1] * filters[0] * inputs[0]
        return flops

    def c_headers(self):
        return ['cuda_ndarray.cuh', '<stdio.h>']

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (0, 2)

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
        # these files
        files = ['corr3d_gemm.cu']
        codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
                 for f in files]
        return reduce(str.__add__, codes)

    def c_code_helper(self, bottom, weights,
                      top, direction,
                      sub,
                      height=None, width=None, depth=None):
        """
        This generates the C code for GpuCorr3dMM (direction="forward"),
        GpuCorr3dMM_gradWeights (direction="backprop weights"), and
        GpuCorr3dMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        :param bottom: Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        :param weights: Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        :param top: Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        :param direction: "forward" to correlate bottom with weights and store
            results in top,
            "backprop weights" to do a valid convolution of bottom with top
            (swapping the first two dimensions) and store results in weights,
            and "backprop inputs" to do a full convolution of top with weights
            (swapping the first two dimensions) and store results in bottom.
        :param sub: Dictionary of substitutions useable to help generating the
            C code.
        :param height: If self.subsample[0] != 1, a variable giving the height
            of the filters for direction="backprop weights" or the height of
            the input images for direction="backprop inputs".
            If self.pad == 'half', a variable giving the height of the filters
            for direction="backprop weights".
            Ignored otherwise.
        :param width: If self.subsample[1] != 1, a variable giving the width
            of the filters for direction="backprop weights" or the width of
            the input images for direction="backprop inputs".
            If self.pad == 'half', a variable giving the width of the filters
            for direction="backprop weights".
            Ignored otherwise.
        :param depth: If self.subsample[2] != 1, a variable giving the depth
            of the filters for direction="backprop weights" or the depth of
            the input images for direction="backprop inputs".
            If self.pad == 'half', a variable giving the depth of the filters
            for direction="backprop weights".
            Ignored otherwise.
        """
        if self.border_mode != "valid":
            raise ValueError("mode must be 'valid'")
        dH, dW, dD = self.subsample
        if self.pad == "half":
            padH = padW = padD = -1
        elif self.pad == "full":
            padH = padW = padD = -2
        else:
            padH, padW, padD = self.pad
        if direction == "forward":
            direction = 0
            out = top
        elif direction == "backprop weights":
            direction = 1
            out = weights
        elif direction == "backprop inputs":
            direction = 2
            out = bottom
        else:
            raise ValueError("direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")
        # When subsampling, we cannot unambiguously infer the height, width
        # and depth of bottom and weights from top, so we require them to be
        # given. Similarly, when pad="half", we cannot infer the weight size.
        if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
            if not height:
                raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
            height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
        else:
            height = 'NULL'
        if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
            if not width:
                raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
            width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
        else:
            width = 'NULL'
        if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
            if not depth:
                # note: this is sampling along the depth axis, not horizontal
                raise ValueError("depth must be given for backprop with depth sampling or pad='half'")
            depth = '(*(npy_int*)(PyArray_DATA(%s)))' % depth
        else:
            depth = 'NULL'
        sub = sub.copy()
        sub.update(locals())
        return """
    // Mandatory args
    int direction = %(direction)s;  // forward, bprop weights, bprop inputs
    // Optional args
    int dH = %(dH)s;
    int dW = %(dW)s;
    int dD = %(dD)s;
    int padH = %(padH)s;
    int padW = %(padW)s;
    int padD = %(padD)s;

    CudaNdarray * bottom = %(bottom)s;
    CudaNdarray * weights = %(weights)s;
    CudaNdarray * top = %(top)s;
    CudaNdarray * out2 = NULL;

    // Obtain or infer kernel width, height and depth
    // (we need to know it early to be able to handle auto-padding)
    int kH, kW, kD;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
        kH = CudaNdarray_HOST_DIMS(weights)[2];
        kW = CudaNdarray_HOST_DIMS(weights)[3];
        kD = CudaNdarray_HOST_DIMS(weights)[4];
    }
    else {
        if ((dH != 1) || (padH == -1)) {
            // vertical subsampling or half padding, kernel height is specified
            kH = %(height)s;
        }
        else if (padH == -2) {
            // vertical full padding, we can infer the kernel height
            kH = 2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
        }
        else {
            // explicit padding, we can infer the kernel height
            kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
        }
        if ((dW != 1) || (padW == -1)) {
            kW = %(width)s;
        }
        else if (padW == -2) {
            kW = 2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
        }
        else {
            kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
        }
        if ((dD != 1) || (padD == -1)) {
            kD = %(depth)s;
        }
        else if (padD == -2) {
            kD = 2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD;
        }
        else {
            kD = CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD;
        }
    }

    // Auto-padding if requested
    if (padH == -1) {  // vertical half padding
        padH = kH / 2;
    }
    else if (padH == -2) {  // vertical full padding
        padH = kH - 1;
    }
    else if (padH < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padH must be >= -2");
        %(fail)s
    }
    if (padW == -1) {  // horizontal half padding
        padW = kW / 2;
    }
    else if (padW == -2) {  // horizontal full padding
        padW = kW - 1;
    }
    else if (padW < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padW must be >= -2");
        %(fail)s
    }
    if (padD == -1) {  // depth half padding
        padD = kD / 2;
    }
    else if (padD == -2) {  // depth full padding
        // bug fix: this branch used to clobber padW with kW - 1 instead
        // of setting padD, leaving padD == -2 for the kernel call
        padD = kD - 1;
    }
    else if (padD < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padD must be >= -2");
        %(fail)s
    }

    // Infer output shape
    int out_dim[5];
    switch(direction) {
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width, depth)
        // height, width and depth: top = (bottom + 2*pad - weight) / sample + 1
        out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
        out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
        out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
        out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
        out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - CudaNdarray_HOST_DIMS(weights)[4]) / dD + 1;
        break;
    case 1:  // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width, depth)
        // height, width and depth: weights = bottom + 2*pad - (top-1) * sample
        out_dim[0] = CudaNdarray_HOST_DIMS(top)[1];
        out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1];
        out_dim[2] = kH;  // already inferred further above
        out_dim[3] = kW;  // how convenient
        out_dim[4] = kD;
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width, depth)
        // height, width and depth: bottom = (top-1) * sample + weights - 2*pad
        out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
        out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
        out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
        out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
        out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + CudaNdarray_HOST_DIMS(weights)[4] - 2*padD;
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
        %(fail)s
    }

    // Prepare output array (reuse it only if it already has the right shape)
    if (!(%(out)s
          && %(out)s->nd == 5
          && CudaNdarray_is_c_contiguous(%(out)s)
          && CudaNdarray_HOST_DIMS(%(out)s)[0] == out_dim[0]
          && CudaNdarray_HOST_DIMS(%(out)s)[1] == out_dim[1]
          && CudaNdarray_HOST_DIMS(%(out)s)[2] == out_dim[2]
          && CudaNdarray_HOST_DIMS(%(out)s)[3] == out_dim[3]
          && CudaNdarray_HOST_DIMS(%(out)s)[4] == out_dim[4]))
    {
        Py_XDECREF(%(out)s);
        %(out)s = (CudaNdarray*)CudaNdarray_NewDims(5, out_dim);
        if (NULL == %(out)s)
        {
            PyErr_Format(PyExc_RuntimeError,
                         "BaseGpuCorr3dM: Failed to allocate output of %%d x %%d x %%d x %%d x %%d",
                         out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]);
            %(fail)s
        }
    }

    // Call CUDA code
    out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dD, padH, padW, padD);
    if (out2==NULL){
        %(fail)s
    }
    assert (out2 == %(out)s);
""" % sub
class GpuCorr3dMM(BaseGpuCorr3dMM):
    """GPU 3d correlation implementation using Matrix Multiplication.

    :warning: For 700 series Nvidia GPUs of compute capability 3.5 and CUDA 5.0
        to 6.0, there is a bug in CUBLAS' matrix multiplication function that
        can make GpuCorrMM or its gradients crash for some input and filter
        shapes. So if you have a Tesla K20, Tesla K40, Quadro K6000, GeForce GT
        640 (DDR5), GeForce GTX 780 (or Ti), GeForce GTX TITAN (or Black or Z)
        and experience a crash, switching to CUDA 6.5 or CUDA 4.2 should fix it.
        If this is not possible, changing the input or filter shapes (e.g., the
        batchsize or number of filters) may also work around the CUBLAS bug.
    """
    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 pad=(0, 0, 0)):
        """
        :param border_mode: currently supports "valid" only; "full" can be
            simulated by setting `pad="full"` (at the cost of performance), or
            by using `GpuCorrMM_gradInputs`
        :param subsample: the subsample operation applied to each output image.
            Should be a tuple with 3 elements.
            `(sv, sh, sl)` is equivalent to
            `GpuCorrMM(...)(...)[:,:,::sv, ::sh, ::sl]`, but faster.
            Set to `(1, 1, 1)` to disable subsampling.
        :param pad: the width of a border of implicit zeros to pad the input
            image with. Should be a tuple with 3 elements giving the numbers of
            rows and columns to pad on each side, or "half" to set the padding
            to `(kernel_rows // 2, kernel_columns // 2, kernel_depth // 2)`,
            or "full" to set the padding to `(kernel_rows - 1,
            kernel_columns - 1, kernel_depth - 1)` at runtime.
            Set to `(0, 0, 0)` to disable padding.

        :note: Currently, the Op requires the inputs, filters and outputs to be
            C-contiguous. Use :func:`gpu_contiguous
            <theano.sandbox.cuda.basic_ops.gpu_contiguous>` on these arguments
            if needed.
        """
        super(GpuCorr3dMM, self).__init__(border_mode, subsample, pad)

    def make_node(self, img, kern):
        img = as_cuda_ndarray_variable(img)
        kern = as_cuda_ndarray_variable(kern)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                         False, False, False]
        return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        top, = out_
        direction = "forward"
        return super(GpuCorr3dMM, self).c_code_helper(bottom, weights, top, direction, sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
        top = gpu_contiguous(top)
        # The gradient ops need the last *three* spatial dimensions
        # (height, width, depth); [-2:] would only pass two of them and
        # break make_node's shape unpacking when it is actually used.
        d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)(
            weights, top, bottom.shape[-3:])
        d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)(
            bottom, top, weights.shape[-3:])
        return d_bottom, d_weights
class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
    """Gradient wrt. filters for `GpuCorr3dMM`.

    :note: You will not want to use this directly, but rely on Theano's
        automatic differentiation or graph optimization to use it as needed."""
    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 pad=(0, 0, 0)):
        super(GpuCorr3dMM_gradWeights, self).__init__(border_mode, subsample, pad)

    def make_node(self, img, topgrad, shape=None):
        img = as_cuda_ndarray_variable(img)
        topgrad = as_cuda_ndarray_variable(topgrad)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if self.subsample != (1, 1, 1) or self.pad == "half":
            # the filter shape cannot be inferred from the outputs in these
            # cases, so it must be supplied explicitly (height, width, depth)
            if shape is None:
                raise ValueError('shape must be given if subsample != (1, 1, 1), or pad == "half"')
            height_width_depth = [shape[0], shape[1], shape[2]]
        else:
            height_width_depth = []
        broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
                         False, False, False]
        # (removed a leftover debug `print` statement here; it was also
        # Python-2-only syntax)
        return Apply(self, [img, topgrad] + height_width_depth, [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        weights, = out_
        direction = "backprop weights"
        return super(GpuCorr3dMM_gradWeights, self).c_code_helper(bottom, weights, top, direction, sub, height, width, depth)

    def grad(self, inp, grads):
        bottom, top = inp[:2]
        weights, = grads
        weights = gpu_contiguous(weights)
        # pass all three spatial dimensions ([-2:] would only pass two)
        d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)(
            weights, top, bottom.shape[-3:])
        d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)(
            bottom, weights)
        d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
        return (d_bottom, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0], [0]]  # no connection to height, width, depth
class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
    """Gradient wrt. inputs for `GpuCorr3dMM`.

    :note: You will not want to use this directly, but rely on Theano's
        automatic differentiation or graph optimization to use it as needed."""
    def __init__(self, border_mode="valid",
                 subsample=(1, 1, 1),
                 pad=(0, 0, 0)):
        super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample, pad)

    def make_node(self, kern, topgrad, shape=None):
        kern = as_cuda_ndarray_variable(kern)
        topgrad = as_cuda_ndarray_variable(topgrad)
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if self.subsample != (1, 1, 1) and shape is None:
            # with subsampling, the input shape cannot be inferred from the
            # output shape, so it must be supplied (height, width, depth)
            raise ValueError('shape must be given if subsample != (1, 1, 1)')
        height_width_depth = [shape[0], shape[1], shape[2]] if self.subsample != (1, 1, 1) else []
        broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
                         False, False, False]
        return Apply(self, [kern, topgrad] + height_width_depth, [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp[:2]
        height, width, depth = inp[2:] or (None, None, None)
        bottom, = out_
        direction = "backprop inputs"
        return super(GpuCorr3dMM_gradInputs, self).c_code_helper(bottom, weights, top, direction, sub, height, width, depth)

    def grad(self, inp, grads):
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
        # pass all three spatial dimensions ([-2:] would only pass two)
        d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)(
            bottom, top, weights.shape[-3:])
        d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)(
            bottom, weights)
        d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
        return (d_weights, d_top) + d_height_width_depth

    def connection_pattern(self, node):
        # connection_pattern must have exactly one entry per input;
        # a 2-input node (kern, topgrad) previously returned 3 entries.
        if node.nin == 2:
            return [[1], [1]]
        else:
            return [[1], [1], [0], [0], [0]]  # no connection to height, width, depth
##
# Not really a BLAS operation, but whatever.
#
......
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#undef _GLIBCXX_ATOMIC_BUILTINS
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/caffe_common.hpp)
// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// CUDA: thread number configuration.
// Use 1024 threads per block, which requires cuda sm_2x or above,
// or fall back to attempt compatibility (best of luck to you).
#if __CUDA_ARCH__ >= 200
const int CUDA_NUM_THREADS = 1024;
#else
const int CUDA_NUM_THREADS = 512;
#endif
// CUDA: number of blocks for threads.
// CUDA: number of blocks needed to cover N threads,
// i.e. ceil(N / CUDA_NUM_THREADS) via integer arithmetic.
inline int GET_BLOCKS(const int N) {
  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
// (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy.
// Unfolds a multi-channel 3d image into a column matrix: each thread
// handles one output position (channel_in, h_out, w_out, d_out) and copies
// the kernel_h x kernel_w x kernel_d patch beneath it into one column of
// data_col, writing zeros wherever the patch reaches into the padding.
// n is the total number of such positions (channels * height_col *
// width_col * depth_col); grid-stride looping covers all of them.
__global__ void im3d2col_kernel(const int n, const float* data_im,
                                const int height, const int width,
                                const int depth,
                                const int kernel_h, const int kernel_w,
                                const int kernel_d,
                                const int pad_h, const int pad_w,
                                const int pad_d,
                                const int stride_h, const int stride_w,
                                const int stride_d,
                                const int height_col, const int width_col,
                                const int depth_col,
                                float* data_col)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // decompose the flat index into (channel_in, h_out, w_out, d_out),
    // with d_out varying fastest (C order)
    int d_out = index % depth_col;
    int w_index = index / depth_col;
    int w_out = w_index % width_col;
    int h_index = w_index / width_col;
    int h_out = h_index % height_col;
    int channel_in = h_index / height_col;
    // each input channel expands to kernel_h*kernel_w*kernel_d output rows
    int channel_out = channel_in * kernel_h * kernel_w * kernel_d;
    // top-left-front corner of the patch in (possibly negative) image coords
    int h_in = h_out * stride_h - pad_h;
    int w_in = w_out * stride_w - pad_w;
    int d_in = d_out * stride_d - pad_d;
    // point data_col_ptr at row channel_out, column (h_out, w_out, d_out);
    // successive patch elements are height_col*width_col*depth_col apart
    float* data_col_ptr = data_col;
    data_col_ptr += channel_out * (height_col * width_col * depth_col) +
                    h_out * (width_col * depth_col) + w_out * depth_col + d_out;
    const float* data_im_ptr = data_im;
    data_im_ptr += channel_in * (height * width * depth) +
                   h_in * (width * depth) + w_in * depth + d_in;
    for (int i = 0; i < kernel_h; ++i)
    {
      int h = h_in + i;
      for (int j = 0; j < kernel_w; ++j)
      {
        int w = w_in + j;
        for (int k = 0; k < kernel_d; ++k)
        {
          int d = d_in + k;
          // zero-fill where the patch extends past the image border
          *data_col_ptr = (h >= 0 && w >= 0 && d >= 0 &&
                           h < height && w < width && d < depth) ?
                          data_im_ptr[i * (width * depth) + j *depth + k] : 0;
          data_col_ptr += height_col * width_col * depth_col;
        }
      }
    }
  }
}
// Host-side launcher for im3d2col_kernel: unfolds a single image of
// `channels` channels into data_col. Output spatial sizes follow the
// standard convolution formula (size + 2*pad - kernel) / stride + 1.
// Note: errors from the kernel launch must be checked by the caller
// via cudaGetLastError().
void im3d2col(const float* data_im, const int channels,
              const int height, const int width, const int depth,
              const int kernel_h, const int kernel_w, const int kernel_d,
              const int pad_h, const int pad_w, const int pad_d,
              const int stride_h, const int stride_w, const int stride_d,
              float* data_col)
{
  // We are going to launch channels * height_col * width_col * depth_col
  // kernels, each kernel responsible for copying a single-channel grid.
  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
  int depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
  int num_kernels = channels * height_col * width_col * depth_col;
  im3d2col_kernel<<<GET_BLOCKS(num_kernels),
                    CUDA_NUM_THREADS>>>(num_kernels, data_im,
                                        height, width, depth,
                                        kernel_h, kernel_w, kernel_d,
                                        pad_h, pad_w, pad_d,
                                        stride_h, stride_w, stride_d,
                                        height_col, width_col, depth_col,
                                        data_col);
}
// Inverse of im3d2col: folds the column matrix data_col back into the
// image data_im, summing overlapping contributions. Each thread owns one
// image element (c, h, w, d) and accumulates every column entry that was
// copied from it, which avoids atomic adds entirely.
__global__ void col2im3d_kernel(const int n, const float* data_col,
                                const int height, const int width,
                                const int depth, const int channels,
                                const int patch_h, const int patch_w,
                                const int patch_d, const int pad_h,
                                const int pad_w, const int pad_d,
                                const int stride_h, const int stride_w,
                                const int stride_d, const int height_col,
                                const int width_col, const int depth_col,
                                float* data_im)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    float val = 0;
    // decompose flat index into (c, h, w, d); h/w/d are shifted by the
    // padding so they index the virtual padded image
    int d = index % depth + pad_d;
    int w_index = index / depth;
    int w = w_index % width + pad_w;
    int h_index = w_index / width;
    int h = h_index % height + pad_h;
    int c = h_index / height;
    // compute the range of output positions whose patches cover (h, w, d)
    int d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1;
    int d_col_end = min(d / stride_d + 1, depth_col);
    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
    int w_col_end = min(w / stride_w + 1, width_col);
    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
    int h_col_end = min(h / stride_h + 1, height_col);
    // offset is the data_col index at (h_col, w_col, d_col) = (0, 0, 0);
    // the coeff_* terms are the per-step deltas along each axis, folding
    // together the change of patch-offset row and of column position
    int offset =
        (c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col;
    int coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
    int coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
    int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
    for (int d_col = d_col_start; d_col < d_col_end; ++d_col)
      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
        }
      }
    data_im[index] = val;
  }
}
// Host-side launcher for col2im3d_kernel: folds data_col back into a
// single image data_im of `channels` channels, accumulating overlaps.
// Note: errors from the kernel launch must be checked by the caller
// via cudaGetLastError().
void col2im3d(const float* data_col, const int channels,
              const int height, const int width, const int depth,
              const int patch_h, const int patch_w, const int patch_d,
              const int pad_h, const int pad_w, const int pad_d,
              const int stride_h, const int stride_w, const int stride_d,
              float* data_im)
{
  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
  int depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
  int num_kernels = channels * height * width * depth;
  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.
  col2im3d_kernel<<<GET_BLOCKS(num_kernels),
                    CUDA_NUM_THREADS>>>(num_kernels, data_col,
                                        height, width, depth, channels,
                                        patch_h, patch_w, patch_d,
                                        pad_h, pad_w, pad_d,
                                        stride_h, stride_w, stride_d,
                                        height_col, width_col, depth_col,
                                        data_im);
}
// Theano op code
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
// Theano op code
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
//   and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d.
//
// Performs a 3d correlation (direction 0), its gradient wrt. weights
// (direction 1) or wrt. inputs (direction 2) via im2col/col2im plus a
// CUBLAS sgemm per batch item. All three arrays must be 5d C-contiguous
// CudaNdarrays with consistent shapes. Returns the array that received
// the result (an alias of bottom, weight or top — no refcount change),
// or NULL/0 with a Python exception set on error.
CudaNdarray* corr3dMM(CudaNdarray *const bottom,
                      CudaNdarray *const weight,
                      CudaNdarray *const top,
                      const int direction,
                      const int dH = 1,
                      const int dW = 1,
                      const int dD = 1,
                      const int padH = 0,
                      const int padW = 0,
                      const int padD = 0)  // bug fix: default was 1, inconsistent with padH/padW
{
    // Validate ranks and contiguity of all three operands
    if (bottom->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(bottom))
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM requires bottom to be C-contiguous, "
                     "but strides are: %d %d %d %d %d\n",
                     CudaNdarray_HOST_STRIDES(bottom)[0],
                     CudaNdarray_HOST_STRIDES(bottom)[1],
                     CudaNdarray_HOST_STRIDES(bottom)[2],
                     CudaNdarray_HOST_STRIDES(bottom)[3],
                     CudaNdarray_HOST_STRIDES(bottom)[4]);
        return 0;
    }
    if (weight->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
        return 0;
    }
    if (!CudaNdarray_is_c_contiguous(weight))
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM requires weight to be C-contiguous, "
                     "but strides are: %d %d %d %d %d\n",
                     CudaNdarray_HOST_STRIDES(weight)[0],
                     CudaNdarray_HOST_STRIDES(weight)[1],
                     CudaNdarray_HOST_STRIDES(weight)[2],
                     CudaNdarray_HOST_STRIDES(weight)[3],
                     CudaNdarray_HOST_STRIDES(weight)[4]);
        return 0;
    }
    if (top->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
        return 0;
    }
    if (!CudaNdarray_is_c_contiguous(top))
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM requires top to be C-contiguous, "
                     "but strides are: %d %d %d %d %d\n",
                     CudaNdarray_HOST_STRIDES(top)[0],
                     CudaNdarray_HOST_STRIDES(top)[1],
                     CudaNdarray_HOST_STRIDES(top)[2],
                     CudaNdarray_HOST_STRIDES(top)[3],
                     CudaNdarray_HOST_STRIDES(top)[4]);
        return 0;
    }

    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
    const int batchSize = CudaNdarray_HOST_DIMS(bottom)[0];
    const int nChannels = CudaNdarray_HOST_DIMS(bottom)[1];
    const int bottomHeight = CudaNdarray_HOST_DIMS(bottom)[2];
    const int bottomWidth = CudaNdarray_HOST_DIMS(bottom)[3];
    const int bottomDepth = CudaNdarray_HOST_DIMS(bottom)[4];
    // weights: (nFilters, nChannels, rows, columns, depth)
    const int nFilters = CudaNdarray_HOST_DIMS(weight)[0];
    const int kH = CudaNdarray_HOST_DIMS(weight)[2];
    const int kW = CudaNdarray_HOST_DIMS(weight)[3];
    const int kD = CudaNdarray_HOST_DIMS(weight)[4];
    if (nChannels != CudaNdarray_HOST_DIMS(weight)[1])
    {
        PyErr_SetString(PyExc_ValueError,
                        "GpuCorr3dMM images and kernel must have the same stack size\n");
        return 0;
    }
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
    const int topHeight = int((bottomHeight + 2*padH - kH) / dH) + 1;
    const int topWidth = int((bottomWidth + 2*padW - kW) / dW) + 1;
    const int topDepth = int((bottomDepth + 2*padD - kD) / dD) + 1;
    if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
        nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
        topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
        topWidth != CudaNdarray_HOST_DIMS(top)[3] ||
        topDepth != CudaNdarray_HOST_DIMS(top)[4])
    {
        PyErr_Format(PyExc_ValueError,
                     "GpuCorr3dMM shape inconsistency:\n"
                     "  bottom shape: %d %d %d %d %d\n"
                     "  weight shape: %d %d %d %d %d\n"
                     "  top shape: %d %d %d %d %d (expected %d %d %d %d %d)\n",
                     batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                     nFilters, nChannels, kH, kW, kD,
                     CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
                     CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3],
                     CudaNdarray_HOST_DIMS(top)[4],
                     batchSize, nFilters, topHeight, topWidth, topDepth);
        return 0;
    }

    // Create temporary columns buffer (one batch item at a time)
    int col_dim[2];
    col_dim[0] = nChannels * kW * kH * kD;
    col_dim[1] = topHeight * topWidth * topDepth;
    CudaNdarray* col = (CudaNdarray*) CudaNdarray_NewDims(2, col_dim);
    if (0 == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuCorr3dMM failed to allocate working memory of %d x %d\n",
                     col_dim[0], col_dim[1]);
        return 0;
    }

    // Define some useful variables
    const int bottom_stride = CudaNdarray_HOST_STRIDES(bottom)[0];
    const int top_stride = CudaNdarray_HOST_STRIDES(top)[0];
    const int K_ = col_dim[0];
    const int N_ = col_dim[1];
    const int M_ = nFilters;
    const float one = 1.0f;
    const float zero = 0.0f;
    CudaNdarray *output;
    if (direction == 0)
    {   // forward pass: valid correlation, i.e. im2col then gemm.
        output = top;
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // First, im3d2col
            im3d2col(bottom->devdata + n * bottom_stride,
                     nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     padH, padW, padD,
                     dH, dW, dD,
                     col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUDA error in im2col: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cudaGetErrorString(err));
                Py_DECREF(col);  // bug fix: do not leak the columns buffer
                return 0;
            }
            // Second, gemm
            cublasStatus_t status = cublasSgemm(handle,
                                                CUBLAS_OP_N, CUBLAS_OP_N,
                                                N_, M_, K_,
                                                &one,
                                                col->devdata, N_,
                                                weight->devdata, K_,
                                                &zero,
                                                top->devdata + n * top_stride, N_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cublasGetErrorString(status));
                Py_DECREF(col);
                return 0;
            }
        }
    }
    else if (direction == 1)
    {   // backprop wrt. weights: valid convolution, i.e. im2col then gemm.
        output = weight;
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // First, im2col
            im3d2col(bottom->devdata + n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     padH, padW, padD,
                     dH, dW, dD,
                     col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUDA error in im2col: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cudaGetErrorString(err));
                Py_DECREF(col);
                return 0;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            cublasStatus_t status = cublasSgemm(handle,
                                                CUBLAS_OP_T, CUBLAS_OP_N,
                                                K_, M_, N_,
                                                &one,
                                                col->devdata, N_,
                                                top->devdata + n * top_stride, N_,
                                                (n == 0) ? &zero : &one,
                                                weight->devdata, K_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cublasGetErrorString(status));
                Py_DECREF(col);
                return 0;
            }
        }
    }
    else if (direction == 2)
    {   // backprop wrt. inputs: full convolution, i.e. gemm then col2im3d.
        output = bottom;
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // gemm into columns
            cublasStatus_t status = cublasSgemm(handle,
                                                CUBLAS_OP_N, CUBLAS_OP_T,
                                                N_, K_, M_,
                                                &one,
                                                top->devdata + n * top_stride, N_,
                                                weight->devdata, K_,
                                                &zero,
                                                col->devdata, N_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cublasGetErrorString(status));
                Py_DECREF(col);
                return 0;
            }
            // col2im3d back to the data
            col2im3d(col->devdata, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     padH, padW, padD,
                     dH, dW, dD, bottom->devdata + n * bottom_stride);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM encountered a CUDA error in col2im: %s\n"
                             "This could be a known bug in CUDA, please see the "
                             "GpuCorr3dMM() documentation.\n",
                             cudaGetErrorString(err));
                Py_DECREF(col);
                return 0;
            }
        }
    }
    else
    {   // bug fix: `output` was left uninitialized for an invalid direction
        PyErr_SetString(PyExc_ValueError,
                        "GpuCorr3dMM: direction must be 0, 1, or 2\n");
        Py_DECREF(col);
        return 0;
    }
    // Free temporary columns
    Py_DECREF(col);
    // Note that we don't change the refcount of the output matrix here. Output
    // allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
import unittest

import numpy

import theano
from theano.tests import unittest_tools as utt
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
    raise SkipTest('Optional package cuda not available')
from theano.sandbox.cuda import float32_shared_constructor as shared
# NOTE: the original line imported GpuCorr3dMM_gradInputs twice; the
# duplicate has been removed.
from theano.sandbox.cuda.blas import (GpuCorr3dMM, GpuCorr3dMM_gradWeights,
                                      GpuCorr3dMM_gradInputs)
from theano.sandbox.cuda.basic_ops import gpu_contiguous

# Build a GPU-enabled compilation mode; FAST_COMPILE lacks the GPU
# optimizations, so fall back to FAST_RUN in that case.
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
class TestCorr3DMM(unittest.TestCase):
    """Compare the GEMM-based GpuCorr3dMM ops against the reference
    ``theano.tensor.nnet`` 3D convolution ops (conv3D, convGrad3D,
    convTransp3D)."""

    def run_conv_valid(self, inputs_shape, filters_shape,
                       subsample=(1, 1, 1)):
        """Check the GpuCorr3dMM forward pass against conv3D.

        ``inputs_shape`` / ``filters_shape`` are given in conv3D layout
        (batch, h, w, d, channels); GpuCorr3dMM expects
        (batch, channels, h, w, d), hence the dimshuffles below.
        """
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = shared(inputs_val)
        filters = shared(filters_val)
        # conv3D requires a bias; zeros make it equal to plain correlation.
        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))
        conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters,
                                             b=bias, d=subsample)
        conv = GpuCorr3dMM(border_mode="valid",
                           subsample=subsample)(
            inputs.dimshuffle(0, 4, 1, 2, 3),
            filters.dimshuffle(0, 4, 1, 2, 3))
        # Shuffle the GpuCorr3dMM output back to conv3D layout for comparison.
        conv = conv.dimshuffle(0, 2, 3, 4, 1)
        f_ref = theano.function([], conv_ref)
        f = theano.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res, rtol=1e-05, atol=1e-05)

    def test_valid(self):
        # One case per distinct subsampling pattern (the original file
        # repeated the (2, 2, 2) and (3, 3, 3) cases verbatim; the exact
        # duplicates have been dropped).
        self.run_conv_valid(inputs_shape=(16, 20, 32, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1))
        self.run_conv_valid(inputs_shape=(16, 20, 32, 15, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            subsample=(2, 2, 2))
        self.run_conv_valid(inputs_shape=(16, 20, 32, 15, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            subsample=(3, 3, 3))
        self.run_conv_valid(inputs_shape=(16, 20, 32, 15, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            subsample=(3, 2, 1))
        self.run_conv_valid(inputs_shape=(16, 20, 32, 15, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            subsample=(1, 2, 3))

    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1, 1)):
        """Check GpuCorr3dMM_gradWeights against convGrad3D.

        ``dCdH_shape`` must be the output-gradient shape matching
        ``inputs_shape``, ``filters_shape`` and ``subsample``.
        """
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = shared(inputs_val)
        dCdH = shared(dCdH_val)
        filters = shared(filters_val)
        conv = theano.tensor.nnet.convGrad3D(V=inputs, dCdH=dCdH,
                                             WShape=filters_shape,
                                             d=subsample)
        img = gpu_contiguous(inputs.dimshuffle(0, 4, 1, 2, 3))
        topgrad = gpu_contiguous(dCdH.dimshuffle(0, 4, 1, 2, 3))
        if subsample == (1, 1, 1):
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(img,
                                                                     topgrad)
        else:
            # With subsampling the filter shape is ambiguous and must be
            # given explicitly (spatial dims h, w, d).
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                img, topgrad, shape=filters.shape[1:4])
        conv_gemm = conv_gemm.dimshuffle(0, 2, 3, 4, 1)
        f_ref = theano.function([], conv)
        # Consistency fix: compile with the GPU mode like run_conv_valid
        # (the original omitted mode= here).
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res, rtol=1e-04, atol=1e-04)

    def test_gradweight(self):
        self.run_gradweight(inputs_shape=(16, 20, 32, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 15, 21, 13, 10),
                            subsample=(1, 1, 1))
        self.run_gradweight(inputs_shape=(16, 20, 32, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 8, 11, 7, 10),
                            subsample=(2, 2, 2))
        self.run_gradweight(inputs_shape=(16, 20, 32, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 5, 7, 5, 10),
                            subsample=(3, 3, 3))
        self.run_gradweight(inputs_shape=(16, 20, 32, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 8, 21, 5, 10),
                            subsample=(2, 1, 3))

    def run_gradinput(self, inputs_shape, filters_shape,
                      subsample=(1, 1, 1)):
        """Check GpuCorr3dMM_gradInputs against convTransp3D."""
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
        conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=subsample,
                                               H=inputs)
        f_ref = theano.function([], conv)
        res_ref = f_ref()
        # Recover the bottom (input-gradient) shape from the reference
        # implementation; only its shape is used below.
        bottom_shape = res_ref.shape
        bottom_val = numpy.random.random(bottom_shape).astype('float32')
        bottom = shared(bottom_val)
        weight = gpu_contiguous(filters.dimshuffle(0, 4, 1, 2, 3))
        top = gpu_contiguous(inputs.dimshuffle(0, 4, 1, 2, 3))
        if subsample == (1, 1, 1):
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=weight, topgrad=top)
        else:
            # With subsampling the input shape is ambiguous and must be
            # given explicitly (spatial dims h, w, d).
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=weight, topgrad=top, shape=bottom.shape[1:4])
        conv_gemm = conv_gemm.dimshuffle(0, 2, 3, 4, 1)
        # Consistency fix: compile with the GPU mode like run_conv_valid
        # (the original omitted mode= here).
        f = theano.function([], conv_gemm, mode=mode_with_gpu)
        res = f()
        utt.assert_allclose(res_ref, res, rtol=1e-04, atol=1e-04)

    def test_gradinput(self):
        self.run_gradinput(inputs_shape=(16, 15, 21, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1))
        self.run_gradinput(inputs_shape=(16, 15, 21, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1),
                           subsample=(2, 2, 2))
        self.run_gradinput(inputs_shape=(16, 15, 21, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1),
                           subsample=(3, 3, 3))
        self.run_gradinput(inputs_shape=(16, 15, 21, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1),
                           subsample=(3, 1, 2))
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论