提交 a725adf3 authored 作者: f0k's avatar f0k

Refactored GpuCorrMM to be split into separate ops for the forward pass and the two backward passes

上级 e76a29d9
......@@ -8,6 +8,7 @@ from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda import as_cuda_ndarray_variable
from theano.sandbox.cuda.basic_ops import gpu_contiguous
class GpuDot22(GpuOp):
......@@ -500,60 +501,22 @@ gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True)
class GpuCorrMM(GpuOp):
"""GPU correlation/convolution implementation using Matrix Multiplication.
class BaseGpuCorrMM(GpuOp):
"""Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
`GpuCorrMM_gradInputs`. Cannot be used directly."""
:note: It doesn't implement the grad. So you shouldn't use it directly, but
use :func:`conv2d <theano.tensor.nnet.conv.conv2d>` and then enable the
Theano flag ``optimizer_including=conv_gemm`` to automatically replace
all convolution operations with `GpuCorrMM`.
"""
def __init__(self, border_mode,
def __init__(self, border_mode="valid",
subsample=(1, 1),
pad=(0, 0)):
"""
:param border_mode: "valid" or "full"
:param subsample: the subsample operation applied to each output image.
Should be a tuple with 2 elements.
(sv, sh) is equivalent to GpuCorrMM(...)(...)[:,:,::sv, ::sh]
If border_mode="full", this is instead treated as an upsampling
operation applied to each input image.
Set to (1, 1) to disable downsampling/upsampling.
:param pad: the width of a border of implicit zeros to pad the input
image with. Should be a tuple with 2 elements giving the numbers of
rows and columns to pad on each side, or "auto" to set the padding
to (kernel_rows - 1, kernel_columns - 1) at runtime.
If border_mode="full", this is instead treated as the width of a
border to crop from the output image.
Set to (0, 0) to disable padding/cropping.
:note: The border_mode changes the meaning of several parameters.
If border_mode="valid", the Op does a valid correlation of a padded
input image and subsamples it. (To perform a convolution instead,
you will need to flip the kernels.)
If border_mode="full", the Op does a full convolution of an
upsampled input image and crops it. (This can be used as a backward
pass of the valid correlation done with border_mode="valid".)
Combined with pad="auto", you can use border_mode="valid" to
simulate a full correlation with subsampling, or border_mode="full"
to simulate a valid convolution with upsampling.
:note: Currently, the Op requires a very specific memory layout.
For border_mode="valid", inputs, filters and outputs must be
C-contiguous. For border_mode="full", the same applies, except that
the strides of the first two dimensions of the filters (output and
input channels) must be swapped compared to C-contiguity.
"""
if border_mode != "valid":
raise ValueError("border_mode must be 'valid'")
self.border_mode = border_mode
if len(subsample) != 2:
raise ValueError("subsample must have two elements")
self.subsample = subsample
#if (border_mode == "full") and (subsample != (1,1)):
# raise NotImplementedError(
# "GpuCorrMM doesn't support subsampling for border_mode='full'")
if (pad != "auto") and (len(pad) != 2):
raise ValueError("pad must be 'auto' or have two elements")
self.pad = pad
#if (border_mode == "full") and (pad != (0,0)):
# raise NotImplementedError(
# "GpuCorrMM doesn't support padding for border_mode='full'")
def __eq__(self, other):
return type(self) == type(other) \
......@@ -576,34 +539,19 @@ class GpuCorrMM(GpuOp):
str(self.subsample),
self.pad)
def make_node(self, img, kern):
img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def flops(self, inputs, outputs):
images, kerns = inputs
out, = outputs
assert images[1] == kerns[1]
flops = 0
if self.border_mode == "valid":
# nb mul and add by output pixel
flops = kerns[2] * kerns[3] * 2
# nb flops by output image
flops *= out[2] * out[3]
# nb patch multiplied
flops *= images[1] * kerns[0] * images[0]
else:
flops = (images[0] * kerns[0] * images[1] *
kerns[2] * kerns[3] *
images[2] * images[3] * 2)
def flops(self, inp, outp):
    """Approximate FLOP count for this Op, used by the profilemode hack
    to report MFlops.

    Provided the output shape is correct, the same count holds for any
    direction, subsampling, padding, and border mode.
    """
    (images, filters), (out,) = inp, outp
    # Input channels of the images must match those of the filters.
    assert images[1] == filters[1]
    # Two flops (one multiply, one add) per filter element, per output pixel.
    per_pixel = 2 * filters[2] * filters[3]
    # Scale by the number of output pixels per image...
    per_image = per_pixel * out[2] * out[3]
    # ...and by the number of (batch item, filter, channel) patches multiplied.
    return per_image * images[1] * filters[0] * images[0]
def c_headers(self):
......@@ -621,61 +569,98 @@ class GpuCorrMM(GpuOp):
for f in files]
return reduce(str.__add__, codes)
def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp
out, = out_
dx = self.subsample[0]
dy = self.subsample[1]
def c_code(self, bottom, weights, top, direction, sub):
# This is the shared code for GpuCorrMM (direction="forward"),
# GpuCorrMM_gradWeights (direction="backprop weights"), and
# GpuCorrMM_gradInputs (direction="backprop inputs").
# Depending on the direction, one of bottom, weights, top will
# receive the output, while the other two serve as inputs.
if self.border_mode != "valid":
raise ValueError("mode must be 'valid'")
dH, dW = self.subsample
if self.pad == "auto":
padH = padW = -1
else:
padH = self.pad[0]
padW = self.pad[1]
if self.border_mode == "valid":
bmode = 1
elif self.border_mode == "full":
bmode = 0
else:
raise ValueError("mode must be one of 'full' or 'valid'")
padH, padW = self.pad
if direction == "forward":
direction = 0
out = top
elif direction == "backprop weights":
direction = 1
out = weights
elif direction == "backprop inputs":
direction = 2
out = bottom
sub = sub.copy()
sub.update(locals())
return """
//Mandatory args
int mode = %(bmode)s;
// Mandatory args
int direction = %(direction)s; // forward, bprop weights, bprop inputs
//Optional args
int dx = %(dx)s;
int dy = %(dy)s;
// Optional args
int dH = %(dH)s;
int dW = %(dW)s;
int padH = %(padH)s;
int padW = %(padW)s;
CudaNdarray * img = %(img)s;
CudaNdarray * kern = %(kern)s;
CudaNdarray * bottom = %(bottom)s;
CudaNdarray * weights = %(weights)s;
CudaNdarray * top = %(top)s;
CudaNdarray * out2 = NULL;
//Auto-padding if requested
// Obtain or infer kernel width and height
int kH, kW;
if (direction != 1) {
kH = CudaNdarray_HOST_DIMS(weights)[2];
kW = CudaNdarray_HOST_DIMS(weights)[3];
}
else {
kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
}
// Auto-padding if requested
if (padH < 0) {
padH = CudaNdarray_HOST_DIMS(kern)[2] - 1;
padH = kH - 1;
}
if (padW < 0) {
padW = CudaNdarray_HOST_DIMS(kern)[3] - 1;
padW = kW - 1;
}
// Infer output shape
int out_dim[4];
out_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(kern)[0];
if (mode == 1) // valid correlation with padding and subsampling
{
out_dim[2] = ceil_intdiv(CudaNdarray_HOST_DIMS(img)[2] + 2*padH - CudaNdarray_HOST_DIMS(kern)[2] + 1, dx);
out_dim[3] = ceil_intdiv(CudaNdarray_HOST_DIMS(img)[3] + 2*padW - CudaNdarray_HOST_DIMS(kern)[3] + 1, dy);
}
else // full convolution with upsampling and cropping
{
out_dim[2] = (CudaNdarray_HOST_DIMS(img)[2] - 1) * dx + CudaNdarray_HOST_DIMS(kern)[2] - 2*padH;
out_dim[3] = (CudaNdarray_HOST_DIMS(img)[3] - 1) * dy + CudaNdarray_HOST_DIMS(kern)[3] - 2*padW;
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom + 2*pad - weight) / sample + 1
out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width)
// height and width: weights = bottom + 2*pad - (top - 1) * sample
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[2] = kH; // already inferred further above
out_dim[3] = kW; // how convenient
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + weights - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
out_dim[3] = (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
%(fail)s
}
// Prepare output array
if ( !(%(out)s
&& %(out)s->nd==4
&& CudaNdarray_is_c_contiguous(%(out)s)
......@@ -688,7 +673,8 @@ class GpuCorrMM(GpuOp):
%(out)s = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
}
out2 = corrMM(%(img)s, %(kern)s, %(out)s, mode, dx, dy, padH, padW);
// Call CUDA code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, padH, padW);
if (out2==NULL){
%(fail)s
}
......@@ -697,6 +683,132 @@ class GpuCorrMM(GpuOp):
""" % sub
class GpuCorrMM(BaseGpuCorrMM):
    """GPU correlation implementation using Matrix Multiplication.

    :note: You can either enable the Theano flag `optimizer_including=conv_gemm`
        to automatically replace all convolution operations with `GpuCorrMM`
        or one of its gradients, or you can use it as a replacement for
        :func:`conv2d <theano.tensor.nnet.conv.conv2d>`, called as
        `GpuCorrMM(subsample=...)(image, filters)`. The latter is currently
        faster, but note that it computes a correlation -- if you need to
        compute a convolution, flip the filters as `filters[:,:,::-1,::-1]`.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 pad=(0, 0)):
        """
        :param border_mode: currently supports "valid" only; "full" can be
            simulated by setting `pad="auto"` (at the cost of performance), or
            by using `GpuCorrMM_gradInputs`
        :param subsample: the subsample operation applied to each output image.
            Should be a tuple with 2 elements.
            `(sv, sh)` is equivalent to `GpuCorrMM(...)(...)[:,:,::sv, ::sh]`,
            but faster.
            Set to `(1, 1)` to disable subsampling.
        :param pad: the width of a border of implicit zeros to pad the input
            image with. Should be a tuple with 2 elements giving the numbers of
            rows and columns to pad on each side, or "auto" to set the padding
            to `(kernel_rows - 1, kernel_columns - 1)` at runtime.
            Set to `(0, 0)` to disable padding.

        :note: Currently, the Op requires the inputs, filters and outputs to be
            C-contiguous. Use :func:`gpu_contiguous
            <theano.sandbox.cuda.basic_ops.gpu_contiguous>` on these arguments
            if needed.
        """
        # Validation of the parameters happens in the base class.
        super(GpuCorrMM, self).__init__(border_mode, subsample, pad)

    def make_node(self, img, kern):
        # Both inputs must be 4D CudaNdarray variables on the GPU.
        img = as_cuda_ndarray_variable(img)
        kern = as_cuda_ndarray_variable(kern)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        # Output is (batch, num_filters, height, width): the batch axis
        # inherits broadcastability from the image, the filter axis from
        # the kernel; spatial axes are never broadcastable.
        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                         False, False]
        return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, weights = inp
        top, = out_
        # Delegate to the shared C code generator in BaseGpuCorrMM,
        # selecting the forward pass.
        direction = "forward"
        return super(GpuCorrMM, self).c_code(bottom, weights, top, direction, sub)

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
        # The gradient Ops require a C-contiguous output gradient.
        top = gpu_contiguous(top)
        # Each backward pass is its own Op, parameterized identically.
        d_bottom = GpuCorrMM_gradInputs(self.border_mode, self.subsample, self.pad)(
            weights, top)
        d_weights = GpuCorrMM_gradWeights(self.border_mode, self.subsample, self.pad)(
            bottom, top)
        return d_bottom, d_weights
class GpuCorrMM_gradWeights(BaseGpuCorrMM):
    """Gradient wrt. filters for `GpuCorrMM`.

    :note: You will not want to use this directly, but rely on Theano's
        automatic differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 pad=(0, 0)):
        # Parameters mirror GpuCorrMM; validation happens in the base class.
        super(GpuCorrMM_gradWeights, self).__init__(border_mode, subsample, pad)

    def make_node(self, img, topgrad):
        # Inputs: the forward pass's image and the gradient wrt. its output,
        # both as 4D CudaNdarray variables on the GPU.
        img = as_cuda_ndarray_variable(img)
        topgrad = as_cuda_ndarray_variable(topgrad)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        # Output is the filter gradient (num_filters, num_channels, kH, kW):
        # the filter axis inherits broadcastability from topgrad's channel
        # axis, the channel axis from the image's channel axis.
        broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
                        False, False]
        return Apply(self, [img, topgrad], [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp
        weights, = out_
        # Delegate to the shared C code generator in BaseGpuCorrMM,
        # selecting the backward pass wrt. the filters.
        direction = "backprop weights"
        return super(GpuCorrMM_gradWeights, self).c_code(bottom, weights, top, direction, sub)
class GpuCorrMM_gradInputs(BaseGpuCorrMM):
    """Gradient wrt. inputs for `GpuCorrMM`.

    :note: You will not want to use this directly, but rely on Theano's
        automatic differentiation or graph optimization to use it as needed.
    """

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 pad=(0, 0)):
        # Parameters mirror GpuCorrMM; validation happens in the base class.
        super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample, pad)

    def make_node(self, kern, topgrad):
        # Inputs: the forward pass's filters and the gradient wrt. its
        # output, both as 4D CudaNdarray variables on the GPU.
        kern = as_cuda_ndarray_variable(kern)
        topgrad = as_cuda_ndarray_variable(topgrad)
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        # Output is the input gradient (batch, num_channels, height, width):
        # the batch axis inherits broadcastability from topgrad, the channel
        # axis from the kernel's input-channel axis.
        broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
                        False, False]
        return Apply(self, [kern, topgrad], [CudaNdarrayType(broadcastable)()])

    def c_code(self, node, nodename, inp, out_, sub):
        weights, top = inp
        bottom, = out_
        # Delegate to the shared C code generator in BaseGpuCorrMM,
        # selecting the backward pass wrt. the inputs.
        direction = "backprop inputs"
        return super(GpuCorrMM_gradInputs, self).c_code(bottom, weights, top, direction, sub)
##
# Not really a BLAS operation, but whatever.
#
......
......@@ -161,18 +161,18 @@ void col2im(const float* data_col, const int channels,
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
CudaNdarray* corrMM(const CudaNdarray *input,
CudaNdarray *weight,
CudaNdarray *output,
int mode,
int dH = 1,
int dW = 1,
int padH = 0,
int padW = 0)
CudaNdarray* corrMM(CudaNdarray *const bottom,
CudaNdarray *const weight,
CudaNdarray *const top,
const int direction,
const int dH = 1,
const int dW = 1,
const int padH = 0,
const int padW = 0)
{
if (input->nd != 4)
if (bottom->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires input of 4D");
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires bottom of 4D");
}
if (weight->nd != 4)
......@@ -180,83 +180,75 @@ CudaNdarray* corrMM(const CudaNdarray *input,
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
}
if (output->nd != 4)
if (top->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires output of 4D");
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires top of 4D");
}
// Extract some shape information for later and check shape consistency
// inputs: (batchSize, nInputPlane, inputHeight, inputWidth)
const int batchSize = CudaNdarray_HOST_DIMS(input)[0];
const int nInputPlane = CudaNdarray_HOST_DIMS(input)[1];
const int inputHeight = CudaNdarray_HOST_DIMS(input)[2];
const int inputWidth = CudaNdarray_HOST_DIMS(input)[3];
// filters: (nOutputPlane, nInputPlane, rows, columns)
const int nOutputPlane = CudaNdarray_HOST_DIMS(weight)[0];
// bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
const int batchSize = CudaNdarray_HOST_DIMS(bottom)[0];
const int nChannels = CudaNdarray_HOST_DIMS(bottom)[1];
const int bottomHeight = CudaNdarray_HOST_DIMS(bottom)[2];
const int bottomWidth = CudaNdarray_HOST_DIMS(bottom)[3];
// weights: (nFilters, nChannels, rows, columns)
const int nFilters = CudaNdarray_HOST_DIMS(weight)[0];
const int kH = CudaNdarray_HOST_DIMS(weight)[2];
const int kW = CudaNdarray_HOST_DIMS(weight)[3];
if (nInputPlane != CudaNdarray_HOST_DIMS(weight)[1]) {
if (nChannels != CudaNdarray_HOST_DIMS(weight)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuCorrMM images and kernel must have the same stack size\n");
return NULL;
}
// outputs: (batchSize, nOutputPlane, outputHeight, outputWidth)
int outputHeight, outputWidth;
if (mode == 1) { // valid correlation with padding and subsampling
outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
}
else if (mode == 0) { // full convolution with upsampling and cropping
// these would be the shapes for a standard full convolution:
//outputHeight = (inputHeight + 2*padH + kH - 2) / dH + 1;
//outputWidth = (inputWidth + 2*padW + kW - 2) / dW + 1;
// but here, dH and dW are *upsampling* factors, and padding is reversed
// (because the implementation was meant as a backward pass for a CNN)
outputHeight = (inputHeight - 1) * dH + kH - 2*padH;
outputWidth = (inputWidth - 1) * dW + kW - 2*padW;
}
if (batchSize != CudaNdarray_HOST_DIMS(output)[0] ||
nOutputPlane != CudaNdarray_HOST_DIMS(output)[1] ||
outputHeight != CudaNdarray_HOST_DIMS(output)[2] ||
outputWidth != CudaNdarray_HOST_DIMS(output)[3]) {
// top: (batchSize, nFilters, topHeight, topWidth)
const int topHeight = (bottomHeight + 2*padH - kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - kW) / dW + 1;
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
topWidth != CudaNdarray_HOST_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM output parameter has wrong shape %d %d %d %d, expected %d %d %d %d\n",
CudaNdarray_HOST_DIMS(output)[0], CudaNdarray_HOST_DIMS(output)[1],
CudaNdarray_HOST_DIMS(output)[2], CudaNdarray_HOST_DIMS(output)[3],
batchSize, nOutputPlane, outputHeight, outputWidth);
"GpuCorrMM shape inconsistency: From bottom and weights, "
"top shape should be %d %d %d %d, but is %d %d %d %d.\n",
batchSize, nFilters, topHeight, topWidth,
CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3]);
return NULL;
}
if (mode == 1) { // valid correlation: im2col, then gemm
// Create temporary columns (col_data)
int col_dim[2];
col_dim[0] = nInputPlane * kW * kH;
col_dim[1] = outputHeight * outputWidth;
CudaNdarray* col_data = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
// Create temporary columns
int col_dim[2];
col_dim[0] = nChannels * kW * kH;
col_dim[1] = topHeight * topWidth;
CudaNdarray* col = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
// Define some useful variables
const int bottom_stride = CudaNdarray_HOST_STRIDES(bottom)[0];
const int top_stride = CudaNdarray_HOST_STRIDES(top)[0];
const int K_ = col_dim[0];
const int N_ = col_dim[1];
const int M_ = nFilters;
const float one = 1.0f;
const float zero = 0.0f;
// Define some useful variables
const int ip_stride = CudaNdarray_HOST_STRIDES(input)[0];
const int op_stride = CudaNdarray_HOST_STRIDES(output)[0];
const int K_ = col_dim[0];
const int N_ = col_dim[1];
const int M_ = nOutputPlane;
const float alpha = 1.0f;
const float beta = 0.0f;
CudaNdarray *output;
if (direction == 0) { // forward pass
output = top;
// valid correlation: im2col, then gemm
// Iterate over batch
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col(input->devdata + n * ip_stride, nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, col_data->devdata);
im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
// Second, gemm
cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_N, CUBLAS_OP_N,
N_, M_, K_,
&alpha,
col_data->devdata, N_,
&one,
col->devdata, N_,
weight->devdata, K_,
&beta,
output->devdata + n * op_stride, N_);
&zero,
top->devdata + n * top_stride, N_);
if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n",
......@@ -264,17 +256,11 @@ CudaNdarray* corrMM(const CudaNdarray *input,
return NULL;
}
}
// Free temporary columns
Py_DECREF(col_data);
/*
// Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is for grouped convolution; we can ignore groups
const Dtype* bottom_data = bottom[i]->gpu_data();
Dtype* top_data = (*top)[i]->mutable_gpu_data();
Dtype* col_data = col_buffer_.mutable_gpu_data();
const Dtype* weight = this->blobs_[0]->gpu_data();
// Note that this is for grouped convolution; we can ignore groups here,
// but the group-related offsets help explain what M_, N_ and K_ are
int weight_offset = M_ * K_;
int col_offset = K_ * N_;
int top_offset = M_ * N_;
......@@ -300,33 +286,81 @@ CudaNdarray* corrMM(const CudaNdarray *input,
}
*/
}
else if (mode == 0) { // full convolution: gemm, then col2im
// Create temporary columns (col_diff)
int col_dim[2];
col_dim[0] = nOutputPlane * kW * kH;
col_dim[1] = inputHeight * inputWidth;
CudaNdarray* col_diff = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
// Define some useful variables
const int ip_stride = CudaNdarray_HOST_STRIDES(input)[0];
const int op_stride = CudaNdarray_HOST_STRIDES(output)[0];
const int K_ = col_dim[0];
const int N_ = col_dim[1];
const int M_ = nInputPlane;
const float alpha = 1.0f;
const float beta = 0.0f;
else if (direction == 1) { // backprop wrt. weights
output = weight;
// valid convolution: im2col, then gemm
// Initialize target with zeros as we will accumulate into it
// (all kernels run on the null stream, so we don't need to synchronize)
cudaError_t err = cudaMemsetAsync(weight->devdata, 0,
sizeof(float) * M_ * K_);
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUDA error: %s\n",
cudaGetErrorString(err));
return NULL;
}
// Iterate over batch
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
// Second, gemm
cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_T, CUBLAS_OP_N,
K_, M_, N_,
&one,
col->devdata, N_,
top->devdata + n * top_stride, N_,
&one,
weight->devdata, K_);
if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n",
cublasGetErrorString(status));
return NULL;
}
}
/*
// Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is for grouped convolution; we can ignore groups
for (int n = 0; n < num_; ++n) {
// Since we saved memory in the forward pass by not storing all col
// data, we will need to recompute them.
im2col_gpu(bottom_data + (*bottom)[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
stride_h_, stride_w_, col_data);
// gradient w.r.t. weight. Note that we will accumulate diffs.
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
(Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
col_data + col_offset * g, (Dtype)1.,
weight_diff + weight_offset * g);
== (see https://github.com/BVLC/caffe/blob/master/src/caffe/util/math_functions.cu#L16)
cublasSgemm(CUBLAS_OP_T, CUBLAS_OP_N, K_, M_, N_,
1.0,
col_data + col_offset * g, N_,
top_diff + top[i]->offset(n) + top_offset * g, N_,
1.0,
weight_diff + weight_offset * g, K_);
}
}
*/
}
else if (direction == 2) { // backprop wrt. inputs
output = bottom;
// full convolution: gemm, then col2im
// Iterate over batch
for (int n = 0; n < batchSize; n++) {
// gemm into columns
cublasStatus_t status = cublasSgemm(handle,
CUBLAS_OP_N, CUBLAS_OP_T,
N_, K_, M_,
&alpha,
input->devdata + n * ip_stride, N_,
&one,
top->devdata + n * top_stride, N_,
weight->devdata, K_,
&beta,
col_diff->devdata, N_);
&zero,
col->devdata, N_);
if (status != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM encountered a CUBLAS error: %s\n",
......@@ -334,22 +368,15 @@ CudaNdarray* corrMM(const CudaNdarray *input,
return NULL;
}
// col2im back to the data
col2im(col_diff->devdata, nOutputPlane, outputHeight, outputWidth,
kH, kW, padH, padW, dH, dW, output->devdata + n * op_stride);
col2im(col->devdata, nChannels, bottomHeight, bottomWidth,
kH, kW, padH, padW, dH, dW, bottom->devdata + n * bottom_stride);
}
// Free temporary columns
Py_DECREF(col_diff);
/*
// Original caffe code for comparison
// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// Note that this is the backward pass of a valid convolution, so
// top_diff is the input, bottom_diff is the output, weights are weights
Dtype* col_data = col_buffer_.mutable_gpu_data();
Dtype* col_diff = col_buffer_.mutable_gpu_diff();
Dtype* bottom_diff = (*bottom)[i]->mutable_gpu_diff();
for (int n = 0; n < num_; ++n) {
// gradient w.r.t. bottom data, if necessary
// gradient w.r.t. bottom data, if necessary
if (propagate_down[i]) {
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
(Dtype)1., weight + weight_offset * g,
......@@ -367,9 +394,13 @@ CudaNdarray* corrMM(const CudaNdarray *input,
col2im_gpu(col_diff, channels_, height_, width_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
bottom_diff + (*bottom)[i]->offset(n));
}
}
*/
}
// Free temporary columns
Py_DECREF(col);
return output;
}
......@@ -25,7 +25,8 @@ from theano.sandbox.cuda.basic_ops import (
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape)
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, GpuCorrMM)
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
......@@ -1354,19 +1355,23 @@ def local_conv_gemm(node):
border_mode = node.op.border_mode
subsample = node.op.subsample
pad = (0,0)
if (border_mode == 'full') and ((subsample != (1,1)) or (pad != (0,0))):
if (border_mode == 'full') and (subsample != (1,1)):
# need to simulate this via a padded valid convolution
pad = 'auto'
border_mode = 'valid'
if (border_mode == 'valid'):
# need to flip the kernel for valid convolution
kern = gpu_contiguous(kern[:, :, ::-1, ::-1])
kern = kern[:, :, ::-1, ::-1]
# call GpuCorrMM
# TODO: call GpuCorrMM_gradWeights instead if appropriate
return [GpuCorrMM('valid', subsample, pad)(
gpu_contiguous(img), gpu_contiguous(kern))]
elif (border_mode == 'full'):
# need to bring kernel into correct memory layout for full convolution
kern = gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)).dimshuffle(1, 0, 2, 3)
# need C-contiguous inputs
img = gpu_contiguous(img)
return [GpuCorrMM(border_mode, subsample, pad)(img, kern)]
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3)
# call GpuCorrMM_gradInputs
return [GpuCorrMM_gradInputs('valid', subsample, pad)(
gpu_contiguous(kern), gpu_contiguous(img))]
gpu_optimizer.register("conv_gemm", local_conv_gemm)
......
......@@ -186,7 +186,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
f = theano.function([i, k], op, mode=theano_mode)
if cls is not None:
assert any([isinstance(node.op, cls)
for node in f.maker.fgraph.toposort()]), f.maker.fgraph.toposort()
for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
gpuval = f(img, kern)
t2 = time.time()
for i in range(nb_iter):
......@@ -284,7 +284,7 @@ def exec_conv(version, shapes, verbose, random, mode,
cls=cls)
except Exception, e:
print ver, id, (ishape, kshape, subshape, istride, kstride)
print e
print "Exception", type(e), e
pass
if not ret:
failed_version.add(ver)
......@@ -634,7 +634,7 @@ def test_valid(conv_gemm=False):
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
# Add tests with strided inputs by still square images and filters.
......@@ -713,7 +713,7 @@ def test_full(conv_gemm=False):
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
else:
......@@ -753,7 +753,7 @@ def test_subsample(conv_gemm=False):
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.GpuCorrMM
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version_valid = version_full = [-1]
else:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论