Commit e514f4d3 authored by Vikram

GPU code and some more tests

Parent commit: cd10a53c
......@@ -468,16 +468,33 @@ class BaseGpuCorrMM(CGpuKernelBase):
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), num_groups=1, unshared=False):
if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode)
if isinstance(border_mode, tuple):
pad_h, pad_w = map(int, border_mode)
border_mode = (pad_h, pad_w)
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full', 'half')):
if border_mode < 0:
raise ValueError(
'invalid border_mode {}, which must be a '
'non-negative integer'.format(border_mode))
border_mode = ((border_mode, border_mode),) * 2
elif isinstance(border_mode, tuple):
if len(border_mode) != 2:
raise ValueError(
'invalid border_mode {} which must be a '
'tuple of length 2'.format(border_mode))
border = ()
for mode in border_mode:
if isinstance(mode, integer_types) and mode >= 0:
border += ((mode, mode),)
elif isinstance(mode, tuple) and len(mode) == 2 and \
min(mode) >= 0:
border += ((int(mode[0]), int(mode[1])),)
else:
raise ValueError(
'invalid border mode {}. The tuple can only contain '
'integers or tuples of length 2'.format(border_mode))
border_mode = border
elif border_mode not in ('valid', 'full', 'half'):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a pair of'
' integers'.format(border_mode))
'"valid", "full", "half", an integer or a tuple '
'of length 2'.format(border_mode))
self.border_mode = border_mode
if len(subsample) != 2:
raise ValueError("subsample must have two elements")
......@@ -495,7 +512,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def pad(self):
if self.border_mode != 'valid':
return self.border_mode
return (0, 0)
return ((0, 0),) * 2
def __str__(self):
return '%s{%s, %s, %s, %s, %s}' % (
......@@ -537,7 +554,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def c_code_cache_version(self):
    # Raise this whenever modifying the C code (including the file).
    # The stripped diff left both the old `return (11,)` and the new
    # `return (12,)` in place, making the bump unreachable; keep only the
    # bumped version (12 = asymmetric per-edge padding support).
    return (12,)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
"""
......@@ -587,14 +604,14 @@ class BaseGpuCorrMM(CGpuKernelBase):
numgroups = self.num_groups
unshared = int(self.unshared)
if self.border_mode == "half":
padH = padW = -1
padH_l = padH_r = padW_l = padW_r = -1
elif self.border_mode == "full":
padH = padW = -2
padH_l = padH_r = padW_l = padW_r = -2
elif isinstance(self.border_mode, tuple):
padH, padW = self.border_mode
(padH_l, padH_r), (padW_l, padW_r) = self.border_mode
else:
assert self.border_mode == "valid"
padH = padW = 0
padH_l = padH_r = padW_l = padW_r = 0
if direction == "forward":
direction = 0
out = top
......@@ -613,13 +630,13 @@ class BaseGpuCorrMM(CGpuKernelBase):
if height:
height = '(*(npy_int*)(PyArray_DATA(%s)))' % height
else:
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)):
if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH_l == -1)):
raise ValueError("height must be given for backprop with vertical sampling or pad='half'")
height = '-1'
if width:
width = '(*(npy_int*)(PyArray_DATA(%s)))' % width
else:
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW_l == -1)):
raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
width = '-1'
......@@ -635,8 +652,10 @@ class BaseGpuCorrMM(CGpuKernelBase):
size_t dW = %(dW)s;
size_t dilH = %(dilH)s;
size_t dilW = %(dilW)s;
int padH = %(padH)s;
int padW = %(padW)s;
int padH_l = %(padH_l)s;
int padH_r = %(padH_r)s;
int padW_l = %(padW_l)s;
int padW_r = %(padW_r)s;
int numgroups = %(numgroups)s;
int unshared = %(unshared)s;
......@@ -662,22 +681,22 @@ class BaseGpuCorrMM(CGpuKernelBase):
// kernel height is specified (perhaps vertical subsampling or half padding)
kH = %(height)s;
}
else if (padH == -2) {
else if (padH_l == -2 || padH_r == -2) {
// vertical full padding, we can infer the kernel height
kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
}
else {
// explicit padding, we can infer the kernel height
kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
kH = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
}
if (%(width)s != -1) {
kW = %(width)s;
}
else if (padW == -2) {
else if (padW_l == -2 || padW_r == -2) {
kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
else {
kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
kW = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
}
......@@ -686,23 +705,23 @@ class BaseGpuCorrMM(CGpuKernelBase):
dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested
if (padH == -1) { // vertical half padding
padH = dil_kH / 2;
if (padH_l == -1 || padH_r == -1) { // vertical half padding
padH_l = padH_r = dil_kH / 2;
}
else if (padH == -2) { // vertical full padding
padH = dil_kH - 1;
else if (padH_l == -2 || padH_r == -2) { // vertical full padding
padH_l = padH_r = dil_kH - 1;
}
else if (padH < 0) {
else if (padH_l < 0 || padH_r < 0) {
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2");
%(fail)s
}
if (padW == -1) { // horizontal half padding
padW = dil_kW / 2;
if (padW_l == -1 || padW_r == -1) { // horizontal half padding
padW_l = padW_r = dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = dil_kW - 1;
else if (padW_l == -2 || padW_r == -2) { // horizontal full padding
padW_l = padW_r = dil_kW - 1;
}
else if (padW < 0) {
else if (padW_l < 0 || padW_r < 0) {
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2");
%(fail)s
}
......@@ -718,11 +737,11 @@ class BaseGpuCorrMM(CGpuKernelBase):
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
// height and width: top = (bottom + pad_l + pad_r - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = PyGpuArray_DIMS(bottom)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[0];
out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
out_typecode = bottom->ga.typecode;
out_context = bottom->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
......@@ -810,8 +829,8 @@ class BaseGpuCorrMM(CGpuKernelBase):
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = PyGpuArray_DIMS(top)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups;
out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - 2*padW;
out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - padH_l - padH_r;
out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - padW_l - padW_r;
out_typecode = top->ga.typecode;
out_context = top->context;
if (unshared) {
......@@ -884,7 +903,8 @@ class BaseGpuCorrMM(CGpuKernelBase):
}
// Call GPU code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared);
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW,
padH_l, padH_r, padW_l, padW_r, numgroups, unshared);
if (out2==NULL){
%(fail)s
}
......
......@@ -42,7 +42,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
const ga_size height, const ga_size width,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size dilation_h, const ga_size dilation_w,
const ga_size pad_h, const ga_size pad_w,
const ga_size pad_hl, const ga_size pad_wl,
const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_col,
......@@ -57,8 +57,8 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
const ga_size w_col = index % width_col;
const ga_size c_im = h_index / height_col;
const ga_size c_col = c_im * kernel_h * kernel_w;
const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w;
const ga_size h_offset = h_col * stride_h - pad_hl;
const ga_size w_offset = w_col * stride_w - pad_wl;
GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
......@@ -86,7 +86,7 @@ KERNEL void im2col_kernel(const ga_size n,
// data_im_offset is an offset of elements in the array
const ga_size height, const ga_size width,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size pad_h, const ga_size pad_w,
const ga_size pad_hl, const ga_size pad_wl,
const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_col,
......@@ -101,8 +101,8 @@ KERNEL void im2col_kernel(const ga_size n,
const ga_size w_col = index % width_col;
const ga_size c_im = h_index / height_col;
const ga_size c_col = c_im * kernel_h * kernel_w;
const ga_size h_offset = h_col * stride_h - pad_h;
const ga_size w_offset = w_col * stride_w - pad_w;
const ga_size h_offset = h_col * stride_h - pad_hl;
const ga_size w_offset = w_col * stride_w - pad_wl;
GLOBAL_MEM DTYPE_INPUT_0 * data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
GLOBAL_MEM const DTYPE_INPUT_0 * data_im_ptr = data_im + data_im_offset;
......@@ -127,7 +127,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
const ga_size height, const ga_size width, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size dilation_h, const ga_size dilation_w,
const ga_size pad_h, const ga_size pad_w,
const ga_size pad_hl, const ga_size pad_wl,
const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_im,
......@@ -141,8 +141,8 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) {
DTYPE_INPUT_0 val = 0;
const ga_size w_im = index % width + pad_w;
const ga_size h_im = (index / width) % height + pad_h;
const ga_size w_im = index % width + pad_wl;
const ga_size h_im = (index / width) % height + pad_hl;
const ga_size c_im = index / (width * height);
ga_size kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
ga_size kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
......@@ -177,7 +177,7 @@ KERNEL void col2im_kernel(const ga_size n,
GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
const ga_size height, const ga_size width, const ga_size channels,
const ga_size kernel_h, const ga_size kernel_w,
const ga_size pad_h, const ga_size pad_w,
const ga_size pad_hl, const ga_size pad_wl,
const ga_size stride_h, const ga_size stride_w,
const ga_size height_col, const ga_size width_col,
GLOBAL_MEM DTYPE_INPUT_0 * data_im,
......@@ -191,8 +191,8 @@ KERNEL void col2im_kernel(const ga_size n,
for (ga_size index = GID_0 * LDIM_0 + LID_0;
index < (n); index += LDIM_0 * GDIM_0) {
DTYPE_INPUT_0 val = 0;
const ga_size w_im = index % width + pad_w;
const ga_size h_im = (index / width) % height + pad_h;
const ga_size w_im = index % width + pad_wl;
const ga_size h_im = (index / width) % height + pad_hl;
const ga_size c_im = index / (width * height);
// compute the start and end of the output
const ga_size w_col_start =
......@@ -254,15 +254,16 @@ int rgemm(cb_order o, cb_transpose tA, cb_transpose tB,
int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels,
const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
const size_t dilation_h, const size_t dilation_w,
const size_t pad_h, const size_t pad_w,
const size_t pad_hl, const size_t pad_hr,
const size_t pad_wl, const size_t pad_wr,
const size_t stride_h, const size_t stride_w,
GpuArray *data_col) {
// We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid.
size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
size_t dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
size_t height_col = (height + pad_hl + pad_hr - dil_kernel_h) / stride_h + 1;
size_t width_col = (width + pad_wl + pad_wr - dil_kernel_w) / stride_w + 1;
size_t num_kernels = channels * height_col * width_col;
int err;
if (dilation_h != 1 || dilation_w != 1) {
......@@ -270,7 +271,7 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels
1, &num_kernels, 0,
num_kernels, data_im->data, data_im->offset, data_im_offset,
height, width, kernel_h, kernel_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
dilation_h, dilation_w, pad_hl, pad_wl, stride_h, stride_w, height_col,
width_col, data_col->data, data_col->offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
......@@ -282,7 +283,7 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels
1, &num_kernels, 0,
num_kernels, data_im->data, data_im->offset, data_im_offset,
height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, height_col,
pad_hl, pad_wl, stride_h, stride_w, height_col,
width_col, data_col->data, data_col->offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
......@@ -296,12 +297,12 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels
int col2im(GpuArray *data_col, const size_t channels,
const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
const size_t dilation_h, const size_t dilation_w,
const size_t pad_h, const size_t pad_w, const size_t stride_h,
const size_t stride_w, GpuArray *data_im, const size_t data_im_offset) {
const size_t pad_hl, const size_t pad_hr, const size_t pad_wl, const size_t pad_wr,
const size_t stride_h, const size_t stride_w, GpuArray *data_im, const size_t data_im_offset) {
size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
size_t height_col = (height + pad_hl + pad_hr - dil_patch_h) / stride_h + 1;
size_t width_col = (width + pad_wl + pad_wr - dil_patch_w) / stride_w + 1;
size_t num_kernels = channels * height * width;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
......@@ -311,7 +312,7 @@ int col2im(GpuArray *data_col, const size_t channels,
1, &num_kernels, 0,
num_kernels, data_col->data, data_col->offset,
height, width, channels, patch_h, patch_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, pad_hl, pad_wl, stride_h, stride_w,
height_col, width_col, data_im->data, data_im->offset, data_im_offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
......@@ -323,7 +324,7 @@ int col2im(GpuArray *data_col, const size_t channels,
1, &num_kernels, 0,
num_kernels, data_col->data, data_col->offset,
height, width, channels, patch_h, patch_w,
pad_h, pad_w, stride_h, stride_w,
pad_hl, pad_wl, stride_h, stride_w,
height_col, width_col, data_im->data, data_im->offset, data_im_offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
......@@ -347,8 +348,10 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t dW = 1,
const size_t dilH = 1,
const size_t dilW = 1,
const size_t padH = 0,
const size_t padW = 0,
const size_t padH_l = 0,
const size_t padH_r = 0,
const size_t padW_l = 0,
const size_t padW_r = 0,
const size_t numgroups = 1,
const size_t unshared = 0)
{
......@@ -443,8 +446,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t dil_kH = (kH - 1) * dilH + 1;
const size_t dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth)
const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
const size_t topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
const size_t topHeightNoDH = (bottomHeight + padH_l + padH_r - dil_kH);
const size_t topWidthNoDW = (bottomWidth + padW_l + padW_r - dil_kW);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
......@@ -558,7 +561,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
err = im2col(&bottom->ga, n * batch_bottom_stride,
nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, &col->ga);
padH_l, padH_r, padW_l, padW_r, dH, dW, &col->ga);
if (err != GA_NO_ERROR) {
Py_DECREF(col);
return NULL;
......@@ -618,7 +621,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
err = im2col(&bottom->ga, n * batch_bottom_stride,
nChannels, bottomHeight,
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, &col->ga);
padH_l, padH_r, padW_l, padW_r, dH, dW, &col->ga);
if (err != GA_NO_ERROR) {
Py_DECREF(col);
return NULL;
......@@ -712,7 +715,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
}
// col2im back to the data
err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
kH, kW, dilH, dilW, padH, padW,
kH, kW, dilH, dilW, padH_l, padH_r, padW_l, padW_r,
dH, dW, &bottom->ga, n * batch_bottom_stride);
if (err != GA_NO_ERROR) {
Py_DECREF(col);
......
......@@ -12,6 +12,7 @@ from ..type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu, ref_cast
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
from theano.tensor.nnet.tests.test_abstract_conv import TestAsymmetricPadding
class TestCorrMM(unittest.TestCase):
......@@ -272,3 +273,10 @@ class TestUnsharedGpuCorr2d(TestUnsharedConv):
conv2d_op = GpuCorrMM
conv2d_gradw_op = GpuCorrMM_gradWeights
conv2d_gradi_op = GpuCorrMM_gradInputs
class TestAsymmetricGpu(TestAsymmetricPadding):
    """Run the shared asymmetric-padding convolution tests against the
    GPU CorrMM ops (cuDNN excluded so the CorrMM path is exercised)."""

    mode = mode_with_gpu.excluding('cudnn')
    conv2d_op = GpuCorrMM
    conv2d_gradw_op = GpuCorrMM_gradWeights
    conv2d_gradi_op = GpuCorrMM_gradInputs
......@@ -195,7 +195,7 @@ class BaseCorrMM(gof.OpenMPOp):
def c_code_cache_version(self):
    # raise this whenever modifying any of the support_code_files
    # The stripped diff left both the old version-9 return and the new
    # version-10 return in place, making the bump dead code; keep only the
    # bumped version (10 = asymmetric per-edge padding support).
    return (10, self.openmp, blas_header_version())
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -439,7 +439,7 @@ class BaseCorrMM(gof.OpenMPOp):
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width)
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
// height and width: weights = (bottom + pad_l + pad_r - (top - 1) * sample - 1) / dil + 1
out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
if (unshared){
odim = 6;
......
......@@ -11,6 +11,7 @@ import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr, conv
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
from theano.tensor.nnet.tests.test_abstract_conv import TestAsymmetricPadding
class TestCorr2D(utt.InferShapeTester):
......@@ -462,6 +463,16 @@ class TestUnsharedCorr2d(TestUnsharedConv):
conv2d_gradi_op = corr.CorrMM_gradInputs
class TestAsymmetricCorr(TestAsymmetricPadding):
    """Run the shared asymmetric-padding convolution tests against the
    CPU CorrMM ops."""

    # Under FAST_COMPILE the ops under test would be skipped, so force a
    # FAST_RUN mode (GPU transfers excluded); otherwise use the default mode.
    if theano.config.mode == "FAST_COMPILE":
        mode = theano.compile.get_mode("FAST_RUN").excluding('gpuarray')
    else:
        mode = None
    conv2d_op = corr.CorrMM
    conv2d_gradw_op = corr.CorrMM_gradWeights
    conv2d_gradi_op = corr.CorrMM_gradInputs
if __name__ == '__main__':
t = TestCorr2D('setUp')
......
Markdown is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment