Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
ea62004b
提交
ea62004b
authored
9月 15, 2017
作者:
Frédéric Bastien
提交者:
GitHub
9月 15, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #6331 from vikramnitin9/dilated_causal
Dilated causal convolution
上级
c3c477df
321e2fc6
隐藏空白字符变更
内嵌
并排
正在显示
10 个修改的文件
包含
725 行增加
和
273 行删除
+725
-273
blas.py
theano/gpuarray/blas.py
+65
-42
corr_gemm.c
theano/gpuarray/c_code/corr_gemm.c
+33
-30
dnn.py
theano/gpuarray/dnn.py
+16
-0
test_gemmcorr.py
theano/gpuarray/tests/test_gemmcorr.py
+12
-0
__init__.py
theano/tensor/nnet/__init__.py
+6
-3
abstract_conv.py
theano/tensor/nnet/abstract_conv.py
+295
-105
corr_gemm.c
theano/tensor/nnet/c_code/corr_gemm.c
+21
-18
corr.py
theano/tensor/nnet/corr.py
+105
-75
test_abstract_conv.py
theano/tensor/nnet/tests/test_abstract_conv.py
+154
-0
test_corr.py
theano/tensor/nnet/tests/test_corr.py
+18
-0
没有找到文件。
theano/gpuarray/blas.py
浏览文件 @
ea62004b
...
...
@@ -449,8 +449,8 @@ class BaseGpuCorrMM(CGpuKernelBase):
Parameters
----------
border_mode : {'valid', 'full', 'half'}
Additionally, the padding size could be directly specified by an integer
or a pair of integers
Additionally, the padding size could be directly specified by an integer
,
a pair of integers, or two pairs of integers.
subsample
Perform subsampling of the output (default: (1, 1)).
filter_dilation
...
...
@@ -468,16 +468,33 @@ class BaseGpuCorrMM(CGpuKernelBase):
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
),
filter_dilation
=
(
1
,
1
),
num_groups
=
1
,
unshared
=
False
):
if
isinstance
(
border_mode
,
integer_types
):
border_mode
=
(
border_mode
,
border_mode
)
if
isinstance
(
border_mode
,
tuple
):
pad_h
,
pad_w
=
map
(
int
,
border_mode
)
border_mode
=
(
pad_h
,
pad_w
)
if
not
((
isinstance
(
border_mode
,
tuple
)
and
min
(
border_mode
)
>=
0
)
or
border_mode
in
(
'valid'
,
'full'
,
'half'
)):
if
border_mode
<
0
:
raise
ValueError
(
'invalid border_mode {}, which must be a '
'non-negative integer'
.
format
(
border_mode
))
border_mode
=
((
border_mode
,
border_mode
),)
*
2
elif
isinstance
(
border_mode
,
tuple
):
if
len
(
border_mode
)
!=
2
:
raise
ValueError
(
'invalid border_mode {} which must be a '
'tuple of length 2'
.
format
(
border_mode
))
border
=
()
for
mode
in
border_mode
:
if
isinstance
(
mode
,
tuple
)
and
len
(
mode
)
==
2
and
\
min
(
mode
)
>=
0
:
border
+=
((
int
(
mode
[
0
]),
int
(
mode
[
1
])),)
elif
mode
>=
0
:
border
+=
((
int
(
mode
),
int
(
mode
)),)
else
:
raise
ValueError
(
'invalid border mode {}. The tuple can only contain '
'integers or tuples of length 2'
.
format
(
border_mode
))
border_mode
=
border
elif
border_mode
not
in
(
'valid'
,
'full'
,
'half'
):
raise
ValueError
(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a
pair of
'
'
integers
'
.
format
(
border_mode
))
'"valid", "full", "half", an integer or a
tuple
'
'
of length 2
'
.
format
(
border_mode
))
self
.
border_mode
=
border_mode
if
len
(
subsample
)
!=
2
:
raise
ValueError
(
"subsample must have two elements"
)
...
...
@@ -495,7 +512,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def
pad
(
self
):
if
self
.
border_mode
!=
'valid'
:
return
self
.
border_mode
return
(
0
,
0
)
return
(
(
0
,
0
),)
*
2
def
__str__
(
self
):
return
'
%
s{
%
s,
%
s,
%
s,
%
s,
%
s}'
%
(
...
...
@@ -537,7 +554,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def
c_code_cache_version
(
self
):
# Raise this whenever modifying the C code (including the file).
return
(
1
1
,)
return
(
1
2
,)
def
c_code_helper
(
self
,
bottom
,
weights
,
top
,
direction
,
sub
,
height
=
None
,
width
=
None
):
"""
...
...
@@ -587,14 +604,14 @@ class BaseGpuCorrMM(CGpuKernelBase):
numgroups
=
self
.
num_groups
unshared
=
int
(
self
.
unshared
)
if
self
.
border_mode
==
"half"
:
padH
=
padW
=
-
1
padH
_l
=
padH_r
=
padW_l
=
padW_r
=
-
1
elif
self
.
border_mode
==
"full"
:
padH
=
padW
=
-
2
padH
_l
=
padH_r
=
padW_l
=
padW_r
=
-
2
elif
isinstance
(
self
.
border_mode
,
tuple
):
padH
,
padW
=
self
.
border_mode
(
padH_l
,
padH_r
),
(
padW_l
,
padW_r
)
=
self
.
border_mode
else
:
assert
self
.
border_mode
==
"valid"
padH
=
padW
=
0
padH
_l
=
padH_r
=
padW_l
=
padW_r
=
0
if
direction
==
"forward"
:
direction
=
0
out
=
top
...
...
@@ -613,13 +630,13 @@ class BaseGpuCorrMM(CGpuKernelBase):
if
height
:
height
=
'(*(npy_int*)(PyArray_DATA(
%
s)))'
%
height
else
:
if
((
direction
!=
0
)
and
(
dH
!=
1
))
or
((
direction
==
1
)
and
(
padH
==
-
1
)):
if
((
direction
!=
0
)
and
(
dH
!=
1
))
or
((
direction
==
1
)
and
(
padH
_l
==
-
1
or
padH_r
==
-
1
)):
raise
ValueError
(
"height must be given for backprop with vertical sampling or pad='half'"
)
height
=
'-1'
if
width
:
width
=
'(*(npy_int*)(PyArray_DATA(
%
s)))'
%
width
else
:
if
((
direction
!=
0
)
and
(
dW
!=
1
))
or
((
direction
==
1
)
and
(
padW
==
-
1
)):
if
((
direction
!=
0
)
and
(
dW
!=
1
))
or
((
direction
==
1
)
and
(
padW
_l
==
-
1
or
padW_r
==
-
1
)):
raise
ValueError
(
"width must be given for backprop with horizontal sampling or pad='half'"
)
width
=
'-1'
...
...
@@ -635,8 +652,10 @@ class BaseGpuCorrMM(CGpuKernelBase):
size_t dW =
%(dW)
s;
size_t dilH =
%(dilH)
s;
size_t dilW =
%(dilW)
s;
int padH =
%(padH)
s;
int padW =
%(padW)
s;
int padH_l =
%(padH_l)
s;
int padH_r =
%(padH_r)
s;
int padW_l =
%(padW_l)
s;
int padW_r =
%(padW_r)
s;
int numgroups =
%(numgroups)
s;
int unshared =
%(unshared)
s;
...
...
@@ -662,22 +681,22 @@ class BaseGpuCorrMM(CGpuKernelBase):
// kernel height is specified (perhaps vertical subsampling or half padding)
kH =
%(height)
s;
}
else if (padH == -2) {
else if (padH
_l == -2 || padH_r
== -2) {
// vertical full padding, we can infer the kernel height
kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1;
}
else {
// explicit padding, we can infer the kernel height
kH = (PyGpuArray_DIMS(bottom)[2] +
2*padH
- (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
kH = (PyGpuArray_DIMS(bottom)[2] +
padH_l + padH_r
- (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ;
}
if (
%(width)
s != -1) {
kW =
%(width)
s;
}
else if (padW == -2) {
else if (padW
_l == -2 || padW_r
== -2) {
kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
else {
kW = (PyGpuArray_DIMS(bottom)[3] +
2*padW
- (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
kW = (PyGpuArray_DIMS(bottom)[3] +
padW_l + padW_r
- (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
}
...
...
@@ -686,23 +705,23 @@ class BaseGpuCorrMM(CGpuKernelBase):
dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested
if (padH == -1) { // vertical half padding
padH = dil_kH / 2;
if (padH
_l == -1 || padH_r
== -1) { // vertical half padding
padH
_l = padH_r
= dil_kH / 2;
}
else if (padH == -2) { // vertical full padding
padH = dil_kH - 1;
else if (padH
_l == -2 || padH_r
== -2) { // vertical full padding
padH
_l = padH_r
= dil_kH - 1;
}
else if (padH < 0) {
else if (padH
_l < 0 || padH_r
< 0) {
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2");
%(fail)
s
}
if (padW == -1) { // horizontal half padding
padW = dil_kW / 2;
if (padW
_l == -1 || padW_r
== -1) { // horizontal half padding
padW
_l = padW_r
= dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = dil_kW - 1;
else if (padW
_l == -2 || padW_r
== -2) { // horizontal full padding
padW
_l = padW_r
= dil_kW - 1;
}
else if (padW < 0) {
else if (padW
_l < 0 || padW_r
< 0) {
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2");
%(fail)
s
}
...
...
@@ -718,11 +737,11 @@ class BaseGpuCorrMM(CGpuKernelBase):
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom +
2*pad
- ((weight-1)*dil + 1)) / sample + 1
// height and width: top = (bottom +
pad_l + pad_r
- ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = PyGpuArray_DIMS(bottom)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[0];
out_dim[2] = (PyGpuArray_DIMS(bottom)[2] +
2*padH
- ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (PyGpuArray_DIMS(bottom)[3] +
2*padW
- ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
out_dim[2] = (PyGpuArray_DIMS(bottom)[2] +
padH_l + padH_r
- ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (PyGpuArray_DIMS(bottom)[3] +
padW_l + padW_r
- ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
out_typecode = bottom->ga.typecode;
out_context = bottom->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
...
...
@@ -810,8 +829,8 @@ class BaseGpuCorrMM(CGpuKernelBase):
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = PyGpuArray_DIMS(top)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups;
out_dim[2] = (
%(height)
s != -1) ?
%(height)
s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 -
2*padH
;
out_dim[3] = (
%(width)
s != -1) ?
%(width)
s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 -
2*padW
;
out_dim[2] = (
%(height)
s != -1) ?
%(height)
s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 -
padH_l - padH_r
;
out_dim[3] = (
%(width)
s != -1) ?
%(width)
s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 -
padW_l - padW_r
;
out_typecode = top->ga.typecode;
out_context = top->context;
if (unshared) {
...
...
@@ -884,7 +903,8 @@ class BaseGpuCorrMM(CGpuKernelBase):
}
// Call GPU code
out2 = corrMM(
%(bottom)
s,
%(weights)
s,
%(top)
s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared);
out2 = corrMM(
%(bottom)
s,
%(weights)
s,
%(top)
s, direction, dH, dW, dilH, dilW,
padH_l, padH_r, padW_l, padW_r, numgroups, unshared);
if (out2==NULL){
%(fail)
s
}
...
...
@@ -907,8 +927,11 @@ class GpuCorrMM(BaseGpuCorrMM):
``'valid'`` for ``(0, 0)`` (valid convolution, no padding), ``'full'``
for ``(kernel_rows - 1, kernel_columns - 1)`` (full convolution),
``'half'`` for ``(kernel_rows // 2, kernel_columns // 2)`` (same
convolution for odd-sized kernels). Note that the two widths are each
applied twice, once per side (left and right, top and bottom).
convolution for odd-sized kernels).
If it is a tuple containing 2 pairs of integers, then these specify
the padding to be applied on each side ((left, right), (top, bottom)).
Otherwise, each width is applied twice, once per side (left and right,
top and bottom).
subsample
The subsample operation applied to each output image.
Should be a tuple with 2 elements.
...
...
theano/gpuarray/c_code/corr_gemm.c
浏览文件 @
ea62004b
...
...
@@ -43,7 +43,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
const
ga_size
height
,
const
ga_size
width
,
const
ga_size
kernel_h
,
const
ga_size
kernel_w
,
const
ga_size
dilation_h
,
const
ga_size
dilation_w
,
const
ga_size
pad_h
,
const
ga_size
pad_w
,
const
ga_size
pad_h
l
,
const
ga_size
pad_wl
,
const
ga_size
stride_h
,
const
ga_size
stride_w
,
const
ga_size
height_col
,
const
ga_size
width_col
,
GLOBAL_MEM
DTYPE_INPUT_0
*
data_col
,
...
...
@@ -58,8 +58,8 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
const
ga_size
w_col
=
index
%
width_col
;
const
ga_size
c_im
=
h_index
/
height_col
;
const
ga_size
c_col
=
c_im
*
kernel_h
*
kernel_w
;
const
ga_size
h_offset
=
h_col
*
stride_h
-
pad_h
;
const
ga_size
w_offset
=
w_col
*
stride_w
-
pad_w
;
const
ga_size
h_offset
=
h_col
*
stride_h
-
pad_h
l
;
const
ga_size
w_offset
=
w_col
*
stride_w
-
pad_w
l
;
GLOBAL_MEM
DTYPE_INPUT_0
*
data_col_ptr
=
data_col
;
data_col_ptr
+=
(
c_col
*
height_col
+
h_col
)
*
width_col
+
w_col
;
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_im_ptr
=
data_im
+
data_im_offset
;
...
...
@@ -88,7 +88,7 @@ KERNEL void im2col_kernel(const ga_size n,
// data_im_offset is an offset of elements in the array
const
ga_size
height
,
const
ga_size
width
,
const
ga_size
kernel_h
,
const
ga_size
kernel_w
,
const
ga_size
pad_h
,
const
ga_size
pad_w
,
const
ga_size
pad_h
l
,
const
ga_size
pad_wl
,
const
ga_size
stride_h
,
const
ga_size
stride_w
,
const
ga_size
height_col
,
const
ga_size
width_col
,
GLOBAL_MEM
DTYPE_INPUT_0
*
data_col
,
...
...
@@ -103,8 +103,8 @@ KERNEL void im2col_kernel(const ga_size n,
const
ga_size
w_col
=
index
%
width_col
;
const
ga_size
c_im
=
h_index
/
height_col
;
const
ga_size
c_col
=
c_im
*
kernel_h
*
kernel_w
;
const
ga_size
h_offset
=
h_col
*
stride_h
-
pad_h
;
const
ga_size
w_offset
=
w_col
*
stride_w
-
pad_w
;
const
ga_size
h_offset
=
h_col
*
stride_h
-
pad_h
l
;
const
ga_size
w_offset
=
w_col
*
stride_w
-
pad_w
l
;
GLOBAL_MEM
DTYPE_INPUT_0
*
data_col_ptr
=
data_col
;
data_col_ptr
+=
(
c_col
*
height_col
+
h_col
)
*
width_col
+
w_col
;
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_im_ptr
=
data_im
+
data_im_offset
;
...
...
@@ -131,7 +131,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
const
ga_size
height
,
const
ga_size
width
,
const
ga_size
channels
,
const
ga_size
kernel_h
,
const
ga_size
kernel_w
,
const
ga_size
dilation_h
,
const
ga_size
dilation_w
,
const
ga_size
pad_h
,
const
ga_size
pad_w
,
const
ga_size
pad_h
l
,
const
ga_size
pad_wl
,
const
ga_size
stride_h
,
const
ga_size
stride_w
,
const
ga_size
height_col
,
const
ga_size
width_col
,
GLOBAL_MEM
DTYPE_INPUT_0
*
data_im
,
...
...
@@ -145,8 +145,8 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
for
(
ga_size
index
=
GID_0
*
LDIM_0
+
LID_0
;
index
<
(
n
);
index
+=
LDIM_0
*
GDIM_0
)
{
DTYPE_INPUT_0
val
=
0
;
const
ga_size
w_im
=
index
%
width
+
pad_w
;
const
ga_size
h_im
=
(
index
/
width
)
%
height
+
pad_h
;
const
ga_size
w_im
=
index
%
width
+
pad_w
l
;
const
ga_size
h_im
=
(
index
/
width
)
%
height
+
pad_h
l
;
const
ga_size
c_im
=
index
/
(
width
*
height
);
ga_size
kernel_extent_w
=
(
kernel_w
-
1
)
*
dilation_w
+
1
;
ga_size
kernel_extent_h
=
(
kernel_h
-
1
)
*
dilation_h
+
1
;
...
...
@@ -182,7 +182,7 @@ KERNEL void col2im_kernel(const ga_size n,
GLOBAL_MEM
const
DTYPE_INPUT_0
*
data_col
,
const
ga_size
offset_col
,
const
ga_size
height
,
const
ga_size
width
,
const
ga_size
channels
,
const
ga_size
kernel_h
,
const
ga_size
kernel_w
,
const
ga_size
pad_h
,
const
ga_size
pad_w
,
const
ga_size
pad_h
l
,
const
ga_size
pad_wl
,
const
ga_size
stride_h
,
const
ga_size
stride_w
,
const
ga_size
height_col
,
const
ga_size
width_col
,
GLOBAL_MEM
DTYPE_INPUT_0
*
data_im
,
...
...
@@ -196,8 +196,8 @@ KERNEL void col2im_kernel(const ga_size n,
for
(
ga_size
index
=
GID_0
*
LDIM_0
+
LID_0
;
index
<
(
n
);
index
+=
LDIM_0
*
GDIM_0
)
{
DTYPE_INPUT_0
val
=
0
;
const
ga_size
w_im
=
index
%
width
+
pad_w
;
const
ga_size
h_im
=
(
index
/
width
)
%
height
+
pad_h
;
const
ga_size
w_im
=
index
%
width
+
pad_w
l
;
const
ga_size
h_im
=
(
index
/
width
)
%
height
+
pad_h
l
;
const
ga_size
c_im
=
index
/
(
width
*
height
);
// compute the start and end of the output
const
ga_size
w_col_start
=
...
...
@@ -259,15 +259,16 @@ int rgemm(cb_order o, cb_transpose tA, cb_transpose tB,
int
im2col
(
GpuArray
*
data_im
,
const
size_t
data_im_offset
,
const
size_t
channels
,
const
size_t
height
,
const
size_t
width
,
const
size_t
kernel_h
,
const
size_t
kernel_w
,
const
size_t
dilation_h
,
const
size_t
dilation_w
,
const
size_t
pad_h
,
const
size_t
pad_w
,
const
size_t
pad_hl
,
const
size_t
pad_hr
,
const
size_t
pad_wl
,
const
size_t
pad_wr
,
const
size_t
stride_h
,
const
size_t
stride_w
,
GpuArray
*
data_col
)
{
// We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid.
size_t
dil_kernel_h
=
(
kernel_h
-
1
)
*
dilation_h
+
1
;
size_t
dil_kernel_w
=
(
kernel_w
-
1
)
*
dilation_w
+
1
;
size_t
height_col
=
(
height
+
2
*
pad_h
-
dil_kernel_h
)
/
stride_h
+
1
;
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_kernel_w
)
/
stride_w
+
1
;
size_t
height_col
=
(
height
+
pad_hl
+
pad_hr
-
dil_kernel_h
)
/
stride_h
+
1
;
size_t
width_col
=
(
width
+
pad_wl
+
pad_wr
-
dil_kernel_w
)
/
stride_w
+
1
;
size_t
num_kernels
=
channels
*
height_col
*
width_col
;
int
err
;
if
(
dilation_h
!=
1
||
dilation_w
!=
1
)
{
...
...
@@ -275,7 +276,7 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels
1
,
&
num_kernels
,
0
,
num_kernels
,
data_im
->
data
,
data_im
->
offset
,
data_im_offset
,
height
,
width
,
kernel_h
,
kernel_w
,
dilation_h
,
dilation_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
dilation_h
,
dilation_w
,
pad_h
l
,
pad_wl
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_col
->
data
,
data_col
->
offset
);
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
...
...
@@ -287,7 +288,7 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels
1
,
&
num_kernels
,
0
,
num_kernels
,
data_im
->
data
,
data_im
->
offset
,
data_im_offset
,
height
,
width
,
kernel_h
,
kernel_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
height_col
,
pad_h
l
,
pad_wl
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_col
->
data
,
data_col
->
offset
);
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
...
...
@@ -301,12 +302,12 @@ int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels
int
col2im
(
GpuArray
*
data_col
,
const
size_t
channels
,
const
size_t
height
,
const
size_t
width
,
const
size_t
patch_h
,
const
size_t
patch_w
,
const
size_t
dilation_h
,
const
size_t
dilation_w
,
const
size_t
pad_h
,
const
size_t
pad_w
,
const
size_t
stride_h
,
const
size_t
stride_w
,
GpuArray
*
data_im
,
const
size_t
data_im_offset
)
{
const
size_t
pad_h
l
,
const
size_t
pad_hr
,
const
size_t
pad_wl
,
const
size_t
pad_wr
,
const
size_t
stride_
h
,
const
size_t
stride_
w
,
GpuArray
*
data_im
,
const
size_t
data_im_offset
)
{
size_t
dil_patch_h
=
(
patch_h
-
1
)
*
dilation_h
+
1
;
size_t
dil_patch_w
=
(
patch_w
-
1
)
*
dilation_w
+
1
;
size_t
height_col
=
(
height
+
2
*
pad_h
-
dil_patch_h
)
/
stride_h
+
1
;
size_t
width_col
=
(
width
+
2
*
pad_w
-
dil_patch_w
)
/
stride_w
+
1
;
size_t
height_col
=
(
height
+
pad_hl
+
pad_hr
-
dil_patch_h
)
/
stride_h
+
1
;
size_t
width_col
=
(
width
+
pad_wl
+
pad_wr
-
dil_patch_w
)
/
stride_w
+
1
;
size_t
num_kernels
=
channels
*
height
*
width
;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
...
...
@@ -316,7 +317,7 @@ int col2im(GpuArray *data_col, const size_t channels,
1
,
&
num_kernels
,
0
,
num_kernels
,
data_col
->
data
,
data_col
->
offset
,
height
,
width
,
channels
,
patch_h
,
patch_w
,
dilation_h
,
dilation_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
pad_h
l
,
pad_wl
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_im
->
data
,
data_im
->
offset
,
data_im_offset
);
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
...
...
@@ -328,7 +329,7 @@ int col2im(GpuArray *data_col, const size_t channels,
1
,
&
num_kernels
,
0
,
num_kernels
,
data_col
->
data
,
data_col
->
offset
,
height
,
width
,
channels
,
patch_h
,
patch_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
pad_h
l
,
pad_wl
,
stride_h
,
stride_w
,
height_col
,
width_col
,
data_im
->
data
,
data_im
->
offset
,
data_im_offset
);
if
(
err
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
...
...
@@ -352,8 +353,10 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const
size_t
dW
=
1
,
const
size_t
dilH
=
1
,
const
size_t
dilW
=
1
,
const
size_t
padH
=
0
,
const
size_t
padW
=
0
,
const
size_t
padH_l
=
0
,
const
size_t
padH_r
=
0
,
const
size_t
padW_l
=
0
,
const
size_t
padW_r
=
0
,
const
size_t
numgroups
=
1
,
const
size_t
unshared
=
0
)
{
...
...
@@ -448,8 +451,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const
size_t
dil_kH
=
(
kH
-
1
)
*
dilH
+
1
;
const
size_t
dil_kW
=
(
kW
-
1
)
*
dilW
+
1
;
// top: (batchSize, nFilters, topHeight, topWidth)
const
size_t
topHeightNoDH
=
(
bottomHeight
+
2
*
padH
-
dil_kH
);
const
size_t
topWidthNoDW
=
(
bottomWidth
+
2
*
padW
-
dil_kW
);
const
size_t
topHeightNoDH
=
(
bottomHeight
+
padH_l
+
padH_r
-
dil_kH
);
const
size_t
topWidthNoDW
=
(
bottomWidth
+
padW_l
+
padW_r
-
dil_kW
);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
...
...
@@ -563,7 +566,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
err
=
im2col
(
&
bottom
->
ga
,
n
*
batch_bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
dH
,
dW
,
&
col
->
ga
);
padH
_l
,
padH_r
,
padW_l
,
padW_r
,
dH
,
dW
,
&
col
->
ga
);
if
(
err
!=
GA_NO_ERROR
)
{
Py_DECREF
(
col
);
return
NULL
;
...
...
@@ -623,7 +626,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
err
=
im2col
(
&
bottom
->
ga
,
n
*
batch_bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
dH
,
dW
,
&
col
->
ga
);
padH
_l
,
padH_r
,
padW_l
,
padW_r
,
dH
,
dW
,
&
col
->
ga
);
if
(
err
!=
GA_NO_ERROR
)
{
Py_DECREF
(
col
);
return
NULL
;
...
...
@@ -717,7 +720,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
}
// col2im back to the data
err
=
col2im
(
&
col
->
ga
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
kH
,
kW
,
dilH
,
dilW
,
padH
_l
,
padH_r
,
padW_l
,
padW_r
,
dH
,
dW
,
&
bottom
->
ga
,
n
*
batch_bottom_stride
);
if
(
err
!=
GA_NO_ERROR
)
{
Py_DECREF
(
col
);
...
...
theano/gpuarray/dnn.py
浏览文件 @
ea62004b
...
...
@@ -3084,6 +3084,10 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if
op
.
unshared
:
return
None
if
isinstance
(
op
.
border_mode
,
tuple
)
and
any
(
isinstance
(
p
,
tuple
)
for
p
in
op
.
border_mode
):
# Asymmetric padding not yet supported
return
None
inp1
=
inputs
[
0
]
inp2
=
inputs
[
1
]
...
...
@@ -3180,6 +3184,9 @@ def local_abstractconv_cudnn(node):
return
if
node
.
op
.
unshared
:
return
None
if
isinstance
(
node
.
op
.
border_mode
,
tuple
)
and
any
(
isinstance
(
p
,
tuple
)
for
p
in
node
.
op
.
border_mode
):
# Asymmetric padding not yet supported
return
None
if
isinstance
(
node
.
op
,
AbstractConv2d
):
with
inherit_stack_trace
(
node
.
outputs
):
return
local_abstractconv_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
...
...
@@ -3198,6 +3205,9 @@ def local_abstractconv_cudnn_alt(node):
return
None
if
node
.
op
.
unshared
:
return
None
if
isinstance
(
node
.
op
.
border_mode
,
tuple
)
and
any
(
isinstance
(
p
,
tuple
)
for
p
in
node
.
op
.
border_mode
):
# Asymmetric padding not yet supported
return
None
inp1
=
node
.
inputs
[
0
]
inp2
=
node
.
inputs
[
1
]
...
...
@@ -3407,6 +3417,9 @@ def local_abstractconv_gw_cudnn(node):
return
if
node
.
op
.
unshared
:
return
None
if
isinstance
(
node
.
op
.
border_mode
,
tuple
)
and
any
(
isinstance
(
p
,
tuple
)
for
p
in
node
.
op
.
border_mode
):
# Asymmetric padding not yet supported
return
None
if
isinstance
(
node
.
op
,
AbstractConv2d_gradWeights
):
with
inherit_stack_trace
(
node
.
outputs
):
return
local_abstractconv_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
...
...
@@ -3422,6 +3435,9 @@ def local_abstractconv_gi_cudnn(node):
return
if
node
.
op
.
unshared
:
return
None
if
isinstance
(
node
.
op
.
border_mode
,
tuple
)
and
any
(
isinstance
(
p
,
tuple
)
for
p
in
node
.
op
.
border_mode
):
# Asymmetric padding not yet supported
return
None
if
isinstance
(
node
.
op
,
AbstractConv2d_gradInputs
):
with
inherit_stack_trace
(
node
.
outputs
):
return
local_abstractconv_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
...
...
theano/gpuarray/tests/test_gemmcorr.py
浏览文件 @
ea62004b
...
...
@@ -12,6 +12,7 @@ from ..type import gpuarray_shared_constructor
from
..blas
import
GpuCorrMM
,
GpuCorrMM_gradWeights
,
GpuCorrMM_gradInputs
from
.config
import
mode_with_gpu
,
mode_without_gpu
,
ref_cast
from
theano.tensor.nnet.tests.test_abstract_conv
import
Grouped_conv_noOptim
,
TestUnsharedConv
from
theano.tensor.nnet.tests.test_abstract_conv
import
TestAsymmetricPadding
,
TestCausalConv
class
TestCorrMM
(
unittest
.
TestCase
):
...
...
@@ -272,3 +273,14 @@ class TestUnsharedGpuCorr2d(TestUnsharedConv):
conv2d_op
=
GpuCorrMM
conv2d_gradw_op
=
GpuCorrMM_gradWeights
conv2d_gradi_op
=
GpuCorrMM_gradInputs
class
TestAsymmetricGpu
(
TestAsymmetricPadding
):
mode
=
mode_with_gpu
conv2d_op
=
GpuCorrMM
conv2d_gradw_op
=
GpuCorrMM_gradWeights
conv2d_gradi_op
=
GpuCorrMM_gradInputs
class
TestCausalGpuCorr
(
TestCausalConv
):
mode
=
mode_with_gpu
theano/tensor/nnet/__init__.py
浏览文件 @
ea62004b
...
...
@@ -72,7 +72,7 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
You can give ``None`` for any element of the list to specify that this
element is not known at compile time.
border_mode: str, int or
tuple of two int
border_mode: str, int or
a tuple of two ints or pairs of ints
Either of the following:
``'valid'``: apply filter wherever it completely overlaps with the
...
...
@@ -85,8 +85,11 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
leads to the output shape being equal to the input shape.
``int``: pad input with a symmetric border of zeros of the given
width, then perform a valid convolution.
``(int1, int2)``: pad input with a symmetric border of ``int1`` rows
and ``int2`` columns, then perform a valid convolution.
``(int1, int2)``: (for 2D) pad input with a symmetric border of ``int1``,
``int2``, then perform a valid convolution.
``(int1, (int2, int3))`` or ``((int1, int2), int3)``: (for 2D)
pad input with one symmetric border of `int1`` or ``int3``, and
one asymmetric border of ``(int2, int3)`` or ``(int1, int2)``.
subsample: tuple of len 2
Factor by which to subsample the output.
...
...
theano/tensor/nnet/abstract_conv.py
浏览文件 @
ea62004b
...
...
@@ -53,9 +53,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
input channels, height and width of the kernel.
None where undefined.
border_mode: string, int (symbolic or numeric) or tuple of int (symbolic
or numeric). If it is a string, it must be 'valid', 'half' or 'full'.
If it is a tuple, its two (or three) elements respectively correspond
to the padding on height and width (and possibly depth) axis.
or numeric) or pairs of ints. If it is a string, it must be 'valid',
'half' or 'full'. If it is a tuple, its two (or three) elements respectively
correspond to the padding on height and width (and possibly depth)
axis. For asymmetric padding, provide a pair of ints for each dimension.
subsample: tuple of int (symbolic or numeric). Its two or three elements
espectively correspond to the subsampling on height and width (and
possibly depth) axis.
...
...
@@ -103,9 +104,11 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
given axis. None if undefined.
kernel_shape: int or None. Corresponds to the kernel shape on a given
axis. None if undefined.
border_mode: string
or int
. If it is a string, it must be
border_mode: string
, int or tuple of 2 ints
. If it is a string, it must be
'valid', 'half' or 'full'. If it is an integer, it must correspond to
the padding on the considered axis.
the padding on the considered axis. If it is a tuple, its two elements
must correspond to the asymmetric padding (e.g., left and right) on
the considered axis.
subsample: int. It must correspond to the subsampling on the
considered axis.
dilation: int. It must correspond to the dilation on the
...
...
@@ -123,22 +126,26 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
# Implicit dilated kernel shape
dil_kernel_shape
=
(
kernel_shape
-
1
)
*
dilation
+
1
if
border_mode
==
"half"
:
pad
=
dil_kernel_shape
//
2
pad
_l
=
pad_r
=
dil_kernel_shape
//
2
elif
border_mode
==
"full"
:
pad
=
dil_kernel_shape
-
1
pad
_l
=
pad_r
=
dil_kernel_shape
-
1
elif
border_mode
==
"valid"
:
pad
=
0
pad
_l
=
pad_r
=
0
else
:
pad
=
border_mode
if
pad
<
0
:
if
isinstance
(
border_mode
,
tuple
):
pad_l
,
pad_r
=
border_mode
else
:
pad_l
=
pad_r
=
border_mode
if
pad_l
<
0
or
pad_r
<
0
:
raise
ValueError
(
"border_mode must be >= 0"
)
# In case of symbolic shape, we want to build the smallest graph
# (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
if
pad
==
0
:
out_shp
=
(
image_shape
-
dil_kernel_shape
)
else
:
out_shp
=
(
image_shape
+
2
*
pad
-
dil_kernel_shape
)
out_shp
=
(
image_shape
-
dil_kernel_shape
)
if
pad_l
!=
0
:
out_shp
+=
pad_l
if
pad_r
!=
0
:
out_shp
+=
pad_r
if
subsample
!=
1
:
out_shp
=
out_shp
//
subsample
out_shp
=
out_shp
+
1
...
...
@@ -168,9 +175,10 @@ def get_conv_gradweights_shape(image_shape, top_shape,
to: batch size, number of output channels, height and width (and
possibly depth) of the image. None where undefined.
border_mode: string, int (symbolic or numeric) or tuple of int (symbolic
or numeric). If it is a string, it must be 'valid', 'half' or 'full'.
If it is a tuple, its two (or three) elements respectively correspond
to the padding on height and width (and possibly depth) axis.
or numeric) or pairs of ints. If it is a string, it must be 'valid',
'half' or 'full'. If it is a tuple, its two (or three) elements respectively
correspond to the padding on height and width (and possibly depth)
axis. For asymmetric padding, provide a pair of ints for each dimension.
subsample: tuple of int (symbolic or numeric). Its two or three elements
respectively correspond to the subsampling on height and width (and
possibly depth) axis.
...
...
@@ -227,9 +235,11 @@ def get_conv_gradweights_shape_1axis(image_shape, top_shape, border_mode,
given axis. None if undefined.
top_shape: int or None. Corresponds to the top shape on a given axis.
None if undefined.
border_mode: string
or int
. If it is a string, it must be
border_mode: string
, int or tuple of 2 ints
. If it is a string, it must be
'valid', 'half' or 'full'. If it is an integer, it must correspond to
the padding on the considered axis.
the padding on the considered axis. If it is a tuple, its two elements
must correspond to the asymmetric padding (e.g., left and right) on
the considered axis.
subsample: int. It must correspond to the subsampling on the
considered axis.
dilation: int. It must correspond to the dilation on the
...
...
@@ -252,9 +262,14 @@ def get_conv_gradweights_shape_1axis(image_shape, top_shape, border_mode,
elif
border_mode
==
"valid"
:
kernel_shape
=
image_shape
-
top_shape
else
:
if
border_mode
<
0
:
if
isinstance
(
border_mode
,
tuple
):
pad_l
,
pad_r
=
border_mode
else
:
pad_l
=
pad_r
=
border_mode
if
pad_l
<
0
or
pad_r
<
0
:
raise
ValueError
(
"border_mode must be >= 0"
)
kernel_shape
=
(
image_shape
+
2
*
border_mode
-
top_shape
)
kernel_shape
=
(
image_shape
+
pad_l
+
pad_r
-
top_shape
)
if
dilation
>
1
:
kernel_shape
=
kernel_shape
/
dilation
...
...
@@ -284,9 +299,10 @@ def get_conv_gradinputs_shape(kernel_shape, top_shape,
to: batch size, number of output channels, height and width (and
possibly depth) of the image. None where undefined.
border_mode: string, int (symbolic or numeric) or tuple of int (symbolic
or numeric). If it is a string, it must be 'valid', 'half' or 'full'.
If it is a tuple, its two (or three) elements respectively correspond
to the padding on height and width (and possibly depth) axis.
or numeric) or pairs of ints. If it is a string, it must be 'valid',
'half' or 'full'. If it is a tuple, its two (or three) elements respectively
correspond to the padding on height and width (and possibly depth)
axis. For asymmetric padding, provide a pair of ints for each dimension.
subsample: tuple of int (symbolic or numeric). Its two or three elements
respectively correspond to the subsampling on height and width (and
possibly depth) axis.
...
...
@@ -340,9 +356,11 @@ def get_conv_gradinputs_shape_1axis(kernel_shape, top_shape, border_mode,
axis. None if undefined.
top_shape: int or None. Corresponds to the top shape on a given axis.
None if undefined.
border_mode: string
or int
. If it is a string, it must be
border_mode: string
, int or tuple of 2 ints
. If it is a string, it must be
'valid', 'half' or 'full'. If it is an integer, it must correspond to
the padding on the considered axis.
the padding on the considered axis. If it is a tuple, its two elements
must correspond to the asymmetric padding (e.g., left and right) on
the considered axis.
subsample: int. It must correspond to the subsampling on the
considered axis.
dilation: int. It must correspond to the dilation on the
...
...
@@ -363,23 +381,27 @@ def get_conv_gradinputs_shape_1axis(kernel_shape, top_shape, border_mode,
# Implicit dilated kernel shape
dil_kernel_shape
=
(
kernel_shape
-
1
)
*
dilation
+
1
if
border_mode
==
"half"
:
pad
=
dil_kernel_shape
//
2
pad
_l
=
pad_r
=
dil_kernel_shape
//
2
elif
border_mode
==
"full"
:
pad
=
dil_kernel_shape
-
1
pad
_l
=
pad_r
=
dil_kernel_shape
-
1
elif
border_mode
==
"valid"
:
pad
=
0
pad
_l
=
pad_r
=
0
else
:
pad
=
border_mode
if
pad
<
0
:
if
isinstance
(
border_mode
,
tuple
):
pad_l
,
pad_r
=
border_mode
else
:
pad_l
=
pad_r
=
border_mode
if
pad_l
<
0
or
pad_r
<
0
:
raise
ValueError
(
"border_mode must be >= 0"
)
# In case of symbolic shape, we want to build the smallest graph
# image_shape = (top_shape - 1) * s - 2 * pad + dil_kernel_shape + a
# where 0 <= a < subsample, but we have checked that subsample == 1
if
pad
==
0
:
image_shape
=
(
top_shape
+
dil_kernel_shape
-
1
)
else
:
image_shape
=
(
top_shape
-
2
*
pad
+
dil_kernel_shape
-
1
)
image_shape
=
(
top_shape
+
dil_kernel_shape
-
1
)
if
pad_l
>
0
:
image_shape
-=
pad_l
if
pad_r
>
0
:
image_shape
-=
pad_r
return
image_shape
...
...
@@ -405,9 +427,10 @@ def check_conv_gradinputs_shape(image_shape, kernel_shape, output_shape,
to: batch size, number of output channels, height and width
(and possibly depth) of the output. None where undefined.
border_mode: string, int (symbolic or numeric) or tuple of int (symbolic
or numeric). If it is a string, it must be 'valid', 'half' or 'full'.
If it is a tuple, its two (or three) elements respectively correspond
to the padding on height and width (and possibly depth) axis.
or numeric) or pairs of ints. If it is a string, it must be 'valid',
'half' or 'full'. If it is a tuple, its two (or three) elements respectively
correspond to the padding on height and width (and possibly depth)
axis. For asymmetric padding, provide a pair of ints for each dimension.
subsample: tuple of int (symbolic or numeric). Its two or three elements
respectively correspond to the subsampling on height and width (and
possibly depth) axis.
...
...
@@ -533,6 +556,63 @@ def assert_shape(x, expected_shape, msg='Unexpected shape.'):
return
x
def
border_mode_to_pad
(
mode
,
convdim
,
kshp
):
"""
Computes a tuple for padding given the border_mode parameter
Parameters
----------
mode : int or tuple
One of "valid", "full", "half", an integer, or a tuple where each
member is either an integer or a tuple of 2 positive integers.
convdim : int
The dimensionality of the convolution.
kshp : List/tuple of length 'convdim', indicating the size of the
kernel in the spatial dimensions.
Returns
-------
A tuple containing 'convdim' elements, each of which is a tuple of
two positive integers corresponding to the padding on the left
and the right sides respectively.
"""
if
isinstance
(
mode
,
tuple
):
if
len
(
mode
)
!=
convdim
:
raise
ValueError
(
'invalid border_mode {} which must be a '
'tuple of length {}'
.
format
(
mode
,
convdim
))
border
=
()
for
m
in
mode
:
if
isinstance
(
m
,
integer_types
)
and
m
>=
0
:
border
+=
((
m
,
m
),)
elif
isinstance
(
m
,
tuple
)
and
min
(
m
)
>=
0
and
\
all
(
isinstance
(
b
,
integer_types
)
for
b
in
m
):
if
len
(
m
)
!=
2
:
raise
NotImplementedError
(
'Asymmetric padding not implemented '
'for {}d'
.
format
(
len
(
m
)))
border
+=
((
m
[
0
],
m
[
1
]),)
else
:
raise
ValueError
(
'invalid border mode {}. The tuple can only contain '
'integers or tuples of length 2'
.
format
(
mode
))
pad
=
border
elif
mode
==
'full'
:
pad
=
tuple
((
kshp
[
i
]
-
1
,)
*
2
for
i
in
range
(
convdim
))
elif
mode
==
'half'
:
pad
=
tuple
((
kshp
[
i
]
//
2
,)
*
2
for
i
in
range
(
convdim
))
elif
mode
==
'valid'
:
pad
=
((
0
,
0
),)
*
convdim
else
:
raise
ValueError
(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple '
'of length {}'
.
format
(
mode
,
convdim
))
return
pad
def
conv2d
(
input
,
filters
,
input_shape
=
None
,
...
...
@@ -632,6 +712,12 @@ def separable_conv2d(input,
width, then perform a valid convolution.
``(int1, int2)``: pad input with a symmetric border of ``int1`` rows
and ``int2`` columns, then perform a valid convolution.
``(int1, (int2, int3))`` or ``((int1, int2), int3)``:
pad input with one symmetric border of `int1`` or ``int3``, and
one asymmetric border of ``(int2, int3)`` or ``(int1, int2)``.
``((int1, int2), (int3, int4))``: pad input with an asymmetric
border of ``(int1, int2)`` along one dimension and ``(int3, int4)``
along the second dimension.
subsample: tuple of len 2
Factor by which to subsample the output.
...
...
@@ -959,7 +1045,7 @@ def conv2d_grad_wrt_inputs(output_grad,
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that
this element is not known at compile time.
border_mode
: str, int or tuple of two int
border_mode
: str, int or a tuple of two ints or pairs of ints
Either of the following:
``'valid'``
...
...
@@ -986,6 +1072,14 @@ def conv2d_grad_wrt_inputs(output_grad,
pad input with a symmetric border of ``int1`` rows and
``int2`` columns, then perform a valid convolution.
``(int1, (int2, int3))`` or ``((int1, int2), int3)``
pad input with one symmetric border of `int1`` or ``int3``, and
one asymmetric border of ``(int2, int3)`` or ``(int1, int2)``.
``((int1, int2), (int3, int4))``
pad input with an asymmetric border of ``(int1, int2)`` along one dimension and ``(int3, int4)``
along the second dimension.
subsample : tuple of len 2
The subsampling used in the forward pass. Also called strides
elsewhere.
...
...
@@ -1245,7 +1339,7 @@ def conv2d_grad_wrt_weights(input,
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify
that this element is not known at compile time.
border_mode
: str, int or tuple of two
ints
border_mode
: str, int or a tuple of two ints or pairs of
ints
Either of the following:
``'valid'``
...
...
@@ -1271,6 +1365,14 @@ def conv2d_grad_wrt_weights(input,
``(int1, int2)``
pad input with a symmetric border of ``int1`` rows and
``int2`` columns, then perform a valid convolution.
``(int1, (int2, int3))`` or ``((int1, int2), int3)``
pad input with one symmetric border of `int1`` or ``int3``, and
one asymmetric border of ``(int2, int3)`` or ``(int1, int2)``.
``((int1, int2), (int3, int4))``
pad input with an asymmetric border of ``(int1, int2)`` along
one dimension and ``(int3, int4)`` along the second dimension.
subsample : tuple of len 2
The subsampling used in the forward pass of the convolutional
operation. Also called strides elsewhere.
...
...
@@ -1484,6 +1586,108 @@ def conv3d_grad_wrt_weights(input,
return
gradWeight_op
(
input
,
output_grad
,
filter_shape
[
-
3
:])
def
causal_conv1d
(
input
,
filters
,
filter_shape
,
input_shape
=
None
,
subsample
=
1
,
filter_flip
=
True
,
filter_dilation
=
1
,
num_groups
=
1
,
unshared
=
False
):
"""
Computes (dilated) causal convolution
The output at time t depends only on the inputs till t-1. Used for
modelling temporal data.
See [WaveNet: A Generative Model for Raw Audio, section 2.1]
(https://arxiv.org/abs/1609.03499).
Parameters
----------
input : symbolic 3D tensor
mini-batch of feature vector stacks, of shape
(batch_size, input_channels, input_length)
See the optional parameter ``input_shape``
filters : symbolic 3D tensor
Set of filters used in the CNN, of shape
(output_channels, input_channels, filter_length)
filter_shape : [None/int/Constant] * 2 + [Tensor/int/Constant]
The shape of the filters parameter.
A tuple/list of len 3, with the first two dimensions
being None or int or Constant and the last dimension being
Tensor or int or Constant.
Not optional, since the filter length is needed to calculate
the left padding for causality.
input_shape : None or [None/int/Constant] * 3
The shape of the input parameter.
None, or a tuple/list of len 3.
Optional, possibly used to choose an optimal implementation.
subsample : int
The factor by which to subsample the output. Also called strides
elsewhere.
filter_dilation : int
Factor by which to subsample (stride) the input. Also called
dilation factor.
num_groups : int
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
unshared : bool
If true, then unshared or 'locally connected' convolution will be
performed. A different filter will be used for each region of the
input.
Returns
-------
Symbolic 3D tensor.
Set of feature vectors generated by convolutional layer. Tensor is
of shape (batch_size, output_channels, output_length)
Notes
-----
:note: Currently, this is implemented with the 2D convolution ops.
"""
input
=
as_tensor_variable
(
input
)
filters
=
as_tensor_variable
(
filters
)
if
input
.
ndim
!=
3
:
raise
ValueError
(
'Input should be 3D for causal convolution.'
)
if
filters
.
ndim
!=
3
:
raise
ValueError
(
'Filters should be 3D for causal convolution'
)
input
=
input
.
dimshuffle
(
0
,
1
,
2
,
'x'
)
filters
=
filters
.
dimshuffle
(
0
,
1
,
2
,
'x'
)
if
input_shape
is
not
None
:
assert
(
len
(
input_shape
)
==
3
)
input_shape
=
tuple
(
input_shape
)
input_shape
+=
(
1
,)
assert
(
len
(
filter_shape
)
==
3
)
filter_shape
=
tuple
(
filter_shape
)
filter_shape
+=
(
1
,)
left_pad
=
filter_dilation
*
(
filter_shape
[
2
]
-
1
)
subsample
=
(
subsample
,
1
)
filter_dilation
=
(
filter_dilation
,
1
)
conv_op
=
AbstractConv2d
(
imshp
=
input_shape
,
kshp
=
filter_shape
,
border_mode
=
((
left_pad
,
0
),
0
),
subsample
=
subsample
,
filter_flip
=
filter_flip
,
filter_dilation
=
filter_dilation
,
num_groups
=
num_groups
,
unshared
=
unshared
)
output
=
conv_op
(
input
,
filters
)
return
output
[:,
:,
:,
0
]
def
bilinear_kernel_2D
(
ratio
,
normalize
=
True
):
"""Compute 2D kernel for bilinear upsampling
...
...
@@ -1689,7 +1893,7 @@ class BaseAbstractConv(Op):
element is not known at compile time.
kshp is defined w.r.t the forward conv.
border_mode: str, int or tuple of ``convdim``
ints
border_mode: str, int or a tuple of two ints or pairs of
ints
Either of the following:
``'valid'``: apply filter wherever it completely overlaps with the
...
...
@@ -1704,6 +1908,12 @@ class BaseAbstractConv(Op):
width, then perform a valid convolution.
``(int1, int2)``: (for 2D) pad input with a symmetric border of ``int1``,
``int2``, then perform a valid convolution.
``(int1, (int2, int3))`` or ``((int1, int2), int3)``: (for 2D)
pad input with one symmetric border of `int1`` or ``int3``, and
one asymmetric border of ``(int2, int3)`` or ``(int1, int2)``.
``((int1, int2), (int3, int4))``: (for 2D) pad input with an asymmetric
border of ``(int1, int2)`` along one dimension and ``(int3, int4)``
along the second dimension.
``(int1, int2, int3)``: (for 3D) pad input with a symmetric border of
``int1``, ``int2`` and ``int3``, then perform a valid convolution.
...
...
@@ -1751,21 +1961,40 @@ class BaseAbstractConv(Op):
filter_dilation
=
(
1
,)
*
convdim
if
isinstance
(
border_mode
,
integer_types
):
if
border_mode
<
0
:
raise
ValueError
(
'invalid border_mode {}, which must be a '
'non-negative integer'
.
format
(
border_mode
))
border_mode
=
(
border_mode
,)
*
convdim
if
isinstance
(
border_mode
,
tuple
):
el
if
isinstance
(
border_mode
,
tuple
):
if
len
(
border_mode
)
!=
convdim
:
raise
ValueError
(
'border mode must have exactly {} values, '
'but was {}'
.
format
(
convdim
,
border_mode
))
border_mode
=
tuple
(
map
(
int
,
border_mode
))
if
border_mode
==
(
0
,)
*
convdim
:
border_mode
=
'valid'
if
not
((
isinstance
(
border_mode
,
tuple
)
and
min
(
border_mode
)
>=
0
)
or
border_mode
in
(
'valid'
,
'full'
,
'half'
)):
'invalid border_mode {}, which must be a '
'tuple of length {}'
.
format
(
border_mode
,
convdim
))
new_border_mode
=
()
for
mode
in
border_mode
:
if
not
((
isinstance
(
mode
,
integer_types
)
and
mode
>=
0
)
or
(
isinstance
(
mode
,
tuple
)
and
len
(
mode
)
==
2
and
min
(
mode
)
>=
0
and
all
(
isinstance
(
m
,
integer_types
)
for
m
in
mode
))):
raise
ValueError
(
'invalid border mode {}. The tuple can only contain integers '
' or pairs of integers'
.
format
(
border_mode
))
if
isinstance
(
mode
,
tuple
):
if
convdim
!=
2
:
raise
NotImplementedError
(
'Asymmetric padding not implemented for {}D'
.
format
(
convdim
))
if
mode
[
0
]
==
mode
[
1
]:
mode
=
mode
[
0
]
new_border_mode
+=
(
mode
,)
border_mode
=
new_border_mode
elif
border_mode
not
in
(
'valid'
,
'full'
,
'half'
):
raise
ValueError
(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of {}'
' integers'
.
format
(
border_mode
,
convdim
))
'"valid", "full", "half", an integer or a tuple '
'of length {}'
.
format
(
border_mode
,
convdim
))
if
isinstance
(
border_mode
,
tuple
)
and
\
all
(
mode
==
(
0
,
0
)
or
mode
==
0
for
mode
in
border_mode
):
border_mode
=
'valid'
self
.
imshp
=
tuple
(
imshp
)
if
imshp
else
(
None
,)
*
(
2
+
convdim
)
for
imshp_i
in
self
.
imshp
:
...
...
@@ -2025,27 +2254,16 @@ class AbstractConv(BaseAbstractConv):
%
self
.
convdim
)
o
,
=
out_
mode
=
self
.
border_mode
pad
=
border_mode_to_pad
(
mode
,
self
.
convdim
,
dil_kernshp
)
if
not
((
isinstance
(
mode
,
tuple
)
and
min
(
mode
)
>=
0
)
or
mode
in
(
'valid'
,
'full'
,
'half'
)):
raise
ValueError
(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of'
' integers'
.
format
(
mode
))
if
mode
==
"full"
:
mode
=
tuple
(
dil_kernshp
[
i
]
-
1
for
i
in
range
(
self
.
convdim
))
elif
mode
==
"half"
:
mode
=
tuple
(
dil_kernshp
[
i
]
//
2
for
i
in
range
(
self
.
convdim
))
if
isinstance
(
mode
,
tuple
):
pad
=
tuple
(
int
(
mode
[
i
])
for
i
in
range
(
self
.
convdim
))
if
any
(
p
!=
(
0
,
0
)
for
p
in
pad
):
mode
=
"valid"
new_img
=
np
.
zeros
((
img
.
shape
[
0
],
img
.
shape
[
1
])
+
tuple
(
img
.
shape
[
i
+
2
]
+
2
*
pad
[
i
]
tuple
(
img
.
shape
[
i
+
2
]
+
pad
[
i
][
0
]
+
pad
[
i
][
1
]
for
i
in
range
(
self
.
convdim
)),
dtype
=
img
.
dtype
)
new_img
[(
slice
(
None
),
slice
(
None
))
+
tuple
(
slice
(
pad
[
i
]
,
img
.
shape
[
i
+
2
]
+
pad
[
i
])
tuple
(
slice
(
pad
[
i
]
[
0
],
img
.
shape
[
i
+
2
]
+
pad
[
i
][
0
])
for
i
in
range
(
self
.
convdim
))]
=
img
img
=
new_img
if
not
self
.
filter_flip
:
...
...
@@ -2080,7 +2298,6 @@ class AbstractConv(BaseAbstractConv):
conv_out
=
conv_out
[(
slice
(
None
),
slice
(
None
))
+
tuple
(
slice
(
None
,
None
,
self
.
subsample
[
i
])
for
i
in
range
(
self
.
convdim
))]
o
[
0
]
=
node
.
outputs
[
0
]
.
type
.
filter
(
conv_out
)
def
R_op
(
self
,
inputs
,
eval_points
):
...
...
@@ -2296,34 +2513,21 @@ class AbstractConv_gradWeights(BaseAbstractConv):
o
,
=
out_
mode
=
self
.
border_mode
if
not
((
isinstance
(
mode
,
tuple
)
and
min
(
mode
)
>=
0
)
or
mode
in
(
'valid'
,
'full'
,
'half'
)):
raise
ValueError
(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of'
' integers'
.
format
(
mode
))
if
self
.
unshared
and
self
.
convdim
!=
2
:
raise
NotImplementedError
(
'Unshared convolution not implemented for
%
dD'
%
self
.
convdim
)
dil_shape
=
tuple
((
shape
[
i
]
-
1
)
*
self
.
filter_dilation
[
i
]
+
1
for
i
in
range
(
self
.
convdim
))
if
mode
==
"full"
:
mode
=
tuple
(
dil_shape
[
i
]
-
1
for
i
in
range
(
self
.
convdim
))
elif
mode
==
"half"
:
mode
=
tuple
(
dil_shape
[
i
]
//
2
for
i
in
range
(
self
.
convdim
))
if
isinstance
(
mode
,
tuple
):
pad
=
tuple
(
int
(
mode
[
i
])
for
i
in
range
(
self
.
convdim
))
pad
=
border_mode_to_pad
(
self
.
border_mode
,
self
.
convdim
,
dil_shape
)
mode
=
"valid"
if
any
(
p
!=
(
0
,
0
)
for
p
in
pad
):
new_img
=
np
.
zeros
((
img
.
shape
[
0
],
img
.
shape
[
1
])
+
tuple
(
img
.
shape
[
i
+
2
]
+
2
*
pad
[
i
]
tuple
(
img
.
shape
[
i
+
2
]
+
pad
[
i
][
0
]
+
pad
[
i
][
1
]
for
i
in
range
(
self
.
convdim
)),
dtype
=
img
.
dtype
)
new_img
[(
slice
(
None
),
slice
(
None
))
+
tuple
(
slice
(
pad
[
i
]
,
img
.
shape
[
i
+
2
]
+
pad
[
i
])
tuple
(
slice
(
pad
[
i
]
[
0
],
img
.
shape
[
i
+
2
]
+
pad
[
i
][
0
])
for
i
in
range
(
self
.
convdim
))]
=
img
img
=
new_img
...
...
@@ -2611,16 +2815,13 @@ class AbstractConv_gradInputs(BaseAbstractConv):
topgrad
=
np
.
asarray
(
topgrad
)
o
,
=
out_
mode
=
self
.
border_mode
if
not
((
isinstance
(
mode
,
tuple
)
and
min
(
mode
)
>=
0
)
or
mode
in
(
'valid'
,
'full'
,
'half'
)):
raise
ValueError
(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of'
' integers'
.
format
(
mode
))
if
self
.
unshared
and
self
.
convdim
!=
2
:
raise
NotImplementedError
(
'Unshared convolution not implemented for
%
dD'
%
self
.
convdim
)
dil_kernshp
=
tuple
((
kern
.
shape
[
-
self
.
convdim
+
i
]
-
1
)
*
self
.
filter_dilation
[
i
]
+
1
for
i
in
range
(
self
.
convdim
))
pad
=
border_mode_to_pad
(
self
.
border_mode
,
self
.
convdim
,
dil_kernshp
)
imshp
=
self
.
imshp
[:]
if
self
.
imshp
is
not
None
else
[
None
]
*
(
2
+
self
.
convdim
)
fallback_imshp
=
([
topgrad
.
shape
[
0
],
kern
.
shape
[
-
self
.
convdim
-
1
]]
+
...
...
@@ -2636,20 +2837,9 @@ class AbstractConv_gradInputs(BaseAbstractConv):
'would produce an output of shape {}, but the given topgrad '
'has shape {}'
.
format
(
tuple
(
expected_topgrad_shape
),
tuple
(
topgrad
.
shape
)))
dil_kernshp
=
tuple
((
kern
.
shape
[
-
self
.
convdim
+
i
]
-
1
)
*
self
.
filter_dilation
[
i
]
+
1
for
i
in
range
(
self
.
convdim
))
pad
=
(
0
,)
*
self
.
convdim
if
mode
==
"full"
:
pad
=
tuple
(
dil_kernshp
[
i
]
-
1
for
i
in
range
(
self
.
convdim
))
elif
mode
==
"half"
:
pad
=
tuple
(
dil_kernshp
[
i
]
//
2
for
i
in
range
(
self
.
convdim
))
elif
isinstance
(
mode
,
tuple
):
pad
=
tuple
(
mode
[
i
]
for
i
in
range
(
self
.
convdim
))
if
any
(
self
.
subsample
[
i
]
>
1
for
i
in
range
(
self
.
convdim
)):
new_shape
=
((
topgrad
.
shape
[
0
],
topgrad
.
shape
[
1
])
+
tuple
(
shape
[
i
]
+
2
*
pad
[
i
]
-
dil_kernshp
[
i
]
+
1
tuple
(
shape
[
i
]
+
pad
[
i
][
0
]
+
pad
[
i
][
1
]
-
dil_kernshp
[
i
]
+
1
for
i
in
range
(
self
.
convdim
)))
new_topgrad
=
np
.
zeros
((
new_shape
),
dtype
=
topgrad
.
dtype
)
new_topgrad
[(
slice
(
None
),
slice
(
None
))
+
...
...
@@ -2705,9 +2895,9 @@ class AbstractConv_gradInputs(BaseAbstractConv):
if
self
.
filter_flip
:
img
=
img
[
flip_filters
]
if
any
(
p
>
0
for
p
in
pad
):
if
any
(
p
!=
(
0
,
0
)
for
p
in
pad
):
img
=
img
[(
slice
(
None
),
slice
(
None
))
+
tuple
(
slice
(
pad
[
i
]
,
img
.
shape
[
i
+
2
]
-
pad
[
i
])
tuple
(
slice
(
pad
[
i
]
[
0
],
img
.
shape
[
i
+
2
]
-
pad
[
i
][
1
])
for
i
in
range
(
self
.
convdim
))]
o
[
0
]
=
node
.
outputs
[
0
]
.
type
.
filter
(
img
)
...
...
theano/tensor/nnet/c_code/corr_gemm.c
浏览文件 @
ea62004b
...
...
@@ -31,23 +31,23 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
void
im2col
(
const
%
(
float_type
)
s
*
data_im
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
pad_h
l
,
const
int
pad_hr
,
const
int
pad_wl
,
const
int
pad_wr
,
const
int
stride_h
,
const
int
stride_w
,
%
(
float_type
)
s
*
data_col
)
{
// Implicit dilated kernel size
int
dil_kernel_h
=
(
kernel_h
-
1
)
*
dilation_h
+
1
;
int
dil_kernel_w
=
(
kernel_w
-
1
)
*
dilation_w
+
1
;
int
height_col
=
(
height
+
2
*
pad_h
-
dil_kernel_h
)
/
stride_h
+
1
;
int
width_col
=
(
width
+
2
*
pad_w
-
dil_kernel_w
)
/
stride_w
+
1
;
int
height_col
=
(
height
+
pad_hl
+
pad_hr
-
dil_kernel_h
)
/
stride_h
+
1
;
int
width_col
=
(
width
+
pad_wl
+
pad_wr
-
dil_kernel_w
)
/
stride_w
+
1
;
int
channels_col
=
channels
*
kernel_h
*
kernel_w
;
for
(
int
c
=
0
;
c
<
channels_col
;
++
c
)
{
int
w_offset
=
c
%%
kernel_w
;
int
h_offset
=
(
c
/
kernel_w
)
%%
kernel_h
;
int
c_im
=
c
/
kernel_h
/
kernel_w
;
for
(
int
h
=
0
;
h
<
height_col
;
++
h
)
{
int
h_pad
=
h
*
stride_h
-
pad_h
+
h_offset
*
dilation_h
;
int
h_pad
=
h
*
stride_h
-
pad_h
l
+
h_offset
*
dilation_h
;
for
(
int
w
=
0
;
w
<
width_col
;
++
w
)
{
int
w_pad
=
w
*
stride_w
-
pad_w
+
w_offset
*
dilation_w
;
int
w_pad
=
w
*
stride_w
-
pad_w
l
+
w_offset
*
dilation_w
;
if
(
h_pad
>=
0
&&
h_pad
<
height
&&
w_pad
>=
0
&&
w_pad
<
width
)
data_col
[(
npy_intp
)(
c
*
height_col
+
h
)
*
width_col
+
w
]
=
data_im
[(
npy_intp
)(
c_im
*
height
+
h_pad
)
*
width
+
w_pad
];
...
...
@@ -64,13 +64,14 @@ void im2col(const %(float_type)s* data_im, const int channels,
void
col2im
(
const
%
(
float_type
)
s
*
data_col
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
patch_h
,
const
int
patch_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
%
(
float_type
)
s
*
data_im
)
{
const
int
pad_hl
,
const
int
pad_hr
,
const
int
pad_wl
,
const
int
pad_wr
,
const
int
stride_h
,
const
int
stride_w
,
%
(
float_type
)
s
*
data_im
)
{
// Implicit dilated patch
int
dil_patch_h
=
(
patch_h
-
1
)
*
dilation_h
+
1
;
int
dil_patch_w
=
(
patch_w
-
1
)
*
dilation_w
+
1
;
int
height_col
=
(
height
+
2
*
pad_h
-
dil_patch_h
)
/
stride_h
+
1
;
int
width_col
=
(
width
+
2
*
pad_w
-
dil_patch_w
)
/
stride_w
+
1
;
int
height_col
=
(
height
+
pad_hl
+
pad_hr
-
dil_patch_h
)
/
stride_h
+
1
;
int
width_col
=
(
width
+
pad_wl
+
pad_wr
-
dil_patch_w
)
/
stride_w
+
1
;
int
num_kernels
=
channels
*
height
*
width
;
int
channels_col
=
channels
*
patch_h
*
patch_w
;
for
(
int
c
=
0
;
c
<
channels_col
;
++
c
)
{
...
...
@@ -78,9 +79,9 @@ void col2im(const %(float_type)s* data_col, const int channels,
int
h_offset
=
(
c
/
patch_w
)
%%
patch_h
;
int
c_im
=
c
/
patch_h
/
patch_w
;
for
(
int
h
=
0
;
h
<
height_col
;
++
h
)
{
int
h_pad
=
h
*
stride_h
-
pad_h
+
h_offset
*
dilation_h
;
int
h_pad
=
h
*
stride_h
-
pad_h
l
+
h_offset
*
dilation_h
;
for
(
int
w
=
0
;
w
<
width_col
;
++
w
)
{
int
w_pad
=
w
*
stride_w
-
pad_w
+
w_offset
*
dilation_w
;
int
w_pad
=
w
*
stride_w
-
pad_w
l
+
w_offset
*
dilation_w
;
if
(
h_pad
>=
0
&&
h_pad
<
height
&&
w_pad
>=
0
&&
w_pad
<
width
)
data_im
[(
npy_intp
)(
c_im
*
height
+
h_pad
)
*
width
+
w_pad
]
+=
data_col
[(
npy_intp
)(
c
*
height_col
+
h
)
*
width_col
+
w
];
...
...
@@ -105,8 +106,10 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const
int
dW
=
1
,
const
int
dilH
=
1
,
const
int
dilW
=
1
,
const
int
padH
=
0
,
const
int
padW
=
0
,
const
int
padH_l
=
0
,
const
int
padH_r
=
0
,
const
int
padW_l
=
0
,
const
int
padW_r
=
0
,
const
int
numgroups
=
1
,
const
int
unshared
=
0
)
{
...
...
@@ -172,8 +175,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const
int
dil_kH
=
(
kH
-
1
)
*
dilH
+
1
;
const
int
dil_kW
=
(
kW
-
1
)
*
dilW
+
1
;
// top: (batchSize, nFilters, topHeight, topWidth)
const
int
topHeightNoDH
=
(
bottomHeight
+
2
*
padH
-
dil_kH
);
const
int
topWidthNoDW
=
(
bottomWidth
+
2
*
padW
-
dil_kW
);
const
int
topHeightNoDH
=
(
bottomHeight
+
padH_l
+
padH_r
-
dil_kH
);
const
int
topWidthNoDW
=
(
bottomWidth
+
padW_l
+
padW_r
-
dil_kW
);
// the above values might be negative so we need to use Python-like
// flooring integer division to be compatible with get_conv_output.
// note: this macro implements Python's // for negative x only
...
...
@@ -303,7 +306,7 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
int
tid
=
%
(
omp_get_thread_num
)
s
;
// First, im2col
im2col
((
%
(
float_type
)
s
*
)
PyArray_DATA
(
bottom
)
+
n
*
batch_bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
dH
,
dW
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
_l
,
padH_r
,
padW_l
,
padW_r
,
dH
,
dW
,
(
%
(
float_type
)
s
*
)
PyArray_DATA
(
col
)
+
tid
*
col_stride
);
// Second, gemm
if
(
unshared
)
{
...
...
@@ -396,7 +399,7 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
int
tid
=
%
(
omp_get_thread_num
)
s
;
// First, im2col
im2col
((
%
(
float_type
)
s
*
)
PyArray_DATA
(
bottom
)
+
n
*
batch_bottom_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
dH
,
dW
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
_l
,
padH_r
,
padW_l
,
padW_r
,
dH
,
dW
,
(
%
(
float_type
)
s
*
)
PyArray_DATA
(
col
)
+
tid
*
col_stride
);
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
...
...
@@ -519,7 +522,7 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
}
// col2im back to the data
col2im
((
%
(
float_type
)
s
*
)
PyArray_DATA
(
col
)
+
tid
*
col_stride
,
nChannels
,
bottomHeight
,
bottomWidth
,
kH
,
kW
,
dilH
,
dilW
,
padH
,
padW
,
kH
,
kW
,
dilH
,
dilW
,
padH
_l
,
padH_r
,
padW_l
,
padW_r
,
dH
,
dW
,
(
%
(
float_type
)
s
*
)
PyArray_DATA
(
bottom
)
+
n
*
batch_bottom_stride
);
}
// Restore to previous blas threads
...
...
theano/tensor/nnet/corr.py
浏览文件 @
ea62004b
...
...
@@ -34,8 +34,8 @@ class BaseCorrMM(gof.OpenMPOp):
Parameters
----------
border_mode : {'valid', 'full', 'half'}
Additionally, the padding size could be directly specified by an integer
or a pair of integers
Additionally, the padding size could be directly specified by an integer
,
a pair of integers, or two pairs of integers.
subsample
Perform subsampling of the output (default: (1, 1)).
filter_dilation
...
...
@@ -55,7 +55,8 @@ class BaseCorrMM(gof.OpenMPOp):
(
'DIRECTION_BACKPROP_INPUTS'
,
'backprop inputs'
)),
# 2
dH
=
int64
,
dW
=
int64
,
dilH
=
int64
,
dilW
=
int64
,
padH
=
int64
,
padW
=
int64
,
padH_l
=
int64
,
padH_r
=
int64
,
padW_l
=
int64
,
padW_r
=
int64
,
num_groups
=
int64
,
unshared
=
int8
)
def
__init__
(
self
,
border_mode
=
"valid"
,
subsample
=
(
1
,
1
),
...
...
@@ -66,20 +67,29 @@ class BaseCorrMM(gof.OpenMPOp):
raise
ValueError
(
'invalid border_mode {}, which must be a '
'non-negative integer'
.
format
(
border_mode
))
border_mode
=
(
border_mode
,
border_mode
)
if
isinstance
(
border_mode
,
tuple
):
if
len
(
border_mode
)
!=
2
or
border_mode
[
0
]
<
0
or
border_mode
[
1
]
<
0
:
border_mode
=
(
(
border_mode
,
border_mode
),)
*
2
el
if
isinstance
(
border_mode
,
tuple
):
if
len
(
border_mode
)
!=
2
:
raise
ValueError
(
'invalid border_mode {}, which must be a '
'pair of non-negative integers'
.
format
(
border_mode
))
pad_h
,
pad_w
=
map
(
int
,
border_mode
)
border_mode
=
(
pad_h
,
pad_w
)
if
not
((
isinstance
(
border_mode
,
tuple
)
and
min
(
border_mode
)
>=
0
)
or
border_mode
in
(
'valid'
,
'full'
,
'half'
)):
'invalid border_mode {} which must be a '
'tuple of length 2'
.
format
(
border_mode
))
border
=
()
for
mode
in
border_mode
:
if
isinstance
(
mode
,
tuple
)
and
len
(
mode
)
==
2
and
\
min
(
mode
)
>=
0
:
border
+=
((
int
(
mode
[
0
]),
int
(
mode
[
1
])),)
elif
mode
>=
0
:
border
+=
((
int
(
mode
),
int
(
mode
)),)
else
:
raise
ValueError
(
'invalid border mode {}. The tuple can only contain '
'integers or tuples of length 2'
.
format
(
border_mode
))
border_mode
=
border
elif
border_mode
not
in
(
'valid'
,
'full'
,
'half'
):
raise
ValueError
(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a
pair of
'
' integers'
.
format
(
border_mode
))
'"valid", "full", "half", an integer or a
tuple
'
'
of two integers or a pair of
integers'
.
format
(
border_mode
))
self
.
border_mode
=
border_mode
if
len
(
subsample
)
!=
2
:
raise
ValueError
(
"subsample must have two elements"
)
...
...
@@ -110,14 +120,14 @@ class BaseCorrMM(gof.OpenMPOp):
@property
def
pad
(
self
):
if
self
.
border_mode
==
"half"
:
return
(
-
1
,
-
1
)
return
(
(
-
1
,
-
1
),)
*
2
elif
self
.
border_mode
==
"full"
:
return
(
-
2
,
-
2
)
return
(
(
-
2
,
-
2
),)
*
2
elif
isinstance
(
self
.
border_mode
,
tuple
):
return
self
.
border_mode
else
:
assert
self
.
border_mode
==
"valid"
return
(
0
,
0
)
return
(
(
0
,
0
),)
*
2
# Direction should be converted to real enum value,
# as it is compared to integer later in c_code_helper().
...
...
@@ -129,8 +139,10 @@ class BaseCorrMM(gof.OpenMPOp):
dilH
=
property
(
lambda
self
:
self
.
filter_dilation
[
0
])
dilW
=
property
(
lambda
self
:
self
.
filter_dilation
[
1
])
padH
=
property
(
lambda
self
:
self
.
pad
[
0
])
padW
=
property
(
lambda
self
:
self
.
pad
[
1
])
padH_l
=
property
(
lambda
self
:
self
.
pad
[
0
][
0
])
padH_r
=
property
(
lambda
self
:
self
.
pad
[
0
][
1
])
padW_l
=
property
(
lambda
self
:
self
.
pad
[
1
][
0
])
padW_r
=
property
(
lambda
self
:
self
.
pad
[
1
][
1
])
def
__str__
(
self
):
return
'
%
s{
%
s,
%
s,
%
s,
%
s
%
s}'
%
(
...
...
@@ -183,7 +195,7 @@ class BaseCorrMM(gof.OpenMPOp):
def
c_code_cache_version
(
self
):
# raise this whenever modifying any of the support_code_files
return
(
9
,
self
.
openmp
,
blas_header_version
())
return
(
10
,
self
.
openmp
,
blas_header_version
())
def
c_support_code_apply
(
self
,
node
,
nodename
):
# REMEMBER TO RAISE c_code_cache_version when changing any of
...
...
@@ -271,13 +283,13 @@ class BaseCorrMM(gof.OpenMPOp):
if
height
:
height
=
'(*(npy_int64 *)(PyArray_DATA(
%
s)))'
%
height
else
:
if
((
self
.
direction
!=
0
)
and
(
self
.
dH
!=
1
))
or
((
self
.
direction
==
1
)
and
(
self
.
padH
==
-
1
)):
if
((
self
.
direction
!=
0
)
and
(
self
.
dH
!=
1
))
or
((
self
.
direction
==
1
)
and
(
self
.
padH
_l
==
-
1
or
self
.
padH_r
==
-
1
)):
raise
ValueError
(
"height must be given for backprop with vertical sampling or border_mode='half'"
)
height
=
'-1'
if
width
:
width
=
'(*(npy_int64 *)(PyArray_DATA(
%
s)))'
%
width
else
:
if
((
self
.
direction
!=
0
)
and
(
self
.
dW
!=
1
))
or
((
self
.
direction
==
1
)
and
(
self
.
padW
==
-
1
)):
if
((
self
.
direction
!=
0
)
and
(
self
.
dW
!=
1
))
or
((
self
.
direction
==
1
)
and
(
self
.
padW
_l
==
-
1
or
self
.
padW_r
==
-
1
)):
raise
ValueError
(
"width must be given for backprop with horizontal sampling or border_mode='half'"
)
width
=
'-1'
...
...
@@ -290,8 +302,10 @@ class BaseCorrMM(gof.OpenMPOp):
int dW =
%(params)
s->dW;
int dilH =
%(params)
s->dilH;
int dilW =
%(params)
s->dilW;
int padH =
%(params)
s->padH;
int padW =
%(params)
s->padW;
int padH_l =
%(params)
s->padH_l;
int padH_r =
%(params)
s->padH_r;
int padW_l =
%(params)
s->padW_l;
int padW_r =
%(params)
s->padW_r;
int numgroups =
%(params)
s->num_groups;
int unshared =
%(params)
s->unshared;
...
...
@@ -334,23 +348,23 @@ class BaseCorrMM(gof.OpenMPOp):
// kernel height is specified (perhaps vertical subsampling or half padding)
kH =
%(height)
s;
}
else if (padH == -2) {
else if (padH
_l == -2 || padH_r
== -2) {
// vertical full padding, we can infer the kernel height
kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1;
}
else {
// explicit padding, we can infer the kernel height
kH = (PyArray_DIMS(bottom)[2] +
2*padH
- (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
kH = (PyArray_DIMS(bottom)[2] +
padH_l + padH_r
- (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
}
if (
%(width)
s != -1) {
// kernel width is specified (perhaps horizontal subsampling or half padding)
kW =
%(width)
s;
}
else if (padW == -2) {
else if (padW
_l == -2 || padW_r
== -2) {
kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
else {
kW = (PyArray_DIMS(bottom)[3] +
2*padW
- (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
kW = (PyArray_DIMS(bottom)[3] +
padW_l + padW_r
- (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
}
...
...
@@ -359,24 +373,24 @@ class BaseCorrMM(gof.OpenMPOp):
dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested
if (padH == -1) { // vertical half padding
padH = dil_kH / 2;
if (padH
_l == -1 || padH_r
== -1) { // vertical half padding
padH
_l = padH_r
= dil_kH / 2;
}
else if (padH == -2) { // vertical full padding
padH = dil_kH - 1;
else if (padH
_l == -2 || padH_r
== -2) { // vertical full padding
padH
_l = padH_r
= dil_kH - 1;
}
else if (padH
< 0
) {
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padH must be >= -2");
else if (padH
_l < -2 || padH_r < -2
) {
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padH
_l and padH_r
must be >= -2");
%(fail)
s
}
if (padW == -1) { // horizontal half padding
padW = dil_kW / 2;
if (padW
_l == -1 || padW_r
== -1) { // horizontal half padding
padW
_l = padW_r
= dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = dil_kW - 1;
else if (padW
_l == -2 || padW_r
== -2) { // horizontal full padding
padW
_l = padW_r
= dil_kW - 1;
}
else if (padW
< 0
) {
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padW must be >= -2");
else if (padW
_l < -2 || padW_r < -2
) {
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padW
_l and padW_r
must be >= -2");
%(fail)
s
}
...
...
@@ -386,11 +400,11 @@ class BaseCorrMM(gof.OpenMPOp):
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom +
2*pad
- ((weight-1)*dil + 1)) / sample + 1
// height and width: top = (bottom +
pad_l + pad_r
- ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] +
2*padH
- ((PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] +
2*padW
- ((PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1);
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] +
padH_l + padH_r
- ((PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] +
padW_l + padW_r
- ((PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
if (unshared) {
...
...
@@ -425,7 +439,7 @@ class BaseCorrMM(gof.OpenMPOp):
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width)
// height and width: weights = (bottom +
2*pad
- (top - 1) * sample - 1) / dil + 1
// height and width: weights = (bottom +
pad_l + pad_r
- (top - 1) * sample - 1) / dil + 1
out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
if (unshared){
odim = 6;
...
...
@@ -475,8 +489,8 @@ class BaseCorrMM(gof.OpenMPOp):
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[wdim-3] * numgroups;
out_dim[2] = (npy_intp)((
%(height)
s != -1) ?
%(height)
s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1 -
2*padH
);
out_dim[3] = (npy_intp)((
%(width)
s != -1) ?
%(width)
s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1 -
2*padW
);
out_dim[2] = (npy_intp)((
%(height)
s != -1) ?
%(height)
s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1 -
padH_l - padH_r
);
out_dim[3] = (npy_intp)((
%(width)
s != -1) ?
%(width)
s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1 -
padW_l - padW_r
);
if (unshared) {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
...
...
@@ -564,7 +578,8 @@ class BaseCorrMM(gof.OpenMPOp):
}
// Call corrMM code
out2 = corrMM(
%(bottom)
s,
%(weights)
s,
%(top)
s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared);
out2 = corrMM(
%(bottom)
s,
%(weights)
s,
%(top)
s, direction, dH, dW, dilH, dilW,
padH_l, padH_r, padW_l, padW_r, numgroups, unshared);
if (out2==NULL){
%(fail)
s
}
...
...
@@ -588,8 +603,11 @@ class CorrMM(BaseCorrMM):
``'valid'`` for ``(0, 0)`` (valid convolution, no padding), ``'full'``
for ``(kernel_rows - 1, kernel_columns - 1)`` (full convolution),
``'half'`` for ``(kernel_rows // 2, kernel_columns // 2)`` (same
convolution for odd-sized kernels). Note that the two widths are each
applied twice, once per side (left and right, top and bottom).
convolution for odd-sized kernels).
If it is a tuple containing 2 pairs of integers, then these specify
the padding to be applied on each side ((left, right), (top, bottom)).
Otherwise, each width is applied twice, once per side (left and right,
top and bottom).
subsample
The subsample operation applied to each output image.
Should be a tuple with 2 elements.
...
...
@@ -706,14 +724,20 @@ class CorrMM_gradWeights(BaseCorrMM):
def
infer_shape
(
self
,
node
,
input_shape
):
if
self
.
border_mode
==
"half"
:
padH
=
padW
=
-
1
padH
_l
=
padH_r
=
padW_l
=
padW_r
=
-
1
elif
self
.
border_mode
==
"full"
:
padH
=
padW
=
-
2
padH
_l
=
padH_r
=
padW_l
=
padW_r
=
-
2
elif
isinstance
(
self
.
border_mode
,
tuple
):
padH
,
padW
=
self
.
border_mode
border
=
()
for
mode
in
self
.
border_mode
:
if
isinstance
(
mode
,
tuple
):
border
+=
((
int
(
mode
[
0
]),
int
(
mode
[
1
])),)
else
:
border
+=
((
int
(
mode
),
int
(
mode
)),)
(
padH_l
,
padH_r
),
(
padW_l
,
padW_r
)
=
border
else
:
assert
self
.
border_mode
==
"valid"
padH
=
padW
=
0
padH
_l
=
padH_r
=
padW_l
=
padW_r
=
0
dH
,
dW
=
self
.
subsample
imshp
=
input_shape
[
0
]
topshp
=
input_shape
[
1
]
...
...
@@ -721,21 +745,21 @@ class CorrMM_gradWeights(BaseCorrMM):
ssize
=
ssize
//
self
.
num_groups
nkern
,
topshp
=
topshp
[
1
],
list
(
topshp
[
2
:])
height_width
=
node
.
inputs
[
-
2
:]
if
((
dH
!=
1
)
or
(
padH
==
-
1
)):
if
((
dH
!=
1
)
or
(
padH
_l
==
-
1
)
or
(
padH_r
==
-
1
)):
# vertical subsampling or half padding, kernel height is specified
kH
=
height_width
[
0
]
elif
padH
==
-
2
:
elif
(
padH_l
==
-
2
)
or
(
padH_r
==
-
2
)
:
# vertical full padding, we can infer the kernel height
kH
=
2
-
imshp
[
0
]
+
(
topshp
[
0
]
-
1
)
*
dH
else
:
# explicit padding, we can infer the kernel height
kH
=
imshp
[
0
]
+
2
*
padH
-
(
topshp
[
0
]
-
1
)
*
dH
if
((
dW
!=
1
)
or
(
padW
==
-
1
)):
kH
=
imshp
[
0
]
+
padH_l
+
padH_r
-
(
topshp
[
0
]
-
1
)
*
dH
if
((
dW
!=
1
)
or
(
padW
_l
==
-
1
)
or
(
padW_r
==
-
1
)):
kW
=
height_width
[
1
]
elif
(
padW
==
-
2
):
elif
(
padW
_l
==
-
2
)
or
(
padW_r
==
-
2
):
kW
=
2
-
imshp
[
1
]
+
(
topshp
[
1
]
-
1
)
*
dW
else
:
kW
=
imshp
[
1
]
+
2
*
padW
-
(
topshp
[
1
]
-
1
)
*
dW
kW
=
imshp
[
1
]
+
padW_l
+
padW_r
-
(
topshp
[
1
]
-
1
)
*
dW
if
self
.
unshared
is
True
:
return
[(
nkern
,
topshp
[
0
],
topshp
[
1
],
ssize
,
kH
,
kW
)]
else
:
...
...
@@ -820,14 +844,20 @@ class CorrMM_gradInputs(BaseCorrMM):
def
infer_shape
(
self
,
node
,
input_shape
):
if
self
.
border_mode
==
"half"
:
padH
=
padW
=
-
1
padH
_l
=
padH_r
=
padW_l
=
padW_r
=
-
1
elif
self
.
border_mode
==
"full"
:
padH
=
padW
=
-
2
padH
_l
=
padH_r
=
padW_l
=
padW_r
=
-
2
elif
isinstance
(
self
.
border_mode
,
tuple
):
padH
,
padW
=
self
.
border_mode
border
=
()
for
mode
in
self
.
border_mode
:
if
isinstance
(
mode
,
tuple
):
border
+=
((
int
(
mode
[
0
]),
int
(
mode
[
1
])),)
else
:
border
+=
((
int
(
mode
),
int
(
mode
)),)
(
padH_l
,
padH_r
),
(
padW_l
,
padW_r
)
=
border
else
:
assert
self
.
border_mode
==
"valid"
padH
=
padW
=
0
padH
_l
=
padH_r
=
padW_l
=
padW_r
=
0
dH
,
dW
=
self
.
subsample
kshp
=
input_shape
[
0
]
topshp
=
input_shape
[
1
]
...
...
@@ -835,27 +865,27 @@ class CorrMM_gradInputs(BaseCorrMM):
ssize
=
ssize
*
self
.
num_groups
bsize
,
topshp
=
topshp
[
0
],
list
(
topshp
[
2
:])
height_width
=
node
.
inputs
[
-
2
:]
if
padH
==
-
1
:
padH
=
kshp
[
0
]
//
2
elif
padH
==
-
2
:
padH
=
kshp
[
0
]
-
1
elif
padH
<
-
2
:
if
padH
_l
==
-
1
or
padH_r
==
-
1
:
padH
_l
=
padH_r
=
kshp
[
0
]
//
2
elif
padH
_l
==
-
2
or
padH_r
==
-
2
:
padH
_l
=
padH_r
=
kshp
[
0
]
-
1
elif
padH
_l
<
-
2
or
padH_r
<
-
2
:
raise
ValueError
(
'CorrMM_gradInputs: border_mode must be >= 0.'
)
if
padW
==
-
1
:
padW
=
kshp
[
1
]
//
2
elif
padW
==
-
2
:
padW
=
kshp
[
1
]
-
1
elif
padW
<
-
2
:
if
padW
_l
==
-
1
or
padW_r
==
-
1
:
padW
_l
=
padW_r
=
kshp
[
1
]
//
2
elif
padW
_l
==
-
2
or
padW_r
==
-
2
:
padW
_l
=
padW_r
=
kshp
[
1
]
-
1
elif
padW
_l
<
-
2
or
padW_r
<
-
2
:
raise
ValueError
(
'CorrMM_gradInputs: border_mode must be >= 0.'
)
if
dH
!=
1
:
out_shp0
=
height_width
[
0
]
else
:
out_shp0
=
(
topshp
[
0
]
-
1
)
*
dH
+
kshp
[
0
]
-
2
*
padH
out_shp0
=
(
topshp
[
0
]
-
1
)
*
dH
+
kshp
[
0
]
-
padH_l
-
padH_r
if
dW
!=
1
:
out_shp1
=
height_width
[
1
]
else
:
out_shp1
=
(
topshp
[
1
]
-
1
)
*
dW
+
kshp
[
1
]
-
2
*
padW
out_shp1
=
(
topshp
[
1
]
-
1
)
*
dW
+
kshp
[
1
]
-
padW_l
-
padW_r
out_shp
=
(
out_shp0
,
out_shp1
)
return
[(
bsize
,
ssize
)
+
out_shp
]
...
...
theano/tensor/nnet/tests/test_abstract_conv.py
浏览文件 @
ea62004b
...
...
@@ -24,6 +24,7 @@ from theano.tensor.nnet.abstract_conv import bilinear_kernel_1D
from
theano.tensor.nnet.abstract_conv
import
bilinear_kernel_2D
from
theano.tensor.nnet.abstract_conv
import
bilinear_upsampling
from
theano.tensor.nnet.abstract_conv
import
separable_conv2d
,
separable_conv3d
from
theano.tensor.nnet.abstract_conv
import
causal_conv1d
from
theano.tensor.nnet.corr
import
(
CorrMM
,
CorrMM_gradWeights
,
CorrMM_gradInputs
)
from
theano.tensor.nnet.corr3d
import
(
Corr3dMM
,
Corr3dMM_gradWeights
,
...
...
@@ -1894,3 +1895,156 @@ class TestUnsharedConv(unittest.TestCase):
if
verify
:
utt
.
verify_grad
(
conv_gradinputs
,
[
kern
,
top
],
mode
=
self
.
mode
,
eps
=
1
)
class TestAsymmetricPadding(unittest.TestCase):
    """Check 2D convolution with asymmetric padding against symmetric "valid".

    For each of forward, grad-wrt-weights and grad-wrt-inputs, the op is run
    with a ``border_mode`` of the form ``((top, bottom), (left, right))`` and
    compared against a "valid" convolution applied to an input that was
    explicitly zero-padded by the same amounts.  Gradients are additionally
    checked numerically with ``utt.verify_grad``.
    """

    # Abstract ops used to build the graphs under test.
    conv2d = theano.tensor.nnet.abstract_conv.AbstractConv2d
    conv2d_gradw = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
    # Op classes expected to appear in the compiled graph; subclasses
    # (e.g. the CorrMM-based test in test_corr.py) override these.
    conv2d_op = theano.tensor.nnet.abstract_conv.AbstractConv2d
    conv2d_gradw_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs

    # No optimizer: keep the abstract ops in the graph so the isinstance
    # assertions below can find them.  Subclasses may override.
    mode = theano.compile.mode.Mode(optimizer='None')

    # Parallel lists of test cases; element i of each list forms one case.
    img_shape = [(2, 2, 4, 4), (3, 2, 4, 2), (3, 3, 5, 3)]
    kern_shape = [(4, 2, 2, 2), (2, 2, 4, 2), (2, 3, 3, 3)]
    topgrad_shape = [(2, 4, 6, 6), (3, 2, 3, 4), (3, 2, 6, 1)]
    # ((pad_top, pad_bottom), (pad_left, pad_right)) per case.
    border_mode = [((1, 2), (2, 1)), ((1, 1), (0, 3)), ((2, 1), (0, 0))]

    def test_fwd(self):
        """Forward pass with asymmetric padding matches padded "valid" conv."""
        img_sym = theano.tensor.tensor4('img')
        kern_sym = theano.tensor.tensor4('kern')

        for imshp, kshp, pad in zip(self.img_shape, self.kern_shape,
                                    self.border_mode):
            img = np.random.random(imshp).astype(theano.config.floatX)
            kern = np.random.random(kshp).astype(theano.config.floatX)

            # Convolution with asymmetric padding applied by the op itself.
            asymmetric_conv_op = self.conv2d(border_mode=pad,
                                             subsample=(1, 1),
                                             filter_dilation=(1, 1))
            asymmetric_out_sym = asymmetric_conv_op(img_sym, kern_sym)
            asymmetric_func = theano.function([img_sym, kern_sym],
                                              asymmetric_out_sym,
                                              mode=self.mode)
            # Make sure the expected op class actually ended up in the graph.
            assert any([isinstance(node.op, self.conv2d_op)
                        for node in asymmetric_func.maker.fgraph.toposort()])
            asymmetric_output = asymmetric_func(img, kern)

            # Reference: "valid" convolution on a manually zero-padded image.
            ref_conv_op = self.conv2d(border_mode="valid",
                                      subsample=(1, 1),
                                      filter_dilation=(1, 1))
            ref_out_sym = ref_conv_op(img_sym, kern_sym)
            ref_func = theano.function([img_sym, kern_sym], ref_out_sym,
                                       mode=self.mode)

            # Padded image shape: rows grow by pad[0][0]+pad[0][1],
            # columns by pad[1][0]+pad[1][1].
            exp_imshp = (imshp[0], imshp[1],
                         imshp[2] + pad[0][0] + pad[0][1],
                         imshp[3] + pad[1][0] + pad[1][1])

            exp_img = np.zeros(exp_imshp, dtype=theano.config.floatX)
            # Place the original image inside the zero canvas, offset by the
            # top/left padding.
            exp_img[:, :, pad[0][0]:imshp[2] + pad[0][0],
                    pad[1][0]:imshp[3] + pad[1][0]] = img
            ref_output = ref_func(exp_img, kern)

            utt.assert_allclose(asymmetric_output, ref_output)

            # Numeric gradient check of the asymmetric op.
            utt.verify_grad(asymmetric_conv_op, [img, kern],
                            mode=self.mode, eps=1)

    def test_gradweight(self):
        """Gradient wrt weights matches the padded "valid" reference."""
        img_sym = theano.tensor.tensor4('img')
        top_sym = theano.tensor.tensor4('top')

        for imshp, kshp, topshp, pad in zip(self.img_shape, self.kern_shape,
                                            self.topgrad_shape,
                                            self.border_mode):
            img = np.random.random(imshp).astype(theano.config.floatX)
            top = np.random.random(topshp).astype(theano.config.floatX)

            asymmetric_conv_op = self.conv2d_gradw(border_mode=pad,
                                                   subsample=(1, 1),
                                                   filter_dilation=(1, 1))
            # The gradWeights op also takes the kernel spatial shape.
            asymmetric_out_sym = asymmetric_conv_op(img_sym, top_sym,
                                                    kshp[-2:])
            asymmetric_func = theano.function([img_sym, top_sym],
                                              asymmetric_out_sym,
                                              mode=self.mode)
            assert any([isinstance(node.op, self.conv2d_gradw_op)
                        for node in asymmetric_func.maker.fgraph.toposort()])
            asymmetric_output = asymmetric_func(img, top)

            ref_conv_op = self.conv2d_gradw(border_mode="valid",
                                            subsample=(1, 1),
                                            filter_dilation=(1, 1))
            ref_out_sym = ref_conv_op(img_sym, top_sym, kshp[-2:])
            ref_func = theano.function([img_sym, top_sym], ref_out_sym,
                                       mode=self.mode)

            exp_imshp = (imshp[0], imshp[1],
                         imshp[2] + pad[0][0] + pad[0][1],
                         imshp[3] + pad[1][0] + pad[1][1])

            exp_img = np.zeros(exp_imshp, dtype=theano.config.floatX)
            exp_img[:, :, pad[0][0]:imshp[2] + pad[0][0],
                    pad[1][0]:imshp[3] + pad[1][0]] = img
            ref_output = ref_func(exp_img, top)

            utt.assert_allclose(asymmetric_output, ref_output)

            def conv_gradweight(inputs_val, output_val):
                # Wrapper so verify_grad only differentiates wrt the two
                # array inputs; the shape argument stays constant.
                return asymmetric_conv_op(inputs_val, output_val,
                                          tensor.as_tensor_variable(kshp[-2:]))

            utt.verify_grad(conv_gradweight, [img, top],
                            mode=self.mode, eps=1)

    def test_gradinput(self):
        """Gradient wrt inputs: the asymmetric result equals the cropped
        "valid" reference computed at the padded input shape."""
        kern_sym = theano.tensor.tensor4('kern')
        top_sym = theano.tensor.tensor4('top')

        for imshp, kshp, topshp, pad in zip(self.img_shape, self.kern_shape,
                                            self.topgrad_shape,
                                            self.border_mode):
            kern = np.random.random(kshp).astype(theano.config.floatX)
            top = np.random.random(topshp).astype(theano.config.floatX)

            asymmetric_conv_op = self.conv2d_gradi(border_mode=pad,
                                                   subsample=(1, 1),
                                                   filter_dilation=(1, 1))
            # The gradInputs op also takes the image spatial shape.
            asymmetric_out_sym = asymmetric_conv_op(kern_sym, top_sym,
                                                    imshp[-2:])
            asymmetric_func = theano.function([kern_sym, top_sym],
                                              asymmetric_out_sym,
                                              mode=self.mode)
            assert any([isinstance(node.op, self.conv2d_gradi_op)
                        for node in asymmetric_func.maker.fgraph.toposort()])
            asymmetric_output = asymmetric_func(kern, top)

            ref_conv_op = self.conv2d_gradi(border_mode="valid",
                                            subsample=(1, 1),
                                            filter_dilation=(1, 1))
            # Reference computes the gradient at the *padded* image shape,
            # then the padding region is cropped away below.
            exp_imshp = [imshp[2] + pad[0][0] + pad[0][1],
                         imshp[3] + pad[1][0] + pad[1][1]]
            ref_out_sym = ref_conv_op(kern_sym, top_sym, exp_imshp)
            ref_func = theano.function([kern_sym, top_sym], ref_out_sym,
                                       mode=self.mode)
            ref_output = ref_func(kern, top)
            # Crop back to the original (unpadded) image extent.
            ref_output = ref_output[:, :, pad[0][0]:imshp[2] + pad[0][0],
                                    pad[1][0]:imshp[3] + pad[1][0]]

            utt.assert_allclose(asymmetric_output, ref_output)

            def conv_gradinputs(filters_val, output_val):
                # Wrapper keeping the shape argument constant for verify_grad.
                return asymmetric_conv_op(filters_val, output_val,
                                          tensor.as_tensor_variable(imshp[-2:]))

            utt.verify_grad(conv_gradinputs, [kern, top],
                            mode=self.mode, eps=1)
class TestCausalConv(unittest.TestCase):
    """Test ``causal_conv1d`` against a small precomputed example.

    A fixed 3-batch, 2-channel, length-5 input is convolved with a fixed
    2-filter kernel at dilation 2, and the result is compared to a
    hand/precomputed output array.  The gradient is then checked
    numerically (with dilation 1).
    """

    # No optimizer, so the graph is executed as built.
    mode = theano.compile.mode.Mode(optimizer='None')

    # Input: (batch=3, channels=2, time=5).
    img = np.array([[[2, 4, 9, 5, 8], [0, 0, 4, 0, 5]],
                    [[2, 5, 8, 5, 5], [1, 3, 0, 7, 9]],
                    [[7, 0, 7, 1, 0], [0, 1, 4, 7, 2]]]).astype(theano.config.floatX)
    # Kernel: (filters=2, channels=2, width=3).
    kern = np.array([[[5, 3, 1], [3, 1, 0]],
                     [[6, 4, 9], [2, 2, 7]]]).astype(theano.config.floatX)
    dilation = 2
    # Expected output of the dilated causal convolution for img/kern above;
    # same time length as the input (causal conv pads on the left only).
    precomp_top = np.array([[[10, 20, 63, 37, 88], [12, 24, 70, 46, 120]],
                            [[13, 34, 47, 64, 78], [14, 36, 58, 70, 105]],
                            [[35, 3, 68, 27, 38], [42, 2, 78, 22, 103]]]).astype(theano.config.floatX)

    def test_interface(self):
        """Run causal_conv1d on the fixture and verify output and gradient."""
        img_sym = theano.tensor.tensor3('img')
        kern_sym = theano.tensor.tensor3('kern')
        sym_out = causal_conv1d(img_sym, kern_sym, self.kern.shape,
                                filter_dilation=self.dilation)
        causal_func = theano.function([img_sym, kern_sym], sym_out,
                                      mode=self.mode)

        output = causal_func(self.img, self.kern)

        utt.assert_allclose(output, self.precomp_top)

        def causal_conv_fn(inputs_val, filters_val):
            # NOTE(review): the gradient is checked with filter_dilation=1,
            # not self.dilation as in the forward check above.
            return causal_conv1d(inputs_val, filters_val, self.kern.shape,
                                 filter_dilation=1)

        utt.verify_grad(causal_conv_fn, [self.img, self.kern],
                        mode=self.mode, eps=1)
theano/tensor/nnet/tests/test_corr.py
浏览文件 @
ea62004b
...
...
@@ -11,6 +11,7 @@ import theano.tensor as T
from
theano.tests
import
unittest_tools
as
utt
from
theano.tensor.nnet
import
corr
,
conv
from
theano.tensor.nnet.tests.test_abstract_conv
import
Grouped_conv_noOptim
,
TestUnsharedConv
from
theano.tensor.nnet.tests.test_abstract_conv
import
TestAsymmetricPadding
,
TestCausalConv
class
TestCorr2D
(
utt
.
InferShapeTester
):
...
...
@@ -454,6 +455,23 @@ class TestUnsharedCorr2d(TestUnsharedConv):
conv2d_gradi_op
=
corr
.
CorrMM_gradInputs
class TestAsymmetricCorr(TestAsymmetricPadding):
    """Run the asymmetric-padding tests against the CPU ``corr`` ops."""

    # When the default mode is FAST_COMPILE, switch to FAST_RUN (minus the
    # gpuarray back-end) so the CorrMM ops appear in the compiled graph;
    # otherwise keep the default compilation mode.
    mode = (theano.compile.get_mode("FAST_RUN").excluding('gpuarray')
            if theano.config.mode == "FAST_COMPILE" else None)

    # Op classes the base-class assertions should find in the graph.
    conv2d_op = corr.CorrMM
    conv2d_gradw_op = corr.CorrMM_gradWeights
    conv2d_gradi_op = corr.CorrMM_gradInputs
class TestCausalCorr(TestCausalConv):
    """Re-run the causal-convolution tests under an optimized CPU mode."""

    # Same mode selection as TestAsymmetricCorr: avoid FAST_COMPILE so the
    # optimized (CorrMM-based) implementation is exercised, excluding the
    # gpuarray back-end.
    mode = (theano.compile.get_mode("FAST_RUN").excluding('gpuarray')
            if theano.config.mode == "FAST_COMPILE" else None)
if
__name__
==
'__main__'
:
t
=
TestCorr2D
(
'setUp'
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论