Merge pull request #4587 from niasla/dilated_convolution

Implementation of 2D dilated convolution/correlation.

Merge pull request #4587 from niasla/dilated_convolution
a668c6c5 · Pascal Lamblin · GitHub · d78f44f6 · 2dcf3753 · a668c6c5
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -1393,6 +1393,9 @@ def local_abstractconv_cudnn(node):
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]

+    if (node.op.filter_dilation != (1, 1)):
+        return None
+
    if not isinstance(inp1.type, GpuArrayType):
        return None


--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -855,15 +855,17 @@ class BaseGpuCorrMM(GpuOp):
        or a pair of integers
    subsample
        Perform subsampling of the output (default: (1, 1)).
+    filter_dilation
+        Spacing between filter taps on the input, also known as dilation (default: (1, 1)).
    pad
        *deprecated*, now you should always use border_mode.
-
    """

    check_broadcast = False
-    __props__ = ('border_mode', 'subsample')
+    __props__ = ('border_mode', 'subsample', 'filter_dilation')

-    def __init__(self, border_mode="valid", subsample=(1, 1), pad=(0, 0)):
+    def __init__(self, border_mode="valid", subsample=(1, 1),
+                 filter_dilation=(1, 1), pad=(0, 0)):
        if pad != (0, 0):
            _logger.warning(
                'do not use pad for BaseGpuCorrMM; please set padding in '
@@ -885,7 +887,10 @@ class BaseGpuCorrMM(GpuOp):
        self.border_mode = border_mode
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
-        self.subsample = subsample
+        if len(filter_dilation) != 2:
+            raise ValueError("filter_dilation must have two elements")
+        self.subsample = tuple(subsample)
+        self.filter_dilation = tuple(filter_dilation)

    @property
    def pad(self):
@@ -894,10 +899,11 @@ class BaseGpuCorrMM(GpuOp):
        return (0, 0)

    def __str__(self):
-        return '%s{%s, %s}' % (
+        return '%s{%s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
-            str(self.subsample))
+            str(self.subsample),
+            str(self.filter_dilation))

    def flops(self, inp, outp):
        """
@@ -922,7 +928,7 @@ class BaseGpuCorrMM(GpuOp):

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 24)
+        return (0, 26)

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
@@ -976,6 +982,7 @@ class BaseGpuCorrMM(GpuOp):

        """
        dH, dW = self.subsample
+        dilH, dilW = self.filter_dilation
        if self.border_mode == "half":
            padH = padW = -1
        elif self.border_mode == "full":
@@ -1022,6 +1029,8 @@ class BaseGpuCorrMM(GpuOp):
    // Optional args
    int dH = %(dH)s;
    int dW = %(dW)s;
+    int dilH = %(dilH)s;
+    int dilW = %(dilW)s;
    int padH = %(padH)s;
    int padW = %(padW)s;

@@ -1045,39 +1054,43 @@ class BaseGpuCorrMM(GpuOp):
        }
        else if (padH == -2) {
            // vertical full padding, we can infer the kernel height
-            kH = 2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
+            kH = (2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1;
        }
        else {
            // explicit padding, we can infer the kernel height
-            kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
+            kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ;
        }
        if ((dW != 1) || (padW == -1)) {
            kW = %(width)s;
        }
        else if (padW == -2) {
-            kW = 2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
+            kW = (2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        else {
-            kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
+            kW = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
    }

+    // Implicit dilated kernel size
+    int dil_kH = (kH - 1) * dilH + 1;
+    int dil_kW = (kW - 1) * dilW + 1;
+
    // Auto-padding if requested
    if (padH == -1) {  // vertical half padding
-        padH = kH / 2;
+        padH = dil_kH / 2;
    }
    else if (padH == -2) {  // vertical full padding
-        padH = kH - 1;
+        padH = dil_kH - 1;
    }
    else if (padH < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2");
        %(fail)s
    }
    if (padW == -1) {  // horizontal half padding
-        padW = kW / 2;
+        padW = dil_kW / 2;
    }
    else if (padW == -2) {  // horizontal full padding
-        padW = kW - 1;
+        padW = dil_kW - 1;
    }
    else if (padW < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2");
@@ -1089,15 +1102,15 @@ class BaseGpuCorrMM(GpuOp):
    switch(direction) {
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width)
-        // height and width: top = (bottom + 2*pad - weight) / sample + 1
+        // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
        out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
-        out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
-        out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
+        out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - ((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
+        out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - ((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
        break;
    case 1:  // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width)
-        // height and width: weights = bottom + 2*pad - (top - 1) * sample
+        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = CudaNdarray_HOST_DIMS(top)[1];
        out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1];
        out_dim[2] = kH;  // already inferred further above
@@ -1105,11 +1118,11 @@ class BaseGpuCorrMM(GpuOp):
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width)
-        // height and width: bottom = (top - 1) * sample + weights - 2*pad
+        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
        out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
-        out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
-        out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
+        out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
+        out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
@@ -1137,7 +1150,7 @@ class BaseGpuCorrMM(GpuOp):
    }

    // Call CUDA code
-    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, padH, padW);
+    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW);
    if (out2==NULL){
       %(fail)s
    }
@@ -1168,6 +1181,10 @@ class GpuCorrMM(BaseGpuCorrMM):
        `(sv, sh)` is equivalent to `GpuCorrMM(...)(...)[:,:,::sv, ::sh]`,
        but faster.
        Set to `(1, 1)` to disable subsampling.
+    filter_dilation
+        The filter dilation factor (spacing between filter taps) applied to each input image.
+        Should be a tuple with 2 elements.
+        Set to `(1, 1)` to disable filter dilation.
    pad
        Deprecated alias for `border_mode`.

@@ -1198,8 +1215,10 @@ class GpuCorrMM(BaseGpuCorrMM):
    """
    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
+                 filter_dilation=(1, 1),
                 pad=(0, 0)):
-        super(GpuCorrMM, self).__init__(border_mode, subsample, pad)
+        super(GpuCorrMM, self).__init__(border_mode, subsample,
+                                        filter_dilation, pad)

    def make_node(self, img, kern):
        img = as_cuda_ndarray_variable(img)
@@ -1223,9 +1242,13 @@ class GpuCorrMM(BaseGpuCorrMM):
        bottom, weights = inp
        top, = grads
        top = gpu_contiguous(top)
-        d_bottom = GpuCorrMM_gradInputs(self.border_mode, self.subsample)(
+        d_bottom = GpuCorrMM_gradInputs(self.border_mode,
+                                        self.subsample,
+                                        self.filter_dilation)(
            weights, top, bottom.shape[-2:])
-        d_weights = GpuCorrMM_gradWeights(self.border_mode, self.subsample)(
+        d_weights = GpuCorrMM_gradWeights(self.border_mode,
+                                          self.subsample,
+                                          self.filter_dilation)(
            bottom, top, weights.shape[-2:])
        return d_bottom, d_weights

@@ -1243,8 +1266,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
+                 filter_dilation=(1, 1),
                 pad=(0, 0)):
-        super(GpuCorrMM_gradWeights, self).__init__(border_mode, subsample, pad)
+        super(GpuCorrMM_gradWeights, self).__init__(border_mode,
+                                                    subsample,
+                                                    filter_dilation,
+                                                    pad)

    def make_node(self, img, topgrad, shape=None):
        img = as_cuda_ndarray_variable(img)
@@ -1278,12 +1305,13 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
        bottom, top = inp[:2]
        weights, = grads
        weights = gpu_contiguous(weights)
-        d_bottom = GpuCorrMM_gradInputs(
-            self.border_mode, self.subsample)(weights,
-                                              top,
-                                              bottom.shape[-2:])
+        d_bottom = GpuCorrMM_gradInputs(self.border_mode,
+                                        self.subsample,
+                                        self.filter_dilation)(weights,
+                                                              top,
+                                                              bottom.shape[-2:])
        d_top = GpuCorrMM(
-            self.border_mode, self.subsample)(bottom, weights)
+            self.border_mode, self.subsample, self.filter_dilation)(bottom, weights)
        d_height_width = (
            theano.gradient.DisconnectedType()(),
            ) * 2 if len(inp) == 4 else ()
@@ -1309,8 +1337,10 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
+                 filter_dilation=(1, 1),
                 pad=(0, 0)):
-        super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample, pad)
+        super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
+                                                   filter_dilation, pad)

    def make_node(self, kern, topgrad, shape=None):
        kern = as_cuda_ndarray_variable(kern)
@@ -1341,11 +1371,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
-        d_weights = GpuCorrMM_gradWeights(
-            self.border_mode, self.subsample)(
-                bottom, top, weights.shape[-2:])
-        d_top = GpuCorrMM(
-            self.border_mode, self.subsample)(bottom, weights)
+        d_weights = GpuCorrMM_gradWeights(self.border_mode,
+                                          self.subsample,
+                                          self.filter_dilation)(bottom,
+                                                                top,
+                                                                weights.shape[-2:])
+        d_top = GpuCorrMM(self.border_mode,
+                          self.subsample,
+                          self.filter_dilation)(bottom, weights)
        d_height_width = (
            theano.gradient.DisconnectedType()(),
            ) * 2 if len(inp) == 4 else ()
@@ -1871,12 +1904,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
        weights, top = inp[:2]
        bottom, = grads
        bottom = gpu_contiguous(bottom)
-        d_weights = GpuCorr3dMM_gradWeights(
-            self.border_mode, self.subsample, self.pad)(
-                bottom, top, weights.shape[-3:])
-        d_top = GpuCorr3dMM(
-            self.border_mode, self.subsample, self.pad)(
-                bottom, weights)
+        d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
+                                            self.subsample,
+                                            self.pad)(bottom,
+                                                      top,
+                                                      weights.shape[-3:])
+        d_top = GpuCorr3dMM(self.border_mode,
+                            self.subsample,
+                            self.pad)(bottom, weights)
        d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
            * 3 if len(inp) == 5 else ()
        return (d_weights, d_top) + d_height_width_depth

--- a/theano/sandbox/cuda/corr_gemm.cu
+++ b/theano/sandbox/cuda/corr_gemm.cu
@@ -52,6 +52,39 @@ inline int GET_BLOCKS(const int N) {

 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
 // Kernels for fast unfold + copy
+// CUDA im2col kernel for the dilated case: unfolds dilated patches of data_im into data_col (out-of-image taps become 0)
+__global__ void dilated_im2col_kernel(const int n, const float* data_im,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int dilation_h, const int dilation_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
+    float* data_col) {
+  CUDA_KERNEL_LOOP(index, n) {  // macro defined outside this hunk; presumably iterates index over [0, n) -- confirm
+    const int h_index = index / width_col;  // index == (c_im * height_col + h_col) * width_col + w_col
+    const int h_col = h_index % height_col;  // output row handled by this index
+    const int w_col = index % width_col;  // output column handled by this index
+    const int c_im = h_index / height_col;  // input channel handled by this index
+    const int c_col = c_im * kernel_h * kernel_w;  // first data_col row belonging to this channel
+    const int h_offset = h_col * stride_h - pad_h;  // top input row of the patch; negative inside the padding
+    const int w_offset = w_col * stride_w - pad_w;  // left input column of the patch; negative inside the padding
+    float* data_col_ptr = data_col;
+    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;  // -> data_col[c_col][h_col][w_col]
+    const float* data_im_ptr = data_im;
+    data_im_ptr += (c_im * height + h_offset) * width + w_offset;  // may lie outside the image; only read after the bounds check below
+    for (int i = 0; i < kernel_h; ++i) {  // one iteration per filter row
+      for (int j = 0; j < kernel_w; ++j) {  // one iteration per filter column
+        int h_im = h_offset + i * dilation_h;  // input row of tap (i, j): taps are dilation_h rows apart
+        int w_im = w_offset + j * dilation_w;  // input column of tap (i, j): taps are dilation_w columns apart
+        *data_col_ptr =
+          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?  // taps that fall outside the image are written as 0
+            data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
+        data_col_ptr += height_col * width_col;  // advance to the next data_col row (next filter tap)
+      }
+    }
+  }
+}
+
 __global__ void im2col_kernel(const int n, const float* data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
@@ -59,23 +92,24 @@ __global__ void im2col_kernel(const int n, const float* data_im,
    const int height_col, const int width_col,
    float* data_col) {
  CUDA_KERNEL_LOOP(index, n) {
-    int w_out = index % width_col;
-    int h_index = index / width_col;
-    int h_out = h_index % height_col;
-    int channel_in = h_index / height_col;
-    int channel_out = channel_in * kernel_h * kernel_w;
-    int h_in = h_out * stride_h - pad_h;
-    int w_in = w_out * stride_w - pad_w;
+    const int h_index = index / width_col;
+    const int h_col = h_index % height_col;
+    const int w_col = index % width_col;
+    const int c_im = h_index / height_col;
+    const int c_col = c_im * kernel_h * kernel_w;
+    const int h_offset = h_col * stride_h - pad_h;
+    const int w_offset = w_col * stride_w - pad_w;
    float* data_col_ptr = data_col;
-    data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
+    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const float* data_im_ptr = data_im;
-    data_im_ptr += (channel_in * height + h_in) * width + w_in;
+    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
-        int h = h_in + i;
-        int w = w_in + j;
-        *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
-            data_im_ptr[i * width + j] : 0;
+        int h_im = h_offset + i ;
+        int w_im = w_offset + j ;
+        *data_col_ptr =
+          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
+           data_im_ptr[i * width + j] : 0;
        data_col_ptr += height_col * width_col;
      }
    }
@@ -84,52 +118,97 @@ __global__ void im2col_kernel(const int n, const float* data_im,

 void im2col(const float* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    float* data_col) {
  // We are going to launch channels * height_col * width_col kernels, each
  // kernel responsible for copying a single-channel grid.
-  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
+  int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
+  int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  int num_kernels = channels * height_col * width_col;
-  im2col_kernel<<<GET_BLOCKS(num_kernels),
+  if(dilation_h != 1 || dilation_w != 1){
+    dilated_im2col_kernel<<<GET_BLOCKS(num_kernels),
                  CUDA_NUM_THREADS>>>(
-      num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
-      pad_w, stride_h, stride_w, height_col,
+      num_kernels, data_im, height, width, kernel_h, kernel_w,
+      dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
      width_col, data_col);
+  }
+  else{
+    im2col_kernel<<<GET_BLOCKS(num_kernels),
+                  CUDA_NUM_THREADS>>>(
+      num_kernels, data_im, height, width, kernel_h, kernel_w,
+      pad_h, pad_w, stride_h, stride_w, height_col,
+      width_col, data_col);
+  }
 }

-__global__ void col2im_kernel(const int n, const float* data_col,
+// CUDA col2im kernel for the dilated case (used when dilation_h != 1 or dilation_w != 1)
+__global__ void dilated_col2im_kernel(const int n, const float* data_col,
    const int height, const int width, const int channels,
-    const int patch_h, const int patch_w,
+    const int kernel_h, const int kernel_w,
+    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int height_col, const int width_col,
    float* data_im) {
  CUDA_KERNEL_LOOP(index, n) {
    float val = 0;
-    int w = index % width + pad_w;
-    int h = (index / width) % height + pad_h;
-    int c = index / (width * height);
+    const int w_im = index % width + pad_w;
+    const int h_im = (index / width) % height + pad_h;
+    const int c_im = index / (width * height);
+    int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
+    int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    // compute the start and end of the output
-    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
-    int w_col_end = min(w / stride_w + 1, width_col);
-    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
-    int h_col_end = min(h / stride_h + 1, height_col);
-    /*
-    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-        // the col location: [c * width * height + h_out, w_out]
-        int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * ksize
-            + (w - w_col * stride_w);
-        val += data_col[(c_col * height_col + h_col) * width_col + w_col];
+    const int w_col_start =
+        (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
+    const int w_col_end = min(w_im / stride_w + 1, width_col);
+    const int h_col_start =
+        (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
+    const int h_col_end = min(h_im / stride_h + 1, height_col);
+    // TODO: use LCM of stride and dilation to avoid unnecessary loops
+    for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
+      for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
+        int h_k = (h_im - h_col * stride_h);
+        int w_k = (w_im - w_col * stride_w);
+        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
+          h_k /= dilation_h;
+          w_k /= dilation_w;
+          int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
+                                height_col + h_col) * width_col + w_col;
+          val += data_col[data_col_index];
+        }
      }
    }
-    */
-    // equivalent implementation
+    data_im[index] = val;
+  }
+}
+
+__global__ void col2im_kernel(const int n, const float* data_col,
+    const int height, const int width, const int channels,
+    const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int height_col, const int width_col,
+    float* data_im) {
+  CUDA_KERNEL_LOOP(index, n) {
+    float val = 0;
+    const int w_im = index % width + pad_w;
+    const int h_im = (index / width) % height + pad_h;
+    const int c_im = index / (width * height);
+    // compute the start and end of the output
+    const int w_col_start =
+        (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
+    const int w_col_end = min(w_im / stride_w + 1, width_col);
+    const int h_col_start =
+        (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
+    const int h_col_end = min(h_im / stride_h + 1, height_col);
+    // equivalent implementation, no dilation
    int offset =
-        (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
-    int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
+      (c_im * kernel_h * kernel_w + h_im * kernel_w + w_im) * height_col * width_col;
+    int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
    int coeff_w_col = (1 - stride_w * height_col * width_col);
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
@@ -142,18 +221,30 @@ __global__ void col2im_kernel(const int n, const float* data_col,

 void col2im(const float* data_col, const int channels,
    const int height, const int width, const int patch_h, const int patch_w,
+    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, float* data_im) {
-  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+  int dil_patch_h = (patch_h - 1) * dilation_h + 1;
+  int dil_patch_w = (patch_w - 1) * dilation_w + 1;
+  int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  int num_kernels = channels * height * width;
  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.
-  col2im_kernel<<<GET_BLOCKS(num_kernels),
+  if(dilation_h != 1 || dilation_w != 1){
+    dilated_col2im_kernel<<<GET_BLOCKS(num_kernels),
+                  CUDA_NUM_THREADS>>>(
+      num_kernels, data_col, height, width, channels, patch_h, patch_w,
+      dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
+      height_col, width_col, data_im);
+  }
+  else{
+    col2im_kernel<<<GET_BLOCKS(num_kernels),
                  CUDA_NUM_THREADS>>>(
      num_kernels, data_col, height, width, channels, patch_h, patch_w,
      pad_h, pad_w, stride_h, stride_w,
      height_col, width_col, data_im);
+  }
 }


@@ -167,6 +258,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
                    const int direction,
                    const int dH = 1,
                    const int dW = 1,
+                    const int dilH = 1,
+                    const int dilW = 1,
                    const int padH = 0,
                    const int padW = 0)
 {
@@ -236,9 +329,12 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
                "GpuCorrMM images and kernel must have the same stack size\n");
        return NULL;
    }
+    // implicit dilated filter
+    const int dil_kH = (kH - 1) * dilH + 1;
+    const int dil_kW = (kW - 1) * dilW + 1;
    // top: (batchSize, nFilters, topHeight, topWidth)
-    const int topHeight = (bottomHeight + 2*padH - kH) / dH + 1;
-    const int topWidth  = (bottomWidth + 2*padW - kW) / dW + 1;
+    const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
+    const int topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
    if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
            nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
            topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
@@ -286,7 +382,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
        for (int n = 0; n < batchSize; n++) {
            // First, im2col
            im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
-                    bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
+                   bottomWidth, kH, kW, dilH, dilW,
+                   padH, padW, dH, dW, col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
@@ -353,7 +450,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
        for (int n = 0; n < batchSize; n++) {
            // First, im2col
            im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
-                    bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
+                   bottomWidth, kH, kW, dilH, dilW,
+                   padH, padW, dH, dW, col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
@@ -438,7 +536,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
            }
            // col2im back to the data
            col2im(col->devdata, nChannels, bottomHeight, bottomWidth,
-                    kH, kW, padH, padW, dH, dW, bottom->devdata + n * bottom_stride);
+                   kH, kW, dilH, dilW, padH, padW,
+                   dH, dW, bottom->devdata + n * bottom_stride);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -3116,6 +3116,8 @@ def local_abstractconv_cudnn(node):
                                 AbstractConv2d_gradWeights,
                                 AbstractConv2d_gradInputs))):
        return None
+    if (node.op.filter_dilation != (1, 1)):
+        return None

    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
@@ -3123,6 +3125,7 @@ def local_abstractconv_cudnn(node):
    if (not isinstance(inp1.type, CudaNdarrayType) or
            not isinstance(inp2.type, CudaNdarrayType)):
        return None
+
    if not dnn_available():
        return None


--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1622,7 +1622,8 @@ def local_conv_gemm(node):
                    # because we are not allowed to replace a CudaNdarray with
                    # a DimShuffle instance in a graph optimization)
                    rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
-                        GpuCorrMM_gradWeights(border_mode, subsample)(
+                        GpuCorrMM_gradWeights(border_mode,
+                                              subsample)(
                            gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
                            gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
                        ).dimshuffle(1, 0, 2, 3))
@@ -2769,28 +2770,33 @@ def local_abstractconv_gemm(node):

    border_mode = node.op.border_mode
    subsample = node.op.subsample
-    if (border_mode == 'full') and (subsample == (1, 1)):
+    filter_dilation = node.op.filter_dilation
+    if ((border_mode == 'full') and (subsample == (1, 1))):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
        kern = kern.dimshuffle(1, 0, 2, 3)
        # call GpuCorrMM_gradInputs
-        rval = GpuCorrMM_gradInputs('valid', subsample)(
+        rval = GpuCorrMM_gradInputs('valid',
+                                    subsample,
+                                    filter_dilation)(
            gpu_contiguous(kern), gpu_contiguous(img))
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # By default use GpuCorrMM
-        rval = GpuCorrMM(border_mode, subsample)(gpu_contiguous(img),
-                                                 gpu_contiguous(kern))
+        rval = GpuCorrMM(border_mode,
+                         subsample,
+                         filter_dilation)(gpu_contiguous(img),
+                                          gpu_contiguous(kern))

        # call GpuCorrMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth
        # is larger than inputChannels * outputHeight * outputWidth.
        # GpuConv does not always store information on the batchsize and
        # channels, though, so we only use what information we have.)
-        if ((subsample == (1, 1)) and
+        if ((subsample == (1, 1)) and (filter_dilation == (1, 1)) and
                (node.op.imshp is not None) and
                (None not in node.op.imshp[-2:]) and
                (node.op.kshp is not None) and
@@ -2810,7 +2816,9 @@ def local_abstractconv_gemm(node):
                # because we are not allowed to replace a CudaNdarray with
                # a DimShuffle instance in a graph optimization)
                rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
-                    GpuCorrMM_gradWeights(border_mode, subsample)(
+                    GpuCorrMM_gradWeights(border_mode,
+                                          subsample,
+                                          filter_dilation)(
                        gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
                        gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
                    ).dimshuffle(1, 0, 2, 3))
@@ -2827,7 +2835,8 @@ def local_abstractconv_gradweight_gemm(node):
        return None

    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
-                                 subsample=node.op.subsample)(
+                                 subsample=node.op.subsample,
+                                 filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    if node.op.filter_flip:
        rval = rval[:, :, ::-1, ::-1]
@@ -2849,7 +2858,8 @@ def local_abstractconv_gradinputs_gemm(node):
        kern = kern[:, :, ::-1, ::-1]

    rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
-                                subsample=node.op.subsample)(
+                                subsample=node.op.subsample,
+                                filter_dilation=node.op.filter_dilation)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]

@@ -2870,10 +2880,12 @@ conv_groupopt.register('local_abstractconv_dnn',
 conv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
                       'conv_gemm',
                       'gpu', 'fast_compile', 'fast_run')
+
 conv_groupopt.register('local_abstractconv_gradweight_gemm',
                       local_abstractconv_gradweight_gemm, 30,
                       'conv_gemm',
                       'gpu', 'fast_compile', 'fast_run')
+
 conv_groupopt.register('local_abstractconv_gradinputs_gemm',
                       local_abstractconv_gradinputs_gemm, 30,
                       'conv_gemm',

--- a/theano/sandbox/cuda/tests/test_abstractconv.py
+++ b/theano/sandbox/cuda/tests/test_abstractconv.py
@@ -29,25 +29,30 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
        self.provide_shape = [False]
        self.shared = gpu_shared

-    def tcase(self, i, f, s, b, flip, provide_shape):
+    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
+        if fd != (1, 1):
+            raise SkipTest("No dilation implementation for cuDNN ConvOp.")
        if not dnn_available():
            raise SkipTest(cuda.dnn.dnn_available.msg)
        mode = mode_with_gpu
-        o = self.get_output_shape(i, f, s, b)
+        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                     verify_grad=True, mode=mode,
                     provide_shape=provide_shape, border_mode=b,
-                     filter_flip=flip, target_op=GpuDnnConv)
+                     filter_flip=flip, target_op=GpuDnnConv,
+                     filter_dilation=fd)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s,
                            verify_grad=True, mode=mode,
                            provide_shape=provide_shape, border_mode=b,
-                            filter_flip=flip, target_op=GpuDnnConvGradW)
+                            filter_flip=flip, target_op=GpuDnnConvGradW,
+                            filter_dilation=fd)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode,
                           provide_shape=provide_shape, border_mode=b,
-                           filter_flip=flip, target_op=GpuDnnConvGradI)
+                           filter_flip=flip, target_op=GpuDnnConvGradI,
+                           filter_dilation=fd)


 class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
@@ -56,28 +61,30 @@ class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
        self.shared = gpu_shared
        self.mode = mode_with_gpu.excluding('cudnn')

-    def tcase(self, i, f, s, b, flip, provide_shape):
+    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        mode = self.mode
-        o = self.get_output_shape(i, f, s, b)
-        self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
-                     verify_grad=True, mode=mode,
+        o = self.get_output_shape(i, f, s, b, fd)
+        self.run_fwd(inputs_shape=i, filters_shape=f,
+                     subsample=s, verify_grad=True, mode=mode,
                     provide_shape=provide_shape, border_mode=b,
-                     filter_flip=flip,
-                     target_op=(GpuCorrMM,
-                                GpuCorrMM_gradWeights,
-                                GpuCorrMM_gradInputs))
+                     filter_flip=flip, target_op=(GpuCorrMM,
+                                                  GpuCorrMM_gradWeights,
+                                                  GpuCorrMM_gradInputs),
+                     filter_dilation=fd)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s,
                            verify_grad=True, mode=mode,
                            provide_shape=provide_shape, border_mode=b,
                            filter_flip=flip,
-                            target_op=GpuCorrMM_gradWeights)
+                            target_op=GpuCorrMM_gradWeights,
+                            filter_dilation=fd)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode,
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip,
-                           target_op=GpuCorrMM_gradInputs)
+                           target_op=GpuCorrMM_gradInputs,
+                           filter_dilation=fd)


 class TestDnnConvTypes(test_abstract_conv.TestConvTypes):

--- a/theano/tensor/nnet/__init__.py
+++ b/theano/tensor/nnet/__init__.py
@@ -35,7 +35,7 @@ from .abstract_conv import conv2d as abstract_conv2d

 def conv2d(input, filters, input_shape=None, filter_shape=None,
           border_mode='valid', subsample=(1, 1), filter_flip=True,
-           image_shape=None, **kwargs):
+           image_shape=None, filter_dilation=(1, 1), **kwargs):
    """
    This function will build the symbolic graph for convolving a mini-batch of a
    stack of 2D inputs with a set of 2D filters. The implementation is modelled
@@ -95,6 +95,10 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
    image_shape: None, tuple/list of len 4 of int or Constant variable
        Deprecated alias for input_shape.

+    filter_dilation: tuple of len 2
+        Factor by which to subsample (stride) the input.
+        Also called dilation elsewhere.
+
    kwargs: Any other keyword arguments are accepted for backwards
            compatibility, but will be ignored.

@@ -140,4 +144,5 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
                             " be provided at the same time.")

    return abstract_conv2d(input, filters, input_shape, filter_shape,
-                           border_mode, subsample, filter_flip)
+                           border_mode, subsample, filter_flip,
+                           filter_dilation)
--- a/theano/tensor/nnet/abstract_conv.py
+++ b/theano/tensor/nnet/abstract_conv.py
@@ -32,7 +32,8 @@ _logger = logging.getLogger("theano.tensor.nnet.abstract_conv")


 def get_conv_output_shape(image_shape, kernel_shape,
-                          border_mode, subsample):
+                          border_mode, subsample,
+                          filter_dilation=(1, 1)):
    """
    This function compute the output shape of convolution operation.

@@ -53,6 +54,8 @@ def get_conv_output_shape(image_shape, kernel_shape,
    subsample: tuple of int (symbolic or numeric). Its or three elements
        espectively correspond to the subsampling on height and width (and
        possibly depth) axis.
+    filter_dilation: tuple of int (symbolic or numeric). Its two elements
+        correspond respectively to the dilation on height and width axis.

    Returns
    -------
@@ -65,17 +68,19 @@ def get_conv_output_shape(image_shape, kernel_shape,
    nkern, kshp = kernel_shape[0], kernel_shape[2:]
    if isinstance(border_mode, tuple):
        out_shp = tuple(get_conv_shape_1axis(
-            imshp[i], kshp[i], border_mode[i], subsample[i])
-            for i in range(len(subsample)))
+            imshp[i], kshp[i], border_mode[i],
+            subsample[i], filter_dilation[i]) for i in range(len(subsample)))
    else:
        out_shp = tuple(get_conv_shape_1axis(
-            imshp[i], kshp[i], border_mode, subsample[i])
-            for i in range(len(subsample)))
+            imshp[i], kshp[i], border_mode,
+            subsample[i], filter_dilation[i]) for i in range(len(subsample)))
    return (bsize, nkern) + out_shp


-def get_conv_shape_1axis(image_shape, kernel_shape,
-                         border_mode, subsample):
+# `dilation` defaults to 1 (i.e. no dilation) so that existing
+# callers, such as other tests, do not need to pass it.
+def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
+                         subsample, dilation=1):
    """
    This function compute the output shape of convolution operation.

@@ -90,6 +95,8 @@ def get_conv_shape_1axis(image_shape, kernel_shape,
        the padding on the considered axis.
    subsample: int. It must correspond to the subsampling on the
        considered axis.
+    dilation: int. It must correspond to the dilation on the
+        considered axis.

    Returns
    -------
@@ -97,19 +104,22 @@ def get_conv_shape_1axis(image_shape, kernel_shape,
        considered axis. None if undefined.

    """
-    if None in [image_shape, kernel_shape, border_mode, subsample]:
+    if None in [image_shape, kernel_shape, border_mode,
+                subsample, dilation]:
        return None
+    # Implicit dilated kernel shape
+    dil_kernel_shape = (kernel_shape - 1) * dilation + 1
    if border_mode == "half":
-        pad = kernel_shape // 2
+        pad = dil_kernel_shape // 2
    elif border_mode == "full":
-        pad = kernel_shape - 1
+        pad = dil_kernel_shape - 1
    elif border_mode == "valid":
        pad = 0
    else:
        pad = border_mode
        if pad < 0:
            raise ValueError("border_mode must be >= 0")
-    out_shp = (image_shape + 2 * pad - kernel_shape) // subsample + 1
+    out_shp = (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1

    return out_shp

@@ -120,7 +130,8 @@ def conv2d(input,
           filter_shape=None,
           border_mode='valid',
           subsample=(1, 1),
-           filter_flip=True):
+           filter_flip=True,
+           filter_dilation=(1, 1)):
    """This function will build the symbolic graph for convolving a mini-batch of a
    stack of 2D inputs with a set of 2D filters. The implementation is modelled
    after Convolutional Neural Networks (CNN).
@@ -134,7 +145,8 @@ def conv2d(input,
                             kshp=filter_shape,
                             border_mode=border_mode,
                             subsample=subsample,
-                             filter_flip=filter_flip)
+                             filter_flip=filter_flip,
+                             filter_dilation=filter_dilation)
    return conv_op(input, filters)


@@ -144,7 +156,8 @@ def conv2d_grad_wrt_inputs(output_grad,
                           filter_shape=None,
                           border_mode='valid',
                           subsample=(1, 1),
-                           filter_flip=True):
+                           filter_flip=True,
+                           filter_dilation=(1, 1)):
    """Compute conv output gradient w.r.t its inputs

    This function builds the symbolic graph for getting the
@@ -214,6 +227,9 @@ def conv2d_grad_wrt_inputs(output_grad,
        referred to as a convolution, and this is the default. If
        ``False``, the filters are not flipped and the operation is
        referred to as a cross-correlation.
+    filter_dilation : tuple of len 2
+        The filter dilation used in the forward pass.
+        Also known as input striding.

    Returns
    -------
@@ -263,7 +279,8 @@ def conv2d_grad_wrt_inputs(output_grad,
                                              kshp=filter_shape,
                                              border_mode=border_mode,
                                              subsample=subsample,
-                                              filter_flip=filter_flip)
+                                              filter_flip=filter_flip,
+                                              filter_dilation=filter_dilation)

    return grad_input_op(filters, output_grad, input_shape[-2:])

@@ -274,7 +291,8 @@ def conv2d_grad_wrt_weights(input,
                            input_shape=None,
                            border_mode='valid',
                            subsample=(1, 1),
-                            filter_flip=True):
+                            filter_flip=True,
+                            filter_dilation=(1, 1)):
    """Compute conv output gradient w.r.t its weights

    This function will build the symbolic graph for getting the
@@ -327,7 +345,6 @@ def conv2d_grad_wrt_weights(input,
          ``(int1, int2)``
            pad input with a symmetric border of ``int1`` rows and
            ``int2`` columns, then perform a valid convolution.
-
    subsample : tuple of len 2
        The subsampling used in the forward pass of the convolutional
        operation.  Also called strides elsewhere.
@@ -337,6 +354,9 @@ def conv2d_grad_wrt_weights(input,
        referred to as a convolution, and this is the default. If
        ``False``, the filters are not flipped and the operation is
        referred to as a cross-correlation.
+    filter_dilation : tuple of len 2
+        The filter dilation used in the forward pass.
+        Also known as input striding.

    Returns
    -------
@@ -386,7 +406,8 @@ def conv2d_grad_wrt_weights(input,
                                               kshp=numerical_filter_shape,
                                               border_mode=border_mode,
                                               subsample=subsample,
-                                               filter_flip=filter_flip)
+                                               filter_flip=filter_flip,
+                                               filter_dilation=filter_dilation)

    return gradWeight_op(input, output_grad, filter_shape[:-2])

@@ -541,7 +562,8 @@ def bilinear_upsampling(input,
                                               filter_shape=(1, 1, None, 1),
                                               border_mode=(pad, 0),
                                               subsample=(ratio, 1),
-                                               filter_flip=True)
+                                               filter_flip=True,
+                                               filter_dilation=(1, 1))
        # upsampling cols
        upsampled_mat = conv2d_grad_wrt_inputs(output_grad=upsampled_row,
                                               filters=kern[np.newaxis,
@@ -553,7 +575,8 @@ def bilinear_upsampling(input,
                                               filter_shape=(1, 1, 1, None),
                                               border_mode=(0, pad),
                                               subsample=(1, ratio),
-                                               filter_flip=True)
+                                               filter_flip=True,
+                                               filter_dilation=(1, 1))
    else:
        kern = bilinear_kernel_2D(ratio=ratio, normalize=True)
        upsampled_mat = conv2d_grad_wrt_inputs(output_grad=concat_mat,
@@ -565,7 +588,8 @@ def bilinear_upsampling(input,
                                               filter_shape=(1, 1, None, None),
                                               border_mode=(pad, pad),
                                               subsample=(ratio, ratio),
-                                               filter_flip=True)
+                                               filter_flip=True,
+                                               filter_dilation=(1, 1))

    return upsampled_mat.reshape((input.shape[0], input.shape[1],
                                  row * ratio, col * ratio))
@@ -620,14 +644,18 @@ class BaseAbstractConv2d(Op):
        are not flipped and the operation is referred to as a
        cross-correlation.

+    filter_dilation: tuple of len 2
+        Factor by which to subsample (stride) the input.
+        Also called dilation factor.
    """
    check_broadcast = False
-    __props__ = ('border_mode', 'subsample', 'filter_flip', 'imshp', 'kshp')
+    __props__ = ('border_mode', 'subsample', 'filter_flip',
+                 'imshp', 'kshp', 'filter_dilation')

    def __init__(self,
-                 imshp=None, kshp=None,
-                 border_mode="valid", subsample=(1, 1),
-                 filter_flip=True):
+                 imshp=None, kshp=None, border_mode="valid",
+                 subsample=(1, 1), filter_flip=True,
+                 filter_dilation=(1, 1)):

        if isinstance(border_mode, integer_types):
            border_mode = (border_mode, border_mode)
@@ -673,6 +701,9 @@ class BaseAbstractConv2d(Op):
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        self.subsample = tuple(subsample)
+        if len(filter_dilation) != 2:
+            raise ValueError("filter_dilation must have two elements")
+        self.filter_dilation = tuple(filter_dilation)

    def flops(self, inp, outp):
        """ Useful with the hack in profilemode to print the MFlops"""
@@ -694,7 +725,7 @@ class BaseAbstractConv2d(Op):
        # This may change in the future.
        return False

-    def conv2d(self, img, kern, mode="valid"):
+    def conv2d(self, img, kern, mode="valid", dilation=(1, 1)):
        """
        Basic slow python implementatation for DebugMode
        """
@@ -708,8 +739,16 @@ class BaseAbstractConv2d(Op):
                'invalid mode {}, which must be either '
                '"valid" or "full"'.format(mode))

-        out_shape = get_conv_output_shape(img.shape, kern.shape, mode, [1, 1])
+        out_shape = get_conv_output_shape(img.shape, kern.shape,
+                                          mode, [1, 1], dilation)
+
        out = numpy.zeros(out_shape, dtype=img.dtype)
+        dil_kern_shp = kern.shape[:-2] + ((kern.shape[-2] - 1) * dilation[0] + 1,
+                                          (kern.shape[-1] - 1) * dilation[1] + 1)
+        dilated_kern = numpy.zeros(dil_kern_shp, dtype=kern.dtype)
+        dilated_kern[:, :,
+                     ::dilation[0],
+                     ::dilation[1]] = kern
        val = _valfrommode(mode)
        bval = _bvalfromboundary('fill')

@@ -720,7 +759,7 @@ class BaseAbstractConv2d(Op):
                    for im0 in xrange(img.shape[1]):
                        # some cast generates a warning here
                        out[b, n, ...] += _convolve2d(img[b, im0, ...],
-                                                      kern[n, im0, ...],
+                                                      dilated_kern[n, im0, ...],
                                                      1, val, bval, 0)
        return out

@@ -736,10 +775,11 @@ class AbstractConv2d(BaseAbstractConv2d):
                 kshp=None,
                 border_mode="valid",
                 subsample=(1, 1),
-                 filter_flip=True):
-        super(AbstractConv2d, self).__init__(imshp, kshp,
-                                             border_mode, subsample,
-                                             filter_flip)
+                 filter_flip=True,
+                 filter_dilation=(1, 1)):
+        super(AbstractConv2d, self).__init__(imshp, kshp, border_mode,
+                                             subsample, filter_flip,
+                                             filter_dilation)

    def make_node(self, img, kern):
        # Make sure both inputs are Variables with the same Type
@@ -766,6 +806,8 @@ class AbstractConv2d(BaseAbstractConv2d):
        img, kern = inp
        img = numpy.asarray(img)
        kern = numpy.asarray(kern)
+        dil_kernshp = ((kern.shape[2] - 1) * self.filter_dilation[0] + 1,
+                       (kern.shape[3] - 1) * self.filter_dilation[1] + 1)
        o, = out_
        mode = self.border_mode

@@ -777,9 +819,9 @@ class AbstractConv2d(BaseAbstractConv2d):
                ' integers'.format(mode))

        if mode == "full":
-            mode = (kern.shape[2] - 1, kern.shape[3] - 1)
+            mode = (dil_kernshp[0] - 1, dil_kernshp[1] - 1)
        elif mode == "half":
-            mode = (kern.shape[2] // 2, kern.shape[3] // 2)
+            mode = (dil_kernshp[0] // 2, dil_kernshp[1] // 2)
        if isinstance(mode, tuple):
            pad_h, pad_w = map(int, mode)
            mode = "valid"
@@ -790,7 +832,7 @@ class AbstractConv2d(BaseAbstractConv2d):
            img = new_img
        if not self.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
-        conv_out = self.conv2d(img, kern, mode="valid")
+        conv_out = self.conv2d(img, kern, mode="valid", dilation=self.filter_dilation)
        conv_out = conv_out[:, :, ::self.subsample[0], ::self.subsample[1]]

        o[0] = node.outputs[0].type.filter(conv_out)
@@ -812,12 +854,14 @@ class AbstractConv2d(BaseAbstractConv2d):
        d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
                                             self.border_mode,
                                             self.subsample,
-                                             self.filter_flip)(
+                                             self.filter_flip,
+                                             self.filter_dilation)(
            weights, top, bottom.shape[-2:])
        d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
                                               self.border_mode,
                                               self.subsample,
-                                               self.filter_flip)(
+                                               self.filter_flip,
+                                               self.filter_dilation)(

            bottom, top, weights.shape[-2:])

@@ -844,7 +888,7 @@ class AbstractConv2d(BaseAbstractConv2d):
            kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
                    for i in range(4)]
        res = get_conv_output_shape(imshp, kshp, self.border_mode,
-                                    self.subsample)
+                                    self.subsample, self.filter_dilation)
        return [res]


@@ -863,11 +907,13 @@ class AbstractConv2d_gradWeights(BaseAbstractConv2d):
                 kshp=None,
                 border_mode="valid",
                 subsample=(1, 1),
-                 filter_flip=True):
+                 filter_flip=True,
+                 filter_dilation=(1, 1)):
        super(AbstractConv2d_gradWeights, self).__init__(imshp, kshp,
                                                         border_mode,
                                                         subsample,
-                                                         filter_flip)
+                                                         filter_flip,
+                                                         filter_dilation)

    # Update shape/height_width
    def make_node(self, img, topgrad, shape):
@@ -943,15 +989,16 @@ class AbstractConv2d_gradWeights(BaseAbstractConv2d):
        d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
                                             self.border_mode,
                                             self.subsample,
-                                             self.filter_flip)(
-                                                 weights,
-                                                 top,
-                                                 bottom.shape[-2:])
+                                             self.filter_flip,
+                                             self.filter_dilation)(weights,
+                                                                   top,
+                                                                   bottom.shape[-2:])
        d_top = AbstractConv2d(self.imshp,
                               self.kshp,
                               self.border_mode,
                               self.subsample,
-                               self.filter_flip)(bottom, weights)
+                               self.filter_flip,
+                               self.filter_dilation)(bottom, weights)
        # Make sure that the broadcastable pattern of the inputs is used
        # for the gradients, even if the grad opts are not able to infer
        # that the dimensions are broadcastable.
@@ -998,11 +1045,13 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
                 kshp=None,
                 border_mode="valid",
                 subsample=(1, 1),
-                 filter_flip=True):
+                 filter_flip=True,
+                 filter_dilation=(1, 1)):
        super(AbstractConv2d_gradInputs, self).__init__(imshp, kshp,
                                                        border_mode,
                                                        subsample,
-                                                        filter_flip)
+                                                        filter_flip,
+                                                        filter_dilation)

    # Update shape/height_width
    def make_node(self, kern, topgrad, shape):
@@ -1070,12 +1119,15 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
        bottom, = grads
        d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
                                               self.border_mode,
-                                               self.subsample)(
-                                                   bottom, top,
-                                                   weights.shape[-2:])
+                                               self.subsample,
+                                               self.filter_flip,
+                                               self.filter_dilation)(bottom, top,
+                                                                     weights.shape[-2:])
        d_top = AbstractConv2d(self.imshp, self.kshp,
-                               self.border_mode, self.subsample)(
-                                   bottom, weights)
+                               self.border_mode,
+                               self.subsample,
+                               self.filter_flip,
+                               self.filter_dilation)(bottom, weights)
        # Make sure that the broadcastable pattern of the inputs is used
        # for the gradients, even if the grad opts are not able to infer
        # that the dimensions are broadcastable.

--- a/theano/tensor/nnet/corr.py
+++ b/theano/tensor/nnet/corr.py
@@ -27,12 +27,14 @@ class BaseCorrMM(gof.Op):
        or a pair of integers
    subsample
        Perform subsampling of the output (default: (1, 1)).
-
+    filter_dilation
+        Perform dilated correlation (default: (1, 1)).
    """
    check_broadcast = False
-    __props__ = ('border_mode', 'subsample')
+    __props__ = ('border_mode', 'subsample', 'filter_dilation')

-    def __init__(self, border_mode="valid", subsample=(1, 1)):
+    def __init__(self, border_mode="valid", subsample=(1, 1),
+                 filter_dilation=(1, 1)):
        if isinstance(border_mode, integer_types):
            if border_mode < 0:
                raise ValueError(
@@ -55,7 +57,10 @@ class BaseCorrMM(gof.Op):
        self.border_mode = border_mode
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
+        if len(filter_dilation) != 2:
+            raise ValueError("filter_dilation must have two elements")
        self.subsample = tuple(subsample)
+        self.filter_dilation = tuple(filter_dilation)

    @property
    def pad(self):
@@ -64,10 +69,11 @@ class BaseCorrMM(gof.Op):
        return (0, 0)

    def __str__(self):
-        return '%s{%s, %s}' % (
+        return '%s{%s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
-            str(self.subsample))
+            str(self.subsample),
+            str(self.filter_dilation))

    def c_support_code(self):
        return blas_header_text()
@@ -89,7 +95,7 @@ class BaseCorrMM(gof.Op):

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (1, 1)
+        return (1, 2)

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
@@ -155,6 +161,7 @@ class BaseCorrMM(gof.Op):
        if not theano.config.blas.ldflags:
            raise NotImplementedError("C code for CorrMM* classes need a blas library.")
        dH, dW = self.subsample
+        dilH, dilW = self.filter_dilation
        if self.border_mode == "half":
            padH = padW = -1
        elif self.border_mode == "full":
@@ -201,6 +208,8 @@ class BaseCorrMM(gof.Op):
    // Optional args
    int dH = %(dH)s;
    int dW = %(dW)s;
+    int dilH = %(dilH)s;
+    int dilW = %(dilW)s;
    int padH = %(padH)s;
    int padW = %(padW)s;

@@ -224,39 +233,43 @@ class BaseCorrMM(gof.Op):
        }
        else if (padH == -2) {
            // vertical full padding, we can infer the kernel height
-            kH = 2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH;
+            kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1;
        }
        else {
            // explicit padding, we can infer the kernel height
-            kH = PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH;
+            kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
        }
        if ((dW != 1) || (padW == -1)) {
            kW = %(width)s;
        }
        else if (padW == -2) {
-            kW = 2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW;
+            kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        else {
-            kW = PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW;
+            kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
    }

+    // Implicit dilated kernel size
+    int dil_kH = (kH - 1) * dilH + 1;
+    int dil_kW = (kW - 1) * dilW + 1;
+
    // Auto-padding if requested
    if (padH == -1) {  // vertical half padding
-        padH = kH / 2;
+        padH = dil_kH / 2;
    }
    else if (padH == -2) {  // vertical full padding
-        padH = kH - 1;
+        padH = dil_kH - 1;
    }
    else if (padH < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padH must be >= -2");
        %(fail)s
    }
    if (padW == -1) {  // horizontal half padding
-        padW = kW / 2;
+        padW = dil_kW / 2;
    }
    else if (padW == -2) {  // horizontal full padding
-        padW = kW - 1;
+        padW = dil_kW - 1;
    }
    else if (padW < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padW must be >= -2");
@@ -268,15 +281,15 @@ class BaseCorrMM(gof.Op):
    switch(direction) {
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width)
-        // height and width: top = (bottom + 2*pad - weight) / sample + 1
+        // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
-        out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - PyArray_DIMS(weights)[2]) / dH + 1);
-        out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - PyArray_DIMS(weights)[3]) / dW + 1);
+        out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
+        out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
        break;
    case 1:  // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width)
-        // height and width: weights = bottom + 2*pad - (top - 1) * sample
+        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
        out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1];
        out_dim[2] = (npy_intp)kH;  // already inferred further above
@@ -284,11 +297,11 @@ class BaseCorrMM(gof.Op):
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width)
-        // height and width: bottom = (top - 1) * sample + weights - 2*pad
+        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
-        out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + PyArray_DIMS(weights)[2] - 2*padH);
-        out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + PyArray_DIMS(weights)[3] - 2*padW);
+        out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
+        out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n");
@@ -326,7 +339,7 @@ class BaseCorrMM(gof.Op):
    }

    // Call corrMM code
-    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, padH, padW);
+    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW);
    if (out2==NULL){
       %(fail)s
    }
@@ -357,10 +370,15 @@ class CorrMM(BaseCorrMM):
        `(sv, sh)` is equivalent to `CorrMM(...)(...)[:,:,::sv, ::sh]`,
        but faster.
        Set to `(1, 1)` to disable subsampling.
+    filter_dilation
+        Dilation factor of the filters along each spatial axis, i.e. the
+        spacing between filter elements. Should be a tuple with 2 elements.
+        Set to `(1, 1)` to disable filter dilation.

    """
-    def __init__(self, border_mode="valid", subsample=(1, 1)):
-        super(CorrMM, self).__init__(border_mode, subsample)
+    def __init__(self, border_mode="valid", subsample=(1, 1),
+                 filter_dilation=(1, 1)):
+        super(CorrMM, self).__init__(border_mode, subsample, filter_dilation)

    def make_node(self, img, kern):
        img = as_tensor_variable(img)
@@ -382,7 +400,8 @@ class CorrMM(BaseCorrMM):
            imshp,
            kshp,
            self.border_mode,
-            self.subsample)
+            self.subsample,
+            self.filter_dilation)
        return [res]

    def c_code(self, node, nodename, inp, out_, sub):
@@ -395,11 +414,13 @@ class CorrMM(BaseCorrMM):
        bottom, weights = inp
        top, = grads
        d_bottom = CorrMM_gradInputs(self.border_mode,
-                                     self.subsample)(weights, top,
-                                                     bottom.shape[-2:])
+                                     self.subsample,
+                                     self.filter_dilation)(weights, top,
+                                                           bottom.shape[-2:])
        d_weights = CorrMM_gradWeights(self.border_mode,
-                                       self.subsample)(bottom, top,
-                                                       weights.shape[-2:])
+                                       self.subsample,
+                                       self.filter_dilation)(bottom, top,
+                                                             weights.shape[-2:])
        return d_bottom, d_weights


@@ -415,8 +436,11 @@ class CorrMM_gradWeights(BaseCorrMM):

    """

-    def __init__(self, border_mode="valid", subsample=(1, 1)):
-        super(CorrMM_gradWeights, self).__init__(border_mode, subsample)
+    def __init__(self, border_mode="valid", subsample=(1, 1),
+                 filter_dilation=(1, 1)):
+        super(CorrMM_gradWeights, self).__init__(border_mode,
+                                                 subsample,
+                                                 filter_dilation)

    def make_node(self, img, topgrad, shape=None):
        img = as_tensor_variable(img)
@@ -485,10 +509,12 @@ class CorrMM_gradWeights(BaseCorrMM):
        bottom, top = inp[:2]
        weights, = grads
        d_bottom = CorrMM_gradInputs(self.border_mode,
-                                     self.subsample)(weights, top,
-                                                     bottom.shape[-2:])
+                                     self.subsample,
+                                     self.filter_dilation)(weights, top,
+                                                           bottom.shape[-2:])
        d_top = CorrMM(self.border_mode,
-                       self.subsample)(bottom, weights)
+                       self.subsample,
+                       self.filter_dilation)(bottom, weights)
        d_height_width = ((theano.gradient.DisconnectedType()(),) * 2
                          if len(inp) == 4 else ())
        return (d_bottom, d_top) + d_height_width
@@ -512,8 +538,10 @@ class CorrMM_gradInputs(BaseCorrMM):

    """

-    def __init__(self, border_mode="valid", subsample=(1, 1)):
-        super(CorrMM_gradInputs, self).__init__(border_mode, subsample)
+    def __init__(self, border_mode="valid", subsample=(1, 1), filter_dilation=(1, 1)):
+        super(CorrMM_gradInputs, self).__init__(border_mode,
+                                                subsample,
+                                                filter_dilation)

    def make_node(self, kern, topgrad, shape=None):
        kern = as_tensor_variable(kern)
@@ -586,11 +614,13 @@ class CorrMM_gradInputs(BaseCorrMM):
        weights, top = inp[:2]
        bottom, = grads
        d_weights = CorrMM_gradWeights(self.border_mode,
-                                       self.subsample)(bottom,
-                                                       top,
-                                                       weights.shape[-2:])
+                                       self.subsample,
+                                       self.filter_dilation)(bottom,
+                                                             top,
+                                                             weights.shape[-2:])
        d_top = CorrMM(self.border_mode,
-                       self.subsample)(bottom, weights)
+                       self.subsample,
+                       self.filter_dilation)(bottom, weights)
        d_height_width = ((theano.gradient.DisconnectedType()(),) *
                          2 if len(inp) == 4 else ())
        return (d_weights, d_top) + d_height_width

--- a/theano/tensor/nnet/corr_gemm.c
+++ b/theano/tensor/nnet/corr_gemm.c
@@ -6,13 +6,13 @@ Copyright (c) 2014, The Regents of the University of California (Regents)
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met: 
+modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer. 
+   list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution. 
+   and/or other materials provided with the distribution.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
@@ -31,20 +31,24 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // Loops for fast unfold + copy
 void im2col(const %(float_type)s* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    %(float_type)s* data_col) {
-  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  // Implicit dilated kernel size
+  int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
+  int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
+  int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  int channels_col = channels * kernel_h * kernel_w;
  for (int c = 0; c < channels_col; ++c) {
    int w_offset = c %% kernel_w;
    int h_offset = (c / kernel_w) %% kernel_h;
    int c_im = c / kernel_h / kernel_w;
    for (int h = 0; h < height_col; ++h) {
+      int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
      for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride_h - pad_h + h_offset;
-        int w_pad = w * stride_w - pad_w + w_offset;
+        int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
          data_col[(npy_intp)(c * height_col + h) * width_col + w] =
            data_im[(npy_intp)(c_im * height + h_pad) * width + w_pad];
@@ -60,10 +64,14 @@ void im2col(const %(float_type)s* data_im, const int channels,
 // accumulated into data_im.
 void col2im(const %(float_type)s* data_col, const int channels,
    const int height, const int width, const int patch_h, const int patch_w,
+    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, %(float_type)s* data_im) {
-  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+  // Implicit dilated patch size
+  int dil_patch_h = (patch_h - 1) * dilation_h + 1;
+  int dil_patch_w = (patch_w - 1) * dilation_w + 1;
+  int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  int num_kernels = channels * height * width;
  int channels_col = channels * patch_h * patch_w;
  for (int c = 0; c < channels_col; ++c) {
@@ -71,9 +79,9 @@ void col2im(const %(float_type)s* data_col, const int channels,
    int h_offset = (c / patch_w) %% patch_h;
    int c_im = c / patch_h / patch_w;
    for (int h = 0; h < height_col; ++h) {
+      int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
      for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride_h - pad_h + h_offset;
-        int w_pad = w * stride_w - pad_w + w_offset;
+        int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
          data_im[(npy_intp)(c_im * height + h_pad) * width + w_pad] +=
            data_col[(npy_intp)(c * height_col + h) * width_col + w];
@@ -91,13 +99,15 @@ void col2im(const %(float_type)s* data_col, const int channels,
 // CPU version author: Jesse Livezey
 // CPU version adapted from GPU version
 PyArrayObject* corrMM(PyArrayObject* bottom,
-                           PyArrayObject* weight,
-                           PyArrayObject* top,
-                           const int direction,
-                           const int dH = 1,
-                           const int dW = 1,
-                           const int padH = 0,
-                           const int padW = 0)
+                      PyArrayObject* weight,
+                      PyArrayObject* top,
+                      const int direction,
+                      const int dH = 1,
+                      const int dW = 1,
+                      const int dilH = 1,
+                      const int dilW = 1,
+                      const int padH = 0,
+                      const int padW = 0)
 {
    if (PyArray_NDIM(bottom) != 4)
    {
@@ -109,7 +119,7 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
        PyErr_SetString(PyExc_ValueError, "CorrMM received bottom with wrong type.");
        return NULL;
    }
-    
+
    if (PyArray_NDIM(weight) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "CorrMM requires weight of 4D");
@@ -151,9 +161,12 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
                "CorrMM images and kernel must have the same stack size\n");
        return NULL;
    }
+    // implicit dilated filter size
+    const int dil_kH = (kH - 1) * dilH + 1;
+    const int dil_kW = (kW - 1) * dilW + 1;
    // top: (batchSize, nFilters, topHeight, topWidth)
-    const int topHeight = (bottomHeight + 2*padH - kH) / dH + 1;
-    const int topWidth  = (bottomWidth + 2*padW - kW) / dW + 1;
+    const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
+    const int topWidth  = (bottomWidth + 2*padW - dil_kW) / dW + 1;
    if (batchSize != PyArray_DIMS(top)[0] ||
            nFilters != PyArray_DIMS(top)[1] ||
            topHeight != PyArray_DIMS(top)[2] ||
@@ -176,9 +189,9 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
    col_dim[0] = (npy_intp)(nChannels * kW * kH);
    col_dim[1] = (npy_intp)(topHeight * topWidth);
    PyArrayObject* col = (PyArrayObject*)PyArray_EMPTY(2,
-		                           col_dim,
-                                           PyArray_TYPE(top),
-					   0);
+                                                       col_dim,
+                                                       PyArray_TYPE(top),
+                                                       0);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
@@ -206,7 +219,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
        for (int n = 0; n < batchSize; n++) {
            // First, im2col
            im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight,
-                    bottomWidth, kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
+                   bottomWidth, kH, kW, dilH, dilW,
+                   padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
            // Second, gemm
            %(gemm)s(&NTrans, &NTrans,
                   &N_, &M_, &K_,
@@ -255,7 +269,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
        for (int n = 0; n < batchSize; n++) {
            // First, im2col
            im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight,
-                    bottomWidth, kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
+                   bottomWidth, kH, kW, dilH, dilW,
+                   padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
@@ -299,7 +314,7 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
-	// bottom is set to zero here rather than inside of col2im
+        // bottom is set to zero here rather than inside of col2im
        PyArray_FILLWBYTE(bottom, 0);
        // full convolution: gemm, then col2im
        // Iterate over batch
@@ -314,7 +329,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
                   (%(float_type)s*)PyArray_DATA(col), &N_);
            // col2im back to the data
            col2im((%(float_type)s*)PyArray_DATA(col), nChannels, bottomHeight, bottomWidth,
-                    kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride);
+                   kH, kW, dilH, dilW, padH, padW,
+                   dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride);
        }
        /*
        // Original caffe code for comparison

--- a/theano/tensor/nnet/opt.py
+++ b/theano/tensor/nnet/opt.py
@@ -79,7 +79,8 @@ def local_abstractconv_gemm(node):
    if node.op.filter_flip:
        kern = kern[:, :, ::-1, ::-1]
    rval = CorrMM(border_mode=node.op.border_mode,
-                  subsample=node.op.subsample)(img, kern)
+                  subsample=node.op.subsample,
+                  filter_dilation=node.op.filter_dilation)(img, kern)
    copy_stack_trace(node.outputs[0], rval)

    return [rval]
@@ -97,7 +98,8 @@ def local_abstractconv_gradweight_gemm(node):
        return None

    rval = CorrMM_gradWeights(border_mode=node.op.border_mode,
-                              subsample=node.op.subsample)(img, topgrad, shape)
+                              subsample=node.op.subsample,
+                              filter_dilation=node.op.filter_dilation)(img, topgrad, shape)
    copy_stack_trace(node.outputs[0], rval)

    # need to flip the kernel if necessary
@@ -124,8 +126,9 @@ def local_abstractconv_gradinputs_gemm(node):
    if node.op.filter_flip:
        kern = kern[:, :, ::-1, ::-1]
    rval = CorrMM_gradInputs(border_mode=node.op.border_mode,
-                             subsample=node.op.subsample)(kern, topgrad,
-                                                          shape)
+                             subsample=node.op.subsample,
+                             filter_dilation=node.op.filter_dilation)(kern, topgrad,
+                                                                      shape)
    copy_stack_trace(node.outputs[0], rval)

    return [rval]
@@ -221,7 +224,9 @@ def local_conv2d_gradweight_cpu(node):
    assert len(op_imshp) == 4 and len(op_kshp) == 4

    outshp = get_conv_output_shape(op_imshp, op_kshp,
-                                   node.op.border_mode, node.op.subsample)[2:]
+                                   node.op.border_mode,
+                                   node.op.subsample,
+                                   node.op.filter_dilation)[2:]
    fulloutshp = get_conv_output_shape(op_imshp, op_kshp,
                                       node.op.border_mode, (1, 1))[2:]

@@ -334,7 +339,9 @@ def local_conv2d_gradinputs_cpu(node):
    filters = filters[:, :, ::-1, ::-1]

    outshp = get_conv_output_shape(op_imshp, op_kshp,
-                                   node.op.border_mode, node.op.subsample)[2:]
+                                   node.op.border_mode,
+                                   node.op.subsample,
+                                   node.op.filter_dilation)[2:]
    fulloutshp = get_conv_output_shape(op_imshp, op_kshp,
                                       node.op.border_mode, (1, 1))[2:]


--- a/theano/tensor/nnet/tests/test_abstract_conv.py
+++ b/theano/tensor/nnet/tests/test_abstract_conv.py
@@ -4,7 +4,6 @@ import numpy
 import numpy as np
 from nose.plugins.skip import SkipTest
 from nose.tools import assert_raises
-
 import theano
 from theano import tensor
 from theano.gof.opt import check_stack_trace
@@ -24,46 +23,57 @@ from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
 from theano.tensor.nnet.ConvTransp3D import ConvTransp3D


-def conv_corr(inputs, filters, border_mode="valid", subsample=(1, 1),
-              conv_mode='conv'):
+def conv_corr(inputs, filters, border_mode="valid",
+              subsample=(1, 1), conv_mode='conv',
+              filter_dilation=(1, 1)):
    if conv_mode == 'conv':
        filters = filters[:, :, ::-1, ::-1]
-    return corr.CorrMM(border_mode, subsample)(inputs, filters)
-
-
-def conv_corr_gw(inputs, topgrad, filters_shape, border_mode="valid",
-                 subsample=(1, 1), conv_mode='conv'):
-    rval = corr.CorrMM_gradWeights(border_mode, subsample)(inputs, topgrad,
-                                                           filters_shape[2:])
+    return corr.CorrMM(border_mode,
+                       subsample,
+                       filter_dilation)(inputs, filters)
+
+
+def conv_corr_gw(inputs, topgrad, filters_shape,
+                 border_mode="valid", subsample=(1, 1),
+                 conv_mode='conv', filter_dilation=(1, 1)):
+    rval = corr.CorrMM_gradWeights(border_mode,
+                                   subsample,
+                                   filter_dilation)(inputs, topgrad,
+                                                    filters_shape[2:])
    if conv_mode == 'conv':
        rval = rval[:, :, ::-1, ::-1]
    return rval


-def conv_corr_gi(filters, topgrad, inputs_shape, border_mode="valid",
-                 subsample=(1, 1), conv_mode='conv'):
+def conv_corr_gi(filters, topgrad, inputs_shape,
+                 border_mode="valid", subsample=(1, 1),
+                 conv_mode='conv', filter_dilation=(1, 1)):
    if conv_mode == 'conv':
        filters = filters[:, :, ::-1, ::-1]
-    return corr.CorrMM_gradInputs(border_mode, subsample)(filters, topgrad,
-                                                          inputs_shape[2:])
+    return corr.CorrMM_gradInputs(border_mode,
+                                  subsample,
+                                  filter_dilation)(filters,
+                                                   topgrad,
+                                                   inputs_shape[2:])


 class TestGetConvOutShape(unittest.TestCase):
    def test_basic(self):
-        image_shape, kernel_shape = (3, 2, 8, 9), (4, 2, 5, 6)
+        image_shape, kernel_shape = (3, 2, 12, 9), (4, 2, 5, 6)
        sub_sample = (1, 2)
+        filter_dilation = (2, 1)
        test1_params = get_conv_output_shape(
-            image_shape, kernel_shape, 'valid', sub_sample)
+            image_shape, kernel_shape, 'valid', sub_sample, filter_dilation)
        test2_params = get_conv_output_shape(
-            image_shape, kernel_shape, 'half', sub_sample)
+            image_shape, kernel_shape, 'half', sub_sample, filter_dilation)
        test3_params = get_conv_output_shape(
-            image_shape, kernel_shape, 'full', sub_sample)
+            image_shape, kernel_shape, 'full', sub_sample, filter_dilation)
        test4_params = get_conv_output_shape(
-            image_shape, kernel_shape, (1, 2), sub_sample)
+            image_shape, kernel_shape, (1, 2), sub_sample, filter_dilation)

        self.assertTrue(test1_params == (3, 4, 4, 2))
-        self.assertTrue(test2_params == (3, 4, 8, 5))
-        self.assertTrue(test3_params == (3, 4, 12, 7))
+        self.assertTrue(test2_params == (3, 4, 12, 5))
+        self.assertTrue(test3_params == (3, 4, 20, 7))
        self.assertTrue(test4_params == (3, 4, 6, 4))


@@ -71,35 +81,41 @@ class BaseTestConv2d(unittest.TestCase):
    def setUp(self):
        if theano.config.blas.ldflags == '':
            raise SkipTest("BLAS required for reference")
-        self.inputs_shapes = [(8, 1, 12, 12), (8, 1, 18, 18), (2, 1, 4, 4),
+        self.inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7),
                              (6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
        self.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
-                               (1, 1, 2, 5), (4, 1, 2, 2), (4, 5, 2, 2)]
+                               (1, 1, 2, 3), (4, 1, 1, 3), (4, 5, 3, 2)]
        self.subsamples = [(1, 1), (2, 2), (2, 4)]
+        self.filters_dilations = [(1, 1), (1, 2), (2, 1)]
        self.border_modes = ["valid", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
        self.filter_flip = [True, False]
        self.provide_shape = [True, False]
        self.shared = theano.compile.shared

-    def get_output_shape(self, inputs_shape, filters_shape, subsample,
-                         border_mode):
+    def get_output_shape(self, inputs_shape, filters_shape,
+                         subsample, border_mode, filter_dilation):
+        dil_filters = ((filters_shape[2] - 1) * filter_dilation[0] + 1,
+                       (filters_shape[3] - 1) * filter_dilation[1] + 1)
        if border_mode == "valid":
            border_mode = (0, 0)
        if border_mode == "full":
-            border_mode = (filters_shape[2] - 1, filters_shape[3] - 1)
+            border_mode = (dil_filters[0] - 1,
+                           dil_filters[1] - 1)
        batch_size = inputs_shape[0]
        num_filters = filters_shape[0]
        return ((batch_size, num_filters,) +
                tuple(None if i is None or k is None
-                      else ((i + 2 * pad - k) // d + 1)
-                      for i, k, d, pad in zip(inputs_shape[2:],
-                                              filters_shape[2:],
-                                              subsample, border_mode)))
+                      else ((i + 2 * pad - ((k - 1) * fd + 1)) // d + 1)
+                      for i, k, d, pad, fd in zip(inputs_shape[2:],
+                                                  filters_shape[2:],
+                                                  subsample, border_mode,
+                                                  filter_dilation)))

    def run_fwd(self, inputs_shape, filters_shape, ref=conv_corr,
                subsample=(1, 1), verify_grad=True, mode=None,
-                border_mode='valid', filter_flip=True, provide_shape=False,
-                target_op=None, check_trace=False):
+                border_mode='valid', filter_flip=True,
+                provide_shape=False, target_op=None,
+                check_trace=False, filter_dilation=(1, 1)):
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

@@ -120,13 +136,15 @@ class BaseTestConv2d(unittest.TestCase):
        c_ref = ref(inputs, filters,
                    border_mode=border_mode,
                    subsample=subsample,
-                    conv_mode=conv_mode)
+                    conv_mode=conv_mode,
+                    filter_dilation=filter_dilation)
        c = conv.conv2d(inputs, filters,
                        border_mode=border_mode,
                        subsample=subsample,
                        filter_flip=filter_flip,
                        input_shape=imshp,
-                        filter_shape=kshp)
+                        filter_shape=kshp,
+                        filter_dilation=filter_dilation)

        f_ref = theano.function([], c_ref, mode='FAST_RUN')
        f = theano.function([], c, mode=mode)
@@ -143,15 +161,17 @@ class BaseTestConv2d(unittest.TestCase):
        if verify_grad:
            utt.verify_grad(conv.AbstractConv2d(border_mode=border_mode,
                                                imshp=imshp, kshp=kshp,
-                                                subsample=subsample),
+                                                subsample=subsample,
+                                                filter_dilation=filter_dilation),
                            [inputs_val, filters_val],
                            mode=mode)

    def run_gradweight(self, inputs_shape, filters_shape, output_shape,
-                       ref=conv_corr_gw, subsample=(1, 1), filter_flip=True,
-                       verify_grad=True, mode=None, border_mode='valid',
-                       provide_shape=False, target_op=None, check_trace=False):
-
+                       ref=conv_corr_gw, subsample=(1, 1),
+                       filter_flip=True, verify_grad=True, mode=None,
+                       border_mode='valid', provide_shape=False,
+                       target_op=None, check_trace=False,
+                       filter_dilation=(1, 1)):
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        output_val = numpy.random.random(output_shape).astype('float32')

@@ -171,13 +191,15 @@ class BaseTestConv2d(unittest.TestCase):
        c = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
                                            filter_flip=filter_flip,
                                            subsample=subsample,
-                                            imshp=imshp, kshp=kshp)
+                                            imshp=imshp, kshp=kshp,
+                                            filter_dilation=filter_dilation)
        c = c(inputs, output, filters_shape[-2:])
        c_ref = ref(inputs, output,
                    filters_shape,
                    border_mode=border_mode,
                    subsample=subsample,
-                    conv_mode=conv_mode)
+                    conv_mode=conv_mode,
+                    filter_dilation=filter_dilation)
        f = theano.function([], c, mode=mode)
        f_ref = theano.function([], c_ref, mode='FAST_RUN')

@@ -193,7 +215,8 @@ class BaseTestConv2d(unittest.TestCase):

        def abstract_conv2d_gradweight(inputs_val, output_val):
            conv_op = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
-                                                      subsample=subsample)
+                                                      subsample=subsample,
+                                                      filter_dilation=filter_dilation)
            return conv_op(inputs_val, output_val, filters_shape[-2:])

        if verify_grad:
@@ -204,8 +227,8 @@ class BaseTestConv2d(unittest.TestCase):
    def run_gradinput(self, inputs_shape, filters_shape, output_shape,
                      ref=conv_corr_gi, subsample=(1, 1), filter_flip=True,
                      verify_grad=True, mode=None, border_mode='valid',
-                      provide_shape=False, target_op=None, check_trace=False):
-
+                      provide_shape=False, target_op=None,
+                      check_trace=False, filter_dilation=(1, 1)):
        output_val = numpy.random.random(output_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        output = self.shared(output_val)
@@ -224,11 +247,12 @@ class BaseTestConv2d(unittest.TestCase):
        c = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
                                           subsample=subsample,
                                           filter_flip=filter_flip,
-                                           imshp=imshp, kshp=kshp)
+                                           imshp=imshp, kshp=kshp,
+                                           filter_dilation=filter_dilation)
        c = c(filters, output, inputs_shape[-2:])
        c_ref = ref(filters, output, inputs_shape,
                    border_mode=border_mode, subsample=subsample,
-                    conv_mode=conv_mode)
+                    conv_mode=conv_mode, filter_dilation=filter_dilation)
        f = theano.function([], c, mode=mode)
        f_ref = theano.function([], c_ref, mode='FAST_RUN')

@@ -244,7 +268,8 @@ class BaseTestConv2d(unittest.TestCase):

        def abstract_conv2d_gradinputs(filters_val, output_val):
            conv_op = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
-                                                     subsample=subsample)
+                                                     subsample=subsample,
+                                                     filter_dilation=filter_dilation)
            return conv_op(filters_val, output_val, inputs_shape[-2:])

        if verify_grad:
@@ -266,15 +291,18 @@ class BaseTestConv2d(unittest.TestCase):
                    self.tcase(i, f, ds, db, dflip, provide_shape)
                except SkipTest as e:
                    skipped = e
-            for s in self.subsamples:
-                for b in self.border_modes:
-                    try:
-                        self.tcase(i, f, s, db, dflip, dprovide_shape)
-                    except SkipTest as e:
-                        skipped = e
+            for fd in self.filters_dilations:
+                for s in self.subsamples:
+                    for b in self.border_modes:
+                        try:
+                            self.tcase(i, f, s, db, dflip,
+                                       dprovide_shape, fd)
+                        except SkipTest as e:
+                            skipped = e
            for flip in self.filter_flip:
                try:
-                    self.tcase(i, f, ds, db, flip, dprovide_shape)
+                    self.tcase(i, f, ds, db, flip,
+                               dprovide_shape)
                except SkipTest as e:
                    skipped = e
        if skipped:
@@ -287,26 +315,27 @@ class TestCorrConv2d(BaseTestConv2d):
            raise SkipTest()
        return super(TestCorrConv2d, self).setUp()

-    def tcase(self, i, f, s, b, flip, provide_shape):
-        o = self.get_output_shape(i, f, s, b)
+    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
+        o = self.get_output_shape(i, f, s, b, fd)
        if (not theano.config.blas.ldflags or
                not theano.config.cxx or
                theano.config.mode == "FAST_COMPILE"):
            raise SkipTest("Need blas to test conv2d")
        self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                     verify_grad=True, provide_shape=provide_shape,
-                     border_mode=b, filter_flip=flip, target_op=CorrMM,
-                     check_trace=True)
+                     border_mode=b, filter_flip=flip,
+                     target_op=CorrMM, check_trace=True,
+                     filter_dilation=fd)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s, verify_grad=True,
                            provide_shape=provide_shape, border_mode=b,
                            filter_flip=flip, target_op=CorrMM_gradWeights,
-                            check_trace=True)
+                            check_trace=True, filter_dilation=fd)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s, verify_grad=True,
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip, target_op=CorrMM_gradInputs,
-                           check_trace=True)
+                           check_trace=True, filter_dilation=fd)


 class TestCpuConv2d(BaseTestConv2d):
@@ -319,9 +348,11 @@ class TestCpuConv2d(BaseTestConv2d):
    def tearDown(self):
        theano.config.on_opt_error = self.opt_err

-    def tcase(self, i, f, s, b, flip, provide_shape):
+    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
+        if fd != (1, 1):
+            raise SkipTest("No dilation implementation for basic cpu ConvOp.")
        mode = self.mode
-        o = self.get_output_shape(i, f, s, b)
+        o = self.get_output_shape(i, f, s, b, fd)
        fwd_OK = True
        gradweight_OK = True
        gradinput_OK = True
@@ -347,11 +378,12 @@ class TestCpuConv2d(BaseTestConv2d):
        if fwd_OK:
            if not theano.config.blas.ldflags:
                raise SkipTest("Need blas to test conv2d")
-            self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
-                         verify_grad=(gradweight_OK and gradinput_OK),
+            self.run_fwd(inputs_shape=i, filters_shape=f,
+                         subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
                         mode=mode, provide_shape=provide_shape,
                         border_mode=b, filter_flip=flip, target_op=ConvOp,
-                         check_trace=True)
+                         check_trace=True, filter_dilation=fd)
+
        else:
            self.assertRaises(AssertionError,
                              self.run_fwd,
@@ -363,7 +395,8 @@ class TestCpuConv2d(BaseTestConv2d):
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip,
-                              check_trace=True)
+                              check_trace=True,
+                              filter_dilation=fd)

        if gradweight_OK:
            if not theano.config.blas.ldflags:
@@ -374,7 +407,8 @@ class TestCpuConv2d(BaseTestConv2d):
                                provide_shape=provide_shape, border_mode=b,
                                filter_flip=flip,
                                target_op=(ConvOp, ConvGrad3D),
-                                check_trace=True)
+                                check_trace=True,
+                                filter_dilation=fd)
        else:
            self.assertRaises(AssertionError,
                              self.run_gradweight,
@@ -387,7 +421,8 @@ class TestCpuConv2d(BaseTestConv2d):
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip,
-                              check_trace=True)
+                              check_trace=True,
+                              filter_dilation=fd)

        if gradinput_OK:
            if not theano.config.blas.ldflags:
@@ -398,7 +433,8 @@ class TestCpuConv2d(BaseTestConv2d):
                               provide_shape=provide_shape, border_mode=b,
                               filter_flip=flip,
                               target_op=(ConvOp, ConvTransp3D),
-                               check_trace=True)
+                               check_trace=True,
+                               filter_dilation=fd)
        else:
            self.assertRaises(AssertionError,
                              self.run_gradinput,
@@ -411,7 +447,8 @@ class TestCpuConv2d(BaseTestConv2d):
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip,
-                              check_trace=True)
+                              check_trace=True,
+                              filter_dilation=fd)


 def test_constant_shapes():

--- a/theano/tensor/nnet/tests/test_corr.py
+++ b/theano/tensor/nnet/tests/test_corr.py
@@ -32,8 +32,8 @@ class TestCorr2D(utt.InferShapeTester):

    def validate(self, image_shape, filter_shape,
                 border_mode='valid', subsample=(1, 1),
-                 input=None, filters=None,
-                 verify_grad=True, non_contiguous=False):
+                 input=None, filters=None, verify_grad=True,
+                 non_contiguous=False, filter_dilation=(1, 1)):
        """
        :param image_shape: The constant shape info passed to corrMM.
        :param filter_shape: The constant shape info passed to corrMM.
@@ -55,7 +55,8 @@ class TestCorr2D(utt.InferShapeTester):
            # define theano graph and function
            input.name = 'input'
            filters.name = 'filters'
-            rval = corr.CorrMM(border_mode, subsample)(input, filters)
+            rval = corr.CorrMM(border_mode, subsample,
+                               filter_dilation)(input, filters)
            rval.name = 'corr_output'
            return rval

@@ -86,20 +87,22 @@ class TestCorr2D(utt.InferShapeTester):
        orig_image_data = image_data
        img_shape2d = numpy.array(N_image_shape[-2:])
        fil_shape2d = numpy.array(N_filter_shape[-2:])
+        dil_shape2d = numpy.array(filter_dilation)
+        dil_fil_shape2d = (fil_shape2d - 1) * dil_shape2d + 1
        subsample2d = numpy.array(subsample)
        if border_mode == 'full':
-            padHW = (fil_shape2d - 1)
+            padHW = (dil_fil_shape2d - 1)
        elif border_mode == 'valid':
            padHW = numpy.array([0, 0])
        elif border_mode == 'half':
-            padHW = numpy.floor(fil_shape2d / 2).astype('int32')
+            padHW = numpy.floor(dil_fil_shape2d / 2).astype('int32')
        elif isinstance(border_mode, tuple):
            padHW = numpy.array(border_mode)
        elif isinstance(border_mode, integer_types):
            padHW = numpy.array([border_mode, border_mode])
        else:
            raise NotImplementedError('Unsupported border_mode {}'.format(border_mode))
-        out_shape2d = numpy.floor((img_shape2d + 2 * (padHW) - fil_shape2d) / subsample2d) + 1
+        out_shape2d = numpy.floor((img_shape2d + 2 * (padHW) - dil_fil_shape2d) / subsample2d) + 1
        # avoid numpy deprecation
        out_shape2d = out_shape2d.astype('int32')
        out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape2d)
@@ -124,8 +127,8 @@ class TestCorr2D(utt.InferShapeTester):
                        for col in range(ref_output.shape[3]):
                            icol = col * subsample[1]  # image col
                            ref_output[bb, nn, row, col] += (image2d[
-                                irow:irow + N_filter_shape[2],
-                                icol:icol + N_filter_shape[3]] * filter2d[::-1, ::-1]
+                                irow:irow + dil_fil_shape2d[0]:filter_dilation[0],
+                                icol:icol + dil_fil_shape2d[1]:filter_dilation[1]] * filter2d[::-1, ::-1]
                            ).sum()

        self.assertTrue(_allclose(theano_output, ref_output))
@@ -186,6 +189,28 @@ class TestCorr2D(utt.InferShapeTester):

        self.validate((1, 1, 6, 6), (1, 1, 3, 3), 1, subsample=(3, 3))

+    def test_filter_dilation(self):
+        """
+        Tests correlation where filter dilation != (1,1), for each border mode.
+        """
+        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'valid', filter_dilation=(2, 2))
+        self.validate((3, 2, 14, 10), (5, 2, 2, 3), 'valid', filter_dilation=(3, 1))
+        self.validate((1, 1, 14, 14), (1, 1, 3, 3), 'valid', filter_dilation=(2, 3))
+
+        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'full', filter_dilation=(2, 2))
+        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'full', filter_dilation=(3, 1))
+        self.validate((1, 1, 6, 6), (1, 1, 3, 3), 'full', filter_dilation=(2, 3))
+
+        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'half', filter_dilation=(2, 2))
+        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'half', filter_dilation=(3, 1))
+        self.validate((1, 1, 6, 6), (1, 1, 3, 3), 'half', filter_dilation=(2, 3))
+
+        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (1, 1), filter_dilation=(2, 2))
+        self.validate((3, 2, 7, 5), (5, 2, 2, 3), (2, 1), filter_dilation=(2, 1))
+        self.validate((1, 1, 6, 6), (1, 1, 3, 3), (1, 2), filter_dilation=(1, 2))
+
+        self.validate((1, 1, 6, 6), (1, 1, 3, 3), 1, subsample=(3, 3), filter_dilation=(2, 2))
+
    @attr('slow')
    def test_shape_Constant_tensor(self):
        """