提交 2dcf3753 authored 作者: Nizar Assaf's avatar Nizar Assaf

Implementation of 2D dilated convolution/correlation.

上级 c0b294ec
......@@ -1393,6 +1393,9 @@ def local_abstractconv_cudnn(node):
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (node.op.filter_dilation != (1, 1)):
return None
if not isinstance(inp1.type, GpuArrayType):
return None
......
......@@ -855,15 +855,17 @@ class BaseGpuCorrMM(GpuOp):
or a pair of integers
subsample
Perform subsampling of the output (default: (1, 1)).
filter_dilation
Perform dilation of the filters, i.e. correlate with filters that have
`dilation - 1` implicit zeros inserted between their elements (default: (1, 1)).
pad
*deprecated*, now you should always use border_mode.
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample')
__props__ = ('border_mode', 'subsample', 'filter_dilation')
def __init__(self, border_mode="valid", subsample=(1, 1), pad=(0, 0)):
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), pad=(0, 0)):
if pad != (0, 0):
_logger.warning(
'do not use pad for BaseGpuCorrMM; please set padding in '
......@@ -885,7 +887,10 @@ class BaseGpuCorrMM(GpuOp):
self.border_mode = border_mode
if len(subsample) != 2:
raise ValueError("subsample must have two elements")
self.subsample = subsample
if len(filter_dilation) != 2:
raise ValueError("filter_dilation must have two elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
@property
def pad(self):
......@@ -894,10 +899,11 @@ class BaseGpuCorrMM(GpuOp):
return (0, 0)
def __str__(self):
return '%s{%s, %s}' % (
return '%s{%s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample))
str(self.subsample),
str(self.filter_dilation))
def flops(self, inp, outp):
"""
......@@ -922,7 +928,7 @@ class BaseGpuCorrMM(GpuOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 24)
return (0, 26)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -976,6 +982,7 @@ class BaseGpuCorrMM(GpuOp):
"""
dH, dW = self.subsample
dilH, dilW = self.filter_dilation
if self.border_mode == "half":
padH = padW = -1
elif self.border_mode == "full":
......@@ -1022,6 +1029,8 @@ class BaseGpuCorrMM(GpuOp):
// Optional args
int dH = %(dH)s;
int dW = %(dW)s;
int dilH = %(dilH)s;
int dilW = %(dilW)s;
int padH = %(padH)s;
int padW = %(padW)s;
......@@ -1045,39 +1054,43 @@ class BaseGpuCorrMM(GpuOp):
}
else if (padH == -2) {
// vertical full padding, we can infer the kernel height
kH = 2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kH = (2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1;
}
else {
// explicit padding, we can infer the kernel height
kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ;
}
if ((dW != 1) || (padW == -1)) {
kW = %(width)s;
}
else if (padW == -2) {
kW = 2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
kW = (2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
else {
kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
kW = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
}
// Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested
if (padH == -1) { // vertical half padding
padH = kH / 2;
padH = dil_kH / 2;
}
else if (padH == -2) { // vertical full padding
padH = kH - 1;
padH = dil_kH - 1;
}
else if (padH < 0) {
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2");
%(fail)s
}
if (padW == -1) { // horizontal half padding
padW = kW / 2;
padW = dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = kW - 1;
padW = dil_kW - 1;
}
else if (padW < 0) {
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2");
......@@ -1089,15 +1102,15 @@ class BaseGpuCorrMM(GpuOp):
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom + 2*pad - weight) / sample + 1
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - ((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - ((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width)
// height and width: weights = bottom + 2*pad - (top - 1) * sample
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = CudaNdarray_HOST_DIMS(top)[1];
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1];
out_dim[2] = kH; // already inferred further above
......@@ -1105,11 +1118,11 @@ class BaseGpuCorrMM(GpuOp):
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + weights - 2*pad
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n");
......@@ -1137,7 +1150,7 @@ class BaseGpuCorrMM(GpuOp):
}
// Call CUDA code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, padH, padW);
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW);
if (out2==NULL){
%(fail)s
}
......@@ -1168,6 +1181,10 @@ class GpuCorrMM(BaseGpuCorrMM):
`(sv, sh)` is equivalent to `GpuCorrMM(...)(...)[:,:,::sv, ::sh]`,
but faster.
Set to `(1, 1)` to disable subsampling.
filter_dilation
The dilation factors applied to the filters along each spatial dimension.
Should be a tuple with 2 elements.
Set to `(1, 1)` to disable filter dilation.
pad
Deprecated alias for `border_mode`.
......@@ -1198,8 +1215,10 @@ class GpuCorrMM(BaseGpuCorrMM):
"""
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
pad=(0, 0)):
super(GpuCorrMM, self).__init__(border_mode, subsample, pad)
super(GpuCorrMM, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, img, kern):
img = as_cuda_ndarray_variable(img)
......@@ -1223,9 +1242,13 @@ class GpuCorrMM(BaseGpuCorrMM):
bottom, weights = inp
top, = grads
top = gpu_contiguous(top)
d_bottom = GpuCorrMM_gradInputs(self.border_mode, self.subsample)(
d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation)(
weights, top, bottom.shape[-2:])
d_weights = GpuCorrMM_gradWeights(self.border_mode, self.subsample)(
d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation)(
bottom, top, weights.shape[-2:])
return d_bottom, d_weights
......@@ -1243,8 +1266,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
pad=(0, 0)):
super(GpuCorrMM_gradWeights, self).__init__(border_mode, subsample, pad)
super(GpuCorrMM_gradWeights, self).__init__(border_mode,
subsample,
filter_dilation,
pad)
def make_node(self, img, topgrad, shape=None):
img = as_cuda_ndarray_variable(img)
......@@ -1278,12 +1305,13 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
bottom, top = inp[:2]
weights, = grads
weights = gpu_contiguous(weights)
d_bottom = GpuCorrMM_gradInputs(
self.border_mode, self.subsample)(weights,
d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation)(weights,
top,
bottom.shape[-2:])
d_top = GpuCorrMM(
self.border_mode, self.subsample)(bottom, weights)
self.border_mode, self.subsample, self.filter_dilation)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
......@@ -1309,8 +1337,10 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
pad=(0, 0)):
super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample, pad)
super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, kern, topgrad, shape=None):
kern = as_cuda_ndarray_variable(kern)
......@@ -1341,11 +1371,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
weights, top = inp[:2]
bottom, = grads
bottom = gpu_contiguous(bottom)
d_weights = GpuCorrMM_gradWeights(
self.border_mode, self.subsample)(
bottom, top, weights.shape[-2:])
d_top = GpuCorrMM(
self.border_mode, self.subsample)(bottom, weights)
d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation)(bottom,
top,
weights.shape[-2:])
d_top = GpuCorrMM(self.border_mode,
self.subsample,
self.filter_dilation)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
......@@ -1871,12 +1904,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
weights, top = inp[:2]
bottom, = grads
bottom = gpu_contiguous(bottom)
d_weights = GpuCorr3dMM_gradWeights(
self.border_mode, self.subsample, self.pad)(
bottom, top, weights.shape[-3:])
d_top = GpuCorr3dMM(
self.border_mode, self.subsample, self.pad)(
bottom, weights)
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.pad)(bottom,
top,
weights.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode,
self.subsample,
self.pad)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
* 3 if len(inp) == 5 else ()
return (d_weights, d_top) + d_height_width_depth
......
......@@ -52,6 +52,39 @@ inline int GET_BLOCKS(const int N) {
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// CUDA kernel for the case of dilation
__global__ void dilated_im2col_kernel(const int n, const float* data_im,
const int height, const int width, const int kernel_h, const int kernel_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int height_col, const int width_col,
float* data_col) {
CUDA_KERNEL_LOOP(index, n) {
const int h_index = index / width_col;
const int h_col = h_index % height_col;
const int w_col = index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
float* data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const float* data_im_ptr = data_im;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
int h_im = h_offset + i * dilation_h;
int w_im = w_offset + j * dilation_w;
*data_col_ptr =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
data_col_ptr += height_col * width_col;
}
}
}
}
__global__ void im2col_kernel(const int n, const float* data_im,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
......@@ -59,22 +92,23 @@ __global__ void im2col_kernel(const int n, const float* data_im,
const int height_col, const int width_col,
float* data_col) {
CUDA_KERNEL_LOOP(index, n) {
int w_out = index % width_col;
int h_index = index / width_col;
int h_out = h_index % height_col;
int channel_in = h_index / height_col;
int channel_out = channel_in * kernel_h * kernel_w;
int h_in = h_out * stride_h - pad_h;
int w_in = w_out * stride_w - pad_w;
const int h_index = index / width_col;
const int h_col = h_index % height_col;
const int w_col = index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
float* data_col_ptr = data_col;
data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const float* data_im_ptr = data_im;
data_im_ptr += (channel_in * height + h_in) * width + w_in;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
int h = h_in + i;
int w = w_in + j;
*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
int h_im = h_offset + i ;
int w_im = w_offset + j ;
*data_col_ptr =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
data_im_ptr[i * width + j] : 0;
data_col_ptr += height_col * width_col;
}
......@@ -84,52 +118,97 @@ __global__ void im2col_kernel(const int n, const float* data_im,
void im2col(const float* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
float* data_col) {
// We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid.
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
int num_kernels = channels * height_col * width_col;
if(dilation_h != 1 || dilation_w != 1){
dilated_im2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(
num_kernels, data_im, height, width, kernel_h, kernel_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col);
}
else{
im2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(
num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
pad_w, stride_h, stride_w, height_col,
num_kernels, data_im, height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col);
}
}
__global__ void col2im_kernel(const int n, const float* data_col,
// CUDA kernel for the case of dilation
__global__ void dilated_col2im_kernel(const int n, const float* data_col,
const int height, const int width, const int channels,
const int patch_h, const int patch_w,
const int kernel_h, const int kernel_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int height_col, const int width_col,
float* data_im) {
CUDA_KERNEL_LOOP(index, n) {
float val = 0;
int w = index % width + pad_w;
int h = (index / width) % height + pad_h;
int c = index / (width * height);
const int w_im = index % width + pad_w;
const int h_im = (index / width) % height + pad_h;
const int c_im = index / (width * height);
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
// compute the start and end of the output
int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, height_col);
/*
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// the col location: [c * width * height + h_out, w_out]
int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * ksize
+ (w - w_col * stride_w);
val += data_col[(c_col * height_col + h_col) * width_col + w_col];
const int w_col_start =
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start =
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
height_col + h_col) * width_col + w_col;
val += data_col[data_col_index];
}
}
*/
// equivalent implementation
}
data_im[index] = val;
}
}
__global__ void col2im_kernel(const int n, const float* data_col,
const int height, const int width, const int channels,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int height_col, const int width_col,
float* data_im) {
CUDA_KERNEL_LOOP(index, n) {
float val = 0;
const int w_im = index % width + pad_w;
const int h_im = (index / width) % height + pad_h;
const int c_im = index / (width * height);
// compute the start and end of the output
const int w_col_start =
(w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start =
(h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// equivalent implementation, no dilation
int offset =
(c * patch_h * patch_w + h * patch_w + w) * height_col * width_col;
int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col;
(c_im * kernel_h * kernel_w + h_im * kernel_w + w_im) * height_col * width_col;
int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
int coeff_w_col = (1 - stride_w * height_col * width_col);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
......@@ -142,18 +221,30 @@ __global__ void col2im_kernel(const int n, const float* data_col,
void col2im(const float* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, float* data_im) {
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
int dil_patch_h = (patch_h - 1) * dilation_h + 1;
int dil_patch_w = (patch_w - 1) * dilation_w + 1;
int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
int num_kernels = channels * height * width;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
if(dilation_h != 1 || dilation_w != 1){
dilated_col2im_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(
num_kernels, data_col, height, width, channels, patch_h, patch_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im);
}
else{
col2im_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(
num_kernels, data_col, height, width, channels, patch_h, patch_w,
pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im);
}
}
......@@ -167,6 +258,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
const int direction,
const int dH = 1,
const int dW = 1,
const int dilH = 1,
const int dilW = 1,
const int padH = 0,
const int padW = 0)
{
......@@ -236,9 +329,12 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
"GpuCorrMM images and kernel must have the same stack size\n");
return NULL;
}
// implicit dilated filter
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth)
const int topHeight = (bottomHeight + 2*padH - kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - kW) / dW + 1;
const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
......@@ -286,7 +382,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, col->devdata);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
......@@ -353,7 +450,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, col->devdata);
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, col->devdata);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
......@@ -438,7 +536,8 @@ CudaNdarray* corrMM(CudaNdarray *const bottom,
}
// col2im back to the data
col2im(col->devdata, nChannels, bottomHeight, bottomWidth,
kH, kW, padH, padW, dH, dW, bottom->devdata + n * bottom_stride);
kH, kW, dilH, dilW, padH, padW,
dH, dW, bottom->devdata + n * bottom_stride);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
......
......@@ -2612,6 +2612,8 @@ def local_abstractconv_cudnn(node):
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))):
return None
if (node.op.filter_dilation != (1, 1)):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
......@@ -2619,6 +2621,7 @@ def local_abstractconv_cudnn(node):
if (not isinstance(inp1.type, CudaNdarrayType) or
not isinstance(inp2.type, CudaNdarrayType)):
return None
if not dnn_available():
return None
......
......@@ -1622,7 +1622,8 @@ def local_conv_gemm(node):
# because we are not allowed to replace a CudaNdarray with
# a DimShuffle instance in a graph optimization)
rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
GpuCorrMM_gradWeights(border_mode, subsample)(
GpuCorrMM_gradWeights(border_mode,
subsample)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
).dimshuffle(1, 0, 2, 3))
......@@ -2769,20 +2770,25 @@ def local_abstractconv_gemm(node):
border_mode = node.op.border_mode
subsample = node.op.subsample
if (border_mode == 'full') and (subsample == (1, 1)):
filter_dilation = node.op.filter_dilation
if ((border_mode == 'full') and (subsample == (1, 1))):
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3)
# call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid', subsample)(
rval = GpuCorrMM_gradInputs('valid',
subsample,
filter_dilation)(
gpu_contiguous(kern), gpu_contiguous(img))
else:
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
# By default use GpuCorrMM
rval = GpuCorrMM(border_mode, subsample)(gpu_contiguous(img),
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation)(gpu_contiguous(img),
gpu_contiguous(kern))
# call GpuCorrMM_gradWeights if good
......@@ -2790,7 +2796,7 @@ def local_abstractconv_gemm(node):
# is larger than inputChannels * outputHeight * outputWidth.
# GpuConv does not always store information on the batchsize and
# channels, though, so we only use what information we have.)
if ((subsample == (1, 1)) and
if ((subsample == (1, 1)) and (filter_dilation == (1, 1)) and
(node.op.imshp is not None) and
(None not in node.op.imshp[-2:]) and
(node.op.kshp is not None) and
......@@ -2810,7 +2816,9 @@ def local_abstractconv_gemm(node):
# because we are not allowed to replace a CudaNdarray with
# a DimShuffle instance in a graph optimization)
rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
GpuCorrMM_gradWeights(border_mode, subsample)(
GpuCorrMM_gradWeights(border_mode,
subsample,
filter_dilation)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
).dimshuffle(1, 0, 2, 3))
......@@ -2827,7 +2835,8 @@ def local_abstractconv_gradweight_gemm(node):
return None
rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample)(
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape)
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1]
......@@ -2849,7 +2858,8 @@ def local_abstractconv_gradinputs_gemm(node):
kern = kern[:, :, ::-1, ::-1]
rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample)(
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval]
......@@ -2870,10 +2880,12 @@ conv_groupopt.register('local_abstractconv_dnn',
conv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradweight_gemm',
local_abstractconv_gradweight_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradinputs_gemm',
local_abstractconv_gradinputs_gemm, 30,
'conv_gemm',
......
......@@ -29,25 +29,30 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
self.provide_shape = [False]
self.shared = gpu_shared
def tcase(self, i, f, s, b, flip, provide_shape):
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
if fd != (1, 1):
raise SkipTest("No dilation implementation for cuDNN ConvOp.")
if not dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
mode = mode_with_gpu
o = self.get_output_shape(i, f, s, b)
o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConv)
filter_flip=flip, target_op=GpuDnnConv,
filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradW)
filter_flip=flip, target_op=GpuDnnConvGradW,
filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=GpuDnnConvGradI)
filter_flip=flip, target_op=GpuDnnConvGradI,
filter_dilation=fd)
class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
......@@ -56,28 +61,30 @@ class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
self.shared = gpu_shared
self.mode = mode_with_gpu.excluding('cudnn')
def tcase(self, i, f, s, b, flip, provide_shape):
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
mode = self.mode
o = self.get_output_shape(i, f, s, b)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode,
o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=(GpuCorrMM,
filter_flip=flip, target_op=(GpuCorrMM,
GpuCorrMM_gradWeights,
GpuCorrMM_gradInputs))
GpuCorrMM_gradInputs),
filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorrMM_gradWeights)
target_op=GpuCorrMM_gradWeights,
filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=GpuCorrMM_gradInputs)
target_op=GpuCorrMM_gradInputs,
filter_dilation=fd)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
......
......@@ -35,7 +35,7 @@ from .abstract_conv import conv2d as abstract_conv2d
def conv2d(input, filters, input_shape=None, filter_shape=None,
border_mode='valid', subsample=(1, 1), filter_flip=True,
image_shape=None, **kwargs):
image_shape=None, filter_dilation=(1, 1), **kwargs):
"""
This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled
......@@ -95,6 +95,10 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
image_shape: None, tuple/list of len 4 of int or Constant variable
Deprecated alias for input_shape.
filter_dilation: tuple of len 2
Factor by which to dilate the filters: `dilation - 1` implicit
zeros are inserted between adjacent filter elements.
Set to (1, 1) for a standard (non-dilated) convolution.
kwargs: Any other keyword arguments are accepted for backwards
compatibility, but will be ignored.
......@@ -140,4 +144,5 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
" be provided at the same time.")
return abstract_conv2d(input, filters, input_shape, filter_shape,
border_mode, subsample, filter_flip)
border_mode, subsample, filter_flip,
filter_dilation)
......@@ -32,7 +32,8 @@ _logger = logging.getLogger("theano.tensor.nnet.abstract_conv")
def get_conv_output_shape(image_shape, kernel_shape,
border_mode, subsample):
border_mode, subsample,
filter_dilation=(1, 1)):
"""
This function compute the output shape of convolution operation.
......@@ -53,6 +54,8 @@ def get_conv_output_shape(image_shape, kernel_shape,
subsample: tuple of int (symbolic or numeric). Its two or three elements
respectively correspond to the subsampling on the height and width (and
possibly depth) axes.
filter_dilation: tuple of int (symbolic or numeric). Its two elements
correspond respectively to the dilation on height and width axis.
Returns
-------
......@@ -65,17 +68,19 @@ def get_conv_output_shape(image_shape, kernel_shape,
nkern, kshp = kernel_shape[0], kernel_shape[2:]
if isinstance(border_mode, tuple):
out_shp = tuple(get_conv_shape_1axis(
imshp[i], kshp[i], border_mode[i], subsample[i])
for i in range(len(subsample)))
imshp[i], kshp[i], border_mode[i],
subsample[i], filter_dilation[i]) for i in range(len(subsample)))
else:
out_shp = tuple(get_conv_shape_1axis(
imshp[i], kshp[i], border_mode, subsample[i])
for i in range(len(subsample)))
imshp[i], kshp[i], border_mode,
subsample[i], filter_dilation[i]) for i in range(len(subsample)))
return (bsize, nkern) + out_shp
def get_conv_shape_1axis(image_shape, kernel_shape,
border_mode, subsample):
# filter dilation set by default to 1
# for compatibility with other tests.
def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
subsample, dilation=1):
"""
This function compute the output shape of convolution operation.
......@@ -90,6 +95,8 @@ def get_conv_shape_1axis(image_shape, kernel_shape,
the padding on the considered axis.
subsample: int. It must correspond to the subsampling on the
considered axis.
dilation: int. It must correspond to the dilation on the
considered axis.
Returns
-------
......@@ -97,19 +104,22 @@ def get_conv_shape_1axis(image_shape, kernel_shape,
considered axis. None if undefined.
"""
if None in [image_shape, kernel_shape, border_mode, subsample]:
if None in [image_shape, kernel_shape, border_mode,
subsample, dilation]:
return None
# Implicit dilated kernel shape
dil_kernel_shape = (kernel_shape - 1) * dilation + 1
if border_mode == "half":
pad = kernel_shape // 2
pad = dil_kernel_shape // 2
elif border_mode == "full":
pad = kernel_shape - 1
pad = dil_kernel_shape - 1
elif border_mode == "valid":
pad = 0
else:
pad = border_mode
if pad < 0:
raise ValueError("border_mode must be >= 0")
out_shp = (image_shape + 2 * pad - kernel_shape) // subsample + 1
out_shp = (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
return out_shp
......@@ -120,7 +130,8 @@ def conv2d(input,
filter_shape=None,
border_mode='valid',
subsample=(1, 1),
filter_flip=True):
filter_flip=True,
filter_dilation=(1, 1)):
"""This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled
after Convolutional Neural Networks (CNN).
......@@ -134,7 +145,8 @@ def conv2d(input,
kshp=filter_shape,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip)
filter_flip=filter_flip,
filter_dilation=filter_dilation)
return conv_op(input, filters)
......@@ -144,7 +156,8 @@ def conv2d_grad_wrt_inputs(output_grad,
filter_shape=None,
border_mode='valid',
subsample=(1, 1),
filter_flip=True):
filter_flip=True,
filter_dilation=(1, 1)):
"""Compute conv output gradient w.r.t its inputs
This function builds the symbolic graph for getting the
......@@ -214,6 +227,9 @@ def conv2d_grad_wrt_inputs(output_grad,
referred to as a convolution, and this is the default. If
``False``, the filters are not flipped and the operation is
referred to as a cross-correlation.
filter_dilation : tuple of len 2
The filter dilation used in the forward pass.
Also known as input striding.
Returns
-------
......@@ -263,7 +279,8 @@ def conv2d_grad_wrt_inputs(output_grad,
kshp=filter_shape,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip)
filter_flip=filter_flip,
filter_dilation=filter_dilation)
return grad_input_op(filters, output_grad, input_shape[-2:])
......@@ -274,7 +291,8 @@ def conv2d_grad_wrt_weights(input,
input_shape=None,
border_mode='valid',
subsample=(1, 1),
filter_flip=True):
filter_flip=True,
filter_dilation=(1, 1)):
"""Compute conv output gradient w.r.t its weights
This function will build the symbolic graph for getting the
......@@ -327,7 +345,6 @@ def conv2d_grad_wrt_weights(input,
``(int1, int2)``
pad input with a symmetric border of ``int1`` rows and
``int2`` columns, then perform a valid convolution.
subsample : tuple of len 2
The subsampling used in the forward pass of the convolutional
operation. Also called strides elsewhere.
......@@ -337,6 +354,9 @@ def conv2d_grad_wrt_weights(input,
referred to as a convolution, and this is the default. If
``False``, the filters are not flipped and the operation is
referred to as a cross-correlation.
filter_dilation : tuple of len 2
The filter dilation used in the forward pass.
Also known as input striding.
Returns
-------
......@@ -386,7 +406,8 @@ def conv2d_grad_wrt_weights(input,
kshp=numerical_filter_shape,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip)
filter_flip=filter_flip,
filter_dilation=filter_dilation)
return gradWeight_op(input, output_grad, filter_shape[:-2])
......@@ -541,7 +562,8 @@ def bilinear_upsampling(input,
filter_shape=(1, 1, None, 1),
border_mode=(pad, 0),
subsample=(ratio, 1),
filter_flip=True)
filter_flip=True,
filter_dilation=(1, 1))
# upsampling cols
upsampled_mat = conv2d_grad_wrt_inputs(output_grad=upsampled_row,
filters=kern[np.newaxis,
......@@ -553,7 +575,8 @@ def bilinear_upsampling(input,
filter_shape=(1, 1, 1, None),
border_mode=(0, pad),
subsample=(1, ratio),
filter_flip=True)
filter_flip=True,
filter_dilation=(1, 1))
else:
kern = bilinear_kernel_2D(ratio=ratio, normalize=True)
upsampled_mat = conv2d_grad_wrt_inputs(output_grad=concat_mat,
......@@ -565,7 +588,8 @@ def bilinear_upsampling(input,
filter_shape=(1, 1, None, None),
border_mode=(pad, pad),
subsample=(ratio, ratio),
filter_flip=True)
filter_flip=True,
filter_dilation=(1, 1))
return upsampled_mat.reshape((input.shape[0], input.shape[1],
row * ratio, col * ratio))
......@@ -620,14 +644,18 @@ class BaseAbstractConv2d(Op):
are not flipped and the operation is referred to as a
cross-correlation.
filter_dilation: tuple of len 2
    Factor by which to dilate the filters, i.e. the spacing inserted
    between filter elements. Also called the dilation factor.
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_flip', 'imshp', 'kshp')
__props__ = ('border_mode', 'subsample', 'filter_flip',
'imshp', 'kshp', 'filter_dilation')
def __init__(self,
imshp=None, kshp=None,
border_mode="valid", subsample=(1, 1),
filter_flip=True):
imshp=None, kshp=None, border_mode="valid",
subsample=(1, 1), filter_flip=True,
filter_dilation=(1, 1)):
if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode)
......@@ -673,6 +701,9 @@ class BaseAbstractConv2d(Op):
if len(subsample) != 2:
raise ValueError("subsample must have two elements")
self.subsample = tuple(subsample)
if len(filter_dilation) != 2:
raise ValueError("filter_dilation must have two elements")
self.filter_dilation = tuple(filter_dilation)
def flops(self, inp, outp):
""" Useful with the hack in profilemode to print the MFlops"""
......@@ -694,7 +725,7 @@ class BaseAbstractConv2d(Op):
# This may change in the future.
return False
def conv2d(self, img, kern, mode="valid"):
def conv2d(self, img, kern, mode="valid", dilation=(1, 1)):
"""
Basic slow python implementatation for DebugMode
"""
......@@ -708,8 +739,16 @@ class BaseAbstractConv2d(Op):
'invalid mode {}, which must be either '
'"valid" or "full"'.format(mode))
out_shape = get_conv_output_shape(img.shape, kern.shape, mode, [1, 1])
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, [1, 1], dilation)
out = numpy.zeros(out_shape, dtype=img.dtype)
dil_kern_shp = kern.shape[:-2] + ((kern.shape[-2] - 1) * dilation[0] + 1,
(kern.shape[-1] - 1) * dilation[1] + 1)
dilated_kern = numpy.zeros(dil_kern_shp, dtype=kern.dtype)
dilated_kern[:, :,
::dilation[0],
::dilation[1]] = kern
val = _valfrommode(mode)
bval = _bvalfromboundary('fill')
......@@ -720,7 +759,7 @@ class BaseAbstractConv2d(Op):
for im0 in xrange(img.shape[1]):
# some cast generates a warning here
out[b, n, ...] += _convolve2d(img[b, im0, ...],
kern[n, im0, ...],
dilated_kern[n, im0, ...],
1, val, bval, 0)
return out
......@@ -736,10 +775,11 @@ class AbstractConv2d(BaseAbstractConv2d):
kshp=None,
border_mode="valid",
subsample=(1, 1),
filter_flip=True):
super(AbstractConv2d, self).__init__(imshp, kshp,
border_mode, subsample,
filter_flip)
filter_flip=True,
filter_dilation=(1, 1)):
super(AbstractConv2d, self).__init__(imshp, kshp, border_mode,
subsample, filter_flip,
filter_dilation)
def make_node(self, img, kern):
# Make sure both inputs are Variables with the same Type
......@@ -766,6 +806,8 @@ class AbstractConv2d(BaseAbstractConv2d):
img, kern = inp
img = numpy.asarray(img)
kern = numpy.asarray(kern)
dil_kernshp = ((kern.shape[2] - 1) * self.filter_dilation[0] + 1,
(kern.shape[3] - 1) * self.filter_dilation[1] + 1)
o, = out_
mode = self.border_mode
......@@ -777,9 +819,9 @@ class AbstractConv2d(BaseAbstractConv2d):
' integers'.format(mode))
if mode == "full":
mode = (kern.shape[2] - 1, kern.shape[3] - 1)
mode = (dil_kernshp[0] - 1, dil_kernshp[1] - 1)
elif mode == "half":
mode = (kern.shape[2] // 2, kern.shape[3] // 2)
mode = (dil_kernshp[0] // 2, dil_kernshp[1] // 2)
if isinstance(mode, tuple):
pad_h, pad_w = map(int, mode)
mode = "valid"
......@@ -790,7 +832,7 @@ class AbstractConv2d(BaseAbstractConv2d):
img = new_img
if not self.filter_flip:
kern = kern[:, :, ::-1, ::-1]
conv_out = self.conv2d(img, kern, mode="valid")
conv_out = self.conv2d(img, kern, mode="valid", dilation=self.filter_dilation)
conv_out = conv_out[:, :, ::self.subsample[0], ::self.subsample[1]]
o[0] = node.outputs[0].type.filter(conv_out)
......@@ -812,12 +854,14 @@ class AbstractConv2d(BaseAbstractConv2d):
d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip)(
self.filter_flip,
self.filter_dilation)(
weights, top, bottom.shape[-2:])
d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip)(
self.filter_flip,
self.filter_dilation)(
bottom, top, weights.shape[-2:])
......@@ -844,7 +888,7 @@ class AbstractConv2d(BaseAbstractConv2d):
kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
for i in range(4)]
res = get_conv_output_shape(imshp, kshp, self.border_mode,
self.subsample)
self.subsample, self.filter_dilation)
return [res]
......@@ -863,11 +907,13 @@ class AbstractConv2d_gradWeights(BaseAbstractConv2d):
kshp=None,
border_mode="valid",
subsample=(1, 1),
filter_flip=True):
filter_flip=True,
filter_dilation=(1, 1)):
super(AbstractConv2d_gradWeights, self).__init__(imshp, kshp,
border_mode,
subsample,
filter_flip)
filter_flip,
filter_dilation)
# Update shape/height_width
def make_node(self, img, topgrad, shape):
......@@ -943,15 +989,16 @@ class AbstractConv2d_gradWeights(BaseAbstractConv2d):
d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip)(
weights,
self.filter_flip,
self.filter_dilation)(weights,
top,
bottom.shape[-2:])
d_top = AbstractConv2d(self.imshp,
self.kshp,
self.border_mode,
self.subsample,
self.filter_flip)(bottom, weights)
self.filter_flip,
self.filter_dilation)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable.
......@@ -998,11 +1045,13 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
kshp=None,
border_mode="valid",
subsample=(1, 1),
filter_flip=True):
filter_flip=True,
filter_dilation=(1, 1)):
super(AbstractConv2d_gradInputs, self).__init__(imshp, kshp,
border_mode,
subsample,
filter_flip)
filter_flip,
filter_dilation)
# Update shape/height_width
def make_node(self, kern, topgrad, shape):
......@@ -1070,12 +1119,15 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
bottom, = grads
d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
self.border_mode,
self.subsample)(
bottom, top,
self.subsample,
self.filter_flip,
self.filter_dilation)(bottom, top,
weights.shape[-2:])
d_top = AbstractConv2d(self.imshp, self.kshp,
self.border_mode, self.subsample)(
bottom, weights)
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable.
......
......@@ -27,12 +27,14 @@ class BaseCorrMM(gof.Op):
or a pair of integers
subsample
Perform subsampling of the output (default: (1, 1)).
filter_dilation
    Perform dilated correlation (default: (1, 1)).
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample')
__props__ = ('border_mode', 'subsample', 'filter_dilation')
def __init__(self, border_mode="valid", subsample=(1, 1)):
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1)):
if isinstance(border_mode, integer_types):
if border_mode < 0:
raise ValueError(
......@@ -55,7 +57,10 @@ class BaseCorrMM(gof.Op):
self.border_mode = border_mode
if len(subsample) != 2:
raise ValueError("subsample must have two elements")
if len(filter_dilation) != 2:
raise ValueError("filter_dilation must have two elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
@property
def pad(self):
......@@ -64,10 +69,11 @@ class BaseCorrMM(gof.Op):
return (0, 0)
def __str__(self):
return '%s{%s, %s}' % (
return '%s{%s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample))
str(self.subsample),
str(self.filter_dilation))
def c_support_code(self):
return blas_header_text()
......@@ -89,7 +95,7 @@ class BaseCorrMM(gof.Op):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (1, 1)
return (1, 2)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -155,6 +161,7 @@ class BaseCorrMM(gof.Op):
if not theano.config.blas.ldflags:
raise NotImplementedError("C code for CorrMM* classes need a blas library.")
dH, dW = self.subsample
dilH, dilW = self.filter_dilation
if self.border_mode == "half":
padH = padW = -1
elif self.border_mode == "full":
......@@ -201,6 +208,8 @@ class BaseCorrMM(gof.Op):
// Optional args
int dH = %(dH)s;
int dW = %(dW)s;
int dilH = %(dilH)s;
int dilW = %(dilW)s;
int padH = %(padH)s;
int padW = %(padW)s;
......@@ -224,39 +233,43 @@ class BaseCorrMM(gof.Op):
}
else if (padH == -2) {
// vertical full padding, we can infer the kernel height
kH = 2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH;
kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1;
}
else {
// explicit padding, we can infer the kernel height
kH = PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH;
kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
}
if ((dW != 1) || (padW == -1)) {
kW = %(width)s;
}
else if (padW == -2) {
kW = 2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW;
kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
else {
kW = PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW;
kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
}
// Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1;
// Auto-padding if requested
if (padH == -1) { // vertical half padding
padH = kH / 2;
padH = dil_kH / 2;
}
else if (padH == -2) { // vertical full padding
padH = kH - 1;
padH = dil_kH - 1;
}
else if (padH < 0) {
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padH must be >= -2");
%(fail)s
}
if (padW == -1) { // horizontal half padding
padW = kW / 2;
padW = dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = kW - 1;
padW = dil_kW - 1;
}
else if (padW < 0) {
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padW must be >= -2");
......@@ -268,15 +281,15 @@ class BaseCorrMM(gof.Op):
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom + 2*pad - weight) / sample + 1
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - PyArray_DIMS(weights)[2]) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - PyArray_DIMS(weights)[3]) / dW + 1);
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width)
// height and width: weights = bottom + 2*pad - (top - 1) * sample
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1];
out_dim[2] = (npy_intp)kH; // already inferred further above
......@@ -284,11 +297,11 @@ class BaseCorrMM(gof.Op):
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + weights - 2*pad
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + PyArray_DIMS(weights)[2] - 2*padH);
out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + PyArray_DIMS(weights)[3] - 2*padW);
out_dim[2] = (npy_intp)((dH != 1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((dW != 1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n");
......@@ -326,7 +339,7 @@ class BaseCorrMM(gof.Op):
}
// Call corrMM code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, padH, padW);
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW);
if (out2==NULL){
%(fail)s
}
......@@ -357,10 +370,15 @@ class CorrMM(BaseCorrMM):
`(sv, sh)` is equivalent to `CorrMM(...)(...)[:,:,::sv, ::sh]`,
but faster.
Set to `(1, 1)` to disable subsampling.
filter_dilation
    The dilation factor applied to each filter, not to the input images.
    Should be a tuple with 2 elements.
    Set to `(1, 1)` to disable filter dilation.
"""
def __init__(self, border_mode="valid", subsample=(1, 1)):
super(CorrMM, self).__init__(border_mode, subsample)
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1)):
super(CorrMM, self).__init__(border_mode, subsample, filter_dilation)
def make_node(self, img, kern):
img = as_tensor_variable(img)
......@@ -382,7 +400,8 @@ class CorrMM(BaseCorrMM):
imshp,
kshp,
self.border_mode,
self.subsample)
self.subsample,
self.filter_dilation)
return [res]
def c_code(self, node, nodename, inp, out_, sub):
......@@ -395,10 +414,12 @@ class CorrMM(BaseCorrMM):
bottom, weights = inp
top, = grads
d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample)(weights, top,
self.subsample,
self.filter_dilation)(weights, top,
bottom.shape[-2:])
d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample)(bottom, top,
self.subsample,
self.filter_dilation)(bottom, top,
weights.shape[-2:])
return d_bottom, d_weights
......@@ -415,8 +436,11 @@ class CorrMM_gradWeights(BaseCorrMM):
"""
def __init__(self, border_mode="valid", subsample=(1, 1)):
super(CorrMM_gradWeights, self).__init__(border_mode, subsample)
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1)):
super(CorrMM_gradWeights, self).__init__(border_mode,
subsample,
filter_dilation)
def make_node(self, img, topgrad, shape=None):
img = as_tensor_variable(img)
......@@ -485,10 +509,12 @@ class CorrMM_gradWeights(BaseCorrMM):
bottom, top = inp[:2]
weights, = grads
d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample)(weights, top,
self.subsample,
self.filter_dilation)(weights, top,
bottom.shape[-2:])
d_top = CorrMM(self.border_mode,
self.subsample)(bottom, weights)
self.subsample,
self.filter_dilation)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) * 2
if len(inp) == 4 else ())
return (d_bottom, d_top) + d_height_width
......@@ -512,8 +538,10 @@ class CorrMM_gradInputs(BaseCorrMM):
"""
def __init__(self, border_mode="valid", subsample=(1, 1)):
super(CorrMM_gradInputs, self).__init__(border_mode, subsample)
def __init__(self, border_mode="valid", subsample=(1, 1), filter_dilation=(1, 1)):
super(CorrMM_gradInputs, self).__init__(border_mode,
subsample,
filter_dilation)
def make_node(self, kern, topgrad, shape=None):
kern = as_tensor_variable(kern)
......@@ -586,11 +614,13 @@ class CorrMM_gradInputs(BaseCorrMM):
weights, top = inp[:2]
bottom, = grads
d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample)(bottom,
self.subsample,
self.filter_dilation)(bottom,
top,
weights.shape[-2:])
d_top = CorrMM(self.border_mode,
self.subsample)(bottom, weights)
self.subsample,
self.filter_dilation)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) *
2 if len(inp) == 4 else ())
return (d_weights, d_top) + d_height_width
......
......@@ -31,20 +31,24 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Loops for fast unfold + copy
void im2col(const %(float_type)s* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
%(float_type)s* data_col) {
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
// Implicit dilated kernel size
int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
for (int c = 0; c < channels_col; ++c) {
int w_offset = c %% kernel_w;
int h_offset = (c / kernel_w) %% kernel_h;
int c_im = c / kernel_h / kernel_w;
for (int h = 0; h < height_col; ++h) {
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
for (int w = 0; w < width_col; ++w) {
int h_pad = h * stride_h - pad_h + h_offset;
int w_pad = w * stride_w - pad_w + w_offset;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_col[(npy_intp)(c * height_col + h) * width_col + w] =
data_im[(npy_intp)(c_im * height + h_pad) * width + w_pad];
......@@ -60,10 +64,14 @@ void im2col(const %(float_type)s* data_im, const int channels,
// accumulated into data_im.
void col2im(const %(float_type)s* data_col, const int channels,
const int height, const int width, const int patch_h, const int patch_w,
const int dilation_h, const int dilation_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, %(float_type)s* data_im) {
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
// Implicit dilated patch
int dil_patch_h = (patch_h - 1) * dilation_h + 1;
int dil_patch_w = (patch_w - 1) * dilation_w + 1;
int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
int num_kernels = channels * height * width;
int channels_col = channels * patch_h * patch_w;
for (int c = 0; c < channels_col; ++c) {
......@@ -71,9 +79,9 @@ void col2im(const %(float_type)s* data_col, const int channels,
int h_offset = (c / patch_w) %% patch_h;
int c_im = c / patch_h / patch_w;
for (int h = 0; h < height_col; ++h) {
int h_pad = h * stride_h - pad_h + h_offset * dilation_h;
for (int w = 0; w < width_col; ++w) {
int h_pad = h * stride_h - pad_h + h_offset;
int w_pad = w * stride_w - pad_w + w_offset;
int w_pad = w * stride_w - pad_w + w_offset * dilation_w;
if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
data_im[(npy_intp)(c_im * height + h_pad) * width + w_pad] +=
data_col[(npy_intp)(c * height_col + h) * width_col + w];
......@@ -96,6 +104,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int direction,
const int dH = 1,
const int dW = 1,
const int dilH = 1,
const int dilW = 1,
const int padH = 0,
const int padW = 0)
{
......@@ -151,9 +161,12 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
"CorrMM images and kernel must have the same stack size\n");
return NULL;
}
// implicit dilated filter
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
// top: (batchSize, nFilters, topHeight, topWidth)
const int topHeight = (bottomHeight + 2*padH - kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - kW) / dW + 1;
const int topHeight = (bottomHeight + 2*padH - dil_kH) / dH + 1;
const int topWidth = (bottomWidth + 2*padW - dil_kW) / dW + 1;
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
......@@ -206,7 +219,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
// Second, gemm
%(gemm)s(&NTrans, &NTrans,
&N_, &M_, &K_,
......@@ -255,7 +269,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
for (int n = 0; n < batchSize; n++) {
// First, im2col
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight,
bottomWidth, kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
bottomWidth, kH, kW, dilH, dilW,
padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(col));
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
......@@ -314,7 +329,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
(%(float_type)s*)PyArray_DATA(col), &N_);
// col2im back to the data
col2im((%(float_type)s*)PyArray_DATA(col), nChannels, bottomHeight, bottomWidth,
kH, kW, padH, padW, dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride);
kH, kW, dilH, dilW, padH, padW,
dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride);
}
/*
// Original caffe code for comparison
......
......@@ -79,7 +79,8 @@ def local_abstractconv_gemm(node):
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
rval = CorrMM(border_mode=node.op.border_mode,
subsample=node.op.subsample)(img, kern)
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(img, kern)
copy_stack_trace(node.outputs[0], rval)
return [rval]
......@@ -97,7 +98,8 @@ def local_abstractconv_gradweight_gemm(node):
return None
rval = CorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample)(img, topgrad, shape)
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(img, topgrad, shape)
copy_stack_trace(node.outputs[0], rval)
# need to flip the kernel if necessary
......@@ -124,7 +126,8 @@ def local_abstractconv_gradinputs_gemm(node):
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
rval = CorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample)(kern, topgrad,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation)(kern, topgrad,
shape)
copy_stack_trace(node.outputs[0], rval)
......@@ -221,7 +224,9 @@ def local_conv2d_gradweight_cpu(node):
assert len(op_imshp) == 4 and len(op_kshp) == 4
outshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, node.op.subsample)[2:]
node.op.border_mode,
node.op.subsample,
node.op.filter_dilation)[2:]
fulloutshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, (1, 1))[2:]
......@@ -334,7 +339,9 @@ def local_conv2d_gradinputs_cpu(node):
filters = filters[:, :, ::-1, ::-1]
outshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, node.op.subsample)[2:]
node.op.border_mode,
node.op.subsample,
node.op.filter_dilation)[2:]
fulloutshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, (1, 1))[2:]
......
......@@ -4,7 +4,6 @@ import numpy
import numpy as np
from nose.plugins.skip import SkipTest
from nose.tools import assert_raises
import theano
from theano import tensor
from theano.gof.opt import check_stack_trace
......@@ -24,46 +23,57 @@ from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
def conv_corr(inputs, filters, border_mode="valid", subsample=(1, 1),
conv_mode='conv'):
def conv_corr(inputs, filters, border_mode="valid",
subsample=(1, 1), conv_mode='conv',
filter_dilation=(1, 1)):
if conv_mode == 'conv':
filters = filters[:, :, ::-1, ::-1]
return corr.CorrMM(border_mode, subsample)(inputs, filters)
return corr.CorrMM(border_mode,
subsample,
filter_dilation)(inputs, filters)
def conv_corr_gw(inputs, topgrad, filters_shape, border_mode="valid",
subsample=(1, 1), conv_mode='conv'):
rval = corr.CorrMM_gradWeights(border_mode, subsample)(inputs, topgrad,
def conv_corr_gw(inputs, topgrad, filters_shape,
border_mode="valid", subsample=(1, 1),
conv_mode='conv', filter_dilation=(1, 1)):
rval = corr.CorrMM_gradWeights(border_mode,
subsample,
filter_dilation)(inputs, topgrad,
filters_shape[2:])
if conv_mode == 'conv':
rval = rval[:, :, ::-1, ::-1]
return rval
def conv_corr_gi(filters, topgrad, inputs_shape, border_mode="valid",
subsample=(1, 1), conv_mode='conv'):
def conv_corr_gi(filters, topgrad, inputs_shape,
border_mode="valid", subsample=(1, 1),
conv_mode='conv', filter_dilation=(1, 1)):
if conv_mode == 'conv':
filters = filters[:, :, ::-1, ::-1]
return corr.CorrMM_gradInputs(border_mode, subsample)(filters, topgrad,
return corr.CorrMM_gradInputs(border_mode,
subsample,
filter_dilation)(filters,
topgrad,
inputs_shape[2:])
class TestGetConvOutShape(unittest.TestCase):
def test_basic(self):
image_shape, kernel_shape = (3, 2, 8, 9), (4, 2, 5, 6)
image_shape, kernel_shape = (3, 2, 12, 9), (4, 2, 5, 6)
sub_sample = (1, 2)
filter_dilation = (2, 1)
test1_params = get_conv_output_shape(
image_shape, kernel_shape, 'valid', sub_sample)
image_shape, kernel_shape, 'valid', sub_sample, filter_dilation)
test2_params = get_conv_output_shape(
image_shape, kernel_shape, 'half', sub_sample)
image_shape, kernel_shape, 'half', sub_sample, filter_dilation)
test3_params = get_conv_output_shape(
image_shape, kernel_shape, 'full', sub_sample)
image_shape, kernel_shape, 'full', sub_sample, filter_dilation)
test4_params = get_conv_output_shape(
image_shape, kernel_shape, (1, 2), sub_sample)
image_shape, kernel_shape, (1, 2), sub_sample, filter_dilation)
self.assertTrue(test1_params == (3, 4, 4, 2))
self.assertTrue(test2_params == (3, 4, 8, 5))
self.assertTrue(test3_params == (3, 4, 12, 7))
self.assertTrue(test2_params == (3, 4, 12, 5))
self.assertTrue(test3_params == (3, 4, 20, 7))
self.assertTrue(test4_params == (3, 4, 6, 4))
......@@ -71,35 +81,41 @@ class BaseTestConv2d(unittest.TestCase):
def setUp(self):
if theano.config.blas.ldflags == '':
raise SkipTest("BLAS required for reference")
self.inputs_shapes = [(8, 1, 12, 12), (8, 1, 18, 18), (2, 1, 4, 4),
self.inputs_shapes = [(8, 1, 6, 6), (8, 1, 8, 8), (2, 1, 7, 7),
(6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
self.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
(1, 1, 2, 5), (4, 1, 2, 2), (4, 5, 2, 2)]
(1, 1, 2, 3), (4, 1, 1, 3), (4, 5, 3, 2)]
self.subsamples = [(1, 1), (2, 2), (2, 4)]
self.filters_dilations = [(1, 1), (1, 2), (2, 1)]
self.border_modes = ["valid", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
self.filter_flip = [True, False]
self.provide_shape = [True, False]
self.shared = theano.compile.shared
def get_output_shape(self, inputs_shape, filters_shape, subsample,
border_mode):
def get_output_shape(self, inputs_shape, filters_shape,
subsample, border_mode, filter_dilation):
dil_filters = ((filters_shape[2] - 1) * filter_dilation[0] + 1,
(filters_shape[3] - 1) * filter_dilation[1] + 1)
if border_mode == "valid":
border_mode = (0, 0)
if border_mode == "full":
border_mode = (filters_shape[2] - 1, filters_shape[3] - 1)
border_mode = (dil_filters[0] - 1,
dil_filters[1] - 1)
batch_size = inputs_shape[0]
num_filters = filters_shape[0]
return ((batch_size, num_filters,) +
tuple(None if i is None or k is None
else ((i + 2 * pad - k) // d + 1)
for i, k, d, pad in zip(inputs_shape[2:],
else ((i + 2 * pad - ((k - 1) * fd + 1)) // d + 1)
for i, k, d, pad, fd in zip(inputs_shape[2:],
filters_shape[2:],
subsample, border_mode)))
subsample, border_mode,
filter_dilation)))
def run_fwd(self, inputs_shape, filters_shape, ref=conv_corr,
subsample=(1, 1), verify_grad=True, mode=None,
border_mode='valid', filter_flip=True, provide_shape=False,
target_op=None, check_trace=False):
border_mode='valid', filter_flip=True,
provide_shape=False, target_op=None,
check_trace=False, filter_dilation=(1, 1)):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
......@@ -120,13 +136,15 @@ class BaseTestConv2d(unittest.TestCase):
c_ref = ref(inputs, filters,
border_mode=border_mode,
subsample=subsample,
conv_mode=conv_mode)
conv_mode=conv_mode,
filter_dilation=filter_dilation)
c = conv.conv2d(inputs, filters,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
input_shape=imshp,
filter_shape=kshp)
filter_shape=kshp,
filter_dilation=filter_dilation)
f_ref = theano.function([], c_ref, mode='FAST_RUN')
f = theano.function([], c, mode=mode)
......@@ -143,15 +161,17 @@ class BaseTestConv2d(unittest.TestCase):
if verify_grad:
utt.verify_grad(conv.AbstractConv2d(border_mode=border_mode,
imshp=imshp, kshp=kshp,
subsample=subsample),
subsample=subsample,
filter_dilation=filter_dilation),
[inputs_val, filters_val],
mode=mode)
def run_gradweight(self, inputs_shape, filters_shape, output_shape,
ref=conv_corr_gw, subsample=(1, 1), filter_flip=True,
verify_grad=True, mode=None, border_mode='valid',
provide_shape=False, target_op=None, check_trace=False):
ref=conv_corr_gw, subsample=(1, 1),
filter_flip=True, verify_grad=True, mode=None,
border_mode='valid', provide_shape=False,
target_op=None, check_trace=False,
filter_dilation=(1, 1)):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
output_val = numpy.random.random(output_shape).astype('float32')
......@@ -171,13 +191,15 @@ class BaseTestConv2d(unittest.TestCase):
c = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
filter_flip=filter_flip,
subsample=subsample,
imshp=imshp, kshp=kshp)
imshp=imshp, kshp=kshp,
filter_dilation=filter_dilation)
c = c(inputs, output, filters_shape[-2:])
c_ref = ref(inputs, output,
filters_shape,
border_mode=border_mode,
subsample=subsample,
conv_mode=conv_mode)
conv_mode=conv_mode,
filter_dilation=filter_dilation)
f = theano.function([], c, mode=mode)
f_ref = theano.function([], c_ref, mode='FAST_RUN')
......@@ -193,7 +215,8 @@ class BaseTestConv2d(unittest.TestCase):
def abstract_conv2d_gradweight(inputs_val, output_val):
conv_op = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
subsample=subsample)
subsample=subsample,
filter_dilation=filter_dilation)
return conv_op(inputs_val, output_val, filters_shape[-2:])
if verify_grad:
......@@ -204,8 +227,8 @@ class BaseTestConv2d(unittest.TestCase):
def run_gradinput(self, inputs_shape, filters_shape, output_shape,
ref=conv_corr_gi, subsample=(1, 1), filter_flip=True,
verify_grad=True, mode=None, border_mode='valid',
provide_shape=False, target_op=None, check_trace=False):
provide_shape=False, target_op=None,
check_trace=False, filter_dilation=(1, 1)):
output_val = numpy.random.random(output_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
output = self.shared(output_val)
......@@ -224,11 +247,12 @@ class BaseTestConv2d(unittest.TestCase):
c = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
imshp=imshp, kshp=kshp)
imshp=imshp, kshp=kshp,
filter_dilation=filter_dilation)
c = c(filters, output, inputs_shape[-2:])
c_ref = ref(filters, output, inputs_shape,
border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)
conv_mode=conv_mode, filter_dilation=filter_dilation)
f = theano.function([], c, mode=mode)
f_ref = theano.function([], c_ref, mode='FAST_RUN')
......@@ -244,7 +268,8 @@ class BaseTestConv2d(unittest.TestCase):
def abstract_conv2d_gradinputs(filters_val, output_val):
conv_op = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
subsample=subsample)
subsample=subsample,
filter_dilation=filter_dilation)
return conv_op(filters_val, output_val, inputs_shape[-2:])
if verify_grad:
......@@ -266,15 +291,18 @@ class BaseTestConv2d(unittest.TestCase):
self.tcase(i, f, ds, db, dflip, provide_shape)
except SkipTest as e:
skipped = e
for fd in self.filters_dilations:
for s in self.subsamples:
for b in self.border_modes:
try:
self.tcase(i, f, s, db, dflip, dprovide_shape)
self.tcase(i, f, s, db, dflip,
dprovide_shape, fd)
except SkipTest as e:
skipped = e
for flip in self.filter_flip:
try:
self.tcase(i, f, ds, db, flip, dprovide_shape)
self.tcase(i, f, ds, db, flip,
dprovide_shape)
except SkipTest as e:
skipped = e
if skipped:
......@@ -287,26 +315,27 @@ class TestCorrConv2d(BaseTestConv2d):
raise SkipTest()
return super(TestCorrConv2d, self).setUp()
def tcase(self, i, f, s, b, flip, provide_shape):
o = self.get_output_shape(i, f, s, b)
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
o = self.get_output_shape(i, f, s, b, fd)
if (not theano.config.blas.ldflags or
not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"):
raise SkipTest("Need blas to test conv2d")
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, provide_shape=provide_shape,
border_mode=b, filter_flip=flip, target_op=CorrMM,
check_trace=True)
border_mode=b, filter_flip=flip,
target_op=CorrMM, check_trace=True,
filter_dilation=fd)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=True,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=CorrMM_gradWeights,
check_trace=True)
check_trace=True, filter_dilation=fd)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s, verify_grad=True,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip, target_op=CorrMM_gradInputs,
check_trace=True)
check_trace=True, filter_dilation=fd)
class TestCpuConv2d(BaseTestConv2d):
......@@ -319,9 +348,11 @@ class TestCpuConv2d(BaseTestConv2d):
def tearDown(self):
theano.config.on_opt_error = self.opt_err
def tcase(self, i, f, s, b, flip, provide_shape):
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
if fd != (1, 1):
raise SkipTest("No dilation implementation for basic cpu ConvOp.")
mode = self.mode
o = self.get_output_shape(i, f, s, b)
o = self.get_output_shape(i, f, s, b, fd)
fwd_OK = True
gradweight_OK = True
gradinput_OK = True
......@@ -347,11 +378,12 @@ class TestCpuConv2d(BaseTestConv2d):
if fwd_OK:
if not theano.config.blas.ldflags:
raise SkipTest("Need blas to test conv2d")
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=(gradweight_OK and gradinput_OK),
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
mode=mode, provide_shape=provide_shape,
border_mode=b, filter_flip=flip, target_op=ConvOp,
check_trace=True)
check_trace=True, filter_dilation=fd)
else:
self.assertRaises(AssertionError,
self.run_fwd,
......@@ -363,7 +395,8 @@ class TestCpuConv2d(BaseTestConv2d):
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True)
check_trace=True,
filter_dilation=fd)
if gradweight_OK:
if not theano.config.blas.ldflags:
......@@ -374,7 +407,8 @@ class TestCpuConv2d(BaseTestConv2d):
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=(ConvOp, ConvGrad3D),
check_trace=True)
check_trace=True,
filter_dilation=fd)
else:
self.assertRaises(AssertionError,
self.run_gradweight,
......@@ -387,7 +421,8 @@ class TestCpuConv2d(BaseTestConv2d):
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True)
check_trace=True,
filter_dilation=fd)
if gradinput_OK:
if not theano.config.blas.ldflags:
......@@ -398,7 +433,8 @@ class TestCpuConv2d(BaseTestConv2d):
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=(ConvOp, ConvTransp3D),
check_trace=True)
check_trace=True,
filter_dilation=fd)
else:
self.assertRaises(AssertionError,
self.run_gradinput,
......@@ -411,7 +447,8 @@ class TestCpuConv2d(BaseTestConv2d):
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True)
check_trace=True,
filter_dilation=fd)
def test_constant_shapes():
......
......@@ -32,8 +32,8 @@ class TestCorr2D(utt.InferShapeTester):
def validate(self, image_shape, filter_shape,
border_mode='valid', subsample=(1, 1),
input=None, filters=None,
verify_grad=True, non_contiguous=False):
input=None, filters=None, verify_grad=True,
non_contiguous=False, filter_dilation=(1, 1)):
"""
:param image_shape: The constant shape info passed to corrMM.
:param filter_shape: The constant shape info passed to corrMM.
......@@ -55,7 +55,8 @@ class TestCorr2D(utt.InferShapeTester):
# define theano graph and function
input.name = 'input'
filters.name = 'filters'
rval = corr.CorrMM(border_mode, subsample)(input, filters)
rval = corr.CorrMM(border_mode, subsample,
filter_dilation)(input, filters)
rval.name = 'corr_output'
return rval
......@@ -86,20 +87,22 @@ class TestCorr2D(utt.InferShapeTester):
orig_image_data = image_data
img_shape2d = numpy.array(N_image_shape[-2:])
fil_shape2d = numpy.array(N_filter_shape[-2:])
dil_shape2d = numpy.array(filter_dilation)
dil_fil_shape2d = (fil_shape2d - 1) * dil_shape2d + 1
subsample2d = numpy.array(subsample)
if border_mode == 'full':
padHW = (fil_shape2d - 1)
padHW = (dil_fil_shape2d - 1)
elif border_mode == 'valid':
padHW = numpy.array([0, 0])
elif border_mode == 'half':
padHW = numpy.floor(fil_shape2d / 2).astype('int32')
padHW = numpy.floor(dil_fil_shape2d / 2).astype('int32')
elif isinstance(border_mode, tuple):
padHW = numpy.array(border_mode)
elif isinstance(border_mode, integer_types):
padHW = numpy.array([border_mode, border_mode])
else:
raise NotImplementedError('Unsupported border_mode {}'.format(border_mode))
out_shape2d = numpy.floor((img_shape2d + 2 * (padHW) - fil_shape2d) / subsample2d) + 1
out_shape2d = numpy.floor((img_shape2d + 2 * (padHW) - dil_fil_shape2d) / subsample2d) + 1
# avoid numpy deprecation
out_shape2d = out_shape2d.astype('int32')
out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape2d)
......@@ -124,8 +127,8 @@ class TestCorr2D(utt.InferShapeTester):
for col in range(ref_output.shape[3]):
icol = col * subsample[1] # image col
ref_output[bb, nn, row, col] += (image2d[
irow:irow + N_filter_shape[2],
icol:icol + N_filter_shape[3]] * filter2d[::-1, ::-1]
irow:irow + dil_fil_shape2d[0]:filter_dilation[0],
icol:icol + dil_fil_shape2d[1]:filter_dilation[1]] * filter2d[::-1, ::-1]
).sum()
self.assertTrue(_allclose(theano_output, ref_output))
......@@ -186,6 +189,28 @@ class TestCorr2D(utt.InferShapeTester):
self.validate((1, 1, 6, 6), (1, 1, 3, 3), 1, subsample=(3, 3))
def test_filter_dilation(self):
    """Exercise correlation with filter dilations other than (1, 1).

    Covers the 'valid', 'full' and 'half' border modes as well as
    explicit (tuple and integer) padding, and finishes with one case
    that combines dilation with subsampling.
    """
    # Each entry: (image_shape, filter_shape, border_mode, filter_dilation)
    dilation_cases = [
        ((3, 2, 7, 5), (5, 2, 2, 3), 'valid', (2, 2)),
        ((3, 2, 14, 10), (5, 2, 2, 3), 'valid', (3, 1)),
        ((1, 1, 14, 14), (1, 1, 3, 3), 'valid', (2, 3)),
        ((3, 2, 7, 5), (5, 2, 2, 3), 'full', (2, 2)),
        ((3, 2, 7, 5), (5, 2, 2, 3), 'full', (3, 1)),
        ((1, 1, 6, 6), (1, 1, 3, 3), 'full', (2, 3)),
        ((3, 2, 7, 5), (5, 2, 2, 3), 'half', (2, 2)),
        ((3, 2, 7, 5), (5, 2, 2, 3), 'half', (3, 1)),
        ((1, 1, 6, 6), (1, 1, 3, 3), 'half', (2, 3)),
        ((3, 2, 7, 5), (5, 2, 2, 3), (1, 1), (2, 2)),
        ((3, 2, 7, 5), (5, 2, 2, 3), (2, 1), (2, 1)),
        ((1, 1, 6, 6), (1, 1, 3, 3), (1, 2), (1, 2)),
    ]
    for image_shape, filter_shape, border_mode, dilation in dilation_cases:
        self.validate(image_shape, filter_shape, border_mode,
                      filter_dilation=dilation)
    # Dilation combined with integer border mode and subsampling.
    self.validate((1, 1, 6, 6), (1, 1, 3, 3), 1,
                  subsample=(3, 3), filter_dilation=(2, 2))
@attr('slow')
def test_shape_Constant_tensor(self):
"""
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论