Commit 1d2411c6 authored by Gijs van Tulder

Enable border_mode != valid and filter_dilation in GpuCorr3dMM.

This reuses the implementation of GpuCorr2dMM and its gradient ops.
Parent 1d9aff8a
......@@ -1396,29 +1396,64 @@ class BaseGpuCorr3dMM(GpuOp):
Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
`GpuCorr3dMM_gradInputs`. Cannot be used directly.
Parameters
----------
border_mode : {'valid', 'full', 'half'}
Additionally, the padding size could be directly specified by an integer
or a tuple of three integers
subsample
Perform subsampling of the output (default: (1, 1, 1)).
filter_dilation
Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
pad
*deprecated*, now you should always use border_mode.
"""
__props__ = ('border_mode', 'subsample', 'pad')
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation')
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
filter_dilation=(1, 1, 1),
pad=(0, 0, 0)):
if border_mode != "valid":
raise ValueError("border_mode must be 'valid'")
if pad != (0, 0, 0):
_logger.warning(
'do not use pad for BaseGpuCorr3dMM; please set padding in '
'border_mode parameter, see the docstring for more details')
if border_mode != "valid":
raise ValueError("border_mode must be 'valid' if pad is given")
border_mode = pad
if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode, border_mode)
if isinstance(border_mode, tuple):
pad_h, pad_w, pad_d = map(int, border_mode)
border_mode = (pad_h, pad_w, pad_d)
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full', 'half')):
raise ValueError(
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of three'
' integers'.format(border_mode))
self.border_mode = border_mode
if len(subsample) != 3:
raise ValueError("subsample must have three elements")
self.subsample = subsample
if (pad not in ("half", "full")) and (len(pad) != 3):
raise ValueError("pad must be 'half', 'full', or have three elements")
self.pad = pad
if len(filter_dilation) != 3:
raise ValueError("filter_dilation must have three elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
@property
def pad(self):
if self.border_mode != 'valid':
return self.border_mode
return (0, 0, 0)
def __str__(self):
return '%s{%s, %s, pad=%r}' % (
return '%s{%s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample),
self.pad)
str(self.filter_dilation))
def flops(self, inp, outp):
""" Useful with the hack in profiling to print the MFlops"""
......@@ -1440,7 +1475,7 @@ class BaseGpuCorr3dMM(GpuOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 23)
return (0, 25)
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -1503,15 +1538,17 @@ class BaseGpuCorr3dMM(GpuOp):
Ignored otherwise.
"""
if self.border_mode != "valid":
raise ValueError("mode must be 'valid'")
dH, dW, dD = self.subsample
if self.pad == "half":
dilH, dilW, dilD = self.filter_dilation
if self.border_mode == "half":
padH = padW = padD = -1
elif self.pad == "full":
elif self.border_mode == "full":
padH = padW = padD = -2
elif isinstance(self.border_mode, tuple):
padH, padW, padD = self.border_mode
else:
padH, padW, padD = self.pad
assert self.border_mode == "valid"
padH = padW = padD = 0
if direction == "forward":
direction = 0
out = top
......@@ -1556,6 +1593,9 @@ class BaseGpuCorr3dMM(GpuOp):
int dH = %(dH)s;
int dW = %(dW)s;
int dD = %(dD)s;
int dilH = %(dilH)s;
int dilW = %(dilW)s;
int dilD = %(dilD)s;
int padH = %(padH)s;
int padW = %(padW)s;
int padD = %(padD)s;
......@@ -1585,12 +1625,12 @@ class BaseGpuCorr3dMM(GpuOp):
else if (padH == -2)
{
// vertical full padding, we can infer the kernel height
kH = 2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kH = (2 - CudaNdarray_HOST_DIMS(bottom)[2] + (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1;
}
else
{
// explicit padding, we can infer the kernel height
kH = CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH;
kH = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - (CudaNdarray_HOST_DIMS(top)[2] - 1)*dH - 1) / dilH + 1 ;
}
if ((dW != 1) || (padW == -1))
{
......@@ -1598,11 +1638,11 @@ class BaseGpuCorr3dMM(GpuOp):
}
else if (padW == -2)
{
kW = 2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
kW = (2 - CudaNdarray_HOST_DIMS(bottom)[3] + (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
else
{
kW = CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW;
kW = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
}
if ((dD != 1) || (padD == -1))
{
......@@ -1610,22 +1650,27 @@ class BaseGpuCorr3dMM(GpuOp):
}
else if (padD == -2)
{
kD = 2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD;
kD = (2 - CudaNdarray_HOST_DIMS(bottom)[4] + (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
}
else
{
kD = CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD;
kD = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD - 1) / dilD+ 1;
}
}
// Implicit dilated kernel size
int dil_kH = (kH - 1) * dilH + 1;
int dil_kW = (kW - 1) * dilW + 1;
int dil_kD = (kD - 1) * dilD + 1;
// Auto-padding if requested
if (padH == -1)
{ // vertical half padding
padH = kH / 2;
padH = dil_kH / 2;
}
else if (padH == -2)
{ // vertical full padding
padH = kH - 1;
padH = dil_kH - 1;
}
else if (padH < 0)
{
......@@ -1633,10 +1678,10 @@ class BaseGpuCorr3dMM(GpuOp):
%(fail)s
}
if (padW == -1) { // horizontal half padding
padW = kW / 2;
padW = dil_kW / 2;
}
else if (padW == -2) { // horizontal full padding
padW = kW - 1;
padW = dil_kW - 1;
}
else if (padW < 0)
{
......@@ -1645,11 +1690,11 @@ class BaseGpuCorr3dMM(GpuOp):
}
if (padD == -1)
{ // horizontal half padding
padD = kD / 2;
padD = dil_kD / 2;
}
else if (padD == -2)
{ // horizontal full padding
padD = kD - 1;
padD = dil_kD - 1;
}
else if (padD < 0)
{
......@@ -1662,16 +1707,16 @@ class BaseGpuCorr3dMM(GpuOp):
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width, depth)
// height and width: top = (bottom + 2*pad - weight) / sample + 1
// height, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = CudaNdarray_HOST_DIMS(bottom)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[0];
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - CudaNdarray_HOST_DIMS(weights)[2]) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - CudaNdarray_HOST_DIMS(weights)[3]) / dW + 1;
out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - CudaNdarray_HOST_DIMS(weights)[4]) / dD + 1;
out_dim[2] = (CudaNdarray_HOST_DIMS(bottom)[2] + 2*padH - ((CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (CudaNdarray_HOST_DIMS(bottom)[3] + 2*padW - ((CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
out_dim[4] = (CudaNdarray_HOST_DIMS(bottom)[4] + 2*padD - ((CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1;
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width, depth)
// height, width and depth: weights = bottom + 2*pad - (top-1) * sample
// height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = CudaNdarray_HOST_DIMS(top)[1];
out_dim[1] = CudaNdarray_HOST_DIMS(bottom)[1];
out_dim[2] = kH; // already inferred further above
......@@ -1680,12 +1725,12 @@ class BaseGpuCorr3dMM(GpuOp):
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width, depth)
// height, width and depth: bottom = (top-1) * sample + weights - 2*pad
// height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = CudaNdarray_HOST_DIMS(top)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(weights)[1];
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + CudaNdarray_HOST_DIMS(weights)[2] - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + CudaNdarray_HOST_DIMS(weights)[3] - 2*padW;
out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + CudaNdarray_HOST_DIMS(weights)[4] - 2*padD;
out_dim[2] = (dH != 1) ? %(height)s : (CudaNdarray_HOST_DIMS(top)[2] - 1) * dH + (CudaNdarray_HOST_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (dW != 1) ? %(width)s : (CudaNdarray_HOST_DIMS(top)[3] - 1) * dW + (CudaNdarray_HOST_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
out_dim[4] = (dD != 1) ? %(depth)s : (CudaNdarray_HOST_DIMS(top)[4] - 1) * dD + (CudaNdarray_HOST_DIMS(weights)[4]-1)*dilD + 1 - 2*padD;
break;
default:
PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n");
......@@ -1716,7 +1761,8 @@ class BaseGpuCorr3dMM(GpuOp):
}
// Call CUDA code
out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dD, padH, padW, padD);
out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dD,
dilH, dilW, dilD, padH, padW, padD);
if (out2==NULL){
%(fail)s
}
......@@ -1734,19 +1780,28 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
Currently supports "valid" only; "full" can be simulated by setting
`pad="full"` (at the cost of performance), or by using
`GpuCorrMM_gradInputs`.
The width of a border of implicit zeros to pad the
input with. Must be a tuple with 3 elements giving the width of
the padding on each side, or a single integer to pad the same
on all sides, or a string shortcut setting the padding at runtime:
``'valid'`` for ``(0, 0, 0)`` (valid convolution, no padding), ``'full'``
for ``(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)``
(full convolution), ``'half'`` for ``(kernel_rows // 2,
kernel_columns // 2, kernel_depth // 2)`` (same convolution for
odd-sized kernels). Note that the three widths are each
applied twice, once per side (left and right, top and bottom, front
and back).
subsample
The subsample operation applied to each output image. Should be a tuple
with 3 elements. `(sv, sh, sl)` is equivalent to
`GpuCorrMM(...)(...)[:,:,::sv, ::sh, ::sl]`, but faster.
Set to `(1, 1, 1)` to disable subsampling.
filter_dilation
The filter dilation operation applied to each input image.
Should be a tuple with 3 elements.
Set to `(1, 1, 1)` to disable filter dilation.
pad
The width of a border of implicit zeros to pad the input image with.
Should be a tuple with 3 elements giving the numbers of rows and columns
to pad on each side, or "half" to set the padding
to `(kernel_rows // 2, kernel_columns // 2, kernel_depth // 2)`,
or "full" to set the padding
to `(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)` at runtime.
Set to `(0, 0, 0)` to disable padding.
Deprecated alias for `border_mode`.
Notes
-----
......@@ -1765,8 +1820,10 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
batchsize or number of filters) may also work around the CUBLAS bug.
"""
def __init__(self, border_mode="valid", subsample=(1, 1, 1), pad=(0, 0, 0)):
super(GpuCorr3dMM, self).__init__(border_mode, subsample, pad)
def __init__(self, border_mode="valid", subsample=(1, 1, 1),
filter_dilation=(1, 1, 1), pad=(0, 0, 0)):
super(GpuCorr3dMM, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, img, kern):
img = as_cuda_ndarray_variable(img)
......@@ -1792,14 +1849,12 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
top = gpu_contiguous(top)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
self.subsample,
self.pad)(weights,
top,
bottom.shape[-3:])
self.filter_dilation)(
weights, top, bottom.shape[-3:])
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.pad)(bottom,
top,
weights.shape[-3:])
self.filter_dilation)(
bottom, top, weights.shape[-3:])
return d_bottom, d_weights
......@@ -1815,8 +1870,10 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
filter_dilation=(1, 1, 1),
pad=(0, 0, 0)):
super(GpuCorr3dMM_gradWeights, self).__init__(border_mode, subsample, pad)
super(GpuCorr3dMM_gradWeights, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, img, topgrad, shape=None):
img = as_cuda_ndarray_variable(img)
......@@ -1828,10 +1885,14 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
raise TypeError('img must be 5D tensor')
if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor')
if self.subsample != (1, 1, 1) or self.pad == "half":
if self.subsample != (1, 1, 1) or self.border_mode == "half":
if shape is None:
raise ValueError('shape must be given if subsample != (1, 1, 1), or pad == "half"')
raise ValueError('shape must be given if subsample != (1, 1, 1)'
' or border_mode == "half"')
height_width_depth = [shape[0], shape[1], shape[2]]
assert shape[0].ndim == 0
assert shape[1].ndim == 0
assert shape[2].ndim == 0
else:
height_width_depth = []
......@@ -1850,9 +1911,13 @@ class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM):
bottom, top = inp[:2]
weights, = grads
weights = gpu_contiguous(weights)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)(weights, top, bottom.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)(
bottom, weights)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation)(weights,
top,
bottom.shape[-3:])
d_top = GpuCorr3dMM(
self.border_mode, self.subsample, self.filter_dilation)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
return (d_bottom, d_top) + d_height_width_depth
......@@ -1875,8 +1940,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
def __init__(self, border_mode="valid",
subsample=(1, 1, 1),
filter_dilation=(1, 1, 1),
pad=(0, 0, 0)):
super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample, pad)
super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation, pad)
def make_node(self, kern, topgrad, shape=None):
kern = as_cuda_ndarray_variable(kern)
......@@ -1888,6 +1955,10 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
if self.subsample != (1, 1, 1) and shape is None:
raise ValueError('shape must be given if subsample != (1, 1, 1)')
height_width_depth = [shape[0], shape[1], shape[2]] if self.subsample != (1, 1, 1) else []
if height_width_depth:
assert shape[0].ndim == 0
assert shape[1].ndim == 0
assert shape[2].ndim == 0
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
False, False, False]
......@@ -1906,12 +1977,12 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
bottom = gpu_contiguous(bottom)
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.pad)(bottom,
top,
weights.shape[-3:])
self.filter_dilation)(bottom,
top,
weights.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode,
self.subsample,
self.pad)(bottom, weights)
self.filter_dilation)(bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
* 3 if len(inp) == 5 else ()
return (d_weights, d_top) + d_height_width_depth
......
......@@ -52,6 +52,54 @@ inline int GET_BLOCKS(const int N) {
// (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// CUDA kernel for the case of dilation
// Dilated 3D im2col: each thread handles one (channel, h_col, w_col, d_col)
// output location and copies the kernel_h*kernel_w*kernel_d input taps of the
// corresponding window into one column of data_col, with taps spaced
// dilation_{h,w,d} apart in the input. Taps that fall outside the input
// (implicit zero padding) are written as 0.
__global__ void dilated_im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_col) {
CUDA_KERNEL_LOOP(index, n) {
// Decode index as ((c_im*height_col + h_col)*width_col + w_col)*depth_col + d_col.
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
// First row of data_col written by this thread (one row per kernel tap).
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
// Top-left-front corner of the receptive field in input coordinates;
// may be negative when padding is in effect.
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
// Walk the kernel grid; consecutive taps for this output location are
// height_col*width_col*depth_col elements apart in data_col.
for (int i = 0; i < kernel_h; ++i)
{
int h_im = h_offset + i * dilation_h;
for (int j = 0; j < kernel_w; ++j)
{
int w_im = w_offset + j * dilation_w;
for (int k = 0; k < kernel_d; ++k)
{
int d_im = d_offset + k * dilation_d;
// Out-of-bounds taps contribute zero (implicit padding).
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * dilation_h * (width * depth) +
j * dilation_w * depth +
k * dilation_d] : 0;
data_col_ptr += height_col * width_col * depth_col;
}
}
}
}
}
__global__ void im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
......@@ -62,41 +110,35 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
{
CUDA_KERNEL_LOOP(index, n)
{
int d_out = index % depth_col;
int w_index = index / depth_col;
int w_out = w_index % width_col;
int h_index = w_index / width_col;
int h_out = h_index % height_col;
int channel_in = h_index / height_col;
//channel_in = 1;
int channel_out = channel_in * kernel_h * kernel_w * kernel_d;
int h_in = h_out * stride_h - pad_h;
int w_in = w_out * stride_w - pad_w;
int d_in = d_out * stride_d - pad_d;
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += channel_out * (height_col * width_col * depth_col) +
h_out * (width_col * depth_col) + w_out * depth_col + d_out;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += channel_in * (height * width * depth) +
h_in * (width * depth) + w_in * depth + d_in;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
for (int i = 0; i < kernel_h; ++i)
{
int h = h_in + i;
int h_im = h_offset + i;
for (int j = 0; j < kernel_w; ++j)
{
int w = w_in + j;
int w_im = w_offset + j;
for (int k = 0; k < kernel_d; ++k)
{
int d = d_in + k;
*data_col_ptr = (h >= 0 && w >= 0 && d >= 0 &&
h < height && w < width && d < depth) ?
data_im_ptr[i * (width * depth) + j *depth + k] : 0;
int d_im = d_offset + k;
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * (width * depth) + j * depth + k] : 0;
data_col_ptr += height_col * width_col * depth_col;
}
}
......@@ -107,31 +149,105 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
void im3d2col(const float* data_im, const int channels,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
float* data_col)
{
// We are going to launch channels * height_col * width_col * depth_col kernels, each
// kernel responsible for copying a single-channel grid.
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
int dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
int num_kernels = channels * height_col * width_col * depth_col;
im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
dilated_im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
}
else{
im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_im,
height, width, depth,
kernel_h, kernel_w, kernel_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_col);
}
}
// CUDA kernel for the case of dilation
// Dilated 3D col2im: inverse of dilated_im3d2col_kernel. One thread per input
// element (c_im, h_im, w_im, d_im); it sums every data_col entry whose dilated
// kernel window covers that element and writes the total to data_im[index].
// Launching one thread per *input* element avoids atomic adds.
__global__ void dilated_col2im3d_kernel(
const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_im)
{
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
// Decode index into padded input coordinates (pad offsets added so the
// window-range math below can stay non-negative).
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
// Effective (dilated) kernel sizes.
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
int kernel_extent_d = (kernel_d - 1) * dilation_d + 1;
// compute the start and end of the output
// (range of output positions whose window can contain this input element)
const int d_col_start = (d_im < kernel_extent_d) ? 0 : (d_im - kernel_extent_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
int d_k = (d_im - d_col * stride_d);
// Only offsets that land exactly on the dilation grid correspond
// to a real kernel tap.
if (h_k % dilation_h == 0 && w_k % dilation_w == 0 && d_k % dilation_d == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
d_k /= dilation_d;
// Row (c_im, h_k, w_k, d_k) of data_col, column (h_col, w_col, d_col).
int data_col_index = c_im * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col +
h_k * kernel_w * kernel_d * height_col * width_col * depth_col +
w_k * kernel_d * height_col * width_col * depth_col +
d_k * height_col * width_col * depth_col +
h_col * width_col * depth_col +
w_col * depth_col +
d_col;
val += data_col[data_col_index];
}
}
}
}
data_im[index] = val;
}
}
__global__ void col2im3d_kernel(const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int patch_h, const int patch_w, const int patch_d,
const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
......@@ -140,59 +256,78 @@ __global__ void col2im3d_kernel(const int n, const float* data_col,
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
int d = index % depth + pad_d;
int w_index = index / depth;
int w = w_index % width + pad_w;
int h_index = w_index / width;
int h = h_index % height + pad_h;
int c = h_index / height;
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
// compute the start and end of the output
int d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1;
int d_col_end = min(d / stride_d + 1, depth_col);
int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, height_col);
const int d_col_start = (d_im < kernel_d) ? 0 : (d_im - kernel_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
int offset =
(c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col;
(c_im * kernel_h * kernel_w * kernel_d + h_im * kernel_w * kernel_d +
w_im * kernel_d + d_im) * height_col * width_col * depth_col;
int coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
int coeff_h_col = (1 - stride_h * kernel_w * kernel_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w * kernel_d * height_col * width_col) * depth_col;
int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
for (int d_col = d_col_start; d_col < d_col_end; ++d_col)
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
}
}
}
data_im[index] = val;
data_im[index] = val;
}
}
void col2im3d(const float* data_col, const int channels,
const int height, const int width, const int depth,
const int patch_h, const int patch_w, const int patch_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
float* data_im)
{
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
int dil_patch_h = (patch_h - 1) * dilation_h + 1;
int dil_patch_w = (patch_w - 1) * dilation_w + 1;
int dil_patch_d = (patch_d - 1) * dilation_d + 1;
int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
int num_kernels = channels * height * width * depth;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
dilated_col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
}
else{
col2im3d_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d,
height_col, width_col, depth_col,
data_im);
}
}
......@@ -210,6 +345,9 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
const int dH = 1,
const int dW = 1,
const int dD = 1,
const int dilH = 1,
const int dilW = 1,
const int dilD = 1,
const int padH = 0,
const int padW = 0,
const int padD = 0)
......@@ -286,10 +424,14 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
"GpuCorr3dMM images and kernel must have the same stack size\n");
return 0;
}
// implicit dilated filter
const int dil_kH = (kH - 1) * dilH + 1;
const int dil_kW = (kW - 1) * dilW + 1;
const int dil_kD = (kD - 1) * dilD + 1;
// top: (batchSize, nFilters, topHeight, topWidth, topDepth)
const int topHeight = int((bottomHeight + 2*padH - kH) / dH) + 1;
const int topWidth = int((bottomWidth + 2*padW - kW) / dW) + 1;
const int topDepth = int((bottomDepth + 2*padD - kD) / dD) + 1;
const int topHeight = int((bottomHeight + 2*padH - dil_kH) / dH) + 1;
const int topWidth = int((bottomWidth + 2*padW - dil_kW) / dW) + 1;
const int topDepth = int((bottomDepth + 2*padD - dil_kD) / dD) + 1;
if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
......@@ -345,6 +487,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD,
col->devdata);
......@@ -392,6 +535,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
im3d2col(bottom->devdata + n * bottom_stride, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD,
col->devdata);
......@@ -461,6 +605,7 @@ CudaNdarray* corr3dMM(CudaNdarray *const bottom,
col2im3d(col->devdata, nChannels,
bottomHeight, bottomWidth, bottomDepth,
kH, kW, kD,
dilH, dilW, dilD,
padH, padW, padD,
dH, dW, dD, bottom->devdata + n * bottom_stride);
cudaError_t err = cudaGetLastError();
......
from __future__ import absolute_import, print_function, division
import unittest
import numpy
from six.moves import xrange
try:
from scipy import ndimage
except ImportError:
ndimage = None
import theano
from theano.tests import unittest_tools as utt
......@@ -21,31 +26,127 @@ else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
# python reference implementation of a 3D convolution
# see also: theano.tensor.nnet.tests.test_conv3d2d
# expects: (batch, 0, channels, 1, 2)
def pyconv3d(signals, filters, border_mode='valid', dilation=(1, 1, 1)):
    """Pure NumPy/SciPy reference implementation of 3D convolution.

    Expects `signals` as (batch, time, channels, height, width) and
    `filters` as (nfilters, time, channels, height, width).
    `border_mode` is 'valid', 'full', 'half', or a 3-tuple of pad widths;
    `dilation` gives the per-axis filter dilation factors.
    Returns an array of shape (batch, t_out, nfilters, h_out, w_out).
    """
    Ns, Ts, C, Hs, Ws = signals.shape
    Nf, Tf, C, Hf, Wf = filters.shape
    Tdil, Hdil, Wdil = dilation
    # Effective kernel sizes once dilation is applied.
    Tfdil = (Tf - 1) * Tdil + 1
    Hfdil = (Hf - 1) * Hdil + 1
    Wfdil = (Wf - 1) * Wdil + 1
    # Translate border_mode into explicit per-axis pad widths.
    if border_mode == 'full':
        Tpad, Hpad, Wpad = Tfdil - 1, Hfdil - 1, Wfdil - 1
    elif border_mode == 'half':
        Tpad, Hpad, Wpad = Tfdil // 2, Hfdil // 2, Wfdil // 2
    elif isinstance(border_mode, tuple):
        Tpad, Hpad, Wpad = [int(p) for p in border_mode]
    else:
        Tpad, Hpad, Wpad = 0, 0, 0
    if Tpad > 0 or Hpad > 0 or Wpad > 0:
        # Zero-pad the signals (float32, matching the GPU op's dtype).
        padded = numpy.zeros((Ns, Ts + 2 * Tpad, C,
                              Hs + 2 * Hpad, Ws + 2 * Wpad), 'float32')
        padded[:, Tpad:(Ts + Tpad), :, Hpad:(Hs + Hpad),
               Wpad:(Ws + Wpad)] = signals
        signals = padded
        Ns, Ts, C, Hs, Ws = signals.shape
    Tfdil2 = Tfdil // 2
    Hfdil2 = Hfdil // 2
    Wfdil2 = Wfdil // 2
    # Materialize the dilated filters by scattering taps on a strided grid.
    dilated_filters = numpy.zeros((Nf, Tfdil, C, Hfdil, Wfdil),
                                  dtype=filters.dtype)
    dilated_filters[:, ::Tdil, :, ::Hdil, ::Wdil] = filters
    # Perform valid convolution on the (possibly padded) signals.
    rval = numpy.zeros((Ns, Ts - Tfdil + 1, Nf,
                        Hs - Hfdil + 1, Ws - Wfdil + 1))
    for ns in range(Ns):
        for nf in range(Nf):
            for c in range(C):
                s_i = signals[ns, :, c, :, :]
                f_i = dilated_filters[nf, :, c, :, :]
                r_i = rval[ns, :, nf, :, :]
                # scipy.signal.convolve does valid convolution but is quite
                # slow; scipy.ndimage.convolve is faster but only supports
                # 'same' convolution, so its output is cropped below.
                # origin must be -1 for even filters, 0 for odd filters.
                o_i = ndimage.convolve(
                    s_i, f_i, mode='constant', cval=1,
                    origin=tuple(s % 2 - 1 for s in f_i.shape))
                # Crop the 'same' result down to the 'valid' region; this
                # should equal signal.convolve(s_i, f_i, mode='valid').
                o_i = o_i[Tfdil2:(r_i.shape[0] + Tfdil2),
                          Hfdil2:(r_i.shape[1] + Hfdil2),
                          Wfdil2:(r_i.shape[2] + Wfdil2)]
                r_i += o_i
    return rval
class TestCorr3DMM(unittest.TestCase):
def run_conv_valid(self, inputs_shape, filters_shape,
                   border_mode='valid',
                   filter_dilation=(1, 1, 1),
                   subsample=(1, 1, 1),
                   verify_grad=False):
    """Compare GpuCorr3dMM against a reference implementation.

    Parameters
    ----------
    inputs_shape : 5-sequence, (batch, time, height, width, channels)
    filters_shape : 5-sequence, (nfilters, time, height, width, channels)
    border_mode : passed through to GpuCorr3dMM and the reference
    filter_dilation, subsample : 3-tuples passed through to GpuCorr3dMM
    verify_grad : if True, additionally check gradients numerically

    Raises SkipTest when no reference implementation covers the
    requested combination of border_mode, dilation and subsampling,
    or when SciPy is unavailable.
    """
    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')
    inputs = shared(inputs_val)
    filters = shared(filters_val)
    bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))
    # Pick a reference implementation for the requested configuration.
    if filter_dilation == (1, 1, 1) and border_mode in ('valid', (0, 0, 0)):
        # conv3D supports subsampling but neither padding nor dilation.
        conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters,
                                             b=bias, d=subsample)
        f_ref = theano.function([], conv_ref)
        res_ref = f_ref()
    elif subsample == (1, 1, 1):
        if ndimage is None:
            raise SkipTest('This test needs SciPy.')
        # input = b012c
        # pyconv3d wants = b0c12 = (0, 1, 4, 2, 3)
        # pyconv3d outputs = b0c12, back to b012c = (0, 1, 3, 4, 2)
        # pyconv3d convolves (flips the filters), so pre-flip them
        # to obtain the correlation that GpuCorr3dMM computes.
        res_ref = pyconv3d(signals=inputs_val.transpose(0, 1, 4, 2, 3),
                           filters=filters_val.transpose(0, 1, 4, 2, 3)[:, ::-1, :, ::-1, ::-1],
                           dilation=filter_dilation,
                           border_mode=border_mode).transpose(0, 1, 3, 4, 2)
    else:
        raise SkipTest('No reference implementation that combines '
                       'border_mode and subsampling.')
    # GpuCorr3dMM wants bc012; dimshuffle there and back.
    conv = GpuCorr3dMM(border_mode=border_mode,
                       filter_dilation=filter_dilation,
                       subsample=subsample)(
        inputs.dimshuffle(0, 4, 1, 2, 3),
        filters.dimshuffle(0, 4, 1, 2, 3))
    conv = conv.dimshuffle(0, 2, 3, 4, 1)
    f = theano.function([], conv, mode=mode_with_gpu)
    res = f()
    utt.assert_allclose(res_ref, res)
    if verify_grad:
        utt.verify_grad(GpuCorr3dMM(border_mode=border_mode,
                                    filter_dilation=filter_dilation,
                                    subsample=subsample),
                        [inputs_val.transpose(0, 4, 1, 2, 3),
                         filters_val.transpose(0, 4, 1, 2, 3)])
def test_valid(self):
    """Smoke-test the default configuration (border_mode='valid')."""
    in_shape = (16, 20, 12, 16, 1)
    w_shape = (10, 6, 12, 4, 1)
    self.run_conv_valid(inputs_shape=in_shape, filters_shape=w_shape)
......@@ -68,6 +169,50 @@ class TestCorr3DMM(unittest.TestCase):
filters_shape=(10, 6, 12, 4, 1),
subsample=(1, 2, 3))
def test_border_mode(self):
    """Exercise every supported border_mode spelling: the three
    symbolic names plus explicit padding tuples (including zero
    padding, which must match 'valid')."""
    in_shape = (16, 20, 12, 15, 1)
    w_shape = (10, 6, 12, 4, 1)
    modes = ['valid', 'half', 'full',
             (0, 0, 0), (1, 2, 3), (3, 2, 1)]
    for mode in modes:
        self.run_conv_valid(inputs_shape=in_shape,
                            filters_shape=w_shape,
                            border_mode=mode)
def test_filter_dilation(self):
    """Dilate one axis at a time, under each symbolic border mode."""
    inputs_shape = [16, 20, 12, 15, 1]
    filters_shape = [10, 6, 5, 4, 1]
    for dilation in ((2, 1, 1), (1, 2, 1), (1, 1, 2)):
        for mode in ('valid', 'half', 'full'):
            self.run_conv_valid(inputs_shape=inputs_shape,
                                filters_shape=filters_shape,
                                filter_dilation=dilation,
                                border_mode=mode)
def test_verify_gradients(self):
    """Numerically check gradients on a deliberately small example
    (verify_grad is expensive) across dilations and border modes."""
    inputs_shape = [2, 7, 9, 6, 1]
    filters_shape = [1, 3, 3, 2, 1]
    for dilation in ((2, 1, 1), (1, 2, 1), (1, 1, 2)):
        for mode in ('valid', 'half', 'full', (2, 1, 3)):
            self.run_conv_valid(inputs_shape=inputs_shape,
                                filters_shape=filters_shape,
                                filter_dilation=dilation,
                                border_mode=mode,
                                verify_grad=True)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
subsample=(1, 1, 1)):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论