Merge pull request #5936 from HapeMask/cudnnv6_dilation

Add support for cuDNN v6 dilated convolution.

Merge pull request #5936 from HapeMask/cudnnv6_dilation
4e094fc0 · Pascal Lamblin · GitHub · 7c07a3ce · 45bbb90c · 4e094fc0
--- a/theano/gpuarray/conv_desc.c
+++ b/theano/gpuarray/conv_desc.c
@@ -5,19 +5,19 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
  cudnnStatus_t err;
  int pad[3] = {PAD_0, PAD_1, PAD_2};
  int strides[3] = {SUB_0, SUB_1, SUB_2};
-  int upscale[3] = {1, 1, 1};
+  int dilation[3] = {DIL_0, DIL_1, DIL_2};

 #if BORDER_MODE == 0
-  pad[0] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1;
-  pad[1] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1;
+  pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0;
+  pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1;
 #if NB_DIMS > 2
-  pad[2] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1;
+  pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2;
 #endif
 #elif BORDER_MODE == 2
-  pad[0] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) / 2;
-  pad[1] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) / 2;
+  pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * DIL_0 + 1) / 2;
+  pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * DIL_1 + 1) / 2;
 #if NB_DIMS > 2
-  pad[2] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) / 2;
+  pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * DIL_2 + 1) / 2;
 #endif
 #endif

@@ -36,6 +36,11 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
  }

  err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides,
-                                        upscale, CONV_MODE, PRECISION);
+                                        dilation, CONV_MODE, PRECISION);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not set convolution "
+                 "descriptor: %s", cudnnGetErrorString(err));
+    return -1;
+  }
  return 0;
 }
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -131,11 +131,11 @@ def _dnn_check_version():
    if v < 5000:
        return False, "cuDNN version is too old. Update to v5, was %d." % v
    # 5200 should not print warning with cudnn 5.1 final.
-    if v >= 5200:
+    if v >= 6100:
        warnings.warn("Your cuDNN version is more recent than "
                      "Theano. If you encounter problems, try "
                      "updating Theano or downgrading cuDNN to "
-                      "version 5.1.")
+                      "version 6.0.")
    return True, None


@@ -363,7 +363,7 @@ class GpuDnnConvDesc(COp):

    """

-    __props__ = ('border_mode', 'subsample', 'conv_mode', 'precision')
+    __props__ = ('border_mode', 'subsample', 'dilation', 'conv_mode', 'precision')

    def c_headers(self):
        return ['cudnn.h', 'cudnn_helper.h']
@@ -380,10 +380,13 @@ class GpuDnnConvDesc(COp):
    def do_constant_folding(self, node):
        return False

-    def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv',
+    def __init__(self, border_mode, subsample=(1, 1), dilation=(1, 1), conv_mode='conv',
                 precision="float32"):
        COp.__init__(self, ["conv_desc.c"], "APPLY_SPECIFIC(conv_desc)")

+        if version() < 6000 and any([d != 1 for d in dilation]):
+            raise RuntimeError("Dilation > 1 not supported for cuDNN version < 6.")
+
        if isinstance(border_mode, integer_types):
            border_mode = (border_mode,) * len(subsample)
        if isinstance(border_mode, tuple):
@@ -401,6 +404,9 @@ class GpuDnnConvDesc(COp):
        assert conv_mode in ('conv', 'cross')
        self.conv_mode = conv_mode

+        assert len(dilation) == len(subsample)
+        self.dilation = dilation
+
        assert precision in ['float16', 'float32', 'float64']
        self.precision = precision

@@ -452,6 +458,13 @@ class GpuDnnConvDesc(COp):
        else:
            sub2 = '0'

+        dil0 = str(self.dilation[0])
+        dil1 = str(self.dilation[1])
+        if len(self.dilation) > 2:
+            dil2 = str(self.dilation[2])
+        else:
+            dil2 = '0'
+
        if self.precision == 'float16':
            precision = 'CUDNN_DATA_HALF'
        elif self.precision == 'float32':
@@ -463,6 +476,7 @@ class GpuDnnConvDesc(COp):
        return [('NB_DIMS', str(len(self.subsample))),
                ('BORDER_MODE', bmode),
                ('PAD_0', pad0), ('PAD_1', pad1), ('PAD_2', pad2),
+                ('DIL_0', dil0), ('DIL_1', dil1), ('DIL_2', dil2),
                ('CONV_MODE', conv_flag),
                ('SUB_0', sub0), ('SUB_1', sub1), ('SUB_2', sub2),
                ('PRECISION', precision)]
@@ -470,6 +484,11 @@ class GpuDnnConvDesc(COp):
    def c_code_cache_version(self):
        return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())

+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        if not hasattr(self, "dilation"):
+            self.dilation = (1,) * len(self.subsample)
+

 # scalar constants
 _zero = constant(np.asarray(0.0, dtype='float64'))
@@ -574,6 +593,7 @@ class GpuDnnConv(DnnBase):
        img = as_gpuarray_variable(img, ctx_name)
        kern = as_gpuarray_variable(kern, ctx_name)
        output = as_gpuarray_variable(output, ctx_name)
+
        if img.type.ndim not in (4, 5):
            raise TypeError('img must be 4D or 5D tensor')
        if kern.type.ndim not in (4, 5):
@@ -619,7 +639,7 @@ class GpuDnnConv(DnnBase):
        return [[1], [1], [1], [0], [1], [1]]

    @staticmethod
-    def get_out_shape(ishape, kshape, border_mode, subsample):
+    def get_out_shape(ishape, kshape, border_mode, subsample, dilation):
        """
        This function computes the output shape for a convolution with
        the specified parameters. `ishape` and `kshape` can be symbolic
@@ -638,7 +658,8 @@ class GpuDnnConv(DnnBase):
            ishape,
            kshape,
            border_mode,
-            subsample)
+            subsample,
+            dilation)

    def infer_shape(self, node, shape):
        return [shape[2]]
@@ -910,7 +931,7 @@ class GpuDnnConvGradI(DnnBase):
        return [shape[2]]


-def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
+def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
             conv_mode='conv', direction_hint=None, workmem=None,
             algo=None, precision=None):
    """
@@ -930,16 +951,20 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
        could be directly specified by an integer or a pair of integers.
    subsample
        Perform subsampling of the output (default: (1, 1)).
+    dilation
+        Filter dilation factor (default: (1, 1)). A dilation factor of d is
+        equivalent to a convolution with d - 1 zeros inserted between
+        neighboring filter values.
    conv_mode
        Perform convolution (kernels flipped) or cross-correlation.
        One of 'conv', 'cross' (default: 'conv').
    direction_hint
        Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv will be used to carry out the convolution.
-        If border_mode is 'valid', subsample is (1, 1) and direction_hint is
-        'bprop weights', it will use GpuDnnConvGradW.
-        If border_mode is 'full', subsample is (1, 1) and direction_hint is
-        *not* 'forward!', it will use GpuDnnConvGradI.
+        If border_mode is 'valid', subsample is (1, 1), dilation is (1, 1), and
+        direction_hint is 'bprop weights', it will use GpuDnnConvGradW.
+        If border_mode is 'full', subsample is (1, 1), dilation is (1, 1), and
+        direction_hint is *not* 'forward!', it will use GpuDnnConvGradI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been warned.
    algo : {'none', 'small', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
@@ -969,7 +994,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
        algo = workmem
    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    ctx_name = infer_context_name(img, kerns)
-    if (border_mode == 'valid' and subsample == (1, 1) and
+    if (border_mode == 'valid' and subsample == (1, 1) and dilation == (1, 1) and
            direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConvGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
@@ -985,12 +1010,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
                   shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1)
        out_shp = assert_conv_shape(out_shp)
        out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
-        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
+        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=(1, 1),
                              conv_mode='cross', precision=precision)(out.shape)
        conv = GpuDnnConvGradW()(img, kerns, out, desc)
        return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)

-    elif (border_mode == 'full' and subsample == (1, 1) and
+    elif (border_mode == 'full' and subsample == (1, 1) and dilation == (1, 1) and
          direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGradI to compute
        # the full convolution as the backward pass of a valid convolution.
@@ -1004,7 +1029,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
                   shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1)
        out_shp = assert_conv_shape(out_shp)
        out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
-        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
+        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=(1, 1),
                              conv_mode=conv_mode, precision=precision)(kerns.shape)
        return GpuDnnConvGradI()(kerns, img, out, desc)

@@ -1013,7 +1038,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
    # if the img contains negative strides
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
-    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
+    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
                          conv_mode=conv_mode, precision=precision)(kerns.shape)
    desc_op = desc.owner.op
    # We can use Shape_i and bypass the infer_shape here as this is on
@@ -1022,13 +1047,14 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
    kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
    out_shp = get_conv_output_shape(ishape, kshape,
                                    desc_op.border_mode,
-                                    desc_op.subsample)
+                                    desc_op.subsample,
+                                    filter_dilation=dilation)
    out_shp = assert_conv_shape(out_shp)
    out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
    return GpuDnnConv(algo=algo)(img, kerns, out, desc)


-def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
+def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
               conv_mode='conv', direction_hint=None,
               algo='none', precision=None):
    """
@@ -1047,17 +1073,23 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
        One of 'valid', 'full', 'half'; additionally, the padding size
        could be directly specified by an integer or a pair of integers.
    subsample
-        Perform subsampling of the output (default: (1, 1)).
+        Perform subsampling of the output (default: (1, 1, 1)).
+    dilation
+        Filter dilation factor (default: (1, 1, 1)). A dilation factor of d
+        is equivalent to a convolution with d - 1 zeros inserted between
+        neighboring filter values.
    conv_mode
        Perform convolution (kernels flipped) or cross-correlation.
        One of 'conv', 'cross' (default: 'conv').
    direction_hint
        Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv will be used to carry out the convolution.
-        If border_mode is 'valid', subsample is (1, 1) and direction_hint is
-        'bprop weights', it will use GpuDnnConvGradW.
-        If border_mode is 'full', subsample is (1, 1) and direction_hint is
-        *not* 'forward!', it will use GpuDnnConvGradI.
+        If border_mode is 'valid', subsample is (1, 1, 1), dilation is
+        (1, 1, 1), and direction_hint is 'bprop weights', it will use
+        GpuDnnConvGradW.
+        If border_mode is 'full', subsample is (1, 1, 1), dilation is
+        (1, 1, 1), and direction_hint is *not* 'forward!', it will use
+        GpuDnnConvGradI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been warned.
    algo : convolution implementation to use. Only 'none' is implemented
@@ -1080,7 +1112,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),

    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    ctx_name = infer_context_name(img, kerns)
-    if (border_mode == 'valid' and subsample == (1, 1, 1) and
+    if (border_mode == 'valid' and subsample == (1, 1, 1) and dilation == (1, 1, 1) and
            direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConvGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
@@ -1097,12 +1129,12 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
                   shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1)
        out_shp = assert_conv_shape(out_shp)
        out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
-        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
+        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
                              conv_mode='cross', precision=precision)(out.shape)
        conv = GpuDnnConvGradW()(img, kerns, out, desc)
        return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)

-    elif (border_mode == 'full' and subsample == (1, 1, 1) and
+    elif (border_mode == 'full' and subsample == (1, 1, 1) and dilation == (1, 1, 1) and
          direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGradI to compute
        # the full convolution as the backward pass of a valid convolution.
@@ -1117,7 +1149,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
                   shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1)
        out_shp = assert_conv_shape(out_shp)
        out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
-        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
+        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
                              conv_mode=conv_mode, precision=precision)(kerns.shape)
        return GpuDnnConvGradI()(kerns, img, out, desc)

@@ -1126,7 +1158,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
    # if the img contains negative strides
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
-    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
+    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
                          conv_mode=conv_mode, precision=precision)(kerns.shape)
    desc_op = desc.owner.op
    # We can use Shape_i and bypass the infer_shape here as this is on
@@ -1135,14 +1167,15 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
    kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
    out_shp = get_conv_output_shape(ishape, kshape,
                                    desc_op.border_mode,
-                                    desc_op.subsample)
+                                    desc_op.subsample,
+                                    filter_dilation=dilation)
    out_shp = assert_conv_shape(out_shp)
    out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
    return GpuDnnConv(algo=algo)(img, kerns, out, desc)


 def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
-                   subsample=(1, 1), conv_mode='conv', precision=None):
+                   subsample=(1, 1), dilation=(1, 1), conv_mode='conv', precision=None):
    """
    TODO: document this
    """
@@ -1154,23 +1187,23 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
    kerns_shp = as_tensor_variable(kerns_shp)
    precision = get_precision(precision, [img, topgrad])

-    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
+    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
                          conv_mode=conv_mode, precision=precision)(kerns_shp)
    out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*kerns_shp)
    return GpuDnnConvGradW()(img, topgrad, out, desc)


 def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
-                     subsample=(1, 1, 1), conv_mode='conv', precision=None):
+                     subsample=(1, 1, 1), dilation=(1, 1, 1), conv_mode='conv', precision=None):
    """
    3d version of dnn_gradweight
    """
    return dnn_gradweight(img, topgrad, kerns_shp, border_mode,
-                          subsample, conv_mode, precision)
+                          subsample, dilation, conv_mode, precision)


 def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
-                  subsample=(1, 1), conv_mode='conv', precision=None):
+                  subsample=(1, 1), dilation=(1, 1), conv_mode='conv', precision=None):
    """
    TODO: document this
    """
@@ -1182,19 +1215,19 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
    img_shp = as_tensor_variable(img_shp)
    precision = get_precision(precision, [kerns, topgrad])

-    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
+    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
                          conv_mode=conv_mode, precision=precision)(kerns.shape)
    out = GpuAllocEmpty(dtype=kerns.dtype, context_name=ctx_name)(*img_shp)
    return GpuDnnConvGradI()(kerns, topgrad, out, desc)


 def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid',
-                    subsample=(1, 1, 1), conv_mode='conv', precision=None):
+                    subsample=(1, 1, 1), dilation=(1, 1, 1), conv_mode='conv', precision=None):
    """
    3d version of `dnn_gradinput`.
    """
    return dnn_gradinput(kerns, topgrad, img_shp, border_mode, subsample,
-                         conv_mode, precision)
+                         dilation, conv_mode, precision)


 class GpuDnnPoolDesc(Op):
@@ -2711,7 +2744,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
                            AbstractConv2d_gradInputs))):
        return

-    if (op.filter_dilation != (1, 1)):
+    if version() < 6000 and op.filter_dilation != (1, 1):
        return None

    inp1 = inputs[0]
@@ -2729,6 +2762,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
        rval = dnn_conv(inp1, inp2,
                        border_mode=op.border_mode,
                        subsample=op.subsample,
+                        dilation=op.filter_dilation,
                        direction_hint='forward!',
                        conv_mode=conv_mode)
    elif isinstance(op, AbstractConv2d_gradWeights):
@@ -2737,6 +2771,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
        rval = dnn_gradweight(inp1, inp2, shape,
                              border_mode=op.border_mode,
                              subsample=op.subsample,
+                              dilation=op.filter_dilation,
                              conv_mode=conv_mode)
    elif isinstance(op, AbstractConv2d_gradInputs):
        shape = (inp2.shape[0], inp1.shape[1],
@@ -2744,6 +2779,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
        rval = dnn_gradinput(inp1, inp2, shape,
                             border_mode=op.border_mode,
                             subsample=op.subsample,
+                             dilation=op.filter_dilation,
                             conv_mode=conv_mode)
    return [rval]

@@ -2754,7 +2790,7 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
                            AbstractConv3d_gradInputs))):
        return

-    if (op.filter_dilation != (1, 1, 1)):
+    if version() < 6000 and op.filter_dilation != (1, 1, 1):
        return None

    inp1 = inputs[0]
@@ -2772,6 +2808,7 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
        rval = dnn_conv3d(inp1, inp2,
                          border_mode=op.border_mode,
                          subsample=op.subsample,
+                          dilation=op.filter_dilation,
                          direction_hint='forward!',
                          conv_mode=conv_mode)
    elif isinstance(op, AbstractConv3d_gradWeights):
@@ -2780,6 +2817,7 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
        rval = dnn_gradweight3d(inp1, inp2, shape,
                                border_mode=op.border_mode,
                                subsample=op.subsample,
+                                dilation=op.filter_dilation,
                                conv_mode=conv_mode)
    elif isinstance(op, AbstractConv3d_gradInputs):
        shape = (inp2.shape[0], inp1.shape[1],
@@ -2787,6 +2825,7 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
        rval = dnn_gradinput3d(inp1, inp2, shape,
                               border_mode=op.border_mode,
                               subsample=op.subsample,
+                               dilation=op.filter_dilation,
                               conv_mode=conv_mode)
    return [rval]


--- a/theano/gpuarray/dnn_fwd.c
+++ b/theano/gpuarray/dnn_fwd.c
@@ -188,11 +188,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    int nd;
    int pad[2];
    int stride[2];
-    int upscale[2];
+    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
-                                             upscale, &mode, &data_type);
+                                             dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",

--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -13,7 +13,7 @@ import theano.tensor as T
 import theano.tests.unittest_tools as utt
 from theano.tensor.signal.pool import pool_2d, pool_3d
 from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
-from theano.tensor.nnet.abstract_conv import get_conv_output_shape
+from theano.tensor.nnet.abstract_conv import get_conv_output_shape, get_conv_gradinputs_shape
 from theano.tensor.nnet import bn

 from .. import dnn
@@ -45,9 +45,9 @@ def test_dnn_conv_desc_merge():
        raise SkipTest(dnn.dnn_available.msg)
    kern_shp = T.as_tensor_variable(
        np.asarray([3, 1, 2, 2]).astype('int64'))
-    desc1 = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(2, 2),
+    desc1 = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(2, 2), dilation=(1, 1),
                               conv_mode='conv')(kern_shp)
-    desc2 = dnn.GpuDnnConvDesc(border_mode='full', subsample=(1, 1),
+    desc2 = dnn.GpuDnnConvDesc(border_mode='full', subsample=(1, 1), dilation=(1, 1),
                               conv_mode='cross')(kern_shp)
    # CDataType is not DeepCopyable so this will crash if we don't use
    # borrow=True
@@ -602,32 +602,35 @@ class TestDnnInferShapes(utt.InferShapeTester):
            dnn.GpuDnnSoftmaxGrad
        )

-    def _test_conv(self, img, kerns, out, img_val, kern_vals, border_mode, conv_mode, subsamples, algo):
+    def _test_conv(self, img, kerns, out, img_val, kern_vals, border_mode, conv_mode, subsamples, dilations, algo):
        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)

        img_val = np.asarray(img_val, dtype=theano.config.floatX)
        kern_vals = np.asarray(kern_vals, dtype=theano.config.floatX)

-        for subsample in subsamples:
-            out_vals = np.zeros(
-                dnn.GpuDnnConv.get_out_shape(img_val.shape, kern_vals.shape,
-                                             border_mode=border_mode,
-                                             subsample=subsample),
-                dtype=theano.config.floatX)
-            desc = dnn.GpuDnnConvDesc(
-                border_mode=border_mode,
-                subsample=subsample,
-                conv_mode=conv_mode,
-                precision=set_precision(theano.config.floatX)
-            )(kerns.shape)
-            conv = dnn.GpuDnnConv(algo=algo)(img, kerns, out, desc)
-            self._compile_and_check(
-                [img, kerns, out],
-                [conv],
-                [img_val, kern_vals, out_vals],
-                dnn.GpuDnnConv
-            )
+        for dilation in dilations:
+            for subsample in subsamples:
+                out_vals = np.zeros(
+                    dnn.GpuDnnConv.get_out_shape(img_val.shape, kern_vals.shape,
+                                                 border_mode=border_mode,
+                                                 subsample=subsample,
+                                                 dilation=dilation),
+                    dtype=theano.config.floatX)
+                desc = dnn.GpuDnnConvDesc(
+                    border_mode=border_mode,
+                    subsample=subsample,
+                    dilation=dilation,
+                    conv_mode=conv_mode,
+                    precision=set_precision(theano.config.floatX)
+                )(kerns.shape)
+                conv = dnn.GpuDnnConv(algo=algo)(img, kerns, out, desc)
+                self._compile_and_check(
+                    [img, kerns, out],
+                    [conv],
+                    [img_val, kern_vals, out_vals],
+                    dnn.GpuDnnConv
+                )

    @parameterized.expand(chain(product([SUPPORTED_DNN_CONV_ALGO_FWD[0]],
                                        border_modes,
@@ -637,67 +640,83 @@ class TestDnnInferShapes(utt.InferShapeTester):
                                        [conv_modes[0]])),
                          testcase_func_name=utt.custom_name_func)
    def test_conv(self, algo, border_mode, conv_mode):
+        # Currently only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM (algo 'none')
+        # supports dilation > 1. 'time*' and 'guess*' should fall back to it.
+        dilations = [(1, 1)]
+        if dnn.version() >= 6000 and (algo == "none" or "time_" in algo or "guess_" in algo):
+            dilations += [(2, 2)]
+
        self._test_conv(T.tensor4('img'),
                        T.tensor4('kerns'),
                        T.tensor4('out'),
-                        np.random.rand(7, 2, 8, 4),
+                        np.random.rand(7, 2, 12, 16),
                        np.random.rand(8, 2, 4, 3),
                        border_mode,
                        conv_mode,
                        [(1, 1), (2, 2)],
+                        dilations,
                        algo)

    @parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
    def test_conv3d_none(self, border_mode, conv_mode):
+        dilations = [(1, 1, 1), (2, 2, 2)] if dnn.version() >= 6000 else [(1, 1, 1)]
+
        self._test_conv(T.tensor5('img'),
                        T.tensor5('kerns'),
                        T.tensor5('out'),
-                        np.random.rand(10, 2, 6, 4, 11),
+                        np.random.rand(10, 2, 15, 16, 17),
                        np.random.rand(8, 2, 4, 3, 1),
                        border_mode,
                        conv_mode,
                        [(1, 1, 1), (2, 2, 2)],
+                        dilations,
                        'none')

-    def _test_conv_gradw(self, img, topgrad, kerns, img_shape, kerns_shape, border_mode, conv_mode, subsample):
+    def _test_conv_gradw(self, img, topgrad, kerns, img_shape, kerns_shape, border_mode, conv_mode, subsamples, dilations):
        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)

-        topgrad_shape = get_conv_output_shape(img_shape, kerns_shape,
-                                              border_mode, subsample)
+        kerns_vals = np.zeros(kerns_shape, dtype=theano.config.floatX)
+        kerns_shape_shared = theano.shared(np.asarray(kerns_shape))

-        img_val = np.asarray(
-            np.random.rand(*img_shape),
-            dtype=theano.config.floatX
-        )
-        topgrad_vals = np.asarray(
-            np.random.rand(*topgrad_shape),
-            dtype=theano.config.floatX
-        )
+        for dilation in dilations:
+            for subsample in subsamples:
+                topgrad_shape = get_conv_output_shape(img_shape, kerns_shape,
+                                                      border_mode, subsample, dilation)

-        kerns_vals = np.zeros(kerns_shape, dtype=theano.config.floatX)
-        kerns_shape = theano.shared(np.asarray(kerns_shape))
-        desc = dnn.GpuDnnConvDesc(
-            border_mode=border_mode,
-            subsample=subsample,
-            conv_mode=conv_mode,
-            precision=set_precision(theano.config.floatX)
-        )(kerns_shape)
-        conv_grad_w = dnn.GpuDnnConvGradW()(
-            img,
-            topgrad,
-            kerns,
-            desc,
-        )
-        self._compile_and_check(
-            [img, topgrad, kerns],
-            [conv_grad_w],
-            [img_val, topgrad_vals, kerns_vals],
-            dnn.GpuDnnConvGradW
-        )
+                img_val = np.asarray(
+                    np.random.rand(*img_shape),
+                    dtype=theano.config.floatX
+                )
+                topgrad_vals = np.asarray(
+                    np.random.rand(*topgrad_shape),
+                    dtype=theano.config.floatX
+                )
+
+                desc = dnn.GpuDnnConvDesc(
+                    border_mode=border_mode,
+                    subsample=subsample,
+                    dilation=dilation,
+                    conv_mode=conv_mode,
+                    precision=set_precision(theano.config.floatX)
+                )(kerns_shape_shared)
+                conv_grad_w = dnn.GpuDnnConvGradW()(
+                    img,
+                    topgrad,
+                    kerns,
+                    desc,
+                )
+                self._compile_and_check(
+                    [img, topgrad, kerns],
+                    [conv_grad_w],
+                    [img_val, topgrad_vals, kerns_vals],
+                    dnn.GpuDnnConvGradW
+                )

    @parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
    def test_conv_gradw(self, border_mode, conv_mode):
+        dilations = [(1, 1), (2, 2)] if dnn.version() >= 6000 else [(1, 1)]
+
        self._test_conv_gradw(T.tensor4('img'),
                              T.tensor4('topgrad'),
                              T.tensor4('kerns'),
@@ -705,7 +724,8 @@ class TestDnnInferShapes(utt.InferShapeTester):
                              (1, 2, 3, 7),
                              border_mode,
                              conv_mode,
-                              (1, 1))
+                              [(1, 1)],
+                              dilations)

    def test_conv_gradi(self):
        if not dnn.dnn_available(test_ctx_name):
@@ -714,29 +734,28 @@ class TestDnnInferShapes(utt.InferShapeTester):
        kerns = T.tensor4('kerns')
        out = T.tensor4('out')
        kern_vals = np.asarray(
-            np.random.rand(13, 14, 15, 16),
+            np.random.rand(13, 4, 5, 6),
            dtype=theano.config.floatX
        )
        out_vals = np.asarray(
-            np.random.rand(3, 13, 5, 6),
+            np.random.rand(3, 13, 9, 11),
            dtype=theano.config.floatX
        )

-        for params in product(
-            ['valid'],  # Should this work for 'full'?
+        dilations = [(1, 1), (2, 2)] if dnn.version() >= 6000 else [(1, 1)]
+        for border_mode, subsample, dilation, conv_mode in product(
+            ['valid', 'full'],
            [(1, 1)],
+            dilations,
            ['conv', 'cross']
        ):
-            shape = (
-                out_vals.shape[0], kern_vals.shape[1],
-                out_vals.shape[2] + kern_vals.shape[2] - 1,
-                out_vals.shape[3] + kern_vals.shape[3] - 1
-            )
+            shape = get_conv_gradinputs_shape(kern_vals.shape, out_vals.shape, border_mode, subsample, dilation)
            img_vals = np.zeros(shape, dtype=theano.config.floatX)
            desc = dnn.GpuDnnConvDesc(
-                border_mode=params[0],
-                subsample=params[1],
-                conv_mode=params[2],
+                border_mode=border_mode,
+                subsample=subsample,
+                dilation=dilation,
+                conv_mode=conv_mode,
                precision=set_precision(theano.config.floatX)
            )(kerns.shape)
            conv_grad_i = dnn.GpuDnnConvGradI()(
@@ -982,18 +1001,18 @@ def test_dnn_conv_grad():
                                iw - kw + 1)).astype(theano.config.floatX)

    def dconv(img, kern, out):
-        desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
+        desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=(1, 1),
                                  conv_mode='conv', precision=set_precision(theano.config.floatX))(kern.shape)
        return dnn.GpuDnnConv()(img, kern, out, desc, alpha=0.5, beta=0.75)

    def dconvi(img, kern, out):
-        desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
+        desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=(1, 1),
                                  conv_mode='conv', precision=set_precision(theano.config.floatX))(kern.shape)
        return dnn.GpuDnnConvGradI()(kern, out, img, desc, alpha=-1.0,
                                     beta=0.0)

    def dconvw(img, kern, out):
-        desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
+        desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=(1, 1),
                                  conv_mode='conv', precision=set_precision(theano.config.floatX))(kern.shape)
        return dnn.GpuDnnConvGradW()(img, out, kern, desc, alpha=0.75,
                                     beta=-1.0)
@@ -1005,29 +1024,37 @@ def test_dnn_conv_grad():

 def get_conv3d_test_cases():
    # Every element of test_shapes follows the format
-    # [input_shape, filter_shape, subsample]
-    test_shapes = [[(128, 3, 5, 5, 5), (64, 3, 1, 2, 4), (1, 1, 1)],
-                   [(8, 4, 20, 12, 15), (5, 4, 6, 12, 4), (2, 2, 2)],
-                   [(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 3, 3)],
-                   [(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 2, 1)],
+    # [input_shape, filter_shape, subsample, dilation]
+    test_shapes = [[(128, 3, 5, 5, 5), (64, 3, 1, 2, 4), (1, 1, 1), (1, 1, 1)],
+                   [(8, 4, 20, 12, 15), (5, 4, 6, 12, 4), (2, 2, 2), (1, 1, 1)],
+                   [(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 3, 3), (1, 1, 1)],
+                   [(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 2, 1), (1, 1, 1)],
                   # Test with 1x1x1 filters
-                   [(8, 1, 10, 10, 10), (10, 1, 1, 1, 1), (1, 1, 1)],
+                   [(8, 1, 10, 10, 10), (10, 1, 1, 1, 1), (1, 1, 1), (1, 1, 1)],
                   # Test with dimensions larger than 1024 (thread block dim)
-                   [(1025, 1, 2, 3, 4), (5, 1, 1, 2, 3), (1, 1, 1)],
-                   [(8, 1, 2, 3, 4), (1025, 1, 1, 2, 3), (1, 1, 1)],
-                   [(8, 1025, 2, 3, 4), (5, 1025, 1, 1, 2), (1, 1, 1)],
-                   [(8, 1, 1030, 3, 4), (5, 1, 1025, 1, 1), (1, 1, 1)],
-                   [(8, 1, 2, 1030, 4), (5, 1, 2, 1025, 1), (1, 1, 1)],
-                   [(8, 1, 2, 3, 1030), (5, 1, 1, 2, 1025), (1, 1, 1)],
+                   [(1025, 1, 2, 3, 4), (5, 1, 1, 2, 3), (1, 1, 1), (1, 1, 1)],
+                   [(8, 1, 2, 3, 4), (1025, 1, 1, 2, 3), (1, 1, 1), (1, 1, 1)],
+                   [(8, 1025, 2, 3, 4), (5, 1025, 1, 1, 2), (1, 1, 1), (1, 1, 1)],
+                   [(8, 1, 1030, 3, 4), (5, 1, 1025, 1, 1), (1, 1, 1), (1, 1, 1)],
+                   [(8, 1, 2, 1030, 4), (5, 1, 2, 1025, 1), (1, 1, 1), (1, 1, 1)],
+                   [(8, 1, 2, 3, 1030), (5, 1, 1, 2, 1025), (1, 1, 1), (1, 1, 1)],
                   # The equivalent of this caused a crash with conv2d
-                   [(1, 1, 1, 44800, 1), (6, 1, 1, 1, 1), (1, 1, 1)]]
+                   [(1, 1, 1, 44800, 1), (6, 1, 1, 1, 1), (1, 1, 1), (1, 1, 1)]]

    # With border mode 'full', test with kernel bigger than image in some/all
    # dimensions
-    test_shapes_full = [[(6, 2, 2, 2, 2), (4, 2, 3, 1, 1), (1, 1, 1)],
-                        [(6, 2, 2, 2, 2), (4, 2, 1, 3, 1), (1, 1, 1)],
-                        [(6, 2, 2, 2, 2), (4, 2, 1, 1, 3), (1, 1, 1)],
-                        [(6, 2, 2, 2, 2), (4, 2, 5, 5, 5), (1, 1, 1)]]
+    test_shapes_full = [[(6, 2, 2, 2, 2), (4, 2, 3, 1, 1), (1, 1, 1), (1, 1, 1)],
+                        [(6, 2, 2, 2, 2), (4, 2, 1, 3, 1), (1, 1, 1), (1, 1, 1)],
+                        [(6, 2, 2, 2, 2), (4, 2, 1, 1, 3), (1, 1, 1), (1, 1, 1)],
+                        [(6, 2, 2, 2, 2), (4, 2, 5, 5, 5), (1, 1, 1), (1, 1, 1)]]
+
+    if dnn.version() >= 6000:
+        test_shapes.extend([
+            [(8, 1, 20, 12, 15), (5, 1, 6, 3, 4), (1, 1, 2), (3, 2, 1)],
+            [(8, 1, 20, 12, 15), (5, 1, 6, 3, 4), (2, 2, 1), (1, 2, 3)]])
+        test_shapes_full.append(
+            [(6, 2, 2, 2, 2), (4, 2, 5, 5, 5), (1, 1, 1), (3, 2, 1)])
+
    border_modes = ['valid', 'full', 'half', (1, 2, 3), (3, 2, 1), 1, 2]
    conv_modes = ['conv', 'cross']

@@ -1044,7 +1071,7 @@ def test_conv3d_fwd():
    utt.seed_rng()

    def run_conv3d_fwd(inputs_shape, filters_shape, subsample,
-                       border_mode, conv_mode):
+                       dilation, border_mode, conv_mode):

        inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
        filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
@@ -1060,6 +1087,7 @@ def test_conv3d_fwd():
        # Compile a theano function for the cuDNN implementation
        conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
                              border_mode=border_mode, subsample=subsample,
+                              dilation=dilation,
                              conv_mode=conv_mode)
        f = theano.function([], conv, mode=mode_with_gpu)

@@ -1072,7 +1100,8 @@ def test_conv3d_fwd():

        # Compile a theano function for the reference implementation
        conv_ref = theano.tensor.nnet.corr3d.Corr3dMM(border_mode=border_mode,
-                                                      subsample=subsample
+                                                      subsample=subsample,
+                                                      filter_dilation=dilation,
                                                      )(ref_cast(inputs), flipped_filters)
        f_ref = theano.function([], conv_ref, mode="FAST_RUN")

@@ -1087,8 +1116,8 @@ def test_conv3d_fwd():
        utt.assert_allclose(res_ref, res, rtol=rtol)

    test_cases = get_conv3d_test_cases()
-    for (i_shape, f_shape, subsample), border_mode, conv_mode in test_cases:
-        yield (run_conv3d_fwd, i_shape, f_shape, subsample, border_mode,
+    for (i_shape, f_shape, subsample, dilation), border_mode, conv_mode in test_cases:
+        yield (run_conv3d_fwd, i_shape, f_shape, subsample, dilation, border_mode,
               conv_mode)


@@ -1099,7 +1128,7 @@ def test_conv3d_bwd():
    utt.seed_rng()

    def run_conv3d_bwd(inputs_shape, filters_shape, subsample,
-                       border_mode, conv_mode):
+                       dilation, border_mode, conv_mode):

        inputs_val = np.random.random(inputs_shape).astype(theano.config.floatX)
        filters_val = np.random.random(filters_shape).astype(theano.config.floatX)
@@ -1109,7 +1138,9 @@ def test_conv3d_bwd():

        # Compile a theano function for the cuDNN implementation
        conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
-                              border_mode=border_mode, subsample=subsample,
+                              border_mode=border_mode,
+                              subsample=subsample,
+                              dilation=dilation,
                              conv_mode=conv_mode)

        grad_i, grad_w = theano.tensor.grad(conv.sum(), [inputs, filters])
@@ -1125,7 +1156,8 @@ def test_conv3d_bwd():

        # Compile a theano function for the reference implementation
        conv_ref = theano.tensor.nnet.corr3d.Corr3dMM(border_mode=border_mode,
-                                                      subsample=subsample
+                                                      subsample=subsample,
+                                                      filter_dilation=dilation,
                                                      )(ref_cast(inputs), flipped_filters)
        (grad_i_ref,
         grad_w_ref) = theano.tensor.grad(conv_ref.sum(),
@@ -1145,8 +1177,8 @@ def test_conv3d_bwd():
        utt.assert_allclose(res_ref[1], res[1], rtol=rtol)

    test_cases = get_conv3d_test_cases()
-    for (i_shape, f_shape, subsample), border_mode, conv_mode in test_cases:
-        yield (run_conv3d_bwd, i_shape, f_shape, subsample, border_mode,
+    for (i_shape, f_shape, subsample, dilation), border_mode, conv_mode in test_cases:
+        yield (run_conv3d_bwd, i_shape, f_shape, subsample, dilation, border_mode,
               conv_mode)