Update GpuDnnConv for CuDNN V4 (gpua backend)

64439f41 · carriepl · Frederic · 93f6f441 · 64439f41 · 64439f41
--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
@@ -370,7 +370,8 @@ class GpuDnnConv(DnnBase):
    kernel
    descr
        The convolution descriptor.
-    algo : {'small', 'none', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
+    algo : {'small', 'none', 'large', 'fft', 'fft_tiling', 'guess_once',
+            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
        Default is the value of :attr:`config.dnn.conv.algo_fwd`.
    """
@@ -399,9 +400,15 @@ class GpuDnnConv(DnnBase):
            elif self.algo in ['time_once', 'time_on_shape_change']:
                raise RuntimeError("CuDNN convolution timing requires CuDNN v3")
-        assert self.algo in ['none', 'small', 'large', 'fft', 'guess_once',
+        # The fft_tiling implementation is only available from CuDNN V4 onward
-                             'guess_on_shape_change', 'time_once',
+        if version() < 4000:
-                             'time_on_shape_change']
+            if self.algo == 'fft_tiling':
+                raise RuntimeError("CuDNN tiled-FFT convolution requires "
+                                   "CuDNN v4 or more recent")
+        assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling',
+                             'guess_once', 'guess_on_shape_change',
+                             'time_once', 'time_on_shape_change']
    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -425,8 +432,13 @@ class GpuDnnConv(DnnBase):
            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
        elif self.algo == 'large':
            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
+        elif self.algo == 'direct':
+            # NOTE(review): 'direct' is absent from the algo list asserted
+            # earlier in this class, so this branch looks unreachable -- confirm.
+            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'
        elif self.algo == 'fft':
            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
+        elif self.algo == 'fft_tiling':
+            # The tiled-FFT algorithm is only available from CuDNN v4 onward.
+            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
        defs.append(('CONV_ALGO', alg))
        if self.algo in ['guess_once', 'guess_on_shape_change',
@@ -456,9 +468,10 @@ class GpuDnnConv(DnnBase):
            raise TypeError("The number of dimensions of "
                            "img, kern and output must match")
-        if img.type.ndim == 5 and self.algo == 'fft':
+        if (img.type.ndim == 5 and
-            raise ValueError("convolution algo fft can't be used for "
+                self.algo in ['small', 'large', 'fft', 'fft_tiling']):
-                             "3d convolutions")
+            raise ValueError("convolution algo %s can't be used for "
+                             "3d convolutions" % (self.algo,))
        if (not isinstance(desc.type, CDataType) or
                desc.type.ctype != 'cudnnConvolutionDescriptor_t'):

--- a/theano/sandbox/gpuarray/dnn_fwd.c
+++ b/theano/sandbox/gpuarray/dnn_fwd.c
@@ -137,7 +137,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 #if CUDNN_VERSION > 3000
-  if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
+  // The FFT implementation does not support strides, 1x1 filters or inputs
+  // with a spatial dimension larger than 1024. The tiled-FFT implementation
+  // does not support strides.
+  // If the chosen implementation is FFT or tiled-FFT, validate that it can
+  // be used on the current data and default to a safe implementation if it
+  // can't.
+  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
+  // defined only for 2d filters
+  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
+       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
    int nd;
    int pad[2];
    int stride[2];
@@ -153,10 +162,22 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
      return 1;
    }
-    if (stride[0] != 1 || stride[1] != 1 ||
+    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
-        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
+    {
-        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
+      if (stride[0] != 1 || stride[1] != 1 ||
-      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
+          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
+      {
+        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+      }
+    }
+    else
+    {
+      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
+      if (stride[0] != 1 || stride[1] != 1)
+      {
+        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+      }
    }
  }
 #endif