提交 2a857e3a authored 作者: carriepl 提交者: Frederic

Update GpuDnnConvGradI for CuDNN v4 (gpua backend)

上级 1e48b734
...@@ -667,16 +667,23 @@ class GpuDnnConvGradI(DnnBase): ...@@ -667,16 +667,23 @@ class GpuDnnConvGradI(DnnBase):
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
if algo is None: if algo is None:
algo = config.dnn.conv.algo_bwd algo = config.dnn.conv.algo_bwd_data
self.algo = algo self.algo = algo
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once', # The small-workspace implementation is only available from CuDNN V4
'time_on_shape_change'] # onward.
if version() < (4000, 4000) and self.algo == 'fft_tiling':
raise RuntimeError("CuDNN's tiled-FFT convolution requires CuDNN "
"v4 or more recent")
assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def __setstate__(self, d):
    """Restore pickled state, back-filling attributes added later.

    Instances pickled before the ``algo`` / ``inplace`` attributes
    existed lack them in ``d``; default them here so old pickles keep
    working after unpickling.
    """
    self.__dict__.update(d)
    if not hasattr(self, 'algo'):
        # Older pickles predate per-op algorithm selection; fall back to
        # the configured backward-data algorithm (this commit renames the
        # flag from config.dnn.conv.algo_bwd to algo_bwd_data).
        self.algo = config.dnn.conv.algo_bwd_data
    if not hasattr(self, 'inplace'):
        # Older pickles predate the inplace option; default to the safe,
        # non-destructive behaviour.
        self.inplace = False
...@@ -713,6 +720,9 @@ class GpuDnnConvGradI(DnnBase): ...@@ -713,6 +720,9 @@ class GpuDnnConvGradI(DnnBase):
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1' alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
if self.algo == 'fft': if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT' alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
if self.algo == 'fft_tiling':
# big workspace but less than fft
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
if self.algo in ['guess_once', 'guess_on_shape_change', if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']: 'time_once', 'time_on_shape_change']:
...@@ -743,7 +753,8 @@ class GpuDnnConvGradI(DnnBase): ...@@ -743,7 +753,8 @@ class GpuDnnConvGradI(DnnBase):
raise TypeError("The number of dimensions of " raise TypeError("The number of dimensions of "
"kern, topgrad and output must match") "kern, topgrad and output must match")
if kern.type.ndim == 5 and self.algo in ['fft', 'deterministic']: if (kern.type.ndim == 5 and
self.algo in ['fft', 'deterministic', 'fft_tiling']):
raise ValueError("convolution algo %s can't be used for " raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,)) "3d convolutions", (self.algo,))
......
...@@ -129,7 +129,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -129,7 +129,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
#endif #endif
#if CUDNN_VERSION > 3000 #if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) { // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if ((chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT ||
chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING) && PyGpuArray_NDIM(*input) == 4) {
int nd; int nd;
int pad[2]; int pad[2];
int stride[2]; int stride[2];
...@@ -145,10 +154,22 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -145,10 +154,22 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
} }
if (stride[0] != 1 || stride[1] != 1 || if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 || {
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { if (stride[0] != 1 || stride[1] != 1 ||
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
else
{
// chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
} }
} }
#endif #endif
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论