testgroup / pytensor · Commits

Commit 4ad36ddc
Merge pull request #3788 from nouiz/carriepl-v4

Rebased cudnn v4

Authored Dec 17, 2015 by Frédéric Bastien
Parents: 22db3930, 32e113c1

Showing 13 changed files with 225 additions and 225 deletions.
Changed files:

  theano/configdefaults.py                +68   -7
  theano/sandbox/cuda/cudnn_helper.h       +9   -0
  theano/sandbox/cuda/dnn.py               +0   -0
  theano/sandbox/cuda/dnn_conv_base.c      +0   -5
  theano/sandbox/cuda/dnn_fwd.c           +29  -40
  theano/sandbox/cuda/dnn_gi.c            +30  -23
  theano/sandbox/cuda/dnn_gw.c             +8  -17
  theano/sandbox/gpuarray/conv_desc.c      +2   -2
  theano/sandbox/gpuarray/cudnn_helper.h   +5  -92
  theano/sandbox/gpuarray/dnn.py           +0   -0
  theano/sandbox/gpuarray/dnn_fwd.c       +30  -25
  theano/sandbox/gpuarray/dnn_gi.c        +31   -9
  theano/sandbox/gpuarray/dnn_gw.c        +13   -5
theano/configdefaults.py

@@ -219,30 +219,91 @@ AddConfigVar('gpuarray.sync',
              BoolParam(False),
              in_c_key=True)
 
+
+def safe_no_dnn_workmem(workmem):
+    """
+    Make sure the user is not attempting to use dnn.conv.workmem`.
+    """
+    if workmem:
+        raise RuntimeError(
+            'The option `dnn.conv.workmem` has been removed and should '
+            'not be used anymore. Please use the option '
+            '`dnn.conv.algo_fwd` instead.')
+    return True
+
 AddConfigVar('dnn.conv.workmem',
              "This flag is deprecated; use dnn.conv.algo_fwd.",
-             EnumStr(''),
+             ConfigParam('', allow_override=False, filter=safe_no_dnn_workmem),
              in_c_key=False)
 
+
+def safe_no_dnn_workmem_bwd(workmem):
+    """
+    Make sure the user is not attempting to use dnn.conv.workmem_bwd`.
+    """
+    if workmem:
+        raise RuntimeError(
+            'The option `dnn.conv.workmem_bwd` has been removed and '
+            'should not be used anymore. Please use the options '
+            '`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
+    return True
+
 AddConfigVar('dnn.conv.workmem_bwd',
              "This flag is deprecated; use dnn.conv.algo_bwd.",
-             EnumStr(''),
+             ConfigParam('', allow_override=False,
+                         filter=safe_no_dnn_workmem_bwd),
              in_c_key=False)
 
+
+def safe_no_dnn_algo_bwd(algo):
+    """
+    Make sure the user is not attempting to use dnn.conv.algo_bwd`.
+    """
+    if algo:
+        raise RuntimeError(
+            'The option `dnn.conv.algo_bwd` has been removed and '
+            'should not be used anymore. Please use the options '
+            '`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
+    return True
+
+AddConfigVar('dnn.conv.algo_bwd',
+             "This flag is deprecated; use dnn.conv.algo_bwd_data and "
+             "dnn.conv.algo_bwd_filter.",
+             ConfigParam('', allow_override=False, filter=safe_no_dnn_algo_bwd),
+             in_c_key=False)
+
 AddConfigVar('dnn.conv.algo_fwd',
              "Default implementation to use for CuDNN forward convolution.",
-             EnumStr('small', 'none', 'large', 'fft', 'guess_once',
-                     'guess_on_shape_change', 'time_once',
+             EnumStr('small', 'none', 'large', 'fft', 'fft_tiling',
+                     'guess_once', 'guess_on_shape_change', 'time_once',
                      'time_on_shape_change'),
              in_c_key=False)
 
-AddConfigVar('dnn.conv.algo_bwd',
-             "Default implementation to use for CuDNN backward convolution.",
-             EnumStr('none', 'deterministic', 'fft', 'guess_once',
+AddConfigVar('dnn.conv.algo_bwd_data',
+             "Default implementation to use for CuDNN backward convolution to "
+             "get the gradients of the convolution with regard to the inputs.",
+             EnumStr('none', 'deterministic', 'fft', 'fft_tiling',
+                     'guess_once', 'guess_on_shape_change', 'time_once',
+                     'time_on_shape_change'),
+             in_c_key=False)
+
+AddConfigVar('dnn.conv.algo_bwd_filter',
+             "Default implementation to use for CuDNN backward convolution to "
+             "get the gradients of the convolution with regard to the "
+             "filters.",
+             EnumStr('none', 'deterministic', 'fft', 'small', 'guess_once',
                      'guess_on_shape_change', 'time_once',
                      'time_on_shape_change'),
              in_c_key=False)
 
+AddConfigVar('dnn.conv.precision',
+             "Default data precision to use for the computation in CuDNN "
+             "convolutions (defaults to the same dtype as the inputs of the "
+             "convolutions).",
+             EnumStr('as_input', 'float16', 'float32', 'float64'),
+             in_c_key=False)
+
 
 def default_dnn_path(suffix):
     def f(suffix=suffix):
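Review note: the new options are ordinary Theano configuration flags, so the usual mechanisms apply. Below is a minimal sketch of how a user could select one of the new algorithm values; the flag names and allowed values come from the AddConfigVar calls above, while the script name and the specific values picked here are only illustrative.

    # Pick the cuDNN algorithms from the environment (values must be one of
    # the EnumStr choices declared above; 'fft_tiling' is new in this commit):
    #   THEANO_FLAGS='dnn.conv.algo_fwd=fft_tiling,dnn.conv.precision=float32' python train.py

    import theano

    # With EnumStr the first listed value is the default, so out of the box:
    print(theano.config.dnn.conv.algo_fwd)       # 'small'
    print(theano.config.dnn.conv.algo_bwd_data)  # 'none'
    print(theano.config.dnn.conv.precision)      # 'as_input'

    # The removed flags are kept only so that setting them fails loudly:
    # THEANO_FLAGS='dnn.conv.workmem=small' now raises the RuntimeError from
    # safe_no_dnn_workmem() instead of being silently ignored.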
theano/sandbox/cuda/cudnn_helper.h

@@ -3,6 +3,15 @@
 #include <cudnn.h>
 
+// If needed, define element of the V4 interface in terms of elements of
+// previous versions
+#if defined(CUDNN_VERSION) && CUDNN_VERSION < 4000
+#define CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING 5
+#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING 3
+#endif
+
 #ifndef CUDNN_VERSION
 #include <assert.h>
theano/sandbox/cuda/dnn.py

(Diff collapsed.)
theano/sandbox/cuda/dnn_conv_base.c

@@ -15,11 +15,8 @@ int APPLY_SPECIFIC(previous_kerns_shape)[5];
 int APPLY_SPECIFIC(previous_output_shape)[5];
 bool APPLY_SPECIFIC(previous_algo_set);
 cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
-#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
 cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
 cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
-#endif
 
 #section init_code_struct

@@ -55,10 +52,8 @@ APPLY_SPECIFIC(previous_algo_set) = false;
 // Select default implementations for the case where the convolution
 // implementations should be selected based on the size of the data.
 APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
 APPLY_SPECIFIC(previous_bwd_f_algo) = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
 APPLY_SPECIFIC(previous_bwd_d_algo) = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-#endif
 
 #section cleanup_code_struct
theano/sandbox/cuda/dnn_fwd.c

@@ -81,7 +81,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
   // CuDNN time every implementation and choose the best one.
   if (CHOOSE_ALGO_TIME)
   {
-#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
     // Time the different implementations to choose the best one
     int requestedCount = 1;
     int count;

@@ -102,7 +101,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
     }
     chosen_algo = choosen_algo_perf.algo;
-#endif
   }
   else
   {

@@ -161,24 +159,28 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
     chosen_algo = CONV_ALGO;
   }
 
-#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
   // The FFT implementation (only in V3 and onward) does not support strides,
   // 1x1 filters or inputs with a spatial dimension larger than 1024.
-  // If the chosen implementation is FFT, validate that it can be used
-  // on the current data and default on a safe implementation if it
+  // The tiled-FFT implementation (only in V4 onward) does not support
+  // strides.
+  // If the chosen implementation is FFT or tiled-FFT, validate that it can
+  // be used on the current data and default on a safe implementation if it
   // can't.
-  // Following code is 2d-specific, but it is fine as ftt is defined only for
-  // 2d-filters
-  if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT && nb_dim == 4)
+  // Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
+  // defined only for 2d-filters
+  if ((chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
+       chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && nb_dim == 4)
   {
     // Extract the properties of the convolution descriptor
-    int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
+    int nd;
+    int pad[2];
+    int stride[2];
+    int upscale[2];
     cudnnConvolutionMode_t mode;
-    err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
-                                          &stride_v, &stride_h,
-                                          &upscale_x, &upscale_y,
-                                          &mode);
+    cudnnDataType_t data_type;
+    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+                                             upscale, &mode, &data_type);
     if (err != CUDNN_STATUS_SUCCESS) {
       PyErr_Format(PyExc_RuntimeError,

@@ -197,36 +199,23 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
     // Ensure that the selected implementation supports the requested
     // convolution. Fall back to a safe implementation otherwise.
-    if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
-        input_w > 1024 || (filter_h == 1 && filter_w == 1))
+    if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
     {
-      chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+      if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
+          input_w > 1024 || (filter_h == 1 && filter_w == 1))
+      {
+        chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+      }
+    }
+    else
+    {
+      // chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
+      if (stride[0] != 1 || stride[1] != 1)
+      {
+        chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+      }
     }
   }
-#endif
-
-#if defined(CUDNN_VERSION) && CUDNN_VERSION < 3000
-  // In versions before V3, CuDNN did not support kernels larger than the
-  // inputs in any spatial dimension, even if padding was used such that the
-  // padded inputs were larger than the kernels. If the kernels are larger
-  // then the inputs, raise an error message.
-  bool shape_mismatch = false;
-  for (int i = 2; i < nb_dim; i++){
-    shape_mismatch = shape_mismatch ||
-      (CudaNdarray_HOST_DIMS(kerns)[i] > CudaNdarray_HOST_DIMS(input)[i]);
-  }
-  if (shape_mismatch){
-    PyErr_Format(PyExc_RuntimeError,
-                 "GpuDnnConv: the current version of CuDNN does not support "
-                 "kernels larger than the inputs in any spatial dimension, "
-                 "even if the inputs are padded such that the padded inputs "
-                 "are larger than the kernels. Update your installation of "
-                 "CuDNN to V3 or more recent to solve the issue.");
-    return 1;
-  }
-#endif
 
   err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                 APPLY_SPECIFIC(input),
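The same FFT / tiled-FFT eligibility rule recurs, with minor variations, in the gradient kernels below and in the gpuarray back-end. As a reading aid, here is a hedged Python restatement of the check above; the function and argument names are illustrative and do not exist in the code base.

    def usable_fft_algo(algo, stride, input_shape, filter_shape, safe_algo):
        """Sketch of the fallback logic in dnn_fwd.c for a 2d convolution.

        algo is 'fft' or 'fft_tiling' (mirroring CUDNN_CONVOLUTION_FWD_ALGO_FFT
        and _FFT_TILING); shapes are (batch, channels, height, width) tuples.
        """
        strided = stride[0] != 1 or stride[1] != 1
        if algo == 'fft':
            # Plain FFT: no strides, no 1x1 filters, spatial dims at most 1024.
            too_large = input_shape[2] > 1024 or input_shape[3] > 1024
            one_by_one = filter_shape[2] == 1 and filter_shape[3] == 1
            if strided or too_large or one_by_one:
                return safe_algo
        else:
            # Tiled FFT (new in cuDNN v4): only the stride restriction applies.
            if strided:
                return safe_algo
        return algo

    # A strided convolution falls back to the safe implicit-GEMM algorithm:
    print(usable_fft_algo('fft_tiling', (2, 2), (64, 3, 224, 224),
                          (32, 3, 5, 5), 'implicit_gemm'))  # -> 'implicit_gemm'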
theano/sandbox/cuda/dnn_gi.c

@@ -33,7 +33,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
   if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
     return 1;
 
-#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
   {
     size_t worksize;
     void *workspace;

@@ -159,21 +158,28 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
     chosen_algo = CONV_ALGO;
   }
 
-  // The FFT implementation (only in v3 and onward) does not support strides,
+  // The FFT implementation (only in V3 and onward) does not support strides,
   // 1x1 filters or inputs with a spatial dimension larger than 1024.
-  // If the chosen implementation is FFT, validate that it can be used
-  // on the current data and default on a safe implementation if it
+  // The tiled-FFT implementation (only in V4 onward) does not support
+  // strides.
+  // If the chosen implementation is FFT or tiled-FFT, validate that it can
+  // be used on the current data and default on a safe implementation if it
   // can't.
-  if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT && nb_dim == 4)
+  // Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
+  // defined only for 2d-filters
+  if ((chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
+       chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && nb_dim == 4)
   {
     // Extract the properties of the convolution descriptor
-    int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
+    int nd;
+    int pad[2];
+    int stride[2];
+    int upscale[2];
     cudnnConvolutionMode_t mode;
-    err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
-                                          &stride_v, &stride_h,
-                                          &upscale_x, &upscale_y,
-                                          &mode);
+    cudnnDataType_t data_type;
+    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+                                             upscale, &mode, &data_type);
     if (err != CUDNN_STATUS_SUCCESS) {
       PyErr_Format(PyExc_RuntimeError,

@@ -192,10 +198,21 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
     // Ensure that the selected implementation supports the requested
     // convolution. Fall back to a safe implementation otherwise.
-    if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
-        input_w > 1024 || (filter_h == 1 && filter_w == 1))
+    if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
     {
-      chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+      if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
+          input_w > 1024 || (filter_h == 1 && filter_w == 1))
+      {
+        chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+      }
+    }
+    else
+    {
+      // chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
+      if (stride[0] != 1 || stride[1] != 1)
+      {
+        chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+      }
     }
   }

@@ -231,16 +248,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
                                      (void *)&beta,
                                      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
   }
-#else
-  err = cudnnConvolutionBackwardData(_handle,
-                                     (void *)&alpha,
-                                     APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
-                                     APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
-                                     desc,
-                                     (void *)&beta,
-                                     APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
-#endif
 
   if (err != CUDNN_STATUS_SUCCESS) {
     PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
theano/sandbox/cuda/dnn_gw.c

@@ -33,7 +33,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
   if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
     return 1;
 
-#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
   {
     size_t worksize;
     void *workspace;

@@ -168,12 +167,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
   {
     // Extract the properties of the convolution descriptor
-    int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
+    int nd;
+    int pad[2];
+    int stride[2];
+    int upscale[2];
     cudnnConvolutionMode_t mode;
-    err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
-                                          &stride_v, &stride_h,
-                                          &upscale_x, &upscale_y,
-                                          &mode);
+    cudnnDataType_t data_type;
+    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+                                             upscale, &mode, &data_type);
     if (err != CUDNN_STATUS_SUCCESS) {
       PyErr_Format(PyExc_RuntimeError,

@@ -192,7 +193,7 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
     // Ensure that the selected implementation supports the requested
     // convolution. Fall back to a safe implementation otherwise.
-    if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
+    if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
         input_w > 1024 || (filter_h == 1 && filter_w == 1))
     {
       chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;

@@ -232,16 +233,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
                                        APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
   }
-#else
-  err = cudnnConvolutionBackwardFilter(_handle,
-                                       (void *)&alpha,
-                                       APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
-                                       APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
-                                       desc,
-                                       (void *)&beta,
-                                       APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
-#endif
 
   if (err != CUDNN_STATUS_SUCCESS) {
     PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
theano/sandbox/gpuarray/conv_desc.c

@@ -29,7 +29,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
     return -1;
   }
 
-  err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides, upscale,
-                                        CONV_MODE);
+  err = cudnnSetConvolutionNdDescriptor_v3(*desc, NB_DIMS, pad, strides,
+                                           upscale, CONV_MODE, PRECISION);
 
   return 0;
 }
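The PRECISION value that conv_desc.c now passes to cudnnSetConvolutionNdDescriptor_v3 is driven by the new dnn.conv.precision flag. A hedged sketch of the expected mapping is shown below; the helper and dictionary are illustrative only, since the actual translation lives in the dnn.py wrappers whose diffs are collapsed on this page.

    # Assumed mapping from the dnn.conv.precision flag to the cuDNN compute
    # type used for the convolution descriptor (the enum names are real cuDNN
    # constants, but this helper itself is a sketch, not code from the commit).
    PRECISION_TO_CUDNN = {
        'float16': 'CUDNN_DATA_HALF',
        'float32': 'CUDNN_DATA_FLOAT',
        'float64': 'CUDNN_DATA_DOUBLE',
    }

    def conv_precision(flag_value, input_dtype):
        """'as_input' defers to the dtype of the convolution inputs."""
        if flag_value == 'as_input':
            flag_value = input_dtype
        return PRECISION_TO_CUDNN[flag_value]

    print(conv_precision('as_input', 'float32'))  # -> 'CUDNN_DATA_FLOAT'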
theano/sandbox/gpuarray/cudnn_helper.h

@@ -13,99 +13,12 @@ static inline int cudnnGetVersion() {
 #include <assert.h>
 
-#if CUDNN_VERSION < 3000
-// Here we define the R3 API in terms of functions in the R2 interface
-// This is only for what we use
-
-typedef int cudnnConvolutionBwdDataAlgo_t;
-
-#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 0
-#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 1
-#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT 2
-
-static cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize(
-  cudnnHandle_t handle,
-  const cudnnFilterDescriptor_t filterDesc,
-  const cudnnTensorDescriptor_t diffDesc,
-  const cudnnConvolutionDescriptor_t convDesc,
-  const cudnnTensorDescriptor_t gradDesc,
-  cudnnConvolutionBwdDataAlgo_t algo,
-  size_t *sizeInBytes) {
-  *sizeInBytes = 0;
-  return CUDNN_STATUS_SUCCESS;
-}
-
-static cudnnStatus_t cudnnConvolutionBackwardData_v3(
-  cudnnHandle_t handle,
-  const void *alpha,
-  const cudnnFilterDescriptor_t filterDesc,
-  const void *filterData,
-  const cudnnTensorDescriptor_t diffDesc,
-  const void *diffData,
-  const cudnnConvolutionDescriptor_t convDesc,
-  cudnnConvolutionBwdDataAlgo_t algo,
-  void *workspace,
-  size_t workspaceSizeInBytes,
-  const void *beta,
-  const cudnnTensorDescriptor_t gradDesc,
-  void *gradData) {
-  return cudnnConvolutionBackwardData(handle, alpha, filterDesc, filterData,
-                                      diffDesc, diffData, convDesc, beta,
-                                      gradDesc, gradData);
-}
-
-typedef int cudnnConvolutionBwdFilterAlgo_t;
-
-#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 0
-#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 1
-#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT 2
-
-static cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
-  cudnnHandle_t handle,
-  const cudnnTensorDescriptor_t filterDesc,
-  const cudnnTensorDescriptor_t diffDesc,
-  const cudnnConvolutionDescriptor_t convDesc,
-  const cudnnFilterDescriptor_t gradDesc,
-  cudnnConvolutionBwdDataAlgo_t algo,
-  size_t *sizeInBytes) {
-  *sizeInBytes = 0;
-  return CUDNN_STATUS_SUCCESS;
-}
-
-static cudnnStatus_t cudnnConvolutionBackwardFilter_v3(
-  cudnnHandle_t handle,
-  const void *alpha,
-  const cudnnTensorDescriptor_t srcDesc,
-  const void *srcData,
-  const cudnnTensorDescriptor_t diffDesc,
-  const void *diffData,
-  const cudnnConvolutionDescriptor_t convDesc,
-  cudnnConvolutionBwdFilterAlgo_t algo,
-  void *workspace,
-  size_t workspaceSizeInBytes,
-  const void *beta,
-  const cudnnFilterDescriptor_t gradDesc,
-  void *gradData) {
-  return cudnnConvolutionBackwardFilter(handle, alpha, srcDesc, srcData,
-                                        diffDesc, diffData, convDesc, beta,
-                                        gradDesc, gradData);
-}
+// If needed, define element of the V4 interface in terms of elements of
+// previous versions
+#if defined(CUDNN_VERSION) && CUDNN_VERSION < 4000
+#define CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING 5
+#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING 3
 #endif
theano/sandbox/gpuarray/dnn.py

(Diff collapsed.)
theano/sandbox/gpuarray/dnn_fwd.c

@@ -136,15 +136,26 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
        algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
     algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 
-#if CUDNN_VERSION > 3000
-  if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
+  // The FFT implementation does not support strides, 1x1 filters or inputs
+  // with a spatial dimension larger than 1024. The tiled-FFT implementation
+  // does not support strides.
+  // If the chosen implementation is FFT or tiled-FFT, validate that it can
+  // be used on the current data and default to a safe implementation if it
+  // can't.
+  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
+  // defined only for 2d filters
+  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
+       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) &&
+      PyGpuArray_NDIM(input) == 4) {
+    // Extract the properties of the convolution descriptor
     int nd;
     int pad[2];
     int stride[2];
     int upscale[2];
     cudnnConvolutionMode_t mode;
-    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
-                                          upscale, &mode);
+    cudnnDataType_t data_type;
+    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+                                             upscale, &mode, &data_type);
     if (err != CUDNN_STATUS_SUCCESS) {
       PyErr_Format(PyExc_RuntimeError,
                    "error getting convolution properties: %s",

@@ -153,30 +164,24 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
       return 1;
     }
 
-    if (stride[0] != 1 || stride[1] != 1 ||
-        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
-        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
+    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
     {
-      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+      if (stride[0] != 1 || stride[1] != 1 ||
+          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
+          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
+      {
+        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+      }
+    }
+    else
+    {
+      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
+      if (stride[0] != 1 || stride[1] != 1)
+      {
+        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+      }
     }
   }
-#endif
-
-#if CUDNN_VERSION < 3000
-  /* cuDNN before v3 does not support kernels larger than input even
-   * if appropriate padding is selected. */
-  for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) {
-    if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) {
-      PyErr_SetString(PyExc_RuntimeError, "the current version "
-                      "of CuDNN does not support kernels larger than the "
-                      "inputs in any spatial dimension, even if the inputs "
-                      "are padded such that the padded inputs are larger "
-                      "than the kernels. Update your installation of CuDNN "
-                      "to V3 or more recent to solve the issue.");
-      cuda_exit(c->ctx);
-      return 1;
-    }
-  }
-#endif
 
   {
     size_t worksize;
theano/sandbox/gpuarray/dnn_gi.c

@@ -128,15 +128,26 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
 #endif
 
-#if CUDNN_VERSION > 3000
-  if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
+  // The FFT implementation does not support strides, 1x1 filters or inputs
+  // with a spatial dimension larger than 1024. The tiled-FFT implementation
+  // does not support strides.
+  // If the chosen implementation is FFT or tiled-FFT, validate that it can
+  // be used on the current data and default to a safe implementation if it
+  // can't.
+  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
+  // defined only for 2d filters
+  if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
+       algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) &&
+      PyGpuArray_NDIM(kerns) == 4) {
+    // Extract the properties of the convolution descriptor
     int nd;
     int pad[2];
     int stride[2];
     int upscale[2];
     cudnnConvolutionMode_t mode;
-    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
-                                          upscale, &mode);
+    cudnnDataType_t data_type;
+    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+                                             upscale, &mode, &data_type);
     if (err != CUDNN_STATUS_SUCCESS) {
       PyErr_Format(PyExc_RuntimeError,
                    "error getting convolution properties: %s",

@@ -145,13 +156,24 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
       return 1;
     }
 
-    if (stride[0] != 1 || stride[1] != 1 ||
-        PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
-        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
+    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
     {
-      algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+      if (stride[0] != 1 || stride[1] != 1 ||
+          PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
+          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
+      {
+        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+      }
+    }
+    else
+    {
+      // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
+      if (stride[0] != 1 || stride[1] != 1)
+      {
+        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+      }
     }
   }
-#endif
 
   size_t worksize;
   gpudata *workspace;
theano/sandbox/gpuarray/dnn_gw.c

@@ -130,15 +130,24 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
 #endif
 
-#if CUDNN_VERSION > 3000
-  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
+  // The FFT implementation does not support strides, 1x1 filters or inputs
+  // with a spatial dimension larger than 1024.
+  // If the chosen implementation is FFT, validate that it can
+  // be used on the current data and default to a safe implementation if it
+  // can't.
+  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
+  // defined only for 2d filters
+  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
+      PyGpuArray_NDIM(input) == 4) {
+    // Extract the properties of the convolution descriptor
     int nd;
     int pad[2];
     int stride[2];
     int upscale[2];
     cudnnConvolutionMode_t mode;
-    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
-                                          upscale, &mode);
+    cudnnDataType_t data_type;
+    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+                                             upscale, &mode, &data_type);
     if (err != CUDNN_STATUS_SUCCESS) {
       PyErr_Format(PyExc_RuntimeError,
                    "error getting convolution properties: %s",

@@ -153,7 +162,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
       algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
     }
   }
-#endif
 
   size_t worksize;
   gpudata *workspace;