testgroup / pytensor · Commits

Commit 8cb9d50e, authored Aug 04, 2015 by Frédéric Bastien

    Merge pull request #3245 from carriepl/v3

    CuDNN v3

Parents: 2b83b6ac, 4d4be316

Showing 10 changed files with 458 additions and 62 deletions.
Changed files:

    doc/library/sandbox/cuda/dnn.txt                       +62   -9
    theano/sandbox/cuda/cudnn_helper.h                     +93   -0
    theano/sandbox/cuda/dnn.py                              +0   -0  (diff collapsed)
    theano/sandbox/cuda/dnn_base.c                         +30  -38
    theano/sandbox/cuda/dnn_conv_base.c                    +38   -3
    theano/sandbox/cuda/dnn_fwd.c                           +0   -0  (diff collapsed)
    theano/sandbox/cuda/dnn_gi.c                          +206   -4
    theano/sandbox/cuda/dnn_gw.c                            +0   -0  (diff collapsed)
    theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py    +29   -8
    theano/sandbox/cuda/tests/test_dnn.py                   +0   -0  (diff collapsed)
doc/library/sandbox/cuda/dnn.txt

@@ -36,12 +36,61 @@ To get an error if Theano can not use cuDNN, use this Theano flag:

```diff
 .. note::
 
-    CuDNN v2 is now released. If you used any v2 release candidate, we
-    strongly suggest that you update it to the final version. From now
-    on, we only support the final release.
+    CuDNN v3 has now been released. CuDNN v2 remains supported, but CuDNN v3
+    is faster and offers many more options. We recommend that everybody
+    update to v3.
+
+.. note::
+
+    Starting with CuDNN v3, multiple convolution implementations are offered
+    and heuristics can be used to automatically choose an implementation well
+    suited to the parameters of the convolution.
+
+    The Theano flag ``dnn.conv.algo_fwd`` specifies the CuDNN convolution
+    implementation that Theano should use for forward convolutions. Possible
+    values include:
+
+    * ``small`` (default): a convolution implementation with small memory
+      usage.
+    * ``none``: a slower implementation with minimal memory usage.
+    * ``large``: a faster implementation with large memory usage.
+    * ``fft``: the Fast Fourier Transform implementation of convolution
+      (very high memory usage).
+    * ``guess_once``: the first time a convolution is executed, the
+      implementation to use is chosen according to CuDNN's heuristics and
+      reused for every subsequent execution of the convolution.
+    * ``guess_on_shape_change``: like ``guess_once``, but a new
+      implementation is selected every time the shapes of the inputs and
+      kernels do not match the shapes from the last execution.
+    * ``time_once``: the first time a convolution is executed, every
+      convolution implementation offered by CuDNN is executed and timed.
+      The fastest is reused for every subsequent execution of the
+      convolution.
+    * ``time_on_shape_change``: like ``time_once``, but a new implementation
+      is selected every time the shapes of the inputs and kernels do not
+      match the shapes from the last execution.
+
+    The Theano flag ``dnn.conv.algo_bwd`` specifies the CuDNN convolution
+    implementation that Theano should use for gradient convolutions.
+    Possible values include:
+
+    * ``none`` (default): the default non-deterministic convolution
+      implementation.
+    * ``deterministic``: a slower but deterministic implementation.
+    * ``fft``: the Fast Fourier Transform implementation of convolution
+      (very high memory usage).
+    * ``guess_once``: the first time a convolution is executed, the
+      implementation to use is chosen according to CuDNN's heuristics and
+      reused for every subsequent execution of the convolution.
+    * ``guess_on_shape_change``: like ``guess_once``, but a new
+      implementation is selected every time the shapes of the inputs and
+      kernels do not match the shapes from the last execution.
+    * ``time_once``: the first time a convolution is executed, every
+      convolution implementation offered by CuDNN is executed and timed.
+      The fastest is reused for every subsequent execution of the
+      convolution.
+    * ``time_on_shape_change``: like ``time_once``, but a new implementation
+      is selected every time the shapes of the inputs and kernels do not
+      match the shapes from the last execution.
 
-CuDNN v2 is much faster than v1. We recommend that everybody
-update to v2.
-
 .. note::
```

...

@@ -51,13 +100,16 @@ To get an error if Theano can not use cuDNN, use this Theano flag:

```diff
 .. note::
 
-    The documentation of CUDNN R1 and R2 tells that, for the following
-    2 operations, the reproducibility is not guaranteed:
+    The CUDNN documentation tells that, for the 2 following operations, the
+    reproducibility is not guaranteed with the default implementation:
     `cudnnConvolutionBackwardFilter` and `cudnnConvolutionBackwardData`.
     Those correspond to the gradient w.r.t. the weights and the gradient
     w.r.t. the input of the convolution. They are also sometimes used in the
     forward pass, when they give a speed up.
 
+    The Theano flag ``dnn.conv.algo_bwd`` can be used to force the use of a
+    slower but deterministic convolution implementation.
+
 .. note::
 
     There is a problem we do not understand yet when cudnn paths are
```

...

@@ -79,7 +131,8 @@ Convolution Ops

```diff
 ===============
 
 .. automodule:: theano.sandbox.cuda.dnn
-    :members: GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI
+    :members: GpuDnnConvDesc, GpuDnnConv, GpuDnnConv3d, GpuDnnConvGradW,
+              GpuDnnConv3dGradW, GpuDnnConvGradI, GpuDnnConv3dGradI
 
 Pooling Ops
 ===========
```
...
...
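The flags documented in the hunk above are normally set before Theano is imported. A minimal sketch, assuming only that Theano reads the standard `THEANO_FLAGS` environment variable; the specific values chosen here are illustrative:

```python
import os

# Hypothetical usage: build a THEANO_FLAGS string selecting the cuDNN
# convolution implementations documented in this commit. The flag names
# (dnn.conv.algo_fwd / dnn.conv.algo_bwd) come from the diff above; the
# surrounding script is an assumption for illustration.
flags = {
    "dnn.conv.algo_fwd": "time_once",      # time every algo once, keep fastest
    "dnn.conv.algo_bwd": "deterministic",  # reproducible gradients
}
os.environ["THEANO_FLAGS"] = ",".join("%s=%s" % kv for kv in flags.items())
print(os.environ["THEANO_FLAGS"])
# A subsequent `import theano` would pick these settings up.
```
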
theano/sandbox/cuda/cudnn_helper.h

@@ -41,6 +41,20 @@ static inline const char *cudnnGetErrorString(cudnnStatus_t err) {

```diff
 typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
 
+static inline cudnnStatus_t cudnnSetTensorNdDescriptor(
+    cudnnTensorDescriptor_t tensorDesc,
+    cudnnDataType_t dataType,
+    int nbDims,
+    const int dimA[],
+    const int strideA[]) {
+  if (nbDims != 4)  /* the page showed "ndDims", a typo for the parameter */
+    return CUDNN_STATUS_NOT_SUPPORTED;
+  return cudnnSetTensor4dDescriptorEx(tensorDesc, dataType,
+                                      dimA[0], dimA[1], dimA[2], dimA[3],
+                                      strideA[0], strideA[1],
+                                      strideA[2], strideA[3]);
+}
+
 static inline cudnnStatus_t cudnnGetConvolution2dForwardOutputDim(
     const cudnnConvolutionDescriptor_t convDesc,
```
...
...
@@ -183,6 +197,85 @@ cudnnConvolutionBackwardData_v2(

Added: emulation of the v3 N-d pooling API on top of the older 2-d-only API.

```c
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2

static inline cudnnStatus_t cudnnSetPoolingNdDescriptor(
    cudnnPoolingDescriptor_t poolingDesc,
    const cudnnPoolingMode_t mode,
    int nbDims,
    const int windowDimA[],
    const int paddingA[],
    const int strideA[]) {  /* the page showed "const in strideA[]" */
  if (nbDims != 2)
    return CUDNN_STATUS_NOT_SUPPORTED;
  if (paddingA[0] != 0 || paddingA[1] != 0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnSetPoolingDescriptor(poolingDesc, mode,
                                   windowDimA[0], windowDimA[1],
                                   strideA[0], strideA[1]);
}

static inline cudnnStatus_t cudnnGetPoolingNdDescriptor(
    const cudnnPoolingDescriptor_t poolingDesc,
    const int nbDimsRequested,
    cudnnPoolingMode_t *mode,
    int *nbDims,
    int windowA[],
    int paddingA[],
    int strideA[]) {
  int win0, win1, str0, str1;
  cudnnStatus_t err;
  if (nbDimsRequested < 2)  /* the page showed "ndDimsRequested" */
    return CUDNN_STATUS_NOT_SUPPORTED;
  err = cudnnGetPoolingDescriptor(poolingDesc, mode,
                                  &win0, &win1, &str0, &str1);
  if (err != CUDNN_STATUS_SUCCESS)
    return err;
  *nbDims = 2;
  paddingA[0] = 0;
  paddingA[1] = 0;
  windowA[0] = win0;
  windowA[1] = win1;
  strideA[0] = str0;
  strideA[1] = str1;
  return CUDNN_STATUS_SUCCESS;
}

static inline cudnnStatus_t cudnnPoolingForward_v2(
    cudnnHandle_t handle,
    const cudnnPoolingDescriptor_t poolingDesc,
    const void *alpha,
    const cudnnTensorDescriptor_t srcDesc,
    const void *srcData,
    const void *beta,
    const cudnnTensorDescriptor_t destDesc,
    void *destData) {
  /* The old API has no alpha/beta scaling: only the identity case is
     supported. */
  if (*(float *)alpha != 1.0 || *(float *)beta != 0.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnPoolingForward(handle, poolingDesc, srcDesc, srcData,
                             destDesc, destData);
}
#define cudnnPoolingForward cudnnPoolingForward_v2

static inline cudnnStatus_t cudnnPoolingBackward_v2(
    cudnnHandle_t handle,
    const cudnnPoolingDescriptor_t poolingDesc,
    const void *alpha,
    const cudnnTensorDescriptor_t srcDesc,
    const void *srcData,
    const cudnnTensorDescriptor_t srcDiffDesc,
    const void *srcDiffData,
    const cudnnTensorDescriptor_t destDesc,
    const void *destData,
    const void *beta,
    const cudnnTensorDescriptor_t destDiffDesc,
    void *destDiffData) {
  if (*(float *)alpha != 1.0 || *(float *)beta != 0.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnPoolingBackward(handle, poolingDesc, srcDesc, srcData,
                              srcDiffDesc, srcDiffData, destDesc, destData,
                              destDiffDesc, destDiffData);
}
#define cudnnPoolingBackward cudnnPoolingBackward_v2

/* pre-existing context: */
// Needed for R2 rc2
#define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else
```
...
...
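The compatibility shims above all follow the same pattern: accept the newer N-d cuDNN entry point, reject anything the older fixed-rank API cannot express (returning `CUDNN_STATUS_NOT_SUPPORTED`), and otherwise delegate. A rough Python sketch of that pattern; the function and parameter names are hypothetical, not part of the commit:

```python
def set_pooling_nd(pooling_desc, mode, nb_dims, window, padding, stride,
                   set_pooling_2d):
    """Emulate an N-d pooling-descriptor API on top of a 2-d-only backend,
    mirroring the C shim's CUDNN_STATUS_NOT_SUPPORTED paths."""
    if nb_dims != 2:
        return "NOT_SUPPORTED"          # the 2-d API handles exactly 2 dims
    if padding[0] != 0 or padding[1] != 0:
        return "NOT_SUPPORTED"          # the 2-d API has no padding arguments
    # Delegate to the older fixed-rank call.
    return set_pooling_2d(pooling_desc, mode, window[0], window[1],
                          stride[0], stride[1])
```
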
theano/sandbox/cuda/dnn.py — diff collapsed.
theano/sandbox/cuda/dnn_base.c

The 4-d-only descriptor helpers are replaced by N-d versions:

```diff
 #section support_code
 
 static cudnnHandle_t _handle = NULL;
 
 static int
-c_set_tensor4d(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
-  cudnnStatus_t err = cudnnSetTensor4dDescriptorEx(
-      desc, CUDNN_DATA_FLOAT,
-      CudaNdarray_HOST_DIMS(var)[0],
-      CudaNdarray_HOST_DIMS(var)[1],
-      CudaNdarray_HOST_DIMS(var)[2],
-      CudaNdarray_HOST_DIMS(var)[3],
-      CudaNdarray_HOST_STRIDES(var)[0] ? CudaNdarray_HOST_STRIDES(var)[0] :
-          CudaNdarray_HOST_DIMS(var)[2] * CudaNdarray_HOST_DIMS(var)[3] *
-          CudaNdarray_HOST_DIMS(var)[1],
-      CudaNdarray_HOST_STRIDES(var)[1] ? CudaNdarray_HOST_STRIDES(var)[1] :
-          CudaNdarray_HOST_DIMS(var)[2] * CudaNdarray_HOST_DIMS(var)[3],
-      CudaNdarray_HOST_STRIDES(var)[2] ? CudaNdarray_HOST_STRIDES(var)[2] :
-          CudaNdarray_HOST_DIMS(var)[3],
-      CudaNdarray_HOST_STRIDES(var)[3] ? CudaNdarray_HOST_STRIDES(var)[3] :
-          1);
+c_set_tensorNd(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
+  int dim = CudaNdarray_NDIM(var);
+  int strides[dim];
+  int default_str = 1;
+
+  for (int i = dim - 1; i >= 0; i--)
+  {
+    if (CudaNdarray_HOST_STRIDES(var)[i])
+      strides[i] = CudaNdarray_HOST_STRIDES(var)[i];
+    else
+      strides[i] = default_str;
+    default_str *= CudaNdarray_HOST_DIMS(var)[i];
+  }
+
+  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
+                                                 CudaNdarray_HOST_DIMS(var),
+                                                 strides);
   if (err != CUDNN_STATUS_SUCCESS) {
     PyErr_Format(PyExc_RuntimeError,
-                 "Could not set tensor4d descriptor: %s"
-                 "shapes=%d %d %d %d strides=%d %d %d %d",
-                 cudnnGetErrorString(err),
-                 /* ... the four dims and the four (defaulted) strides,
-                    computed exactly as in the call above ... */);
+                 "Could not set tensorNd descriptor: %s dim=%d",
+                 cudnnGetErrorString(err), dim);
     return -1;
   }
   return 0;
 }
 
 static int
-c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
+c_set_filterNd(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
   if (!CudaNdarray_is_c_contiguous(var)) {
     PyErr_SetString(PyExc_ValueError,
                     "Only contiguous filters (kernels) are supported.");
     return -1;
   }
-  cudnnStatus_t err = cudnnSetFilter4dDescriptor(
-      desc, CUDNN_DATA_FLOAT,
-      CudaNdarray_HOST_DIMS(var)[0],
-      CudaNdarray_HOST_DIMS(var)[1],
-      CudaNdarray_HOST_DIMS(var)[2],
-      CudaNdarray_HOST_DIMS(var)[3]);
+  int dim = CudaNdarray_NDIM(var);
+  cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
+                                                 CudaNdarray_HOST_DIMS(var));
   if (err != CUDNN_STATUS_SUCCESS) {
     PyErr_Format(PyExc_RuntimeError,
                  "Could not set filter descriptor: %s."
-                 " dims= %d %d %d %d",
-                 cudnnGetErrorString(err),
-                 CudaNdarray_HOST_DIMS(var)[0], CudaNdarray_HOST_DIMS(var)[1],
-                 CudaNdarray_HOST_DIMS(var)[2], CudaNdarray_HOST_DIMS(var)[3]);
+                 " dims= %d",
+                 cudnnGetErrorString(err), dim);
     return -1;
   }
   return 0;
 }
```
...
...
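The stride-defaulting loop in the new `c_set_tensorNd` replaces zero strides (broadcastable or size-1 dimensions) with the C-contiguous strides they would have, accumulating the product of trailing dimension sizes. A sketch of the same computation in plain Python (the helper name is ours, not part of the commit):

```python
def default_strides(dims, strides):
    """Replace zero strides with C-contiguous defaults, mirroring the
    loop in c_set_tensorNd: walk dimensions from last to first, keeping
    a running product of the dimension sizes seen so far."""
    out = list(strides)
    default_str = 1
    for i in range(len(dims) - 1, -1, -1):
        if not strides[i]:
            out[i] = default_str
        default_str *= dims[i]
    return out
```
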
theano/sandbox/cuda/dnn_conv_base.c

@@ -3,6 +3,24 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);

```diff
 cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
 cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
 
+/* Keep track, from one execution to another, of the dimensions of the data
+   and the algorithms, if any, that were selected according to these
+   dimensions and according to the amount of memory available at that time.
+
+   Note: implementation selection for backward convolution only exists
+   starting with v3. */
+int APPLY_SPECIFIC(previous_input_shape)[5];
+int APPLY_SPECIFIC(previous_kerns_shape)[5];
+int APPLY_SPECIFIC(previous_output_shape)[5];
+bool APPLY_SPECIFIC(previous_algo_set);
+cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
+#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
+cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
+cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
+#endif
+
 #section init_code_struct
 
 cudnnStatus_t APPLY_SPECIFIC(err);
```

...

@@ -10,21 +28,38 @@ APPLY_SPECIFIC(input) = NULL;

```diff
 APPLY_SPECIFIC(output) = NULL;
 APPLY_SPECIFIC(kerns) = NULL;
 
 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(
         &APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate tensor 4d descriptor "
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
   FAIL;
 }
 
 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(
         &APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate tensor 4d descriptor "
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
   FAIL;
 }
 
 if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(
         &APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
   PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
                cudnnGetErrorString(APPLY_SPECIFIC(err)));
   FAIL;
 }
 
+for (int i = 0; i < 5; i++)
+{
+  APPLY_SPECIFIC(previous_input_shape)[i] = 0;
+  APPLY_SPECIFIC(previous_kerns_shape)[i] = 0;
+  APPLY_SPECIFIC(previous_output_shape)[i] = 0;
+}
+
+APPLY_SPECIFIC(previous_algo_set) = false;
+
+// Select default implementations for the case where the convolution
+// implementations should be selected based on the size of the data.
+APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
+APPLY_SPECIFIC(previous_bwd_f_algo) = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+APPLY_SPECIFIC(previous_bwd_d_algo) = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+#endif
 
 #section cleanup_code_struct
 
 if (APPLY_SPECIFIC(input) != NULL)
```
...
...
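The per-node state added above implements a small cache: remember the shapes seen the last time an algorithm was selected, and reuse that algorithm either unconditionally after the first choice (`guess_once`/`time_once`) or only while the shapes stay the same (`*_on_shape_change`). A sketch of that caching policy in Python; the class and method names are ours, purely illustrative:

```python
class AlgoCache:
    """Shape-keyed cache of a chosen convolution algorithm, mirroring the
    previous_*_shape / previous_algo_set fields added in dnn_conv_base.c."""

    def __init__(self, default_algo):
        self.algo = default_algo     # safe default until a choice is made
        self.algo_set = False
        self.kerns_shape = None
        self.output_shape = None

    def lookup(self, kerns_shape, output_shape, choose_once=False):
        """Return the cached algorithm if it may be reused, else None."""
        if choose_once:
            # Reuse any previously made choice, whatever the shapes.
            reuse = self.algo_set
        else:
            # Reuse only if the shapes match the last selection.
            reuse = (kerns_shape == self.kerns_shape and
                     output_shape == self.output_shape)
        return self.algo if reuse else None

    def store(self, algo, kerns_shape, output_shape):
        self.algo = algo
        self.algo_set = True
        self.kerns_shape = kerns_shape
        self.output_shape = output_shape
```
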
theano/sandbox/cuda/dnn_fwd.c — diff collapsed.
theano/sandbox/cuda/dnn_gi.c

@@ -12,25 +12,225 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,

```diff
     return 1;
   }
 
-  if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
+  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
     return 1;
-  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
+  if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
     return 1;
 
+  int nb_dim = CudaNdarray_NDIM(output);
+
 #ifdef CONV_INPLACE
   Py_XDECREF(*input);
   *input = im;
   Py_INCREF(*input);
 #else
-  if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0)
+  if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
     return 1;
   if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
     return 1;
 #endif
 
-  if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
+  if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
     return 1;
```

Added: algorithm selection, FFT validation, and workspace handling for cuDNN v3.

```c
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
  {
    size_t worksize;
    void *workspace;
    cudnnConvolutionBwdDataAlgo_t chosen_algo;

    if (CHOOSE_ALGO)
    {
      // A new convolution implementation should be selected, based either
      // on timing or on heuristics, in one of the two following cases:
      // - the implementation should only be chosen during the first
      //   execution of an apply node, and this is that first execution;
      // - the implementation should be chosen as often as necessary, and
      //   the shapes of the inputs differ from the last time an
      //   implementation was chosen.
      bool reuse_previous_algo;
      if (CHOOSE_ALGO_ONCE)
      {
        // Only choose a new implementation if none has been chosen before.
        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
      }
      else
      {
        // Reuse the previous implementation if the kernels and the outputs
        // have the same shapes as they had when it was selected.
        bool same_shapes = true;
        for (int i = 0; (i < nb_dim) && same_shapes; i++)
        {
          same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
                          APPLY_SPECIFIC(previous_kerns_shape)[i]);
          same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
                          APPLY_SPECIFIC(previous_output_shape)[i]);
        }
        reuse_previous_algo = same_shapes;
      }

      // If the previously chosen implementation can't be reused, select a
      // new one based on the shapes of the current inputs.
      if (!reuse_previous_algo)
      {
        // Obtain a convolution algorithm appropriate for the kernel and
        // output shapes, either by choosing one according to heuristics or
        // by making cuDNN time every implementation and choosing the best.
        if (CHOOSE_ALGO_TIME)
        {
          // Time the different implementations to choose the best one.
          int requestedCount = 1;
          int count;
          cudnnConvolutionBwdDataAlgoPerf_t chosen_algo_perf;
          err = cudnnFindConvolutionBackwardDataAlgorithm(
              _handle,
              APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
              APPLY_SPECIFIC(input),
              requestedCount, &count, &chosen_algo_perf);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradI: error selecting convolution "
                         "algo: %s", cudnnGetErrorString(err));
            return 1;
          }
          chosen_algo = chosen_algo_perf.algo;
        }
        else
        {
          // Choose the implementation using heuristics based on the shapes
          // of the inputs and the amount of memory available.

          // Get the amount of available memory.
          size_t free = 0, total = 0;
          cudaError_t err2 = cudaMemGetInfo(&free, &total);
          if (err2 != cudaSuccess) {
            cudaGetLastError();
            fprintf(stderr,
                    "Error when trying to find the memory information"
                    " on the GPU: %s\n", cudaGetErrorString(err2));
            return 1;
          }

          // Use heuristics to choose the implementation.
          err = cudnnGetConvolutionBackwardDataAlgorithm(
              _handle,
              APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
              APPLY_SPECIFIC(input),
              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
              free, &chosen_algo);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradI: error selecting convolution "
                         "algo: %s", cudnnGetErrorString(err));
            return 1;
          }
        }

        // Store the shapes of the kernels and output, as well as the
        // chosen algorithm, for future use.
        APPLY_SPECIFIC(previous_bwd_d_algo) = chosen_algo;
        for (int i = 0; i < nb_dim; i++)
        {
          APPLY_SPECIFIC(previous_kerns_shape)[i] =
              CudaNdarray_HOST_DIMS(kerns)[i];
          APPLY_SPECIFIC(previous_output_shape)[i] =
              CudaNdarray_HOST_DIMS(output)[i];
        }
      }
      else
      {
        // Reuse the previously chosen convolution implementation.
        chosen_algo = APPLY_SPECIFIC(previous_bwd_d_algo);
      }
    }
    else
    {
      chosen_algo = CONV_ALGO;
    }

    // The FFT implementation (only in v3 and onward) does not support
    // strides, 1x1 filters or inputs with a spatial dimension larger than
    // 1024. If the chosen implementation is FFT, validate that it can be
    // used on the current data and fall back to a safe implementation if
    // it can't.
    if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT && nb_dim == 4)
    {
      // Extract the properties of the convolution descriptor.
      int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
      cudnnConvolutionMode_t mode;
      err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
                                            &stride_v, &stride_h,
                                            &upscale_x, &upscale_y, &mode);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConvGradI: error getting convolution "
                     "properties: %s", cudnnGetErrorString(err));
        return 1;
      }

      // Extract the spatial sizes of the filters and the input. For a 4-d
      // NCHW tensor these are indices 2 and 3 (the page showed indices 3
      // and 4, which would be out of range in this nb_dim == 4 branch).
      int filter_h = CudaNdarray_HOST_DIMS(kerns)[2];
      int filter_w = CudaNdarray_HOST_DIMS(kerns)[3];
      int input_h = CudaNdarray_HOST_DIMS(*input)[2];
      int input_w = CudaNdarray_HOST_DIMS(*input)[3];

      // Ensure that the selected implementation supports the requested
      // convolution. Fall back to a safe implementation otherwise.
      if (stride_v != 1 || stride_h != 1 ||
          input_h > 1024 || input_w > 1024 ||
          (filter_h == 1 && filter_w == 1))
      {
        chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }
    }

    // Infer the required workspace size from the chosen implementation.
    err = cudnnGetConvolutionBackwardDataWorkspaceSize(
        _handle,
        APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
        APPLY_SPECIFIC(input), chosen_algo, &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConvGradI: error getting worksize: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    // Allocate the workspace for the convolution.
    workspace = get_work_mem(worksize);
    if (workspace == NULL && worksize != 0)
      return 1;

    // Perform the convolution.
    err = cudnnConvolutionBackwardData_v3(
        _handle,
        (void *)&alpha,
        APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
        APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
        desc, chosen_algo, workspace, worksize,
        (void *)&beta,
        APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
  }
#else
  err = cudnnConvolutionBackwardData(
      _handle,
      (void *)&alpha,
      /* ... */
```

@@ -39,6 +239,8 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,

```diff
       desc,
       (void *)&beta,
       APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
+#endif
 
   if (err != CUDNN_STATUS_SUCCESS) {
     PyErr_Format(PyExc_RuntimeError,
                  "GpuDnnConvGradI: error doing operation: %s",
                  cudnnGetErrorString(err));
```
...
...
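The FFT fallback guard above is a pure shape/stride predicate and is easy to restate on its own. A sketch of the same eligibility check in Python; the helper name is ours, purely illustrative:

```python
def fft_bwd_data_supported(stride, input_hw, filter_hw):
    """Mirror the guard in dnn_gi.c: the cuDNN v3 FFT backward-data
    algorithm rejects strided convolutions, spatial inputs larger than
    1024, and 1x1 filters."""
    stride_v, stride_h = stride
    input_h, input_w = input_hw
    filter_h, filter_w = filter_hw
    if stride_v != 1 or stride_h != 1:
        return False        # FFT algo does not support strides
    if input_h > 1024 or input_w > 1024:
        return False        # spatial dimension limit
    if filter_h == 1 and filter_w == 1:
        return False        # 1x1 filters unsupported
    return True
```
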
theano/sandbox/cuda/dnn_gw.c — diff collapsed.
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py

@@ -452,7 +452,8 @@ def test_default_conv():

```diff
                 for a in f.maker.fgraph.apply_nodes])
 
-def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
+def _test_full(cls, mode=None, version=[-1], extra_shapes=[],
+               test_bigger_kernels=True):
     seed_rng()
     shapes = get_basic_shapes()
     shapes += get_shapes2()
```

...

@@ -481,14 +482,18 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):

```diff
              , ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1))  # a big one
              , ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1))  # MNIST LeNET layer 1
              , ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1))  # layer 1 backprop to weights
-             # other test
-             , ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1))  # kernel bigger than image
              ]
 
+    if test_bigger_kernels:
+        # Shapes where the kernel is larger than the image in some dimension
+        shapes += [((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1))
+                   , ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1))
+                   , ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1))
+                   , ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1))  # kernel bigger than image
+                   , ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1))
+                   , ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1))
+                   ]
+
     shapes += [
         # ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))  # test_lenet_28 1 layers
         # , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))  # test_lenet_28 2 layers
```

...

@@ -516,9 +521,16 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):

```diff
 def test_full():
-    for t in _test_full(None, mode=theano_mode, version=[-1]):
+    # If using a CuDNN version before v3, only run the tests where the
+    # kernels are not larger than the input in any spatial dimension.
+    if cuda.dnn.dnn_available() and cuda.dnn.version() < (3000, 3000):
+        test_bigger_kernels = False
+    else:
+        test_bigger_kernels = True
+
+    for t in _test_full(None, mode=theano_mode, version=[-1],
+                        test_bigger_kernels=test_bigger_kernels):
         yield t
```

...

@@ -531,7 +543,16 @@ def test_gemm_full():

```diff
 def test_dnn_full():
     if not cuda.dnn.dnn_available():
         raise SkipTest(cuda.dnn.dnn_available.msg)
-    for t in _test_full(DnnBase, mode=theano_mode.including("cudnn")):
+    # If using a CuDNN version before v3, only run the tests where the
+    # kernels are not larger than the input in any spatial dimension.
+    if cuda.dnn.version() < (3000, 3000):
+        test_bigger_kernels = False
+    else:
+        test_bigger_kernels = True
+
+    for t in _test_full(DnnBase, mode=theano_mode.including("cudnn"),
+                        test_bigger_kernels=test_bigger_kernels):
         yield t
```
...
...
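Both test generators gate the larger-than-input kernel shapes on the cuDNN version tuple. That gating condition can be stated as a tiny predicate; the function name is ours, purely illustrative:

```python
def use_bigger_kernel_tests(dnn_version):
    """Mirror the gating added to test_full / test_dnn_full: kernels larger
    than the image in a spatial dimension are only supported starting with
    cuDNN v3, whose version is reported as the tuple (3000, 3000)."""
    return dnn_version >= (3000, 3000)
```
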
theano/sandbox/cuda/tests/test_dnn.py — diff collapsed.