Commit 87641e6f authored by notoraptor

Remove `AlgoRec.dataType` field as it is used nowhere.

Add some tests, to check at least if everything runs well. Standardize all three files (dnn_fwd, dnn_gi, dnn_gw), to have the same code logic and organization across these three codes. Add fallback entries as fallback functions. Add `-t` to hash for timed algorithms.
Parent 5ce08bc8
......@@ -17,7 +17,6 @@ static inline int cudnnGetVersion() {
/* A cached algorithm choice: a common struct for all 3 CUDNN algorithm
   enums (fwd, bwd-data, bwd-filter). `algo` stores the enum value as an
   int and is cast back to the direction-specific enum at use sites. */
struct AlgoRec {
int algo;                  // algorithm enum value (cast per conv direction)
cudnnDataType_t dataType;  // NOTE(review): commit message says this field is used nowhere — confirm removal
size_t wsSize;             // workspace size in bytes required by `algo`
cudnnMathType_t mathType;  // math mode (e.g. tensor-op) selected with `algo`
};
......
#section init_code_struct
reuse_algo = 0;
use_cached = 0;
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0;
hash_prefix = std::string("FWD|GPU#");
#section support_code_struct
#line 11 "dnn_fwd.c"
#line 9 "dnn_fwd.c"
int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
/** Check the given forward-convolution algorithm against the inputs and the
convolution descriptor; change the algorithm inplace to a fallback
algorithm if the checks fail.
Return 0 on success, non-0 on error (a Python exception is set on error).
NB: when DEBUG is defined, this uses the file-scope buffer `algorithm_name`. **/
int dnn_conv_fwd_fallback(cudnnConvolutionFwdAlgo_t* _algo,
const PyGpuArrayObject* input,
const PyGpuArrayObject* kerns,
cudnnConvolutionDescriptor_t desc) {
cudnnConvolutionFwdAlgo_t algo = *_algo;
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
// NDIM == 5 means a 3D convolution (batch, channels, 3 spatial dims).
if (PyGpuArray_NDIM(input) == 5 &&
!(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
{
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0.
// NB: In cuDNN V7, issue is resolved for 2D convolutions only.
// NOTE(review): assumes cudnnGetVersion() reports >= 6100 for the fixed
// release — TODO confirm the exact version number.
if ((cudnnGetVersion() < 6100 || PyGpuArray_NDIM(input) == 5) &&
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
PyGpuArray_DIM(input, 0) > 65536)
{
#ifdef DEBUG
fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
"will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
/* NB:
TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
New conditions apply and may depend on number of dimensions (2D or 3D)
e.g. for FFT_TILING.
TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/
if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int dilation[2];
cudnnConvolutionMode_t mode;
cudnnDataType_t data_type;
cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, dilation, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
// FFT: reject strides != 1, spatial dims > 1024, and 1x1 kernels.
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
fprintf(stderr, "(replacing fwd algo fft with none)\n");
#endif
}
} else {
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
// tiled-FFT: only strides are rejected.
if (stride[0] != 1 || stride[1] != 1) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
fprintf(stderr, "(replacing fwd algo fft_tiling with none)\n");
#endif
}
}
}
*_algo = algo;
return 0;
}
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
......@@ -25,6 +117,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError,
......@@ -93,10 +186,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
std::string hashkey;
#ifdef DEBUG
char algorithm_name[128];
#endif
size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
......@@ -107,11 +196,13 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (!reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
if (hashkey.empty())
if (hashkey.empty()) {
cuda_exit(c->ctx);
return 1;
hashkey = hash_prefix + pci_id + " " + hashkey;
// check out cache
}
hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
......@@ -132,6 +223,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx);
return -1;
}
......@@ -155,11 +247,13 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
cuda_exit(c->ctx);
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed FWD algo: %s",
cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
......@@ -170,10 +264,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
// NB: It is added again later to cache,
// so maybe this line could be removed.
} else {
err = cudnnGetConvolutionForwardAlgorithm(
......@@ -194,6 +284,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
}
if (dnn_conv_fwd_fallback(&algo, input, kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
// if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{
......@@ -205,45 +300,49 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
algo,
&worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
// Fallback to none algo if not supported
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
fprintf(stderr, "(%s error getting worksize: "
"fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
}
fprintf(stderr, "(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n",
algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
algo,
&worksize);
}
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
algo,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
// save for next time/cache
prev_algo.wsSize = worksize;
prev_algo.algo = algo;
if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
// algo may have changed due to fallback, we must update it.
prev_algo.algo = algo;
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache, even if this node use *_once algo
// (in case the user specify the algo per layer and not globally).
if (params->choose_algo)
// Add to the cache if we choose on shape change, or first time if we choose once.
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
params->choose_time ? "(timed)": "" ,
......@@ -264,7 +363,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
......@@ -282,8 +381,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
cuda_exit(c->ctx);
return 1;
}
......@@ -294,15 +392,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionForward(
params->handle,
alpha_p,
APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g,
APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
err = cudnnConvolutionForward(
params->handle,
alpha_p,
APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g,
APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
}
if (worksize != 0)
......@@ -316,7 +414,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv FWD operation: %s",
cudnnGetErrorString(err));
return 1;
}
......
#section init_code_struct
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0;
use_cached = 0;
hash_prefix = std::string("GI|GPU#");
#section support_code_struct
#line 11 "dnn_gi.c"
#line 9 "dnn_gi.c"
int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
/** Validate the given gradinput (backward-data) algorithm against the
inputs and the convolution descriptor; if the checks fail, replace it
in place with a safe fallback algorithm.
Return 0 on success, non-0 on error (a Python exception is set on error). **/
int dnn_conv_gi_fallback(cudnnConvolutionBwdDataAlgo_t* _algo,
                         const PyGpuArrayObject* input,
                         const PyGpuArrayObject* kerns,
                         cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionBwdDataAlgo_t candidate = *_algo;
  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024; the tiled-FFT implementation
  // does not support strides. Both are defined only for 2d filters, so the
  // checks below are intentionally 2d-specific (NDIM(kerns) == 4).
  const bool is_fft = (candidate == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT);
  const bool is_fft_tiling =
      (candidate == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING);
  if ((is_fft || is_fft_tiling) && PyGpuArray_NDIM(kerns) == 4) {
    // Query padding/stride/upscale settings from the convolution descriptor.
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(
        desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    const bool strided = (stride[0] != 1 || stride[1] != 1);
    if (is_fft) {
      // FFT also rejects large spatial dims and 1x1 kernels.
      const bool spatial_too_large = (PyGpuArray_DIM(input, 2) > 1024 ||
                                      PyGpuArray_DIM(input, 3) > 1024);
      const bool kernel_1x1 = (PyGpuArray_DIM(kerns, 2) == 1 &&
                               PyGpuArray_DIM(kerns, 3) == 1);
      if (strided || spatial_too_large || kernel_1x1) {
        candidate = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
        fprintf(stderr, "(replacing gradinput algo fft with none)\n");
#endif
      }
    } else if (strided) {
      // candidate == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      candidate = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
      fprintf(stderr, "(replacing gradinput algo fft_tiling with none)\n");
#endif
    }
  }
  *_algo = candidate;
  return 0;
}
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
......@@ -24,6 +82,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
......@@ -73,7 +132,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 0;
}
int groups = c_check_groups_for_conv(desc, params->num_groups);
if (groups == -1)
return 1;
......@@ -88,178 +146,155 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey;
if (params->choose_algo && !reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache
hashkey=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
if (hashkey.empty())
return 1;
hashkey = hash_prefix + pci_id + " " + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
if (params->choose_algo && !(reuse_algo || use_cached)) {
if (params->choose_time) {
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (params->choose_algo) {
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
if (!reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache
hashkey=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
if (hashkey.empty()) {
cuda_exit(c->ctx);
return 1;
}
hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed gradinput algo: %s",
cudnnGetErrorString(choice.status));
if (reuse_algo || use_cached) {
algo = (cudnnConvolutionBwdDataAlgo_t)prev_algo.algo;
worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType;
} else {
if (params->choose_time) {
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx);
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
}
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
cuda_exit(c->ctx);
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting best-timed gradinput algo: %s",
cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
}
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
}
}
if (dnn_conv_gi_fallback(&algo, *input, kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
// if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
else
{
// algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_BWD_DATA_ALGO_0)\n",
algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
// save for next time/cache
prev_algo.wsSize = worksize;
} // !(reuse_algo || use_cached || params->choose_time)
if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
// algo may have changed due to fallback, we must update it.
prev_algo.algo = algo;
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo);
} // !(reuse_algo || use_cached || params->choose_time)
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name))
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
......@@ -280,7 +315,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
......@@ -293,8 +328,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
cuda_exit(c->ctx);
return 1;
}
......@@ -304,8 +338,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < groups; g++)
{
for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionBackwardData(
params->handle,
alpha_p,
......@@ -326,7 +359,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv gradinput operation: %s",
cudnnGetErrorString(err));
return 1;
}
......
#section init_code_struct
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0;
use_cached = 0;
hash_prefix = std::string("GW|GPU#");
#section support_code_struct
#line 11 "dnn_gw.c"
#line 9 "dnn_gw.c"
int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
/** Validate the given gradweight (backward-filter) algorithm against the
inputs and the convolution descriptor; if the checks fail, replace it
in place with a safe fallback algorithm.
Return 0 on success, non-0 on error (a Python exception is set on error). **/
int dnn_conv_gw_fallback(cudnnConvolutionBwdFilterAlgo_t* _algo,
                         const PyGpuArrayObject* input,
                         const PyGpuArrayObject* kerns,
                         cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionBwdFilterAlgo_t candidate = *_algo;
  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. It is defined only for 2d
  // filters, so the check below is intentionally 2d-specific (NDIM == 4).
  if (candidate == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
      PyGpuArray_NDIM(input) == 4) {
    // Query padding/stride/upscale settings from the convolution descriptor.
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(
        desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    const bool unsupported =
        stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1);
    if (unsupported) {
      candidate = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
#ifdef DEBUG
      fprintf(stderr, "(replacing gradweight algo fft with none)\n");
#endif
    }
  }
  *_algo = candidate;
  return 0;
}
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
......@@ -24,6 +69,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError,
......@@ -82,17 +128,14 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), groups) == -1)
return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups;
size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey ;
size_t free = c_get_largest_free_block_size(c);
......@@ -105,11 +148,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (!reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups);
if (hashkey.empty())
if (hashkey.empty()) {
cuda_exit(c->ctx);
return 1;
hashkey = hash_prefix + pci_id + hashkey;
// check out cache
}
hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
......@@ -122,7 +167,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType;
} else {
if (params->choose_time) {
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
......@@ -131,6 +175,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx);
return -1;
}
......@@ -152,11 +197,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
cuda_exit(c->ctx);
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed gradweight algo: %s",
cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
......@@ -167,8 +214,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
} else {
err = cudnnGetConvolutionBackwardFilterAlgorithm(
......@@ -189,49 +234,57 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
} /* choose_algo */
if (dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
// if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{
{
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(error getting worksize for %s: falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0)\n",
algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
}
if (err != CUDNN_STATUS_SUCCESS) {
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1;
fprintf(stderr, "(%s error getting worksize:%s, falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0",
algorithm_name, cudnnGetErrorString(err));
#endif
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
// save for next time/cache
prev_algo.wsSize = worksize;
prev_algo.algo = algo;
if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
// algo may have changed due to fallback, we must update it.
prev_algo.algo = algo;
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo);
}
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
......@@ -274,9 +327,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < groups; g++)
{
for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionBackwardFilter(
params->handle,
alpha_p,
......@@ -297,7 +348,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv gradweight operation: %s",
cudnnGetErrorString(err));
return 1;
}
......
......@@ -399,7 +399,7 @@ class DnnBase(COp):
return []
def c_code_cache_version(self):
    # Cache key for the generated C code: parent COp's version, the linked
    # cuDNN version, and a local counter.  The counter is bumped (1 -> 4 in
    # this change) whenever the C support code is modified, so previously
    # compiled modules are invalidated and rebuilt.
    return (super(DnnBase, self).c_code_cache_version(), version(), 4)
class GpuDnnConvDesc(COp):
......
......@@ -2287,6 +2287,152 @@ def dconvgi(border_mode, subsample, filter_dilation, num_groups):
return dconvi
class TestDnnConv2DRuntimeAlgorithms(object):
    """Exercise cuDNN convolutions whose algorithm is chosen at runtime
    (``time_*`` / ``guess_*``) and compare every output against the CPU
    reference implementation (CorrMM).

    Each test method is a nose generator: it yields one runner per runtime
    algorithm.  Repeated entries in ``runtime_shapes`` re-use shapes already
    seen, so the algorithm cache is expected to be hit instead of re-running
    the selection.
    """
    ndim = 2
    cpu_conv_class = theano.tensor.nnet.corr.CorrMM
    # Each entry is (ntimes, [inputs_shape, filters_shape]).
    runtime_shapes = [
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),  # cache should be used
        (1, [(2, 2, 50, 50), (5, 2, 25, 31)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),  # cache should be used
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),  # cache should be used
        (1, [(1, 2, 3, 4), (6, 2, 2, 1)])
    ]

    def __init__(self):
        self.runtime_algorithms = ('time_once', 'guess_once',
                                   'time_on_shape_change', 'guess_on_shape_change')

    # -- shared helpers ----------------------------------------------------

    def _symbolic_variables(self, dtype):
        # Build fresh root symbolic input/filter variables of rank 2 + ndim.
        # NOTE(review): the previous version rebound the variables with
        # ``inputs /= 10`` before passing them to
        # ``theano.function([inputs, filters], ...)``; a variable with an
        # owner cannot be a function input, so the scaling is now applied to
        # derived expressions (see _scaled) instead.
        broadcastable = [False] * (2 + self.ndim)
        inputs = theano.tensor.TensorType(dtype, broadcastable)()
        filters = theano.tensor.TensorType(dtype, broadcastable)()
        return inputs, filters

    def _scaled(self, inputs, filters):
        # Scale values down to limit float32 rounding error in comparisons,
        # without touching the root variables used as function inputs.
        return inputs / 10, filters / 10

    def _cpu_reference(self, inputs, filters, unit_shape):
        # Symbolic CPU reference convolution.  CorrMM computes a correlation,
        # so the spatial filter axes are flipped to match true convolution.
        if self.ndim == 3:
            flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
        else:
            flipped_filters = filters[:, :, ::-1, ::-1]
        return self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs),
                                                         flipped_filters)

    def _assert_node_types(self, f, expected_class, forbidden_classes):
        # The compiled graph must contain exactly one node of
        # `expected_class` and no node of any of `forbidden_classes`.
        nodes = f.maker.fgraph.apply_nodes
        assert 1 == len([node for node in nodes
                         if isinstance(node.op, expected_class)])
        for forbidden_class in forbidden_classes:
            assert not any(isinstance(node.op, forbidden_class)
                           for node in nodes)

    def _compare_against_reference(self, algo, f, f_ref, dtype):
        # Run GPU and CPU functions over the runtime shape list and check the
        # outputs match.  For the `*_once` algorithms only the first shape is
        # used (5 times), since the algorithm choice is made exactly once.
        runtime_shapes = self.runtime_shapes
        if algo in ('time_once', 'guess_once'):
            runtime_shapes = [list(runtime_shapes[0])]
            runtime_shapes[0][0] = 5
        for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
            for i in range(ntimes):
                inputs_val = np.random.random(inputs_shape).astype(dtype)
                filters_val = np.random.random(filters_shape).astype(dtype)
                gpu_res = f(inputs_val, filters_val)
                cpu_res = f_ref(inputs_val, filters_val)
                utt.assert_allclose(cpu_res, gpu_res)

    # -- test generators ---------------------------------------------------

    def test_fwd_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim

        def run_fwd_runtime_algorithm(algo):
            inputs, filters = self._symbolic_variables(dtype)
            scaled_inputs, scaled_filters = self._scaled(inputs, filters)
            conv = dnn.dnn_conv(img=scaled_inputs, kerns=scaled_filters,
                                algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
            f = theano.function([inputs, filters], conv, mode=mode_with_gpu)
            conv_ref = self._cpu_reference(scaled_inputs, scaled_filters,
                                           unit_shape)
            f_ref = theano.function([inputs, filters], conv_ref,
                                    mode='FAST_RUN')
            self._compare_against_reference(algo, f, f_ref, dtype)

        for algo in self.runtime_algorithms:
            yield (run_fwd_runtime_algorithm, algo)

    def test_gradinput_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim

        def run_gradinput_runtime_algorithm(algo):
            # The runtime algorithm for gradinput is taken from the config
            # flag, not from the `algo` argument of dnn_conv.
            theano.config.dnn.conv.algo_bwd_data = algo
            inputs, filters = self._symbolic_variables(dtype)
            scaled_inputs, scaled_filters = self._scaled(inputs, filters)
            conv = dnn.dnn_conv(img=scaled_inputs, kerns=scaled_filters,
                                algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
            grad_i = theano.tensor.grad(conv.sum(), [inputs])
            f = theano.function([inputs, filters], grad_i, mode=mode_with_gpu)
            self._assert_node_types(f, dnn.GpuDnnConvGradI,
                                    (dnn.GpuDnnConv, dnn.GpuDnnConvGradW))
            conv_ref = self._cpu_reference(scaled_inputs, scaled_filters,
                                           unit_shape)
            grad_i_ref = theano.tensor.grad(conv_ref.sum(), [inputs])
            f_ref = theano.function([inputs, filters], grad_i_ref,
                                    mode='FAST_RUN')
            self._compare_against_reference(algo, f, f_ref, dtype)

        for algo in self.runtime_algorithms:
            yield (run_gradinput_runtime_algorithm, algo)

    def test_gradweight_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim

        def run_gradweight_runtime_algorithm(algo):
            # The runtime algorithm for gradweight is taken from the config
            # flag, not from the `algo` argument of dnn_conv.
            theano.config.dnn.conv.algo_bwd_filter = algo
            inputs, filters = self._symbolic_variables(dtype)
            scaled_inputs, scaled_filters = self._scaled(inputs, filters)
            conv = dnn.dnn_conv(img=scaled_inputs, kerns=scaled_filters,
                                algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
            grad_w = theano.tensor.grad(conv.sum(), [filters])
            f = theano.function([inputs, filters], grad_w, mode=mode_with_gpu)
            self._assert_node_types(f, dnn.GpuDnnConvGradW,
                                    (dnn.GpuDnnConv, dnn.GpuDnnConvGradI))
            conv_ref = self._cpu_reference(scaled_inputs, scaled_filters,
                                           unit_shape)
            grad_w_ref = theano.tensor.grad(conv_ref.sum(), [filters])
            f_ref = theano.function([inputs, filters], grad_w_ref,
                                    mode='FAST_RUN')
            self._compare_against_reference(algo, f, f_ref, dtype)

        for algo in self.runtime_algorithms:
            yield (run_gradweight_runtime_algorithm, algo)
class TestDnnConv3DRuntimeAlgorithms(TestDnnConv2DRuntimeAlgorithms):
    # Same runtime-algorithm checks as the 2D case, run on 5D tensors.
    ndim = 3
    # Each entry is (ntimes, [inputs_shape, filters_shape]).  Entries that
    # repeat an earlier shape are expected to hit the algorithm cache
    # instead of re-running algorithm selection.
    runtime_shapes = [
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),  # cache should be used
        (1, [(2, 2, 50, 50, 5), (5, 2, 25, 31, 4)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),  # cache should be used
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),  # cache should be used
        (1, [(1, 2, 3, 4, 5), (6, 2, 3, 2, 1)])
    ]
class Cudnn_grouped_conv(Grouped_conv_noOptim):
    # Re-run the grouped-convolution test suite on GPU with the cuDNN
    # forward convolution op substituted in.
    mode = mode_with_gpu
    conv = staticmethod(dconvfwd)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论