Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
b998dc61
提交
b998dc61
authored
8月 21, 2017
作者:
Frédéric Bastien
提交者:
GitHub
8月 21, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #6302 from borisfom/tensor_op
Tensor op, cache
上级
c470bd38
e51b6a22
隐藏空白字符变更
内嵌
并排
正在显示
8 个修改的文件
包含
1103 行增加
和
541 行删除
+1103
-541
conv_desc.c
theano/gpuarray/c_code/conv_desc.c
+15
-0
cudnn_helper.h
theano/gpuarray/c_code/cudnn_helper.h
+9
-1
dnn_conv_base.c
theano/gpuarray/c_code/dnn_conv_base.c
+255
-1
dnn_fwd.c
theano/gpuarray/c_code/dnn_fwd.c
+225
-168
dnn_gi.c
theano/gpuarray/c_code/dnn_gi.c
+225
-199
dnn_gw.c
theano/gpuarray/c_code/dnn_gw.c
+169
-140
dnn.py
theano/gpuarray/dnn.py
+38
-26
test_dnn.py
theano/gpuarray/tests/test_dnn.py
+167
-6
没有找到文件。
theano/gpuarray/c_code/conv_desc.c
浏览文件 @
b998dc61
#section support_code_apply
/* Record the requested group count on a cuDNN convolution descriptor.
 * Grouped convolutions are only supported by the descriptor from cuDNN 7
 * onward; with older cuDNN this is a no-op.
 * Returns 0 on success, -1 on failure (with a Python exception set). */
static int c_set_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  cudnnStatus_t status = cudnnSetConvolutionGroupCount(desc, groups);
  if (CUDNN_STATUS_SUCCESS != status) {
    PyErr_Format(PyExc_RuntimeError,
                 "error setting groups for convolution : %s",
                 cudnnGetErrorString(status));
    return -1;
  }
#endif
  return 0;
}
int
APPLY_SPECIFIC
(
conv_desc
)(
PyArrayObject
*
filt_shp
,
cudnnConvolutionDescriptor_t
*
desc
,
PARAMS_TYPE
*
params
)
{
...
...
@@ -43,5 +56,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
"descriptor: %s"
,
cudnnGetErrorString
(
err
));
return
-
1
;
}
if
(
c_set_groups_for_conv
(
*
desc
,
params
->
num_groups
)
==
-
1
)
return
-
1
;
return
0
;
}
theano/gpuarray/c_code/cudnn_helper.h
浏览文件 @
b998dc61
...
...
@@ -11,6 +11,14 @@ static inline int cudnnGetVersion() {
}
#endif
#if CUDNN_MAJOR < 7
enum
cudnnMathType_t
{
CUDNN_DEFAULT_MATH
=
0
,
CUDNN_TENSOR_OP_MATH
=
1
};
#endif
/* a common struct for all 3 CUDNN enums */
struct
AlgoRec
{
int
algo
;
size_t
wsSize
;
cudnnMathType_t
mathType
;
};
#endif
theano/gpuarray/c_code/dnn_conv_base.c
浏览文件 @
b998dc61
...
...
@@ -3,6 +3,43 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t
APPLY_SPECIFIC
(
output
);
cudnnFilterDescriptor_t
APPLY_SPECIFIC
(
kerns
);
/** Validate the group count against a cuDNN convolution descriptor.
 *
 * With cuDNN >= 7 the descriptor itself carries the group count: when
 * groups > 1 we verify the descriptor agrees with `groups` and return 1,
 * meaning the caller must NOT split tensors per group itself.
 * With cuDNN < 7 there is no descriptor support, so `groups` is returned
 * unchanged and the caller performs the per-group splitting.
 *
 * Returns -1 on error with a Python exception set.
 */
static int c_get_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  if (groups > 1) {
    int desc_groups;
    cudnnStatus_t err = cudnnGetConvolutionGroupCount(desc, &desc_groups);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting groups for convolution : %s",
                   cudnnGetErrorString(err));
      return -1;
    }
    if (groups != desc_groups) {
      /* Fix: a group-count mismatch is a bad value, not an out-of-memory
       * condition, so raise ValueError instead of MemoryError. */
      PyErr_SetString(PyExc_ValueError,
                      "groups specified different from convolution descriptor");
      return -1;
    }
  }
  return 1;
#else
  return groups;
#endif
}
/* Set the math type (e.g. tensor-op math) on a convolution descriptor.
 * Math types exist only from cuDNN 7; earlier versions make this a no-op.
 * Returns 0 on success, -1 on failure (with a Python exception set). */
static int c_set_math_type_for_conv(cudnnConvolutionDescriptor_t desc,
                                    cudnnMathType_t mathtype) {
#if CUDNN_MAJOR >= 7
  // CUDNN7: need to set math type
  cudnnStatus_t status = cudnnSetConvolutionMathType(desc, mathtype);
  if (CUDNN_STATUS_SUCCESS != status) {
    PyErr_Format(PyExc_RuntimeError,
                 "error setting math type for convolution : %s",
                 cudnnGetErrorString(status));
    return -1;
  }
#endif
  return 0;
}
#section init_code_struct
cudnnStatus_t
APPLY_SPECIFIC
(
err
);
...
...
@@ -20,7 +57,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
FAIL
;
}
if
((
APPLY_SPECIFIC
(
err
)
=
cudnnCreateFilterDescriptor
(
&
APPLY_SPECIFIC
(
kerns
)))
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_MemoryError
,
"could not allocate filter descriptor: %s"
,
PyErr_Format
(
PyExc_MemoryError
,
"could not allocate filter descriptor: %s"
,
cudnnGetErrorString
(
APPLY_SPECIFIC
(
err
)));
FAIL
;
}
...
...
@@ -33,3 +70,220 @@ if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor
(
APPLY_SPECIFIC
(
output
));
if
(
APPLY_SPECIFIC
(
kerns
)
!=
NULL
)
cudnnDestroyFilterDescriptor
(
APPLY_SPECIFIC
(
kerns
));
#section support_code
#include <sstream>
#include <string>
#if __cplusplus < 201103L
#include <tr1/unordered_map>
typedef
std
::
tr1
::
unordered_map
<
std
::
string
,
AlgoRec
>
AlgoCache
;
#else
#include <unordered_map>
typedef
std
::
unordered_map
<
std
::
string
,
AlgoRec
>
AlgoCache
;
#endif
#include "pthread.h"
#line 87 "dnn_conv_base.c"
/* Process-wide cache of chosen convolution algorithms, shared by all conv
 * ops; guarded by algoMutex.
 * Fix: a statically allocated pthread mutex must be initialized with
 * PTHREAD_MUTEX_INITIALIZER (POSIX does not guarantee zero-initialization
 * is a valid mutex). */
pthread_mutex_t algoMutex = PTHREAD_MUTEX_INITIALIZER;
AlgoCache algoCache;
/* If `err` is not CUDNN_STATUS_SUCCESS, set a Python RuntimeError built
 * from `msg` and the cuDNN error string. Always returns `err` unchanged
 * so the call can be used inline. */
static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char *msg) {
  if (CUDNN_STATUS_SUCCESS != err)
    PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s: %s",
                 msg, cudnnGetErrorString(err));
  return err;
}
/* Query the largest free GPU memory block for context `c`.
 * On query failure a Python error is set (callers are expected to test
 * PyErr_Occurred()); a 4Mb guess is still returned when no size is known. */
static size_t c_get_largest_free_block_size(PyGpuContextObject *c) {
  size_t maxfree = 0;
  int rc = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &maxfree);
  if (GA_NO_ERROR != rc) {
    PyErr_Format(PyExc_RuntimeError,
                 "Error when trying to find the "
                 "memory information on the GPU");
  }
  // Guess 4Mb if the info is not available
  if (maxfree == 0)
    maxfree = 4 * 1024 * 1024;
  return maxfree;
}
/** Check if convolution output tensor has expected dimensions
    depending on given inputs and number of groups.
    return 0 if everything is ok, non-0 on error.
**/
static int dnn_check_convolution_output(cudnnConvolutionDescriptor_t convDesc,
                                        cudnnTensorDescriptor_t inputDesc,
                                        cudnnFilterDescriptor_t filterDesc,
                                        size_t tensorNdim,
                                        PyGpuArrayObject *output,
                                        int groups) {
  int expected[5] = {0};
  cudnnStatus_t status = cudnnGetConvolutionNdForwardOutputDim(
      convDesc, inputDesc, filterDesc, tensorNdim, expected);
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error computing convolution output dim: %s",
                 cudnnGetErrorString(status));
    return 1;
  }
  if (tensorNdim == 4) {
    /* cuDNN reports per-group channel counts, so the channel axis of the
       actual output is divided by `groups` before comparing. */
    bool mismatch =
        (PyGpuArray_DIMS(output)[0] != expected[0]) ||
        (PyGpuArray_DIMS(output)[1] / groups != expected[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected[3]);
    if (mismatch) {
      PyErr_Format(PyExc_ValueError,
                   "impossible convolution output dim: expected %dx%dx%dx%d"
                   " but received %ldx%ldx%ldx%ld",
                   expected[0], expected[1] * groups,
                   expected[2], expected[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      return 1;
    }
  } else if (tensorNdim == 5) {
    bool mismatch =
        (PyGpuArray_DIMS(output)[0] != expected[0]) ||
        (PyGpuArray_DIMS(output)[1] / groups != expected[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected[4]);
    if (mismatch) {
      PyErr_Format(PyExc_ValueError,
                   "impossible convolution output dim: expected %dx%dx%dx%dx%d"
                   " but received %ldx%ldx%ldx%ldx%ld",
                   expected[0], expected[1] * groups,
                   expected[2], expected[3], expected[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      return 1;
    }
  }
  return 0;
}
/* Render `size` ints from `res` as a comma-separated string, e.g. "1,2,3".
 * An empty string is returned when size <= 0. */
static std::string shape(int *res, int size) {
  std::ostringstream out;
  for (int i = 0; i < size; ++i) {
    if (i > 0)
      out << ',';
    out << res[i];
  }
  return out.str();
}
/* Render a tensor descriptor as "<dims>,<strides>" (both comma-separated).
 * Returns "" if the cuDNN query failed (Python error already set by
 * checkCudnnStatus). */
static std::string shape(cudnnTensorDescriptor_t t) {
  // cuDNN can handle up to CUDNN_DIM_MAX dimensions.
  int dims[CUDNN_DIM_MAX];
  int strides[CUDNN_DIM_MAX];
  int nbDims;
  cudnnDataType_t dtype;
  checkCudnnStatus(cudnnGetTensorNdDescriptor(t, CUDNN_DIM_MAX, &dtype,
                                              &nbDims, dims, strides),
                   "error getting tensor description");
  if (PyErr_Occurred())
    return "";
  return shape(dims, nbDims) + "," + shape(strides, nbDims);
}
/* Render a filter descriptor's dimensions as a comma-separated string and
 * report its data type through `type`.
 * Returns "" if the cuDNN query failed (Python error already set). */
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t *type) {
  cudnnTensorFormat_t fmt;
  int dims[CUDNN_DIM_MAX];
  int nbDims;
  checkCudnnStatus(cudnnGetFilterNdDescriptor(t, CUDNN_DIM_MAX, type,
                                              &fmt, &nbDims, dims),
                   "error getting filter description");
  if (PyErr_Occurred())
    return "";
  return shape(dims, nbDims);
}
/* Render a convolution descriptor as a textual key fragment:
 * "-mode conv|cross -pad ... -subsample ... -dilation ...".
 * Returns "" if the cuDNN query failed (Python error already set). */
static std::string shape(cudnnConvolutionDescriptor_t convDesc) {
  int nDim;
  cudnnConvolutionMode_t mode;
  cudnnDataType_t computeType;
  int padA[5];
  int strideA[5];
  int dilationA[5];
  checkCudnnStatus(cudnnGetConvolutionNdDescriptor(convDesc, 5, &nDim,
                                                   &padA[0], &strideA[0],
                                                   &dilationA[0], &mode,
                                                   &computeType),
                   "error getting convolution description");
  if (PyErr_Occurred())
    return "";
  std::string out("-mode ");
  out += (mode == CUDNN_CONVOLUTION) ? "conv" : "cross";
  out += " -pad ";
  out += shape(padA, nDim);
  out += " -subsample ";
  out += shape(strideA, nDim);
  out += " -dilation ";
  out += shape(dilationA, nDim);
  return out;
}
/* True when all three device pointers satisfy the alignment mask used for
 * cache keying (mask depends on the data type).
 * there have to be entries for both aligned and not. */
static bool all_aligned(cudnnDataType_t type, void *in, void *out, void *filter) {
  const size_t alignMask = (type == CUDNN_DATA_HALF) ? 0x7F : 0xFF;
  const size_t combined = (size_t)in | (size_t)out | (size_t)filter;
  return (combined & alignMask) == 0;
}
/** Build the textual cache key describing one convolution configuration:
 *  group count, input dims/strides, filter dims and convolution settings,
 *  plus an "[unaligned]" tag when any device pointer misses the alignment
 *  mask (aligned and unaligned configs must not share cache entries).
 *
 *  Also validates the output tensor dimensions via
 *  dnn_check_convolution_output.
 *  Returns "" on any failure (Python error set by the helpers).
 *
 *  Fix: removed the unused local `expected_output_dims` (the output-dim
 *  query already happens inside dnn_check_convolution_output).
 */
static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc,
                                  PyGpuArrayObject *input,
                                  cudnnFilterDescriptor_t filterDesc,
                                  PyGpuArrayObject *filter,
                                  cudnnConvolutionDescriptor_t convDesc,
                                  PyGpuArrayObject *output,
                                  int groups) {
  cudnnDataType_t dType;
  std::ostringstream s;

  if (dnn_check_convolution_output(convDesc, inputDesc, filterDesc,
                                   PyGpuArray_NDIM(filter), output,
                                   groups) != 0)
    return "";

  std::string shapeInput = shape(inputDesc);
  std::string shapeFilter = shape(filterDesc, &dType);
  std::string shapeConvDesc = shape(convDesc);
  if (shapeInput.empty() || shapeFilter.empty() || shapeConvDesc.empty())
    return "";

  s << "-g " << groups
    << " -dim " << shapeInput
    << " -filt " << shapeFilter
    << " " << shapeConvDesc;

  // there have to be entries for both aligned and not.
  if (!all_aligned(dType, PyGpuArray_DEV_DATA(input),
                   PyGpuArray_DEV_DATA(output),
                   PyGpuArray_DEV_DATA(filter))) {
    s << " [unaligned]";
  }
  return s.str();
}
/* Insert or overwrite the algorithm record for `key` in the global
 * algorithm cache, under the cache mutex. */
static void dnn_conv_update_cache(const std::string &key, const AlgoRec &entry) {
  pthread_mutex_lock(&algoMutex);
  algoCache[key] = entry;
  pthread_mutex_unlock(&algoMutex);
}
/* Look up `key` in the global algorithm cache under the cache mutex.
 * Returns a pointer to the cached record, or NULL (0) when absent.
 * NOTE(review): the pointer refers into the cache; entries are only ever
 * added/overwritten here, never erased, in the visible code. */
static const AlgoRec *dnn_conv_check_cache(const std::string &key) {
  const AlgoRec *found = 0;
  pthread_mutex_lock(&algoMutex);
  AlgoCache::iterator it = algoCache.find(key);
  if (it != algoCache.end())
    found = &it->second;
  pthread_mutex_unlock(&algoMutex);
  return found;
}
theano/gpuarray/c_code/dnn_fwd.c
浏览文件 @
b998dc61
#section init_code_struct
prev_algo
.
algo
=
PARAMS
->
conv_algo
;
prev_algo
.
mathType
=
CUDNN_DEFAULT_MATH
;
reuse_algo
=
0
;
prev_algo
=
PARAMS
->
conv_algo
;
memset
(
prev_img_dims
,
0
,
sizeof
(
prev_img_dims
));
memset
(
prev_kern_dims
,
0
,
sizeof
(
prev_kern_dims
));
hash_prefix
=
std
::
string
(
"FWD|GPU#"
);
#section support_code_struct
#line 9 "dnn_fwd.c"
int
reuse_algo
;
AlgoRec
prev_algo
;
std
::
string
hash_prefix
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
/** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
    Return 0 on success, non-0 on error (Python exception set).

    Fix: removed four dead locals (`reuse_algo`, `prev_algo`,
    `prev_img_dims`, `prev_kern_dims`) that shadowed the struct-level state
    of the same names and were never used here. **/
int dnn_conv_fwd_fallback(cudnnConvolutionFwdAlgo_t *_algo,
                          const PyGpuArrayObject *input,
                          const PyGpuArrayObject *kerns,
                          cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionFwdAlgo_t algo = *_algo;

  /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
  if (PyGpuArray_NDIM(input) == 5 &&
      !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)) {
#ifdef DEBUG
    if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
      return 1;
    fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n",
            algorithm_name);
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
  // Issue should be resolved for cuDNN > V6.0.
  // NB: In cuDNN V7, issue is resolved for 2D convolutions only.
  if ((cudnnGetVersion() < 6100 || PyGpuArray_NDIM(input) == 5) &&
      algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
      PyGpuArray_DIM(input, 0) > 65536) {
#ifdef DEBUG
    fprintf(stderr,
            "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
            "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  /* NB:
     TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
     New conditions apply and may depend on number of dimensions (2D or 3D)
     e.g. for FFT_TILING.
     TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
  */
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) &&
      PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                                        dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
        fprintf(stderr, "(replacing fwd algo fft with none)\n");
#endif
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
        fprintf(stderr, "(replacing fwd algo fft_tiling with none)\n");
#endif
      }
    }
  }
  *_algo = algo;
  return 0;
}
int
APPLY_SPECIFIC
(
conv_fwd
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
kerns
,
...
...
@@ -24,6 +117,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
void
*
beta_p
;
float
af
=
alpha
,
bf
=
beta
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
bool
use_cached
=
0
;
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
]
*
params
->
num_groups
)
{
PyErr_SetString
(
PyExc_ValueError
,
...
...
@@ -73,65 +167,76 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return
0
;
}
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
params
->
num_groups
)
==
-
1
)
int
groups
=
c_get_groups_for_conv
(
desc
,
params
->
num_groups
);
if
(
groups
==
-
1
)
return
1
;
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
groups
)
==
-
1
)
return
1
;
if
(
c_set_filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
groups
)
==
-
1
)
return
1
;
if
(
c_set_tensor_for_conv
(
*
output
,
APPLY_SPECIFIC
(
output
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_tensor_for_conv
(
*
output
,
APPLY_SPECIFIC
(
output
),
groups
)
==
-
1
)
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
params
->
num_
groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
params
->
num_
groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
*
output
,
0
)
/
params
->
num_
groups
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
*
output
,
0
)
/
groups
;
cudnnConvolutionFwdAlgo_t
algo
=
params
->
conv_algo
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
size_t
worksize
=
0
;
cudnnMathType_t
mathtype
=
CUDNN_DEFAULT_MATH
;
std
::
string
hashkey
;
cuda_enter
(
c
->
ctx
);
size_t
maxfree
=
c_get_largest_free_block_size
(
c
);
if
(
PyErr_Occurred
())
return
1
;
if
(
params
->
choose_algo
)
{
if
(
!
params
->
choose_once
)
{
reuse_algo
=
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
input
,
i
)
==
prev_img_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
kerns
,
i
)
==
prev_kern_dims
[
i
]);
}
}
if
(
!
reuse_algo
)
{
size_t
free
;
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
if
(
err2
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
"memory information on the GPU"
);
char
pci_id
[
16
];
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_PCIBUSID
,
pci_id
);
// check out cache
hashkey
=
dnn_conv_shape
(
APPLY_SPECIFIC
(
input
),
input
,
APPLY_SPECIFIC
(
kerns
),
kerns
,
desc
,
*
output
,
groups
);
if
(
hashkey
.
empty
())
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
hashkey
=
hash_prefix
+
pci_id
+
(
params
->
choose_time
?
" -t "
:
" "
)
+
hashkey
;
const
AlgoRec
*
cached
=
dnn_conv_check_cache
(
hashkey
);
if
(
cached
)
{
prev_algo
=
*
cached
;
use_cached
=
1
;
}
}
// Guess 4Mb if the info is not available
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
if
(
reuse_algo
||
use_cached
)
{
algo
=
(
cudnnConvolutionFwdAlgo_t
)
prev_algo
.
algo
;
worksize
=
prev_algo
.
wsSize
;
mathtype
=
prev_algo
.
mathType
;
}
else
{
if
(
params
->
choose_time
)
{
int
count
;
cudnnConvolutionFwdAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
max
free
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate GPU memory for FindEx"
);
cuda_exit
(
c
->
ctx
);
return
-
1
;
}
// set the 'tensor math ok' flag
c_set_math_type_for_conv
(
desc
,
CUDNN_TENSOR_OP_MATH
);
// We don't sync the buffer as we don't care about the values.
err
=
cudnnFindConvolutionForwardAlgorithmEx
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
*
output
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
max
free
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
...
...
@@ -141,138 +246,56 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit
(
c
->
ctx
);
return
1
;
}
algo
=
choice
.
algo
;
#ifdef DEBUG
if
(
count
==
0
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv fwd algorithm found"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed FWD algo: %s"
,
cudnnGetErrorString
(
choice
.
status
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
// Else, count is necessarly 1 for current implementation.
#endif
algo
=
choice
.
algo
;
prev_algo
.
algo
=
(
int
)
algo
;
prev_algo
.
wsSize
=
worksize
=
choice
.
memory
;
#if CUDNN_MAJOR >= 7
prev_algo
.
mathType
=
mathtype
=
choice
.
mathType
;
#endif
}
else
{
err
=
cudnnGetConvolutionForwardAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
desc
,
APPLY_SPECIFIC
(
output
),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
return
1
;
// NB: This is printed only when algorithm is chosen at runtime.
if
(
reuse_algo
)
fprintf
(
stderr
,
"(reused %s)
\n
"
,
algorithm_name
);
else
fprintf
(
stderr
,
"(using %s)
\n
"
,
algorithm_name
);
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
else
{
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
prev_img_dims
[
i
]
=
PyGpuArray_DIM
(
input
,
i
);
prev_kern_dims
[
i
]
=
PyGpuArray_DIM
(
kerns
,
i
);
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
maxfree
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
prev_algo
.
algo
=
algo
;
// no tensor_op returned from Get()
prev_algo
.
mathType
=
mathtype
=
CUDNN_DEFAULT_MATH
;
}
}
}
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
if
(
PyGpuArray_NDIM
(
input
)
==
5
&&
!
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
||
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
||
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
))
{
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
return
1
;
fprintf
(
stderr
,
"(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0.
// NB: In cuDNN V7, issue is resolved for 2D convolutionss only.
if
((
cudnnGetVersion
()
<
6100
||
PyGpuArray_NDIM
(
input
)
==
5
)
&&
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
&&
PyGpuArray_DIM
(
input
,
0
)
>
65536
)
{
#ifdef DEBUG
fprintf
(
stderr
,
"(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
"will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
);
#endif
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
/* NB:
TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
New conditions apply and may depend on number of dimensions (2D or 3D)
e.g. for FFT_TILING.
TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/
if
((
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT
||
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
)
&&
PyGpuArray_NDIM
(
input
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
dilation
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
dilation
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
if
(
algo
==
CUDNN_CONVOLUTION_FWD_ALGO_FFT
)
{
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
}
else
{
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
)
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
}
}
if
(
c_set_math_type_for_conv
(
desc
,
mathtype
)
==
-
1
||
dnn_conv_fwd_fallback
(
&
algo
,
input
,
kerns
,
desc
)
!=
0
)
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
// if FindEx was used (choose_time), workspace size is set.
if
(
!
(
reuse_algo
||
use_cached
||
params
->
choose_time
))
{
size_t
worksize
;
gpudata
*
workspace
;
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
...
...
@@ -280,19 +303,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
APPLY_SPECIFIC
(
output
),
algo
,
&
worksize
);
if
(
err
==
CUDNN_STATUS_NOT_SUPPORTED
)
{
// Fallback to none algo if not supported
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
fprintf
(
stderr
,
"(%s error getting worksize: "
"fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
,
algorithm_name
);
}
fprintf
(
stderr
,
"(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
;
err
=
cudnnGetConvolutionForwardWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
...
...
@@ -303,13 +324,47 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
if
(
params
->
choose_algo
&&
(
!
params
->
choose_once
||
!
reuse_algo
))
{
// algo may have changed due to fallback, we must update it.
prev_algo
.
algo
=
algo
;
// save worksize for next time/cache
prev_algo
.
wsSize
=
worksize
;
// Add to the cache if we choose on shape change, or first time if we choose once.
dnn_conv_update_cache
(
hashkey
,
prev_algo
);
}
#ifdef DEBUG
if
(
params
->
choose_algo
)
{
if
(
0
!=
theano_enum_to_string_cudnnConvolutionFwdAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
fprintf
(
stderr
,
"(using %s%s %s%s%s, ws:%ld, hash:%s)
\n
"
,
algorithm_name
,
mathtype
==
CUDNN_TENSOR_OP_MATH
?
"[T]"
:
""
,
params
->
choose_time
?
"(timed)"
:
""
,
reuse_algo
?
"(reused)"
:
""
,
use_cached
?
"(cache)"
:
""
,
worksize
,
hashkey
.
c_str
()
);
}
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
{
gpudata
*
workspace
=
0
;
/*
* This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module
...
...
@@ -318,8 +373,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if
(
worksize
!=
0
)
{
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
if
(
workspace
==
NULL
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"Could not allocate working memory"
);
PyErr_SetString
(
PyExc_RuntimeError
,
"Could not allocate working memory"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
...
...
@@ -329,16 +383,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait
(
kerns
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
((
*
output
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
for
(
int
g
=
0
;
g
<
params
->
num_
groups
;
g
++
)
{
err
=
cudnnConvolutionForward
(
params
->
handle
,
alpha_p
,
APPLY_SPECIFIC
(
input
),
((
char
*
)
PyGpuArray_DEV_DATA
(
input
))
+
input_offset
*
g
,
APPLY_SPECIFIC
(
kerns
),
((
char
*
)
PyGpuArray_DEV_DATA
(
kerns
))
+
kern_offset
*
g
,
desc
,
algo
,
worksize
==
0
?
NULL
:
*
(
void
**
)
workspace
,
worksize
,
beta_p
,
APPLY_SPECIFIC
(
output
),
((
char
*
)
PyGpuArray_DEV_DATA
(
*
output
))
+
output_offset
*
g
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
err
=
cudnnConvolutionForward
(
params
->
handle
,
alpha_p
,
APPLY_SPECIFIC
(
input
),
((
char
*
)
PyGpuArray_DEV_DATA
(
input
))
+
input_offset
*
g
,
APPLY_SPECIFIC
(
kerns
),
((
char
*
)
PyGpuArray_DEV_DATA
(
kerns
))
+
kern_offset
*
g
,
desc
,
algo
,
worksize
==
0
?
NULL
:
*
(
void
**
)
workspace
,
worksize
,
beta_p
,
APPLY_SPECIFIC
(
output
),
((
char
*
)
PyGpuArray_DEV_DATA
(
*
output
))
+
output_offset
*
g
);
}
if
(
worksize
!=
0
)
...
...
@@ -348,12 +402,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_record
(
kerns
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_record
((
*
output
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
}
cuda_exit
(
c
->
ctx
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error doing operation: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error doing
cuDNN conv FWD
operation: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
return
0
;
}
theano/gpuarray/c_code/dnn_gi.c
浏览文件 @
b998dc61
#section init_code_struct
prev_algo
.
algo
=
PARAMS
->
conv_algo
;
prev_algo
.
mathType
=
CUDNN_DEFAULT_MATH
;
reuse_algo
=
0
;
prev_algo
=
PARAMS
->
conv_algo
;
memset
(
prev_kern_dims
,
0
,
sizeof
(
prev_kern_dims
));
memset
(
prev_top_dims
,
0
,
sizeof
(
prev_top_dims
));
hash_prefix
=
std
::
string
(
"GI|GPU#"
);
#section support_code_struct
#line 9 "dnn_gi.c"
int
reuse_algo
;
AlgoRec
prev_algo
;
std
::
string
hash_prefix
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
/** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
    Return 0 on success, non-0 on error (Python exception set).

    Fix: removed four dead locals (`reuse_algo`, `prev_algo`,
    `prev_kern_dims`, `prev_top_dims`) that shadowed the struct-level state
    of the same names and were never used here. **/
int dnn_conv_gi_fallback(cudnnConvolutionBwdDataAlgo_t *_algo,
                         const PyGpuArrayObject *input,
                         const PyGpuArrayObject *kerns,
                         cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionBwdDataAlgo_t algo = *_algo;

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
       algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) &&
      PyGpuArray_NDIM(kerns) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                                        upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
        fprintf(stderr, "(replacing gradinput algo fft with none)\n");
#endif
      }
    } else {
      // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
        fprintf(stderr, "(replacing gradinput algo fft_tiling with none)\n");
#endif
      }
    }
  }
  *_algo = algo;
  return 0;
}
int
APPLY_SPECIFIC
(
conv_gi
)(
PyGpuArrayObject
*
kerns
,
PyGpuArrayObject
*
output
,
...
...
@@ -23,6 +82,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
void
*
beta_p
;
float
af
=
alpha
,
bf
=
beta
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
bool
use_cached
=
0
;
if
(
PyGpuArray_DIMS
(
im
)[
1
]
!=
PyGpuArray_DIMS
(
kerns
)[
1
]
*
params
->
num_groups
)
{
PyErr_SetString
(
PyExc_ValueError
,
"images and kernel must have the same "
...
...
@@ -72,233 +132,200 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return
0
;
}
if
(
c_set_tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
params
->
num_groups
)
==
-
1
)
int
groups
=
c_get_groups_for_conv
(
desc
,
params
->
num_groups
);
if
(
groups
==
-
1
)
return
1
;
if
(
c_set_
filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_
tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
groups
)
==
-
1
)
return
1
;
if
(
c_set_
tensor_for_conv
(
*
input
,
APPLY_SPECIFIC
(
input
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_
filter
(
kerns
,
APPLY_SPECIFIC
(
kerns
),
groups
)
==
-
1
)
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
*
input
,
0
)
/
params
->
num_groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
params
->
num_groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
params
->
num_groups
;
cudnnConvolutionBwdDataAlgo_t
algo
=
params
->
conv_algo
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
cuda_enter
(
c
->
ctx
);
int
expected_output_dims
[
5
]
=
{
0
};
err
=
cudnnGetConvolutionNdForwardOutputDim
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_NDIM
(
im
),
expected_output_dims
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error computing convolution output dim: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
if
(
c_set_tensor_for_conv
(
*
input
,
APPLY_SPECIFIC
(
input
),
groups
)
==
-
1
)
return
1
;
}
if
(
PyGpuArray_NDIM
(
im
)
==
4
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
/
params
->
num_groups
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
else
if
(
PyGpuArray_NDIM
(
im
)
==
5
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
])
||
(
PyGpuArray_DIMS
(
output
)[
4
]
!=
expected_output_dims
[
4
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
expected_output_dims
[
4
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
],
PyGpuArray_DIMS
(
output
)[
4
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
if
(
params
->
choose_algo
)
{
if
(
!
params
->
choose_once
)
{
reuse_algo
=
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
kerns
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
kerns
,
i
)
==
prev_kern_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
output
,
i
)
==
prev_top_dims
[
i
]);
}
}
if
(
0
!=
dnn_check_convolution_output
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_NDIM
(
kerns
),
output
,
groups
))
return
1
;
if
(
!
reuse_algo
)
{
size_t
free
;
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
)
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
*
input
,
0
)
/
groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
kerns
,
0
)
*
PyGpuArray_DIM
(
kerns
,
0
)
/
groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
groups
;
if
(
err2
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
"memory information on the GPU"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
cudnnConvolutionBwdDataAlgo_t
algo
=
params
->
conv_algo
;
size_t
worksize
=
0
;
cudnnMathType_t
mathtype
=
CUDNN_DEFAULT_MATH
;
// Guess 4Mb if the info is not available
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
std
::
string
hashkey
;
if
(
params
->
choose_time
)
{
int
count
;
cudnnConvolutionBwdDataAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
size_t
maxfree
=
c_get_largest_free_block_size
(
c
);
if
(
PyErr_Occurred
())
return
1
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
return
-
1
;
}
cuda_enter
(
c
->
ctx
);
err
=
cudnnFindConvolutionBackwardDataAlgorithmEx
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
*
input
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
gpudata_release
(
tmpmem
);
if
(
params
->
choose_algo
)
{
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
if
(
!
reuse_algo
)
{
char
pci_id
[
16
];
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_PCIBUSID
,
pci_id
);
// check out cache
hashkey
=
dnn_conv_shape
(
APPLY_SPECIFIC
(
input
),
*
input
,
APPLY_SPECIFIC
(
kerns
),
kerns
,
desc
,
output
,
groups
);
if
(
hashkey
.
empty
())
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
hashkey
=
hash_prefix
+
pci_id
+
(
params
->
choose_time
?
" -t "
:
" "
)
+
hashkey
;
const
AlgoRec
*
cached
=
dnn_conv_check_cache
(
hashkey
);
if
(
cached
)
{
prev_algo
=
*
cached
;
use_cached
=
1
;
}
}
algo
=
choice
.
algo
;
#ifdef DEBUG
if
(
count
==
0
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv gradinput algorithm found"
);
if
(
reuse_algo
||
use_cached
)
{
algo
=
(
cudnnConvolutionBwdDataAlgo_t
)
prev_algo
.
algo
;
worksize
=
prev_algo
.
wsSize
;
mathtype
=
prev_algo
.
mathType
;
}
else
{
if
(
params
->
choose_time
)
{
int
count
;
cudnnConvolutionBwdDataAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
// set the 'tensor math ok' flag
c_set_math_type_for_conv
(
desc
,
CUDNN_TENSOR_OP_MATH
);
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
maxfree
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
cuda_exit
(
c
->
ctx
);
return
-
1
;
}
err
=
cudnnFindConvolutionBackwardDataAlgorithmEx
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
kerns
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
*
input
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
maxfree
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed gradinput algo: %s"
,
cudnnGetErrorString
(
choice
.
status
));
}
#ifdef DEBUG
if
(
count
==
0
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv gradinput algorithm found"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed gradinput algo: %s"
,
cudnnGetErrorString
(
choice
.
status
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
// Else, count is necessarly 1 for current implementation.
#endif
algo
=
choice
.
algo
;
prev_algo
.
algo
=
(
int
)
algo
;
prev_algo
.
wsSize
=
worksize
=
choice
.
memory
;
#if CUDNN_MAJOR >= 7
prev_algo
.
mathType
=
mathtype
=
choice
.
mathType
;
#endif
}
else
{
err
=
cudnnGetConvolutionBackwardDataAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
maxfree
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
// Else, count is necessarly 1 for current implementation.
#endif
}
else
{
err
=
cudnnGetConvolutionBackwardDataAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
prev_algo
.
algo
=
algo
;
// no tensor_op returned from Get()
prev_algo
.
mathType
=
mathtype
=
CUDNN_DEFAULT_MATH
;
}
}
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
}
if
(
c_set_math_type_for_conv
(
desc
,
mathtype
)
==
-
1
||
dnn_conv_gi_fallback
(
&
algo
,
*
input
,
kerns
,
desc
)
!=
0
)
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
#ifdef DEBUG
char
algorithm_name
[
128
];
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t
(
algo
,
algorithm_name
))
// if FindEx was used (choose_time), workspace size is set.
if
(
!
(
reuse_algo
||
use_cached
||
params
->
choose_time
))
{
err
=
cudnnGetConvolutionBackwardDataWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
algo
,
&
worksize
);
if
(
err
==
CUDNN_STATUS_NOT_SUPPORTED
)
{
// Fallback to none algo if not supported
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
// NB: This is printed only when algorithm is chosen at runtime.
if
(
reuse_algo
)
fprintf
(
stderr
,
"(reused %s)
\n
"
,
algorithm_name
);
else
fprintf
(
stderr
,
"(using %s)
\n
"
,
algorithm_name
);
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
else
{
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
kerns
);
i
++
)
{
prev_kern_dims
[
i
]
=
PyGpuArray_DIM
(
kerns
,
i
);
prev_top_dims
[
i
]
=
PyGpuArray_DIM
(
output
,
i
);
}
fprintf
(
stderr
,
"(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_BWD_DATA_ALGO_0)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
;
err
=
cudnnGetConvolutionBackwardDataWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
algo
,
&
worksize
);
}
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if
((
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
||
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT
)
&&
PyGpuArray_NDIM
(
kerns
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
upscale
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
upscale
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
// !(reuse_algo || use_cached || params->choose_time)
if
(
algo
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT
)
{
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
*
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
*
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
;
}
}
else
{
// algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
)
{
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
;
}
}
}
if
(
params
->
choose_algo
&&
(
!
params
->
choose_once
||
!
reuse_algo
))
{
// algo may have changed due to fallback, we must update it.
prev_algo
.
algo
=
algo
;
// save worksize for next time/cache
prev_algo
.
wsSize
=
worksize
;
size_t
worksize
;
gpudata
*
workspace
;
// Add to the cache
dnn_conv_update_cache
(
hashkey
,
prev_algo
);
}
err
=
cudnnGetConvolutionBackwardDataWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
kerns
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
input
),
algo
,
&
worksize
);
#ifdef DEBUG
if
(
params
->
choose_algo
)
{
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf
(
stderr
,
"(using %s %s%s%s%s, ws:%ld, hash:%s)
\n
"
,
algorithm_name
,
params
->
choose_time
?
"(timed)"
:
""
,
reuse_algo
?
"(reused)"
:
""
,
use_cached
?
"(cache)"
:
""
,
mathtype
==
CUDNN_TENSOR_OP_MATH
?
"(tensor op)"
:
""
,
worksize
,
hashkey
.
c_str
()
);
}
#endif
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
gpudata
*
workspace
=
0
;
if
(
worksize
!=
0
)
{
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
if
(
workspace
==
NULL
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"Could not allocate working memory"
);
PyErr_SetString
(
PyExc_RuntimeError
,
"Could not allocate working memory"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
...
...
@@ -308,8 +335,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait
(
output
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
((
*
input
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
for
(
int
g
=
0
;
g
<
params
->
num_groups
;
g
++
)
{
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
err
=
cudnnConvolutionBackwardData
(
params
->
handle
,
alpha_p
,
...
...
@@ -330,7 +356,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit
(
c
->
ctx
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error doing operation: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error doing
cuDNN conv gradinput
operation: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
...
...
theano/gpuarray/c_code/dnn_gw.c
浏览文件 @
b998dc61
#section init_code_struct
prev_algo
.
algo
=
PARAMS
->
conv_algo
;
prev_algo
.
mathType
=
CUDNN_DEFAULT_MATH
;
reuse_algo
=
0
;
prev_algo
=
PARAMS
->
conv_algo
;
memset
(
prev_img_dims
,
0
,
sizeof
(
prev_img_dims
));
memset
(
prev_top_dims
,
0
,
sizeof
(
prev_top_dims
));
hash_prefix
=
std
::
string
(
"GW|GPU#"
);
#section support_code_struct
#line 9 "dnn_gw.c"
int
reuse_algo
;
AlgoRec
prev_algo
;
std
::
string
hash_prefix
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
/** Check given algorithm against inputs and convolution descriptor,
change algorithm inplace to a fallback algorithm if checkings fail.
Return 0 on success, non-0 on error. **/
int
dnn_conv_gw_fallback
(
cudnnConvolutionBwdFilterAlgo_t
*
_algo
,
const
PyGpuArrayObject
*
input
,
const
PyGpuArrayObject
*
kerns
,
cudnnConvolutionDescriptor_t
desc
)
{
cudnnConvolutionBwdFilterAlgo_t
algo
=
*
_algo
;
int
reuse_algo
;
cudnnConvolutionBwdFilterAlgo_t
prev_algo
;
size_t
prev_img_dims
[
5
];
size_t
prev_top_dims
[
5
];
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if
(
algo
==
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT
&&
PyGpuArray_NDIM
(
input
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
upscale
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
cudnnStatus_t
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
upscale
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
;
#ifdef DEBUG
fprintf
(
stderr
,
"(replacing gradweight algo fft with none)
\n
"
);
#endif
}
}
*
_algo
=
algo
;
return
0
;
}
int
APPLY_SPECIFIC
(
conv_gw
)(
PyGpuArrayObject
*
input
,
PyGpuArrayObject
*
output
,
...
...
@@ -23,6 +69,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
void
*
beta_p
;
float
af
=
alpha
,
bf
=
beta
;
cudnnStatus_t
err
=
CUDNN_STATUS_SUCCESS
;
bool
use_cached
=
0
;
if
(
PyGpuArray_DIMS
(
input
)[
1
]
!=
PyGpuArray_DIMS
(
km
)[
1
]
*
params
->
num_groups
)
{
PyErr_SetString
(
PyExc_ValueError
,
...
...
@@ -72,99 +119,71 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return
0
;
}
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
params
->
num_groups
)
==
-
1
)
int
groups
=
c_get_groups_for_conv
(
desc
,
params
->
num_groups
);
if
(
groups
==
-
1
)
return
1
;
if
(
c_set_tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
params
->
num_
groups
)
==
-
1
)
if
(
c_set_tensor_for_conv
(
input
,
APPLY_SPECIFIC
(
input
),
groups
)
==
-
1
)
return
1
;
if
(
c_set_filter
(
*
kerns
,
APPLY_SPECIFIC
(
kerns
),
params
->
num_groups
)
==
-
1
)
if
(
c_set_tensor_for_conv
(
output
,
APPLY_SPECIFIC
(
output
),
groups
)
==
-
1
)
return
1
;
if
(
c_set_filter
(
*
kerns
,
APPLY_SPECIFIC
(
kerns
),
groups
)
==
-
1
)
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
params
->
num_groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
*
kerns
,
0
)
*
PyGpuArray_DIM
(
*
kerns
,
0
)
/
params
->
num_groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
params
->
num_groups
;
if
(
0
!=
dnn_check_convolution_output
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_NDIM
(
*
kerns
),
output
,
groups
))
return
1
;
size_t
input_offset
=
PyGpuArray_STRIDE
(
input
,
0
)
/
groups
;
size_t
kern_offset
=
PyGpuArray_STRIDE
(
*
kerns
,
0
)
*
PyGpuArray_DIM
(
*
kerns
,
0
)
/
groups
;
size_t
output_offset
=
PyGpuArray_STRIDE
(
output
,
0
)
/
groups
;
cudnnConvolutionBwdFilterAlgo_t
algo
=
params
->
conv_algo
;
#ifdef DEBUG
char
algorithm_name
[
128
];
#endif
size_t
worksize
=
0
;
cudnnMathType_t
mathtype
=
CUDNN_DEFAULT_MATH
;
cuda_enter
(
c
->
ctx
)
;
std
::
string
hashkey
;
int
expected_output_dims
[
5
]
=
{
0
};
err
=
cudnnGetConvolutionNdForwardOutputDim
(
desc
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_NDIM
(
input
),
expected_output_dims
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error computing convolution output dim: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
if
(
PyGpuArray_NDIM
(
input
)
==
4
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
/
params
->
num_groups
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%dx%ld"
" but received gradient with shape %ldx%ldx%dx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
else
if
(
PyGpuArray_NDIM
(
input
)
==
5
)
{
if
((
PyGpuArray_DIMS
(
output
)[
0
]
!=
expected_output_dims
[
0
])
||
(
PyGpuArray_DIMS
(
output
)[
1
]
!=
expected_output_dims
[
1
])
||
(
PyGpuArray_DIMS
(
output
)[
2
]
!=
expected_output_dims
[
2
])
||
(
PyGpuArray_DIMS
(
output
)[
3
]
!=
expected_output_dims
[
3
])
||
(
PyGpuArray_DIMS
(
output
)[
4
]
!=
expected_output_dims
[
4
]))
{
PyErr_Format
(
PyExc_ValueError
,
"impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld"
,
expected_output_dims
[
0
],
expected_output_dims
[
1
],
expected_output_dims
[
2
],
expected_output_dims
[
3
],
expected_output_dims
[
4
],
PyGpuArray_DIMS
(
output
)[
0
],
PyGpuArray_DIMS
(
output
)[
1
],
PyGpuArray_DIMS
(
output
)[
2
],
PyGpuArray_DIMS
(
output
)[
3
],
PyGpuArray_DIMS
(
output
)[
4
]);
cuda_exit
(
c
->
ctx
);
return
1
;
}
}
size_t
maxfree
=
c_get_largest_free_block_size
(
c
);
if
(
PyErr_Occurred
())
return
1
;
cuda_enter
(
c
->
ctx
);
if
(
params
->
choose_algo
)
{
if
(
!
params
->
choose_once
)
{
reuse_algo
=
1
;
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
input
,
i
)
==
prev_img_dims
[
i
]);
reuse_algo
=
(
reuse_algo
&&
PyGpuArray_DIM
(
output
,
i
)
==
prev_top_dims
[
i
]);
}
}
if
(
!
reuse_algo
)
{
size_t
free
;
int
err2
=
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_LARGEST_MEMBLOCK
,
&
free
);
if
(
err2
!=
GA_NO_ERROR
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"Error when trying to find the "
"memory information on the GPU"
);
char
pci_id
[
16
];
gpucontext_property
(
c
->
ctx
,
GA_CTX_PROP_PCIBUSID
,
pci_id
);
// check out cache
hashkey
=
dnn_conv_shape
(
APPLY_SPECIFIC
(
input
),
input
,
APPLY_SPECIFIC
(
kerns
),
*
kerns
,
desc
,
output
,
groups
);
if
(
hashkey
.
empty
())
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
hashkey
=
hash_prefix
+
pci_id
+
(
params
->
choose_time
?
" -t "
:
" "
)
+
hashkey
;
const
AlgoRec
*
cached
=
dnn_conv_check_cache
(
hashkey
);
if
(
cached
)
{
prev_algo
=
*
cached
;
use_cached
=
1
;
}
}
// Guess 4Mb if the info is not available
if
(
free
==
0
)
free
=
4
*
1024
*
1024
;
if
(
reuse_algo
||
use_cached
)
{
algo
=
(
cudnnConvolutionBwdFilterAlgo_t
)
prev_algo
.
algo
;
worksize
=
prev_algo
.
wsSize
;
mathtype
=
prev_algo
.
mathType
;
}
else
{
if
(
params
->
choose_time
)
{
int
count
;
cudnnConvolutionBwdFilterAlgoPerf_t
choice
;
gpudata
*
tmpmem
;
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
free
,
NULL
,
0
,
NULL
);
// set the 'tensor math ok' flag
c_set_math_type_for_conv
(
desc
,
CUDNN_TENSOR_OP_MATH
);
tmpmem
=
gpudata_alloc
(
c
->
ctx
,
maxfree
,
NULL
,
0
,
NULL
);
if
(
tmpmem
==
NULL
)
{
PyErr_SetString
(
PyExc_MemoryError
,
"Could not allocate working GPU memory"
);
cuda_exit
(
c
->
ctx
);
return
-
1
;
}
...
...
@@ -172,7 +191,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
params
->
handle
,
APPLY_SPECIFIC
(
input
),
PyGpuArray_DEV_DATA
(
input
),
APPLY_SPECIFIC
(
output
),
PyGpuArray_DEV_DATA
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
PyGpuArray_DEV_DATA
(
*
kerns
),
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
free
);
1
,
&
count
,
&
choice
,
*
(
void
**
)
tmpmem
,
max
free
);
gpudata_release
(
tmpmem
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
...
...
@@ -183,25 +202,32 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return
1
;
}
algo
=
choice
.
algo
;
#ifdef DEBUG
if
(
count
==
0
)
{
PyErr_SetString
(
PyExc_RuntimeError
,
"No best-timed conv gradweight algorithm found"
);
cuda_exit
(
c
->
ctx
);
return
1
;
}
else
if
(
choice
.
status
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting best-timed gradweight algo: %s"
,
cudnnGetErrorString
(
choice
.
status
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
// Else, count is necessarly 1 for current implementation.
#endif
algo
=
choice
.
algo
;
prev_algo
.
algo
=
(
int
)
algo
;
prev_algo
.
wsSize
=
worksize
=
choice
.
memory
;
#if CUDNN_MAJOR >= 7
prev_algo
.
mathType
=
mathtype
=
choice
.
mathType
;
#endif
}
else
{
err
=
cudnnGetConvolutionBackwardFilterAlgorithm
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
free
,
&
algo
);
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
max
free
,
&
algo
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error selecting convolution algo: %s"
,
...
...
@@ -209,79 +235,84 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit
(
c
->
ctx
);
return
1
;
}
prev_algo
.
algo
=
algo
;
// no tensor_op returned from Get()
prev_algo
.
mathType
=
mathtype
=
CUDNN_DEFAULT_MATH
;
}
prev_algo
=
algo
;
}
else
{
algo
=
prev_algo
;
}
}
/* choose_algo */
if
(
c_set_math_type_for_conv
(
desc
,
mathtype
)
==
-
1
||
dnn_conv_gw_fallback
(
&
algo
,
input
,
*
kerns
,
desc
)
!=
0
)
{
cuda_exit
(
c
->
ctx
);
return
1
;
}
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t
(
algo
,
algorithm_name
))
// if FindEx was used (choose_time), workspace size is set.
if
(
!
(
reuse_algo
||
use_cached
||
params
->
choose_time
))
{
err
=
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
algo
,
&
worksize
);
if
(
err
==
CUDNN_STATUS_NOT_SUPPORTED
)
{
// Fallback to none algo if not supported
#ifdef DEBUG
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
// NB: This is printed only when algorithm is chosen at runtime.
if
(
reuse_algo
)
fprintf
(
stderr
,
"(reused %s)
\n
"
,
algorithm_name
);
else
fprintf
(
stderr
,
"(using %s)
\n
"
,
algorithm_name
);
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
else
{
for
(
unsigned
int
i
=
0
;
i
<
PyGpuArray_NDIM
(
input
);
i
++
)
{
prev_img_dims
[
i
]
=
PyGpuArray_DIM
(
input
,
i
);
prev_top_dims
[
i
]
=
PyGpuArray_DIM
(
output
,
i
);
}
fprintf
(
stderr
,
"(error getting worksize for %s: falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0)
\n
"
,
algorithm_name
);
#endif
algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
;
err
=
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
algo
,
&
worksize
);
}
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if
(
algo
==
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT
&&
PyGpuArray_NDIM
(
input
)
==
4
)
{
// Extract the properties of the convolution descriptor
int
nd
;
int
pad
[
2
];
int
stride
[
2
];
int
upscale
[
2
];
cudnnConvolutionMode_t
mode
;
cudnnDataType_t
data_type
;
err
=
cudnnGetConvolutionNdDescriptor
(
desc
,
2
,
&
nd
,
pad
,
stride
,
upscale
,
&
mode
,
&
data_type
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting convolution properties: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
cudnnGetErrorString
(
err
));
cuda_exit
(
c
->
ctx
);
return
1
;
}
if
(
stride
[
0
]
!=
1
||
stride
[
1
]
!=
1
||
PyGpuArray_DIM
(
input
,
2
)
>
1024
||
PyGpuArray_DIM
(
input
,
3
)
>
1024
||
(
PyGpuArray_DIM
(
*
kerns
,
2
)
==
1
&&
PyGpuArray_DIM
(
*
kerns
,
3
)
==
1
))
{
algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
;
}
}
size_t
worksize
;
gpudata
*
workspace
;
if
(
params
->
choose_algo
&&
(
!
params
->
choose_once
||
!
reuse_algo
))
{
// algo may have changed due to fallback, we must update it.
prev_algo
.
algo
=
algo
;
// save worksize for next time/cache
prev_algo
.
wsSize
=
worksize
;
err
=
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
params
->
handle
,
APPLY_SPECIFIC
(
input
),
APPLY_SPECIFIC
(
output
),
desc
,
APPLY_SPECIFIC
(
kerns
),
algo
,
&
worksize
);
// Add to the cache
dnn_conv_update_cache
(
hashkey
,
prev_algo
);
}
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error getting worksize: %s"
,
cudnnGetErrorString
(
err
));
#ifdef DEBUG
if
(
params
->
choose_algo
)
{
if
(
0
!=
theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t
(
algo
,
algorithm_name
))
{
cuda_exit
(
c
->
ctx
);
return
1
;
return
1
;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf
(
stderr
,
"(using %s %s%s%s%s, ws:%ld, hash:%s)
\n
"
,
algorithm_name
,
params
->
choose_time
?
"(timed)"
:
""
,
reuse_algo
?
"(reused)"
:
""
,
use_cached
?
"(cache)"
:
""
,
mathtype
==
CUDNN_TENSOR_OP_MATH
?
"(tensor op)"
:
""
,
worksize
,
hashkey
.
c_str
()
);
}
#endif
if
(
params
->
choose_once
)
{
reuse_algo
=
1
;
}
gpudata
*
workspace
=
0
;
if
(
worksize
!=
0
)
{
workspace
=
gpudata_alloc
(
c
->
ctx
,
worksize
,
NULL
,
0
,
NULL
);
if
(
workspace
==
NULL
)
{
...
...
@@ -295,9 +326,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait
(
output
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_READ
);
cuda_wait
((
*
kerns
)
->
ga
.
data
,
GPUARRAY_CUDA_WAIT_WRITE
);
for
(
int
g
=
0
;
g
<
params
->
num_groups
;
g
++
)
{
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
err
=
cudnnConvolutionBackwardFilter
(
params
->
handle
,
alpha_p
,
...
...
@@ -318,7 +347,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit
(
c
->
ctx
);
if
(
err
!=
CUDNN_STATUS_SUCCESS
)
{
PyErr_Format
(
PyExc_RuntimeError
,
"error doing operation: %s"
,
PyErr_Format
(
PyExc_RuntimeError
,
"error doing
cuDNN conv gradweight
operation: %s"
,
cudnnGetErrorString
(
err
));
return
1
;
}
...
...
theano/gpuarray/dnn.py
浏览文件 @
b998dc61
...
...
@@ -399,7 +399,7 @@ class DnnBase(COp):
return
[]
def
c_code_cache_version
(
self
):
return
(
super
(
DnnBase
,
self
)
.
c_code_cache_version
(),
version
(),
1
)
return
(
super
(
DnnBase
,
self
)
.
c_code_cache_version
(),
version
(),
4
)
class
GpuDnnConvDesc
(
COp
):
...
...
@@ -412,7 +412,8 @@ class GpuDnnConvDesc(COp):
"""
__props__
=
(
'border_mode'
,
'subsample'
,
'dilation'
,
'conv_mode'
,
'precision'
)
__props__
=
(
'border_mode'
,
'subsample'
,
'dilation'
,
'conv_mode'
,
'precision'
,
'num_groups'
)
params_type
=
ParamsType
(
pad0
=
int_t
,
pad1
=
int_t
,
pad2
=
int_t
,
sub0
=
int_t
,
sub1
=
int_t
,
sub2
=
int_t
,
dil0
=
int_t
,
dil1
=
int_t
,
dil2
=
int_t
,
...
...
@@ -421,7 +422,8 @@ class GpuDnnConvDesc(COp):
(
'BORDER_MODE_VALID'
,
'valid'
),
(
'BORDER_MODE_HALF'
,
'half'
)),
conv_mode
=
cudnn
.
cudnnConvolutionMode_t
,
precision
=
cudnn
.
cudnnDataType_t
)
precision
=
cudnn
.
cudnnDataType_t
,
num_groups
=
int_t
)
def
c_headers
(
self
):
return
[
'cudnn.h'
,
'cudnn_helper.h'
]
...
...
@@ -448,7 +450,7 @@ class GpuDnnConvDesc(COp):
return
False
def
__init__
(
self
,
border_mode
,
subsample
=
(
1
,
1
),
dilation
=
(
1
,
1
),
conv_mode
=
'conv'
,
precision
=
"float32"
):
precision
=
"float32"
,
num_groups
=
1
):
COp
.
__init__
(
self
,
[
"c_code/conv_desc.c"
],
"APPLY_SPECIFIC(conv_desc)"
)
if
version
()
<
6000
and
any
([
d
!=
1
for
d
in
dilation
]):
...
...
@@ -470,6 +472,7 @@ class GpuDnnConvDesc(COp):
self
.
subsample
=
subsample
assert
cudnn
.
cudnnConvolutionMode_t
.
has_alias
(
conv_mode
)
self
.
conv_mode
=
conv_mode
self
.
num_groups
=
num_groups
assert
len
(
dilation
)
==
len
(
subsample
)
self
.
dilation
=
dilation
...
...
@@ -514,6 +517,8 @@ class GpuDnnConvDesc(COp):
self
.
__dict__
.
update
(
d
)
if
not
hasattr
(
self
,
"dilation"
):
self
.
dilation
=
(
1
,)
*
len
(
self
.
subsample
)
if
not
hasattr
(
self
,
"num_groups"
):
self
.
num_groups
=
1
# scalar constants
...
...
@@ -622,8 +627,6 @@ class GpuDnnConv(DnnBase):
SUPPORTED_DNN_CONV_ALGO_RUNTIME
):
raise
ValueError
(
"convolution algo
%
s can't be used for "
"3d convolutions"
,
(
self
.
algo
,))
if
img
.
type
.
ndim
==
5
and
self
.
num_groups
!=
1
:
raise
ValueError
(
"Grouped convolutions not implemented for 3D convolutions"
)
if
(
not
isinstance
(
desc
.
type
,
CDataType
)
or
desc
.
type
.
ctype
!=
'cudnnConvolutionDescriptor_t'
):
...
...
@@ -854,7 +857,6 @@ class GpuDnnConvGradI(DnnBase):
if
algo
is
None
:
algo
=
config
.
dnn
.
conv
.
algo_bwd_data
self
.
algo
=
algo
assert
cudnn
.
cudnnConvolutionBwdDataAlgo_t
.
has_alias
(
self
.
algo
)
or
self
.
algo
in
SUPPORTED_DNN_CONV_ALGO_RUNTIME
self
.
conv_algo
=
cudnn
.
cudnnConvolutionBwdDataAlgo_t
.
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
...
...
@@ -1039,7 +1041,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
img
=
gpu_contiguous
(
img
)
kerns
=
gpu_contiguous
(
kerns
)
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns
.
shape
)
desc_op
=
desc
.
owner
.
op
# We can use Shape_i and bypass the infer_shape here as this is on
# the input of node and it will always be present.
...
...
@@ -1056,7 +1059,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
def
dnn_conv3d
(
img
,
kerns
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
direction_hint
=
None
,
algo
=
None
,
precision
=
None
):
algo
=
None
,
precision
=
None
,
num_groups
=
1
):
"""
GPU convolution using cuDNN from NVIDIA.
...
...
@@ -1099,6 +1102,9 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
should be done. Possible values are 'as_input', 'float16', 'float32'
and 'float64'. Default is the value of
:attr:`config.dnn.conv.precision`.
num_groups :
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
.. warning:: The cuDNN library only works with GPUs that have a compute
...
...
@@ -1113,7 +1119,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
fgraph
=
getattr
(
img
,
'fgraph'
,
None
)
or
getattr
(
kerns
,
'fgraph'
,
None
)
ctx_name
=
infer_context_name
(
img
,
kerns
)
if
(
border_mode
==
'valid'
and
subsample
==
(
1
,
1
,
1
)
and
dilation
==
(
1
,
1
,
1
)
and
direction_hint
==
'bprop weights'
):
direction_hint
==
'bprop weights'
and
num_groups
==
1
):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for.
img
=
gpu_contiguous
(
img
.
dimshuffle
(
1
,
0
,
2
,
3
,
4
))
...
...
@@ -1135,7 +1141,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
return
as_gpuarray_variable
(
conv
.
dimshuffle
(
1
,
0
,
2
,
3
,
4
),
ctx_name
)
elif
(
border_mode
==
'full'
and
subsample
==
(
1
,
1
,
1
)
and
direction_hint
!=
'forward!'
):
direction_hint
!=
'forward!'
and
num_groups
==
1
):
# Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution.
# We just need to set up a suitable 'fake' valid convolution.
...
...
@@ -1159,7 +1165,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
img
=
gpu_contiguous
(
img
)
kerns
=
gpu_contiguous
(
kerns
)
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns
.
shape
)
desc_op
=
desc
.
owner
.
op
# We can use Shape_i and bypass the infer_shape here as this is on
# the input of node and it will always be present.
...
...
@@ -1171,7 +1178,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
filter_dilation
=
dilation
)
out_shp
=
assert_conv_shape
(
out_shp
)
out
=
GpuAllocEmpty
(
dtype
=
img
.
dtype
,
context_name
=
ctx_name
)(
*
out_shp
)
return
GpuDnnConv
(
algo
=
algo
)(
img
,
kerns
,
out
,
desc
)
return
GpuDnnConv
(
algo
=
algo
,
num_groups
=
num_groups
)(
img
,
kerns
,
out
,
desc
)
def
dnn_gradweight
(
img
,
topgrad
,
kerns_shp
,
border_mode
=
'valid'
,
...
...
@@ -1189,18 +1196,21 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
precision
=
get_precision
(
precision
,
[
img
,
topgrad
])
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns_shp
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns_shp
)
out
=
GpuAllocEmpty
(
dtype
=
img
.
dtype
,
context_name
=
ctx_name
)(
*
kerns_shp
)
return
GpuDnnConvGradW
(
algo
=
algo
,
num_groups
=
num_groups
)(
img
,
topgrad
,
out
,
desc
)
def
dnn_gradweight3d
(
img
,
topgrad
,
kerns_shp
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
):
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
1
):
"""
3d version of dnn_gradweight
"""
return
dnn_gradweight
(
img
,
topgrad
,
kerns_shp
,
border_mode
,
subsample
,
dilation
,
conv_mode
,
precision
)
subsample
,
dilation
,
conv_mode
,
precision
,
algo
,
num_groups
)
def
dnn_gradinput
(
kerns
,
topgrad
,
img_shp
,
border_mode
=
'valid'
,
...
...
@@ -1218,18 +1228,21 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
precision
=
get_precision
(
precision
,
[
kerns
,
topgrad
])
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
precision
=
precision
)(
kerns
.
shape
)
conv_mode
=
conv_mode
,
precision
=
precision
,
num_groups
=
num_groups
)(
kerns
.
shape
)
out
=
GpuAllocEmpty
(
dtype
=
kerns
.
dtype
,
context_name
=
ctx_name
)(
*
img_shp
)
return
GpuDnnConvGradI
(
algo
=
algo
,
num_groups
=
num_groups
)(
kerns
,
topgrad
,
out
,
desc
)
def
dnn_gradinput3d
(
kerns
,
topgrad
,
img_shp
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
):
subsample
=
(
1
,
1
,
1
),
dilation
=
(
1
,
1
,
1
),
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
1
):
"""
3d version of `dnn_gradinput`.
"""
return
dnn_gradinput
(
kerns
,
topgrad
,
img_shp
,
border_mode
,
subsample
,
dilation
,
conv_mode
,
precision
)
dilation
,
conv_mode
,
precision
,
algo
,
num_groups
)
class
GpuDnnPoolDesc
(
Op
):
...
...
@@ -3020,8 +3033,6 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if
version
(
raises
=
False
)
<
6000
and
op
.
filter_dilation
!=
(
1
,
1
):
return
None
if
op
.
num_groups
>
1
:
return
None
inp1
=
inputs
[
0
]
inp2
=
inputs
[
1
]
...
...
@@ -3071,8 +3082,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
if
version
(
raises
=
False
)
<
6000
and
op
.
filter_dilation
!=
(
1
,
1
,
1
):
return
None
if
op
.
num_groups
>
1
:
return
None
inp1
=
inputs
[
0
]
inp2
=
inputs
[
1
]
...
...
@@ -3091,7 +3100,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
subsample
=
op
.
subsample
,
dilation
=
op
.
filter_dilation
,
direction_hint
=
'forward!'
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
,
num_groups
=
op
.
num_groups
)
elif
isinstance
(
op
,
AbstractConv3d_gradWeights
):
shape
=
(
inp2
.
shape
[
1
],
inp1
.
shape
[
1
],
inputs
[
2
][
0
],
inputs
[
2
][
1
],
inputs
[
2
][
2
])
...
...
@@ -3099,7 +3109,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
border_mode
=
op
.
border_mode
,
subsample
=
op
.
subsample
,
dilation
=
op
.
filter_dilation
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
,
num_groups
=
op
.
num_groups
)
elif
isinstance
(
op
,
AbstractConv3d_gradInputs
):
shape
=
(
inp2
.
shape
[
0
],
inp1
.
shape
[
1
],
inputs
[
2
][
0
],
inputs
[
2
][
1
],
inputs
[
2
][
2
])
...
...
@@ -3107,7 +3118,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
border_mode
=
op
.
border_mode
,
subsample
=
op
.
subsample
,
dilation
=
op
.
filter_dilation
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
,
num_groups
=
op
.
num_groups
)
return
[
rval
]
...
...
theano/gpuarray/tests/test_dnn.py
浏览文件 @
b998dc61
...
...
@@ -26,6 +26,7 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer
from
theano.configdefaults
import
SUPPORTED_DNN_CONV_ALGO_FWD
from
theano.tensor.nnet.tests.test_abstract_conv
import
Grouped_conv_noOptim
from
theano.tensor.nnet.tests.test_abstract_conv
import
Grouped_conv3d_noOptim
try
:
import
pygpu
...
...
@@ -2264,7 +2265,7 @@ def test_dnn_rnn_lstm_grad_c():
utt
.
assert_allclose
(
ref_grads_layer
[
j
],
g
)
def
dconv
2
d
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
fw
d
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
(
img
,
kern
):
return
dnn
.
dnn_conv
(
img
,
kern
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
conv_mode
=
'conv'
,
direction_hint
=
'forward'
,
workmem
=
None
,
...
...
@@ -2272,14 +2273,14 @@ def dconv2d(border_mode, subsample, filter_dilation, num_groups):
return
dconv
def
dconv
2d
w
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
g
w
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconvw
(
img
,
topgrad
,
kshp
):
return
dnn
.
dnn_gradweight
(
img
,
topgrad
,
kshp
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
num_groups
)
return
dconvw
def
dconv
2d
i
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconv
g
i
(
border_mode
,
subsample
,
filter_dilation
,
num_groups
):
def
dconvi
(
kern
,
topgrad
,
imshp
):
return
dnn
.
dnn_gradinput
(
kern
,
topgrad
,
imshp
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
filter_dilation
,
conv_mode
=
'conv'
,
precision
=
None
,
algo
=
None
,
num_groups
=
num_groups
)
...
...
@@ -2288,9 +2289,21 @@ def dconv2di(border_mode, subsample, filter_dilation, num_groups):
class
Cudnn_grouped_conv
(
Grouped_conv_noOptim
):
mode
=
mode_with_gpu
conv
=
staticmethod
(
dconv2d
)
conv_gradw
=
staticmethod
(
dconv2dw
)
conv_gradi
=
staticmethod
(
dconv2di
)
conv
=
staticmethod
(
dconvfwd
)
conv_gradw
=
staticmethod
(
dconvgw
)
conv_gradi
=
staticmethod
(
dconvgi
)
conv_op
=
dnn
.
GpuDnnConv
conv_gradw_op
=
dnn
.
GpuDnnConvGradW
conv_gradi_op
=
dnn
.
GpuDnnConvGradI
flip_filter
=
False
is_dnn
=
True
class
Cudnn_grouped_conv3d
(
Grouped_conv3d_noOptim
):
mode
=
mode_with_gpu
conv
=
staticmethod
(
dconvfwd
)
conv_gradw
=
staticmethod
(
dconvgw
)
conv_gradi
=
staticmethod
(
dconvgi
)
conv_op
=
dnn
.
GpuDnnConv
conv_gradw_op
=
dnn
.
GpuDnnConvGradW
conv_gradi_op
=
dnn
.
GpuDnnConvGradI
...
...
@@ -2519,3 +2532,151 @@ def test_dnn_spatialtf_grad():
utt
.
verify_grad
(
grad_functor
,
[
inputs_val
,
theta_val
],
mode
=
mode_with_gpu
,
abs_tol
=
atol
,
rel_tol
=
rtol
)
class
TestDnnConv2DRuntimeAlgorithms
(
object
):
ndim
=
2
cpu_conv_class
=
theano
.
tensor
.
nnet
.
corr
.
CorrMM
runtime_shapes
=
[
(
3
,
[(
2
,
3
,
10
,
9
),
(
5
,
3
,
7
,
7
)]),
(
1
,
[(
1
,
1
,
100
,
200
),
(
1
,
1
,
50
,
200
)]),
(
1
,
[(
4
,
2
,
20
,
20
),
(
2
,
2
,
20
,
19
)]),
(
3
,
[(
2
,
3
,
10
,
9
),
(
5
,
3
,
7
,
7
)]),
# cache should be used
(
1
,
[(
2
,
2
,
50
,
50
),
(
5
,
2
,
25
,
31
)]),
(
1
,
[(
1
,
1
,
100
,
200
),
(
1
,
1
,
50
,
200
)]),
# cache should be used
(
1
,
[(
4
,
2
,
20
,
20
),
(
2
,
2
,
20
,
19
)]),
# cache should be used
(
1
,
[(
1
,
2
,
3
,
4
),
(
6
,
2
,
2
,
1
)])
]
def
__init__
(
self
):
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
utt
.
seed_rng
()
self
.
runtime_algorithms
=
(
'time_once'
,
'guess_once'
,
'time_on_shape_change'
,
'guess_on_shape_change'
)
def
test_fwd_runtime_algorithms
(
self
):
dtype
=
'float32'
unit_shape
=
(
1
,)
*
self
.
ndim
_broadcastable
=
[
False
]
*
(
2
+
self
.
ndim
)
def
run_fwd_runtime_algorithm
(
algo
):
inputs
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
filters
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
# Scale down the input values to prevent very large absolute errors
# due to float rounding
lower_inputs
=
inputs
/
10
lower_filters
=
filters
/
10
conv
=
dnn
.
dnn_conv
(
img
=
lower_inputs
,
kerns
=
lower_filters
,
algo
=
algo
,
precision
=
dtype
,
subsample
=
unit_shape
,
dilation
=
unit_shape
)
f
=
theano
.
function
([
inputs
,
filters
],
conv
,
mode
=
mode_with_gpu
)
if
self
.
ndim
==
3
:
flipped_filters
=
lower_filters
[:,
:,
::
-
1
,
::
-
1
,
::
-
1
]
else
:
flipped_filters
=
lower_filters
[:,
:,
::
-
1
,
::
-
1
]
conv_ref
=
self
.
cpu_conv_class
(
subsample
=
unit_shape
)(
ref_cast
(
lower_inputs
),
flipped_filters
)
f_ref
=
theano
.
function
([
inputs
,
filters
],
conv_ref
,
mode
=
'FAST_RUN'
)
runtime_shapes
=
self
.
runtime_shapes
if
algo
in
(
'time_once'
,
'guess_once'
):
runtime_shapes
=
[
list
(
runtime_shapes
[
0
])]
runtime_shapes
[
0
][
0
]
=
5
for
ntimes
,
(
inputs_shape
,
filters_shape
)
in
runtime_shapes
:
for
i
in
range
(
ntimes
):
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
dtype
)
filters_val
=
np
.
random
.
random
(
filters_shape
)
.
astype
(
dtype
)
gpu_res
=
f
(
inputs_val
,
filters_val
)
cpu_res
=
f_ref
(
inputs_val
,
filters_val
)
utt
.
assert_allclose
(
cpu_res
,
np
.
asarray
(
gpu_res
))
for
algo
in
self
.
runtime_algorithms
:
yield
(
run_fwd_runtime_algorithm
,
algo
)
def
test_gradinput_runtime_algorithms
(
self
):
dtype
=
'float32'
unit_shape
=
(
1
,)
*
self
.
ndim
_broadcastable
=
[
False
]
*
(
2
+
self
.
ndim
)
def
run_gradinput_runtime_algorithm
(
algo
):
theano
.
config
.
dnn
.
conv
.
algo_bwd_data
=
algo
inputs
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
filters
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
conv
=
dnn
.
dnn_conv
(
img
=
inputs
,
kerns
=
filters
,
algo
=
algo
,
precision
=
dtype
,
subsample
=
unit_shape
,
dilation
=
unit_shape
)
grad_i
=
theano
.
tensor
.
grad
(
conv
.
sum
(),
[
inputs
])
f
=
theano
.
function
([
inputs
,
filters
],
grad_i
,
mode
=
mode_with_gpu
)
assert
1
==
len
([
node
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
if
isinstance
(
node
.
op
,
dnn
.
GpuDnnConvGradI
)])
assert
not
any
(
isinstance
(
node
.
op
,
dnn
.
GpuDnnConv
)
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
)
assert
not
any
(
isinstance
(
node
.
op
,
dnn
.
GpuDnnConvGradW
)
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
)
if
self
.
ndim
==
3
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
,
::
-
1
]
else
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
]
conv_ref
=
self
.
cpu_conv_class
(
subsample
=
unit_shape
)(
ref_cast
(
inputs
),
flipped_filters
)
grad_i_ref
=
theano
.
tensor
.
grad
(
conv_ref
.
sum
(),
[
inputs
])
f_ref
=
theano
.
function
([
inputs
,
filters
],
grad_i_ref
,
mode
=
'FAST_RUN'
)
runtime_shapes
=
self
.
runtime_shapes
if
algo
in
(
'time_once'
,
'guess_once'
):
runtime_shapes
=
[
list
(
runtime_shapes
[
0
])]
runtime_shapes
[
0
][
0
]
=
5
for
ntimes
,
(
inputs_shape
,
filters_shape
)
in
runtime_shapes
:
for
i
in
range
(
ntimes
):
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
dtype
)
filters_val
=
np
.
random
.
random
(
filters_shape
)
.
astype
(
dtype
)
gpu_res
=
f
(
inputs_val
,
filters_val
)
cpu_res
=
f_ref
(
inputs_val
,
filters_val
)
utt
.
assert_allclose
(
cpu_res
,
np
.
asarray
(
gpu_res
))
for
algo
in
self
.
runtime_algorithms
:
yield
(
run_gradinput_runtime_algorithm
,
algo
)
def
test_gradweight_runtime_algorithms
(
self
):
dtype
=
'float32'
unit_shape
=
(
1
,)
*
self
.
ndim
_broadcastable
=
[
False
]
*
(
2
+
self
.
ndim
)
def
run_gradweight_runtime_algorithm
(
algo
):
theano
.
config
.
dnn
.
conv
.
algo_bwd_filter
=
algo
inputs
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
filters
=
theano
.
tensor
.
TensorType
(
dtype
,
_broadcastable
)()
conv
=
dnn
.
dnn_conv
(
img
=
inputs
,
kerns
=
filters
,
algo
=
algo
,
precision
=
dtype
,
subsample
=
unit_shape
,
dilation
=
unit_shape
)
grad_w
=
theano
.
tensor
.
grad
(
conv
.
sum
(),
[
filters
])
f
=
theano
.
function
([
inputs
,
filters
],
grad_w
,
mode
=
mode_with_gpu
)
assert
1
==
len
([
node
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
if
isinstance
(
node
.
op
,
dnn
.
GpuDnnConvGradW
)])
assert
not
any
(
isinstance
(
node
.
op
,
dnn
.
GpuDnnConv
)
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
)
assert
not
any
(
isinstance
(
node
.
op
,
dnn
.
GpuDnnConvGradI
)
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
)
if
self
.
ndim
==
3
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
,
::
-
1
]
else
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
]
conv_ref
=
self
.
cpu_conv_class
(
subsample
=
unit_shape
)(
ref_cast
(
inputs
),
flipped_filters
)
grad_w_ref
=
theano
.
tensor
.
grad
(
conv_ref
.
sum
(),
[
filters
])
f_ref
=
theano
.
function
([
inputs
,
filters
],
grad_w_ref
,
mode
=
'FAST_RUN'
)
runtime_shapes
=
self
.
runtime_shapes
if
algo
in
(
'time_once'
,
'guess_once'
):
runtime_shapes
=
[
list
(
runtime_shapes
[
0
])]
runtime_shapes
[
0
][
0
]
=
5
for
ntimes
,
(
inputs_shape
,
filters_shape
)
in
runtime_shapes
:
for
i
in
range
(
ntimes
):
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
dtype
)
filters_val
=
np
.
random
.
random
(
filters_shape
)
.
astype
(
dtype
)
gpu_res
=
f
(
inputs_val
,
filters_val
)
cpu_res
=
f_ref
(
inputs_val
,
filters_val
)
utt
.
assert_allclose
(
cpu_res
,
np
.
asarray
(
gpu_res
))
for
algo
in
self
.
runtime_algorithms
:
yield
(
run_gradweight_runtime_algorithm
,
algo
)
class
TestDnnConv3DRuntimeAlgorithms
(
TestDnnConv2DRuntimeAlgorithms
):
ndim
=
3
cpu_conv_class
=
theano
.
tensor
.
nnet
.
corr3d
.
Corr3dMM
runtime_shapes
=
[
(
3
,
[(
2
,
3
,
5
,
10
,
9
),
(
5
,
3
,
4
,
7
,
7
)]),
(
1
,
[(
1
,
1
,
5
,
100
,
200
),
(
1
,
1
,
4
,
50
,
200
)]),
(
1
,
[(
4
,
2
,
20
,
20
,
20
),
(
2
,
2
,
20
,
19
,
18
)]),
(
3
,
[(
2
,
3
,
5
,
10
,
9
),
(
5
,
3
,
4
,
7
,
7
)]),
# cache should be used
(
1
,
[(
2
,
2
,
50
,
50
,
5
),
(
5
,
2
,
25
,
31
,
4
)]),
(
1
,
[(
1
,
1
,
5
,
100
,
200
),
(
1
,
1
,
4
,
50
,
200
)]),
# cache should be used
(
1
,
[(
4
,
2
,
20
,
20
,
20
),
(
2
,
2
,
20
,
19
,
18
)]),
# cache should be used
(
1
,
[(
1
,
2
,
3
,
4
,
5
),
(
6
,
2
,
3
,
2
,
1
)])
]
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论