提交 fa5590e6 authored 作者: notoraptor's avatar notoraptor

Make code safer and simpler.

上级 d3cb3ad4
...@@ -59,7 +59,6 @@ if (APPLY_SPECIFIC(kerns) != NULL) ...@@ -59,7 +59,6 @@ if (APPLY_SPECIFIC(kerns) != NULL)
#section support_code #section support_code
#include <sstream> #include <sstream>
#include <vector>
#include <string> #include <string>
#if __cplusplus < 201103L #if __cplusplus < 201103L
#include <tr1/unordered_map> #include <tr1/unordered_map>
...@@ -70,20 +69,17 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache; ...@@ -70,20 +69,17 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#endif #endif
#include "pthread.h" #include "pthread.h"
#line 69 "dnn_conv_base.c" #line 73 "dnn_conv_base.c"
using std::vector;
using std::string;
pthread_mutex_t algoMutex; pthread_mutex_t algoMutex;
AlgoCache algoCache; AlgoCache algoCache;
// Report a cuDNN failure as a Python RuntimeError, prefixing the message
// with caller-supplied context. The status is returned unchanged so the
// call can be chained around any cuDNN API invocation.
static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg)
{
  if (CUDNN_STATUS_SUCCESS != err)
    PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s: %s",
                 msg, cudnnGetErrorString(err));
  return err;
}
...@@ -105,64 +101,69 @@ c_get_largest_free_block_size(PyGpuContextObject *c) ...@@ -105,64 +101,69 @@ c_get_largest_free_block_size(PyGpuContextObject *c)
// Render the first `size` entries of `res` as a comma-separated list,
// e.g. {2,3,4} -> "2,3,4". An empty range yields the empty string.
static std::string shape(int* res, int size)
{
  std::ostringstream out;
  for (int i = 0; i < size; ++i) {
    if (i)
      out << ',';
    out << res[i];
  }
  return out.str();
}
static std::string shape(cudnnTensorDescriptor_t t) static std::string shape(cudnnTensorDescriptor_t t)
{ {
std::vector<int> res; // cuDNN can handle up to CUDNN_DIM_MAX dimensions.
std::vector<int> stride; int res[CUDNN_DIM_MAX];
int stride[CUDNN_DIM_MAX];
int nbDims; int nbDims;
cudnnDataType_t type; cudnnDataType_t type;
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, 0, &type, &nbDims,0,0)); checkCudnnStatus(cudnnGetTensorNdDescriptor(t, CUDNN_DIM_MAX, &type, &nbDims, res, stride),
res.resize(nbDims); "error getting tensor description");
stride.resize(nbDims); if (PyErr_Occurred()) return "";
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, nbDims, &type, &nbDims, res.data(), stride.data())); return shape(res, nbDims) + "," + shape(stride, nbDims);
return shape(&res[0], nbDims) + shape(&stride[0], nbDims);
}; };
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type) static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{ {
cudnnTensorFormat_t format; cudnnTensorFormat_t format;
int sizes = 8; int res[CUDNN_DIM_MAX];
std::vector<int> res(sizes);
int outDims; int outDims;
checkCudnnStatus(cudnnGetFilterNdDescriptor(t, sizes, type, &format, &outDims, res.data())); checkCudnnStatus(cudnnGetFilterNdDescriptor(t, CUDNN_DIM_MAX, type, &format, &outDims, res),
return shape(&res[0], outDims); "error getting filter description");
if (PyErr_Occurred()) return "";
return shape(res, outDims);
}; };
static std::string shape(cudnnConvolutionDescriptor_t convDesc) static std::string shape(cudnnConvolutionDescriptor_t convDesc)
{ {
const int maxDim = 5; int nDim;
int nDim=0;
cudnnConvolutionMode_t mode; cudnnConvolutionMode_t mode;
cudnnDataType_t computeType; cudnnDataType_t computeType;
int padA[maxDim]; int padA[5];
int strideA[maxDim]; int strideA[5];
int dilationA[maxDim]; int dilationA[5];
checkCudnnStatus( checkCudnnStatus(
cudnnGetConvolutionNdDescriptor( convDesc, maxDim, cudnnGetConvolutionNdDescriptor( convDesc, 5,
&nDim, &nDim,
&padA[0], &padA[0],
&strideA[0], &strideA[0],
&dilationA[0], &dilationA[0],
&mode, &mode,
&computeType )); &computeType ),
"error getting convolution description");
if (PyErr_Occurred()) return "";
return std::string("-mode ") + (((int)mode==0) ? "conv" : "corr") + " -padA" + shape(padA,nDim) + " -convStrideA " + shape(strideA, nDim) + " -dilationA " + shape(dilationA, nDim); return (std::string("-mode ") +
((mode == CUDNN_CONVOLUTION) ? "conv" : "cross") +
" -pad " +
shape(padA, nDim) +
" -subsample " +
shape(strideA, nDim) +
" -dilation " +
shape(dilationA, nDim));
} }
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter) static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
...@@ -182,7 +183,7 @@ static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayO ...@@ -182,7 +183,7 @@ static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayO
PyGpuArrayObject* output, int groups) PyGpuArrayObject* output, int groups)
{ {
cudnnDataType_t dType; cudnnDataType_t dType;
std::stringstream s; std::ostringstream s;
int expected_output_dims[5] = {0}; int expected_output_dims[5] = {0};
cudnnStatus_t err = cudnnGetConvolutionNdForwardOutputDim(convDesc, inputDesc, filterDesc, cudnnStatus_t err = cudnnGetConvolutionNdForwardOutputDim(convDesc, inputDesc, filterDesc,
PyGpuArray_NDIM(filter), expected_output_dims); PyGpuArray_NDIM(filter), expected_output_dims);
...@@ -221,16 +222,20 @@ static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayO ...@@ -221,16 +222,20 @@ static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayO
return ""; return "";
} }
} }
std::string shapeInput = shape(inputDesc);
s << "-g" << groups << " -dimA" << shape(inputDesc) << " -filtA" << std::string shapeFilter = shape(filterDesc, &dType);
shape(filterDesc, &dType) << shape(convDesc); std::string shapeConvDesc = shape(convDesc);
if (shapeInput.empty() || shapeFilter.empty() || shapeConvDesc.empty())
return "";
s << "-g " << groups << " -dim " << shapeInput << " -filt " <<
shapeFilter << " " << shapeConvDesc;
// there have to be entries for both aligned and not // there have to be entries for both aligned and not
if (!all_aligned(dType, PyGpuArray_DEV_DATA(input), PyGpuArray_DEV_DATA(output), PyGpuArray_DEV_DATA(filter))) if (!all_aligned(dType, PyGpuArray_DEV_DATA(input), PyGpuArray_DEV_DATA(output), PyGpuArray_DEV_DATA(filter)))
{ {
s << " [unaligned] "; s << " [unaligned]";
} }
return std::string(s.str().c_str()); return s.str();
} }
static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec) static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
...@@ -240,15 +245,11 @@ static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec) ...@@ -240,15 +245,11 @@ static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
pthread_mutex_unlock(&algoMutex); pthread_mutex_unlock(&algoMutex);
} }
static const AlgoRec* dnn_conv_check_cache(const std::string& hash) static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{ {
pthread_mutex_lock(&algoMutex); pthread_mutex_lock(&algoMutex);
bool cacheHit = false;
const AlgoRec* ret = 0; const AlgoRec* ret = 0;
// cout << "dnn_conv_check_cache: "<< hash << endl;
AlgoCache::iterator hit = algoCache.find(hash); AlgoCache::iterator hit = algoCache.find(hash);
if (hit != algoCache.end()) if (hit != algoCache.end())
...@@ -257,4 +258,3 @@ static const AlgoRec* dnn_conv_check_cache(const std::string& hash) ...@@ -257,4 +258,3 @@ static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
pthread_mutex_unlock(&algoMutex); pthread_mutex_unlock(&algoMutex);
return ret; return ret;
} }
#section init_code_struct #section init_code_struct
reuse_algo = 0; reuse_algo = 0;
use_cached = 0;
prev_algo.algo = PARAMS->conv_algo; prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT; prev_algo.dataType = CUDNN_DATA_FLOAT;
hash_prefix = std::string("FW| GPU#"); hash_prefix = std::string("FWD|GPU#");
#section support_code_struct #section support_code_struct
#line 12 "dnn_fwd.c" #line 11 "dnn_fwd.c"
int reuse_algo; int reuse_algo;
bool use_cached; bool use_cached;
AlgoRec prev_algo; AlgoRec prev_algo;
...@@ -72,7 +73,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -72,7 +73,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
return 0; return 0;
} }
int groups = c_check_groups_for_conv(desc, params->num_groups); int groups = c_check_groups_for_conv(desc, params->num_groups);
if (groups == -1) if (groups == -1)
return 1; return 1;
...@@ -87,28 +88,29 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -87,28 +88,29 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / groups; size_t output_offset = PyGpuArray_STRIDE(*output, 0) / groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
size_t worksize = 0; size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey; std::string hashkey;
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
size_t free = c_get_largest_free_block_size(c); size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
if (params->choose_algo) { if (params->choose_algo) {
if (!reuse_algo) { if (!reuse_algo) {
char pci_id[16]; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id); gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups); hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
if (hashkey.empty()) if (hashkey.empty())
return 1; return 1;
hashkey = hash_prefix + pci_id + hashkey; hashkey = hash_prefix + pci_id + " " + hashkey;
// check out cache // check out cache
const AlgoRec* cached = dnn_conv_check_cache(hashkey); const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) { if (cached) {
...@@ -116,17 +118,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -116,17 +118,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
use_cached = 1; use_cached = 1;
} }
} }
if (reuse_algo || use_cached) { if (reuse_algo || use_cached) {
algo = (cudnnConvolutionFwdAlgo_t)prev_algo.algo; algo = (cudnnConvolutionFwdAlgo_t)prev_algo.algo;
worksize = prev_algo.wsSize; worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType; mathtype = prev_algo.mathType;
} else { } else {
if (params->choose_time) { if (params->choose_time) {
int count; int count;
cudnnConvolutionFwdAlgoPerf_t choice; cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
...@@ -142,9 +144,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -142,9 +144,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
free); free);
gpudata_release(tmpmem); gpudata_release(tmpmem);
// fprintf(stderr, "(cudnnFindConvolutionForwardAlgorithmEx: (err:%d), algo: %d, mem: %ld, free: %ld\n",
// err, choice.algo, choice.memory, free);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -152,14 +151,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -152,14 +151,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
...@@ -173,27 +164,37 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -173,27 +164,37 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} // Else, count is necessarly 1 for current implementation. } // Else, count is necessarly 1 for current implementation.
#endif #endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
// NB: It is added again later to the cache,
// so maybe this line could be removed.
} else { } else {
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
prev_algo.algo = algo; prev_algo.algo = algo;
// no tensor_op returned from Get() // no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH; prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
// fprintf(stderr, "(cudnnGetConvolutionForwardAlgorithm: (err:%d), algo: %d\n", err, algo);
} }
} }
} }
// if FindEx was used (choose_time), workspace size is set. // if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time)) if (!(reuse_algo || use_cached || params->choose_time))
{ {
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
...@@ -203,7 +204,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -203,7 +204,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
APPLY_SPECIFIC(output), APPLY_SPECIFIC(output),
algo, algo,
&worksize); &worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) { if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported // Fallback to none algo if not supported
#ifdef DEBUG #ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
...@@ -222,7 +223,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -222,7 +223,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
algo, algo,
&worksize); &worksize);
} }
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting worksize: %s", "error getting worksize: %s",
...@@ -232,33 +233,35 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -232,33 +233,35 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
// save worksize for next time/cache // save worksize for next time/cache
prev_algo.wsSize = worksize; prev_algo.wsSize = worksize;
// Add to the cache // Add to the cache, even if this node use *_once algo
// (in case the user specify the algo per layer and not globally).
if (params->choose_algo) if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo); dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (params->choose_algo) { if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1; return 1;
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n", fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
params->choose_algo ? "[A]": "" , algorithm_name,
params->choose_time ? "[T]": "" , params->choose_time ? "(timed)": "" ,
algo, // algorithm_name,
reuse_algo ? "(reused)" : "", reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "", use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str() mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize,
hashkey.c_str()
); );
} }
#endif #endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} }
{ {
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7 #if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type // CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType); err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -269,7 +272,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -269,7 +272,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
#endif #endif
/* /*
* This is less than ideal since we need to free it after (which * This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module * introduces a synchronization point. But we don't have a module
...@@ -284,7 +287,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -284,7 +287,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
} }
cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
...@@ -308,7 +311,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -308,7 +311,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
} }
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
......
...@@ -3,15 +3,16 @@ prev_algo.algo = PARAMS->conv_algo; ...@@ -3,15 +3,16 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT; prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0; reuse_algo = 0;
hash_prefix = std::string("GI| GPU#"); use_cached = 0;
#section support_code_struct hash_prefix = std::string("GI|GPU#");
#line 12 "dnn_gi.c" #section support_code_struct
#line 11 "dnn_gi.c"
int reuse_algo; int reuse_algo;
bool use_cached; bool use_cached;
AlgoRec prev_algo; AlgoRec prev_algo;
std::string hash_prefix; std::string hash_prefix;
int int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im, PyGpuArrayObject *im,
...@@ -72,7 +73,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -72,7 +73,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 0; return 0;
} }
int groups = c_check_groups_for_conv(desc, params->num_groups); int groups = c_check_groups_for_conv(desc, params->num_groups);
if (groups == -1) if (groups == -1)
return 1; return 1;
...@@ -90,19 +91,19 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -90,19 +91,19 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
size_t worksize = 0; size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey; std::string hashkey;
if (params->choose_algo && !reuse_algo) { if (params->choose_algo && !reuse_algo) {
char pci_id[16]; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id); gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache // check out cache
hashkey=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups); hashkey=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
if (hashkey.empty()) if (hashkey.empty())
return 1; return 1;
hashkey = hash_prefix + pci_id + hashkey; hashkey = hash_prefix + pci_id + " " + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey); const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) { if (cached) {
prev_algo = *cached; prev_algo = *cached;
...@@ -111,9 +112,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -111,9 +112,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
size_t free = c_get_largest_free_block_size(c); size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
if (params->choose_algo && !(reuse_algo || use_cached)) { if (params->choose_algo && !(reuse_algo || use_cached)) {
if (params->choose_time) { if (params->choose_time) {
int count; int count;
...@@ -140,15 +142,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -140,15 +142,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
} }
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found"); PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
...@@ -161,6 +154,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -161,6 +154,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} // Else, count is necessarly 1 for current implementation. } // Else, count is necessarly 1 for current implementation.
#endif #endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
} else { } else {
err = cudnnGetConvolutionBackwardDataAlgorithm( err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
...@@ -177,11 +179,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -177,11 +179,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH; prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
} }
} }
// if FindEx was used (choose_time), workspace size is set. // if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time)) if (!(reuse_algo || use_cached || params->choose_time))
{ {
err = cudnnGetConvolutionBackwardDataWorkspaceSize( err = cudnnGetConvolutionBackwardDataWorkspaceSize(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize); APPLY_SPECIFIC(input), algo, &worksize);
...@@ -200,7 +202,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -200,7 +202,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
// defined only for 2d filters // defined only for 2d filters
if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) { algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
// Extract the properties of the convolution descriptor // Extract the properties of the convolution descriptor
int nd; int nd;
int pad[2]; int pad[2];
...@@ -217,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -217,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
{ {
if (stride[0] != 1 || stride[1] != 1 || if (stride[0] != 1 || stride[1] != 1 ||
...@@ -240,31 +242,32 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -240,31 +242,32 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize); APPLY_SPECIFIC(input), algo, &worksize);
} }
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
// save worksize for next time/cache // save worksize for next time/cache
prev_algo.wsSize = worksize; prev_algo.wsSize = worksize;
// Add to the cache // Add to the cache
if (params->choose_algo) if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo); dnn_conv_update_cache(hashkey, prev_algo);
} // !(reuse_algo || use_cached || params->choose_time) } // !(reuse_algo || use_cached || params->choose_time)
#ifdef DEBUG #ifdef DEBUG
if (params->choose_algo) { if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name))
return 1; return 1;
// NB: This is printed only when algorithm is chosen at runtime. // NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n", fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
params->choose_algo ? "[A]": "" , algorithm_name,
params->choose_time ? "[T]": "" , params->choose_time ? "(timed)": "" ,
algo, // algorithm_name,
reuse_algo ? "(reused)" : "", reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "", use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str() mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize,
hashkey.c_str()
); );
} }
#endif #endif
...@@ -272,9 +275,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -272,9 +275,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} }
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7 #if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type // CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType); err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -285,7 +288,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -285,7 +288,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
} }
#endif #endif
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
......
...@@ -3,11 +3,11 @@ prev_algo.algo = PARAMS->conv_algo; ...@@ -3,11 +3,11 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT; prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0; reuse_algo = 0;
hash_prefix = std::string("GW| GPU#"); use_cached = 0;
hash_prefix = std::string("GW|GPU#");
#section support_code_struct #section support_code_struct
#line 11 "dnn_gw.c" #line 11 "dnn_gw.c"
int reuse_algo; int reuse_algo;
bool use_cached; bool use_cached;
AlgoRec prev_algo; AlgoRec prev_algo;
...@@ -91,16 +91,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -91,16 +91,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
size_t worksize = 0; size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey ; std::string hashkey ;
size_t free = c_get_largest_free_block_size(c); size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
cuda_enter(c->ctx);
if (params->choose_algo) { if (params->choose_algo) {
if (!reuse_algo) { if (!reuse_algo) {
char pci_id[16]; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id); gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
...@@ -115,12 +116,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -115,12 +116,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
use_cached = 1; use_cached = 1;
} }
} }
if (reuse_algo || use_cached) { if (reuse_algo || use_cached) {
algo = (cudnnConvolutionBwdFilterAlgo_t)prev_algo.algo; algo = (cudnnConvolutionBwdFilterAlgo_t)prev_algo.algo;
worksize = prev_algo.wsSize; worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType; mathtype = prev_algo.mathType;
} else { } else {
if (params->choose_time) { if (params->choose_time) {
int count; int count;
...@@ -148,15 +149,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -148,15 +149,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
} }
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found"); PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
...@@ -169,6 +161,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -169,6 +161,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} // Else, count is necessarly 1 for current implementation. } // Else, count is necessarly 1 for current implementation.
#endif #endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
} else { } else {
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
...@@ -181,73 +182,74 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -181,73 +182,74 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
prev_algo.algo = algo; prev_algo.algo = algo;
// no tensor_op returned from Get() // no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH; prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
} }
} }
} /* choose_algo */ } /* choose_algo */
// if FindEx was used (choose_time), workspace size is set. // if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time)) if (!(reuse_algo || use_cached || params->choose_time))
{ {
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize); APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
#ifdef DEBUG #ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1; return 1;
fprintf(stderr, "(%s error getting worksize:%s, falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0", fprintf(stderr, "(%s error getting worksize:%s, falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0",
algorithm_name, cudnnGetErrorString(err)); algorithm_name, cudnnGetErrorString(err));
#endif #endif
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize); APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
} }
// save worksize for next time/cache // save worksize for next time/cache
prev_algo.wsSize = worksize; prev_algo.wsSize = worksize;
// Add to the cache // Add to the cache
if (params->choose_algo) if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo); dnn_conv_update_cache(hashkey, prev_algo);
} }
#ifdef DEBUG #ifdef DEBUG
if (params->choose_algo) { if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1; return 1;
// NB: This is printed only when algorithm is chosen at runtime. // NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n", fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
params->choose_algo ? "[A]": "" , algorithm_name,
params->choose_time ? "[T]": "" , params->choose_time ? "(timed)": "" ,
algo, // algorithm_name,
reuse_algo ? "(reused)" : "", reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "", use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str() mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
); worksize,
hashkey.c_str()
);
} }
#endif #endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} }
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7 #if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type // CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, mathtype); err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
......
...@@ -399,7 +399,7 @@ class DnnBase(COp): ...@@ -399,7 +399,7 @@ class DnnBase(COp):
return [] return []
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version(), 3) return (super(DnnBase, self).c_code_cache_version(), version(), 1)
class GpuDnnConvDesc(COp): class GpuDnnConvDesc(COp):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论