Addressing code review comments

46e0dbdd · Boris Fomitchev · notoraptor · c7e02f24 · 46e0dbdd · 46e0dbdd
--- a/theano/gpuarray/c_code/cudnn_helper.h
+++ b/theano/gpuarray/c_code/cudnn_helper.h
@@ -11,6 +11,15 @@ static inline int cudnnGetVersion() {
 }
 #endif
+// When building against cuDNN older than v7, declare cudnnMathType_t locally
+// (presumably those headers do not provide it - TODO confirm) so that AlgoRec
+// below compiles either way.
+#if CUDNN_MAJOR < 7
+    enum cudnnMathType_t { CUDNN_DEFAULT_MATH=0, CUDNN_TENSOR_OP_MATH = 1 };
+#endif
+/* a common struct for all 3 CUDNN enums */
+// One remembered algorithm choice. The convolution code keeps the last choice
+// in a variable of this type and stores copies as values of the algorithm cache.
+struct AlgoRec {
+        int algo;                 // algorithm enum value, stored as a plain int and cast back by the user
+        cudnnDataType_t dataType; // data type recorded with the choice
+        size_t wsSize;            // workspace size that goes with `algo`
+        cudnnMathType_t mathType; // math type that goes with `algo`
+};
 #endif
--- a/theano/gpuarray/c_code/dnn_conv_base.c
+++ b/theano/gpuarray/c_code/dnn_conv_base.c
@@ -50,3 +50,205 @@ if (APPLY_SPECIFIC(output) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
 if (APPLY_SPECIFIC(kerns) != NULL)
  cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
+#section support_code
+#include <sstream>
+#include <vector>
+#include <string>
+#if __cplusplus < 201103L
+#include <tr1/unordered_map>
+typedef std::tr1::unordered_map<std::string, AlgoRec> AlgoCache;
+#else
+#include <unordered_map>
+typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
+#endif
+#include "pthread.h"
+#line 69 "dnn_conv_base.c"
+using std::vector;
+using std::string;
+// Global cache of algorithm choices (key string -> AlgoRec) and the mutex
+// that serialises access to it.
+// The mutex is initialised statically: POSIX requires a mutex to be
+// initialised before it is first locked, and nothing else in this file does so.
+pthread_mutex_t  algoMutex = PTHREAD_MUTEX_INITIALIZER;
+AlgoCache        algoCache;
+// Pass-through check for a cuDNN status code.
+// A status other than CUDNN_STATUS_SUCCESS raises a Python RuntimeError that
+// carries the cuDNN error string. The status is returned unchanged in every
+// case so that the call can be wrapped inline.
+static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err)
+{
+    const bool succeeded = (err == CUDNN_STATUS_SUCCESS);
+    if (succeeded)
+        return err;
+    PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s",
+                 cudnnGetErrorString(err));
+    return err;
+}
+// Return the size of the largest free memory block reported for the GPU
+// context `c`, or 4Mb when the context reports 0.
+//
+// The result is a size_t: the value read from the context is a size_t and the
+// callers store it in a size_t, so returning `int` truncated large sizes.
+//
+// If the property query fails a Python RuntimeError is set, but a size is
+// still returned (the fallback, when the queried value is still 0).
+// NOTE(review): callers that must abort on failure need to check the Python
+// error indicator themselves - confirm this is the intended contract.
+static size_t
+c_get_largest_free_block_size(PyGpuContextObject *c)
+{
+  size_t largest = 0;
+  const int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &largest);
+  if (err2 != GA_NO_ERROR) {
+    PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                 "memory information on the GPU");
+  }
+  // Guess 4Mb if the info is not available
+  if (largest == 0) largest = 4 * 1024 * 1024;
+  return largest;
+}
+// Render the first `size` entries of `res` as a comma-separated list
+// ("a,b,c"). A non-positive `size` yields an empty string.
+static std::string shape(int* res, int size)
+{
+    std::stringstream joined;
+    for (int idx = 0; idx < size; ++idx) {
+        // separator goes before every element except the first
+        if (idx != 0)
+            joined << ',';
+        joined << res[idx];
+    }
+    return joined.str();
+}
+// Describe a tensor descriptor as "<dims>,<strides>", each part being a
+// comma-separated list of integers.
+//
+// The descriptor is queried twice: first with no output arrays to learn the
+// number of dimensions, then with buffers of that size.
+// If either query fails, checkCudnnStatus has set a Python RuntimeError and an
+// empty string is returned; an empty string is also returned when the
+// descriptor reports no dimensions.
+//
+// Dims and strides are separated by a comma so that the two lists cannot run
+// into each other (without it "1,2" + "34,5" and "1,23" + "4,5" are the same
+// text, which makes the cache key ambiguous).
+static std::string shape(cudnnTensorDescriptor_t t)
+{
+    int nbDims = 0;  // stays 0 if the query below fails
+    cudnnDataType_t type;
+    if (checkCudnnStatus(cudnnGetTensorNdDescriptor(t, 0, &type, &nbDims, 0, 0)) != CUDNN_STATUS_SUCCESS)
+        return std::string();
+    if (nbDims <= 0)
+        return std::string();
+    std::vector<int> res(nbDims);
+    std::vector<int> stride(nbDims);
+    if (checkCudnnStatus(cudnnGetTensorNdDescriptor(t, (int)res.size(), &type, &nbDims,
+                                                    res.data(), stride.data())) != CUDNN_STATUS_SUCCESS)
+        return std::string();
+    // Use the buffer size, not the value written back by the second call.
+    const int count = (int)res.size();
+    return shape(&res[0], count) + "," + shape(&stride[0], count);
+}
+// Describe a filter descriptor as a comma-separated list of its dimensions.
+// The filter's data type is written to `*type` by the cuDNN query.
+//
+// Up to 8 dimensions are requested. If the query fails, checkCudnnStatus has
+// set a Python RuntimeError and an empty string is returned (in that case
+// `*type` may not have been written).
+static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
+{
+    const int maxDims = 8;
+    cudnnTensorFormat_t format;
+    int dims[maxDims] = {0};
+    int outDims = 0;  // stays 0 if the query below fails
+    if (checkCudnnStatus(cudnnGetFilterNdDescriptor(t, maxDims, type, &format, &outDims, dims)) != CUDNN_STATUS_SUCCESS)
+        return std::string();
+    // Never read past the buffer, whatever dimension count was reported.
+    if (outDims > maxDims)
+        outDims = maxDims;
+    return shape(dims, outDims);
+}
+// Describe a convolution descriptor as text: the mode ("conv" when the mode
+// value is 0, "corr" otherwise) followed by the padding, stride and dilation
+// lists. Up to 5 entries per list are requested from cuDNN.
+// A failing query sets a Python RuntimeError through checkCudnnStatus; the
+// description is still built from whatever the query left behind.
+static std::string shape(cudnnConvolutionDescriptor_t convDesc)
+{
+    const int maxDim = 5;
+    int nDim = 0;
+    cudnnConvolutionMode_t mode;
+    cudnnDataType_t computeType;
+    int padA[maxDim];
+    int strideA[maxDim];
+    int dilationA[maxDim];
+
+    const cudnnStatus_t status =
+        cudnnGetConvolutionNdDescriptor(convDesc, maxDim, &nDim,
+                                        padA, strideA, dilationA,
+                                        &mode, &computeType);
+    checkCudnnStatus(status);
+
+    std::string desc("-mode ");
+    if ((int)mode == 0)
+        desc += "conv";
+    else
+        desc += "corr";
+    desc += " -padA";
+    desc += shape(padA, nDim);
+    desc += " -convStrideA ";
+    desc += shape(strideA, nDim);
+    desc += " -dilationA ";
+    desc += shape(dilationA, nDim);
+    return desc;
+}
+// Tell whether all three addresses have their low bits clear: the low 7 bits
+// for CUDNN_DATA_HALF, the low 8 bits for every other data type.
+// Returns false as soon as any one of the pointers has one of those bits set.
+static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
+{
+        const size_t lowBits = (type == CUDNN_DATA_HALF) ? 0x7F : 0xFF;
+        // OR-ing the addresses keeps every low bit that is set in any of them.
+        const size_t combined = (size_t)in | (size_t)out | (size_t)filter;
+        return (combined & lowBits) == 0;
+}
+// Validate the output array against the output shape cuDNN computes for the
+// convolution, then build the textual description used as algorithm cache key.
+//
+// inputDesc/input, filterDesc/filter: cuDNN descriptors and the matching arrays.
+// convDesc: convolution descriptor.
+// output:   array whose dimensions are checked (4 or 5 dimensional filters).
+// groups:   number of groups; for 4d filters the second output dimension is
+//           divided by it before being compared with the cuDNN value.
+//
+// Returns the description, or an empty string with a Python exception set
+// when cuDNN reports an error or the output dimensions do not match.
+//
+// The description contains the group count, the input, filter and convolution
+// descriptions and, when the data pointers are not all aligned, the marker
+// " [unaligned] ".
+static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayObject* input,
+                                  cudnnFilterDescriptor_t filterDesc, PyGpuArrayObject* filter,
+                                  cudnnConvolutionDescriptor_t convDesc,
+                                  PyGpuArrayObject* output, int groups)
+{
+    // Initialised so that it is never read uninitialised if the filter query fails.
+    cudnnDataType_t  dType = CUDNN_DATA_FLOAT;
+    std::stringstream s;
+    int expected_output_dims[5] = {0};
+    cudnnStatus_t err = cudnnGetConvolutionNdForwardOutputDim(convDesc, inputDesc, filterDesc,
+                                                              PyGpuArray_NDIM(filter), expected_output_dims);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
+                   cudnnGetErrorString(err));
+      return "";
+    }
+    // In the messages below the expected dims are ints (%d) and the array dims
+    // are cast to long (%ld) so that specifiers and arguments always agree.
+    if (PyGpuArray_NDIM(filter) == 4) {
+      if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
+          (PyGpuArray_DIMS(output)[1] / groups  != expected_output_dims[1]) ||
+          (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
+          (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
+        // The check above compares output[1] / groups with the cuDNN value, so
+        // the expected full second dimension is the cuDNN value times groups.
+        PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
+                     " but received gradient with shape %ldx%ldx%ldx%ld",
+                     expected_output_dims[0], expected_output_dims[1] * groups,
+                     expected_output_dims[2], expected_output_dims[3],
+                     (long)PyGpuArray_DIMS(output)[0], (long)PyGpuArray_DIMS(output)[1],
+                     (long)PyGpuArray_DIMS(output)[2], (long)PyGpuArray_DIMS(output)[3]);
+        return "";
+      }
+    } else if (PyGpuArray_NDIM(filter) == 5) {
+      // NOTE(review): unlike the 4d case, groups is not taken into account here - confirm.
+      if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
+          (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
+          (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
+          (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
+          (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
+        PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
+                     " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
+                     expected_output_dims[0], expected_output_dims[1],
+                     expected_output_dims[2], expected_output_dims[3],
+                     expected_output_dims[4],
+                     (long)PyGpuArray_DIMS(output)[0], (long)PyGpuArray_DIMS(output)[1],
+                     (long)PyGpuArray_DIMS(output)[2], (long)PyGpuArray_DIMS(output)[3],
+                     (long)PyGpuArray_DIMS(output)[4]);
+        return "";
+      }
+    }
+    s << "-g" << groups << " -dimA" << shape(inputDesc) << " -filtA" <<
+      shape(filterDesc, &dType) << shape(convDesc);
+    // The shape() helpers report cuDNN failures by setting a Python exception;
+    // do not hand out a key built from a failed query.
+    if (PyErr_Occurred())
+      return "";
+    // Alignment is part of the key, so aligned and unaligned buffers get
+    // separate cache entries.
+    if (!all_aligned(dType, PyGpuArray_DEV_DATA(input), PyGpuArray_DEV_DATA(output), PyGpuArray_DEV_DATA(filter)))
+    {
+      s << " [unaligned] ";
+    }
+    return s.str();
+}
+// Store `rec` in the global algorithm cache under key `hash`, replacing any
+// record already stored for that key. The cache mutex is held for the whole
+// update.
+static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
+{
+  pthread_mutex_lock(&algoMutex);
+  AlgoCache::iterator slot = algoCache.find(hash);
+  if (slot == algoCache.end())
+    algoCache.insert(AlgoCache::value_type(hash, rec));
+  else
+    slot->second = rec;
+  pthread_mutex_unlock(&algoMutex);
+}
+// Look up `hash` in the global algorithm cache.
+// Returns a pointer to the cached record, or 0 when the key is not present.
+//
+// The lookup runs with the cache mutex held, but the returned pointer refers
+// to the entry stored inside the cache and is used after the mutex has been
+// released.
+// NOTE(review): callers should copy the record right away; a concurrent
+// dnn_conv_update_cache() for the same key could modify it - confirm whether
+// that can happen in practice.
+static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
+{
+  const AlgoRec* ret = 0;
+  pthread_mutex_lock(&algoMutex);
+  AlgoCache::iterator hit = algoCache.find(hash);
+  if (hit != algoCache.end())
+    ret = &hit->second;
+  pthread_mutex_unlock(&algoMutex);
+  return ret;
+}
--- a/theano/gpuarray/c_code/dnn_conv_find.c
+++ b/theano/gpuarray/c_code/dnn_conv_find.c
@@ -3,7 +3,6 @@
 #include <sstream>
 #include <vector>
 #include <string>
-#include "dnn_conv_find.h"
 #if __cplusplus < 201103L
 #include <tr1/unordered_map>
 typedef std::tr1::unordered_map<std::string, AlgoRec> AlgoCache;

--- a/theano/gpuarray/c_code/dnn_conv_find.h
+++ b/theano/gpuarray/c_code/dnn_conv_find.h
 #pragma once
-#include <string>
 #include <cuda.h>
-#include <cudnn.h>
+#include <cudnn.h>    
-#if CUDNN_MAJOR < 7
-    enum cudnnMathType_t { CUDNN_DEFAULT_MATH=0, CUDNN_TENSOR_OP_MATH = 1 };
-#endif
-inline cudnnStatus_t checkCudnnStatus(cudnnStatus_t err)
-{
-    if (err != CUDNN_STATUS_SUCCESS) {
-        PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s",
-                     cudnnGetErrorString(err));
-    }    
-    return err;
-}
-/* a common struct for all 3 CUDNN enums */
-struct AlgoRec {
-        int algo;
-        cudnnDataType_t dataType;
-        size_t wsSize;
-        cudnnMathType_t mathType;
-};
--- a/theano/gpuarray/c_code/dnn_fwd.c
+++ b/theano/gpuarray/c_code/dnn_fwd.c
--- a/theano/gpuarray/c_code/dnn_gi.c
+++ b/theano/gpuarray/c_code/dnn_gi.c
@@ -3,18 +3,15 @@ prev_algo.algo = PARAMS->conv_algo;
 prev_algo.mathType = CUDNN_DEFAULT_MATH;
 prev_algo.dataType = CUDNN_DATA_FLOAT;
 reuse_algo = 0;
-memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
+hash_prefix = std::string("GI| GPU#");
-memset(prev_top_dims, 0, sizeof(prev_top_dims));
 #section support_code_struct
-#include "dnn_conv_find.h"
 #line 12 "dnn_gi.c"
 int     reuse_algo;
 bool    use_cached;
 AlgoRec prev_algo;
-size_t prev_kern_dims[5];
+std::string hash_prefix;
-size_t prev_top_dims[5];
 int
 APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                        PyGpuArrayObject *im,
@@ -97,51 +94,27 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
  std::string hashkey;
-  if (params->choose_algo) {
-    if (!params->choose_once) {
-      reuse_algo = 1;
-      for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
-        reuse_algo = (reuse_algo &&
-                      PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
-        reuse_algo = (reuse_algo &&
-                      PyGpuArray_DIM(output, i) == prev_top_dims[i]);
-      }
-    }    
-    if (!reuse_algo) {
-      char pci_id[16];
-      gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
-      // check out cache
-      hashkey+=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
-      if (hashkey.empty())
-	return 1;
-      hashkey =  std::string("GI| GPU#") + pci_id + hashkey;
+  if (params->choose_algo && !reuse_algo) {
-      const AlgoRec* cached = dnn_conv_check_cache(hashkey);
+    char pci_id[16];
-      if (cached) {
+    gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);    
-	prev_algo = *cached;
+    // check out cache
-	use_cached = 1;
+    hashkey=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
-      }
+    if (hashkey.empty())
+      return 1;
+    hashkey = hash_prefix + pci_id + hashkey;
+    const AlgoRec* cached = dnn_conv_check_cache(hashkey);
+    if (cached) {
+      prev_algo = *cached;
+      use_cached = 1;
    }
+  }
-    cuda_enter(c->ctx);
-    if (!(reuse_algo || use_cached)) {
-      size_t free;
-      int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
-      if (err2 != GA_NO_ERROR) {
-        PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                     "memory information on the GPU");
-        cuda_exit(c->ctx);
-        return 1;
-      }
-      // Guess 4Mb if the info is not available
-      if (free == 0) free = 4 * 1024 * 1024;
+  size_t free = c_get_largest_free_block_size(c);
+  cuda_enter(c->ctx);
+  if (params->choose_algo && !(reuse_algo || use_cached)) {
    if (params->choose_time) {
      int count;
      cudnnConvolutionBwdDataAlgoPerf_t choice;
@@ -170,8 +143,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
      algo = choice.algo;
        prev_algo.algo = (int)algo;
        prev_algo.wsSize = worksize = choice.memory;
+#if CUDNN_MAJOR >= 7
        prev_algo.mathType = mathtype = choice.mathType;
+#endif
        // Add to the cache
 	dnn_conv_update_cache(hashkey, prev_algo);
@@ -202,99 +176,103 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
      // no tensor_op returned from Get()
      prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
    }
-    }
-  } else { /*choose_algo */
-  // The FFT implementation does not support strides, 1x1 filters or inputs
-  // with a spatial dimension larger than 1024. The tiled-FFT implementation
-  // does not support strides.
-  // If the chosen implementation is FFT or tiled-FFT, validate that it can
-  // be used on the current data and default to a safe implementation if it
-  // can't.
-  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
-  // defined only for 2d filters
-  if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
-       algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
-    // Extract the properties of the convolution descriptor
-    int nd;
-    int pad[2];
-    int stride[2];
-    int upscale[2];
-    cudnnConvolutionMode_t mode;
-    cudnnDataType_t data_type;
-    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
-                                             upscale, &mode, &data_type);
-    if (err != CUDNN_STATUS_SUCCESS) {
-      PyErr_Format(PyExc_RuntimeError,
-                   "error getting convolution properties: %s",
-                   cudnnGetErrorString(err));
-      cuda_exit(c->ctx);
-      return 1;
-    }
-    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
-    {
-      if (stride[0] != 1 || stride[1] != 1 ||
-          PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
-          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
-      {
-        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-      }
-    }
-    else
-    {
-      // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
-      if (stride[0] != 1 || stride[1] != 1)
-      {
-        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
-      }
-    }
  }
-  } /* choose_algo */
  // if FindEx was used (choose_time), workspace size is set. 
  if (!(reuse_algo || use_cached || params->choose_time))
-    {
+  {
-  err = cudnnGetConvolutionBackwardDataWorkspaceSize(
+    err = cudnnGetConvolutionBackwardDataWorkspaceSize(
    params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
    APPLY_SPECIFIC(input), algo, &worksize);
-  if (err != CUDNN_STATUS_SUCCESS) {
+    if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
+      PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
-                 cudnnGetErrorString(err));
+                   cudnnGetErrorString(err));
-    cuda_exit(c->ctx);
-    return 1;
+      // The FFT implementation does not support strides, 1x1 filters or inputs
-  }
+      // with a spatial dimension larger than 1024. The tiled-FFT implementation
-  // save worksize for next time/cache
+      // does not support strides.
-  prev_algo.wsSize = worksize;
+      // If the chosen implementation is FFT or tiled-FFT, validate that it can
+      // be used on the current data and default to a safe implementation if it
-  // Add to the cache
+      // can't.
-  if (params->choose_algo)
+      // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
+      // defined only for 2d filters
+      if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
+           algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
+        // Extract the properties of the convolution descriptor
+        int nd;
+        int pad[2];
+        int stride[2];
+        int upscale[2];
+        cudnnConvolutionMode_t mode;
+        cudnnDataType_t data_type;
+        err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
+                                              upscale, &mode, &data_type);
+        if (err != CUDNN_STATUS_SUCCESS) {
+          PyErr_Format(PyExc_RuntimeError,
+                       "error getting convolution properties: %s",
+                       cudnnGetErrorString(err));
+          cuda_exit(c->ctx);
+          return 1;
+        }
+        if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
+        {
+          if (stride[0] != 1 || stride[1] != 1 ||
+              PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
+              (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
+          {
+            algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+          }
+        }
+        else
+        {
+          // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
+          if (stride[0] != 1 || stride[1] != 1)
+          {
+            algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+          }
+        }
+      }
+      err = cudnnGetConvolutionBackwardDataWorkspaceSize(
+        params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
+        APPLY_SPECIFIC(input), algo, &worksize);
+    }
+    if (err != CUDNN_STATUS_SUCCESS) {      
+      cuda_exit(c->ctx);
+      return 1;
+    }
+    // save worksize for next time/cache
+    prev_algo.wsSize = worksize;
+    // Add to the cache
+    if (params->choose_algo)
      dnn_conv_update_cache(hashkey, prev_algo);
-    }  
+  }  // !(reuse_algo || use_cached || params->choose_time)
-    #ifdef DEBUG
+#ifdef DEBUG
-    char algorithm_name[128];
+  if (params->choose_algo) { 
    if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name))
        return 1;
    // NB: This is printed only when algorithm is chosen at runtime.
-    if (reuse_algo)
+    fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n",
-        fprintf(stderr, "(reused %s)\n", algorithm_name);
+            params->choose_algo ? "[A]": "" ,
-    else
+            params->choose_time ? "[T]": "" ,
-        fprintf(stderr, "(using %s)\n", algorithm_name);
+            algo, // algorithm_name,
-    #endif
+            reuse_algo ? "(reused)" : "",
+            use_cached ? "(cache)": "",
+            worksize, mathtype, hashkey.c_str()
+      );
+  }
+#endif
    if (params->choose_once) {
      reuse_algo = 1;
-    } else {
-      for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
-        prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
-        prev_top_dims[i] = PyGpuArray_DIM(output, i);
-      }
    }
    gpudata *workspace = 0;  
 #if CUDNN_MAJOR >= 7    
    // CUDNN7: need to set math type

--- a/theano/gpuarray/c_code/dnn_gw.c
+++ b/theano/gpuarray/c_code/dnn_gw.c
@@ -3,17 +3,15 @@ prev_algo.algo = PARAMS->conv_algo;
 prev_algo.mathType = CUDNN_DEFAULT_MATH;
 prev_algo.dataType = CUDNN_DATA_FLOAT;
 reuse_algo = 0;
-memset(prev_img_dims, 0, sizeof(prev_img_dims));
+hash_prefix = std::string("GW| GPU#");
-memset(prev_top_dims, 0, sizeof(prev_top_dims));
 #section support_code_struct
 #line 11 "dnn_gw.c"
-#include "dnn_conv_find.h"
 int     reuse_algo;
 bool    use_cached;
 AlgoRec prev_algo;
-size_t prev_img_dims[5];
+std::string hash_prefix;
-size_t prev_top_dims[5];
 int
 APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
@@ -95,28 +93,21 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  #endif
  size_t   worksize  = 0;
  cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
  std::string hashkey ;
-  cuda_enter(c->ctx);
+  size_t free = c_get_largest_free_block_size(c);
+  cuda_enter(c->ctx);  
  if (params->choose_algo) {
-    if (!params->choose_once) {
-      reuse_algo = 1;
-      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
-        reuse_algo = (reuse_algo &&
-                      PyGpuArray_DIM(input, i) == prev_img_dims[i]);
-        reuse_algo = (reuse_algo &&
-                      PyGpuArray_DIM(output, i) == prev_top_dims[i]);
-      }
-    }
    if (!reuse_algo) {
      char pci_id[16];
      gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
      hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups);
      if (hashkey.empty())
-	return 1;
+        return 1;
-      hashkey =  std::string("GW| GPU#") + pci_id + hashkey;
+      hashkey =  hash_prefix + pci_id + hashkey;
      // check out cache
      const AlgoRec* cached = dnn_conv_check_cache(hashkey);
      if (cached) {
@@ -124,21 +115,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
        use_cached = 1;
      }
    }
-    if (!(reuse_algo || use_cached)) {
-      size_t free;
+    if (reuse_algo || use_cached) {
+      algo = (cudnnConvolutionBwdFilterAlgo_t)prev_algo.algo;
-      int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
+      worksize = prev_algo.wsSize;
-      if (err2 != GA_NO_ERROR) {
+      mathtype = prev_algo.mathType;
-        PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+    } else { 
-                     "memory information on the GPU");
-        cuda_exit(c->ctx);
-        return 1;
-      }
-      // Guess 4Mb if the info is not available
-      if (free == 0) free = 4 * 1024 * 1024;
      if (params->choose_time) {
        int count;
@@ -169,8 +151,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
        algo = choice.algo;
        prev_algo.algo = (int)algo;
        prev_algo.wsSize = worksize = choice.memory;
+#if CUDNN_MAJOR >= 7        
        prev_algo.mathType = mathtype = choice.mathType;
+#endif
        // Add to the cache
        dnn_conv_update_cache(hashkey, prev_algo);
@@ -202,89 +185,65 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
 	// no tensor_op returned from Get()
 	prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
      }
-    } else { 
-      algo = (cudnnConvolutionBwdFilterAlgo_t)prev_algo.algo;
-      worksize = prev_algo.wsSize;
-      mathtype = prev_algo.mathType;
-    }
-  } else {
-  // The FFT implementation does not support strides, 1x1 filters or inputs
-  // with a spatial dimension larger than 1024.
-  // If the chosen implementation is FFT, validate that it can
-  // be used on the current data and default to a safe implementation if it
-  // can't.
-  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
-  // defined only for 2d filters
-  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
-      PyGpuArray_NDIM(input) == 4) {
-    // Extract the properties of the convolution descriptor
-    int nd;
-    int pad[2];
-    int stride[2];
-    int upscale[2];
-    cudnnConvolutionMode_t mode;
-    cudnnDataType_t data_type;
-    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
-                                             upscale, &mode, &data_type);
-    if (err != CUDNN_STATUS_SUCCESS) {
-      PyErr_Format(PyExc_RuntimeError,
-                   "error getting convolution properties: %s",
-                   cudnnGetErrorString(err));
-      cuda_exit(c->ctx);
-      return 1;
-    }
-    if (stride[0] != 1 || stride[1] != 1 ||
-        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
-        (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) {
-      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
    }
-  }
+  } /* choose_algo */
-  }/* choose_algo */
  // if FindEx was used (choose_time), workspace size is set. 
  if (!(reuse_algo || use_cached || params->choose_time))
    {
-  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
+      err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
-    params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
+        params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
-    APPLY_SPECIFIC(kerns), algo, &worksize);
+        APPLY_SPECIFIC(kerns), algo, &worksize);
-  if (err != CUDNN_STATUS_SUCCESS) {
+      if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
+#ifdef DEBUG
-                 cudnnGetErrorString(err));
+        if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
-      cuda_exit(c->ctx);
+          return 1;
-    return 1;
+        fprintf(stderr, "(%s error getting worksize:%s, falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0",
-  }
+                algorithm_name, cudnnGetErrorString(err));
-  // save worksize for next time/cache
+#endif        
-  prev_algo.wsSize = worksize;
+        algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+        err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
-  // Add to the cache
+          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
-  if (params->choose_algo)
+          APPLY_SPECIFIC(kerns), algo, &worksize);
-    dnn_conv_update_cache(hashkey, prev_algo);
+        if (err != CUDNN_STATUS_SUCCESS) {
+          PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
+                       cudnnGetErrorString(err));
+          cuda_exit(c->ctx);
+          return 1;
+        }
+      }
+      // save worksize for next time/cache
+      prev_algo.wsSize = worksize;
+      // Add to the cache
+      if (params->choose_algo)
+        dnn_conv_update_cache(hashkey, prev_algo);
    }
 #ifdef DEBUG  
-  if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
+  if (params->choose_algo) { 
-    return 1;
+    if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
-  // NB: This is printed only when algorithm is chosen at runtime.
+      return 1;
-  fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n",
+    // NB: This is printed only when algorithm is chosen at runtime.
-	  params->choose_algo ? "[A]": "" ,
+    fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n",
-	  params->choose_time ? "[T]": "" ,
+            params->choose_algo ? "[A]": "" ,
-	  algo, // algorithm_name,
+            params->choose_time ? "[T]": "" ,
-	  reuse_algo ? "(reused)" : "",
+            algo, // algorithm_name,
-	  use_cached ? "(cache)": "",
+            reuse_algo ? "(reused)" : "",
-	  worksize, mathtype, hashkey.c_str()
+            use_cached ? "(cache)": "",
-	  );
+            worksize, mathtype, hashkey.c_str()
+      );
+  }
 #endif
    if (params->choose_once) {
      reuse_algo = 1;
-    } else {
-      for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
-        prev_img_dims[i] = PyGpuArray_DIM(input, i);
-        prev_top_dims[i] = PyGpuArray_DIM(output, i);
-      }
    }
    gpudata *workspace = 0;  

--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -567,7 +567,7 @@ class GpuDnnConv(DnnBase):
                             num_groups=int_t)
    def __init__(self, algo=None, inplace=False, num_groups=1):
-        DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_conv_find.c", "c_code/dnn_fwd.c"],
+        DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_fwd.c"],
                         "APPLY_SPECIFIC(conv_fwd)")
        if algo is None:
@@ -710,7 +710,7 @@ class GpuDnnConvGradW(DnnBase):
                             num_groups=int_t)
    def __init__(self, inplace=False, algo=None, num_groups=1):
-        DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_conv_find.c", "c_code/dnn_gw.c"],
+        DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_gw.c"],
                         "APPLY_SPECIFIC(conv_gw)")
        self.inplace = bool(inplace)
        if self.inplace:
@@ -846,7 +846,7 @@ class GpuDnnConvGradI(DnnBase):
                             num_groups=int_t)
    def __init__(self, inplace=False, algo=None, num_groups=1):
-        DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_conv_find.c", "c_code/dnn_gi.c"],
+        DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_gi.c"],
                         "APPLY_SPECIFIC(conv_gi)")
        self.inplace = bool(inplace)
        if self.inplace: