提交 bacc5f6f authored 作者: Boris Fomitchev's avatar Boris Fomitchev 提交者: notoraptor

Tensor op, cache

上级 2d3ab3b0
#section support_code
#include <cuda.h>
#include <sstream>
#include <vector>
#include <string>
#include "dnn_conv_find.h"
#if __cplusplus < 201103L
#include <tr1/unordered_map>
typedef std::tr1::unordered_map<std::string, AlgoRec> AlgoCache;
#else
#include <unordered_map>
typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#endif
#include "pthread.h"
#line 10 "dnn_conv_find.cc"
using std::vector;
using std::string;
// Protects all access to algoCache.
// PTHREAD_MUTEX_INITIALIZER is the portable way to statically initialize a
// pthread mutex; relying on zero-initialization of a pthread_mutex_t with
// static storage is not guaranteed by POSIX.
pthread_mutex_t algoMutex = PTHREAD_MUTEX_INITIALIZER;
// Process-wide cache mapping a convolution "shape key" string to the
// algorithm record previously selected for that shape.
AlgoCache algoCache;
// Render the first `size` entries of `res` as a comma-separated list,
// e.g. {4,7,9} -> "4,7,9". A non-positive `size` yields "".
static std::string shape(int* res, int size)
{
    std::stringstream out;
    for (int i = 0; i < size; ++i) {
        if (i)
            out << ',';
        out << res[i];
    }
    return out.str();
}
// Build the textual key fragment for a cuDNN tensor descriptor:
// its dimensions followed by its strides, each comma-separated.
static std::string shape(cudnnTensorDescriptor_t t)
{
std::vector<int> res;
std::vector<int> stride;
int nbDims;
cudnnDataType_t type;
// First query with nbDimsRequested=0 only to learn the rank; the dim and
// stride output arrays are not filled, so null pointers are passed.
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, 0, &type, &nbDims,0,0));
res.resize(nbDims);
stride.resize(nbDims);
// Second query retrieves the actual dimensions and strides.
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, nbDims, &type, &nbDims, res.data(), stride.data()));
return shape(&res[0], nbDims) + shape(&stride[0], nbDims);
};
// Build the textual key fragment for a cuDNN filter descriptor, and report
// the filter's data type through `type` (the caller uses it to decide the
// alignment requirement). Queries up to 8 dimensions.
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{
cudnnTensorFormat_t format;
int sizes = 8;
std::vector<int> res(sizes);
int outDims;
checkCudnnStatus(cudnnGetFilterNdDescriptor(t, sizes, type, &format, &outDims, res.data()));
return shape(&res[0], outDims);
};
// Build the textual key fragment for a cuDNN convolution descriptor:
// mode, padding, strides and dilation (up to 5 dimensions).
static std::string shape(cudnnConvolutionDescriptor_t convDesc)
{
const int maxDim = 5;
int nDim=0;
cudnnConvolutionMode_t mode;
cudnnDataType_t computeType;
int padA[maxDim];
int strideA[maxDim];
int dilationA[maxDim];
checkCudnnStatus(
cudnnGetConvolutionNdDescriptor( convDesc, maxDim,
&nDim,
&padA[0],
&strideA[0],
&dilationA[0],
&mode,
&computeType ));
// mode value 0 is CUDNN_CONVOLUTION ("conv"); anything else is rendered as
// cross-correlation ("corr").
return std::string("-mode ") + (((int)mode==0) ? "conv" : "corr") + " -padA" + shape(padA,nDim) + " -convStrideA " + shape(strideA, nDim) + " -dilationA " + shape(dilationA, nDim);
}
// True when the input, output and filter pointers are all aligned to the
// boundary used for this data type (128 bytes for half, 256 otherwise).
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
{
    // there have to be entries for both aligned and not
    const size_t alignMask = (type == CUDNN_DATA_HALF) ? 0x7F : 0xFF;
    const size_t combined = (size_t)in | (size_t)out | (size_t)filter;
    return (combined & alignMask) == 0;
}
// Compose the cache-key string describing one convolution: input
// dims/strides, filter dims, convolution parameters, and whether all data
// pointers meet the alignment requirement. Alignment is part of the key
// because an algorithm chosen for aligned buffers may not apply to
// unaligned ones (see the comment below).
std::string dnn_conv_shape(cudnnTensorDescriptor_t input, void* in,
cudnnFilterDescriptor_t filterDesc, void* filter,
cudnnConvolutionDescriptor_t convDesc,
void* out)
{
cudnnDataType_t dType;
std::stringstream s;
s << " -dimA" << shape(input) << " -filtA" << shape(filterDesc, &dType) << shape(convDesc);
// there have to be entries for both aligned and not
if (!all_aligned(dType, in, out, filter))
{
s << " [unaligned] ";
}
return std::string(s.str().c_str());
}
// Insert or overwrite the cached algorithm record for `hash`.
// Serialized with dnn_conv_check_cache via algoMutex.
void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
{
pthread_mutex_lock(&algoMutex);
algoCache[hash] = rec;
pthread_mutex_unlock(&algoMutex);
}
// Look up `hash` in the global algorithm cache.
// Returns a pointer to the cached record, or 0 on a miss. Returning the
// pointer after unlocking is safe here: std::unordered_map never
// invalidates references to elements on insert/rehash, and this cache
// never erases entries.
const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{
pthread_mutex_lock(&algoMutex);
const AlgoRec* ret = 0;
AlgoCache::iterator hit = algoCache.find(hash);
if (hit != algoCache.end())
ret = &hit->second;
pthread_mutex_unlock(&algoMutex);
return ret;
}
#section support_code
#include <cuda.h>
#include <mutex>
#include <sstream>
#include <vector>
#include <string>
#include <unordered_map>
#include "dnn_conv_find.h"
#line 10 "dnn_conv_find.cc"
using std::vector;
using std::string;
using std::unique_lock;
using std::mutex;
// Process-wide cache mapping a convolution "shape key" string to the
// algorithm record previously selected for that shape.
typedef std::unordered_map<string, AlgoRec> AlgoCache;
// Guards all access to algoCache.
mutex algoMutex;
AlgoCache algoCache;
// Format the first `size` ints of `res` as "a,b,c"; empty when size <= 0.
static std::string shape(int* res, int size)
{
    std::stringstream buf;
    const char* sep = "";
    for (int i = 0; i < size; ++i) {
        buf << sep << res[i];
        sep = ",";
    }
    return buf.str();
}
// Build the textual key fragment for a cuDNN tensor descriptor:
// its dimensions followed by its strides, each comma-separated.
static std::string shape(cudnnTensorDescriptor_t t)
{
std::vector<int> res;
std::vector<int> stride;
int nbDims;
cudnnDataType_t type;
// First query with nbDimsRequested=0 only to learn the rank; the dim and
// stride output arrays are not filled, so null pointers are passed.
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, 0, &type, &nbDims, nullptr, nullptr));
res.resize(nbDims);
stride.resize(nbDims);
// Second query retrieves the actual dimensions and strides.
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, nbDims, &type, &nbDims, res.data(), stride.data()));
return shape(&res[0], nbDims) + shape(&stride[0], nbDims);
};
// Build the textual key fragment for a cuDNN filter descriptor, and report
// the filter's data type through `type` (the caller uses it to decide the
// alignment requirement). Queries up to 8 dimensions.
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{
cudnnTensorFormat_t format;
int sizes = 8;
std::vector<int> res(sizes);
int outDims;
checkCudnnStatus(cudnnGetFilterNdDescriptor(t, sizes, type, &format, &outDims, res.data()));
return shape(&res[0], outDims);
};
// Build the textual key fragment for a cuDNN convolution descriptor:
// mode, padding, strides and dilation (up to 5 dimensions).
static std::string shape(cudnnConvolutionDescriptor_t convDesc)
{
const int maxDim = 5;
int nDim=0;
cudnnConvolutionMode_t mode;
cudnnDataType_t computeType;
int padA[maxDim];
int strideA[maxDim];
int dilationA[maxDim];
checkCudnnStatus(
cudnnGetConvolutionNdDescriptor( convDesc, maxDim,
&nDim,
&padA[0],
&strideA[0],
&dilationA[0],
&mode,
&computeType ));
// mode value 0 is CUDNN_CONVOLUTION ("conv"); anything else is rendered as
// cross-correlation ("corr").
return std::string("-mode ") + (((int)mode==0) ? "conv" : "corr") + " -padA" + shape(padA,nDim) + " -convStrideA " + shape(strideA, nDim) + " -dilationA " + shape(dilationA, nDim);
}
// True when the input, output and filter pointers are all aligned to the
// boundary used for this data type (128 bytes for half, 256 otherwise).
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
{
    // there have to be entries for both aligned and not
    const size_t alignMask = (type == CUDNN_DATA_HALF) ? 0x7F : 0xFF;
    const size_t combined = (size_t)in | (size_t)out | (size_t)filter;
    return (combined & alignMask) == 0;
}
// Compose the cache-key string describing one convolution: input
// dims/strides, filter dims, convolution parameters, and whether all data
// pointers meet the alignment requirement. Alignment is part of the key
// because an algorithm chosen for aligned buffers may not apply to
// unaligned ones (see the comment below).
std::string dnn_conv_shape(cudnnTensorDescriptor_t input, void* in,
cudnnFilterDescriptor_t filterDesc, void* filter,
cudnnConvolutionDescriptor_t convDesc,
void* out)
{
cudnnDataType_t dType;
std::stringstream s;
s << " -dimA" << shape(input) << " -filtA" << shape(filterDesc, &dType) << shape(convDesc);
// there have to be entries for both aligned and not
if (!all_aligned(dType, in, out, filter))
{
s << " [unaligned] ";
}
return std::string(s.str().c_str());
}
// Insert or overwrite the cached algorithm record for `hash`.
// Serialized with dnn_conv_check_cache via algoMutex.
void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
{
    std::lock_guard<std::mutex> guard(algoMutex);
    algoCache[hash] = rec;
}
// Look up `hash` in the global algorithm cache.
// Returns a pointer to the cached record, or nullptr on a miss. Returning
// the pointer while still holding (or after releasing) the lock is safe:
// std::unordered_map never invalidates references to elements on
// insert/rehash, and this cache never erases entries.
const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{
unique_lock<mutex> lock(algoMutex);
AlgoCache::iterator hit = algoCache.find(hash);
if (hit != algoCache.end())
return &hit->second;
return nullptr;
}
#pragma once
#include <string>
#include <cuda.h>
#include <cudnn.h>
// cuDNN releases before 7 have no math-type selection; define the enum
// here so the rest of the code compiles and always records default math.
#if CUDNN_MAJOR < 7
enum cudnnMathType_t { CUDNN_DEFAULT_MATH=0, CUDNN_TENSOR_OP_MATH = 1 };
#endif
// Convert a failing cuDNN status into a Python RuntimeError.
// NOTE(review): this only sets the Python error indicator; it neither
// returns a failure code nor aborts, so execution continues after an
// error and callers must check PyErr_Occurred() themselves — confirm
// that this best-effort behavior is intended.
inline void checkCudnnStatus(cudnnStatus_t err)
{
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s",
cudnnGetErrorString(err));
}
}
/* a common struct for all 3 CUDNN enums */
// Cached result of algorithm selection for one convolution shape.
struct AlgoRec {
int algo;  // stored as int so fwd/bwd-data/bwd-filter algo enums all fit
cudnnDataType_t dataType;
size_t wsSize;  // workspace size in bytes required by `algo`
cudnnMathType_t mathType;  // tensor-op vs. default math selection
};
const AlgoRec* dnn_conv_check_cache(const std::string&);
std::string dnn_conv_shape(cudnnTensorDescriptor_t input, void* in,
cudnnFilterDescriptor_t filterDesc, void* filter,
cudnnConvolutionDescriptor_t convDesc,
void* out);
void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec);
#section init_code_struct #section init_code_struct
reuse_algo = 0; reuse_algo = 0;
prev_algo = PARAMS->conv_algo; prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
memset(prev_img_dims, 0, sizeof(prev_img_dims)); memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims)); memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#section support_code_struct #section support_code_struct
#line 12 "dnn_fwd.c"
int reuse_algo; #include "dnn_conv_find.h"
cudnnConvolutionFwdAlgo_t prev_algo; int reuse_algo;
size_t prev_img_dims[5]; bool use_cached;
size_t prev_kern_dims[5]; AlgoRec prev_algo;
size_t prev_img_dims[5];
size_t prev_kern_dims[5];
int int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...@@ -84,12 +88,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -84,12 +88,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey = "F| GPU#";
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
cuda_enter(c->ctx); cuda_enter(c->ctx);
if (params->choose_algo) { if (params->choose_algo) {
if (!params->choose_once) { if (!params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
...@@ -100,10 +108,26 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -100,10 +108,26 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
} }
} }
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
hashkey += pci_id;
hashkey += dnn_conv_shape(APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, PyGpuArray_DEV_DATA(*output));
if (!reuse_algo) { if (!reuse_algo) {
// check out cache
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
if (!(reuse_algo || use_cached)) {
size_t free; size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) { if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
...@@ -125,6 +149,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -125,6 +149,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1; return -1;
} }
// We don't sync the buffer as we don't care about the values. // We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx( err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
...@@ -134,6 +159,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -134,6 +159,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
free); free);
gpudata_release(tmpmem); gpudata_release(tmpmem);
// fprintf(stderr, "(cudnnFindConvolutionForwardAlgorithmEx: (err:%d), algo: %d, mem: %ld, free: %ld\n",
// err, choice.algo, choice.memory, free);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -142,6 +170,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -142,6 +170,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
algo = choice.algo; algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
prev_algo.mathType = mathtype = choice.mathType;
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
...@@ -167,31 +201,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -167,31 +201,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
// fprintf(stderr, "(cudnnGetConvolutionForwardAlgorithm: (err:%d), algo: %d\n", err, algo);
} }
prev_algo = algo; } else {
} else { algo = (cudnnConvolutionFwdAlgo_t)prev_algo.algo;
algo = prev_algo; worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType;
} }
} else { /* choose_algo */
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
}
}
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */ /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
if (PyGpuArray_NDIM(input) == 5 && if (PyGpuArray_NDIM(input) == 5 &&
...@@ -269,10 +289,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -269,10 +289,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
} }
} }
}/* choose_algo */
// if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{ {
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
...@@ -309,7 +330,52 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -309,7 +330,52 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n",
params->choose_algo ? "[A]": "" ,
params->choose_time ? "[T]": "" ,
algo, // algorithm_name,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str()
);
}
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
}
{
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
/* /*
* This is less than ideal since we need to free it after (which * This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module * introduces a synchronization point. But we don't have a module
...@@ -324,7 +390,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -324,7 +390,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
} }
cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
...@@ -348,6 +414,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -348,6 +414,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
} }
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -357,3 +424,5 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -357,3 +424,5 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
return 0; return 0;
} }
#section init_code_struct #section init_code_struct
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0; reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
memset(prev_kern_dims, 0, sizeof(prev_kern_dims)); memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims)); memset(prev_top_dims, 0, sizeof(prev_top_dims));
#section support_code_struct #section support_code_struct
#include "dnn_conv_find.h"
int reuse_algo; #line 12 "dnn_gi.c"
cudnnConvolutionBwdDataAlgo_t prev_algo; int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
size_t prev_kern_dims[5]; size_t prev_kern_dims[5];
size_t prev_top_dims[5]; size_t prev_top_dims[5];
...@@ -86,6 +89,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -86,6 +89,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -104,7 +109,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -104,7 +109,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) || (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) { (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld" PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ld", " but received gradient with shape %dx%dx% dx%d",
expected_output_dims[0], expected_output_dims[1], expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3], expected_output_dims[2], expected_output_dims[3],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1], PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
...@@ -131,6 +136,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -131,6 +136,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
std::string hashkey;
if (params->choose_algo) { if (params->choose_algo) {
if (!params->choose_once) { if (!params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
...@@ -140,9 +149,22 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -140,9 +149,22 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
reuse_algo = (reuse_algo && reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]); PyGpuArray_DIM(output, i) == prev_top_dims[i]);
} }
}
if (!reuse_algo) {
// check out cache
hashkey = std::string("GI | GPU#") + pci_id +
dnn_conv_shape(APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, PyGpuArray_DEV_DATA(output)
);
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
} }
if (!reuse_algo) { if (!(reuse_algo || use_cached)) {
size_t free; size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
...@@ -182,6 +204,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -182,6 +204,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
algo = choice.algo; algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
prev_algo.mathType = mathtype = choice.mathType;
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
...@@ -206,32 +234,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -206,32 +234,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
} }
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
} }
} } else { /*choose_algo */
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation // with a spatial dimension larger than 1024. The tiled-FFT implementation
...@@ -279,9 +287,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -279,9 +287,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
} }
} /* choose_algo */
size_t worksize;
gpudata *workspace; // if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{
err = cudnnGetConvolutionBackwardDataWorkspaceSize( err = cudnnGetConvolutionBackwardDataWorkspaceSize(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
...@@ -293,7 +303,47 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -293,7 +303,47 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
......
#section init_code_struct #section init_code_struct
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0; reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
memset(prev_img_dims, 0, sizeof(prev_img_dims)); memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims)); memset(prev_top_dims, 0, sizeof(prev_top_dims));
#section support_code_struct #section support_code_struct
#line 12 "dnn_gw.c"
int reuse_algo; #include "dnn_conv_find.h"
cudnnConvolutionBwdFilterAlgo_t prev_algo; int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
size_t prev_img_dims[5]; size_t prev_img_dims[5];
size_t prev_top_dims[5]; size_t prev_top_dims[5];
...@@ -87,6 +90,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -87,6 +90,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey = "GW | GPU#";
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -104,8 +111,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -104,8 +111,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
(PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) || (PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) || (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) { (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%dx%ld" PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
" but received gradient with shape %ldx%ldx%dx%ld", " but received gradient with shape %ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1], expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3], expected_output_dims[2], expected_output_dims[3],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1], PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
...@@ -119,7 +126,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -119,7 +126,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) || (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) || (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
(PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) { (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld" PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld", " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1], expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3], expected_output_dims[2], expected_output_dims[3],
...@@ -143,7 +150,26 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -143,7 +150,26 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
} }
if (!reuse_algo) { char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
hashkey = pci_id;
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
desc, PyGpuArray_DEV_DATA(output)
);
if (!reuse_algo) {
// check out cache
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
if (!(reuse_algo || use_cached)) {
size_t free; size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
...@@ -184,6 +210,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -184,6 +210,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
algo = choice.algo; algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
prev_algo.mathType = mathtype = choice.mathType;
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
...@@ -209,32 +241,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -209,32 +241,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
} }
prev_algo = algo; } else {
} else { algo = (cudnnConvolutionBwdFilterAlgo_t)prev_algo.algo;
algo = prev_algo; worksize = prev_algo.wsSize;
} mathtype = prev_algo.mathType;
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
} }
} } else {
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. // with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can // If the chosen implementation is FFT, validate that it can
...@@ -267,9 +283,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -267,9 +283,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
} }
} }
}/* choose_algo */
size_t worksize; // if FindEx was used (choose_time), workspace size is set.
gpudata *workspace; if (!(reuse_algo || use_cached || params->choose_time))
{
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
...@@ -281,7 +299,49 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -281,7 +299,49 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n",
params->choose_algo ? "[A]": "" ,
params->choose_time ? "[T]": "" ,
algo, // algorithm_name,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str()
);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
......
...@@ -399,7 +399,7 @@ class DnnBase(COp): ...@@ -399,7 +399,7 @@ class DnnBase(COp):
return [] return []
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version(), 1) return (super(DnnBase, self).c_code_cache_version(), version(), 2)
class GpuDnnConvDesc(COp): class GpuDnnConvDesc(COp):
...@@ -567,7 +567,7 @@ class GpuDnnConv(DnnBase): ...@@ -567,7 +567,7 @@ class GpuDnnConv(DnnBase):
num_groups=int_t) num_groups=int_t)
def __init__(self, algo=None, inplace=False, num_groups=1): def __init__(self, algo=None, inplace=False, num_groups=1):
DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_fwd.c"], DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_conv_find.c", "c_code/dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)") "APPLY_SPECIFIC(conv_fwd)")
if algo is None: if algo is None:
...@@ -710,7 +710,7 @@ class GpuDnnConvGradW(DnnBase): ...@@ -710,7 +710,7 @@ class GpuDnnConvGradW(DnnBase):
num_groups=int_t) num_groups=int_t)
def __init__(self, inplace=False, algo=None, num_groups=1): def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_gw.c"], DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_conv_find.c", "c_code/dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)") "APPLY_SPECIFIC(conv_gw)")
self.inplace = bool(inplace) self.inplace = bool(inplace)
if self.inplace: if self.inplace:
...@@ -846,7 +846,7 @@ class GpuDnnConvGradI(DnnBase): ...@@ -846,7 +846,7 @@ class GpuDnnConvGradI(DnnBase):
num_groups=int_t) num_groups=int_t)
def __init__(self, inplace=False, algo=None, num_groups=1): def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_gi.c"], DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_conv_find.c", "c_code/dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)") "APPLY_SPECIFIC(conv_gi)")
self.inplace = bool(inplace) self.inplace = bool(inplace)
if self.inplace: if self.inplace:
......
...@@ -1180,15 +1180,19 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -1180,15 +1180,19 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
old = *base; old = *base;
do { do {
assumed = old; assumed = old;
sum = __float2half_rn( ga_half old_perm;
__HALF_TO_US(old_perm) = __byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
sum = __float2half_as_us(
__half2float(val) + __half2float(val) +
__half2float((ga_half)__byte_perm(old, 0, __half2float(old_perm));
((ga_size)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254); new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_); old = atomicCAS(base, assumed, new_);
} while (assumed != old); } while (assumed != old);
return (ga_half)__byte_perm(old, 0, ga_half ret;
((ga_size)addr & 2) ? 0x4432 : 0x4410); __HALF_TO_US(ret) = __byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
return ret;
} }
__device__ ga_half atomicExch(ga_half *addr, ga_half val) { __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
...@@ -1197,13 +1201,14 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1197,13 +1201,14 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
old = *base; old = *base;
do { do {
assumed = old; assumed = old;
new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254); new_ = __byte_perm(old, __HALF_TO_US(val), ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_); old = atomicCAS(base, assumed, new_);
} while (assumed != old); } while (assumed != old);
return (ga_half)__byte_perm(old, 0, ga_half ret;
((ga_size)addr & 2) ? 0x4432 : 0x4410); __HALF_TO_US(ret) =__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
return ret;
} }
KERNEL void k_vector_add_fast(const ga_size numRowsX, KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX, const ga_size numColsX,
const ga_ssize stridesX0, const ga_ssize stridesX0,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论