Commit ade0510d authored by Boris Fomitchev

Fixing math_type for F32 and cache update

Parent 1bc17311
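What the change does: previously all three cuDNN convolution paths (conv_fwd, conv_gi, conv_gw) requested CUDNN_TENSOR_OP_MATH on the convolution descriptor unconditionally, so float32 convolutions could be tagged with a tensor-op math type they cannot use, and the Get()/fallback branches wrote that state into the algorithm cache. The diff below guards the request behind a GA_HALF typecode check, records algo, workspace size, and math type into prev_algo in one place after any fallback, and updates the cache only when the entry was not itself served from the cache. A minimal sketch of the guard, assuming c_set_math_type_for_conv wraps cuDNN 7's cudnnSetConvolutionMathType (the helper's body is not part of this diff):

/* Hedged sketch, not from this commit: what the GA_HALF guard achieves. */
#include <cudnn.h>

static cudnnStatus_t
set_conv_math_type(cudnnConvolutionDescriptor_t desc, int is_half)
{
#if CUDNN_MAJOR >= 7
  /* Request tensor-core math only for float16 data; float32 keeps
   * CUDNN_DEFAULT_MATH, which is exactly what this commit fixes. */
  return cudnnSetConvolutionMathType(
      desc, is_half ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
#else
  (void)desc; (void)is_half;   /* no math types before cuDNN 7 */
  return CUDNN_STATUS_SUCCESS;
#endif
}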
@@ -231,7 +231,8 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
     return -1;
   }
   // set the 'tensor math ok' flag
-  c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
+  if (input->ga.typecode == GA_HALF)
+    c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
 
   // We don't sync the buffer as we don't care about the values.
   err = cudnnFindConvolutionForwardAlgorithmEx(
@@ -265,12 +266,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
 #endif
       algo = choice.algo;
-      prev_algo.algo = (int)algo;
-      prev_algo.wsSize = worksize = choice.memory;
+      worksize = choice.memory;
 #if CUDNN_MAJOR >= 7
-      prev_algo.mathType = mathtype = choice.mathType;
+      if (input->ga.typecode == GA_HALF)
+        mathtype = choice.mathType;
 #endif
     } else {
       err = cudnnGetConvolutionForwardAlgorithm(
         params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
@@ -283,9 +283,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
         cuda_exit(c->ctx);
         return 1;
       }
-      prev_algo.algo = algo;
-      // no tensor_op returned from Get()
-      prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
     }
   }
 }
@@ -334,18 +331,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
     }
   }
-  if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
+  // algo may have changed due to fallback, we must update it.
+  if (params->choose_algo && !reuse_algo) {
-    // save for next time/cache
     prev_algo.algo = algo;
-    // save worksize for next time/cache
     prev_algo.wsSize = worksize;
-    // Add to the cache if we choose on shape change, or first time if we choose once.
-    dnn_conv_update_cache(hashkey, prev_algo);
-  }
+    prev_algo.mathType = mathtype;
+    // Add to the cache if we choose on shape change, or first time if
+    // we choose once.
+    if (!use_cached)
+      dnn_conv_update_cache(hashkey, prev_algo);
 #ifdef DEBUG
-  if (params->choose_algo) {
     if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
       cuda_exit(c->ctx);
       return 1;
@@ -359,12 +355,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
             worksize,
             hashkey.c_str()
            );
-  }
 #endif
-  if (params->choose_once) {
-    reuse_algo = 1;
-  }
+    if (params->choose_once)
+      reuse_algo = 1;
+  } // params->choose_algo && !reuse_algo
   {
     gpudata *workspace = 0;
......
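That covers conv_fwd. Beyond the GA_HALF guard, note how the caching was restructured above: prev_algo is no longer filled piecemeal inside the Find/Get branches; instead algo, wsSize, and mathType are recorded together after the workspace-allocation fallback, and dnn_conv_update_cache runs only on a cache miss. A condensed sketch of the resulting flow; only the field and function names are taken from the diff, the AlgoRec layout itself is an assumption:

/* Assumed record type; algo/wsSize/mathType names appear in the diff. */
typedef struct {
  int algo;       /* chosen cudnnConvolutionFwdAlgo_t, stored as int */
  size_t wsSize;  /* workspace bytes the algorithm needs */
  int mathType;   /* cudnnMathType_t actually in effect */
} AlgoRec;

/* Fragment: inside the conv_fwd body, after selection and any fallback. */
if (params->choose_algo && !reuse_algo) {
  prev_algo.algo = algo;
  prev_algo.wsSize = worksize;
  prev_algo.mathType = mathtype;
  if (!use_cached)                       /* never rewrite a cache hit */
    dnn_conv_update_cache(hashkey, prev_algo);
  if (params->choose_once)
    reuse_algo = 1;                      /* pin this choice for reuse */
}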
@@ -194,7 +194,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
   gpudata *tmpmem;
 
   // set the 'tensor math ok' flag
-  c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
+  if (im->ga.typecode == GA_HALF)
+    c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
 
   tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
   if (tmpmem == NULL) {
@@ -229,14 +230,13 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
         return 1;
       } // Else, count is necessarly 1 for current implementation.
 #endif
       algo = choice.algo;
-      prev_algo.algo = (int)algo;
-      prev_algo.wsSize = worksize = choice.memory;
+      worksize = choice.memory;
 #if CUDNN_MAJOR >= 7
-      prev_algo.mathType = mathtype = choice.mathType;
+      if (im->ga.typecode == GA_HALF)
+        mathtype = choice.mathType;
 #endif
     } else {
       err = cudnnGetConvolutionBackwardDataAlgorithm(
         params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
@@ -248,9 +248,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
         cuda_exit(c->ctx);
         return 1;
       }
-      prev_algo.algo = algo;
-      // no tensor_op returned from Get()
-      prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
     }
   }
 }
@@ -291,18 +288,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
     }
   } // !(reuse_algo || use_cached || params->choose_time)
-  if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
+  // algo may have changed due to fallback, we must update it.
+  if (params->choose_algo && !reuse_algo) {
-    // save for next time/cache
     prev_algo.algo = algo;
-    // save worksize for next time/cache
     prev_algo.wsSize = worksize;
+    prev_algo.mathType = mathtype;
     // Add to the cache
-    dnn_conv_update_cache(hashkey, prev_algo);
-  }
+    if (!use_cached)
+      dnn_conv_update_cache(hashkey, prev_algo);
 #ifdef DEBUG
-  if (params->choose_algo) {
     if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
       cuda_exit(c->ctx);
       return 1;
@@ -316,13 +312,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
             worksize,
             hashkey.c_str()
            );
-  }
 #endif
-  if (params->choose_once) {
-    reuse_algo = 1;
-  }
+    if (params->choose_once)
+      reuse_algo = 1;
+  } // params->choose_algo && !reuse_algo
   gpudata *workspace = 0;
   if (worksize != 0) {
     workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
......
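conv_gi follows the same pattern. The deletions in its Get() branch are worth a comment: cudnnGetConvolutionBackwardDataAlgorithm returns only an algorithm, never a math type, so the old code pinned prev_algo.mathType to CUDNN_DEFAULT_MATH there and later cached it, clobbering the tensor-op state for half inputs. After the change, mathtype simply keeps whatever the guarded descriptor setup chose. An illustrative way (not from this commit) to verify the float32 behavior with cuDNN 7's cudnnGetConvolutionMathType:

#include <cudnn.h>
#include <stdio.h>

/* Hedged test-harness sketch: after descriptor setup, a float32
 * convolution should still report CUDNN_DEFAULT_MATH. */
static void check_math_type(cudnnConvolutionDescriptor_t desc, int is_half)
{
#if CUDNN_MAJOR >= 7
  cudnnMathType_t mt;
  if (cudnnGetConvolutionMathType(desc, &mt) == CUDNN_STATUS_SUCCESS &&
      mt == CUDNN_TENSOR_OP_MATH && !is_half)
    fprintf(stderr, "tensor-op math leaked onto a non-half convolution\n");
#else
  (void)desc; (void)is_half;   /* nothing to check before cuDNN 7 */
#endif
}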
@@ -181,7 +181,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
   gpudata *tmpmem;
 
   // set the 'tensor math ok' flag
-  c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
+  if (input->ga.typecode == GA_HALF)
+    c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
 
   tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
   if (tmpmem == NULL) {
@@ -220,12 +221,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
 #endif
       algo = choice.algo;
-      prev_algo.algo = (int)algo;
-      prev_algo.wsSize = worksize = choice.memory;
+      worksize = choice.memory;
 #if CUDNN_MAJOR >= 7
-      prev_algo.mathType = mathtype = choice.mathType;
+      if (input->ga.typecode == GA_HALF)
+        mathtype = choice.mathType;
 #endif
     } else {
       err = cudnnGetConvolutionBackwardFilterAlgorithm(
         params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
@@ -238,9 +238,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
         cuda_exit(c->ctx);
         return 1;
       }
-      prev_algo.algo = algo;
-      // no tensor_op returned from Get()
-      prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
     }
   }
 } /* choose_algo */
@@ -281,18 +278,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
     }
   }
-  if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
+  // algo may have changed due to fallback, we must update it.
+  if (params->choose_algo && !reuse_algo) {
-    // save for next time/cache
     prev_algo.algo = algo;
-    // save worksize for next time/cache
     prev_algo.wsSize = worksize;
-    // Add to the cache
-    dnn_conv_update_cache(hashkey, prev_algo);
-  }
+    prev_algo.mathType = mathtype;
+    // Add to the cache if we choose on shape change, or first time if
+    // we choose once.
+    if (!use_cached)
+      dnn_conv_update_cache(hashkey, prev_algo);
 #ifdef DEBUG
-  if (params->choose_algo) {
     if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
       cuda_exit(c->ctx);
       return 1;
@@ -306,13 +302,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
             worksize,
             hashkey.c_str()
            );
-  }
 #endif
-  if (params->choose_once) {
-    reuse_algo = 1;
-  }
+    if (params->choose_once)
+      reuse_algo = 1;
+  } // params->choose_algo && !reuse_algo
   gpudata *workspace = 0;
   if (worksize != 0) {
......
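conv_gw mirrors the other two paths. One last detail: on the Find path, the cuDNN 7 *AlgoPerf_t results carry the math type each timed algorithm actually used (the choice.mathType read under CUDNN_MAJOR >= 7 above), which is why the diff copies it only for GA_HALF inputs. A simplified, hedged sketch of that selection step; the function and its parameters are illustrative, not from the commit:

#include <cudnn.h>

/* Consume cudnnFindConvolutionForwardAlgorithmEx results: entries are
 * sorted by runtime, so perf[0] is the fastest one that succeeded. */
static int pick_fwd_algo(const cudnnConvolutionFwdAlgoPerf_t *perf, int count,
                         int is_half, cudnnConvolutionFwdAlgo_t *algo,
                         size_t *worksize, cudnnMathType_t *mathtype)
{
  if (count < 1 || perf[0].status != CUDNN_STATUS_SUCCESS)
    return -1;                      /* nothing usable was returned */
  *algo = perf[0].algo;
  *worksize = perf[0].memory;
#if CUDNN_MAJOR >= 7
  if (is_half)
    *mathtype = perf[0].mathType;   /* may be CUDNN_TENSOR_OP_MATH */
#else
  (void)is_half; (void)mathtype;    /* no math types before cuDNN 7 */
#endif
  return 0;
}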