Merge pull request #6359 from borisfom/f32-hmma-fix

Fixing math_type for F32 and cache update

Merge pull request #6359 from borisfom/f32-hmma-fix
c55c15ad · Frédéric Bastien · GitHub · 9592125c · 93073ef3 · c55c15ad
--- a/theano/gpuarray/c_code/dnn_fwd.c
+++ b/theano/gpuarray/c_code/dnn_fwd.c
@@ -231,7 +231,8 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
          return -1;
        }
        // set the 'tensor math ok' flag
-        c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
+        if (input->ga.typecode == GA_HALF)
+          c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
        // We don't sync the buffer as we don't care about the values.
        err = cudnnFindConvolutionForwardAlgorithmEx(
@@ -265,12 +266,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
        #endif
        algo = choice.algo;
-        prev_algo.algo = (int)algo;
+        worksize = choice.memory;
-        prev_algo.wsSize = worksize = choice.memory;
 #if CUDNN_MAJOR >= 7
-        prev_algo.mathType = mathtype = choice.mathType;
+        if (input->ga.typecode == GA_HALF)
+          mathtype = choice.mathType;
 #endif
      } else {
        err = cudnnGetConvolutionForwardAlgorithm(
          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
@@ -283,9 +283,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
          cuda_exit(c->ctx);
          return 1;
        }
-        prev_algo.algo = algo;
-        // no tensor_op returned from Get()
-        prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
      }
    }
  }
@@ -334,18 +331,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    }
  }
-  if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
+  if (params->choose_algo) {
-    // algo may have changed due to fallback, we must update it.
-    prev_algo.algo = algo;
-    // save worksize for next time/cache
-    prev_algo.wsSize = worksize;
-    // Add to the cache if we choose on shape change, or first time if we choose once.
-    dnn_conv_update_cache(hashkey, prev_algo);
-  }
 #ifdef DEBUG
-  if (params->choose_algo) {
    if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
      cuda_exit(c->ctx);
      return 1;
@@ -359,12 +347,23 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
            worksize,
            hashkey.c_str()
      );
-  }
 #endif
-  if (params->choose_once) {
+    if (!reuse_algo) {
-    reuse_algo = 1;
+      // save for next time/cache
-  }
+      prev_algo.algo = algo;
+      prev_algo.wsSize = worksize;
+      prev_algo.mathType = mathtype;
+      // Add to the cache if we choose on shape change, or first time if
+      // we choose once.
+      if (!use_cached)
+        dnn_conv_update_cache(hashkey, prev_algo);
+      if (params->choose_once)
+        reuse_algo = 1;
+    }
+  } // params->choose_algo
  {
    gpudata *workspace = 0;

--- a/theano/gpuarray/c_code/dnn_gi.c
+++ b/theano/gpuarray/c_code/dnn_gi.c
@@ -194,7 +194,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
        gpudata *tmpmem;
        // set the 'tensor math ok' flag
-        c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
+        if (im->ga.typecode == GA_HALF)
+          c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
        tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
        if (tmpmem == NULL) {
@@ -231,12 +232,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
        #endif
        algo = choice.algo;
-        prev_algo.algo = (int)algo;
+        worksize = choice.memory;
-        prev_algo.wsSize = worksize = choice.memory;
 #if CUDNN_MAJOR >= 7
-        prev_algo.mathType = mathtype = choice.mathType;
+        if (im->ga.typecode == GA_HALF)
+          mathtype = choice.mathType;
 #endif
      } else {
        err = cudnnGetConvolutionBackwardDataAlgorithm(
          params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
@@ -248,9 +248,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
          cuda_exit(c->ctx);
          return 1;
        }
-        prev_algo.algo = algo;
-        // no tensor_op returned from Get()
-        prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
      }
    }
  }
@@ -291,18 +288,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    }
  }  // !(reuse_algo || use_cached || params->choose_time)
-  if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
+  if (params->choose_algo) {
-    // algo may have changed due to fallback, we must update it.
-    prev_algo.algo = algo;
-    // save worksize for next time/cache
-    prev_algo.wsSize = worksize;
-    // Add to the cache
-    dnn_conv_update_cache(hashkey, prev_algo);
-  }
 #ifdef DEBUG
-  if (params->choose_algo) {
    if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
        cuda_exit(c->ctx);
        return 1;
@@ -316,12 +304,23 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
            worksize,
            hashkey.c_str()
      );
-  }
 #endif
-  if (params->choose_once) {
+    if (!reuse_algo) {
-    reuse_algo = 1;
+      // save for next time/cache
-  }
+      prev_algo.algo = algo;
+      prev_algo.wsSize = worksize;
+      prev_algo.mathType = mathtype;
+      // Add to the cache
+      if (!use_cached)
+        dnn_conv_update_cache(hashkey, prev_algo);
+      if (params->choose_once)
+        reuse_algo = 1;
+    }
+  } // params->choose_algo
  gpudata *workspace = 0;
  if (worksize != 0) {

--- a/theano/gpuarray/c_code/dnn_gw.c
+++ b/theano/gpuarray/c_code/dnn_gw.c
@@ -181,7 +181,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
        gpudata *tmpmem;
        // set the 'tensor math ok' flag
-        c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
+        if (input->ga.typecode == GA_HALF)
+          c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
        tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
        if (tmpmem == NULL) {
@@ -220,12 +221,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
        #endif
        algo = choice.algo;
-        prev_algo.algo = (int)algo;
+        worksize = choice.memory;
-        prev_algo.wsSize = worksize = choice.memory;
 #if CUDNN_MAJOR >= 7
-        prev_algo.mathType = mathtype = choice.mathType;
+        if (input->ga.typecode == GA_HALF)
+          mathtype = choice.mathType;
 #endif
      } else {
        err = cudnnGetConvolutionBackwardFilterAlgorithm(
          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
@@ -238,9 +238,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
          cuda_exit(c->ctx);
          return 1;
        }
-	    prev_algo.algo = algo;
-	    // no tensor_op returned from Get()
-	    prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
      }
    }
  } /* choose_algo */
@@ -281,18 +278,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    }
  }
-  if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
+  if (params->choose_algo) {
-    // algo may have changed due to fallback, we must update it.
-    prev_algo.algo = algo;
-    // save worksize for next time/cache
-    prev_algo.wsSize = worksize;
-    // Add to the cache
-    dnn_conv_update_cache(hashkey, prev_algo);
-  }
 #ifdef DEBUG
-  if (params->choose_algo) {
    if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
      cuda_exit(c->ctx);
      return 1;
@@ -306,12 +294,23 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
            worksize,
            hashkey.c_str()
     );
-  }
 #endif
-  if (params->choose_once) {
+    if (!reuse_algo) {
-    reuse_algo = 1;
+      // save for next time/cache
-  }
+      prev_algo.algo = algo;
+      prev_algo.wsSize = worksize;
+      prev_algo.mathType = mathtype;
+      // Add to the cache if we choose on shape change, or first time if
+      // we choose once.
+      if (!use_cached)
+        dnn_conv_update_cache(hashkey, prev_algo);
+      if (params->choose_once)
+        reuse_algo = 1;
+    }
+  } // params->choose_algo
  gpudata *workspace = 0;