提交 c5944c7d authored 作者: Boris Fomitchev's avatar Boris Fomitchev

Fixed overflow with workspace size, tensor_op setting for FindEx

上级 e596e80e
...@@ -26,6 +26,20 @@ static int c_check_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups ...@@ -26,6 +26,20 @@ static int c_check_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups
#endif #endif
} }
/* Apply the requested cuDNN math mode (e.g. CUDNN_TENSOR_OP_MATH) to a
 * convolution descriptor.
 *
 * On cuDNN versions before 7 this is a no-op that reports success, since
 * per-descriptor math types do not exist there.
 *
 * Returns 1 on success; returns -1 after setting a Python RuntimeError
 * when cudnnSetConvolutionMathType fails.
 */
static int c_set_math_type_for_conv(cudnnConvolutionDescriptor_t desc, cudnnMathType_t mathtype) {
#if CUDNN_MAJOR >= 7
  /* cuDNN 7 introduced selectable math types; apply the caller's choice. */
  cudnnStatus_t status = cudnnSetConvolutionMathType(desc, mathtype);
  if (status == CUDNN_STATUS_SUCCESS)
    return 1;
  PyErr_Format(PyExc_RuntimeError,
               "error setting math type for convolution : %s",
               cudnnGetErrorString(status));
  return -1;
#else
  return 1;
#endif
}
#section init_code_struct #section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err); cudnnStatus_t APPLY_SPECIFIC(err);
...@@ -83,19 +97,19 @@ static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg) ...@@ -83,19 +97,19 @@ static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg)
return err; return err;
} }
static int static size_t
c_get_largest_free_block_size(PyGpuContextObject *c) c_get_largest_free_block_size(PyGpuContextObject *c)
{ {
size_t free = 0; size_t maxfree = 0;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &maxfree);
if (err2 != GA_NO_ERROR) { if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU"); "memory information on the GPU");
} }
// Guess 4Mb if the info is not available // Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024; if (maxfree == 0) maxfree = 4 * 1024 * 1024;
return free; return maxfree;
} }
/** Check if convolution output tensor has expected dimensions /** Check if convolution output tensor has expected dimensions
......
...@@ -186,11 +186,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -186,11 +186,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
std::string hashkey; std::string hashkey;
size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
if (params->choose_algo) { if (params->choose_algo) {
if (!reuse_algo) { if (!reuse_algo) {
...@@ -220,12 +221,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -220,12 +221,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionFwdAlgoPerf_t choice; cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate GPU memory for FindEx");
cuda_exit(c->ctx); cuda_exit(c->ctx);
return -1; return -1;
} }
// set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
// We don't sync the buffer as we don't care about the values. // We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx( err = cudnnFindConvolutionForwardAlgorithmEx(
...@@ -233,7 +236,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -233,7 +236,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem, 1, &count, &choice, *(void **)tmpmem,
free); maxfree);
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -269,7 +272,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -269,7 +272,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -283,8 +286,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -283,8 +286,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
} }
} }
if (dnn_conv_fwd_fallback(&algo, input, kerns, desc) != 0) { if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_fwd_fallback(&algo, input, kerns, desc) != 0) {
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -343,12 +347,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -343,12 +347,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n", fprintf(stderr, "(using %s%s %s%s%s, ws:%ld, hash:%s)\n",
algorithm_name, algorithm_name,
mathtype == CUDNN_TENSOR_OP_MATH ? "[T]" : "",
params->choose_time ? "(timed)": "" , params->choose_time ? "(timed)": "" ,
reuse_algo ? "(reused)" : "", reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "", use_cached ? "(cache)": "",
mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize, worksize,
hashkey.c_str() hashkey.c_str()
); );
...@@ -361,18 +365,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -361,18 +365,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{ {
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
/* /*
* This is less than ideal since we need to free it after (which * This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module * introduces a synchronization point. But we don't have a module
......
...@@ -156,7 +156,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -156,7 +156,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
std::string hashkey; std::string hashkey;
size_t free = c_get_largest_free_block_size(c); size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1; if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -190,7 +190,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -190,7 +190,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnConvolutionBwdDataAlgoPerf_t choice; cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); // set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx); cuda_exit(c->ctx);
...@@ -201,7 +204,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -201,7 +204,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free); 1, &count, &choice, *(void **)tmpmem, maxfree);
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -235,7 +238,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -235,7 +238,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
err = cudnnGetConvolutionBackwardDataAlgorithm( err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input), desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
...@@ -248,8 +251,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -248,8 +251,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
} }
if (dnn_conv_gi_fallback(&algo, *input, kerns, desc) != 0) { if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_gi_fallback(&algo, *input, kerns, desc) != 0) {
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -313,23 +317,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -313,23 +317,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
#endif #endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} }
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
......
...@@ -143,7 +143,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -143,7 +143,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
std::string hashkey ; std::string hashkey ;
size_t free = c_get_largest_free_block_size(c); size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1; if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -176,8 +176,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -176,8 +176,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
int count; int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice; cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
// set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx); cuda_exit(c->ctx);
...@@ -188,7 +191,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -188,7 +191,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free); 1, &count, &choice, *(void **)tmpmem, maxfree);
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -224,7 +227,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -224,7 +227,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -238,8 +241,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -238,8 +241,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
} }
} /* choose_algo */ } /* choose_algo */
if (dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) { if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) {
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -303,22 +307,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -303,22 +307,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
#endif #endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} }
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论