Commit 7806823d authored by notoraptor

Apply DEBUG_TIMING instrumentation changes to dnn_gi and dnn_gw.

Parent 403865ea
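
The new code paths below call a TheanoTimer helper with start(), end(), and a milliseconds member; its definition is not part of this diff. A minimal sketch of what such a helper could look like, assuming plain host-side wall-clock timing with C++11 <chrono> (the real Theano helper may differ):

#include <chrono>

// Hypothetical stand-in for the TheanoTimer used in the diff below:
// measures wall-clock time between start() and end() and exposes the
// elapsed duration through the `milliseconds` field.
struct TheanoTimer {
    std::chrono::steady_clock::time_point t0;
    double milliseconds = 0.0;
    void start() { t0 = std::chrono::steady_clock::now(); }
    void end() {
        std::chrono::duration<double, std::milli> d =
            std::chrono::steady_clock::now() - t0;
        milliseconds = d.count();
    }
};
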
@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0;
hash_prefix = std::string("GI|GPU#");
#ifdef DEBUG_TIMING
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
  if (PARAMS->choose_time) {
    selection_name = "fastest";
  } else {
    selection_name = "best suited";
  }
}
#endif
#section support_code_struct
#line 9 "dnn_gi.c"
#line 22 "dnn_gi.c"
int reuse_algo;
AlgoRec prev_algo;
std::string hash_prefix;
@@ -15,6 +28,13 @@ std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
#ifdef DEBUG_TIMING
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Check the given algorithm against the inputs and convolution descriptor,
changing the algorithm in place to a fallback algorithm if the checks fail.
@@ -86,6 +106,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
#ifdef DEBUG
if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
TheanoTimer timer;
#endif
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
@@ -159,11 +185,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
std::string hashkey;
-size_t maxfree = c_get_largest_free_block_size(c);
-if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
+size_t maxfree = c_get_largest_free_block_size(c);
+if (PyErr_Occurred()) return 1;
if (params->choose_algo) {
if (!reuse_algo) {
@@ -211,11 +238,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
ip = pygpu_empty(PyGpuArray_NDIM(*input), PyGpuArray_DIMS(*input), (*input)->ga.typecode, GA_C_ORDER, c, Py_None);
}
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(ip),
1, &count, &choice, *(void **)tmpmem, maxfree);
#ifdef DEBUG_TIMING
timer.end();
#endif
gpudata_release(tmpmem);
if (beta != 0) {
Py_XDECREF(ip);
@@ -248,10 +281,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
mathtype = choice.mathType;
#endif
} else {
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
timer.end();
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
@@ -259,6 +298,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
}
}
#ifdef DEBUG_TIMING
total_selection_time += timer.milliseconds;
++n_selections;
#endif
}
}
@@ -313,7 +356,18 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
use_cached ? "(cache)" : "",
worksize,
hashkey.c_str()
);
#endif
#ifdef DEBUG_TIMING
if (!(reuse_algo || use_cached)) {
  // We have just selected an algorithm at runtime;
  // `timer` still holds the duration of that selection step.
  fprintf(stderr, "\t(selected %s gradinput algo in %g milliseconds)\n", selection_name, timer.milliseconds);
  if (n_selections > 1) {
    fprintf(stderr, "\t(selected %zu gradinput algos in %g milliseconds (average: %g milliseconds per selection))\n",
            n_selections, total_selection_time, total_selection_time / n_selections);
  }
}
#endif
if (!reuse_algo) {
@@ -348,6 +402,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*input)->ga);
timer.start();
#endif
for (int g = 0; g < groups; g++) {
err = cudnnConvolutionBackwardData(
params->handle,
@@ -368,6 +427,13 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*input)->ga);
timer.end();
total_computation_time += timer.milliseconds;
++n_computations;
#endif
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
@@ -375,5 +441,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnGetErrorString(err));
return 1;
}
#ifdef DEBUG_TIMING
fprintf(stderr, "\t(ran gradinput algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) {
  fprintf(stderr, "\t(ran %zu gradinput computations in %g milliseconds (average: %g milliseconds per call))\n",
          n_computations, total_computation_time, total_computation_time / n_computations);
}
#endif
return 0;
}
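
With DEBUG_TIMING defined, the gradinput path above reports its measurements on stderr through the fprintf calls shown. A hypothetical transcript (the format strings come from the code above; the numbers are made up for illustration):

	(selected fastest gradinput algo in 2.14 milliseconds)
	(ran gradinput algo in 0.87 milliseconds)
	(ran 3 gradinput computations in 2.52 milliseconds (average: 0.84 milliseconds per call))

The same instrumentation is mirrored for the gradweight kernel in dnn_gw.c below.
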
@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0;
hash_prefix = std::string("GW|GPU#");
#ifdef DEBUG_TIMING
total_computation_time = 0;
total_selection_time = 0;
n_computations = 0;
n_selections = 0;
if (PARAMS->choose_algo) {
  if (PARAMS->choose_time) {
    selection_name = "fastest";
  } else {
    selection_name = "best suited";
  }
}
#endif
#section support_code_struct
#line 9 "dnn_gw.c"
#line 22 "dnn_gw.c"
int reuse_algo;
AlgoRec prev_algo;
std::string hash_prefix;
@@ -15,6 +28,13 @@ std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
#ifdef DEBUG_TIMING
double total_computation_time;
double total_selection_time;
size_t n_computations;
size_t n_selections;
const char* selection_name;
#endif
/** Check the given algorithm against the inputs and convolution descriptor,
changing the algorithm in place to a fallback algorithm if the checks fail.
@@ -73,6 +93,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
#ifdef DEBUG
if (_cppver) fprintf(stderr, "%s\n", _cppver);
#endif
#ifdef DEBUG_TIMING
TheanoTimer timer;
#endif
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError,
@@ -146,11 +172,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
std::string hashkey;
-size_t maxfree = c_get_largest_free_block_size(c);
-if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
+size_t maxfree = c_get_largest_free_block_size(c);
+if (PyErr_Occurred()) return 1;
if (params->choose_algo) {
if (!reuse_algo) {
@@ -198,11 +225,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
k = pygpu_empty(PyGpuArray_NDIM(*kerns), PyGpuArray_DIMS(*kerns), (*kerns)->ga.typecode, GA_C_ORDER, c, Py_None);
}
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(k),
1, &count, &choice, *(void **)tmpmem, maxfree);
#ifdef DEBUG_TIMING
timer.end();
#endif
gpudata_release(tmpmem);
if (beta != 0) {
Py_XDECREF(k);
@@ -237,10 +270,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
mathtype = choice.mathType;
#endif
} else {
#ifdef DEBUG_TIMING
timer.start();
#endif
err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
#ifdef DEBUG_TIMING
timer.end();
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
@@ -249,6 +288,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
}
}
#ifdef DEBUG_TIMING
total_selection_time += timer.milliseconds;
++n_selections;
#endif
}
} /* choose_algo */
@@ -305,6 +348,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
hashkey.c_str()
);
#endif
#ifdef DEBUG_TIMING
if (!(reuse_algo || use_cached)) {
  // We have just selected an algorithm at runtime;
  // `timer` still holds the duration of that selection step.
  fprintf(stderr, "\t(selected %s gradweight algo in %g milliseconds)\n", selection_name, timer.milliseconds);
  if (n_selections > 1) {
    fprintf(stderr, "\t(selected %zu gradweight algos in %g milliseconds (average: %g milliseconds per selection))\n",
            n_selections, total_selection_time, total_selection_time / n_selections);
  }
}
#endif
if (!reuse_algo) {
// save for next time/cache
@@ -339,6 +393,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*kerns)->ga);
timer.start();
#endif
for (int g = 0; g < groups; g++) {
err = cudnnConvolutionBackwardFilter(
params->handle,
@@ -359,6 +418,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
#ifdef DEBUG_TIMING
GpuArray_sync(&(*kerns)->ga);
timer.end();
total_computation_time += timer.milliseconds;
++n_computations;
#endif
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
@@ -366,5 +432,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cudnnGetErrorString(err));
return 1;
}
#ifdef DEBUG_TIMING
fprintf(stderr, "\t(ran gradweight algo in %g milliseconds)\n", timer.milliseconds);
if (n_computations > 1) {
  fprintf(stderr, "\t(ran %zu gradweight computations in %g milliseconds (average: %g milliseconds per call))\n",
          n_computations, total_computation_time, total_computation_time / n_computations);
}
#endif
return 0;
}
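
A note on the measurement pattern shared by both files: cuDNN launches its kernels asynchronously with respect to the host, so each computation timing is bracketed by GpuArray_sync calls, one before timer.start() so previously queued work does not leak into the measurement, and one before timer.end() so the timer stops only after the launched work has actually finished. A generic sketch of this sync-bracket pattern (sync_output and launch_async_work are placeholder names, not Theano or cuDNN APIs):

// Sketch of the sync-bracket pattern used by the DEBUG_TIMING blocks:
sync_output();        // drain GPU work queued before the region of interest
timer.start();        // host clock starts on an idle queue
launch_async_work();  // asynchronous launches return to the host immediately
sync_output();        // block until the launched GPU work completes
timer.end();          // timer.milliseconds now covers the GPU work itself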