Add debug profiling for dnn_fwd

5c10bb1d · notoraptor · 1bb1bb8e · 5c10bb1d · 5c10bb1d · 5c10bb1d
--- a/theano/gpuarray/c_code/dnn_conv_base.c
+++ b/theano/gpuarray/c_code/dnn_conv_base.c
@@ -85,6 +85,23 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
 #line 87 "dnn_conv_base.c"
+#if __cplusplus < 201103L
+/* Pre-C++11: use the C standard interface (clock() from <ctime>).
+   Timer values are raw clock ticks stored in a clock_t. */
+#include <ctime>
+#define theano_clock_t clock_t
+#define theano_clock() clock()
+#define theano_clock_to_milliseconds(t) ( 1000.0 * (t) / CLOCKS_PER_SEC )
+#define theano_clock_average_to_milliseconds(t, n) ( (1000.0 * (t) / (n)) / CLOCKS_PER_SEC )
+#else
+/* C++11 and later: use std::chrono::steady_clock (<chrono>).
+   Timer values are raw tick counts (steady_clock::rep), so the same variable
+   can hold a start time, an elapsed time (end - start) or a running total,
+   and can be initialized with 0, exactly like clock_t in the branch above.
+   (std::chrono::time_point is a template and a time_point cannot hold a
+   difference of two time points, so it cannot be used as the timer type.) */
+#include <chrono>
+#define theano_clock_t std::chrono::steady_clock::rep
+#define theano_clock() ( std::chrono::steady_clock::now().time_since_epoch().count() )
+#define theano_clock_to_milliseconds(t) ( std::chrono::duration<double, std::milli>(std::chrono::steady_clock::duration(t)).count() )
+#define theano_clock_average_to_milliseconds(t, n) ( theano_clock_to_milliseconds(t) / (n) )
+#endif
 pthread_mutex_t  algoMutex;
 AlgoCache        algoCache;

--- a/theano/gpuarray/c_code/dnn_fwd.c
+++ b/theano/gpuarray/c_code/dnn_fwd.c
@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
 prev_algo.mathType = CUDNN_DEFAULT_MATH;
 reuse_algo = 0;
 hash_prefix = std::string("FWD|GPU#");
+#ifdef DEBUG
+/* Reset the accumulated timings and the counters used by the DEBUG
+   profiling messages. */
+total_computation_time = 0;
+total_selection_time = 0;
+n_computations = 0;
+n_selections = 0;
+/* Label printed in the "selected %s fwd algo" message. Give it a defined
+   value even when no runtime selection is requested, so the pointer is
+   never left uninitialized. */
+selection_name = "none";
+if (PARAMS->choose_algo) {
+    if (PARAMS->choose_time) {
+        selection_name = "fastest";
+    } else {
+        selection_name = "best suited";
+    }
+}
+#endif
 #section support_code_struct
-#line 9 "dnn_fwd.c"
+#line 22 "dnn_fwd.c"
 int     reuse_algo;
 AlgoRec prev_algo;
 std::string hash_prefix;
@@ -14,6 +27,11 @@ std::string hash_prefix;
 #ifdef DEBUG
 char algorithm_name[128];
+/* Sum of the measured durations of the forward computations
+   (each measured run is added to it). */
+theano_clock_t total_computation_time;
+/* Sum of the measured durations of the runtime algorithm selections. */
+theano_clock_t total_selection_time;
+/* Number of timed computations recorded so far. */
+size_t n_computations;
+/* Number of timed algorithm selections recorded so far. */
+size_t n_selections;
+/* Label for the selection mode ("fastest" or "best suited"),
+   printed in the debug messages. */
+const char* selection_name;
 #endif
 /** Check given algorithm against inputs and convolution descriptor,
@@ -121,6 +139,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  bool use_cached = 0;
+  #ifdef DEBUG
+  theano_clock_t t;
+  #endif
  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError,
@@ -242,12 +263,18 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
        }
        // We don't sync the buffer as we don't care about the values.
+        #ifdef DEBUG
+        t = theano_clock();
+        #endif
        err = cudnnFindConvolutionForwardAlgorithmEx(
          params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
          APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
          desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o),
          1, &count, &choice, *(void **)tmpmem,
          maxfree);
+        #ifdef DEBUG
+        t = theano_clock() - t;
+        #endif
        gpudata_release(tmpmem);
        if (beta != 0) {
            Py_XDECREF(o);
@@ -282,10 +309,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
          mathtype = choice.mathType;
 #endif
      } else {
+        #ifdef DEBUG
+        t = theano_clock();
+        #endif
        err = cudnnGetConvolutionForwardAlgorithm(
          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
          desc, APPLY_SPECIFIC(output),
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
+        #ifdef DEBUG
+        t = theano_clock() - t;
+        #endif
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
@@ -294,6 +327,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
          return 1;
        }
      }
+      #ifdef DEBUG
+      total_selection_time += t;
+      ++n_selections;
+      #endif
    }
  }
@@ -357,6 +394,18 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
            worksize,
            hashkey.c_str()
    );
+    if (!(reuse_algo || use_cached)) {
+        // We have selected an algorithm at runtime.
+        // `t` still contains timing about selection step.
+        fprintf(stderr, "\t(selected %s fwd algo in %g milliseconds)\n", selection_name, theano_clock_to_milliseconds(t));
+        if (n_selections > 1) {
+            // Cumulative statistics over every selection timed so far.
+            // n_selections is a size_t: cast it so that the argument really
+            // is an unsigned long, as %lu requires, on every platform.
+            fprintf(stderr, "\t(selected %lu fwd algos in %g milliseconds (average: %g milliseconds per selection))\n",
+                    (unsigned long)n_selections,
+                    theano_clock_to_milliseconds(total_selection_time),
+                    theano_clock_average_to_milliseconds(total_selection_time, n_selections));
+        }
+    }
+  }
 #endif
    if (!reuse_algo) {
@@ -375,7 +424,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  } // params->choose_algo
-  {
  gpudata *workspace = 0;
  /*
   * This is less than ideal since we need to free it after (which
@@ -395,6 +443,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
+  #ifdef DEBUG
+  t = theano_clock();
+  #endif
  for ( int g = 0; g < groups; g++) {
    err = cudnnConvolutionForward(
      params->handle,
@@ -413,7 +465,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
-  }
+  #ifdef DEBUG
+  // Stop the timer started before the per-group convolution loop and
+  // accumulate the result in the running statistics.
+  // NOTE(review): this is a host-side measurement; if the cuDNN calls return
+  // before the GPU work has completed, the figure covers the time needed to
+  // issue the work rather than to execute it -- confirm.
+  t = theano_clock() - t;
+  total_computation_time += t;
+  ++n_computations;
+  #endif
  cuda_exit(c->ctx);
@@ -422,6 +479,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
 		 cudnnGetErrorString(err));
    return 1;
  }
+  #ifdef DEBUG
+  // Report the duration of the computation that has just been timed (`t`).
+  fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", theano_clock_to_milliseconds(t));
+  if (n_computations > 1) {
+    // Cumulative statistics over every computation timed so far.
+    // n_computations is a size_t: cast it so that the argument really is an
+    // unsigned long, as %lu requires, on every platform.
+    fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n",
+            (unsigned long)n_computations,
+            theano_clock_to_milliseconds(total_computation_time),
+            theano_clock_average_to_milliseconds(total_computation_time, n_computations));
+  }
+  #endif
  return 0;
 }

--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -2720,6 +2720,11 @@ class TestDnnConv3DRuntimeAlgorithms(TestDnnConv2DRuntimeAlgorithms):
    ]
+class TestDnnConv2DRuntimeAlgorithmsWithBigInputs(TestDnnConv2DRuntimeAlgorithms):
+    # Overrides only `runtime_shapes`; everything else is inherited from
+    # TestDnnConv2DRuntimeAlgorithms.
+    # NOTE(review): each entry looks like (count, [input shape, kernel shape])
+    # -- confirm against how the parent class consumes `runtime_shapes`.
+    runtime_shapes = [(5, [(12, 4, 128, 128), (5, 4, 64, 64)]),
+                      (6, [(12, 4, 256, 256), (5, 4, 32, 64)])]
 def test_conv_guess_once_with_dtypes():
    # This test checks that runtime conv algorithm selection does not raise any exception
    # when consecutive functions with different dtypes and precisions are executed.