Merge pull request #6348 from notoraptor/debug-info-cudnn-conv-timing

Add DEBUG infos to profile cuDNN convolutions.

Merge pull request #6348 from notoraptor/debug-info-cudnn-conv-timing
f241dfac · Frédéric Bastien · GitHub · ca5b9581 · 95d29760 · f241dfac
--- a/theano/gpuarray/c_code/dnn_conv_base.c
+++ b/theano/gpuarray/c_code/dnn_conv_base.c
@@ -85,6 +85,34 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
 #line 87 "dnn_conv_base.c"
+#ifdef DEBUG
+#if __cplusplus < 201103L
+const char* const _cppver = "No timing available: C++11 or later is required.";
+#else
+#define DEBUG_TIMING
+#include <chrono>
+const char* const _cppver = NULL;
+struct TheanoTimer {
+    double milliseconds;
+    std::chrono::steady_clock::time_point base;
+    void start() {base = std::chrono::steady_clock::now();}
+    void end() {
+        milliseconds =
+            std::chrono::duration_cast<std::chrono::nanoseconds>(
+                std::chrono::steady_clock::now() - base
+            ).count() / 1000000.0;
+    }
+};
+#endif
+#endif
 pthread_mutex_t  algoMutex;
 AlgoCache        algoCache;

--- a/theano/gpuarray/c_code/dnn_fwd.c
+++ b/theano/gpuarray/c_code/dnn_fwd.c
@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
 prev_algo.mathType = CUDNN_DEFAULT_MATH;
 reuse_algo = 0;
 hash_prefix = std::string("FWD|GPU#");
+#ifdef DEBUG_TIMING
+total_computation_time = 0;
+total_selection_time = 0;
+n_computations = 0;
+n_selections = 0;
+if (PARAMS->choose_algo) {
+    if (PARAMS->choose_time) {
+        selection_name = "fastest";
+    } else {
+        selection_name = "best suited";
+    }
+};
+#endif
 #section support_code_struct
-#line 9 "dnn_fwd.c"
+#line 22 "dnn_fwd.c"
 int     reuse_algo;
 AlgoRec prev_algo;
 std::string hash_prefix;
@@ -15,6 +28,13 @@ std::string hash_prefix;
 #ifdef DEBUG
 char algorithm_name[128];
 #endif
+#ifdef DEBUG_TIMING
+double total_computation_time;
+double total_selection_time;
+size_t n_computations;
+size_t n_selections;
+const char* selection_name;
+#endif
 /** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
@@ -121,6 +141,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  bool use_cached = 0;
+  #ifdef DEBUG
+  if (_cppver) fprintf(stderr, "%s\n", _cppver);
+  #endif
+  #ifdef DEBUG_TIMING
+  TheanoTimer timer;
+  #endif
  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError,
@@ -193,7 +219,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  cuda_enter(c->ctx);
  size_t maxfree = c_get_largest_free_block_size(c);
-  if (PyErr_Occurred()) return 1;
+  if (PyErr_Occurred()) {
+    cuda_exit(c->ctx);
+    return 1;
+  }
  if (params->choose_algo) {
@@ -241,6 +270,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
            o = pygpu_empty(PyGpuArray_NDIM(*output), PyGpuArray_DIMS(*output), (*output)->ga.typecode, GA_C_ORDER, c, Py_None);
        }
+        #ifdef DEBUG_TIMING
+        timer.start();
+        #endif
        // We don't sync the buffer as we don't care about the values.
        err = cudnnFindConvolutionForwardAlgorithmEx(
          params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
@@ -248,6 +280,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
          desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(o),
          1, &count, &choice, *(void **)tmpmem,
          maxfree);
+        #ifdef DEBUG_TIMING
+        timer.end();
+        #endif
        gpudata_release(tmpmem);
        if (beta != 0) {
            Py_XDECREF(o);
@@ -282,10 +317,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
          mathtype = choice.mathType;
 #endif
      } else {
+        #ifdef DEBUG_TIMING
+        timer.start();
+        #endif
        err = cudnnGetConvolutionForwardAlgorithm(
          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
          desc, APPLY_SPECIFIC(output),
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
+        #ifdef DEBUG_TIMING
+        timer.end();
+        #endif
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
@@ -294,6 +335,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
          return 1;
        }
      }
+      #ifdef DEBUG_TIMING
+      total_selection_time += timer.milliseconds;
+      ++n_selections;
+      #endif
    }
  }
@@ -356,7 +401,18 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
            use_cached ? "(cache)": "",
            worksize,
            hashkey.c_str()
-      );
+    );
+#endif
+#ifdef DEBUG_TIMING
+    if (!(reuse_algo || use_cached)) {
+        // We have selected an algorithm at runtime.
+        // `timer` still contains timing about selection step.
+        fprintf(stderr, "\t(selected %s fwd algo in %g milliseconds)\n", selection_name, timer.milliseconds);
+        if (n_selections > 1) {
+            fprintf(stderr, "\t(selected %lu fwd algos in %g milliseconds (average: %g milliseconds per selection))\n",
+                    n_selections, total_selection_time, total_selection_time / n_selections);
+        }
+    }
 #endif
    if (!reuse_algo) {
@@ -377,11 +433,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  {
    gpudata *workspace = 0;
-    /*
-     * This is less than ideal since we need to free it after (which
-     * introduces a synchronization point. But we don't have a module
-     * to place a nice get_work_mem() function in.
-     */
    if (worksize != 0) {
      workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
@@ -391,10 +442,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
      }
    }
+    if (worksize != 0)
+      cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
    cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
+    #ifdef DEBUG_TIMING
+    GpuArray_sync(&(*output)->ga);
+    timer.start();
+    #endif
    for ( int g = 0; g < groups; g++) {
      err = cudnnConvolutionForward(
        params->handle,
@@ -407,14 +465,23 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
        APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
    }
-    if (worksize != 0)
+    if (worksize != 0) {
+      cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
      gpudata_release(workspace);
+    }
    cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
  }
+  #ifdef DEBUG_TIMING
+  GpuArray_sync(&(*output)->ga);
+  timer.end();
+  total_computation_time += timer.milliseconds;
+  ++n_computations;
+  #endif
  cuda_exit(c->ctx);
  if (err != CUDNN_STATUS_SUCCESS) {
@@ -422,6 +489,13 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
 		 cudnnGetErrorString(err));
    return 1;
  }
+  #ifdef DEBUG_TIMING
+  fprintf(stderr, "\t(ran fwd algo in %g milliseconds)\n", timer.milliseconds);
+  if (n_computations > 1) {
+    fprintf(stderr, "\t(ran %lu fwd computations in %g milliseconds (average: %g milliseconds per call))\n",
+            n_computations, total_computation_time, total_computation_time / n_computations);
+  }
+  #endif
  return 0;
 }

--- a/theano/gpuarray/c_code/dnn_gi.c
+++ b/theano/gpuarray/c_code/dnn_gi.c
@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
 prev_algo.mathType = CUDNN_DEFAULT_MATH;
 reuse_algo = 0;
 hash_prefix = std::string("GI|GPU#");
+#ifdef DEBUG_TIMING
+total_computation_time = 0;
+total_selection_time = 0;
+n_computations = 0;
+n_selections = 0;
+if (PARAMS->choose_algo) {
+    if (PARAMS->choose_time) {
+        selection_name = "fastest";
+    } else {
+        selection_name = "best suited";
+    }
+};
+#endif
 #section support_code_struct
-#line 9 "dnn_gi.c"
+#line 22 "dnn_gi.c"
 int     reuse_algo;
 AlgoRec prev_algo;
 std::string hash_prefix;
@@ -15,6 +28,13 @@ std::string hash_prefix;
 #ifdef DEBUG
 char algorithm_name[128];
 #endif
+#ifdef DEBUG_TIMING
+double total_computation_time;
+double total_selection_time;
+size_t n_computations;
+size_t n_selections;
+const char* selection_name;
+#endif
 /** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
@@ -86,6 +106,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  bool use_cached = 0;
+  #ifdef DEBUG
+  if (_cppver) fprintf(stderr, "%s\n", _cppver);
+  #endif
+  #ifdef DEBUG_TIMING
+  TheanoTimer timer;
+  #endif
  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
@@ -159,11 +185,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  std::string hashkey;
-  size_t maxfree = c_get_largest_free_block_size(c);
-  if (PyErr_Occurred()) return 1;
  cuda_enter(c->ctx);
+  size_t maxfree = c_get_largest_free_block_size(c);
+  if (PyErr_Occurred()) {
+    cuda_exit(c->ctx);
+    return 1;
+  }
  if (params->choose_algo) {
    if (!reuse_algo) {
@@ -211,11 +241,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
            ip = pygpu_empty(PyGpuArray_NDIM(*input), PyGpuArray_DIMS(*input), (*input)->ga.typecode, GA_C_ORDER, c, Py_None);
        }
+        #ifdef DEBUG_TIMING
+        timer.start();
+        #endif
        err = cudnnFindConvolutionBackwardDataAlgorithmEx(
          params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
          APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
          APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(ip),
          1, &count, &choice, *(void **)tmpmem, maxfree);
+        #ifdef DEBUG_TIMING
+        timer.end();
+        #endif
        gpudata_release(tmpmem);
        if (beta != 0) {
            Py_XDECREF(ip);
@@ -248,10 +284,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
          mathtype = choice.mathType;
 #endif
      } else {
+        #ifdef DEBUG_TIMING
+        timer.start();
+        #endif
        err = cudnnGetConvolutionBackwardDataAlgorithm(
          params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
          desc, APPLY_SPECIFIC(input),
          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
+        #ifdef DEBUG_TIMING
+        timer.end();
+        #endif
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
                       cudnnGetErrorString(err));
@@ -259,6 +301,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
          return 1;
        }
      }
+      #ifdef DEBUG_TIMING
+      total_selection_time += timer.milliseconds;
+      ++n_selections;
+      #endif
    }
  }
@@ -313,7 +359,18 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
            use_cached ? "(cache)": "",
            worksize,
            hashkey.c_str()
-      );
+    );
+#endif
+#ifdef DEBUG_TIMING
+    if (!(reuse_algo || use_cached)) {
+        // We have selected an algorithm at runtime.
+        // `timer` still contains timing about selection step.
+        fprintf(stderr, "\t(selected %s gradinput algo in %g milliseconds)\n", selection_name, timer.milliseconds);
+        if (n_selections > 1) {
+            fprintf(stderr, "\t(selected %lu gradinput algos in %g milliseconds (average: %g milliseconds per selection))\n",
+                    n_selections, total_selection_time, total_selection_time / n_selections);
+        }
+    }
 #endif
    if (!reuse_algo) {
@@ -342,10 +399,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    }
  }
+  if (worksize != 0)
+    cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
  cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
+  #ifdef DEBUG_TIMING
+  GpuArray_sync(&(*input)->ga);
+  timer.start();
+  #endif
  for ( int g = 0; g < groups; g++) {
    err = cudnnConvolutionBackwardData(
      params->handle,
@@ -357,13 +421,22 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
      APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(*input)) + input_offset * g);
  }
-  if (worksize != 0)
+  if (worksize != 0) {
+    cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
    gpudata_release(workspace);
+  }
  cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
+  #ifdef DEBUG_TIMING
+  GpuArray_sync(&(*input)->ga);
+  timer.end();
+  total_computation_time += timer.milliseconds;
+  ++n_computations;
+  #endif
  cuda_exit(c->ctx);
  if (err != CUDNN_STATUS_SUCCESS) {
@@ -371,5 +444,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                 cudnnGetErrorString(err));
    return 1;
  }
+  #ifdef DEBUG_TIMING
+  fprintf(stderr, "\t(ran gradinput algo in %g milliseconds)\n", timer.milliseconds);
+  if (n_computations > 1) {
+    fprintf(stderr, "\t(ran %lu gradinput computations in %g milliseconds (average: %g milliseconds per call))\n",
+            n_computations, total_computation_time, total_computation_time / n_computations);
+  }
+  #endif
  return 0;
 }
--- a/theano/gpuarray/c_code/dnn_gw.c
+++ b/theano/gpuarray/c_code/dnn_gw.c
@@ -3,9 +3,22 @@ prev_algo.algo = PARAMS->conv_algo;
 prev_algo.mathType = CUDNN_DEFAULT_MATH;
 reuse_algo = 0;
 hash_prefix = std::string("GW|GPU#");
+#ifdef DEBUG_TIMING
+total_computation_time = 0;
+total_selection_time = 0;
+n_computations = 0;
+n_selections = 0;
+if (PARAMS->choose_algo) {
+    if (PARAMS->choose_time) {
+        selection_name = "fastest";
+    } else {
+        selection_name = "best suited";
+    }
+};
+#endif
 #section support_code_struct
-#line 9 "dnn_gw.c"
+#line 22 "dnn_gw.c"
 int     reuse_algo;
 AlgoRec prev_algo;
 std::string hash_prefix;
@@ -15,6 +28,13 @@ std::string hash_prefix;
 #ifdef DEBUG
 char algorithm_name[128];
 #endif
+#ifdef DEBUG_TIMING
+double total_computation_time;
+double total_selection_time;
+size_t n_computations;
+size_t n_selections;
+const char* selection_name;
+#endif
 /** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
@@ -73,6 +93,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  bool use_cached = 0;
+  #ifdef DEBUG
+  if (_cppver) fprintf(stderr, "%s\n", _cppver);
+  #endif
+  #ifdef DEBUG_TIMING
+  TheanoTimer timer;
+  #endif
  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError,
@@ -146,11 +172,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  std::string hashkey ;
-  size_t maxfree = c_get_largest_free_block_size(c);
-  if (PyErr_Occurred()) return 1;
  cuda_enter(c->ctx);
+  size_t maxfree = c_get_largest_free_block_size(c);
+  if (PyErr_Occurred()) {
+    cuda_exit(c->ctx);
+    return 1;
+  }
  if (params->choose_algo) {
    if (!reuse_algo) {
@@ -198,11 +228,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
            k = pygpu_empty(PyGpuArray_NDIM(*kerns), PyGpuArray_DIMS(*kerns), (*kerns)->ga.typecode, GA_C_ORDER, c, Py_None);
        }
+        #ifdef DEBUG_TIMING
+        timer.start();
+        #endif
        err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
          params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
          APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
          APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(k),
          1, &count, &choice, *(void **)tmpmem, maxfree);
+        #ifdef DEBUG_TIMING
+        timer.end();
+        #endif
        gpudata_release(tmpmem);
        if (beta != 0) {
            Py_XDECREF(k);
@@ -237,10 +273,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
          mathtype = choice.mathType;
 #endif
      } else {
+        #ifdef DEBUG_TIMING
+        timer.start();
+        #endif
        err = cudnnGetConvolutionBackwardFilterAlgorithm(
          params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
          desc, APPLY_SPECIFIC(kerns),
          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
+        #ifdef DEBUG_TIMING
+        timer.end();
+        #endif
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,
                       "error selecting convolution algo: %s",
@@ -249,6 +291,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
          return 1;
        }
      }
+      #ifdef DEBUG_TIMING
+      total_selection_time += timer.milliseconds;
+      ++n_selections;
+      #endif
    }
  } /* choose_algo */
@@ -305,6 +351,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
            hashkey.c_str()
     );
 #endif
+#ifdef DEBUG_TIMING
+    if (!(reuse_algo || use_cached)) {
+        // We have selected an algorithm at runtime.
+        // `timer` still contains timing about selection step.
+        fprintf(stderr, "\t(selected %s gradweight algo in %g milliseconds)\n", selection_name, timer.milliseconds);
+        if (n_selections > 1) {
+            fprintf(stderr, "\t(selected %lu gradweight algos in %g milliseconds (average: %g milliseconds per selection))\n",
+                    n_selections, total_selection_time, total_selection_time / n_selections);
+        }
+    }
+#endif
    if (!reuse_algo) {
      // save for next time/cache
@@ -333,10 +390,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    }
  }
+  if (worksize != 0)
+    cuda_wait(workspace, GPUARRAY_CUDA_WAIT_WRITE);
  cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
+  #ifdef DEBUG_TIMING
+  GpuArray_sync(&(*kerns)->ga);
+  timer.start();
+  #endif
  for ( int g = 0; g < groups; g++) {
    err = cudnnConvolutionBackwardFilter(
      params->handle,
@@ -348,13 +412,22 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
      APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(*kerns)) + kern_offset * g);
  }
-  if (worksize != 0)
+  if (worksize != 0) {
+    cuda_record(workspace, GPUARRAY_CUDA_WAIT_WRITE);
    gpudata_release(workspace);
+  }
  cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
+  #ifdef DEBUG_TIMING
+  GpuArray_sync(&(*kerns)->ga);
+  timer.end();
+  total_computation_time += timer.milliseconds;
+  ++n_computations;
+  #endif
  cuda_exit(c->ctx);
  if (err != CUDNN_STATUS_SUCCESS) {
@@ -362,5 +435,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                 cudnnGetErrorString(err));
    return 1;
  }
+  #ifdef DEBUG_TIMING
+  fprintf(stderr, "\t(ran gradweight algo in %g milliseconds)\n", timer.milliseconds);
+  if (n_computations > 1) {
+    fprintf(stderr, "\t(ran %lu gradweight computations in %g milliseconds (average: %g milliseconds per call))\n",
+            n_computations, total_computation_time, total_computation_time / n_computations);
+  }
+  #endif
  return 0;
 }
--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -2639,7 +2639,7 @@ class TestDnnConv2DRuntimeAlgorithms(object):
            filters = theano.tensor.TensorType(dtype, _broadcastable)()
            conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
-            grad_i = theano.tensor.grad(conv.sum(), [inputs])
+            grad_i, = theano.tensor.grad(conv.sum(), [inputs])
            f = theano.function([inputs, filters], grad_i, mode=mode_with_gpu)
            assert 1 == len([node for node in f.maker.fgraph.apply_nodes if isinstance(node.op, dnn.GpuDnnConvGradI)])
            assert not any(isinstance(node.op, dnn.GpuDnnConv) for node in f.maker.fgraph.apply_nodes)
@@ -2649,7 +2649,7 @@ class TestDnnConv2DRuntimeAlgorithms(object):
            else:
                flipped_filters = filters[:, :, ::-1, ::-1]
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs), flipped_filters)
-            grad_i_ref = theano.tensor.grad(conv_ref.sum(), [inputs])
+            grad_i_ref, = theano.tensor.grad(conv_ref.sum(), [inputs])
            f_ref = theano.function([inputs, filters], grad_i_ref, mode='FAST_RUN')
            runtime_shapes = self.runtime_shapes
            if algo in ('time_once', 'guess_once'):
@@ -2677,7 +2677,7 @@ class TestDnnConv2DRuntimeAlgorithms(object):
            filters = theano.tensor.TensorType(dtype, _broadcastable)()
            conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
-            grad_w = theano.tensor.grad(conv.sum(), [filters])
+            grad_w, = theano.tensor.grad(conv.sum(), [filters])
            f = theano.function([inputs, filters], grad_w, mode=mode_with_gpu)
            assert 1 == len([node for node in f.maker.fgraph.apply_nodes if isinstance(node.op, dnn.GpuDnnConvGradW)])
            assert not any(isinstance(node.op, dnn.GpuDnnConv) for node in f.maker.fgraph.apply_nodes)
@@ -2687,7 +2687,7 @@ class TestDnnConv2DRuntimeAlgorithms(object):
            else:
                flipped_filters = filters[:, :, ::-1, ::-1]
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs), flipped_filters)
-            grad_w_ref = theano.tensor.grad(conv_ref.sum(), [filters])
+            grad_w_ref, = theano.tensor.grad(conv_ref.sum(), [filters])
            f_ref = theano.function([inputs, filters], grad_w_ref, mode='FAST_RUN')
            runtime_shapes = self.runtime_shapes
            if algo in ('time_once', 'guess_once'):