提交 e6b96472 · 作者:notoraptor

Add DEBUG messages into cuDNN conv codes.

Fix docstring for c_init_code_struct. Update support_code_struct for dnn_gi.
上级 743f7aa9
......@@ -442,7 +442,7 @@ class CLinkerOp(CLinkerObject):
The subclass does not override this method.
"""
raise utils.MethodNotDefined("c_init_code_apply", type(self),
raise utils.MethodNotDefined("c_init_code_struct", type(self),
self.__class__.__name__)
def c_support_code_struct(self, node, name):
......
......@@ -83,6 +83,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx);
......@@ -138,6 +141,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
return 1;
} else {
fprintf(stderr, " (%d best-timed conv fwd algorithms) ", count);
}
#endif
} else {
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
......@@ -156,6 +169,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
algo = prev_algo;
}
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s) ", algorithm_name);
else
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
......@@ -164,15 +188,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
}
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
......@@ -180,14 +195,28 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
!(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
{
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM) ", algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0.
if (cudnnGetVersion() < 6100 &&
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
PyGpuArray_DIM(input, 0) > 65536)
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
{
#ifdef DEBUG
fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
"will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM) ");
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
......@@ -197,6 +226,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
/* NB:
TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
New conditions apply and may depend on number of dimensions (2D or 3D)
e.g. for FFT_TILING.
TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/
if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
......@@ -245,7 +280,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
// TODO: Print a warning
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
fprintf(stderr, "(%s error getting worksize: "
"fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM) ", algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
......
......@@ -13,8 +13,8 @@ if (PARAMS->choose_algo) {
int reuse_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo;
size_t prev_kern_dims[5] = {0};
size_t prev_top_dims[5] = {0};
size_t prev_kern_dims[5];
size_t prev_top_dims[5];
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
......@@ -82,6 +82,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx);
......@@ -178,6 +181,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
return 1;
} else {
fprintf(stderr, " (%d best-timed conv gradinput algorithms) ", count);
}
#endif
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
......@@ -195,6 +208,18 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
algo = prev_algo;
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s) ", algorithm_name);
else
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
......@@ -203,15 +228,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
}
// The FFT implementation does not support strides, 1x1 filters or inputs
......
......@@ -83,6 +83,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx);
......@@ -180,6 +183,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
return 1;
} else {
fprintf(stderr, " (%d best-timed conv gradweight algorithms) ", count);
}
#endif
} else {
err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
......@@ -198,6 +211,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = prev_algo;
}
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s) ", algorithm_name);
else
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
......@@ -206,15 +230,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
}
// The FFT implementation does not support strides, 1x1 filters or inputs
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论