Add support for implementation timing

2597dcd2 · --global · f9b85e1e · 2597dcd2 · 2597dcd2 · 2597dcd2
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -350,7 +350,7 @@ class GpuDnnConvDesc(GpuOp):
 AddConfigVar('dnn.conv.workmem',
              "Default value for the workmem attribute of cudnn convolutions.",
-             EnumStr('small', 'none', 'large'),
+             EnumStr('small', 'none', 'large', 'fft', 'guess', 'time'),
              in_c_key=False)
 # scalar constants
@@ -397,7 +397,8 @@ class GpuDnnConv(DnnBase, COp):
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [2]}
-        assert self.workmem in ['none', 'small', 'large']
+        assert self.workmem in ['none', 'small', 'large', 'fft', 'time',
+                                'guess']
    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -417,18 +418,37 @@ class GpuDnnConv(DnnBase, COp):
            if self.workmem == 'none':
                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
                choose_alg = '0'
+                choose_alg_time = '0'
            elif self.workmem == 'small':
                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
                choose_alg = '0'
+                choose_alg_time = '0'
            elif self.workmem == 'large':
                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
                choose_alg = '0'
+                choose_alg_time = '0'
+            elif self.workmem == 'fft':
+                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
+                choose_alg = '0'
+                choose_alg_time = '0'
+            elif self.workmem == 'guess':
+                # The convolution implementation should be chosen according
+                # to a heuristic
+                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
+                choose_alg = '1'
+                choose_alg_time = '0'
            elif self.workmem == 'time':
-                alg = "0"
+                # The convolution implementation should be chosen by timing
+                # every available implementation
+                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
                choose_alg = '1'
+                choose_alg_time = '1'
            alg_def = ('CONV_ALGO', alg)
            alg_choose_def = ('CHOOSE_ALGO', choose_alg)
-        return [alg_def, alg_choose_def] + inpl_def
+            alg_choose_time_def = ('CHOOSE_ALGO_TIME', choose_alg_time)
+        return [alg_def, alg_choose_def, alg_choose_time_def] + inpl_def
    def make_node(self, img, kern, output, desc, alpha=None, beta=None):
        img = as_cuda_ndarray_variable(img)

--- a/theano/sandbox/cuda/dnn_conv_base.c
+++ b/theano/sandbox/cuda/dnn_conv_base.c
@@ -42,6 +42,7 @@ APPLY_SPECIFIC(previous_kerns_shape)[0] = 0;
 APPLY_SPECIFIC(previous_kerns_shape)[1] = 0;
 APPLY_SPECIFIC(previous_kerns_shape)[2] = 0;
 APPLY_SPECIFIC(previous_kerns_shape)[3] = 0;
+APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
 #section cleanup_code_struct

--- a/theano/sandbox/cuda/dnn_fwd.c
+++ b/theano/sandbox/cuda/dnn_fwd.c
@@ -7,7 +7,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
-		    "GpuDnnConv images and kernel must have the same stack size\n");
+                    "GpuDnnConv images and kernel must have the same stack size\n");
    return 1;
  }
@@ -35,7 +35,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
    void *workspace;
    cudnnConvolutionFwdAlgo_t chosen_algo;
-    if (CHOOSE_ALGO){
+    if (CHOOSE_ALGO)
+    {
      // Check if the input and the kernels have the same shape as they have
      // last time the apply node was executed
@@ -48,7 +49,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
                          APPLY_SPECIFIC(previous_kerns_shape)[i]);
      }
-      if (same_shapes)
+      if (!same_shapes)
      {
        // The shape of the inputs and/or the kernels is different from the
        // last execution. Use the current shapes to infer the implementation
@@ -62,18 +63,41 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
          fprintf(stderr,
                  "Error when trying to find the memory information"
                  " on the GPU: %s\n", cudaGetErrorString(err2));
+          return 1;
        }
        // Obtain a convolution algorithm appropriate for the input and kernel
-        // shapes
-        err = cudnnGetConvolutionForwardAlgorithm(_handle,
-                                                  APPLY_SPECIFIC(input),
-                                                  APPLY_SPECIFIC(kerns),
-                                                  desc,
-                                                  APPLY_SPECIFIC(output),
-                                                  CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-                                                  free,
-                                                  &chosen_algo);
+        // shapes. Either by choosing one according to heuristics or by making
+        // CuDNN time every implementation and choose the best one.
+        if (CHOOSE_ALGO_TIME)
+        {
+          // Time the different implementations to choose the best one
+          int requestedCount = 1;  // the perf buffer below holds exactly one result
+          int count;
+          cudnnConvolutionFwdAlgoPerf_t choosen_algo_perf;
+          err = cudnnFindConvolutionForwardAlgorithm(_handle,
+                                                     APPLY_SPECIFIC(input),
+                                                     APPLY_SPECIFIC(kerns),
+                                                     desc,
+                                                     APPLY_SPECIFIC(output),
+                                                     requestedCount,
+                                                     &count,
+                                                     &choosen_algo_perf);
+          chosen_algo = choosen_algo_perf.algo;
+          fprintf(stdout, "Choose algo %i\n", chosen_algo);
+        }
+        else
+        {
+          // Use heuristics to choose the implementation
+          err = cudnnGetConvolutionForwardAlgorithm(_handle,
+                                                    APPLY_SPECIFIC(input),
+                                                    APPLY_SPECIFIC(kerns),
+                                                    desc,
+                                                    APPLY_SPECIFIC(output),
+                                                    CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+                                                    free,
+                                                    &chosen_algo);
+        }
        if (err != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError,