提交 2597dcd2 authored 作者: --global's avatar --global

Add support for implementation timing

上级 f9b85e1e
......@@ -350,7 +350,7 @@ class GpuDnnConvDesc(GpuOp):
AddConfigVar('dnn.conv.workmem',
"Default value for the workmem attribute of cudnn convolutions.",
EnumStr('small', 'none', 'large'),
EnumStr('small', 'none', 'large', 'guess', 'time'),
in_c_key=False)
# scalar constants
......@@ -397,7 +397,8 @@ class GpuDnnConv(DnnBase, COp):
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
assert self.workmem in ['none', 'small', 'large']
assert self.workmem in ['none', 'small', 'large', 'fft', 'time',
'guess']
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -417,18 +418,37 @@ class GpuDnnConv(DnnBase, COp):
if self.workmem == 'none':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
choose_alg = '0'
choose_alg_time = '0'
elif self.workmem == 'small':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
choose_alg = '0'
choose_alg_time = '0'
elif self.workmem == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
choose_alg = '0'
choose_alg_time = '0'
elif self.workmem == 'fft':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
choose_alg = '0'
choose_alg_time = '0'
elif self.workmem == 'guess':
# The convolution implementation should be chosen according
# to a heuristic
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
choose_alg = '1'
choose_alg_time = '0'
elif self.workmem == 'time':
alg = "0"
# The convolution implementation should be chosen by timing
# every available implementation
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
choose_alg = '1'
choose_alg_time = '1'
alg_def = ('CONV_ALGO', alg)
alg_choose_def = ('CHOOSE_ALGO', choose_alg)
return [alg_def, alg_choose_def] + inpl_def
alg_choose_time_def = ('CHOOSE_ALGO_TIME', choose_alg_time)
return [alg_def, alg_choose_def, alg_choose_time_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
......
......@@ -42,6 +42,7 @@ APPLY_SPECIFIC(previous_kerns_shape)[0] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[1] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[2] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[3] = 0;
APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
#section cleanup_code_struct
......
......@@ -35,7 +35,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
void *workspace;
cudnnConvolutionFwdAlgo_t chosen_algo;
if (CHOOSE_ALGO){
if (CHOOSE_ALGO)
{
// Check if the input and the kernels have the same shape as they have
// last time the apply node was executed
......@@ -48,7 +49,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
APPLY_SPECIFIC(previous_kerns_shape)[i]);
}
if (same_shapes)
if (!same_shapes)
{
// The shape of the inputs and/or the kernels is different from the
// last execution. Use the current shapes to infer the implementation
......@@ -62,10 +63,32 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
// Obtain a convolution algorithm appropriate for the input and kernel
// shapes
// shapes. Either by choosing one according to heuristics or by making
// CuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME)
{
// Time the different implementations to choose the best one
int requestedCount = 2;
int count;
cudnnConvolutionFwdAlgoPerf_t choosen_algo_perf;
err = cudnnFindConvolutionForwardAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
requestedCount,
&count,
&choosen_algo_perf);
chosen_algo = choosen_algo_perf.algo;
fprintf(stdout, "Choose algo %i\n", chosen_algo);
}
else
{
// Use heuristics to choose the implementation
err = cudnnGetConvolutionForwardAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
......@@ -74,6 +97,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
free,
&chosen_algo);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论