提交 fc1ed897 authored 作者: --global's avatar --global

Modify GpuDnnConvGradI to support timing feature

上级 c4df0b99
......@@ -897,8 +897,8 @@ class GpuDnnConvGradI(DnnBase, COp):
def __init__(self, inplace=False, workmem=None, algo=None):
"""
:param workmem: *deprecated*, use param algo instead
:param algo: either 'none', 'deterministic', 'fft', 'guess_once' or
'guess_on_shape_change'.
:param algo: either 'none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once' or 'time_on_shape_change'.
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
"""
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"],
......@@ -918,7 +918,8 @@ class GpuDnnConvGradI(DnnBase, COp):
if self.inplace:
self.destroy_map = {0: [2]}
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change']
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -954,29 +955,42 @@ class GpuDnnConvGradI(DnnBase, COp):
else:
inplace_def = []
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', '0')
choose_alg = '0'
choose_alg_once = '0'
choose_alg_time = '0'
if version() == -1 or version() < (3000, 3000):
alg_def = ('CONV_ALGO', '0')
alg_choose_def = ('CHOOSE_ALGO', '0')
alg = "0"
else:
if self.algo == 'none':
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0')
alg_choose_def = ('CHOOSE_ALGO', '0')
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
elif self.algo == 'deterministic':
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1')
alg_choose_def = ('CHOOSE_ALGO', '0')
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
elif self.algo == 'fft':
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT')
alg_choose_def = ('CHOOSE_ALGO', '0')
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
elif self.algo in ['guess_once', 'guess_on_shape_change']:
# The convolution implementation should be choosen according
# The convolution implementation should be chosen according
# to a heuristic
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0')
alg_choose_def = ('CHOOSE_ALGO', '1')
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
choose_alg = '1'
if self.algo == 'guess_once':
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', '1')
choose_alg_once = '1'
            elif self.algo in ['time_once', 'time_on_shape_change']:
                # The convolution implementation should be chosen according
                # to timing
                alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
                choose_alg = '1'
                choose_alg_time = '1'
                if self.algo == 'time_once':
                    choose_alg_once = '1'
alg_def = ('CONV_ALGO', alg)
alg_choose_def = ('CHOOSE_ALGO', choose_alg)
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', choose_alg_once)
alg_choose_time_def = ('CHOOSE_ALGO_TIME', choose_alg_time)
return inplace_def + [alg_def, alg_choose_def, alg_choose_once_def]
return inplace_def + [alg_def, alg_choose_def, alg_choose_once_def,
alg_choose_time_def]
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_cuda_ndarray_variable(kern)
......@@ -1031,7 +1045,8 @@ class GpuDnnConv3dGradI(GpuDnnConvGradI):
super(GpuDnnConv3dGradI, self).__init__(inplace=inplace,
algo="none")
assert self.algo in ['none', 'guess_once', 'guess_on_shape_change']
assert self.algo in ['none', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def grad(self, inp, grads):
......
......@@ -42,8 +42,8 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
if (CHOOSE_ALGO)
{
// A new convolution implementation should be selected, based on
// heuristics, if in one of the two following cases :
// A new convolution implementation should be selected, based either on
// timing or heuristics, if in one of the two following cases :
// - The implementation should only be chosen during the first execution
// of an apply node and this is the first execution of the apply node.
// - The implementation should be chosen as often as necessary and the
......@@ -75,35 +75,64 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
// new one based on the shapes of the current inputs
if (!reuse_previous_algo)
{
// Choose the convolution implementation using heuristics based on the
// shapes of the inputs and the amount of memory available.
// Get the amount of available memory
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
// Obtain a convolution algorithm appropriate for the kernel and output
// shapes. Either by choosing one according to heuristics or by making
// CuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME)
{
// Time the different implementations to choose the best one
int requestedCount = 1;
int count;
    cudnnConvolutionBwdDataAlgoPerf_t chosen_algo_perf;
    err = cudnnFindConvolutionBackwardDataAlgorithm(_handle,
                                                    APPLY_SPECIFIC(kerns),
                                                    APPLY_SPECIFIC(output),
                                                    desc,
                                                    APPLY_SPECIFIC(input),
                                                    requestedCount,
                                                    &count,
                                                    &chosen_algo_perf);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConvGradI: error selecting convolution algo: "
                   "%s", cudnnGetErrorString(err));
      return 1;
    }
    chosen_algo = chosen_algo_perf.algo;
}
// Use heuristics to choose the implementation
err = cudnnGetConvolutionBackwardDataAlgorithm(_handle,
APPLY_SPECIFIC(kerns),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
free,
&chosen_algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
else
{
// Choose the convolution implementation using heuristics based on the
// shapes of the inputs and the amount of memory available.
// Get the amount of available memory
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
// Use heuristics to choose the implementation
err = cudnnGetConvolutionBackwardDataAlgorithm(_handle,
APPLY_SPECIFIC(kerns),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
free,
&chosen_algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
}
// Store the shapes of the kernels and output as well as the chosen
......@@ -129,8 +158,8 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
chosen_algo = CONV_ALGO;
}
// The FFT implementation does not support strides, 1x1 filters or
// inputs with a spatial dimension larger than 1024.
// The FFT implementation (only in v3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it
// can't.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论