Add 'time_once' and 'guess_once' options for DnnConv3d algo selection

89199807 · --global · 680d2034 · 89199807 · 89199807 · 89199807
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -341,7 +341,8 @@ class GpuDnnConvDesc(GpuOp):
 AddConfigVar('dnn.conv.workmem',
             "Default value for the workmem attribute of cudnn convolutions.",
-             EnumStr('small', 'none', 'large', 'fft', 'guess', 'time'),
+             EnumStr('small', 'none', 'large', 'fft', 'guess',
+                     'guess_once', 'time', 'time_once'),
             in_c_key=False)
 AddConfigVar('dnn.conv.workmem_bwd',
@@ -399,8 +400,9 @@ class GpuDnnConv(DnnBase, COp):
    def __init__(self, workmem=None, inplace=False):
        """
-        :param workmem: either 'none', 'small', 'large', 'fft', 'time' or
+        :param workmem: either 'none', 'small', 'large', 'fft', 'time',
-        'guess'. Default is the value of :attr:`config.dnn.conv.workmem`.
+        'time_once', 'guess' or 'guess_once'. Default is the value of
+        :attr:`config.dnn.conv.workmem`.
        """
        COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
                     "APPLY_SPECIFIC(conv_fwd)")
@@ -421,7 +423,7 @@ class GpuDnnConv(DnnBase, COp):
                raise RuntimeError("CuDNN convolution timing requires CuDNN v3")
        assert self.workmem in ['none', 'small', 'large', 'fft', 'time',
-                                'guess']
+                                'time_once', 'guess', 'guess_once']
    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -437,6 +439,7 @@ class GpuDnnConv(DnnBase, COp):
            inpl_def = []
        choose_alg = '0'
+        choose_alg_once = '0'
        choose_alg_time = '0'
        if version() == -1:
            alg = "0"
@@ -449,23 +452,29 @@ class GpuDnnConv(DnnBase, COp):
                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
            elif self.workmem == 'fft':
                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
-            elif self.workmem == 'guess':
+            elif self.workmem in ['guess', 'guess_once']:
                # The convolution implementation should be choosen according
                # to a heuristic
                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
                choose_alg = '1'
-            elif self.workmem == 'time':
+                if self.workmem == 'guess_once':
+                    choose_alg_once = '1'
+            elif self.workmem in ['time', 'time_once']:
                # The convolution implementation should be choosen by timing
                # every available implementation
                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
                choose_alg = '1'
                choose_alg_time = '1'
+                if self.workmem == 'time_once':
+                    choose_alg_once = '1'
        alg_def = ('CONV_ALGO', alg)
        alg_choose_def = ('CHOOSE_ALGO', choose_alg)
+        alg_choose_once_def = ('CHOOSE_ALGO_ONCE', choose_alg_once)
        alg_choose_time_def = ('CHOOSE_ALGO_TIME', choose_alg_time)
-        return [alg_def, alg_choose_def, alg_choose_time_def] + inpl_def
+        return [alg_def, alg_choose_def, alg_choose_once_def,
+                alg_choose_time_def] + inpl_def
    def make_node(self, img, kern, output, desc, alpha=None, beta=None):
        img = as_cuda_ndarray_variable(img)
@@ -556,11 +565,13 @@ class GpuDnnConv3d(GpuDnnConv):
    def __init__(self, workmem=None, inplace=False):
        """
-        :param workmem: either 'none', 'time' or 'guess'.
+        :param workmem: either 'none', 'time', 'time_once', 'guess' or
+                        'guess_once'.
        Default is the value of :attr:`config.dnn.conv.workmem`.
        """
        super(GpuDnnConv3d, self).__init__(workmem='guess', inplace=inplace)
-        assert self.workmem in ['none', 'time','guess']
+        assert self.workmem in ['none', 'time', 'time_once', 'guess',
+                                'guess_once']
    def make_node(self, img, kern, output, desc, alpha=None, beta=None):

--- a/theano/sandbox/cuda/dnn_conv_base.c
+++ b/theano/sandbox/cuda/dnn_conv_base.c
@@ -13,6 +13,7 @@ at V3.
 int APPLY_SPECIFIC(previous_input_shape)[5];
 int APPLY_SPECIFIC(previous_kerns_shape)[5];
 int APPLY_SPECIFIC(previous_output_shape)[5];
+bool APPLY_SPECIFIC(previous_algo_set);
 cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
 #if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
@@ -49,6 +50,8 @@ for (int i = 0; i < 5; i++)
  APPLY_SPECIFIC(previous_output_shape)[i] = 0;
 }
+APPLY_SPECIFIC(previous_algo_set) = false;
 // Select default implementations for the case where the convolution
 // implementations should be selected based on the size of the data.
 APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

--- a/theano/sandbox/cuda/dnn_fwd.c
+++ b/theano/sandbox/cuda/dnn_fwd.c
@@ -41,34 +41,40 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
    if (CHOOSE_ALGO)
    {
-      // Check if the input and the kernels have the same shape as they have
-      // last time the apply node was executed
+      // A new convolution implementation should be selected, based either on
-      bool same_shapes = true;
+      // timing or heuristics, in one of the two following cases:
-      for (int i = 0; (i < nb_dim) && same_shapes; i++)
+      // - The implementation should only be chosen during the first execution
+      //   of an apply node and this is the first execution of the apply node.
+      // - The implementation should be chosen as often as necessary and the
+      //   shapes of the inputs differ from the last time an implementation
+      //   was chosen.
+      bool reuse_previous_algo;
+      if (CHOOSE_ALGO_ONCE)
+      {
+        // Only choose a new implementation if none has been chosen before.
+        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
+      }
+      else
      {
+        // Reuse the previous implementation if the inputs and the kernels
+        // have the same shapes as they had when the previous implementation
+        // was selected
+        bool same_shapes = true;
+        for (int i = 0; (i < nb_dim) && same_shapes; i++)
+        {
          same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
                          APPLY_SPECIFIC(previous_input_shape)[i]);
          same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
                          APPLY_SPECIFIC(previous_kerns_shape)[i]);
+        }
+        reuse_previous_algo = same_shapes;
      }
-      if (!same_shapes)
+      // If the previously chosen implementation can't be reused, select a
+      // new one based on the shapes of the current inputs
+      if (!reuse_previous_algo)
      {
-        // The shape of the inputs and/or the kernels is different from the
-        // last execution. Use the current shapes to infer the implementation
-        // to use from now on.
-        // Get the amount of available memory
-        size_t free = 0, total = 0;
-        cudaError_t err2 = cudaMemGetInfo(&free, &total);
-        if (err2 != cudaSuccess){
-          cudaGetLastError();
-          fprintf(stderr,
-                  "Error when trying to find the memory information"
-                  " on the GPU: %s\n", cudaGetErrorString(err2));
-          return 1;
-        }
        // Obtain a convolution algorithm appropriate for the input and kernel
        // shapes. Either by choosing one according to heuristics or by making
@@ -100,6 +106,20 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
        }
        else
        {
+          // The implementation should be chosen using heuristics based on the
+          // input shapes and the amount of memory available.
+          // Get the amount of available memory
+          size_t free = 0, total = 0;
+          cudaError_t err2 = cudaMemGetInfo(&free, &total);
+          if (err2 != cudaSuccess){
+            cudaGetLastError();
+            fprintf(stderr,
+                    "Error when trying to find the memory information"
+                    " on the GPU: %s\n", cudaGetErrorString(err2));
+            return 1;
+          }
          // Use heuristics to choose the implementation
          err = cudnnGetConvolutionForwardAlgorithm(_handle,
                                                    APPLY_SPECIFIC(input),
@@ -131,9 +151,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
      }
      else
      {
-          // The shapes of the inputs and the kernels are the same as for the
+          // Reuse the previously chosen convolution implementation
-          // last execution. The convolution algorithm used last time can also
-          // be used here
          chosen_algo = APPLY_SPECIFIC(previous_algo);
      }
    }