提交 8c7ab092 authored 作者: --global's avatar --global

Add 'guess_once' option for DnnConv3dGrad algo selection

上级 89199807
...@@ -348,7 +348,7 @@ AddConfigVar('dnn.conv.workmem', ...@@ -348,7 +348,7 @@ AddConfigVar('dnn.conv.workmem',
AddConfigVar('dnn.conv.workmem_bwd', AddConfigVar('dnn.conv.workmem_bwd',
"Default value for the workmem attribute of cudnn gradient " "Default value for the workmem attribute of cudnn gradient "
"convolutions.", "convolutions.",
EnumStr('none', 'deterministic', 'fft', 'guess'), EnumStr('none', 'deterministic', 'fft', 'guess', 'guess_once'),
in_c_key=False) in_c_key=False)
...@@ -665,7 +665,8 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -665,7 +665,8 @@ class GpuDnnConvGradW(DnnBase, COp):
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
assert self.workmem in ['none', 'deterministic', 'fft', 'guess'] assert self.workmem in ['none', 'deterministic', 'fft', 'guess',
'guess_once']
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -698,6 +699,7 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -698,6 +699,7 @@ class GpuDnnConvGradW(DnnBase, COp):
else: else:
inplace_def = [] inplace_def = []
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', '0')
if version() == -1 or version() < (3000, 3000): if version() == -1 or version() < (3000, 3000):
alg_def = ('CONV_ALGO', '0') alg_def = ('CONV_ALGO', '0')
alg_choose_def = ('CHOOSE_ALGO', '0') alg_choose_def = ('CHOOSE_ALGO', '0')
...@@ -711,13 +713,15 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -711,13 +713,15 @@ class GpuDnnConvGradW(DnnBase, COp):
elif self.workmem == 'fft': elif self.workmem == 'fft':
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT') alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT')
alg_choose_def = ('CHOOSE_ALGO', '0') alg_choose_def = ('CHOOSE_ALGO', '0')
elif self.workmem == 'guess': elif self.workmem in ['guess', 'guess_once']:
# The convolution implementation should be chosen according # The convolution implementation should be chosen according
# to a heuristic # to a heuristic
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0') alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0')
alg_choose_def = ('CHOOSE_ALGO', '1') alg_choose_def = ('CHOOSE_ALGO', '1')
if self.workmem == 'guess_once':
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', '1')
return inplace_def + [alg_def, alg_choose_def] return inplace_def + [alg_def, alg_choose_def, alg_choose_once_def]
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None): def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
...@@ -757,7 +761,7 @@ class GpuDnnConv3dGradW(GpuDnnConvGradW): ...@@ -757,7 +761,7 @@ class GpuDnnConv3dGradW(GpuDnnConvGradW):
def __init__(self, inplace=False, workmem=None): def __init__(self, inplace=False, workmem=None):
super(GpuDnnConv3dGradW, self).__init__(inplace=inplace, workmem='none') super(GpuDnnConv3dGradW, self).__init__(inplace=inplace, workmem='none')
assert self.workmem in ['none', 'time','guess'] assert self.workmem in ['none', 'time','guess', 'guess_once']
def grad(self, inp, grads): def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp img, top, output, desc, alpha, beta = inp
...@@ -818,7 +822,8 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -818,7 +822,8 @@ class GpuDnnConvGradI(DnnBase, COp):
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
assert self.workmem in ['none', 'deterministic', 'fft', 'guess'] assert self.workmem in ['none', 'deterministic', 'fft', 'guess',
'guess_once']
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -849,6 +854,7 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -849,6 +854,7 @@ class GpuDnnConvGradI(DnnBase, COp):
else: else:
inplace_def = [] inplace_def = []
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', '0')
if version() == -1 or version() < (3000, 3000): if version() == -1 or version() < (3000, 3000):
alg_def = ('CONV_ALGO', '0') alg_def = ('CONV_ALGO', '0')
alg_choose_def = ('CHOOSE_ALGO', '0') alg_choose_def = ('CHOOSE_ALGO', '0')
...@@ -862,13 +868,15 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -862,13 +868,15 @@ class GpuDnnConvGradI(DnnBase, COp):
elif self.workmem == 'fft': elif self.workmem == 'fft':
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT') alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT')
alg_choose_def = ('CHOOSE_ALGO', '0') alg_choose_def = ('CHOOSE_ALGO', '0')
elif self.workmem == 'guess': elif self.workmem in ['guess', 'guess_once']:
# The convolution implementation should be chosen according # The convolution implementation should be chosen according
# to a heuristic # to a heuristic
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0') alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0')
alg_choose_def = ('CHOOSE_ALGO', '1') alg_choose_def = ('CHOOSE_ALGO', '1')
if self.workmem == 'guess_once':
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', '1')
return inplace_def + [alg_def, alg_choose_def] return inplace_def + [alg_def, alg_choose_def, alg_choose_once_def]
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None): def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
...@@ -913,7 +921,7 @@ class GpuDnnConv3dGradI(GpuDnnConvGradI): ...@@ -913,7 +921,7 @@ class GpuDnnConv3dGradI(GpuDnnConvGradI):
if workmem == None: if workmem == None:
workmem = 'none' workmem = 'none'
super(GpuDnnConv3dGradI, self).__init__(inplace, workmem) super(GpuDnnConv3dGradI, self).__init__(inplace, workmem)
assert self.workmem in ['none', 'time','guess'] assert self.workmem in ['none', 'time', 'guess', 'guess_once']
def grad(self, inp, grads): def grad(self, inp, grads):
......
...@@ -41,22 +41,42 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -41,22 +41,42 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
if (CHOOSE_ALGO) if (CHOOSE_ALGO)
{ {
// Check if the kernels and the output have the same shape as they have
// last time the apply node was executed // A new convolution implementation should be selected, based on
bool same_shapes = true; // heuristics, if in one of the two following cases :
for (int i = 0; (i < nb_dim) && same_shapes; i++) // - The implementation should only be chosen during the first execution
// of an apply node and this is the first execution of the apply node.
// - The implementation should be chosen as often as necessary and the
// shapes of the inputs differ from the last time an implementation
// was chosen.
bool reuse_previous_algo;
if (CHOOSE_ALGO_ONCE)
{
// Only choose a new implementation if none has been chosen before.
reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
}
else
{ {
same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] == // Reuse the previous implementation if the kernels and the outputs
APPLY_SPECIFIC(previous_kerns_shape)[i]); // have the same shapes as they had when the previous implementation
same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] == // was selected
APPLY_SPECIFIC(previous_output_shape)[i]); bool same_shapes = true;
for (int i = 0; (i < nb_dim) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
APPLY_SPECIFIC(previous_kerns_shape)[i]);
same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
APPLY_SPECIFIC(previous_output_shape)[i]);
}
reuse_previous_algo = same_shapes;
} }
if (!same_shapes) // If the previously chosen implementation can't be reused, select a
// new one based on the shapes of the current inputs
if (!reuse_previous_algo)
{ {
// The shape of the kernels and/or the output is different from the // Choose the convolution implementation using heuristics based on the
// last execution. Use the current shapes to infer the implementation // shapes of the inputs and the amount of memory available.
// to use from now on.
// Get the amount of available memory // Get the amount of available memory
size_t free = 0, total = 0; size_t free = 0, total = 0;
...@@ -100,9 +120,7 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -100,9 +120,7 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
} }
else else
{ {
// The shapes of the kernels and the output are the same as for the // Reuse the previously chosen convlution implementation
// last execution. The convolution algorithm used last time can also
// be used here
chosen_algo = APPLY_SPECIFIC(previous_bwd_d_algo); chosen_algo = APPLY_SPECIFIC(previous_bwd_d_algo);
} }
} }
......
...@@ -41,22 +41,42 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -41,22 +41,42 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
if (CHOOSE_ALGO) if (CHOOSE_ALGO)
{ {
// Check if the input and the output have the same shape as they have
// last time the apply node was executed // A new convolution implementation should be selected, based on
bool same_shapes = true; // heuristics, if in one of the two following cases :
for (int i = 0; (i < nb_dim) && same_shapes; i++) // - The implementation should only be chosen during the first execution
// of an apply node and this is the first execution of the apply node.
// - The implementation should be chosen as often as necessary and the
// shapes of the inputs differ from the last time an implementation
// was chosen.
bool reuse_previous_algo;
if (CHOOSE_ALGO_ONCE)
{
// Only choose a new implementation if none has been chosen before.
reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
}
else
{ {
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] == // Reuse the previous implementation if the kernels and the outputs
APPLY_SPECIFIC(previous_input_shape)[i]); // have the same shapes as they had when the previous implementation
same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] == // was selected
APPLY_SPECIFIC(previous_output_shape)[i]); bool same_shapes = true;
for (int i = 0; (i < nb_dim) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
APPLY_SPECIFIC(previous_input_shape)[i]);
same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
APPLY_SPECIFIC(previous_output_shape)[i]);
}
reuse_previous_algo = same_shapes;
} }
if (!same_shapes) // If the previously chosen implementation can't be reused, select a
// new one based on the shapes of the current inputs
if (!reuse_previous_algo)
{ {
// The shape of the inputs and/or the output is different from the // Choose the convolution implementation using heuristics based on the
// last execution. Use the current shapes to infer the implementation // shapes of the inputs and the amount of memory available.
// to use from now on.
// Get the amount of available memory // Get the amount of available memory
size_t free = 0, total = 0; size_t free = 0, total = 0;
...@@ -100,9 +120,7 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -100,9 +120,7 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
} }
else else
{ {
// The shapes of the input and the output are the same as for the // Reuse the previously chosen convolution implementation
// last execution. The convolution algorithm used last time can also
// be used here
chosen_algo = APPLY_SPECIFIC(previous_bwd_f_algo); chosen_algo = APPLY_SPECIFIC(previous_bwd_f_algo);
} }
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论