Implement implementation selection for GpuDnnConvGradI

ad9646be · --global · 221fb064 · ad9646be · ad9646be
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -638,16 +638,25 @@ class GpuDnnConvGradI(DnnBase, COp):
    :param descr: the convolution descriptor
    """
-    __props__ = ('inplace',)
+    __props__ = ('workmem', 'inplace',)
    __input_name__ = ('kernel', 'grad', 'output',
                      'descriptor', 'alpha', 'beta')
-    def __init__(self, inplace=False):
+    def __init__(self, inplace=False, workmem=None):
        COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"],
                     "APPLY_SPECIFIC(conv_gi)")
+        if workmem is None:
+            workmem = config.dnn.conv.workmem_bwd
+        self.workmem = workmem
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [2]}
+        assert self.workmem in ['none', 'deterministic', 'fft', 'guess']
+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        if not hasattr(self, 'workmem'):
+            self.workmem = 'none'
    def grad(self, inp, grads):
        kerns, top, output, desc, alpha, beta = inp
@@ -669,9 +678,29 @@ class GpuDnnConvGradI(DnnBase, COp):
    def get_op_params(self):
        if self.inplace:
-            return [('CONV_INPLACE', '1')]
+            inplace_def = [('CONV_INPLACE', '1')]
        else:
-            return []
+            inplace_def = []
+        if version() == -1:
+            alg_def = ('CONV_ALGO', '0')
+        else:
+            if self.workmem == 'none':
+                alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0')
+                alg_choose_def = ('CHOOSE_ALGO', '0')
+            elif self.workmem == 'deterministic':
+                alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1')
+                alg_choose_def = ('CHOOSE_ALGO', '0')
+            elif self.workmem == 'fft':
+                alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT')
+                alg_choose_def = ('CHOOSE_ALGO', '0')
+            elif self.workmem == 'guess':
+                # The convolution implementation should be choosen according
+                # to a heuristic
+                alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0')
+                alg_choose_def = ('CHOOSE_ALGO', '1')
+        return inplace_def + [alg_def, alg_choose_def]
    def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
        kern = as_cuda_ndarray_variable(kern)

--- a/theano/sandbox/cuda/dnn_gi.c
+++ b/theano/sandbox/cuda/dnn_gi.c
@@ -31,14 +31,156 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
  if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
    return 1;
-  err = cudnnConvolutionBackwardData(
+  {
-    _handle,
+    size_t worksize;
-    (void *)&alpha,
+    void *workspace;
-    APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
+    cudnnConvolutionBwdDataAlgo_t chosen_algo;
-    APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
-    desc,
+    if (CHOOSE_ALGO)
-    (void *)&beta,
+    {
-    APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
+      // Check if the kernels and the output have the same shape as they have
+      // last time the apply node was executed
+      bool same_shapes = true;
+      for (int i = 0; (i < 4) && same_shapes; i++)
+      {
+          same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] !=
+                          APPLY_SPECIFIC(previous_kerns_shape)[i]);
+          same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] !=
+                          APPLY_SPECIFIC(previous_output_shape)[i]);
+      }
+      if (!same_shapes)
+      {
+        // The shape of the kernels and/or the output is different from the
+        // last execution. Use the current shapes to infer the implementation
+        // to use from now on.
+        // Get the amount of available memory
+        size_t free = 0, total = 0;
+        cudaError_t err2 = cudaMemGetInfo(&free, &total);
+        if (err2 != cudaSuccess){
+          cudaGetLastError();
+          fprintf(stderr,
+                  "Error when trying to find the memory information"
+                  " on the GPU: %s\n", cudaGetErrorString(err2));
+          return 1;
+        }
+        // Use heuristics to choose the implementation
+        err = cudnnGetConvolutionBackwardDataAlgorithm(_handle,
+                                                       APPLY_SPECIFIC(kerns),
+                                                       APPLY_SPECIFIC(output),
+                                                       desc,
+                                                       APPLY_SPECIFIC(input),
+                                                       CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
+                                                       free,
+                                                       &chosen_algo);
+        if (err != CUDNN_STATUS_SUCCESS) {
+          PyErr_Format(PyExc_RuntimeError,
+                       "GpuDnnConvGradI: error selecting convolution algo: %s",
+                       cudnnGetErrorString(err));
+          return 1;
+        }
+        // Store the shapes of the kernels and output as well as the chosen
+        // algorithm for future use.
+        APPLY_SPECIFIC(previous_bwd_d_algo) = chosen_algo;
+        for (int i = 0; i < 4; i++)
+        {
+            APPLY_SPECIFIC(previous_kerns_shape)[i] =
+                                            CudaNdarray_HOST_DIMS(kerns)[i];
+            APPLY_SPECIFIC(previous_output_shape)[i] =
+                                            CudaNdarray_HOST_DIMS(output)[i];
+        }
+      }
+      else
+      {
+        // The shapes of the kernels and the output are the same as for the
+        // last execution. The convolution algorithm used last time can also
+        // be used here
+        chosen_algo = APPLY_SPECIFIC(previous_bwd_d_algo);
+      }
+    }
+    else
+    {
+        chosen_algo = CONV_ALGO;
+    }
+    // The FFT implementation does not support strides, 1x1 filters or
+    // inputs with a spatial dimension larger than 1024.
+    // If the chosen implementation is FFT, validate that it can be used
+    // on the current data and default on a safe implementation if it
+    // can't.
+    if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
+    {
+      // Extract the properties of the convolution descriptor
+      int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
+      cudnnConvolutionMode_t mode;
+      err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
+                                            &stride_v, &stride_h,
+                                            &upscale_x, &upscale_y,
+                                            &mode);
+      if (err != CUDNN_STATUS_SUCCESS) {
+        PyErr_Format(PyExc_RuntimeError,
+                     "GpuDnnConvGradI: error getting convolution properties: %s",
+                     cudnnGetErrorString(err));
+        return 1;
+      }
+      // Extract the spatial size of the filters
+      int filter_h = CudaNdarray_HOST_DIMS(kerns)[3];
+      int filter_w = CudaNdarray_HOST_DIMS(kerns)[4];
+      // Extract the spatial size of the input
+      int input_h = CudaNdarray_HOST_DIMS(*input)[3];
+      int input_w = CudaNdarray_HOST_DIMS(*input)[4];
+      // Ensure that the selected implementation supports the requested
+      // convolution. Fall back to a safe implementation otherwise.
+      if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
+          input_w > 1024 || (filter_h == 1 && filter_w == 1))
+      {
+        chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+      }
+    }
+    // Infer required workspace size from the chosen implementation
+    err = cudnnGetConvolutionBackwardDataWorkspaceSize(_handle,
+                                                       APPLY_SPECIFIC(kerns),
+                                                       APPLY_SPECIFIC(output),
+                                                       desc,
+                                                       APPLY_SPECIFIC(input),
+                                                       chosen_algo,
+                                                       &worksize);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "GpuDnnConvGradI: error getting worksize: %s",
+                   cudnnGetErrorString(err));
+      return 1;
+    }
+    // Allocate workspace for the convolution
+    workspace = get_work_mem(worksize);
+    if (workspace == NULL && worksize != 0)
+      return 1;
+    // Perform the convolution
+    err = cudnnConvolutionBackwardData_v3(
+      _handle,
+      (void *)&alpha,
+      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
+      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
+      desc,
+      chosen_algo,
+      workspace, worksize,
+      (void *)&beta,
+      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
+  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
                 cudnnGetErrorString(err));