提交 221fb064 authored 作者: --global's avatar --global

Integrate v3 in GpuDnnConvGradW

上级 813bc1e9
......@@ -539,18 +539,24 @@ class GpuDnnConvGradW(DnnBase, COp):
:param descr: the convolution descriptor
"""
# Hashable properties of the op; `workmem` participates so ops built with
# different workspace strategies compare (and cache) as distinct.
__props__ = ('workmem', 'inplace',)
__input_name__ = ('image', 'grad', 'output', 'descriptor', 'alpha', 'beta')
def __init__(self, inplace=False, workmem=None):
    """Gradient of a cuDNN convolution with respect to the weights.

    :param inplace: if True, the output buffer (input 2) is overwritten
        with the result (declared via ``destroy_map``).
    :param workmem: one of 'none', 'deterministic', 'fft' or 'guess',
        selecting the cuDNN backward-filter algorithm. Defaults to
        ``config.dnn.conv.workmem_bwd``.
    """
    COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"],
                 "APPLY_SPECIFIC(conv_gw)")
    if workmem is None:
        workmem = config.dnn.conv.workmem_bwd
    # Validate before storing so an invalid value never leaves a
    # half-initialized op behind.
    assert workmem in ['none', 'deterministic', 'fft', 'guess']
    self.workmem = workmem
    self.inplace = inplace
    if self.inplace:
        # Output 0 reuses (destroys) input 2, the preallocated buffer.
        self.destroy_map = {0: [2]}
def __setstate__(self, d):
    """Restore pickled state, filling in attributes that did not exist
    when older pickles of this op were created."""
    self.__dict__.update(d)
    # Defaults for attributes added after old pickles were written.
    for attr, default in (('workmem', 'none'), ('inplace', False)):
        if not hasattr(self, attr):
            setattr(self, attr, default)
......@@ -574,9 +580,29 @@ class GpuDnnConvGradW(DnnBase, COp):
def get_op_params(self):
    """Return the C ``#define`` pairs that configure ``dnn_gw.c``.

    Emits ``CONV_INPLACE`` when operating in place, plus ``CONV_ALGO``
    (the cuDNN backward-filter algorithm constant) and ``CHOOSE_ALGO``
    (1 means the C code picks an algorithm at runtime via cuDNN's
    heuristic, 0 means use ``CONV_ALGO`` as given).
    """
    if self.inplace:
        inplace_def = [('CONV_INPLACE', '1')]
    else:
        inplace_def = []

    if version() == -1:
        # Old cuDNN without algorithm selection support.
        alg_def = ('CONV_ALGO', '0')
        # BUG FIX: alg_choose_def was left unbound on this path, which
        # raised NameError at the return below.
        alg_choose_def = ('CHOOSE_ALGO', '0')
    elif self.workmem == 'none':
        alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0')
        alg_choose_def = ('CHOOSE_ALGO', '0')
    elif self.workmem == 'deterministic':
        alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1')
        alg_choose_def = ('CHOOSE_ALGO', '0')
    elif self.workmem == 'fft':
        alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT')
        alg_choose_def = ('CHOOSE_ALGO', '0')
    else:
        # 'guess': the convolution implementation should be chosen at
        # runtime according to cuDNN's heuristic.
        assert self.workmem == 'guess'
        alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0')
        alg_choose_def = ('CHOOSE_ALGO', '1')

    return inplace_def + [alg_def, alg_choose_def]
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
......
#section support_code_struct
int
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **kerns) {
......@@ -8,7 +8,7 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size\n");
"GpuDnnConv images and kernel must have the same stack size\n");
return 1;
}
......@@ -31,14 +31,110 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
/* v3 path: select an algorithm, size and obtain a workspace, then run the
 * backward-filter convolution. (The superseded pre-v3
 * cudnnConvolutionBackwardFilter call that used to precede this block is
 * removed — with it present the convolution ran twice.) */
{
  size_t worksize;
  void *workspace;
  cudnnConvolutionBwdFilterAlgo_t chosen_algo;

  if (CHOOSE_ALGO)
  {
    // Runtime selection: re-run the cuDNN heuristic whenever the input or
    // output shape differs from the previous execution of this apply node.
    bool same_shapes = true;
    for (int i = 0; (i < 4) && same_shapes; i++)
    {
      // BUG FIX: these comparisons must test equality; the original used
      // `!=`, which reused a stale cached algorithm exactly when the
      // shapes had changed.
      same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
                      APPLY_SPECIFIC(previous_input_shape)[i]);
      same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
                      APPLY_SPECIFIC(previous_output_shape)[i]);
    }

    if (!same_shapes)
    {
      // The shapes differ from the last execution. Query the free GPU
      // memory so the heuristic can pick the fastest algorithm that fits,
      // then cache the choice together with the new shapes.
      size_t free = 0, total = 0;
      cudaError_t err2 = cudaMemGetInfo(&free, &total);
      if (err2 != cudaSuccess){
        cudaGetLastError();
        fprintf(stderr,
                "Error when trying to find the memory information"
                " on the GPU: %s\n", cudaGetErrorString(err2));
        return 1;
      }

      // Use heuristics to choose the implementation.
      err = cudnnGetConvolutionBackwardFilterAlgorithm(_handle,
                                          APPLY_SPECIFIC(input),
                                          APPLY_SPECIFIC(output),
                                          desc,
                                          APPLY_SPECIFIC(kerns),
                                          CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
                                          free,
                                          &chosen_algo);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConvGradW: error selecting convolution algo: %s",
                     cudnnGetErrorString(err));
        return 1;
      }

      // Store the shapes of the inputs and outputs as well as the chosen
      // algorithm for future use.
      APPLY_SPECIFIC(previous_bwd_f_algo) = chosen_algo;
      for (int i = 0; i < 4; i++)
      {
        APPLY_SPECIFIC(previous_input_shape)[i] =
                                        CudaNdarray_HOST_DIMS(input)[i];
        APPLY_SPECIFIC(previous_output_shape)[i] =
                                        CudaNdarray_HOST_DIMS(output)[i];
      }
    }
    else
    {
      // BUG FIX: same shapes as last run — reuse the cached algorithm.
      // The original attached the CONV_ALGO assignment to this branch,
      // leaving `chosen_algo` uninitialized when CHOOSE_ALGO was 0.
      chosen_algo = APPLY_SPECIFIC(previous_bwd_f_algo);
    }
  }
  else
  {
    // Static selection made on the Python side via the `workmem` param.
    chosen_algo = CONV_ALGO;
  }

  // Infer the required workspace size from the chosen implementation.
  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(_handle,
                                                       APPLY_SPECIFIC(input),
                                                       APPLY_SPECIFIC(output),
                                                       desc,
                                                       APPLY_SPECIFIC(kerns),
                                                       chosen_algo,
                                                       &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuDnnConvGradW: error getting worksize: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  // Allocate workspace for the convolution.
  workspace = get_work_mem(worksize);
  if (workspace == NULL && worksize != 0)
    return 1;

  // Perform the convolution.
  // BUG FIX: pass `workspace` (void *), not `&workspace` (void **) — the
  // v3 API expects the device buffer itself, not the address of the local.
  err = cudnnConvolutionBackwardFilter_v3(
    _handle,
    (void *)&alpha,
    APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
    APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
    desc,
    chosen_algo,
    workspace, worksize,
    (void *)&beta,
    APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
cudnnGetErrorString(err));
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论