Merge pull request #2542 from abergeron/cudnn_r2_alg

Enable algorithm selection in cudnn convolution

Merge pull request #2542 from abergeron/cudnn_r2_alg
9147b12e · Frédéric Bastien · f120de51 · 009733e0 · 9147b12e · 9147b12e
--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -484,6 +484,25 @@ import theano and print the config variable, as in:

    A directory with bin/, lib/, include/ folders containing cuda utilities.

+.. attribute:: config.dnn.conv.workmem
+
+   String value: 'none', 'small', 'large'
+
+   Default: 'small'
+
+   The default amount of working memory that the cuDNN convolution
+   implementation is allowed to use.
+
+   'none'
+     Don't allow any extra memory.
+
+   'small'
+     Allow extra memory that is much smaller than the input sizes.
+
+   'large'
+     Allow extra memory that is on the order of the input sizes.
+
+
 .. attribute:: config.gcc.cxxflags

    Default: ""

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -1146,10 +1146,27 @@ class COp(Op):
                raise ValueError("No valid section marker was found in file "
                                 "%s" % self.func_files[i])

+    def get_op_params(self):
+        """
+        Return a list of (name, value) pairs that will be turned into
+        macros for use within the op code. This is intended to allow
+        an op's properties to influence the generated C code.
+
+        Each name must be a string that is not a C keyword, and each
+        value must be a string holding the literal C text that the
+        macro stands for.
+
+        This base implementation defines no macros; subclasses override
+        it to supply their own pairs.
+        """
+        # No op-specific macros by default.
+        return []
+
    def c_code_cache_version(self):
        return hash(tuple(self.func_codes))

-    c_init_code = simple_meth('init_code')
+    def c_init_code(self):
+        if 'init_code' in self.code_sections:
+            return [self.code_sections['init_code']]
+        else:
+            raise utils.MethodNotDefined(
+                'c_init_code', type(self), type(self).__name__)
+
    c_init_code_apply = apply_meth('init_code_apply')
    c_support_code = simple_meth('support_code')
    c_support_code_apply = apply_meth('support_code_apply')
@@ -1208,6 +1225,10 @@ class COp(Op):
                                                "str##_%s" % name))
        undef_macros.append(undef_template % "APPLY_SPECIFIC")

+        for n, v in self.get_op_params():
+            define_macros.append(define_template % (n, v))
+            undef_macros.append(undef_template % (n,))
+
        return os.linesep.join(define_macros), os.linesep.join(undef_macros)

    def _lquote_macro(self, txt):

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -298,6 +298,25 @@ outstanding_mallocs(PyObject* self, PyObject * args)
    return PyInt_FromLong(_outstanding_mallocs[0]);
 }

+
+/* Shared scratch buffer handed out by get_work_mem, and its size in bytes. */
+static void *work_mem = NULL;
+static size_t work_size = 0;
+
+/*
+ * Returns a chunk of memory for temporary work inside of an op. You can only
+ * request a single chunk of memory at a time since it is reused: a later call
+ * may free the buffer that an earlier call returned.
+ *
+ * The current buffer is returned unchanged whenever it holds at least sz
+ * bytes; otherwise it is freed and a new buffer of sz bytes is allocated.
+ *
+ * Returns NULL when the old buffer could not be freed or the new one could
+ * not be allocated (device_free / device_malloc set the Python error in that
+ * case). NULL is also returned, without any error, for sz == 0 when no
+ * buffer has been allocated yet.
+ */
+void *get_work_mem(size_t sz) {
+    /* Use <= rather than <: a request of exactly the current size must reuse
+     * the buffer instead of freeing and reallocating it on every call. */
+    if (sz <= work_size)
+        return work_mem;
+    if (work_mem != NULL && device_free(work_mem) != 0) {
+        /* The free failed and the error is already set: forget the pointer
+         * rather than hand it out or free it a second time. */
+        work_mem = NULL;
+        work_size = 0;
+        return NULL;
+    }
+    work_mem = device_malloc(sz);
+    /* On allocation failure record an empty buffer so the next call retries. */
+    work_size = (work_mem == NULL) ? 0 : sz;
+    return work_mem;
+}
+
 /////////////////////////
 // Static helper methods
 /////////////////////////

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -88,7 +88,8 @@ typedef float real;
 extern DllExport cublasHandle_t handle;

 /**
- * Allocation and freeing of device memory should go through these functions so that the lib can track memory usage.
+ * Allocation and freeing of device memory should go through these functions so
+ * that the lib can track memory usage.
 *
 * device_malloc will set the Python error message before returning NULL.
 * device_free will return nonzero on failure (after setting the python error message)
@@ -98,6 +99,7 @@ extern DllExport cublasHandle_t handle;
 DllExport void * device_malloc(size_t size);
 DllExport void * device_malloc(size_t size, int verbose);
 DllExport int device_free(void * ptr);
+DllExport void *get_work_mem(size_t sz);

 template <typename T>
 static T ceil_intdiv(T a, T b)

--- a/theano/sandbox/cuda/cudnn_helper.h
+++ b/theano/sandbox/cuda/cudnn_helper.h
@@ -73,6 +73,20 @@ cudnnGetConvolutionForwardAlgorithm(
  return CUDNN_STATUS_SUCCESS;
 }

+/*
+ * Stub for cudnnGetConvolutionForwardWorkspaceSize: every descriptor and
+ * algorithm argument is ignored, *sizeInBytes is always set to 0 and
+ * CUDNN_STATUS_SUCCESS is always returned.
+ * NOTE(review): presumably a fallback for cuDNN releases that do not provide
+ * this function -- confirm against the enclosing preprocessor guard.
+ */
+static inline cudnnStatus_t
+cudnnGetConvolutionForwardWorkspaceSize(
+ cudnnHandle_t handle,
+ const cudnnTensorDescriptor_t srcDesc,
+ const cudnnFilterDescriptor_t filterDesc,
+ const cudnnConvolutionDescriptor_t convDesc,
+ const cudnnTensor4dDescriptor_t destDesc,
+ cudnnConvolutionFwdAlgo_t algo,
+ size_t *sizeInBytes) {
+  *sizeInBytes = 0;
+  return CUDNN_STATUS_SUCCESS;
+}
+
+
 static inline cudnnStatus_t
 cudnnConvolutionForward_v2(
  cudnnHandle_t handle,

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
 import os

 import theano
-from theano import Apply, gof, tensor
+from theano import Apply, gof, tensor, config
 from theano.scalar import as_scalar
 from theano.gradient import DisconnectedType
 from theano.gof import Optimizer, local_optimizer, COp
 from theano.gof.type import CDataType, Generic
 from theano.compat import PY3
 from theano.compile.ops import shape_i
+from theano.configparser import AddConfigVar, EnumStr
 from theano.tensor.nnet import SoftmaxGrad
 from theano.tensor.basic import ShapeError
 from theano.sandbox.cuda.type import CudaNdarrayType
@@ -133,20 +134,6 @@ class DnnBase(GpuOp, COp):
    def c_libraries(self):
        return ['cudnn']

-    def c_init_code(self):
-        if PY3:
-            error_out = "NULL"
-        else:
-            error_out = ""
-        return ["""{
-cudnnStatus_t err;
-if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %%s",
-               cudnnGetErrorString(err));
-  return %s;
-}
-}""" % (error_out,)]
-

 class DnnVersion(GpuOp):
    def c_compiler(self):
@@ -342,6 +329,11 @@ class GpuDnnConvDesc(GpuOp):
        return (2, version())


+# Config option that GpuDnnConv falls back on when its `workmem` argument
+# is left as None.
+AddConfigVar('dnn.conv.workmem',
+             "Default value for the workmem attribute of cudnn convolutions.",
+             EnumStr('small', 'none', 'large'),
+             in_c_key=False)
+
 class GpuDnnConv(DnnBase, COp):
    """
    The forward convolution.
@@ -349,13 +341,36 @@ class GpuDnnConv(DnnBase, COp):
    :param image:
    :param kernel:
    :param descr: the convolution descriptor
-
    """
-    __props__ = ()
+    __props__ = ('workmem',)

-    def __init__(self):
+    def __init__(self, workmem=None):
+        """
+        :param workmem: either 'none', 'small' or 'large'.  Default is
+        the value of :attr:`config.dnn.conv.workmem`.
+        """
        COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
                     "APPLY_SPECIFIC(conv_fwd)")
+        if workmem is None:
+            workmem = config.dnn.conv.workmem
+        self.workmem = workmem
+        assert self.workmem in ['none', 'small', 'large']
+
+    def __setstate__(self, d):
+        """
+        Restore the instance attributes from the state dictionary `d`.
+        """
+        self.__dict__.update(d)
+        # A state without `workmem` falls back to 'small'.
+        # NOTE(review): presumably this covers objects pickled before the
+        # attribute was introduced -- confirm.
+        if not hasattr(self, 'workmem'):
+            self.workmem = 'small'
+
+    def get_op_params(self):
+        if version() == -1:
+            return [('CONV_ALGO', "0")]
+        if self.workmem == 'none':
+            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
+        elif self.workmem == 'small':
+            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
+        elif self.workmem == 'large':
+            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
+        return [('CONV_ALGO', alg)]

    def make_node(self, img, kern, desc):
        img = as_cuda_ndarray_variable(img)
@@ -575,6 +590,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
    :warning: The cuDNN library only works with GPUs that have a compute
      capability of 3.0 or higher.  This means that older GPUs will not
      work with this Op.
+    :note: The working memory of the op is influenced by
+      :attr:`config.dnn.conv.workmem`.
    """
    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    if (border_mode == 'valid' and subsample == (1,1) and

--- a/theano/sandbox/cuda/dnn_fwd.c
+++ b/theano/sandbox/cuda/dnn_fwd.c
@@ -33,24 +33,27 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
    return 1;

  {
-    cudnnConvolutionFwdAlgo_t algo;
-    err = cudnnGetConvolutionForwardAlgorithm(
-      _handle,
-      APPLY_SPECIFIC(input),
-      APPLY_SPECIFIC(kerns),
-      desc,
-      APPLY_SPECIFIC(output),
-      CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, // TODO: add op param
-      0,
-      &algo);
+    size_t worksize;
+    void *workspace;

+    err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
+                                                  APPLY_SPECIFIC(input),
+                                                  APPLY_SPECIFIC(kerns),
+                                                  desc,
+                                                  APPLY_SPECIFIC(output),
+                                                  CONV_ALGO,
+                                                  &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
-		   "GpuDnnConv: Couldn't select convolution algorithm: %s",
-		   cudnnGetErrorString(err));
+                   "GpuDnnConv: error getting worksize: %s",
+                   cudnnGetErrorString(err));
      return 1;
    }

+    workspace = get_work_mem(worksize);
+    if (workspace == NULL && worksize != 0)
+      return 1;
+
    const float alpha = 1;
    const float beta = 0;

@@ -60,8 +63,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
      desc,
-      algo,
-      NULL, 0,
+      CONV_ALGO,
+      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
  }