Merge pull request #2273 from f0k/conv-direction-hints

Make automatic insertions of dnn_conv() use the optimal cuDNN code path (2)

Merge pull request #2273 from f0k/conv-direction-hints
18fe0369 · Frédéric Bastien · 447ab32d · dc101dcd · 18fe0369 · 18fe0369
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -1513,6 +1513,7 @@ class GpuConv(GpuOp):
            logical_kern_hw=None,
            logical_kern_align_top=True,
            version=-1,
+            direction_hint=None,
            verbose=0,
            kshp=None,
            imshp=None,
@@ -1525,6 +1526,10 @@ class GpuConv(GpuOp):
                        convolution. By default we try to guess the best one.
                        You can force one version with this parameter. This
                        parameter is used by the tests.
+        :param direction_hint: 'forward', 'bprop weights' or 'bprop inputs'.
+                        Serves as a hint for graph optimizers replacing
+                        GpuConv by other implementations. If the GpuConv is
+                        inserted automatically, we take its value from ConvOp.
        :param verbose: for value of 1,2 and 3. Print more information during
                        the execution of the convolution. Mostly used for
                        optimization or debugging.
@@ -1570,6 +1575,7 @@ class GpuConv(GpuOp):
        self.logical_kern_hw = logical_kern_hw
        self.logical_kern_align_top = logical_kern_align_top
        self.version = version
+        self.direction_hint = direction_hint
        self.verbose = verbose
        self.kshp = kshp
        self.imshp = imshp
@@ -1597,6 +1603,8 @@ class GpuConv(GpuOp):
            self.imshp = None
        if not hasattr(self, "max_threads_dim0"):
            self.max_threads_dim0 = None
+        if not hasattr(self, "direction_hint"):
+            self.direction_hint = None
    def __hash__(self):
        # don't use hash(self.version) as hash(-1)==-2 and

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -469,7 +469,7 @@ class GpuDnnConvGradI(GpuDnnConvBase):
 def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
-             conv_mode='conv'):
+             conv_mode='conv', direction_hint=None):
    """
    GPU convolution using cuDNN from NVIDIA.
@@ -481,13 +481,41 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
    :param border_mode: one of 'valid', 'full'; additionally, the padding size
        could be directly specified by an integer or a pair of integers
    :param subsample: perform subsampling of the output (default: (1, 1))
-    :param conv_mode: perform convolution (kernels flipped) or cross-correlation.  One of 'conv', 'cross'. (default: 'conv')
+    :param conv_mode: perform convolution (kernels flipped) or cross-correlation.
+        One of 'conv', 'cross'. (default: 'conv')
+    :param direction_hint: Used by graph optimizers to change algorithm choice.
+        By default, GpuDnnConv will be used to carry out the convolution.
+        If border_mode is 'valid', subsample is (1,1) and direction_hint is
+        'bprop weights', it will use GpuDnnConvGradW.
+        If border_mode is 'full', subsample is (1,1) and direction_hint is
+        *not* 'forward!' (note the '!'), it will use GpuDnnConvGradI.
+        This parameter is used internally by graph optimizers and may be
+        removed at any time without a deprecation period. You have been warned.
    :warning: The cuDNN library only works with GPU that have a compute
      capability of 3.0 or higer.  This means that older GPU will not
      work with this Op.
    """
-    if border_mode == 'full' and subsample == (1, 1):
+    if (border_mode == 'valid' and subsample == (1,1) and
+        direction_hint == 'bprop weights'):
+        # Special case: We are asked to use GpuDnnConvGradW. We need to set
+        # up a suitable 'fake' convolution to compute the gradient for.
+        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
+        if conv_mode == 'conv':
+            # We need to flip manually. These 'kerns' are not the kernels
+            # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
+            kerns = kerns[:, :, ::-1, ::-1]
+        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
+        shape = theano.tensor.stack(kerns.shape[1], img.shape[1],
+                                    img.shape[2] - kerns.shape[2] + 1,
+                                    img.shape[3] - kerns.shape[3] + 1)
+        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
+                              conv_mode='cross')(img.shape, shape)
+        conv = GpuDnnConvGradW()(img, kerns, desc)
+        return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3))
+    elif (border_mode == 'full' and subsample == (1, 1) and
+          direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGradI to compute
        # the full convolution as the backward pass of a valid convolution.
        # We just need to set up a suitable 'fake' valid convolution.
@@ -501,6 +529,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
                              conv_mode=conv_mode)(shape, kerns.shape)
        return GpuDnnConvGradI()(kerns, img, desc)
+    # Standard case: We use GpuDnnConv with suitable padding.
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
@@ -1134,8 +1163,11 @@ if cuda_available:
            img, kern = node.inputs
            border_mode = node.op.border_mode
            subsample = node.op.subsample
-            return [dnn_conv(gpu_contiguous(img), gpu_contiguous(kern),
+            direction_hint = node.op.direction_hint
-                             border_mode=border_mode, subsample=subsample)]
+            return [dnn_conv(img, kern,
+                             border_mode=border_mode, subsample=subsample,
+                             direction_hint=direction_hint)]
 # DISABLED as there is problems in the handling of borders
 #    @register_opt('cudnn')
    @local_optimizer([GpuDownsampleFactorMax])

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1181,6 +1181,7 @@ def local_gpu_conv(node):
                    logical_kern_align_top=op.kshp_logical_top_aligned,
                    kshp=op.kshp,
                    version=op.version,
+                    direction_hint=op.direction_hint,
                    verbose=op.verbose,
                    imshp=op.imshp,
                    nkern=op.nkern,

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -594,7 +594,7 @@ def test_gemm_valid():
 def test_dnn_valid():
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
-    for t in _test_valid(GpuDnnConv, mode=theano_mode.including("cudnn")):
+    for t in _test_valid(GpuDnnConvBase, mode=theano_mode.including("cudnn")):
        yield t
@@ -708,7 +708,7 @@ def test_gemm_full():
 def test_dnn_full():
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
-    for t in _test_full(GpuDnnConv, mode=theano_mode.including("cudnn")):
+    for t in _test_full(GpuDnnConvBase, mode=theano_mode.including("cudnn")):
        yield t
@@ -760,7 +760,7 @@ def test_gemm_subsample():
 def test_dnn_subsample():
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
-    for t in _test_subsample(GpuDnnConv, theano_mode.including('cudnn')):
+    for t in _test_subsample(GpuDnnConvBase, theano_mode.including('cudnn')):
        yield t

--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -280,6 +280,7 @@ class ConvOp(OpenMPOp):
                 kshp_logical_top_aligned=True,
                 verbose=0,
                 version=-1,
+                 direction_hint='forward',
                 openmp=None):
        """
        Initializes a ConvOp with given output_mode (full/valid). All other
@@ -348,6 +349,8 @@ class ConvOp(OpenMPOp):
        :type version: int or str
        :param version: passed to GpuConv, if version='no_fft', fft
            optimization will be desactivated at the op level.
+        :param direction_hint: 'forward', 'bprop weights' or 'bprop inputs'.
+            Passed to GpuConv; used by graph optimizers to aid algorithm choice.
        The 3 following parameters are used internally when we generate
        the gradient when dx!=1 or dy!=1.
@@ -423,6 +426,7 @@ class ConvOp(OpenMPOp):
        self.dy = dy
        self.verbose = verbose
        self.version = version
+        self.direction_hint = direction_hint
        # a triple
        if imshp_logical is None:
@@ -565,6 +569,7 @@ class ConvOp(OpenMPOp):
    def __setstate__(self, d):
        super(ConvOp, self).__setstate__(d)
+        self.direction_hint = d.get("direction_hint", None)
        self._rehash()
    def _rehash(self):
@@ -888,6 +893,7 @@ class ConvOp(OpenMPOp):
                        kshp_logical=kshp_logical,
                        kshp_logical_top_aligned=kshp_logical_top_aligned,
                        version=self.version,
+                        direction_hint='bprop weights',
                        verbose=self.verbose)
        else:  # let __init__ choose c params be chosen automatically from shapes
@@ -897,6 +903,7 @@ class ConvOp(OpenMPOp):
                        kshp_logical=kshp_logical,
                        kshp_logical_top_aligned=kshp_logical_top_aligned,
                        version=self.version,
+                        direction_hint='bprop weights',
                        verbose=self.verbose)
        dw = dw(img, filters)
@@ -929,6 +936,7 @@ class ConvOp(OpenMPOp):
                         imshp_logical=imshp_logical,
                         kshp_logical=None,
                         version=-1,  # we we change the mode, we don't forward the version.
+                         direction_hint='bprop inputs',
                         verbose=self.verbose)
        else:  # let __init__ figure out the unrolling / patch sizes
            din = ConvOp(imshp, self.kshp, nkern, self.bsize,
@@ -938,6 +946,7 @@ class ConvOp(OpenMPOp):
                         imshp_logical=imshp_logical,
                         kshp_logical=None,
                         version=-1,  # we we change the mode, we don't forward the version.
+                         direction_hint='bprop inputs',
                         verbose=self.verbose)
        din = din(gz, filters)