Merge pull request #3618 from JesseLivezey/cormm_opt

CorrMM optimizations

Merge pull request #3618 from JesseLivezey/cormm_opt
dfb27303 · Pascal Lamblin · 3180ec4d · 850e8902 · dfb27303 · dfb27303
--- a/doc/library/tensor/nnet/conv.txt
+++ b/doc/library/tensor/nnet/conv.txt
@@ -124,9 +124,9 @@ TODO: Give examples on how to use these things! They are pretty complicated.
      This is a CPU-only 2d correlation implementation taken from
      `caffe <https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cpp>`_
      and also used by Torch. It does not flip the kernel. As it provides a gradient,
-      you can use it as a replacement for nnet.conv2d. There is currently no
-      optimization to move this to GPU. This will be added when the new convolution
-      interface is finished.
+      you can use it as a replacement for nnet.conv2d. For convolutions done on
+      CPU, nnet.conv2d will be replaced by CorrMM. To explicitly disable this
+      replacement, set ``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment.
    - :func:`dnn_conv <theano.sandbox.cuda.dnn.dnn_conv>` GPU-only
      convolution using NVIDIA's cuDNN library. This requires that you have
      cuDNN installed and available, which in turn requires CUDA 6.5 and a GPU

--- a/theano/sandbox/cuda/tests/test_abstractconv.py
+++ b/theano/sandbox/cuda/tests/test_abstractconv.py
@@ -212,7 +212,7 @@ class TestConv2d(unittest.TestCase):
                               provide_shape=provide_shape, border_mode=b,
                               filter_flip=flip)

-    def test_cormm_conv(self):
+    def test_gpucormm_conv(self):
        if not dnn_available():
            raise SkipTest(cuda.dnn.dnn_available.msg)

@@ -240,11 +240,39 @@ class TestConv2d(unittest.TestCase):
                               provide_shape=provide_shape, border_mode=b,
                               filter_flip=flip)

-    def test_cpu_conv(self):
+    def test_cormm_conv(self):
        if not dnn_available():
            raise SkipTest(cuda.dnn.dnn_available.msg)

        mode = mode_without_gpu
+        for (i, f), s, b, flip, provide_shape in itertools.product(
+                zip(self.inputs_shapes, self.filters_shapes),
+                self.subsamples,
+                self.border_modes,
+                self.filter_flip,
+                [False, True]):
+
+            o = self.get_output_shape(i, f, s, b)
+            self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
+                         verify_grad=True, mode=mode, device='cpu',
+                         provide_shape=provide_shape, border_mode=b,
+                         filter_flip=flip)
+            self.run_gradweight(inputs_shape=i, filters_shape=f,
+                                output_shape=o, subsample=s,
+                                verify_grad=True, mode=mode, device='cpu',
+                                provide_shape=provide_shape, border_mode=b,
+                                filter_flip=flip)
+            self.run_gradinput(inputs_shape=i, filters_shape=f,
+                               output_shape=o, subsample=s,
+                               verify_grad=True, mode=mode, device='cpu',
+                               provide_shape=provide_shape, border_mode=b,
+                               filter_flip=flip)
+
+    def test_cpu_conv(self):
+        if not dnn_available():
+            raise SkipTest(cuda.dnn.dnn_available.msg)
+
+        mode = mode_without_gpu.excluding('conv_gemm')
        for (i, f), s, b, flip, provide_shape in itertools.product(
                zip(self.inputs_shapes, self.filters_shapes),
                self.subsamples,

--- a/theano/tensor/nnet/abstract_conv2d.py
+++ b/theano/tensor/nnet/abstract_conv2d.py
@@ -4,16 +4,9 @@ Define abstract conv2d interface
 import logging

 import theano
-from theano.tensor import (as_tensor_variable, patternbroadcast)
-from theano.tensor import TensorType
+from theano.tensor import as_tensor_variable
 from theano.gof import Apply, Op
-from theano.gof import local_optimizer
-from theano.tensor.opt import register_specialize_device

-# Cpu implementation
-from theano.tensor.nnet import conv2d as cpu_conv2d, ConvOp
-from theano.tensor.nnet.ConvGrad3D import convGrad3D
-from theano.tensor.nnet.ConvTransp3D import convTransp3D

 __docformat__ = "restructuredtext en"

@@ -326,218 +319,3 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):

    def connection_pattern(self, node):
        return [[1], [1], [0]]  # no connection to height, width
-
-
-# Cpu Optmization
-@local_optimizer([AbstractConv2d])
-def local_conv2d_cpu(node):
-
-    if not isinstance(node.op, AbstractConv2d):
-        return None
-
-    img, kern = node.inputs
-    if ((not isinstance(img.type, TensorType) or
-         not isinstance(kern.type, TensorType))):
-        return None
-    if node.op.border_mode not in ['full', 'valid']:
-        return None
-    if not node.op.filter_flip:
-        # Not tested yet
-        return None
-
-    rval = cpu_conv2d(img, kern,
-                      node.op.imshp, node.op.kshp,
-                      border_mode=node.op.border_mode,
-                      subsample=node.op.subsample)
-    return [rval]
-register_specialize_device(local_conv2d_cpu, 'fast_compile')
-
-
-@local_optimizer([AbstractConv2d_gradWeights])
-def local_conv2d_gradweight_cpu(node):
-
-    img, topgrad, shape = node.inputs
-
-    if ((not isinstance(img.type, TensorType) or
-         not isinstance(topgrad.type, TensorType))):
-        return None
-    if node.op.border_mode not in ['full', 'valid']:
-        return None
-    if not node.op.filter_flip:
-        # Not tested yet
-        return
-
-    if node.op.border_mode == 'valid' and \
-            (node.op.subsample != (1, 1)):
-        # Use the gradient as defined in conv3D, because the implementation
-        # by Conv is slow (about 3x slower than conv3D, and probably 10x
-        # slower than it could be), nad incorrect when subsample > 2.
-        # build a "node", that should be equivalent to the one given by
-        # self.make_node, but using convGrad3D instead.
-        shuffled_img = img.dimshuffle(0, 2, 3, 'x', 1)
-        shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
-        rval = convGrad3D(V=shuffled_img,
-                          d=(node.op.subsample[0], node.op.subsample[1], 1),
-                          WShape=(shuffled_topgrad.shape[4],
-                                  shape[0], shape[1], 1,
-                                  shuffled_img.shape[4]),
-                          dCdH=shuffled_topgrad)
-
-        rval = theano.tensor.addbroadcast(rval, 3)
-        rval = rval.dimshuffle(0, 4, 1, 2)
-        rval = rval[:, :, ::-1, ::-1]
-        rval = patternbroadcast(rval, node.outputs[0].broadcastable)
-        return [rval]
-
-    dx, dy = node.op.subsample
-    if dx not in (1, 2) or dy not in (1, 2):
-        # Not implemented in the gradient of ConvOp
-        return None
-
-    if node.op.imshp is None:
-        op_imshp = (None, None, None, None)
-    else:
-        op_imshp = node.op.imshp
-
-    if node.op.kshp is None:
-        op_kshp = (None, None, None, None)
-    else:
-        op_kshp = node.op.kshp
-
-    if None in op_imshp or None in op_kshp:
-        if (dx, dy) != (1, 1):
-            # We cannot infer the shapes
-            return None
-
-    # Determine gradient on kernels
-    assert len(op_imshp) == 4 and len(op_kshp) == 4
-
-    outshp = ConvOp.getOutputShape(op_imshp[2:],
-                                   op_kshp[2:], node.op.subsample,
-                                   node.op.border_mode)
-    fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
-                                       op_kshp[2:], (1, 1),
-                                       node.op.border_mode)
-
-    newimg = img.dimshuffle((1, 0, 2, 3))
-    newtopgrad = topgrad.dimshuffle((1, 0, 2, 3))
-
-    if node.op.border_mode == 'valid':
-        (img, filters) = (newimg, newtopgrad)
-        kshp_logical = fulloutshp
-        kshp_logical_top_aligned = False
-        imshp_logical = None
-        (bsize, nkern) = (op_imshp[1], op_kshp[0])
-        imshp = (op_imshp[0], op_imshp[2], op_imshp[3])
-        kshp = outshp
-    elif node.op.border_mode == 'full':
-        (img, filters) = (newtopgrad, newimg)
-        kshp_logical = None
-        kshp_logical_top_aligned = True
-        imshp_logical = (op_imshp[0],
-                         fulloutshp[0],
-                         fulloutshp[1])
-        (bsize, nkern) = (op_kshp[0], op_imshp[1])
-        imshp = (op_imshp[0], outshp[0], outshp[1])
-        kshp = op_imshp[2:]
-    else:
-        raise NotImplementedError(
-            'Only [full,valid] modes are currently supported.')
-
-    # Flip the kernels
-    filters = filters[:, :, ::-1, ::-1]
-
-    dw = ConvOp(imshp, kshp, nkern, bsize, 1, 1, output_mode='valid',
-                unroll_batch=None, unroll_kern=None, unroll_patch=None,
-                imshp_logical=imshp_logical,
-                kshp_logical=kshp_logical,
-                kshp_logical_top_aligned=kshp_logical_top_aligned,
-                direction_hint='bprop weights')
-    res = dw(img, filters)
-    if node.op.border_mode == 'valid':
-        res = res.dimshuffle((1, 0, 2, 3))
-        res = res[:, :, ::-1, ::-1]
-
-    res = patternbroadcast(res, node.outputs[0].broadcastable)
-    return [res]
-register_specialize_device(local_conv2d_gradweight_cpu, 'fast_compile')
-
-
-@local_optimizer([AbstractConv2d_gradInputs])
-def local_conv2d_gradinputs_cpu(node):
-    kern, topgrad, shape = node.inputs
-
-    if ((not isinstance(kern.type, TensorType) or
-         not isinstance(topgrad.type, TensorType))):
-        return None
-    if node.op.border_mode not in ['full', 'valid']:
-        return None
-    if not node.op.filter_flip:
-        # Not tested yet
-        return None
-
-    # Conv 3d implementation, needed when subsample > 2
-    if node.op.border_mode == 'valid' and node.op.subsample != (1, 1):
-        kern = kern[:, :, ::-1, ::-1]
-        shuffled_kern = kern.dimshuffle(0, 2, 3, 'x', 1)
-        shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
-        b = theano.tensor.zeros_like(shuffled_kern[0, 0, 0, 0, :])
-        rval = convTransp3D(W=shuffled_kern, b=b,
-                            d=(node.op.subsample[0], node.op.subsample[1], 1),
-                            H=shuffled_topgrad,
-                            RShape=(shape[0], shape[1], 1))
-        rval = theano.tensor.addbroadcast(rval, 3)
-        rval = rval.dimshuffle(0, 4, 1, 2)
-        rval = patternbroadcast(rval, node.outputs[0].broadcastable)
-        return [rval]
-
-    # Conv2d Implementation
-    dx, dy = node.op.subsample
-    if dx not in (1, 2) or dy not in (1, 2):
-        # Not implemented in the gradient of ConvOp
-        return None
-
-    if node.op.imshp is None:
-        op_imshp = (None, None, None, None)
-    else:
-        op_imshp = node.op.imshp
-
-    if node.op.kshp is None:
-        op_kshp = (None, None, None, None)
-    else:
-        op_kshp = node.op.kshp
-
-    if None in op_imshp or None in op_kshp:
-        if (dx, dy) != (1, 1):
-            return None
-
-    mode = 'valid'
-    if not node.op.border_mode == 'full':
-        mode = 'full'
-    filters = kern.dimshuffle((1, 0, 2, 3))
-    filters = filters[:, :, ::-1, ::-1]
-
-    outshp = ConvOp.getOutputShape(op_imshp[2:],
-                                   op_kshp[2:], node.op.subsample,
-                                   node.op.border_mode)
-    fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
-                                       op_kshp[2:], (1, 1),
-                                       node.op.border_mode)
-    nkern = op_imshp[1]
-    imshp = (op_kshp[0], outshp[0], outshp[1])
-    imshp_logical = (op_kshp[0], fulloutshp[0], fulloutshp[1])
-    din = ConvOp(imshp,
-                 op_kshp[2:],
-                 nkern,
-                 op_imshp[0],
-                 1, 1, output_mode=mode,
-                 unroll_batch=None, unroll_kern=None,
-                 unroll_patch=None,
-                 imshp_logical=imshp_logical,
-                 kshp_logical=None,
-                 version=-1,
-                 direction_hint='bprop inputs')
-    din = din(topgrad, filters)
-    din = patternbroadcast(din, node.outputs[0].broadcastable)
-    return [din]
-register_specialize_device(local_conv2d_gradinputs_cpu, 'fast_compile')
--- a/theano/tensor/nnet/opt.py
+++ b/theano/tensor/nnet/opt.py