Merge pull request #2192 from nouiz/mixed

Mixed

Merge pull request #2192 from nouiz/mixed
ee47526d · Pascal Lamblin · 795ded70 · fbe23a89 · ee47526d · 795ded70
--- a/doc/library/tensor/nnet/conv.txt
+++ b/doc/library/tensor/nnet/conv.txt
@@ -22,6 +22,33 @@
 .. moduleauthor:: LISA


+.. note::
+
+    As of October 21st, 2014, the default GPU image convolution
+    changed. Here is how the implementation is now selected:
+
+    - If we can use `cuDNN <https://developer.nvidia.com/cuDNN>`_, use it.
+    - If not, use the gemm version (slower than cuDNN, uses more memory).
+
+    If the users do not want the extra memory usage of the gemm
+    version, they can enable the legacy code that is even slower, but
+    does not use extra memory. For this, use the Theano flag
+    ``optimizer_excluding=conv_gemm``.
+
+    There is no reason to use the legacy code or the gemm version if
+    cuDNN is available.
+
+    There are two other options:
+
+    - There is also the fft version that is the fastest in some cases,
+      but uses even more memory. It does not support striding to remove
+      computation and has some shape restrictions.
+
+    - There is also the cuda_convnet convolution in Pylearn2. It uses a
+      different memory layout and has shape restrictions, but does not use
+      extra memory and is faster than the legacy convolution.
+
+
 TODO: Give examples on how to use these things! They are pretty complicated.

 - Convolution operators implemented:

--- a/doc/proposals/advidx.txt
+++ b/doc/proposals/advidx.txt
-==================
-Advanced Indexing
-==================
-
-Continue the Advanced Indexing project that is on either github or bitbucket.
-
--- a/doc/tutorial/multi_cores.txt
+++ b/doc/tutorial/multi_cores.txt
+.. _tut_multi_cores:
+
 =============================
 Multi cores support in Theano
 =============================

--- a/theano/gof/graph.py
+++ b/theano/gof/graph.py
@@ -135,9 +135,14 @@ class Apply(Node):
            if len(self.outputs) == 1:
                return self.outputs[0]
            else:
-                raise AttributeError("%s.default_output should be an output index." % self.op)
+                raise AttributeError(
+                    "%s.default_output should be an output index." % self.op)
+        elif not isinstance(do, (int, long)):
+            raise AttributeError("%s.default_output should be an int or long" %
+                                 self.op)
        elif do < 0 or do >= len(self.outputs):
-            raise AttributeError("%s.default_output is out of range." % self.op)
+            raise AttributeError("%s.default_output is out of range." %
+                                 self.op)
        return self.outputs[do]

    def env_getter(self):

--- a/theano/gof/link.py
+++ b/theano/gof/link.py
@@ -616,6 +616,7 @@ class PerformLinker(LocalLinker):

        f.allow_gc = self.allow_gc #HACK: this is a way of passing an arg to Function.__call__
        add_clear_storage(f, computed, storage_map)
+        f.storage_map = storage_map

        return f, [Container(input, storage) for input, storage in zip(fgraph.inputs, input_storage)], \
            [Container(output, storage, True) for output, storage in zip(fgraph.outputs, output_storage)], \

--- a/theano/misc/check_blas.py
+++ b/theano/misc/check_blas.py
@@ -201,41 +201,43 @@ if __name__ == "__main__":

        Test time in float32

-        cuda version      6.0    5.5    5.0    4.2    4.1    4.0    3.2    3.0   # note
+        cuda version      6.5    6.0    5.5    5.0    4.2    4.1    4.0    3.2    3.0   # note
        gpu
-        K6000/NOECC              0.06s
-        K40                      0.07s
-        K20m/ECC                        0.07s
-        K20/NOECC                       0.07s
-        M2090                    0.19s
-        C2075                                  0.25s
-        M2075                           0.25s
-        M2070                           0.25s         0.27s         0.32s
-        M2070-Q                         0.48s         0.27s         0.32s
-        M2050(Amazon)                   0.25s
-        C1060                                                       0.46s
-        K600                     1.04s
-
-        GTX Titan Black          0.05s
-        GTX Titan(D15U-50)       0.06s  0.06s  don't work
-        GTX 780                  0.06s
-        GTX 680                  0.11s  0.12s  0.154s               0.218s
-        GTX 580                  0.16s  0.16s  0.164s               0.203s
-        GTX 480                  0.19s  0.19s  0.192s               0.237s 0.27s
-        GTX 470                  0.23s  0.23s  0.238s               0.297s 0.34s
-        GTX 660                  0.18s  0.20s  0.23s
-        GTX 560                                0.30s
-        GTX 650 Ti                      0.27s
-        GTX 765M          0.27s
-        GTX 460                         0.37s                0.45s
-        GTX 285                  0.42s         0.452s        0.452s        0.40s # cuda 3.0 seems faster? driver version?
-        750M                            0.49s
-        GTX 550 Ti                                           0.57s
-        GT 520                                 2.68s                3.06s
-        520M                            2.44s                       3.19s        # with bumblebee on Ubuntu 12.04
-        GT 220                                                      3.80s
-        GT 210                                               6.35s
-        8500 GT                                                            10.68s
+        K6000/NOECC                     0.06s
+        K40                             0.07s
+        K20m/ECC                               0.07s
+        K20/NOECC                              0.07s
+        M2090                           0.19s
+        C2075                                         0.25s
+        M2075                                  0.25s
+        M2070                                  0.25s         0.27s         0.32s
+        M2070-Q                                0.48s         0.27s         0.32s
+        M2050(Amazon)                          0.25s
+        C1060                                                              0.46s
+        K600                            1.04s
+
+        GTX Titan Black                 0.05s
+        GTX Titan(D15U-50)              0.06s  0.06s  don't work
+        GTX 780                         0.06s
+        GTX 970           0.08s
+        GTX 680                         0.11s  0.12s  0.154s               0.218s
+        GTX 580                         0.16s  0.16s  0.164s               0.203s
+        GTX 480                         0.19s  0.19s  0.192s               0.237s 0.27s
+        GTX 750 Ti        0.20s
+        GTX 470                         0.23s  0.23s  0.238s               0.297s 0.34s
+        GTX 660                         0.18s  0.20s  0.23s
+        GTX 560                                       0.30s
+        GTX 650 Ti                             0.27s
+        GTX 765M                 0.27s
+        GTX 460                                0.37s                0.45s
+        GTX 285                         0.42s         0.452s        0.452s        0.40s # cuda 3.0 seems faster? driver version?
+        750M                                   0.49s
+        GTX 550 Ti                                                  0.57s
+        GT 520                                        2.68s                3.06s
+        520M                                   2.44s                       3.19s        # with bumblebee on Ubuntu 12.04
+        GT 220                                                             3.80s
+        GT 210                                                      6.35s
+        8500 GT                                                                   10.68s
        """

    t, impl = execute(not options.print_only, not options.quiet,

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1109,9 +1109,33 @@ def local_gpu_softmax_with_bias(node):
 from theano.tensor.nnet import conv


-@register_opt()
+# Needs to be registered before local_gpu_conv_legacy. Otherwise, it
+# will have priority over this optimization.  We want, if cudnn is
+# available and the GPU supports it, to use it.  Otherwise, the gemm
+# version should be used.  If the users want the legacy convolution,
+# they should use the Theano flag to disable the dnn and/or gemm version.
+@register_opt("dnn")
 @local_optimizer([gpu_from_host, conv.ConvOp])
 def local_gpu_conv(node):
+    """
+    If cudnn is available, use it. Otherwise, use the gemm version.
+    """
+    if theano.sandbox.cuda.dnn.dnn_available():
+        repl = local_gpu_conv_legacy.transform(node)
+        if repl:
+            n = repl[0].owner.inputs[0].owner
+            assert isinstance(n.op, GpuConv)
+            ret = theano.sandbox.cuda.dnn.local_conv_dnn.transform(n)
+            if ret:
+                return [host_from_gpu(ret[0])]
+    # If dnn isn't available, local_gpu_conv_legacy will introduce the
+    # legacy opt. Then local_conv_gemm will convert it to the gemm
+    # opt.
+
+
+@register_opt()
+@local_optimizer([gpu_from_host, conv.ConvOp])
+def local_gpu_conv_legacy(node):
    """
    gpu_from_host(conv) -> gpu_conv(gpu_from_host)

@@ -1438,6 +1462,7 @@ def local_gpu_downsample_factor_max_grad(node):
                                              gpu_from_host(gz)))]


+@register_opt()
 @local_optimizer([GpuConv])
 def local_conv_gemm(node):
    if (isinstance(node.op, GpuConv) and
@@ -1493,7 +1518,6 @@ def local_conv_gemm(node):
            return [GpuCorrMM_gradInputs('valid', subsample, pad)(
                    gpu_contiguous(kern), gpu_contiguous(img))]

-gpu_optimizer.register("conv_gemm", local_conv_gemm)

 from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin


--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -586,6 +586,31 @@ def test_dnn_valid():
        yield t


+def test_default_conv():
+    """Just test that we introduce the right GPU convolution
+    version.
+
+    """
+    img = theano.tensor.ftensor4()
+    fil = theano.tensor.ftensor4()
+
+    c = theano.tensor.nnet.conv2d(img, fil)
+    f = theano.function([img, fil], c, mode=theano_mode)
+
+    if cuda.dnn.dnn_available():
+        assert any([isinstance(a.op, GpuDnnConv)
+                    for a in f.maker.fgraph.apply_nodes])
+    else:
+        assert any([isinstance(a.op, cuda.blas.GpuCorrMM)
+                    for a in f.maker.fgraph.apply_nodes])
+
+    mode = theano_mode.excluding('local_gpu_conv', 'local_conv_gemm')
+    f = theano.function([img, fil], c, mode=mode)
+
+    assert any([isinstance(a.op, cuda.blas.GpuConv)
+                for a in f.maker.fgraph.apply_nodes])
+
+
 def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
    seed_rng()
    shapes = get_basic_shapes()

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -78,13 +78,17 @@ def safe_to_cpu(x):
        return x


-def op_lifter(OP):
+def op_lifter(OP, cuda_only=False):
    """
    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)
    """
    def f(maker):
        def local_opt(node):
+            dev = theano.sandbox.gpuarray.init_dev.device
+            if cuda_only and not dev.startswith('cuda'):
+                return
+
            if type(node.op) in OP:

                # Either one of our inputs is on the gpu or
@@ -484,25 +488,25 @@ def local_gpua_eye(node):


 @register_opt('fast_compile')
-@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
+@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
 def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
    return GpuCrossentropySoftmaxArgmax1HotWithBias()


 @register_opt('fast_compile')
-@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
+@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
 def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
    return GpuCrossentropySoftmax1HotWithBiasDx()


 @register_opt('fast_compile')
-@op_lifter([tensor.nnet.Softmax])
+@op_lifter([tensor.nnet.Softmax], cuda_only=True)
 def local_gpua_softmax(node):
    return GpuSoftmax()


 @register_opt('fast_compile')
-@op_lifter([tensor.nnet.SoftmaxWithBias])
+@op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
 def local_gpua_softmaxwithbias(node):
    return GpuSoftmaxWithBias()


--- a/theano/tensor/tests/test_subtensor.py
+++ b/theano/tensor/tests/test_subtensor.py
@@ -1281,7 +1281,7 @@ class TestAdvancedSubtensor(unittest.TestCase):
        z = b[[i, j], :]
        f1 = theano.function([i, j], z)
        cmd = f1(0, 1) == a[[0, 1], :]
-        self.assertTrue(numpy.all(cmp))
+        self.assertTrue(cmd.all())

        aa = rng.uniform(size=(4, 2, 3))
        bb = theano.shared(aa)
@@ -1289,7 +1289,7 @@ class TestAdvancedSubtensor(unittest.TestCase):
        z = bb[[i, j, k], :, i:k]
        f2 = theano.function([i, j, k], z)
        cmd = f2(0, 1, 2) == aa[[0, 1, 2], :, 0:2]
-        self.assertTrue(numpy.all(cmp))
+        self.assertTrue(cmd.all())


 class TestInferShape(utt.InferShapeTester):