提交 cdea94b3 authored 作者: Frederic's avatar Frederic

Change the default GPU 2d convolution.

上级 35e2aa0e
...@@ -22,6 +22,29 @@ ...@@ -22,6 +22,29 @@
.. moduleauthor:: LISA .. moduleauthor:: LISA
.. note::
    As of October 20, 2014, the default GPU image convolution
    changed. Now, `cuDNN <https://developer.nvidia.com/cuDNN>`_ is used if
    it is available and the selected GPU is supported by it. This gives
    faster GPU convolution without using more memory than the legacy
    convolution.

    - If cuDNN can be used, use it.
    - If not, use the gemm version (slower than cuDNN, uses more memory).
    - If the user does not want the extra memory overhead of the gemm
      version, they can enable the legacy code, which is even slower but
      does not use extra memory.
    - There is also the fft version, which is the fastest in some cases,
      but uses even more memory. It does not support striding to skip
      computation and has some shape restrictions.
    - There is also the cuda_convnet convolution in Pylearn2. It uses a
      different memory layout and has shape restrictions, but does not use
      extra memory and is faster than the legacy convolution.
TODO: Give examples on how to use these things! They are pretty complicated. TODO: Give examples on how to use these things! They are pretty complicated.
- Convolution operators implemented: - Convolution operators implemented:
......
...@@ -1109,9 +1109,36 @@ def local_gpu_softmax_with_bias(node): ...@@ -1109,9 +1109,36 @@ def local_gpu_softmax_with_bias(node):
from theano.tensor.nnet import conv from theano.tensor.nnet import conv
# Must be registered before local_gpu_conv_legacy; otherwise the legacy
# optimization would take priority. Preference order: if cuDNN is
# available and the selected GPU supports it, use cuDNN; otherwise the
# gemm version should be used. Users who want the legacy convolution
# should use the Theano flag:
#     optimizer_excluding=local_conv_gemm
# and, when cuDNN is available, additionally:
#     optimizer_excluding=local_gpu_conv
@register_opt("dnn")
@local_optimizer([gpu_from_host, conv.ConvOp])
def local_gpu_conv(node):
    """
    If cudnn is available, use it. Otherwise, use the gemm version.
    """
    if not theano.sandbox.cuda.dnn.dnn_available():
        # If dnn isn't available, local_gpu_conv_legacy will introduce
        # the legacy op, and local_conv_gemm will then convert it to the
        # gemm op.
        return
    replaced = local_gpu_conv_legacy.transform(node)
    if not replaced:
        return
    gpu_conv_node = replaced[0].owner.inputs[0].owner
    assert isinstance(gpu_conv_node.op, GpuConv)
    dnn_repl = theano.sandbox.cuda.dnn.local_conv_dnn.transform(gpu_conv_node)
    if dnn_repl:
        return [host_from_gpu(dnn_repl[0])]
@register_opt()
@local_optimizer([gpu_from_host, conv.ConvOp])
def local_gpu_conv_legacy(node):
""" """
gpu_from_host(conv) -> gpu_conv(gpu_from_host) gpu_from_host(conv) -> gpu_conv(gpu_from_host)
...@@ -1438,6 +1465,7 @@ def local_gpu_downsample_factor_max_grad(node): ...@@ -1438,6 +1465,7 @@ def local_gpu_downsample_factor_max_grad(node):
gpu_from_host(gz)))] gpu_from_host(gz)))]
@register_opt()
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_gemm(node): def local_conv_gemm(node):
if (isinstance(node.op, GpuConv) and if (isinstance(node.op, GpuConv) and
...@@ -1493,7 +1521,6 @@ def local_conv_gemm(node): ...@@ -1493,7 +1521,6 @@ def local_conv_gemm(node):
return [GpuCorrMM_gradInputs('valid', subsample, pad)( return [GpuCorrMM_gradInputs('valid', subsample, pad)(
gpu_contiguous(kern), gpu_contiguous(img))] gpu_contiguous(kern), gpu_contiguous(img))]
gpu_optimizer.register("conv_gemm", local_conv_gemm)
from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
......
...@@ -586,6 +586,31 @@ def test_dnn_valid(): ...@@ -586,6 +586,31 @@ def test_dnn_valid():
yield t yield t
def test_default_conv():
    """Check that conv2d is lowered to the expected GPU convolution version.

    With cuDNN available the compiled graph must contain a GpuDnnConv;
    otherwise the gemm-based GpuCorrMM must be used. When both the dnn
    and gemm optimizations are excluded, the legacy GpuConv must remain.
    """
    img = theano.tensor.ftensor4()
    fil = theano.tensor.ftensor4()

    c = theano.tensor.nnet.conv2d(img, fil)
    f = theano.function([img, fil], c, mode=theano_mode)
    if cuda.dnn.dnn_available():
        # cuDNN path: the dnn optimization should have replaced the conv.
        assert any(isinstance(a.op, GpuDnnConv)
                   for a in f.maker.fgraph.apply_nodes)
    else:
        # No cuDNN: the gemm optimization should have been applied.
        assert any(isinstance(a.op, cuda.blas.GpuCorrMM)
                   for a in f.maker.fgraph.apply_nodes)

    # Excluding both the dnn and gemm optimizations must fall back to
    # the legacy GpuConv op.
    mode = theano_mode.excluding('local_gpu_conv', 'local_conv_gemm')
    f = theano.function([img, fil], c, mode=mode)
    assert any(isinstance(a.op, cuda.blas.GpuConv)
               for a in f.maker.fgraph.apply_nodes)
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]): def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
seed_rng() seed_rng()
shapes = get_basic_shapes() shapes = get_basic_shapes()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论