提交 ee47526d authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #2192 from nouiz/mixed

Mixed
...@@ -22,6 +22,33 @@ ...@@ -22,6 +22,33 @@
.. moduleauthor:: LISA .. moduleauthor:: LISA
.. note::
As of October 21st, 2014, the default GPU image convolution
changed. Here is the selection algorithm:
- If we can use `cuDNN <https://developer.nvidia.com/cuDNN>`_, use it.
- If not, use the gemm version (slower than cuDNN, uses more memory).
If the users do not want the extra memory usage of the gemm
version, they can enable the legacy code that is even slower, but
does not use extra memory. For this, use the Theano flag
``optimizer_excluding=conv_gemm``.
There is no reason to use the legacy code or the gemm version if
cuDNN is available.
2 other options:
- There is also the fft version that is the fastest in some cases,
but uses even more memory. It does not support striding to remove
computation and has some shape restrictions.
- There is also the cuda_convnet convolution in Pylearn2. It uses a
different memory layout, has shape restrictions, but does not use
extra memory and is faster than the legacy convolution.
TODO: Give examples on how to use these things! They are pretty complicated. TODO: Give examples on how to use these things! They are pretty complicated.
- Convolution operators implemented: - Convolution operators implemented:
......
==================
Advanced Indexing
==================
Continue the Advanced Indexing project that is on either GitHub or Bitbucket.
.. _tut_multi_cores:
============================= =============================
Multi cores support in Theano Multi cores support in Theano
============================= =============================
......
...@@ -135,9 +135,14 @@ class Apply(Node): ...@@ -135,9 +135,14 @@ class Apply(Node):
if len(self.outputs) == 1: if len(self.outputs) == 1:
return self.outputs[0] return self.outputs[0]
else: else:
raise AttributeError("%s.default_output should be an output index." % self.op) raise AttributeError(
"%s.default_output should be an output index." % self.op)
elif not isinstance(do, (int, long)):
raise AttributeError("%s.default_output should be an int or long" %
self.op)
elif do < 0 or do >= len(self.outputs): elif do < 0 or do >= len(self.outputs):
raise AttributeError("%s.default_output is out of range." % self.op) raise AttributeError("%s.default_output is out of range." %
self.op)
return self.outputs[do] return self.outputs[do]
def env_getter(self): def env_getter(self):
......
...@@ -616,6 +616,7 @@ class PerformLinker(LocalLinker): ...@@ -616,6 +616,7 @@ class PerformLinker(LocalLinker):
f.allow_gc = self.allow_gc #HACK: this is a way of passing an arg to Function.__call__ f.allow_gc = self.allow_gc #HACK: this is a way of passing an arg to Function.__call__
add_clear_storage(f, computed, storage_map) add_clear_storage(f, computed, storage_map)
f.storage_map = storage_map
return f, [Container(input, storage) for input, storage in zip(fgraph.inputs, input_storage)], \ return f, [Container(input, storage) for input, storage in zip(fgraph.inputs, input_storage)], \
[Container(output, storage, True) for output, storage in zip(fgraph.outputs, output_storage)], \ [Container(output, storage, True) for output, storage in zip(fgraph.outputs, output_storage)], \
......
...@@ -201,41 +201,43 @@ if __name__ == "__main__": ...@@ -201,41 +201,43 @@ if __name__ == "__main__":
Test time in float32 Test time in float32
cuda version 6.0 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note cuda version 6.5 6.0 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu gpu
K6000/NOECC 0.06s K6000/NOECC 0.06s
K40 0.07s K40 0.07s
K20m/ECC 0.07s K20m/ECC 0.07s
K20/NOECC 0.07s K20/NOECC 0.07s
M2090 0.19s M2090 0.19s
C2075 0.25s C2075 0.25s
M2075 0.25s M2075 0.25s
M2070 0.25s 0.27s 0.32s M2070 0.25s 0.27s 0.32s
M2070-Q 0.48s 0.27s 0.32s M2070-Q 0.48s 0.27s 0.32s
M2050(Amazon) 0.25s M2050(Amazon) 0.25s
C1060 0.46s C1060 0.46s
K600 1.04s K600 1.04s
GTX Titan Black 0.05s GTX Titan Black 0.05s
GTX Titan(D15U-50) 0.06s 0.06s don't work GTX Titan(D15U-50) 0.06s 0.06s don't work
GTX 780 0.06s GTX 780 0.06s
GTX 680 0.11s 0.12s 0.154s 0.218s GTX 970 0.08s
GTX 580 0.16s 0.16s 0.164s 0.203s GTX 680 0.11s 0.12s 0.154s 0.218s
GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s GTX 580 0.16s 0.16s 0.164s 0.203s
GTX 470 0.23s 0.23s 0.238s 0.297s 0.34s GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s
GTX 660 0.18s 0.20s 0.23s GTX 750 Ti 0.20s
GTX 560 0.30s GTX 470 0.23s 0.23s 0.238s 0.297s 0.34s
GTX 650 Ti 0.27s GTX 660 0.18s 0.20s 0.23s
GTX 765M 0.27s GTX 560 0.30s
GTX 460 0.37s 0.45s GTX 650 Ti 0.27s
GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version? GTX 765M 0.27s
750M 0.49s GTX 460 0.37s 0.45s
GTX 550 Ti 0.57s GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
GT 520 2.68s 3.06s 750M 0.49s
520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04 GTX 550 Ti 0.57s
GT 220 3.80s GT 520 2.68s 3.06s
GT 210 6.35s 520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
8500 GT 10.68s GT 220 3.80s
GT 210 6.35s
8500 GT 10.68s
""" """
t, impl = execute(not options.print_only, not options.quiet, t, impl = execute(not options.print_only, not options.quiet,
......
...@@ -1109,9 +1109,33 @@ def local_gpu_softmax_with_bias(node): ...@@ -1109,9 +1109,33 @@ def local_gpu_softmax_with_bias(node):
from theano.tensor.nnet import conv from theano.tensor.nnet import conv
@register_opt() # Needs to be registered before local_gpu_conv_legacy. Otherwise, it
# will have priority over this optimization. We want, if cudnn is
# available and the GPU supports it, to use it. Otherwise, the gemm
# version should be used. If the users want the legacy convolution,
# they should use the Theano flag to disable the dnn and/or gemm version.
@register_opt("dnn")
@local_optimizer([gpu_from_host, conv.ConvOp]) @local_optimizer([gpu_from_host, conv.ConvOp])
def local_gpu_conv(node): def local_gpu_conv(node):
"""
If cudnn is available, use it. Otherwise, use the gemm version.
"""
if theano.sandbox.cuda.dnn.dnn_available():
repl = local_gpu_conv_legacy.transform(node)
if repl:
n = repl[0].owner.inputs[0].owner
assert isinstance(n.op, GpuConv)
ret = theano.sandbox.cuda.dnn.local_conv_dnn.transform(n)
if ret:
return [host_from_gpu(ret[0])]
# If dnn isn't available, local_gpu_conv_legacy will introduce the
# legacy opt. Then local_conv_gemm will convert it to the gemm
# opt.
@register_opt()
@local_optimizer([gpu_from_host, conv.ConvOp])
def local_gpu_conv_legacy(node):
""" """
gpu_from_host(conv) -> gpu_conv(gpu_from_host) gpu_from_host(conv) -> gpu_conv(gpu_from_host)
...@@ -1438,6 +1462,7 @@ def local_gpu_downsample_factor_max_grad(node): ...@@ -1438,6 +1462,7 @@ def local_gpu_downsample_factor_max_grad(node):
gpu_from_host(gz)))] gpu_from_host(gz)))]
@register_opt()
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_gemm(node): def local_conv_gemm(node):
if (isinstance(node.op, GpuConv) and if (isinstance(node.op, GpuConv) and
...@@ -1493,7 +1518,6 @@ def local_conv_gemm(node): ...@@ -1493,7 +1518,6 @@ def local_conv_gemm(node):
return [GpuCorrMM_gradInputs('valid', subsample, pad)( return [GpuCorrMM_gradInputs('valid', subsample, pad)(
gpu_contiguous(kern), gpu_contiguous(img))] gpu_contiguous(kern), gpu_contiguous(img))]
gpu_optimizer.register("conv_gemm", local_conv_gemm)
from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
......
...@@ -586,6 +586,31 @@ def test_dnn_valid(): ...@@ -586,6 +586,31 @@ def test_dnn_valid():
yield t yield t
def test_default_conv():
"""Just test that we introduce the right GPU convolution
version.
"""
img = theano.tensor.ftensor4()
fil = theano.tensor.ftensor4()
c = theano.tensor.nnet.conv2d(img, fil)
f = theano.function([img, fil], c, mode=theano_mode)
if cuda.dnn.dnn_available():
assert any([isinstance(a.op, GpuDnnConv)
for a in f.maker.fgraph.apply_nodes])
else:
assert any([isinstance(a.op, cuda.blas.GpuCorrMM)
for a in f.maker.fgraph.apply_nodes])
mode = theano_mode.excluding('local_gpu_conv', 'local_conv_gemm')
f = theano.function([img, fil], c, mode=mode)
assert any([isinstance(a.op, cuda.blas.GpuConv)
for a in f.maker.fgraph.apply_nodes])
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]): def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
seed_rng() seed_rng()
shapes = get_basic_shapes() shapes = get_basic_shapes()
......
...@@ -78,13 +78,17 @@ def safe_to_cpu(x): ...@@ -78,13 +78,17 @@ def safe_to_cpu(x):
return x return x
def op_lifter(OP): def op_lifter(OP, cuda_only=False):
""" """
OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...)) OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...) gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)
""" """
def f(maker): def f(maker):
def local_opt(node): def local_opt(node):
dev = theano.sandbox.gpuarray.init_dev.device
if cuda_only and not dev.startswith('cuda'):
return
if type(node.op) in OP: if type(node.op) in OP:
# Either one of our inputs is on the gpu or # Either one of our inputs is on the gpu or
...@@ -484,25 +488,25 @@ def local_gpua_eye(node): ...@@ -484,25 +488,25 @@ def local_gpua_eye(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias]) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node): def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
return GpuCrossentropySoftmaxArgmax1HotWithBias() return GpuCrossentropySoftmaxArgmax1HotWithBias()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx]) @op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
def local_gpua_crossentropysoftmax1hotwithbiasdx(node): def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
return GpuCrossentropySoftmax1HotWithBiasDx() return GpuCrossentropySoftmax1HotWithBiasDx()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax]) @op_lifter([tensor.nnet.Softmax], cuda_only=True)
def local_gpua_softmax(node): def local_gpua_softmax(node):
return GpuSoftmax() return GpuSoftmax()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.SoftmaxWithBias]) @op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
def local_gpua_softmaxwithbias(node): def local_gpua_softmaxwithbias(node):
return GpuSoftmaxWithBias() return GpuSoftmaxWithBias()
......
...@@ -1281,7 +1281,7 @@ class TestAdvancedSubtensor(unittest.TestCase): ...@@ -1281,7 +1281,7 @@ class TestAdvancedSubtensor(unittest.TestCase):
z = b[[i, j], :] z = b[[i, j], :]
f1 = theano.function([i, j], z) f1 = theano.function([i, j], z)
cmd = f1(0, 1) == a[[0, 1], :] cmd = f1(0, 1) == a[[0, 1], :]
self.assertTrue(numpy.all(cmp)) self.assertTrue(cmd.all())
aa = rng.uniform(size=(4, 2, 3)) aa = rng.uniform(size=(4, 2, 3))
bb = theano.shared(aa) bb = theano.shared(aa)
...@@ -1289,7 +1289,7 @@ class TestAdvancedSubtensor(unittest.TestCase): ...@@ -1289,7 +1289,7 @@ class TestAdvancedSubtensor(unittest.TestCase):
z = bb[[i, j, k], :, i:k] z = bb[[i, j, k], :, i:k]
f2 = theano.function([i, j, k], z) f2 = theano.function([i, j, k], z)
cmd = f2(0, 1, 2) == aa[[0, 1, 2], :, 0:2] cmd = f2(0, 1, 2) == aa[[0, 1, 2], :, 0:2]
self.assertTrue(numpy.all(cmp)) self.assertTrue(cmd.all())
class TestInferShape(utt.InferShapeTester): class TestInferShape(utt.InferShapeTester):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论