提交 ee47526d authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #2192 from nouiz/mixed

Mixed
......@@ -22,6 +22,33 @@
.. moduleauthor:: LISA
.. note::
As of October 21st, 2014, the default GPU image convolution
changed. Here is the algorithm now used to pick the implementation:
- If we can use `cuDNN <https://developer.nvidia.com/cuDNN>`_, use it.
- If not, use the gemm version (slower than cuDNN, uses more memory).
If the users do not want the extra memory usage of the gemm
version, they can enable the legacy code that is even slower, but
does not use extra memory. For this, use the Theano flag
``optimizer_excluding=conv_gemm``.
There is no reason to use the legacy code or the gemm version if
cuDNN is available.
Two other options:
- There is also the fft version that is the fastest in some cases,
but uses even more memory. It does not support striding to remove
computation and has some shape restrictions.
- There is also the cuda_convnet convolution in Pylearn2. It uses a
different memory layout, has shape restrictions, but does not use
extra memory and is faster than the legacy convolution.
TODO: Give examples on how to use these things! They are pretty complicated.
- Convolution operators implemented:
......
==================
Advanced Indexing
==================
Continue the Advanced Indexing project that is on either GitHub or Bitbucket.
.. _tut_multi_cores:
=============================
Multi cores support in Theano
=============================
......
......@@ -135,9 +135,14 @@ class Apply(Node):
if len(self.outputs) == 1:
return self.outputs[0]
else:
raise AttributeError("%s.default_output should be an output index." % self.op)
raise AttributeError(
"%s.default_output should be an output index." % self.op)
elif not isinstance(do, (int, long)):
raise AttributeError("%s.default_output should be an int or long" %
self.op)
elif do < 0 or do >= len(self.outputs):
raise AttributeError("%s.default_output is out of range." % self.op)
raise AttributeError("%s.default_output is out of range." %
self.op)
return self.outputs[do]
def env_getter(self):
......
......@@ -616,6 +616,7 @@ class PerformLinker(LocalLinker):
f.allow_gc = self.allow_gc #HACK: this is a way of passing an arg to Function.__call__
add_clear_storage(f, computed, storage_map)
f.storage_map = storage_map
return f, [Container(input, storage) for input, storage in zip(fgraph.inputs, input_storage)], \
[Container(output, storage, True) for output, storage in zip(fgraph.outputs, output_storage)], \
......
......@@ -201,41 +201,43 @@ if __name__ == "__main__":
Test time in float32
cuda version 6.0 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
cuda version 6.5 6.0 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu
K6000/NOECC 0.06s
K40 0.07s
K20m/ECC 0.07s
K20/NOECC 0.07s
M2090 0.19s
C2075 0.25s
M2075 0.25s
M2070 0.25s 0.27s 0.32s
M2070-Q 0.48s 0.27s 0.32s
M2050(Amazon) 0.25s
C1060 0.46s
K600 1.04s
GTX Titan Black 0.05s
GTX Titan(D15U-50) 0.06s 0.06s don't work
GTX 780 0.06s
GTX 680 0.11s 0.12s 0.154s 0.218s
GTX 580 0.16s 0.16s 0.164s 0.203s
GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s
GTX 470 0.23s 0.23s 0.238s 0.297s 0.34s
GTX 660 0.18s 0.20s 0.23s
GTX 560 0.30s
GTX 650 Ti 0.27s
GTX 765M 0.27s
GTX 460 0.37s 0.45s
GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
750M 0.49s
GTX 550 Ti 0.57s
GT 520 2.68s 3.06s
520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
GT 220 3.80s
GT 210 6.35s
8500 GT 10.68s
K6000/NOECC 0.06s
K40 0.07s
K20m/ECC 0.07s
K20/NOECC 0.07s
M2090 0.19s
C2075 0.25s
M2075 0.25s
M2070 0.25s 0.27s 0.32s
M2070-Q 0.48s 0.27s 0.32s
M2050(Amazon) 0.25s
C1060 0.46s
K600 1.04s
GTX Titan Black 0.05s
GTX Titan(D15U-50) 0.06s 0.06s don't work
GTX 780 0.06s
GTX 970 0.08s
GTX 680 0.11s 0.12s 0.154s 0.218s
GTX 580 0.16s 0.16s 0.164s 0.203s
GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s
GTX 750 Ti 0.20s
GTX 470 0.23s 0.23s 0.238s 0.297s 0.34s
GTX 660 0.18s 0.20s 0.23s
GTX 560 0.30s
GTX 650 Ti 0.27s
GTX 765M 0.27s
GTX 460 0.37s 0.45s
GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
750M 0.49s
GTX 550 Ti 0.57s
GT 520 2.68s 3.06s
520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
GT 220 3.80s
GT 210 6.35s
8500 GT 10.68s
"""
t, impl = execute(not options.print_only, not options.quiet,
......
......@@ -1109,9 +1109,33 @@ def local_gpu_softmax_with_bias(node):
from theano.tensor.nnet import conv
@register_opt()
# This optimizer needs to be registered before local_gpu_conv_legacy;
# otherwise the legacy optimizer will have priority over it.  The
# preference order we want is: cuDNN when it is available and the GPU
# supports it, otherwise the gemm version.  Users who want the legacy
# convolution should use the Theano flag to disable the dnn and/or gemm
# version.
@register_opt("dnn")
@local_optimizer([gpu_from_host, conv.ConvOp])
def local_gpu_conv(node):
    """
    If cudnn is available, use it. Otherwise, use the gemm version.

    The node is first passed through ``local_gpu_conv_legacy``; the
    ``GpuConv`` apply node found behind that replacement is then handed
    to ``local_conv_dnn``.  Returns a one-element list containing the
    cuDNN result wrapped in ``host_from_gpu``, or ``None`` (no
    replacement) when cuDNN is unavailable or when either transform
    returns a falsy result.
    """
    dnn = theano.sandbox.cuda.dnn
    if not dnn.dnn_available():
        # Nothing to do here without cuDNN: local_gpu_conv_legacy will
        # introduce the legacy op, and local_conv_gemm will then convert
        # it to the gemm op.
        return None

    legacy_repl = local_gpu_conv_legacy.transform(node)
    if not legacy_repl:
        return None

    # Walk back from the legacy replacement to the apply node that feeds
    # it; the assert documents that it is expected to be the GpuConv.
    conv_node = legacy_repl[0].owner.inputs[0].owner
    assert isinstance(conv_node.op, GpuConv)

    dnn_repl = dnn.local_conv_dnn.transform(conv_node)
    if not dnn_repl:
        return None
    return [host_from_gpu(dnn_repl[0])]
@register_opt()
@local_optimizer([gpu_from_host, conv.ConvOp])
def local_gpu_conv_legacy(node):
"""
gpu_from_host(conv) -> gpu_conv(gpu_from_host)
......@@ -1438,6 +1462,7 @@ def local_gpu_downsample_factor_max_grad(node):
gpu_from_host(gz)))]
@register_opt()
@local_optimizer([GpuConv])
def local_conv_gemm(node):
if (isinstance(node.op, GpuConv) and
......@@ -1493,7 +1518,6 @@ def local_conv_gemm(node):
return [GpuCorrMM_gradInputs('valid', subsample, pad)(
gpu_contiguous(kern), gpu_contiguous(img))]
gpu_optimizer.register("conv_gemm", local_conv_gemm)
from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
......
......@@ -586,6 +586,31 @@ def test_dnn_valid():
yield t
def test_default_conv():
    """Check that the right GPU convolution version is introduced.

    With the default mode the compiled graph must contain GpuDnnConv
    when cuDNN is available and GpuCorrMM otherwise.  Once the
    'local_gpu_conv' and 'local_conv_gemm' optimizations are excluded,
    the graph must contain GpuConv.
    """
    def graph_has_op(fn, op_class):
        # True when at least one apply node of the compiled function's
        # graph carries an op that is an instance of op_class.
        return any([isinstance(apply_node.op, op_class)
                    for apply_node in fn.maker.fgraph.apply_nodes])

    img = theano.tensor.ftensor4()
    fil = theano.tensor.ftensor4()
    c = theano.tensor.nnet.conv2d(img, fil)

    # Default optimizations: cuDNN if available, else gemm.
    f = theano.function([img, fil], c, mode=theano_mode)
    if cuda.dnn.dnn_available():
        expected_op = GpuDnnConv
    else:
        expected_op = cuda.blas.GpuCorrMM
    assert graph_has_op(f, expected_op)

    # With both newer versions excluded, GpuConv must be used.
    mode = theano_mode.excluding('local_gpu_conv', 'local_conv_gemm')
    f = theano.function([img, fil], c, mode=mode)
    assert graph_has_op(f, cuda.blas.GpuConv)
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
seed_rng()
shapes = get_basic_shapes()
......
......@@ -78,13 +78,17 @@ def safe_to_cpu(x):
return x
def op_lifter(OP):
def op_lifter(OP, cuda_only=False):
"""
OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)
"""
def f(maker):
def local_opt(node):
dev = theano.sandbox.gpuarray.init_dev.device
if cuda_only and not dev.startswith('cuda'):
return
if type(node.op) in OP:
# Either one of our inputs is on the gpu or
......@@ -484,25 +488,25 @@ def local_gpua_eye(node):
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
return GpuCrossentropySoftmaxArgmax1HotWithBias()
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
return GpuCrossentropySoftmax1HotWithBiasDx()
@register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax])
@op_lifter([tensor.nnet.Softmax], cuda_only=True)
def local_gpua_softmax(node):
return GpuSoftmax()
@register_opt('fast_compile')
@op_lifter([tensor.nnet.SoftmaxWithBias])
@op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
def local_gpua_softmaxwithbias(node):
return GpuSoftmaxWithBias()
......
......@@ -1281,7 +1281,7 @@ class TestAdvancedSubtensor(unittest.TestCase):
z = b[[i, j], :]
f1 = theano.function([i, j], z)
cmd = f1(0, 1) == a[[0, 1], :]
self.assertTrue(numpy.all(cmp))
self.assertTrue(cmd.all())
aa = rng.uniform(size=(4, 2, 3))
bb = theano.shared(aa)
......@@ -1289,7 +1289,7 @@ class TestAdvancedSubtensor(unittest.TestCase):
z = bb[[i, j, k], :, i:k]
f2 = theano.function([i, j, k], z)
cmd = f2(0, 1, 2) == aa[[0, 1, 2], :, 0:2]
self.assertTrue(numpy.all(cmp))
self.assertTrue(cmd.all())
class TestInferShape(utt.InferShapeTester):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论