提交 4f06e78d authored 作者: Frederic's avatar Frederic

If we keep the Subtensor on the CPU, keep the bubble active

fix gh-2327
上级 c7034860
...@@ -44,7 +44,7 @@ from theano.sandbox.cuda.elemwise import SupportCodeError ...@@ -44,7 +44,7 @@ from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.scalar.basic_scipy import Erfinv from theano.scalar.basic_scipy import Erfinv
from theano.sandbox.cuda.elemwise import erfinv_gpu from theano.sandbox.cuda.elemwise import erfinv_gpu
from theano.sandbox.cuda.var import CudaNdarrayConstant from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg from theano.tensor import nlinalg
...@@ -832,6 +832,11 @@ def local_gpu_subtensor(node): ...@@ -832,6 +832,11 @@ def local_gpu_subtensor(node):
isinstance(host_input.owner.op, tensor.Subtensor): isinstance(host_input.owner.op, tensor.Subtensor):
subt = host_input.owner.op subt = host_input.owner.op
x = host_input.owner.inputs[0] x = host_input.owner.inputs[0]
if len(x.clients) == 1:
# This means the input of the subtensor is used only by
# the subtensor. We do not want to move the subtensor
# to the GPU in that case.
return
coords = host_input.owner.inputs[1:] coords = host_input.owner.inputs[1:]
return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)] return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)]
if isinstance(node.op, tensor.Subtensor): if isinstance(node.op, tensor.Subtensor):
...@@ -840,8 +845,16 @@ def local_gpu_subtensor(node): ...@@ -840,8 +845,16 @@ def local_gpu_subtensor(node):
isinstance(x.owner.op, HostFromGpu) and isinstance(x.owner.op, HostFromGpu) and
x.dtype == "float32"): x.dtype == "float32"):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
if gpu_x.owner and isinstance(gpu_x.owner.op, GpuFromHost) and not gpu_x.owner.inputs[0].owner: if (gpu_x.owner and
isinstance(gpu_x.owner.op, GpuFromHost) and
# And it is a shared var or an input of the graph.
not gpu_x.owner.inputs[0].owner):
if len(x.clients) == 1: if len(x.clients) == 1:
if any([n == 'output' or isinstance(n.op, GpuOp)
for n,_ in node.outputs[0].clients]):
return
else:
return [host_from_gpu(gpu_from_host(node.outputs[0]))]
return return
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
......
...@@ -211,7 +211,6 @@ def test_local_gpu_subtensor(): ...@@ -211,7 +211,6 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
theano.printing.debugprint(f)
# Test graph input. # Test graph input.
t = tensor.fmatrix() t = tensor.fmatrix()
...@@ -219,17 +218,32 @@ def test_local_gpu_subtensor(): ...@@ -219,17 +218,32 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
theano.printing.debugprint(f)
# Test multiple use of the input # Test multiple use of the input
# We want the subtensor to be on the GPU to prevent multiple transfer. # We want the subtensor to be on the GPU to prevent multiple transfer.
t = tensor.fmatrix() t = tensor.fmatrix()
f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu) f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
theano.printing.debugprint(f)
assert not any([type(node.op) is tensor.Subtensor for node in topo]) assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo]) assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
# Test multiple use of the input + input as output
# We want the subtensor to be on the GPU to prevent multiple transfer.
t = tensor.fmatrix()
f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
# Test a shared variable forced on the CPU, where we do computation
# on the output of the subtensor.
t = tensor._shared(numpy.zeros(20, "float32"))
f = theano.function([], t[3:4]+1, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
assert any([isinstance(node.op, cuda.GpuElemwise) for node in topo])
def test_print_op(): def test_print_op():
""" Test that print ops don't block gpu optimization""" """ Test that print ops don't block gpu optimization"""
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论