提交 ecbde210 authored 作者: Frederic Bastien's avatar Frederic Bastien

Don't move Elemwise to gpu when we can't generate c code for it! Test it.

上级 d44fc708
......@@ -90,6 +90,8 @@ def local_gpu_elemwise_0(node):
if isinstance(node.op, tensor.Elemwise):
if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
if max_inputs_to_GpuElemwise(node)<len(node.inputs):
return False
#don't set any inplace pattern. gpu_insert_inplace_optimizer will do it later
new_op = GpuElemwise(node.op.scalar_op)
......@@ -113,9 +115,10 @@ def local_gpu_elemwise_0(node):
else:
return False
gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner).outputs[0]
return [host_from_gpu(gpu_elemwise)]
gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner)
if not gpu_elemwise:
return False
return [host_from_gpu(gpu_elemwise.outputs[0])]
@register_opt()
@local_optimizer([])
def local_gpu_elemwise_1(node):
......@@ -130,8 +133,10 @@ def local_gpu_elemwise_1(node):
new_op = GpuElemwise(elemwise_node.op.scalar_op)
if all([i.dtype=='float32' for i in elemwise_node.inputs]):
gpu_elemwise = new_op(*[gpu_from_host(i) for i in elemwise_node.inputs])
gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner).outputs[0]
return [gpu_elemwise]
gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner)
if not gpu_elemwise:
return False
return [gpu_elemwise.outputs[0]]
return False
@register_opt()
......@@ -762,6 +767,8 @@ def split_huge_add_or_mul(node):
"""
if node.op.scalar_op in (scal.add, scal.mul):
max_nb_inputs = max_inputs_to_GpuElemwise(node)
if max_nb_inputs<=1 and len(node.inputs)>1:
return False
while len(node.inputs)>max_nb_inputs:
inner_op = []
for i in range(0,len(node.inputs),max_nb_inputs):
......
......@@ -176,6 +176,23 @@ def test_huge_elemwise_fusion():
gen = lambda : theano._asarray(numpy.random.rand(*shape), dtype='float32')
f(gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen())
# Test the case where we can't put the computation on the gpu! There are too many
# dimensions in the input to have 2 inputs to the op!
shape = (1,2,3,4,5,6,7,2,2,3,2,1,2,2,2,)
ttype = tensor.tensor(dtype='float32',broadcastable=(False,)*len(shape))
vars = [tensor.tanh(ttype) for x in range(10)]
f = pfunc(vars, [vars[0]-vars[1]-vars[2]-vars[3]-vars[4]-vars[5]-vars[6]], mode=mode_with_gpu)
topo = f.maker.env.toposort()
#theano.printing.debugprint(f)
assert len(topo)==1
assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo])==0
assert sum([isinstance(node.op, tensor.Elemwise) for node in topo])==1
#let debugmode catch errors
gen = lambda : theano._asarray(numpy.random.rand(*shape), dtype='float32')
f(gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen())
def test_elemwise_fusion():
""" Test that the GpuElemwise fusion works correctly"""
shape = (3,4)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论