Commit a7ca0c4f authored by Frederic Bastien

Make the GpuElemwise optimization work inplace, as Elemwise does.

Parent 44af7bc1
...
@@ -85,13 +85,9 @@ class GpuElemwise(Op):
         #
         sync = config.gpuelemwise.sync
         self.scalar_op = scalar_op
-        if 0:
-            #we don't put them their as this cause trouble with the local_cut_gpu_host_gpu optimizer.
-            #and the gpu don't implement any inplace pattern for now.
-            self.inplace_pattern = inplace_pattern
-            self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
-        else:
-            self.inplace_pattern = {}
+        self.inplace_pattern = inplace_pattern
+        self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
         self.sync = sync
...
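The constructor now always records the inplace pattern and derives a destroy_map from it, instead of forcing the pattern to be empty. Below is a minimal sketch of that bookkeeping outside of Theano; the class name FakeOp is illustrative, only the meaning of destroy_map follows Theano's convention (output index mapped to the list of input indices whose storage that output may overwrite).

# Hedged sketch: how an inplace_pattern (output index -> input index)
# becomes a destroy_map (output index -> list of destroyed input indices).
# FakeOp is an illustrative stand-in, not Theano's GpuElemwise.
class FakeOp(object):
    def __init__(self, inplace_pattern):
        self.inplace_pattern = inplace_pattern
        self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())

op = FakeOp({0: 0})             # output 0 may reuse the storage of input 0
assert op.destroy_map == {0: [0]}

Theano's destroy handler consults destroy_map when scheduling the graph, so a destructive op only runs once the input it overwrites has no other pending readers.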
...
@@ -89,7 +89,8 @@ def local_gpu_elemwise_0(node):
     if isinstance(node.op, tensor.Elemwise):
         if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
             if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
-                new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
+                #don't set any inplace pattern. gpu_insert_inplace_optimizer will do it later
+                new_op = GpuElemwise(node.op.scalar_op)
                 # case 1 - all inputs are already float32
                 if numpy.all([i.type.dtype == 'float32' for i in node.inputs]):
...
@@ -120,7 +121,8 @@ def local_gpu_elemwise_1(node):
         host_i, = node.inputs
         if host_i.owner and isinstance(host_i.owner.op, tensor.Elemwise) and len(host_i.clients)==1:
             elemwise_node = host_i.owner
-            new_op = GpuElemwise(elemwise_node.op.scalar_op, elemwise_node.op.inplace_pattern)
+            #don't set any inplace pattern. gpu_insert_inplace_optimizer will do it later
+            new_op = GpuElemwise(elemwise_node.op.scalar_op)
             if all([i.dtype=='float32' for i in elemwise_node.inputs]):
                 return [new_op(*[gpu_from_host(i) for i in elemwise_node.inputs])]
     return False
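Both local_gpu_elemwise_0 and local_gpu_elemwise_1 now build the GpuElemwise without any inplace pattern; the dedicated pass registered below marks candidates inplace afterwards. For readers unfamiliar with Theano's local optimizers, here is a heavily simplified sketch of their general shape; the decorators and helpers (register_opt, local_optimizer, gpu_from_host, host_from_gpu) mirror names visible in this diff, but the body is illustrative and not the real implementation.

# Illustrative sketch only: a local optimizer inspects one node and
# returns either a list of replacement outputs or False.
@register_opt()
@local_optimizer([tensor.Elemwise])
def local_gpu_elemwise_sketch(node):
    if isinstance(node.op, tensor.Elemwise):
        if all(i.type.dtype == 'float32' for i in node.inputs):
            # Build the GPU op *without* an inplace pattern here;
            # gpu_insert_inplace_optimizer decides on inplace later.
            new_op = GpuElemwise(node.op.scalar_op)
            return [host_from_gpu(new_op(*[gpu_from_host(i) for i in node.inputs]))]
    return False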
...
@@ -629,6 +631,9 @@ else:
     _logger.debug("not enabling optimization fusion of gpu elemwise in fast_run")
     compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
+#GpuElemwise inplace
+gpu_insert_inplace_optimizer = tensor.opt.insert_inplace_optimizer_op(GpuElemwise)
+compile.optdb.register('gpu_inplace_opt', gpu_insert_inplace_optimizer, 75, 'fast_run', 'inplace','gpu_inplace')
 
 @register_opt()
 @local_optimizer([tensor.Alloc])
...
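The new pass is registered in the optimization database at position 75, after the GPU elemwise fusion pass at 71.00, so elemwise nodes are fused first and the resulting GpuElemwise nodes are then made inplace. The 'inplace' and 'gpu_inplace' tags allow the pass to be switched off. A hedged usage sketch follows; excluding optimizations by tag is standard Theano usage, but the variable names here are illustrative and not part of this commit.

# Hedged sketch: disable the new pass by excluding one of its tags.
import theano

mode = theano.compile.mode.get_default_mode().excluding('gpu_inplace')
# Compiling with mode=mode should leave GpuElemwise nodes with an empty
# destroy_map, while the default fast_run mode lets them run inplace.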
...
@@ -217,6 +217,9 @@ def test_elemwise0():
     f = pfunc([b], [], updates=[(a, a+b)], mode=mode_with_gpu)
 
+    #check that we work inplace.
+    assert f.maker.env.toposort()[1].op.destroy_map.items()==[(0,[0])]
 
     a0 = a.value * 1.0
     print 'BEFORE ADD', a.value
     for i, node in enumerate(f.maker.env.toposort()):
...
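The added assertion hard-codes that node 1 of the topologically sorted graph is the inplace GpuElemwise. A slightly more position-independent check, shown here only as an illustrative alternative and not part of the commit, would scan the graph for any node whose op declares the expected destroy_map.

# Illustrative alternative: assert that some node in the compiled graph
# runs inplace, without hard-coding its position in the toposort.
assert any(getattr(node.op, 'destroy_map', {}) == {0: [0]}
           for node in f.maker.env.toposort())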
Diff is collapsed.