提交 a7ca0c4f authored 作者: Frederic Bastien's avatar Frederic Bastien

Make the GpuElemwise optimization work inplace, as the Elemwise one does.

上级 44af7bc1
...@@ -85,13 +85,9 @@ class GpuElemwise(Op): ...@@ -85,13 +85,9 @@ class GpuElemwise(Op):
# #
sync = config.gpuelemwise.sync sync = config.gpuelemwise.sync
self.scalar_op = scalar_op self.scalar_op = scalar_op
if 0:
#we don't put them their as this cause trouble with the local_cut_gpu_host_gpu optimizer. self.inplace_pattern = inplace_pattern
#and the gpu don't implement any inplace pattern for now. self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
self.inplace_pattern = inplace_pattern
self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
else:
self.inplace_pattern = {}
self.sync = sync self.sync = sync
......
...@@ -89,7 +89,8 @@ def local_gpu_elemwise_0(node): ...@@ -89,7 +89,8 @@ def local_gpu_elemwise_0(node):
if isinstance(node.op, tensor.Elemwise): if isinstance(node.op, tensor.Elemwise):
if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]): if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
if numpy.all([o.type.dtype == 'float32' for o in node.outputs]): if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern) #don't set any inplace pattern. gpu_insert_inplace_optimizer will do it later
new_op = GpuElemwise(node.op.scalar_op)
# case 1 - all inputs are already float32 # case 1 - all inputs are already float32
if numpy.all([i.type.dtype == 'float32' for i in node.inputs]): if numpy.all([i.type.dtype == 'float32' for i in node.inputs]):
...@@ -120,7 +121,8 @@ def local_gpu_elemwise_1(node): ...@@ -120,7 +121,8 @@ def local_gpu_elemwise_1(node):
host_i, = node.inputs host_i, = node.inputs
if host_i.owner and isinstance(host_i.owner.op, tensor.Elemwise) and len(host_i.clients)==1: if host_i.owner and isinstance(host_i.owner.op, tensor.Elemwise) and len(host_i.clients)==1:
elemwise_node = host_i.owner elemwise_node = host_i.owner
new_op = GpuElemwise(elemwise_node.op.scalar_op, elemwise_node.op.inplace_pattern) #don't set any inplace pattern. gpu_insert_inplace_optimizer will do it later
new_op = GpuElemwise(elemwise_node.op.scalar_op)
if all([i.dtype=='float32' for i in elemwise_node.inputs]): if all([i.dtype=='float32' for i in elemwise_node.inputs]):
return [new_op(*[gpu_from_host(i) for i in elemwise_node.inputs])] return [new_op(*[gpu_from_host(i) for i in elemwise_node.inputs])]
return False return False
...@@ -629,6 +631,9 @@ else: ...@@ -629,6 +631,9 @@ else:
_logger.debug("not enabling optimization fusion of gpu elemwise in fast_run") _logger.debug("not enabling optimization fusion of gpu elemwise in fast_run")
compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion') compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
#GpuElemwise inplace
gpu_insert_inplace_optimizer = tensor.opt.insert_inplace_optimizer_op(GpuElemwise)
compile.optdb.register('gpu_inplace_opt', gpu_insert_inplace_optimizer, 75, 'fast_run', 'inplace','gpu_inplace')
@register_opt() @register_opt()
@local_optimizer([tensor.Alloc]) @local_optimizer([tensor.Alloc])
......
...@@ -217,6 +217,9 @@ def test_elemwise0(): ...@@ -217,6 +217,9 @@ def test_elemwise0():
f = pfunc([b], [], updates=[(a, a+b)], mode=mode_with_gpu) f = pfunc([b], [], updates=[(a, a+b)], mode=mode_with_gpu)
#check that we work inplace.
assert f.maker.env.toposort()[1].op.destroy_map.items()==[(0,[0])]
a0 = a.value * 1.0 a0 = a.value * 1.0
print 'BEFORE ADD', a.value print 'BEFORE ADD', a.value
for i, node in enumerate(f.maker.env.toposort()): for i, node in enumerate(f.maker.env.toposort()):
......
...@@ -98,112 +98,120 @@ theano.configparser.AddConfigVar('tensor.insert_inplace_optimizer_validate_nb', ...@@ -98,112 +98,120 @@ theano.configparser.AddConfigVar('tensor.insert_inplace_optimizer_validate_nb',
"-1: auto, if graph have less then 500 nodes 1, else 10", "-1: auto, if graph have less then 500 nodes 1, else 10",
theano.configparser.IntParam(-1)) theano.configparser.IntParam(-1))
@gof.optimizer def insert_inplace_optimizer_op(OP):
def insert_inplace_optimizer(env):
""" """
Usage: inplace_optimizer.optimize(env) We parametrise it to make it work for Elemwise and GpuElemwise op.
"""
Attempts to replace all Broadcast ops by versions of them @gof.optimizer
that operate inplace. It operates greedily: for each Broadcast def insert_inplace_optimizer(env):
Op that is encountered, for each output, tries each input to """
see if it can operate inplace on that input. If so, makes the Usage: inplace_optimizer.optimize(env)
change and go to the next output or Broadcast Op.
Examples: Attempts to replace all Broadcast ops by versions of them
x + y + z -> x += y += z that operate inplace. It operates greedily: for each Broadcast
(x + y) * (x * y) -> (x += y) *= (x * y) or (x + y) *= (x *= y) Op that is encountered, for each output, tries each input to
""" see if it can operate inplace on that input. If so, makes the
#we should not validate too often as this take too much time to execute! change and go to the next output or Broadcast Op.
#It is the _dfs_toposort() fct in theano/gof/destroyhandler.py
#that take so much time.
#Should we try to use another lib that do toposort?
# igraph: http://igraph.sourceforge.net/
# networkx: https://networkx.lanl.gov/
#Should we try to use cython?
# compiling only that fct is not enought, should we try to add the deque class too?
# and init the deque and other list to an upper bound number of element?
#Should Theano do online toposort as in http://code.google.com/p/acyclic/?
#
#The next longuest optimizer is the canonizer phase
#Then I think it is the [io_?]toposort(need to validate) so check if the solution is also applicable their.
#we execute validate after this number of change.
validate_each_change = config.tensor.insert_inplace_optimizer_validate_nb
if validate_each_change==-1:
if len(env.nodes)>500:
validate_each_change = 10
else: validate_each_change = 1
nb_change_no_validate = 0
chk = env.checkpoint()
for node in list(graph.io_toposort(env.inputs, env.outputs)):
op = node.op
if not isinstance(op, Elemwise):
continue
baseline = op.inplace_pattern
protected_inputs = [f.protected for f in node.env._features if isinstance(f,theano.compile.function_module.Supervisor)]
protected_inputs = sum(protected_inputs,[])#flatten the list
protected_inputs.extend(env.outputs)
candidate_outputs = [i for i in xrange(len(node.outputs)) if i not in baseline]
#node inputs that are Constant, already destroyed,
# env protected inputs and env outputs can't be used as inplace target.
# Remove here as faster.
candidate_inputs = [i for i in xrange(len(node.inputs)) if i not in baseline.values() \
and not isinstance(node.inputs[i],Constant)\
and not env.destroyers(node.inputs[i])\
and node.inputs[i] not in protected_inputs]
verbose = False
raised_warning = not verbose
for candidate_output in candidate_outputs:
for candidate_input in candidate_inputs:
#remove inputs that don't have the same dtype as the output.
if node.inputs[candidate_input].type!=node.outputs[candidate_output].type:
continue
inplace_pattern = dict(baseline, **{candidate_output: candidate_input})
try:
if hasattr(op.scalar_op,"make_new_inplace"):
new_scal = op.scalar_op.make_new_inplace(
scalar.transfer_type(
*[inplace_pattern.get(i, None) \
for i in xrange(len(node.outputs))]))
else:
new_scal = op.scalar_op.__class__(
scalar.transfer_type(
*[inplace_pattern.get(i, None) \
for i in xrange(len(node.outputs))]))
new = Elemwise(new_scal,inplace_pattern).make_node(*node.inputs)
for r,new_r in zip(node.outputs,new.outputs):
env.replace(r,new_r,
reason="insert_inplace_optimizer")
nb_change_no_validate +=1
if nb_change_no_validate >= validate_each_change:
env.validate()
chk = env.checkpoint()
nb_change_no_validate = 0
except (ValueError, TypeError, InconsistencyError), e:
if validate_each_change!=1 and not raised_warning:
print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error:"
print >> sys.stderr, e
raised_warning = True
env.revert(chk)
continue
candidate_inputs.remove(candidate_input)
node = new
baseline = inplace_pattern
break
if nb_change_no_validate>0: Examples:
try: x + y + z -> x += y += z
env.validate() (x + y) * (x * y) -> (x += y) *= (x * y) or (x + y) *= (x *= y)
except Exception, e: """
if not raised_warning: #we should not validate too often as this take too much time to execute!
print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error" #It is the _dfs_toposort() fct in theano/gof/destroyhandler.py
env.revert(chk) #that take so much time.
#Should we try to use another lib that do toposort?
# igraph: http://igraph.sourceforge.net/
# networkx: https://networkx.lanl.gov/
#Should we try to use cython?
# compiling only that fct is not enought, should we try to add the deque class too?
# and init the deque and other list to an upper bound number of element?
#Should Theano do online toposort as in http://code.google.com/p/acyclic/?
#
#The next longuest optimizer is the canonizer phase
#Then I think it is the [io_?]toposort(need to validate) so check if the solution is also applicable their.
#we execute validate after this number of change.
validate_each_change = config.tensor.insert_inplace_optimizer_validate_nb
if validate_each_change==-1:
if len(env.nodes)>500:
validate_each_change = 10
else: validate_each_change = 1
nb_change_no_validate = 0
chk = env.checkpoint()
for node in list(graph.io_toposort(env.inputs, env.outputs)):
op = node.op
if not isinstance(op, OP):
continue
baseline = op.inplace_pattern
protected_inputs = [f.protected for f in node.env._features if isinstance(f,theano.compile.function_module.Supervisor)]
protected_inputs = sum(protected_inputs,[])#flatten the list
protected_inputs.extend(env.outputs)
candidate_outputs = [i for i in xrange(len(node.outputs)) if i not in baseline]
#node inputs that are Constant, already destroyed,
# env protected inputs and env outputs can't be used as inplace target.
# Remove here as faster.
candidate_inputs = [i for i in xrange(len(node.inputs)) if i not in baseline.values() \
and not isinstance(node.inputs[i],Constant)\
and not env.destroyers(node.inputs[i])\
and node.inputs[i] not in protected_inputs]
verbose = False
raised_warning = not verbose
for candidate_output in candidate_outputs:
for candidate_input in candidate_inputs:
#remove inputs that don't have the same dtype as the output.
if node.inputs[candidate_input].type!=node.outputs[candidate_output].type:
continue
inplace_pattern = dict(baseline, **{candidate_output: candidate_input})
try:
if hasattr(op.scalar_op,"make_new_inplace"):
new_scal = op.scalar_op.make_new_inplace(
scalar.transfer_type(
*[inplace_pattern.get(i, None) \
for i in xrange(len(node.outputs))]))
else:
new_scal = op.scalar_op.__class__(
scalar.transfer_type(
*[inplace_pattern.get(i, None) \
for i in xrange(len(node.outputs))]))
new = OP(new_scal,inplace_pattern).make_node(*node.inputs)
for r,new_r in zip(node.outputs,new.outputs):
env.replace(r,new_r,
reason="insert_inplace_optimizer")
nb_change_no_validate +=1
if nb_change_no_validate >= validate_each_change:
env.validate()
chk = env.checkpoint()
nb_change_no_validate = 0
except (ValueError, TypeError, InconsistencyError), e:
if validate_each_change!=1 and not raised_warning:
print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error:"
print >> sys.stderr, e
raised_warning = True
env.revert(chk)
continue
candidate_inputs.remove(candidate_input)
node = new
baseline = inplace_pattern
break
if nb_change_no_validate>0:
try:
env.validate()
except Exception, e:
if not raised_warning:
print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error"
env.revert(chk)
return insert_inplace_optimizer
insert_inplace_optimizer = insert_inplace_optimizer_op(T.Elemwise)
compile.optdb.register('inplace_opt', insert_inplace_optimizer, 75, 'fast_run', 'inplace') compile.optdb.register('inplace_opt', insert_inplace_optimizer, 75, 'fast_run', 'inplace')
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论