提交 a7ca0c4f authored 作者: Frederic Bastien's avatar Frederic Bastien

Make the GpuElemwise optimization work inplace, as the Elemwise one does.

上级 44af7bc1
......@@ -85,13 +85,9 @@ class GpuElemwise(Op):
#
sync = config.gpuelemwise.sync
self.scalar_op = scalar_op
if 0:
#We don't set them here, as this causes trouble with the local_cut_gpu_host_gpu optimizer,
#and the GPU doesn't implement any inplace pattern for now.
self.inplace_pattern = inplace_pattern
self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
else:
self.inplace_pattern = {}
self.inplace_pattern = inplace_pattern
self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
self.sync = sync
......
......@@ -89,7 +89,8 @@ def local_gpu_elemwise_0(node):
if isinstance(node.op, tensor.Elemwise):
if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
#Don't set any inplace pattern here; gpu_insert_inplace_optimizer will add it later.
new_op = GpuElemwise(node.op.scalar_op)
# case 1 - all inputs are already float32
if numpy.all([i.type.dtype == 'float32' for i in node.inputs]):
......@@ -120,7 +121,8 @@ def local_gpu_elemwise_1(node):
host_i, = node.inputs
if host_i.owner and isinstance(host_i.owner.op, tensor.Elemwise) and len(host_i.clients)==1:
elemwise_node = host_i.owner
new_op = GpuElemwise(elemwise_node.op.scalar_op, elemwise_node.op.inplace_pattern)
#Don't set any inplace pattern here; gpu_insert_inplace_optimizer will add it later.
new_op = GpuElemwise(elemwise_node.op.scalar_op)
if all([i.dtype=='float32' for i in elemwise_node.inputs]):
return [new_op(*[gpu_from_host(i) for i in elemwise_node.inputs])]
return False
......@@ -629,6 +631,9 @@ else:
_logger.debug("not enabling optimization fusion of gpu elemwise in fast_run")
compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
#GpuElemwise inplace
gpu_insert_inplace_optimizer = tensor.opt.insert_inplace_optimizer_op(GpuElemwise)
compile.optdb.register('gpu_inplace_opt', gpu_insert_inplace_optimizer, 75, 'fast_run', 'inplace','gpu_inplace')
@register_opt()
@local_optimizer([tensor.Alloc])
......
......@@ -217,6 +217,9 @@ def test_elemwise0():
f = pfunc([b], [], updates=[(a, a+b)], mode=mode_with_gpu)
#check that we work inplace.
assert f.maker.env.toposort()[1].op.destroy_map.items()==[(0,[0])]
a0 = a.value * 1.0
print 'BEFORE ADD', a.value
for i, node in enumerate(f.maker.env.toposort()):
......
......@@ -98,112 +98,120 @@ theano.configparser.AddConfigVar('tensor.insert_inplace_optimizer_validate_nb',
"-1: auto, if graph have less then 500 nodes 1, else 10",
theano.configparser.IntParam(-1))
@gof.optimizer
def insert_inplace_optimizer(env):
def insert_inplace_optimizer_op(OP):
"""
Usage: inplace_optimizer.optimize(env)
Attempts to replace all Broadcast ops by versions of them
that operate inplace. It operates greedily: for each Broadcast
Op that is encountered, for each output, tries each input to
see if it can operate inplace on that input. If so, makes the
change and go to the next output or Broadcast Op.
We parametrise it to make it work for Elemwise and GpuElemwise op.
"""
@gof.optimizer
def insert_inplace_optimizer(env):
"""
Usage: inplace_optimizer.optimize(env)
Examples:
x + y + z -> x += y += z
(x + y) * (x * y) -> (x += y) *= (x * y) or (x + y) *= (x *= y)
"""
#We should not validate too often, as this takes too much time to execute!
#It is the _dfs_toposort() function in theano/gof/destroyhandler.py
#that takes so much time.
#Should we try to use another library that does toposort?
# igraph: http://igraph.sourceforge.net/
# networkx: https://networkx.lanl.gov/
#Should we try to use Cython?
# Compiling only that function is not enough; should we try to add the deque class too,
# and initialize the deque and other lists to an upper bound on the number of elements?
#Should Theano do an online toposort as in http://code.google.com/p/acyclic/?
#
#The next longest optimizer is the canonizer phase.
#Then I think it is the [io_?]toposort (this needs to be validated), so check whether the solution is also applicable there.
#We execute validate after this number of changes.
validate_each_change = config.tensor.insert_inplace_optimizer_validate_nb
if validate_each_change==-1:
if len(env.nodes)>500:
validate_each_change = 10
else: validate_each_change = 1
nb_change_no_validate = 0
chk = env.checkpoint()
for node in list(graph.io_toposort(env.inputs, env.outputs)):
op = node.op
if not isinstance(op, Elemwise):
continue
baseline = op.inplace_pattern
protected_inputs = [f.protected for f in node.env._features if isinstance(f,theano.compile.function_module.Supervisor)]
protected_inputs = sum(protected_inputs,[])#flatten the list
protected_inputs.extend(env.outputs)
candidate_outputs = [i for i in xrange(len(node.outputs)) if i not in baseline]
#node inputs that are Constant, already destroyed,
# env protected inputs and env outputs can't be used as inplace target.
# Remove here as faster.
candidate_inputs = [i for i in xrange(len(node.inputs)) if i not in baseline.values() \
and not isinstance(node.inputs[i],Constant)\
and not env.destroyers(node.inputs[i])\
and node.inputs[i] not in protected_inputs]
verbose = False
raised_warning = not verbose
for candidate_output in candidate_outputs:
for candidate_input in candidate_inputs:
#remove inputs that don't have the same dtype as the output.
if node.inputs[candidate_input].type!=node.outputs[candidate_output].type:
continue
inplace_pattern = dict(baseline, **{candidate_output: candidate_input})
try:
if hasattr(op.scalar_op,"make_new_inplace"):
new_scal = op.scalar_op.make_new_inplace(
scalar.transfer_type(
*[inplace_pattern.get(i, None) \
for i in xrange(len(node.outputs))]))
else:
new_scal = op.scalar_op.__class__(
scalar.transfer_type(
*[inplace_pattern.get(i, None) \
for i in xrange(len(node.outputs))]))
new = Elemwise(new_scal,inplace_pattern).make_node(*node.inputs)
for r,new_r in zip(node.outputs,new.outputs):
env.replace(r,new_r,
reason="insert_inplace_optimizer")
nb_change_no_validate +=1
if nb_change_no_validate >= validate_each_change:
env.validate()
chk = env.checkpoint()
nb_change_no_validate = 0
except (ValueError, TypeError, InconsistencyError), e:
if validate_each_change!=1 and not raised_warning:
print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error:"
print >> sys.stderr, e
raised_warning = True
env.revert(chk)
continue
candidate_inputs.remove(candidate_input)
node = new
baseline = inplace_pattern
break
Attempts to replace all Broadcast ops by versions of them
that operate inplace. It operates greedily: for each Broadcast
Op that is encountered, for each output, tries each input to
see if it can operate inplace on that input. If so, makes the
change and go to the next output or Broadcast Op.
if nb_change_no_validate>0:
try:
env.validate()
except Exception, e:
if not raised_warning:
print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error"
env.revert(chk)
Examples:
x + y + z -> x += y += z
(x + y) * (x * y) -> (x += y) *= (x * y) or (x + y) *= (x *= y)
"""
#We should not validate too often, as this takes too much time to execute!
#It is the _dfs_toposort() function in theano/gof/destroyhandler.py
#that takes so much time.
#Should we try to use another library that does toposort?
# igraph: http://igraph.sourceforge.net/
# networkx: https://networkx.lanl.gov/
#Should we try to use Cython?
# Compiling only that function is not enough; should we try to add the deque class too,
# and initialize the deque and other lists to an upper bound on the number of elements?
#Should Theano do an online toposort as in http://code.google.com/p/acyclic/?
#
#The next longest optimizer is the canonizer phase.
#Then I think it is the [io_?]toposort (this needs to be validated), so check whether the solution is also applicable there.
#We execute validate after this number of changes.
validate_each_change = config.tensor.insert_inplace_optimizer_validate_nb
if validate_each_change==-1:
if len(env.nodes)>500:
validate_each_change = 10
else: validate_each_change = 1
nb_change_no_validate = 0
chk = env.checkpoint()
for node in list(graph.io_toposort(env.inputs, env.outputs)):
op = node.op
if not isinstance(op, OP):
continue
baseline = op.inplace_pattern
protected_inputs = [f.protected for f in node.env._features if isinstance(f,theano.compile.function_module.Supervisor)]
protected_inputs = sum(protected_inputs,[])#flatten the list
protected_inputs.extend(env.outputs)
candidate_outputs = [i for i in xrange(len(node.outputs)) if i not in baseline]
#node inputs that are Constant, already destroyed,
# env protected inputs and env outputs can't be used as inplace target.
# Remove here as faster.
candidate_inputs = [i for i in xrange(len(node.inputs)) if i not in baseline.values() \
and not isinstance(node.inputs[i],Constant)\
and not env.destroyers(node.inputs[i])\
and node.inputs[i] not in protected_inputs]
verbose = False
raised_warning = not verbose
for candidate_output in candidate_outputs:
for candidate_input in candidate_inputs:
#remove inputs that don't have the same dtype as the output.
if node.inputs[candidate_input].type!=node.outputs[candidate_output].type:
continue
inplace_pattern = dict(baseline, **{candidate_output: candidate_input})
try:
if hasattr(op.scalar_op,"make_new_inplace"):
new_scal = op.scalar_op.make_new_inplace(
scalar.transfer_type(
*[inplace_pattern.get(i, None) \
for i in xrange(len(node.outputs))]))
else:
new_scal = op.scalar_op.__class__(
scalar.transfer_type(
*[inplace_pattern.get(i, None) \
for i in xrange(len(node.outputs))]))
new = OP(new_scal,inplace_pattern).make_node(*node.inputs)
for r,new_r in zip(node.outputs,new.outputs):
env.replace(r,new_r,
reason="insert_inplace_optimizer")
nb_change_no_validate +=1
if nb_change_no_validate >= validate_each_change:
env.validate()
chk = env.checkpoint()
nb_change_no_validate = 0
except (ValueError, TypeError, InconsistencyError), e:
if validate_each_change!=1 and not raised_warning:
print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error:"
print >> sys.stderr, e
raised_warning = True
env.revert(chk)
continue
candidate_inputs.remove(candidate_input)
node = new
baseline = inplace_pattern
break
if nb_change_no_validate>0:
try:
env.validate()
except Exception, e:
if not raised_warning:
print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error"
env.revert(chk)
return insert_inplace_optimizer
insert_inplace_optimizer = insert_inplace_optimizer_op(T.Elemwise)
compile.optdb.register('inplace_opt', insert_inplace_optimizer, 75, 'fast_run', 'inplace')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论