提交 6d23147f authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #4496 from nouiz/gpu_opt

Gpu opt
...@@ -225,12 +225,16 @@ class SeqOptimizer(Optimizer, list): ...@@ -225,12 +225,16 @@ class SeqOptimizer(Optimizer, list):
callback_before = fgraph.execute_callbacks_time callback_before = fgraph.execute_callbacks_time
nb_node_before = len(fgraph.apply_nodes) nb_node_before = len(fgraph.apply_nodes)
sub_profs = [] sub_profs = []
nb_nodes = []
for optimizer in self: for optimizer in self:
try: try:
nb_nodes_before = len(fgraph.apply_nodes)
t0 = time.time() t0 = time.time()
sub_prof = optimizer.optimize(fgraph) sub_prof = optimizer.optimize(fgraph)
l.append(float(time.time() - t0)) l.append(float(time.time() - t0))
sub_profs.append(sub_prof) sub_profs.append(sub_prof)
nb_nodes.append((nb_nodes_before,
len(fgraph.apply_nodes)))
if fgraph.profile: if fgraph.profile:
sub_validate_time.append(fgraph.profile.validate_time) sub_validate_time.append(fgraph.profile.validate_time)
except AssertionError: except AssertionError:
...@@ -249,7 +253,8 @@ class SeqOptimizer(Optimizer, list): ...@@ -249,7 +253,8 @@ class SeqOptimizer(Optimizer, list):
validate_time = None validate_time = None
callback_time = fgraph.execute_callbacks_time - callback_before callback_time = fgraph.execute_callbacks_time - callback_before
return (self, l, validate_time, callback_time, nb_node_before, return (self, l, validate_time, callback_time, nb_node_before,
len(fgraph.apply_nodes), sub_profs, sub_validate_time) len(fgraph.apply_nodes), sub_profs, sub_validate_time,
nb_nodes)
def __str__(self): def __str__(self):
return "SeqOpt(%s)" % list.__str__(self) return "SeqOpt(%s)" % list.__str__(self)
...@@ -270,7 +275,7 @@ class SeqOptimizer(Optimizer, list): ...@@ -270,7 +275,7 @@ class SeqOptimizer(Optimizer, list):
@staticmethod @staticmethod
def print_profile(stream, prof, level=0): def print_profile(stream, prof, level=0):
(opts, prof, validate_time, callback_time, nb_node_before, (opts, prof, validate_time, callback_time, nb_node_before,
nb_node_after, sub_profs, sub_validate_time) = prof nb_node_after, sub_profs, sub_validate_time, nb_nodes) = prof
blanc = (' ' * level) blanc = (' ' * level)
print(blanc, "SeqOptimizer", end=' ', file=stream) print(blanc, "SeqOptimizer", end=' ', file=stream)
...@@ -284,18 +289,19 @@ class SeqOptimizer(Optimizer, list): ...@@ -284,18 +289,19 @@ class SeqOptimizer(Optimizer, list):
print(blanc, " %.3fs for callback" % (callback_time), file=stream) print(blanc, " %.3fs for callback" % (callback_time), file=stream)
print(blanc, " %.3fs for fgraph.validate()" % (validate_time), file=stream) print(blanc, " %.3fs for fgraph.validate()" % (validate_time), file=stream)
if level == 0: if level == 0:
print(blanc, " time - (name, class, index) - validate time", file=stream) print(blanc, " time - (name, class, index, nodes before, nodes after) - validate time", file=stream)
ll = [] ll = []
for opt in opts: for opt in opts:
if hasattr(opt, "__name__"): if hasattr(opt, "__name__"):
ll.append((opt.__name__, opt.__class__.__name__, name = opt.__name__
opts.index(opt)))
else: else:
ll.append((opt.name, opt.__class__.__name__, name = opt.name
opts.index(opt))) idx = opts.index(opt)
lll = sorted(zip(prof, ll), key=lambda a: a[0]) ll.append((name, opt.__class__.__name__,
idx) + nb_nodes[idx])
lll = sorted(zip(prof, ll, nb_nodes), key=lambda a: a[0])
for (t, opt) in lll[::-1]: for (t, opt, nb_n) in lll[::-1]:
# if t < 1: # if t < 1:
# continue # continue
if sub_validate_time: if sub_validate_time:
......
...@@ -245,7 +245,8 @@ def local_cut_gpu_transfers(node): ...@@ -245,7 +245,8 @@ def local_cut_gpu_transfers(node):
# host -> # host ->
if isinstance(n2.op, GpuFromHost): if isinstance(n2.op, GpuFromHost):
return [GpuFromHost(node.op.context_name)(n2.inputs[0])] return [as_gpuarray_variable(n2.inputs[0],
node.op.context_name)]
# gpuc -> # gpuc ->
if isinstance(n2.op, GpuToGpu): if isinstance(n2.op, GpuToGpu):
...@@ -464,7 +465,8 @@ def local_gpua_dimshuffle(node, context_name): ...@@ -464,7 +465,8 @@ def local_gpua_dimshuffle(node, context_name):
def local_gpua_specifyShape(node, context_name): def local_gpua_specifyShape(node, context_name):
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(node.inputs[0].type, GpuArrayType):
return return
inp = [GpuFromHost(context_name)(node.inputs[0])] + node.inputs[1:] inp = [as_gpuarray_variable(node.inputs[0], context_name)]
inp += node.inputs[1:]
return tensor.specify_shape(*inp) return tensor.specify_shape(*inp)
...@@ -475,7 +477,7 @@ def local_gpua_shape(node, context_name): ...@@ -475,7 +477,7 @@ def local_gpua_shape(node, context_name):
# always on the CPU. # always on the CPU.
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(node.inputs[0].type, GpuArrayType):
return return
return [GpuFromHost(context_name)(node.inputs[0]).shape] return [as_gpuarray_variable(node.inputs[0], context_name).shape]
def gpu_print_wrapper(op, cnda): def gpu_print_wrapper(op, cnda):
...@@ -530,7 +532,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -530,7 +532,7 @@ def local_gpu_pdbbreakpoint_op(node):
elif output_goes_to_gpu: elif output_goes_to_gpu:
# The input should be transferred to the gpu # The input should be transferred to the gpu
new_inputs.append(GpuFromHost(context_name)(inp)) new_inputs.append(as_gpuarray_variable(inp, context_name))
input_transfered.append(True) input_transfered.append(True)
else: else:
...@@ -690,7 +692,8 @@ def local_gpua_careduce(node, context_name): ...@@ -690,7 +692,8 @@ def local_gpua_careduce(node, context_name):
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if (op is GpuCAReduceCPY or if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([GpuFromHost(context_name)(x)])): gvar.owner.op.supports_c_code([
as_gpuarray_variable(x, context_name)])):
return greduce return greduce
else: else:
# Try to make a simpler pattern based on reshaping # Try to make a simpler pattern based on reshaping
...@@ -730,7 +733,7 @@ def local_gpua_careduce(node, context_name): ...@@ -730,7 +733,7 @@ def local_gpua_careduce(node, context_name):
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(node.op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp)) reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = GpuFromHost(context_name)(reshaped_x) gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
gvar = greduce(gpu_reshaped_x) gvar = greduce(gpu_reshaped_x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
......
...@@ -299,7 +299,7 @@ def local_gpu_elemwise_0(node): ...@@ -299,7 +299,7 @@ def local_gpu_elemwise_0(node):
if all([i.type.dtype == 'float32' for i in node.inputs]): if all([i.type.dtype == 'float32' for i in node.inputs]):
# TODO: change this when fusion makes Elemwise with # TODO: change this when fusion makes Elemwise with
# multiple outputs # multiple outputs
gpu_elemwise = new_op(*(gpu_from_host(i) gpu_elemwise = new_op(*(as_cuda_ndarray_variable(i)
for i in node.inputs), for i in node.inputs),
return_list=True) return_list=True)
# case 2 - it is still ok if some inputs were upcast to float32 # case 2 - it is still ok if some inputs were upcast to float32
...@@ -312,7 +312,7 @@ def local_gpu_elemwise_0(node): ...@@ -312,7 +312,7 @@ def local_gpu_elemwise_0(node):
if [o.type for o in upcasted.outputs] ==\ if [o.type for o in upcasted.outputs] ==\
[o.type for o in node.outputs]: [o.type for o in node.outputs]:
new_inputs = [gpu_from_host(tensor.cast(i, 'float32')) new_inputs = [as_cuda_ndarray_variable(tensor.cast(i, 'float32'))
for i in node.inputs] for i in node.inputs]
gpu_elemwise = new_op(*new_inputs, return_list=True) gpu_elemwise = new_op(*new_inputs, return_list=True)
else: else:
...@@ -1314,7 +1314,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -1314,7 +1314,7 @@ def local_gpu_pdbbreakpoint_op(node):
elif output_goes_to_gpu: elif output_goes_to_gpu:
# The input should be transferred to the gpu # The input should be transferred to the gpu
new_inputs.append(gpu_from_host(inp)) new_inputs.append(as_cuda_ndarray_variable(inp))
input_transfered.append(True) input_transfered.append(True)
else: else:
...@@ -1537,7 +1537,7 @@ def local_gpu_conv(node): ...@@ -1537,7 +1537,7 @@ def local_gpu_conv(node):
img.shape[0], *op.imshp_logical) img.shape[0], *op.imshp_logical)
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride], img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
img) img)
img = gpu_from_host(img) img = as_cuda_ndarray_variable(img)
return ret(img, kern) return ret(img, kern)
return make_graph return make_graph
...@@ -1551,8 +1551,8 @@ def local_gpu_conv(node): ...@@ -1551,8 +1551,8 @@ def local_gpu_conv(node):
if gpu_conv is None: if gpu_conv is None:
return return
img, kern = host_input.owner.inputs img, kern = host_input.owner.inputs
out = gpu_conv(gpu_from_host(img), out = gpu_conv(as_cuda_ndarray_variable(img),
gpu_from_host(kern)) as_cuda_ndarray_variable(kern))
out = tensor.patternbroadcast(out, out = tensor.patternbroadcast(out,
node.outputs[0].broadcastable) node.outputs[0].broadcastable)
out.tag.values_eq_approx = values_eq_approx_high_tol out.tag.values_eq_approx = values_eq_approx_high_tol
...@@ -1569,8 +1569,8 @@ def local_gpu_conv(node): ...@@ -1569,8 +1569,8 @@ def local_gpu_conv(node):
gpu_conv = GpuConvOp_from_ConvOp(node.op) gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None: if gpu_conv is None:
return return
out = gpu_conv(gpu_from_host(img), out = gpu_conv(as_cuda_ndarray_variable(img),
gpu_from_host(kern)) as_cuda_ndarray_variable(kern))
out = tensor.patternbroadcast( out = tensor.patternbroadcast(
host_from_gpu(out), host_from_gpu(out),
node.outputs[0].broadcastable) node.outputs[0].broadcastable)
......
...@@ -155,13 +155,16 @@ def broadcast_like(value, template, fgraph, dtype=None): ...@@ -155,13 +155,16 @@ def broadcast_like(value, template, fgraph, dtype=None):
if template not in fgraph.variables: if template not in fgraph.variables:
raise NotImplementedError('broadcast_like currently requires the ' raise NotImplementedError('broadcast_like currently requires the '
'template Variable to be in the fgraph already') 'template Variable to be in the fgraph already')
if dtype is None:
dtype = template.dtype
value = T.cast(value, dtype)
if value.type == template.type:
return value
if hasattr(fgraph, 'shape_feature'): if hasattr(fgraph, 'shape_feature'):
new_shape = fgraph.shape_feature.shape_of[template] new_shape = fgraph.shape_feature.shape_of[template]
else: else:
new_shape = template.shape new_shape = template.shape
if dtype is None: rval = T.alloc(value, *new_shape)
dtype = template.dtype
rval = T.alloc(T.cast(value, dtype), *new_shape)
# the template may have 1s in its shape without being broadcastable # the template may have 1s in its shape without being broadcastable
if rval.broadcastable != template.broadcastable: if rval.broadcastable != template.broadcastable:
rval = T.unbroadcast(rval, *[i for i in xrange(rval.ndim) rval = T.unbroadcast(rval, *[i for i in xrange(rval.ndim)
...@@ -234,6 +237,11 @@ def inplace_elemwise_optimizer_op(OP): ...@@ -234,6 +237,11 @@ def inplace_elemwise_optimizer_op(OP):
else: else:
update_outs = [] update_outs = []
protected_inputs = [
f.protected for f in fgraph._features if
isinstance(f, theano.compile.function_module.Supervisor)]
protected_inputs = sum(protected_inputs, []) # flatten the list
protected_inputs.extend(fgraph.outputs)
for node in list(graph.io_toposort(fgraph.inputs, fgraph.outputs)): for node in list(graph.io_toposort(fgraph.inputs, fgraph.outputs)):
op = node.op op = node.op
# gpuarray GpuElemwise inherit from Elemwise # gpuarray GpuElemwise inherit from Elemwise
...@@ -242,25 +250,39 @@ def inplace_elemwise_optimizer_op(OP): ...@@ -242,25 +250,39 @@ def inplace_elemwise_optimizer_op(OP):
# If big graph and the outputs are scalar, do not make it # If big graph and the outputs are scalar, do not make it
# inplace. # inplace.
if (check_each_change != 1 and if (check_each_change != 1 and
all([getattr(o.type, 'ndim', -1) == 0 # If multiple outputs, they must all have the same size,
for o in node.outputs])): # so only check the first.
getattr(node.outputs[0].type, 'ndim', -1) == 0):
continue continue
if op.inplace_pattern:
# Maybe this isn't needed anymore, but I don't want to
# risk a regression now. This case only happens if the
# original node already has some inplace pattern and we
# still try to add more pattern.
baseline = op.inplace_pattern baseline = op.inplace_pattern
protected_inputs = [
f.protected for f in node.fgraph._features if
isinstance(f, theano.compile.function_module.Supervisor)]
protected_inputs = sum(protected_inputs, []) # flatten the list
protected_inputs.extend(fgraph.outputs)
candidate_outputs = [i for i in xrange(len(node.outputs)) candidate_outputs = [i for i in xrange(len(node.outputs))
if i not in baseline] if i not in baseline]
# node inputs that are Constant, already destroyed, # node inputs that are Constant, already destroyed,
# fgraph protected inputs and fgraph outputs can't be used as inplace # or fgraph protected inputs and fgraph outputs can't be used as
# target. # inplace target.
# Remove here as faster. # Remove here as faster.
candidate_inputs = [i for i in xrange(len(node.inputs)) candidate_inputs = [i for i in xrange(len(node.inputs))
if i not in baseline.values() and if i not in baseline.values() and
not isinstance(node.inputs[i], Constant) and not isinstance(node.inputs[i], Constant) and
# Is next line costly?
not fgraph.destroyers(node.inputs[i]) and
node.inputs[i] not in protected_inputs]
else:
baseline = []
candidate_outputs = list(range(len(node.outputs)))
# node inputs that are Constant, already destroyed,
# fgraph protected inputs and fgraph outputs can't be used as inplace
# target.
# Remove here as faster.
candidate_inputs = [i for i in xrange(len(node.inputs))
if not isinstance(node.inputs[i], Constant) and
not fgraph.destroyers(node.inputs[i]) and not fgraph.destroyers(node.inputs[i]) and
node.inputs[i] not in protected_inputs] node.inputs[i] not in protected_inputs]
...@@ -2706,6 +2728,7 @@ def merge_two_slices(slice1, len1, slice2, len2): ...@@ -2706,6 +2728,7 @@ def merge_two_slices(slice1, len1, slice2, len2):
val = T.switch(T.lt(sl2, 0), - len1 - 1, val) val = T.switch(T.lt(sl2, 0), - len1 - 1, val)
if sl1.step: if sl1.step:
val = T.switch(T.eq(sl1.step, 0), len1 + 1, val) val = T.switch(T.eq(sl1.step, 0), len1 + 1, val)
val = pre_greedy_local_optimizer(list_opt, val)
return val return val
else: else:
# We are in the more complex case when we do not actually know # We are in the more complex case when we do not actually know
...@@ -2730,6 +2753,7 @@ def merge_two_slices(slice1, len1, slice2, len2): ...@@ -2730,6 +2753,7 @@ def merge_two_slices(slice1, len1, slice2, len2):
val = T.switch(T.lt(sl2, 0), - len1 - 1, val) val = T.switch(T.lt(sl2, 0), - len1 - 1, val)
if sl1.step: if sl1.step:
val = T.switch(T.eq(sl1.step, 0), len1 + 1, val) val = T.switch(T.eq(sl1.step, 0), len1 + 1, val)
val = pre_greedy_local_optimizer(list_opt, val)
return val return val
else: else:
# We are dealing with two slices that need to be put together # We are dealing with two slices that need to be put together
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论