提交 6d23147f 作者: Pascal Lamblin

Merge pull request #4496 from nouiz/gpu_opt

Gpu opt
......@@ -225,12 +225,16 @@ class SeqOptimizer(Optimizer, list):
callback_before = fgraph.execute_callbacks_time
nb_node_before = len(fgraph.apply_nodes)
sub_profs = []
nb_nodes = []
for optimizer in self:
try:
nb_nodes_before = len(fgraph.apply_nodes)
t0 = time.time()
sub_prof = optimizer.optimize(fgraph)
l.append(float(time.time() - t0))
sub_profs.append(sub_prof)
nb_nodes.append((nb_nodes_before,
len(fgraph.apply_nodes)))
if fgraph.profile:
sub_validate_time.append(fgraph.profile.validate_time)
except AssertionError:
......@@ -249,7 +253,8 @@ class SeqOptimizer(Optimizer, list):
validate_time = None
callback_time = fgraph.execute_callbacks_time - callback_before
return (self, l, validate_time, callback_time, nb_node_before,
len(fgraph.apply_nodes), sub_profs, sub_validate_time)
len(fgraph.apply_nodes), sub_profs, sub_validate_time,
nb_nodes)
def __str__(self):
return "SeqOpt(%s)" % list.__str__(self)
......@@ -270,7 +275,7 @@ class SeqOptimizer(Optimizer, list):
@staticmethod
def print_profile(stream, prof, level=0):
(opts, prof, validate_time, callback_time, nb_node_before,
nb_node_after, sub_profs, sub_validate_time) = prof
nb_node_after, sub_profs, sub_validate_time, nb_nodes) = prof
blanc = (' ' * level)
print(blanc, "SeqOptimizer", end=' ', file=stream)
......@@ -284,18 +289,19 @@ class SeqOptimizer(Optimizer, list):
print(blanc, " %.3fs for callback" % (callback_time), file=stream)
print(blanc, " %.3fs for fgraph.validate()" % (validate_time), file=stream)
if level == 0:
print(blanc, " time - (name, class, index) - validate time", file=stream)
print(blanc, " time - (name, class, index, nodes before, nodes after) - validate time", file=stream)
ll = []
for opt in opts:
if hasattr(opt, "__name__"):
ll.append((opt.__name__, opt.__class__.__name__,
opts.index(opt)))
name = opt.__name__
else:
ll.append((opt.name, opt.__class__.__name__,
opts.index(opt)))
lll = sorted(zip(prof, ll), key=lambda a: a[0])
name = opt.name
idx = opts.index(opt)
ll.append((name, opt.__class__.__name__,
idx) + nb_nodes[idx])
lll = sorted(zip(prof, ll, nb_nodes), key=lambda a: a[0])
for (t, opt) in lll[::-1]:
for (t, opt, nb_n) in lll[::-1]:
# if t < 1:
# continue
if sub_validate_time:
......
......@@ -245,7 +245,8 @@ def local_cut_gpu_transfers(node):
# host ->
if isinstance(n2.op, GpuFromHost):
return [GpuFromHost(node.op.context_name)(n2.inputs[0])]
return [as_gpuarray_variable(n2.inputs[0],
node.op.context_name)]
# gpuc ->
if isinstance(n2.op, GpuToGpu):
......@@ -464,7 +465,8 @@ def local_gpua_dimshuffle(node, context_name):
def local_gpua_specifyShape(node, context_name):
if isinstance(node.inputs[0].type, GpuArrayType):
return
inp = [GpuFromHost(context_name)(node.inputs[0])] + node.inputs[1:]
inp = [as_gpuarray_variable(node.inputs[0], context_name)]
inp += node.inputs[1:]
return tensor.specify_shape(*inp)
......@@ -475,7 +477,7 @@ def local_gpua_shape(node, context_name):
# always on the CPU.
if isinstance(node.inputs[0].type, GpuArrayType):
return
return [GpuFromHost(context_name)(node.inputs[0]).shape]
return [as_gpuarray_variable(node.inputs[0], context_name).shape]
def gpu_print_wrapper(op, cnda):
......@@ -530,7 +532,7 @@ def local_gpu_pdbbreakpoint_op(node):
elif output_goes_to_gpu:
# The input should be transferred to the GPU
new_inputs.append(GpuFromHost(context_name)(inp))
new_inputs.append(as_gpuarray_variable(inp, context_name))
input_transfered.append(True)
else:
......@@ -690,7 +692,8 @@ def local_gpua_careduce(node, context_name):
# We need to have the make node called, otherwise the mask can
# be None
if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([GpuFromHost(context_name)(x)])):
gvar.owner.op.supports_c_code([
as_gpuarray_variable(x, context_name)])):
return greduce
else:
# Try to make a simpler pattern based on reshaping
......@@ -730,7 +733,7 @@ def local_gpua_careduce(node, context_name):
acc_dtype=getattr(node.op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = GpuFromHost(context_name)(reshaped_x)
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
gvar = greduce(gpu_reshaped_x)
# We need to have the make node called, otherwise the mask can
# be None
......
......@@ -299,7 +299,7 @@ def local_gpu_elemwise_0(node):
if all([i.type.dtype == 'float32' for i in node.inputs]):
# TODO: change this when fusion makes Elemwise with
# multiple outputs
gpu_elemwise = new_op(*(gpu_from_host(i)
gpu_elemwise = new_op(*(as_cuda_ndarray_variable(i)
for i in node.inputs),
return_list=True)
# case 2 - it is still ok if some inputs were upcast to float32
......@@ -312,7 +312,7 @@ def local_gpu_elemwise_0(node):
if [o.type for o in upcasted.outputs] ==\
[o.type for o in node.outputs]:
new_inputs = [gpu_from_host(tensor.cast(i, 'float32'))
new_inputs = [as_cuda_ndarray_variable(tensor.cast(i, 'float32'))
for i in node.inputs]
gpu_elemwise = new_op(*new_inputs, return_list=True)
else:
......@@ -1314,7 +1314,7 @@ def local_gpu_pdbbreakpoint_op(node):
elif output_goes_to_gpu:
# The input should be transferred to the GPU
new_inputs.append(gpu_from_host(inp))
new_inputs.append(as_cuda_ndarray_variable(inp))
input_transfered.append(True)
else:
......@@ -1537,7 +1537,7 @@ def local_gpu_conv(node):
img.shape[0], *op.imshp_logical)
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
img)
img = gpu_from_host(img)
img = as_cuda_ndarray_variable(img)
return ret(img, kern)
return make_graph
......@@ -1551,8 +1551,8 @@ def local_gpu_conv(node):
if gpu_conv is None:
return
img, kern = host_input.owner.inputs
out = gpu_conv(gpu_from_host(img),
gpu_from_host(kern))
out = gpu_conv(as_cuda_ndarray_variable(img),
as_cuda_ndarray_variable(kern))
out = tensor.patternbroadcast(out,
node.outputs[0].broadcastable)
out.tag.values_eq_approx = values_eq_approx_high_tol
......@@ -1569,8 +1569,8 @@ def local_gpu_conv(node):
gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None:
return
out = gpu_conv(gpu_from_host(img),
gpu_from_host(kern))
out = gpu_conv(as_cuda_ndarray_variable(img),
as_cuda_ndarray_variable(kern))
out = tensor.patternbroadcast(
host_from_gpu(out),
node.outputs[0].broadcastable)
......
......@@ -155,13 +155,16 @@ def broadcast_like(value, template, fgraph, dtype=None):
if template not in fgraph.variables:
raise NotImplementedError('broadcast_like currently requires the '
'template Variable to be in the fgraph already')
if dtype is None:
dtype = template.dtype
value = T.cast(value, dtype)
if value.type == template.type:
return value
if hasattr(fgraph, 'shape_feature'):
new_shape = fgraph.shape_feature.shape_of[template]
else:
new_shape = template.shape
if dtype is None:
dtype = template.dtype
rval = T.alloc(T.cast(value, dtype), *new_shape)
rval = T.alloc(value, *new_shape)
# the template may have 1s in its shape without being broadcastable
if rval.broadcastable != template.broadcastable:
rval = T.unbroadcast(rval, *[i for i in xrange(rval.ndim)
......@@ -234,6 +237,11 @@ def inplace_elemwise_optimizer_op(OP):
else:
update_outs = []
protected_inputs = [
f.protected for f in fgraph._features if
isinstance(f, theano.compile.function_module.Supervisor)]
protected_inputs = sum(protected_inputs, []) # flatten the list
protected_inputs.extend(fgraph.outputs)
for node in list(graph.io_toposort(fgraph.inputs, fgraph.outputs)):
op = node.op
# gpuarray GpuElemwise inherits from Elemwise
......@@ -242,27 +250,41 @@ def inplace_elemwise_optimizer_op(OP):
# If big graph and the outputs are scalar, do not make it
# inplace.
if (check_each_change != 1 and
all([getattr(o.type, 'ndim', -1) == 0
for o in node.outputs])):
# If multiple outputs, they must all have the same size,
# so only check the first.
getattr(node.outputs[0].type, 'ndim', -1) == 0):
continue
baseline = op.inplace_pattern
protected_inputs = [
f.protected for f in node.fgraph._features if
isinstance(f, theano.compile.function_module.Supervisor)]
protected_inputs = sum(protected_inputs, []) # flatten the list
protected_inputs.extend(fgraph.outputs)
candidate_outputs = [i for i in xrange(len(node.outputs))
if i not in baseline]
# node inputs that are Constant, already destroyed,
# fgraph protected inputs and fgraph outputs can't be used as inplace
# target.
# Remove here as faster.
candidate_inputs = [i for i in xrange(len(node.inputs))
if i not in baseline.values() and
not isinstance(node.inputs[i], Constant) and
not fgraph.destroyers(node.inputs[i]) and
node.inputs[i] not in protected_inputs]
if op.inplace_pattern:
# Maybe this isn't needed anymore, but I don't want to
# risk a regression now. This case only happens if the
# original node already has some inplace pattern and we
# still try to add more patterns.
baseline = op.inplace_pattern
candidate_outputs = [i for i in xrange(len(node.outputs))
if i not in baseline]
# node inputs that are Constant, already destroyed,
# or fgraph protected inputs and fgraph outputs can't be used as
# inplace target.
# Remove here as faster.
candidate_inputs = [i for i in xrange(len(node.inputs))
if i not in baseline.values() and
not isinstance(node.inputs[i], Constant) and
# Is next line costly?
not fgraph.destroyers(node.inputs[i]) and
node.inputs[i] not in protected_inputs]
else:
baseline = []
candidate_outputs = list(range(len(node.outputs)))
# node inputs that are Constant, already destroyed,
# fgraph protected inputs and fgraph outputs can't be used as inplace
# target.
# Remove here as faster.
candidate_inputs = [i for i in xrange(len(node.inputs))
if not isinstance(node.inputs[i], Constant) and
not fgraph.destroyers(node.inputs[i]) and
node.inputs[i] not in protected_inputs]
verbose = False
......@@ -2706,6 +2728,7 @@ def merge_two_slices(slice1, len1, slice2, len2):
val = T.switch(T.lt(sl2, 0), - len1 - 1, val)
if sl1.step:
val = T.switch(T.eq(sl1.step, 0), len1 + 1, val)
val = pre_greedy_local_optimizer(list_opt, val)
return val
else:
# We are in the more complex case when we do not actually know
......@@ -2730,6 +2753,7 @@ def merge_two_slices(slice1, len1, slice2, len2):
val = T.switch(T.lt(sl2, 0), - len1 - 1, val)
if sl1.step:
val = T.switch(T.eq(sl1.step, 0), len1 + 1, val)
val = pre_greedy_local_optimizer(list_opt, val)
return val
else:
# We are dealing with two slices that need to be put together
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论