Commit f0bd940e authored by Pascal Lamblin

Merge pull request #3477 from nouiz/crash_gpu

Crash gpu and opt speed up
......@@ -212,11 +212,11 @@ optimization you wrote. For example, consider the following:
Nothing happened here. The reason is: ``add(y, z) != add(y,
z)``. That is the case for efficiency reasons. To fix this problem we
first need to merge the parts of the graph that represent the same
computation, using the ``merge_optimizer`` defined in
computation, using the ``MergeOptimizer`` defined in
``theano.gof.opt``.
>>> from theano.gof.opt import merge_optimizer
>>> merge_optimizer.optimize(e) # doctest: +ELLIPSIS
>>> from theano.gof.opt import MergeOptimizer
>>> MergeOptimizer().optimize(e) # doctest: +ELLIPSIS
(0, ..., None, None, {}, 1, 0)
>>> e
[true_div(mul(*1 -> add(y, z), x), *1)]
......
......@@ -198,8 +198,17 @@ optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
# rearranges elemwise expressions
optdb.register('canonicalize', gof.EquilibriumDB(),
optdb.register('canonicalize', gof.EquilibriumDB(ignore_newtrees=False),
1, 'fast_run', 'fast_compile')
# Register the merge optimizer in the canonicalize Equilibrium as a
# clean-up opt. Without this, as the equilibrium has
# ignore_newtrees=False, we won't merge all nodes if it is set as a
# global optimizer with final_opt=True.
# We need a new instance of MergeOptimizer so that its name is not
# changed by other usages of it.
optdb['canonicalize'].register("merge", gof.opt.MergeOptimizer(), 'fast_run',
"fast_compile", cleanup=True)
optdb.register('merge1.2', gof.MergeOptimizer(),
1.2, 'fast_run', 'fast_compile', 'merge')
......
......@@ -547,6 +547,7 @@ class CLinker(link.Linker):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(self.schedule).accept(fgraph, no_recycling)
self.fgraph = fgraph
self.fetch_variables()
......@@ -1750,14 +1751,13 @@ class OpWiseCLinker(link.LocalLinker):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(
fallback_on_perform=self.fallback_on_perform,
allow_gc=self.allow_gc,
nice_errors=self.nice_errors,
schedule=self.schedule,
).accept(fgraph, no_recycling)
# raise Exception("Cannot accept from a Linker that is
# already tied to another FunctionGraph.")
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......
......@@ -873,8 +873,23 @@ class MergeOptimizer(Optimizer):
if i[1] > 0:
print(i)
merge_optimizer = MergeOptimizer()
@staticmethod
def merge_profile(prof1, prof2):
def merge_none_number(v1, v2):
if v1 is None:
return v2
if v2 is None:
return v1
return v1 + v2
nb_fail = prof1[0] + prof2[0]
replace_time = prof1[1] + prof2[1]
validate_time = merge_none_number(prof1[2], prof2[2])
callback_time = merge_none_number(prof1[3], prof2[3])
callbacks_time = merge_dict(prof1[4], prof2[4])
nb_merged = prof1[5] + prof2[5]
nb_constant = prof1[6] + prof2[6]
return (nb_fail, replace_time, validate_time,
callback_time, callbacks_time, nb_merged, nb_constant)
def is_same_graph_with_merge(var1, var2, givens=None):
......@@ -899,7 +914,7 @@ def is_same_graph_with_merge(var1, var2, givens=None):
for to_replace, replace_by in iteritems(givens):
fgraph.replace(to_replace, replace_by)
# Perform merge optimization.
merge_optimizer.optimize(fgraph)
MergeOptimizer().optimize(fgraph)
# When two variables perform the same computations, they will have the same
# owner in the optimized graph.
# We need to be careful with the special case where the owner is None,
......@@ -1165,7 +1180,7 @@ class FromFunctionLocalOptimizer(LocalOptimizer):
id(self)), file=stream)
def local_optimizer(tracks, inplace=False):
def local_optimizer(tracks, inplace=False, requirements=()):
def decorator(f):
"""
WRITEME
......@@ -1177,12 +1192,13 @@ def local_optimizer(tracks, inplace=False):
for t in tracks:
if not (isinstance(t, op.Op) or issubclass(t, op.PureOp)):
raise ValueError("Tracks are op classes or instances", f.__module__, f.__name__)
requirements = ()
req = requirements
if inplace:
dh_handler = dh.DestroyHandler
requirements = (lambda fgraph:
fgraph.attach_feature(dh_handler()),)
rval = FromFunctionLocalOptimizer(f, tracks, requirements)
req = tuple(requirements) + (
lambda fgraph:
fgraph.attach_feature(dh_handler()),)
rval = FromFunctionLocalOptimizer(f, tracks, req)
rval.__name__ = f.__name__
return rval
return decorator
......@@ -1974,19 +1990,41 @@ class ChangeTracker:
fgraph.change_tracker = self
def merge_dict(d1, d2):
    """Return a new dict combining `d1` and `d2`, adding the values
    of keys present in both.

    Neither input dict is modified (a shallow copy of `d1` is taken).
    Used to merge optimizer profiling counters/timings.
    """
    d = d1.copy()
    # Use the native dict iteration instead of six.iteritems: it is
    # equivalent here and removes the reliance on the compat helper.
    for k, v in d2.items():
        if k in d:
            d[k] += v
        else:
            d[k] = v
    return d
class EquilibriumOptimizer(NavigatorOptimizer):
"""
Apply optimizations until equilibrium point.
Parameters
----------
optimizers
List or set of local or global optimizations to apply until equilibrium.
max_use_ratio
optimizers : list or set
Local or global optimizations to apply until equilibrium.
The global optimizer will be run at the start of each iteration before
the local optimizer.
max_use_ratio : int or float
Each optimizer can be applied at most (size of graph * this number)
times.
ignore_newtrees
See EquilibriumDB ignore_newtrees parameter definition.
final_optimizers
Global optimizers that will be run after each iteration.
    cleanup_optimizers
        Global optimizers that apply a list of predetermined optimizations.
        They must not traverse the graph, as they are called very frequently.
        The MergeOptimizer is one example of an optimization that respects this.
        They are applied after all global optimizers, then after each applied
        local optimizer, then after all final optimizers.
"""
......@@ -1995,7 +2033,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
failure_callback=None,
ignore_newtrees=True,
max_use_ratio=None,
final_optimizers=None):
final_optimizers=None,
cleanup_optimizers=None):
super(EquilibriumOptimizer, self).__init__(
None,
ignore_newtrees=ignore_newtrees,
......@@ -2004,6 +2043,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.local_optimizers_all = []
self.global_optimizers = []
self.final_optimizers = []
self.cleanup_optimizers = []
for opt in optimizers:
if isinstance(opt, LocalOptimizer):
......@@ -2016,6 +2056,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.global_optimizers.append(opt)
if final_optimizers:
self.final_optimizers = final_optimizers
if cleanup_optimizers:
self.cleanup_optimizers = cleanup_optimizers
self.max_use_ratio = max_use_ratio
assert self.max_use_ratio is not None, (
'max_use_ratio has to be a number')
......@@ -2039,6 +2081,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
opt.add_requirements(fgraph)
for opt in self.final_optimizers:
opt.add_requirements(fgraph)
for opt in self.cleanup_optimizers:
opt.add_requirements(fgraph)
def apply(self, fgraph, start_from=None):
change_tracker = ChangeTracker()
......@@ -2066,17 +2110,39 @@ class EquilibriumOptimizer(NavigatorOptimizer):
node_created = {}
global_sub_profs = []
final_sub_profs = []
cleanup_sub_profs = []
for opt in (self.global_optimizers +
list(self.get_local_optimizers()) +
self.final_optimizers):
self.final_optimizers +
self.cleanup_optimizers):
global_process_count.setdefault(opt, 0)
time_opts.setdefault(opt, 0)
node_created.setdefault(opt, 0)
def apply_cleanup(profs_dict):
changed = False
for copt in self.cleanup_optimizers:
change_tracker.reset()
nb = change_tracker.nb_imported
t_opt = time.time()
sub_prof = copt.apply(fgraph)
time_opts[copt] += time.time() - t_opt
profs_dict[copt].append(sub_prof)
if change_tracker.changed:
process_count.setdefault(copt, 0)
process_count[copt] += 1
global_process_count[copt] += 1
changed = True
node_created[copt] += change_tracker.nb_imported - nb
return changed
while changed and not max_use_abort:
process_count = {}
t0 = time.time()
changed = False
iter_cleanup_sub_profs = {}
for copt in self.cleanup_optimizers:
iter_cleanup_sub_profs[copt] = []
# apply global optimizers
sub_profs = []
......@@ -2101,6 +2167,10 @@ class EquilibriumOptimizer(NavigatorOptimizer):
global_opt_timing.append(float(time.time() - t0))
            # Apply clean-up optimizers, as the global optimizers may have
            # made changes that require it.
changed |= apply_cleanup(iter_cleanup_sub_profs)
# apply local optimizer
topo_t0 = time.time()
q = deque(graph.io_toposort(fgraph.inputs, start_from))
......@@ -2134,19 +2204,21 @@ class EquilibriumOptimizer(NavigatorOptimizer):
t_opt = time.time()
lopt_change = self.process_node(fgraph, node, lopt)
time_opts[lopt] += time.time() - t_opt
if lopt_change:
process_count.setdefault(lopt, 0)
process_count[lopt] += 1
global_process_count[lopt] += 1
changed = True
node_created[lopt] += change_tracker.nb_imported - nb
if global_process_count[lopt] > max_use:
max_use_abort = True
opt_name = (getattr(lopt, "name", None) or
getattr(lopt, "__name__", ""))
if node not in fgraph.apply_nodes:
# go to next node
break
if not lopt_change:
continue
process_count.setdefault(lopt, 0)
process_count[lopt] += 1
global_process_count[lopt] += 1
changed = True
node_created[lopt] += change_tracker.nb_imported - nb
changed |= apply_cleanup(iter_cleanup_sub_profs)
if global_process_count[lopt] > max_use:
max_use_abort = True
opt_name = (getattr(lopt, "name", None) or
getattr(lopt, "__name__", ""))
if node not in fgraph.apply_nodes:
# go to next node
break
finally:
self.detach_updater(fgraph, u)
......@@ -2173,6 +2245,17 @@ class EquilibriumOptimizer(NavigatorOptimizer):
final_sub_profs.append(sub_profs)
global_opt_timing[-1] += time.time() - t_before_final_opt
            # Apply clean-up optimizers, as the final optimizers may have
            # made changes that require it.
changed |= apply_cleanup(iter_cleanup_sub_profs)
# merge clean up profiles during that iteration.
c_sub_profs = []
for copt, sub_profs in iteritems(iter_cleanup_sub_profs):
sub_prof = sub_profs[0]
for s_p in sub_profs[1:]:
sub_prof = copt.merge_profile(sub_prof, s_p)
c_sub_profs.append(sub_prof)
cleanup_sub_profs.append(c_sub_profs)
loop_process_count.append(process_count)
loop_timing.append(float(time.time() - t0))
......@@ -2188,7 +2271,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
return (self, loop_timing, loop_process_count,
(start_nb_nodes, end_nb_nodes, max_nb_nodes),
global_opt_timing, nb_nodes, time_opts, io_toposort_timing,
node_created, global_sub_profs, final_sub_profs)
node_created, global_sub_profs, final_sub_profs, cleanup_sub_profs)
def print_summary(self, stream=sys.stdout, level=0, depth=-1):
name = getattr(self, 'name', None)
......@@ -2204,7 +2287,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
(opt, loop_timing, loop_process_count,
(start_nb_nodes, end_nb_nodes, max_nb_nodes),
global_opt_timing, nb_nodes, time_opts, io_toposort_timing,
node_created, global_sub_profs, final_sub_profs) = prof
node_created, global_sub_profs, final_sub_profs,
cleanup_sub_profs) = prof
blanc = (' ' * level)
print(blanc, "EquilibriumOptimizer", end=' ', file=stream)
......@@ -2222,6 +2306,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
print(blanc, " time in global optimizers %.3fs" % s, file=stream)
s = sum([time_opts[o] for o in opt.final_optimizers])
print(blanc, " time in final optimizers %.3fs" % s, file=stream)
s = sum([time_opts[o] for o in opt.cleanup_optimizers])
print(blanc, " time in cleanup optimizers %.3fs" % s, file=stream)
for i in range(len(loop_timing)):
lopt = ""
if loop_process_count[i]:
......@@ -2245,7 +2331,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
process_count = {}
for o in (opt.global_optimizers +
list(opt.get_local_optimizers()) +
list(opt.final_optimizers)):
list(opt.final_optimizers) +
list(opt.cleanup_optimizers)):
process_count.setdefault(o, 0)
for count in loop_process_count:
for o, v in iteritems(count):
......@@ -2275,12 +2362,13 @@ class EquilibriumOptimizer(NavigatorOptimizer):
print(blanc + " ", ' %.3fs - %s' % (t, o), file=stream)
print(file=stream)
gf_opts = [o for o in (opt.global_optimizers +
list(opt.final_optimizers))
list(opt.final_optimizers) +
list(opt.cleanup_optimizers))
if o.print_profile.func_code is not
Optimizer.print_profile.func_code]
if not gf_opts:
return
print(blanc, "Global and final optimizer", file=stream)
print(blanc, "Global, final and clean up optimizers", file=stream)
for i in range(len(loop_timing)):
print(blanc, "Iter %d" % i, file=stream)
for o, prof in zip(opt.global_optimizers, global_sub_profs[i]):
......@@ -2293,6 +2381,11 @@ class EquilibriumOptimizer(NavigatorOptimizer):
o.print_profile(stream, prof, level + 2)
except NotImplementedError:
print(blanc, "merge not implemented for ", o)
for o, prof in zip(opt.cleanup_optimizers, cleanup_sub_profs[i]):
try:
o.print_profile(stream, prof, level + 2)
except NotImplementedError:
print(blanc, "merge not implemented for ", o)
@staticmethod
def merge_profile(prof1, prof2):
......@@ -2307,10 +2400,16 @@ class EquilibriumOptimizer(NavigatorOptimizer):
prof2[0].final_optimizers)
else:
final_optimizers = None
if len(prof1[0].cleanup_optimizers) > 0 or len(prof2[0].cleanup_optimizers) > 0:
cleanup_optimizers = OrderedSet(prof1[0].cleanup_optimizers).union(
prof2[0].cleanup_optimizers)
else:
cleanup_optimizers = None
new_opt = EquilibriumOptimizer(
local_optimizers.union(global_optimizers),
max_use_ratio=1,
final_optimizers=final_optimizers)
final_optimizers=final_optimizers,
cleanup_optimizers=cleanup_optimizers)
def merge_list(l1, l2):
l = copy.copy(l1)
......@@ -2321,15 +2420,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
l.append(nb)
return l
def merge_dict(d1, d2):
d = d1.copy()
for k, v in iteritems(d2):
if k in d:
d[k] += v
else:
d[k] = v
return d
loop_timing = merge_list(prof1[1], prof2[1])
loop_process_count = list(prof1[2])
......@@ -2358,6 +2448,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
node_created = merge_dict(prof1[8], prof2[8])
global_sub_profs = merge_list(prof1[9], prof2[9])
final_sub_profs = merge_list(prof1[10], prof2[10])
cleanup_sub_profs = merge_list(prof1[10], prof2[10])
return (new_opt,
loop_timing,
loop_process_count,
......@@ -2368,7 +2459,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
io_toposort_timing,
node_created,
global_sub_profs,
final_sub_profs)
final_sub_profs,
cleanup_sub_profs)
#################
# Utilities #
......
......@@ -268,28 +268,35 @@ class EquilibriumDB(DB):
super(EquilibriumDB, self).__init__()
self.ignore_newtrees = ignore_newtrees
self.__final__ = {}
self.__cleanup__ = {}
def register(self, name, obj, *tags, **kwtags):
if 'final_opt' in kwtags:
final_opt = kwtags['final_opt']
kwtags.pop('final_opt', None)
else:
final_opt = False
final_opt = kwtags.pop('final_opt', False)
cleanup = kwtags.pop('cleanup', False)
        # An opt should not be both a final and a clean-up optimizer.
assert not (final_opt and cleanup)
super(EquilibriumDB, self).register(name, obj, *tags, **kwtags)
self.__final__[name] = final_opt
self.__cleanup__[name] = cleanup
def query(self, *tags, **kwtags):
_opts = super(EquilibriumDB, self).query(*tags, **kwtags)
final_opts = [o for o in _opts if self.__final__.get(o.name, False)]
opts = [o for o in _opts if o not in final_opts]
cleanup_opts = [o for o in _opts if self.__cleanup__.get(o.name,
False)]
opts = [o for o in _opts
if o not in final_opts and o not in cleanup_opts]
if len(final_opts) == 0:
final_opts = None
if len(cleanup_opts) == 0:
cleanup_opts = None
return opt.EquilibriumOptimizer(
opts,
max_use_ratio=config.optdb.max_use_ratio,
ignore_newtrees=self.ignore_newtrees,
failure_callback=opt.NavigatorOptimizer.warn_inplace,
final_optimizers=final_opts)
final_optimizers=final_opts,
cleanup_optimizers=cleanup_opts)
class SequenceDB(DB):
......
......@@ -3622,7 +3622,7 @@ class GpuAllocEmpty(GpuOp):
const_shp = tensor.get_scalar_constant_value(s)
except tensor.NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
output = otype()
return sh, output
......
......@@ -48,7 +48,7 @@ cudnnSetTensorNdDescriptor(
int nbDims,
const int dimA[],
const int strideA[]) {
if (ndDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
if (nbDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetTensor4dDescriptorEx(
tensorDesc, dataType,
dimA[0], dimA[1], dimA[2], dimA[3],
......@@ -204,7 +204,7 @@ cudnnSetPoolingNdDescriptor(
int nbDims,
const int windowDimA[],
const int paddingA[],
const in strideA[]) {
const int strideA[]) {
if (nbDims != 2) return CUDNN_STATUS_NOT_SUPPORTED;
if (paddingA[0] != 0 || paddingA[1] != 0) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetPoolingDescriptor(poolingDesc, mode,
......@@ -223,7 +223,7 @@ cudnnGetPoolingNdDescriptor(
int strideA[]) {
int win0, win1, str0, str1;
cudnnStatus_t err;
if (ndDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
if (nbDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
err = cudnnGetPoolingDescriptor(poolingDesc, mode, &win0, &win1,
&str0, &str1);
if (err != CUDNN_STATUS_SUCCESS) return err;
......
......@@ -1760,7 +1760,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
Subsampling stride (default: (1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad}
pad
(pad_h, pad_w) padding information.
(pad_h, pad_w) padding information.
pad_h is the number of zero-valued pixels added to each of the top and
bottom borders.
pad_w is the number of zero-valued pixels added to each of the left
......
......@@ -104,7 +104,7 @@ optdb.register('gpu_after_fusion',
'gpu')
# Register merge_optimizer as a global opt
gpu_optimizer.register('gpu_merge', theano.gof.opt.merge_optimizer,
gpu_optimizer.register('gpu_merge', theano.gof.opt.MergeOptimizer(),
'fast_run', 'fast_compile', final_opt=True)
......
......@@ -81,7 +81,7 @@ class CudaNdarrayType(Type):
raise TypeError('%s only supports dtype float32 for now. Tried '
'using dtype %s for variable %s' %
(self.__class__.__name__, dtype, name))
self.broadcastable = tuple(broadcastable)
self.broadcastable = tuple(bool(b) for b in broadcastable)
self.name = name
self.dtype_specs() # error checking is done there
......
......@@ -2673,7 +2673,7 @@ class Alloc(gof.Op):
const_shp = get_scalar_constant_value(s)
except NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
return sh, bcast
def make_node(self, value, *shape):
......@@ -6037,7 +6037,7 @@ class AllocEmpty(gof.Op):
const_shp = get_scalar_constant_value(s)
except NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
otype = TensorType(dtype=self.dtype, broadcastable=bcast)
output = otype()
return sh, output
......
......@@ -47,7 +47,6 @@ from theano.tensor.type import (values_eq_approx_remove_inf,
from theano.gof.opt import (Optimizer, pre_constant_merge,
pre_greedy_local_optimizer)
from theano.gof.opt import merge_optimizer
from theano.gof import toolbox
from theano.tensor.basic import get_scalar_constant_value, ShapeError, NotScalarConstantError
from six import StringIO
......@@ -452,8 +451,9 @@ def register_canonicalize(lopt, *tags, **kwargs):
return register_canonicalize(inner_lopt, lopt, *tags, **kwargs)
return register
else:
name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['canonicalize'].register(name, lopt, 'fast_run', *tags)
name = kwargs.pop('name', None) or lopt.__name__
compile.optdb['canonicalize'].register(name, lopt, 'fast_run',
*tags, **kwargs)
return lopt
......@@ -463,8 +463,9 @@ def register_stabilize(lopt, *tags, **kwargs):
return register_stabilize(inner_lopt, lopt, *tags, **kwargs)
return register
else:
name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
name = kwargs.pop('name', None) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run',
*tags, **kwargs)
return lopt
......@@ -474,9 +475,9 @@ def register_specialize(lopt, *tags, **kwargs):
return register_specialize(inner_lopt, lopt, *tags, **kwargs)
return register
else:
name = (kwargs and kwargs.pop('name')) or lopt.__name__
name = kwargs.pop('name', None) or lopt.__name__
compile.optdb['specialize'].register(name, lopt, 'fast_run',
*tags)
*tags, **kwargs)
return lopt
......@@ -502,11 +503,6 @@ def register_specialize_device(lopt, *tags, **kwargs):
return lopt
# Register merge_optimizer as a global opt during canonicalize
compile.optdb['canonicalize'].register('canon_merge', merge_optimizer,
'fast_run', final_opt=True)
#####################
# Dot optimizations #
#####################
......@@ -1414,6 +1410,172 @@ theano.compile.mode.optdb.register('ShapeOpt', ShapeOptimizer(),
0.1, 'fast_run', 'fast_compile')
def local_elemwise_alloc_op(ElemwiseOP, AllocOP, DimShuffleOP):
    # Factory: builds a local optimizer parameterized by the concrete
    # Elemwise/Alloc/DimShuffle op classes, so the same rewrite can be
    # instantiated for both the CPU and GPU variants of these ops.
    def local_elemwise_alloc(node):
        """
        elemwise(alloc(x, shp), ..., y.TensorType(BROADCAST CONDITION))
          -> elemwise(x, y.TensorType(BROADCAST CONDITION))

        elemwise(dimshuffle(alloc(x, shp)), ..., y.TensorType(BROADCAST CONDITION))
          -> elemwise(x.dimshuffle(...), y.TensorType(BROADCAST CONDITION))

        BROADCAST CONDITION: the condition is that one input that is not
        being optimized must have the same broadcast pattern as the
        output.

        We can replace the alloc with a dimshuffle, as the elemwise
        already has the shape info. The dimshuffle will be faster
        to execute.
        """
        if not isinstance(node.op, ElemwiseOP):
            return False

        if len(node.outputs) > 1:
            # Ensure all outputs have the same broadcast pattern.
            # This is a supposition that I'm not sure is always true.
            assert all([o.type.broadcastable ==
                        node.outputs[0].type.broadcastable for o in
                        node.outputs[1:]])

        # The broadcast pattern of the output must match the broadcast
        # pattern of at least one of the inputs.
        if not any([i.type.broadcastable ==
                    node.outputs[0].type.broadcastable for i in node.inputs]):
            return False

        def dimshuffled_alloc(i):
            # True iff `i` is produced by a DimShuffle whose input is
            # itself produced by an Alloc.
            return (isinstance(i.owner.op, DimShuffleOP) and
                    i.owner.inputs[0].owner and
                    isinstance(i.owner.inputs[0].owner.op, AllocOP))

        # At least one input must have an owner that is either a AllocOP or a
        # DimShuffleOP with an owner that is a AllocOP -- otherwise there is
        # nothing to optimize.
        if not any([i.owner and (isinstance(i.owner.op, AllocOP) or
                                 dimshuffled_alloc(i)) for i in node.inputs]):
            return False

        # Search for input that we can use as a baseline for the dimensions.
        assert_op_idx = -1
        for idx, i in enumerate(node.inputs):
            if i.type.broadcastable == node.outputs[0].type.broadcastable:
                # Prefer an input that is not a AllocOP nor a DimShuffleOP of a
                # AllocOP so that all allocs can be optimized.
                if not (i.owner and (isinstance(i.owner.op, AllocOP) or
                                     dimshuffled_alloc(i))):
                    assert_op_idx = idx
                    break

        # It may be the case that only AllocOP and DimShuffleOP of AllocOP exist.
        if assert_op_idx < 0:
            # We want to optimize as many allocs as possible. When
            # there is more than one, do all but one. l2 is the list of
            # inputs with an alloc or a dimshuffle of an alloc.
            l2 = [i for i in node.inputs
                  if (i.owner and (isinstance(i.owner.op, AllocOP) or
                                   dimshuffled_alloc(i)))]
            # If only 1 alloc or dimshuffle alloc, it is the one we
            # will use for the shape. So no alloc would be removed.
            if len(l2) > 1:
                # l contains inputs with alloc or dimshuffle alloc
                # only. Its length will always be at least one, as we
                # checked that before.
                l = [idx for idx, i in enumerate(node.inputs)
                     if i.broadcastable == node.outputs[0].broadcastable]
                assert_op_idx = l[0]  # The first one is as good as any to use.
            else:
                # Nothing would be optimized!
                return False

        assert_op = node.inputs[assert_op_idx]
        cmp_op = assert_op
        new_i = []
        same_shape = node.fgraph.shape_feature.same_shape
        for i in node.inputs:
            # Remove alloc
            if (i.owner and isinstance(i.owner.op, AllocOP) and
                    i.owner.inputs[0].type != i.owner.outputs[0].type):
                # when i.owner.inputs[0].type == i.owner.outputs[0].type we
                # will remove that alloc later
                assert i.type.ndim == cmp_op.ndim
                if (theano.config.experimental.local_alloc_elemwise_assert and
                        not same_shape(i, cmp_op)):
                    # Guard the rewrite with a runtime shape check on the
                    # non-broadcastable dimensions.
                    assert_op = assert_(assert_op,
                                        *[T.eq(i.shape[idx], cmp_op.shape[idx])
                                          for idx in xrange(i.type.ndim)
                                          if not i.type.broadcastable[idx]])
                new_i.append(i.owner.inputs[0])

            # Remove Alloc in DimShuffle
            elif i.owner and dimshuffled_alloc(i):
                assert i.type.ndim == cmp_op.type.ndim
                if theano.config.experimental.local_alloc_elemwise_assert:
                    # Only assert on dims not already proven equal by the
                    # shape feature.
                    assert_cond = [T.eq(i.shape[idx], cmp_op.shape[idx])
                                   for idx in xrange(i.type.ndim)
                                   if not i.type.broadcastable[idx] and
                                   not same_shape(i, cmp_op, idx, idx)]
                    if assert_cond:
                        assert_op = assert_(assert_op, *assert_cond)
                alloc_input = i.owner.inputs[0].owner.inputs[0]
                if alloc_input.ndim != i.owner.inputs[0].ndim:
                    # The alloc can add dimensions to the value.
                    # We add a dimshuffle to add them.
                    # We let later optimizations merge the multiple dimshuffles.
                    nb_dim_to_add = i.owner.inputs[0].ndim - alloc_input.ndim
                    alloc_input = alloc_input.dimshuffle(
                        ['x'] * nb_dim_to_add +
                        list(range(alloc_input.ndim)))

                # We need to keep the dimshuffle. It could swap axes or
                # add dimensions anywhere.
                r_i = i.owner.op(alloc_input)
                # Copy stack trace from i to new_i
                copy_stack_trace(i, r_i)
                new_i.append(r_i)
            else:
                new_i.append(i)
        new_i[assert_op_idx] = assert_op

        ret = node.op(*new_i, return_list=True)

        # Copy over stack trace from previous outputs to new outputs.
        copy_stack_trace(node.outputs, ret)
        return ret

    return local_elemwise_alloc
# TODO, global optimizer that lift the assert to the beginning of the graph.
# TODO, optimize all inputs when possible -- currently when all inputs have
# an alloc all but one is optimized.
local_elemwise_alloc = register_specialize(
gof.local_optimizer([T.Elemwise])(
local_elemwise_alloc_op(T.Elemwise, T.Alloc, T.DimShuffle)),
'local_alloc_elemwise')
# NOTE(review): this flag is deprecated -- is_valid only accepts truthy
# values, so setting it to False raises an error; exclusion via
# optimizer_excluding is the supported way to disable the optimization.
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
                                 "DEPRECATED: If True, enable the experimental"
                                 " optimization local_alloc_elemwise."
                                 " Generates error if not True. Use"
                                 " optimizer_excluding=local_alloc_elemwise"
                                 " to disable.",
                                 theano.configparser.BoolParam(
                                     True,
                                     is_valid=lambda x: x
                                 ),
                                 in_c_key=False)

# False could make the graph faster but not as safe.
theano.configparser.AddConfigVar(
    'experimental.local_alloc_elemwise_assert',
    "When the local_alloc_elemwise is applied, add"
    " an assert to highlight shape errors.",
    theano.configparser.BoolParam(True),
    in_c_key=False)
@gof.local_optimizer([T.Elemwise])
def local_fill_sink(node):
"""
......@@ -1443,7 +1605,6 @@ def local_fill_sink(node):
            # The newly created node c doesn't have 'clients',
            # so this iteration takes place with node.outputs[0]
replacements = {node.outputs[0]: c}
all_clients_replaced = True
for client, cl_idx in node.outputs[0].clients:
if (hasattr(client, 'op') and
isinstance(client.op, T.Elemwise) and
......@@ -1456,13 +1617,8 @@ def local_fill_sink(node):
new_client.owner.outputs[0].clients = client.outputs[0].clients
r = local_fill_sink.transform(new_client.owner)
if not r:
all_clients_replaced = False
continue
replacements.update(r)
else:
all_clients_replaced = False
if all_clients_replaced:
replacements.pop(node.outputs[0], None)
return replacements
register_canonicalize(local_fill_sink)
......@@ -1470,7 +1626,7 @@ register_canonicalize(local_fill_sink)
@register_specialize
@register_stabilize
@register_canonicalize
# @register_canonicalize # We make full pass after the canonizer phase.
@gof.local_optimizer([T.fill])
def local_fill_to_alloc(node):
"""fill(s,v) -> alloc(v, shape(s))
......@@ -1510,7 +1666,18 @@ def local_fill_to_alloc(node):
node,) # theano.printing.debugprint(node.outputs[0], file='str'))
return rval
# Register this after stabilize at 1.5 to make sure stabilize doesn't
# get affected by a less canonicalized graph due to alloc.
compile.optdb.register('local_fill_to_alloc',
in2out(local_fill_to_alloc),
1.51, 'fast_run')
# Needed to clean some extra alloc added by local_fill_to_alloc
compile.optdb.register('local_elemwise_alloc',
in2out(local_elemwise_alloc),
1.52, 'fast_run')
@register_canonicalize("fast_compile")
@gof.local_optimizer([T.fill])
def local_useless_fill(node):
"""fill(s,v) -> v
......@@ -1526,9 +1693,6 @@ def local_useless_fill(node):
# this is a useless fill, erase it.
# also, we don't need to copy over any stack traces here
return [v]
compile.optdb['canonicalize'].register('local_useless_fill',
in2out(local_useless_fill),
1.1, 'fast_compile')
@register_specialize
......@@ -2009,172 +2173,6 @@ compile.optdb['specialize'].register('local_remove_all_assert',
'unsafe',
use_db_name_as_tag=False)
def local_elemwise_alloc_op(ElemwiseOP, AllocOP, DimShuffleOP):
    """Build a local optimizer that removes Alloc inputs of Elemwise nodes.

    Parameterized on the op classes so the same logic can be reused for
    CPU and GPU variants of Elemwise/Alloc/DimShuffle.
    """
    def local_elemwise_alloc(node):
        """
        elemwise(alloc(x, shp), ..., y.TensorType(BROADCAST CONDITION))
          -> elemwise(x, y.TensorType(BROADCAST CONDITION))

        elemwise(dimshuffle(alloc(x, shp)), ..., y.TensorType(BROADCAST CONDITION))
          -> elemwise(x.dimshuffle(...), y.TensorType(BROADCAST CONDITION))

        BROADCAST CONDITION: the condition is that the one input that is
        not to be optimized must have the same broadcast pattern as the
        output.

        We can replace the alloc by a dimshuffle as the elemwise
        already has the shape info. The dimshuffle will be faster
        to execute.
        """
        if not isinstance(node.op, ElemwiseOP):
            return False

        if len(node.outputs) > 1:
            # Ensure all outputs have the same broadcast pattern.
            # This is a supposition that I'm not sure is always true.
            assert all([o.type.broadcastable ==
                        node.outputs[0].type.broadcastable for o in
                        node.outputs[1:]])

        # The broadcast pattern of the output must match the broadcast
        # pattern of at least one of the inputs.
        if not any([i.type.broadcastable ==
                    node.outputs[0].type.broadcastable for i in node.inputs]):
            return False

        def dimshuffled_alloc(i):
            # True when i is DimShuffleOP(AllocOP(...)).
            return (isinstance(i.owner.op, DimShuffleOP) and
                    i.owner.inputs[0].owner and
                    isinstance(i.owner.inputs[0].owner.op, AllocOP))

        # At least one input must have an owner that is either a AllocOP or a
        # DimShuffleOP with an owner that is a AllocOP -- otherwise there is
        # nothing to optimize.
        if not any([i.owner and (isinstance(i.owner.op, AllocOP) or
                                 dimshuffled_alloc(i)) for i in node.inputs]):
            return False

        # Search for an input that we can use as a baseline for the dimensions.
        assert_op_idx = -1
        for idx, i in enumerate(node.inputs):
            if i.type.broadcastable == node.outputs[0].type.broadcastable:
                # Prefer an input that is not a AllocOP nor a DimShuffleOP of a
                # AllocOP so that all allocs can be optimized.
                if not (i.owner and (isinstance(i.owner.op, AllocOP) or
                        dimshuffled_alloc(i))):
                    assert_op_idx = idx
                    break

        # It may be the case that only AllocOP and DimShuffleOP of AllocOP exist.
        if assert_op_idx < 0:
            # We want to optimize as many allocs as possible. When
            # there is more than one then do all but one.  l2 = number of
            # inputs with alloc or dimshuffle-of-alloc.
            l2 = [i for i in node.inputs
                  if (i.owner and (isinstance(i.owner.op, AllocOP) or
                                   dimshuffled_alloc(i)))]
            # If only 1 alloc or dimshuffle alloc, it is the one we
            # will use for the shape. So no alloc would be removed.
            if len(l2) > 1:
                # l contains inputs with alloc or dimshuffle alloc
                # only.  Its length will always be at least one, as we
                # checked that before.
                l = [idx for idx, i in enumerate(node.inputs)
                     if i.broadcastable == node.outputs[0].broadcastable]
                assert_op_idx = l[0]  # The first one is as good as any to use.
            else:
                # Nothing would be optimized!
                return False

        # The baseline input; shape asserts (if enabled) get chained onto it.
        assert_op = node.inputs[assert_op_idx]
        cmp_op = assert_op
        new_i = []
        same_shape = node.fgraph.shape_feature.same_shape
        for i in node.inputs:
            # Remove alloc
            if (i.owner and isinstance(i.owner.op, AllocOP) and
                    i.owner.inputs[0].type != i.owner.outputs[0].type):
                # when i.owner.inputs[0].type == i.owner.outputs[0].type we
                # will remove that alloc later
                assert i.type.ndim == cmp_op.ndim
                if (theano.config.experimental.local_alloc_elemwise_assert and
                        not same_shape(i, cmp_op)):
                    # Guard non-broadcastable dims with runtime shape checks,
                    # unless shape inference already proved them equal.
                    assert_op = assert_(assert_op,
                                        *[T.eq(i.shape[idx], cmp_op.shape[idx])
                                          for idx in xrange(i.type.ndim)
                                          if not i.type.broadcastable[idx]])
                new_i.append(i.owner.inputs[0])

            # Remove Alloc in DimShuffle
            elif i.owner and dimshuffled_alloc(i):
                assert i.type.ndim == cmp_op.type.ndim
                if theano.config.experimental.local_alloc_elemwise_assert:
                    assert_cond = [T.eq(i.shape[idx], cmp_op.shape[idx])
                                   for idx in xrange(i.type.ndim)
                                   if not i.type.broadcastable[idx] and
                                   not same_shape(i, cmp_op, idx, idx)]
                    if assert_cond:
                        assert_op = assert_(assert_op, *assert_cond)
                alloc_input = i.owner.inputs[0].owner.inputs[0]
                if alloc_input.ndim != i.owner.inputs[0].ndim:
                    # The alloc can add dimensions to the value.
                    # We add a dimshuffle to add them.
                    # We let later optimization merge the multiple dimshuffles.
                    nb_dim_to_add = i.owner.inputs[0].ndim - alloc_input.ndim
                    alloc_input = alloc_input.dimshuffle(
                        ['x'] * nb_dim_to_add +
                        list(range(alloc_input.ndim)))

                # We need to keep the dimshuffle. It could swap axes or
                # add dimensions anywhere.
                r_i = i.owner.op(alloc_input)
                # Copy stack trace from i to new r_i
                copy_stack_trace(i, r_i)
                new_i.append(r_i)
            else:
                # Input left untouched.
                new_i.append(i)
        new_i[assert_op_idx] = assert_op

        ret = node.op(*new_i, return_list=True)

        # Copy over stack trace from previous outputs to new outputs.
        copy_stack_trace(node.outputs, ret)
        return ret

    return local_elemwise_alloc
# TODO: global optimizer that lifts the assert to the beginning of the graph.
# TODO: optimize all inputs when possible -- currently when all inputs have
# an alloc, all but one are optimized.

# Instantiate the generic factory for the CPU ops and register it in the
# 'specialize' phase under the tag 'local_alloc_elemwise'.
local_elemwise_alloc = register_specialize(
    gof.local_optimizer([T.Elemwise])(
        local_elemwise_alloc_op(T.Elemwise, T.Alloc, T.DimShuffle)),
    'local_alloc_elemwise')
# Deprecated flag: the optimization is now always enabled; use
# optimizer_excluding=local_alloc_elemwise to turn it off instead.
# is_valid=lambda x: x rejects any falsy value, i.e. setting it to
# False raises an error.
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
                                 "DEPRECATED: If True, enable the experimental"
                                 " optimization local_alloc_elemwise."
                                 " Generates error if not True. Use"
                                 " optimizer_excluding=local_alloc_elemwise"
                                 " to disable.",
                                 theano.configparser.BoolParam(
                                     True,
                                     is_valid=lambda x: x
                                 ),
                                 in_c_key=False)

# False could make the graph faster but not as safe.
theano.configparser.AddConfigVar(
    'experimental.local_alloc_elemwise_assert',
    "When the local_alloc_elemwise is applied, add"
    " an assert to highlight shape errors.",
    theano.configparser.BoolParam(True),
    in_c_key=False)
#######################
# Constant Canonicalization
############################
......@@ -4018,7 +4016,9 @@ class Canonizer(gof.LocalOptimizer):
"""
if isinstance(v, Variable):
try:
return get_scalar_constant_value(v)
# As the constant folding is in the canonicalize phase,
# We don't need to check all the graph each time.
return get_scalar_constant_value(v, only_process_constants=True)
except NotScalarConstantError:
return None
else:
......@@ -5467,9 +5467,6 @@ def local_greedy_distributor(node):
return [rval]
@register_canonicalize('fast_compile')
@register_stabilize('fast_compile')
@register_specialize('fast_compile')
@gof.local_optimizer(None)
def constant_folding(node):
for input in node.inputs:
......@@ -5519,6 +5516,13 @@ def constant_folding(node):
return rval
# Topological (in2out) wrapper around constant_folding; ignore_newtrees=True
# means nodes introduced by the folding itself are not revisited.
topo_constant_folding = in2out(constant_folding, ignore_newtrees=True,
                               name="topo_constant_folding")
# Registered with final_opt=True so it runs as a final pass of each
# equilibrium phase (canonicalize, stabilize, specialize), including
# under fast_compile.
register_canonicalize(topo_constant_folding, 'fast_compile', final_opt=True)
register_stabilize(topo_constant_folding, 'fast_compile', final_opt=True)
register_specialize(topo_constant_folding, 'fast_compile', final_opt=True)
def _is_1(expr):
"""
......@@ -5758,7 +5762,7 @@ def local_log_erfc(node):
# sqrt(pi)*-x/(1-1/(2*x**2)+3/(4*x**4)-15/(8*x**6)))
# for float64: threshold=26.63 see at the end of the fct for the explanation
# for float32: threshold=9.3 see at the end of the fct for the explanation
# TODO: remove the contraint that there are only 2 inputs to mul and exp(x**2)
# TODO: remove the constraint that there are only 2 inputs to exp(x**2)
# is the second.
# TODO: at the test point 10 in float32, there is instability in the original
# value. The original gives -30.0, the stab -20.1 and in float64 -18.1.
......@@ -5779,20 +5783,23 @@ def local_grad_log_erfc_neg(node):
# The mul is optional.
if node.inputs[0].owner.op != T.mul:
mul = None
y = 1
y = []
if not node.inputs[0].owner or node.inputs[0].owner.op != T.exp:
return False
exp = node.inputs[0]
else:
mul = node.inputs[0]
if mul.owner.inputs[0].owner or len(mul.owner.inputs) != 2:
return False
y = mul.owner.inputs[0]
if (not mul.owner.inputs[1].owner or
mul.owner.inputs[1].owner.op != T.exp):
return False
exp = mul.owner.inputs[1]
exp = None
for idx, inp in enumerate(mul.owner.inputs):
if inp.owner and inp.owner.op == T.exp:
exp = inp
break
if len(mul.owner.inputs) == 2:
y = [mul.owner.inputs[1 - idx]]
else:
y = mul.owner.inputs[:]
del y[idx]
del mul
if not exp.owner.inputs[0].owner:
return False
......@@ -5894,9 +5901,10 @@ def local_grad_log_erfc_neg(node):
# threshold = 10.1
elif x.dtype == 'float64':
threshold = 26.641747557
ret = T.switch(x < threshold, true_div_no_mul, stab_value) * y
ret = T.switch(x < threshold, true_div_no_mul, stab_value)
if y:
ret = T.mul(ret, *y)
ret.values_eq_approx = values_eq_approx_remove_inf_nan
return [ret]
"""
The libm used for the test is amdlibm
......
......@@ -256,7 +256,10 @@ class DownsampleFactorMax(Op):
raise TypeError()
# TODO: consider restricting the dtype?
x = tensor.as_tensor_variable(x)
return gof.Apply(self, [x], [x.type()])
# If the input shape are broadcastable we can have 0 in the output shape
broad = x.broadcastable[:2] + (False, False)
out = tensor.TensorType(x.dtype, broad)
return gof.Apply(self, [x], [out()])
def perform(self, node, inp, out):
x, = inp
......
......@@ -801,6 +801,16 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
[image_val, maxout_val, gz_val],
MaxPoolGrad,
warn=False)
# checking with broadcastable input
image = tensor.tensor(dtype='float64',
broadcastable=(False, False, True, True))
image_val = rng.rand(4, 6, 1, 1)
self._compile_and_check(
[image],
[DownsampleFactorMax((2, 2),
ignore_border=True,
padding=(0, 0))(image)],
[image_val], DownsampleFactorMax)
def test_opt_max_to_average(self):
im = theano.tensor.tensor4()
......
......@@ -481,7 +481,7 @@ class test_canonize(unittest.TestCase):
mode = compile.mode.get_default_mode()
opt = gof.Query(["canonicalize"])
opt = opt.including('ShapeOpt')
opt = opt.including('ShapeOpt', 'local_fill_to_alloc')
opt = opt.excluding(
'local_elemwise_fusion')
mode = mode.__class__(linker=mode.linker, optimizer=opt)
......@@ -4021,7 +4021,8 @@ class T_Rebroadcast(unittest.TestCase):
class T_useless_elemwise(unittest.TestCase):
def setUp(self):
self.mode = theano.compile.get_default_mode().including('canonicalize')
self.mode = theano.compile.get_default_mode().including(
'canonicalize', 'local_fill_to_alloc')
def test_eq(self):
x = T.dmatrix()
......@@ -4545,7 +4546,7 @@ class T_local_erfc(unittest.TestCase):
# test that we work without the mul
f = theano.function([x], T.exp(T.neg(T.sqr(x))) / T.erfc(x), mode=mode)
assert len(f.maker.fgraph.apply_nodes) == 23, len(f.maker.fgraph.apply_nodes)
assert len(f.maker.fgraph.apply_nodes) == 22, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert all(numpy.isfinite(f(val)))
......@@ -4558,7 +4559,7 @@ class T_local_erfc(unittest.TestCase):
# test that we work without the sqr and neg
f = theano.function([x], T.exp(T.mul(-1, x, x)) / T.erfc(x), mode=mode)
assert len(f.maker.fgraph.apply_nodes) == 22, len(f.maker.fgraph.apply_nodes)
assert len(f.maker.fgraph.apply_nodes) == 21, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert all(numpy.isfinite(f(val)))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论