提交 140d0a06 authored 作者: abergeron's avatar abergeron 提交者: GitHub

Merge pull request #4876 from Sentient07/cgt-opt

Cgt opt
......@@ -150,6 +150,21 @@ optdb = gof.SequenceDB()
optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
# After scan1 opt at 0.5 and before ShapeOpt at 1
# This should only remove nodes.
# The opt should not do anything that need shape inference.
# New nodes that lack infer_shape are only allowed when the original
# node also lacks infer_shape
local_useless = gof.optdb.LocalGroupDB(apply_all_opts=True, profile=True)
optdb.register(
'useless',
gof.optdb.TopoDB(local_useless,
failure_callback=gof.opt.NavigatorOptimizer.warn_inplace),
0.6, 'fast_run', 'fast_compile')
optdb.register('merge1.1', gof.MergeOptimizer(),
0.65, 'fast_run', 'fast_compile', 'merge')
# rearranges elemwise expressions
optdb.register('canonicalize', gof.EquilibriumDB(ignore_newtrees=False),
1, 'fast_run', 'fast_compile', 'canonicalize_db')
......
......@@ -52,6 +52,7 @@ def _atexit_print_fn():
destination_file = sys.stdout
else:
destination_file = open(config.profiling.destination, 'w')
# Reverse sort in the order of compile+exec time
for ps in sorted(_atexit_print_list,
key=lambda a:a.compile_time + a.fct_call_time)[::-1]:
......
差异被折叠。
......@@ -321,8 +321,11 @@ class SequenceDB(DB):
def register(self, name, obj, position, *tags):
super(SequenceDB, self).register(name, obj, *tags)
assert isinstance(position, (integer_types, float))
self.__position__[name] = position
if position == 'last':
self.__position__[name] = max(self.__position__.values())
else:
assert isinstance(position, (integer_types, float))
self.__position__[name] = position
def query(self, *tags, **kwtags):
"""
......@@ -390,7 +393,7 @@ class SequenceDB(DB):
return sio.getvalue()
class LocalGroupDB(SequenceDB):
class LocalGroupDB(DB):
"""
Generate a local optimizer of type LocalOptGroup instead
of a global optimizer.
......@@ -399,11 +402,41 @@ class LocalGroupDB(SequenceDB):
"""
seq_opt = opt.LocalOptGroup
def __init__(self, failure_callback=opt.SeqOptimizer.warn):
def __init__(self, apply_all_opts=False, profile=False):
super(LocalGroupDB, self).__init__()
self.failure_callback = None
self.apply_all_opts = apply_all_opts
self.profile = profile
def query(self, *tags, **kwtags):
# For the new `useless` optimizer
opts = super(LocalGroupDB, self).query(*tags, **kwtags)
ret = opt.LocalOptGroup(*opts,
apply_all_opts=self.apply_all_opts,
profile=self.profile)
return ret
class TopoDB(DB):
    """
    Wrap another optimizer database and turn its query result into a
    global optimizer of type TopoOptimizer.

    Parameters
    ----------
    db : DB
        The wrapped database; its query() result becomes the local
        optimizer driven by the TopoOptimizer.
    order : str
        Traversal order for the TopoOptimizer ('in_to_out' by default).
    ignore_newtrees : bool
        Forwarded to the TopoOptimizer.
    failure_callback : callable or None
        Forwarded to the TopoOptimizer.

    """

    def __init__(self, db, order='in_to_out', ignore_newtrees=False,
                 failure_callback=None):
        super(TopoDB, self).__init__()
        self.db = db
        self.order = order
        self.ignore_newtrees = ignore_newtrees
        self.failure_callback = failure_callback

    def query(self, *tags, **kwtags):
        # Delegate the tag query to the wrapped db, then drive the
        # resulting local optimizer with a TopoOptimizer.
        local_opt = self.db.query(*tags, **kwtags)
        return opt.TopoOptimizer(local_opt,
                                 self.order,
                                 self.ignore_newtrees,
                                 self.failure_callback)
class ProxyDB(DB):
......
......@@ -736,7 +736,11 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
GpuElemwise,
max_inputs_to_GpuElemwise)
optdb.register('gpua_elemwise_fusion',
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00,
# 48.5 move to gpu
# 48.6 specialize
# 49 cpu fusion
# 49.5 add destroy handler
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 49,
'fast_run', 'fusion', 'local_elemwise_fusion', 'gpuarray')
inplace_gpu_elemwise_opt = tensor.opt.inplace_elemwise_optimizer_op(
......
......@@ -22,7 +22,7 @@ from theano import gof
from theano.compat import izip
from theano.gof import opt, InconsistencyError, TopoOptimizer, graph
from theano.gof import Variable, Constant
from theano.gof.opt import copy_stack_trace
from theano.gof.opt import copy_stack_trace, in2out
from theano.gof.utils import MethodNotDefined
from theano.gradient import DisconnectedType
from theano.configparser import config
......@@ -57,44 +57,6 @@ _logger = logging.getLogger('theano.tensor.opt')
# Utilities
def out2in(*local_opts, **kwargs):
    """
    Build a TopoOptimizer applying `local_opts` from outputs to inputs.

    Parameters
    ----------
    *local_opts
        One or more local optimizers; more than one is wrapped in a
        LocalOptGroup.
    **kwargs
        May contain `name` (the name given to the returned optimizer);
        remaining keywords are forwarded to TopoOptimizer.

    """
    # Pop directly with a default: the previous `kwargs and kwargs.pop(...)`
    # form left `name` bound to the (falsy) empty dict when no kwargs were
    # given, relying on the later truthiness check to mask it.
    name = kwargs.pop('name', None)
    if len(local_opts) > 1:
        # Don't wrap it uselessly if there is only 1 optimization.
        local_opts = opt.LocalOptGroup(*local_opts)
    else:
        local_opts, = local_opts
        if not name:
            name = local_opts.__name__
    ret = opt.TopoOptimizer(local_opts,
                            order='out_to_in',
                            failure_callback=TopoOptimizer.warn_inplace,
                            **kwargs)
    if name:
        ret.__name__ = name
    return ret
def in2out(*local_opts, **kwargs):
    """
    Build a TopoOptimizer applying `local_opts` from inputs to outputs.

    Parameters
    ----------
    *local_opts
        One or more local optimizers; more than one is wrapped in a
        LocalOptGroup.
    **kwargs
        May contain `name` (the name given to the returned optimizer);
        remaining keywords are forwarded to TopoOptimizer.

    """
    # Pop directly with a default: the previous `kwargs and kwargs.pop(...)`
    # form left `name` bound to the (falsy) empty dict when no kwargs were
    # given, relying on the later truthiness check to mask it.
    name = kwargs.pop('name', None)
    if len(local_opts) > 1:
        # Don't wrap it uselessly if there is only 1 optimization.
        local_opts = opt.LocalOptGroup(*local_opts)
    else:
        local_opts, = local_opts
        if not name:
            name = local_opts.__name__
    ret = opt.TopoOptimizer(local_opts,
                            order='in_to_out',
                            failure_callback=TopoOptimizer.warn_inplace,
                            **kwargs)
    if name:
        ret.__name__ = name
    return ret
def _fill_chain(new_out, orig_inputs):
for i in orig_inputs:
new_out = T.fill(i, new_out)
......@@ -409,6 +371,19 @@ compile.optdb.register('inplace_elemwise_opt', inplace_elemwise_optimizer, 75,
'fast_run', 'inplace')
def register_useless(lopt, *tags, **kwargs):
    """
    Register `lopt` in the `useless` local optimization database.

    Usable as a plain decorator (``@register_useless``) or with extra
    tag strings (``@register_useless('some_tag')``): when `lopt` is a
    string it is treated as an additional tag and a decorator is
    returned.

    """
    # isinstance instead of `type(lopt) == str`: idiomatic, and also
    # accepts str subclasses.
    if isinstance(lopt, str):
        def register(inner_lopt):
            return register_useless(inner_lopt, lopt, *tags, **kwargs)
        return register
    else:
        name = kwargs.pop('name', None) or lopt.__name__
        compile.mode.local_useless.register(name, lopt, 'last', 'fast_run',
                                            *tags, **kwargs)
        return lopt
def register_canonicalize(lopt, *tags, **kwargs):
if type(lopt) == str:
def register(inner_lopt):
......@@ -1756,6 +1731,7 @@ compile.optdb.register('local_elemwise_alloc',
@register_canonicalize("fast_compile")
@register_useless
@gof.local_optimizer([T.fill])
def local_useless_fill(node):
"""fill(s,v) -> v
......@@ -1776,6 +1752,7 @@ def local_useless_fill(node):
@register_specialize
@register_stabilize
@register_canonicalize
@register_useless
@gof.local_optimizer([T.alloc])
def local_useless_alloc(node):
"""
......@@ -1796,6 +1773,35 @@ def local_useless_alloc(node):
# We don't need to copy over any stack traces here
return [input]
@register_specialize
@register_stabilize
@register_canonicalize
@gof.local_optimizer([T.alloc])
def local_canonicalize_alloc(node):
"""If the input type is the same as the output type (dtype and broadcast)
there is no change in the shape of the input. So this is just a simple copy
of the input. This is not needed. (as local_useless_alloc)
Also, it will canonicalize alloc by creating Dimshuffle after the
alloc to introduce the dimensions of constant size 1.
See https://github.com/Theano/Theano/issues/4072 to know why this
is needed.
"""
op = node.op
if not isinstance(op, Alloc):
return False
input = node.inputs[0]
output = node.outputs[0]
# Check if dtype and broadcast remain the same.
if input.type == output.type:
# We don't need to copy over any stack traces here
return [input]
# Allow local_merge_alloc to do its work first
clients = getattr(output, 'clients', [])
for client, i in clients:
......@@ -1803,6 +1809,7 @@ def local_useless_alloc(node):
return
# Check if alloc adds a broadcastable dimension with shape 1.
output_shape = node.inputs[1:]
num_dims_with_size_1_added_to_left = 0
for i in range(len(output_shape) - input.ndim):
......@@ -1925,6 +1932,7 @@ def local_subtensor_remove_broadcastable_index(node):
@register_specialize
@register_canonicalize('fast_compile_gpu')
@register_useless
@gof.local_optimizer([Subtensor, AdvancedSubtensor1])
def local_subtensor_make_vector(node):
"""
......@@ -2009,6 +2017,7 @@ def local_subtensor_make_vector(node):
# TODO: the other optimization for and, or, xor, le and ge see ticket #496.
@register_useless
@register_canonicalize('fast_compile')
@register_specialize
@gof.local_optimizer([T.Elemwise])
......@@ -2428,6 +2437,7 @@ def local_upcast_elemwise_constant_inputs(node):
##################
@register_useless
@register_canonicalize
@register_specialize
@gof.local_optimizer([IncSubtensor])
......@@ -2518,6 +2528,7 @@ def local_set_to_inc_subtensor(node):
return [ret]
@register_useless
@register_canonicalize
@register_specialize
@gof.local_optimizer([Subtensor])
......@@ -2558,6 +2569,11 @@ def local_useless_subtensor(node):
list/vector or the ARange op.
"""
# If the optimization is tried over a node that is not a part of graph before
if not hasattr(node, 'fgraph'):
return
# This optimization needs ShapeOpt and fgraph.shape_feature
if not hasattr(node.fgraph, 'shape_feature'):
return
......@@ -2988,11 +3004,18 @@ def local_subtensor_merge(node):
return [out]
@register_useless
@register_canonicalize
@register_specialize
@gof.local_optimizer([Subtensor])
def local_subtensor_of_alloc(node):
"""alloc[x:y] -> alloc"""
"""
alloc(val)[x:y] -> alloc(val[...])
alloc(val)[x:y] -> alloc(val)
This can be seen as a lift, but it also reduce the number of computation/memory.
"""
if not isinstance(node.op, Subtensor):
return False
u = node.inputs[0]
......@@ -3373,6 +3396,7 @@ def local_adv_sub1_adv_inc_sub1(node):
@register_specialize
@register_stabilize
@register_canonicalize
@register_useless
@gof.local_optimizer([IncSubtensor,
AdvancedIncSubtensor,
AdvancedIncSubtensor1])
......@@ -3484,6 +3508,7 @@ def local_useless_inc_subtensor_alloc(node):
# Rebroadcast opts #
####################
@register_useless
@register_canonicalize
@register_specialize
@gof.local_optimizer([T.Rebroadcast])
......@@ -3611,6 +3636,7 @@ def apply_rebroadcast_opt(rval):
#############
@register_specialize
@register_canonicalize
@register_useless
@gof.local_optimizer([T.Join])
def local_join_1(node):
"""Join(i, x) => x
......@@ -3627,6 +3653,8 @@ def local_join_1(node):
return [tensors[0]]
# TODO: merge in local_useless_join
@register_useless
@register_specialize
@register_canonicalize
@gof.local_optimizer([T.Join])
......@@ -3683,6 +3711,7 @@ def local_join_empty(node):
@register_specialize
@register_canonicalize
@register_useless
@gof.local_optimizer([T.Join])
def local_join_make_vector(node):
"""Join(0, make_vector1, make_vector2, ...) => Join(0, make_vector12, ...)
......@@ -3785,6 +3814,7 @@ def local_expm1(node):
###############
# Switch opts #
###############
@register_useless('local_remove_switch_const_cond')
@register_canonicalize('fast_compile', 'local_remove_switch_const_cond')
@register_specialize
@gof.local_optimizer([T.Elemwise])
......@@ -4053,6 +4083,7 @@ def local_merge_switch_same_cond(node):
#############
# Tile Opts #
#############
@register_useless
@register_canonicalize
@register_stabilize
@gof.local_optimizer([T.Tile])
......@@ -4099,6 +4130,7 @@ def local_useless_tile(node):
##############
# Split Opts #
##############
@register_useless
@register_canonicalize
@register_specialize
@gof.local_optimizer([T.Split])
......@@ -4179,6 +4211,7 @@ register_canonicalize(local_reshape_chain(T.Reshape),
name='local_reshape_chain')
@register_useless
@register_canonicalize
@register_stabilize
@gof.local_optimizer([T.Reshape])
......@@ -4987,6 +5020,7 @@ def local_elemwise_sub_zeros(node):
return [T.zeros_like(node.inputs[0])]
@register_useless
@register_specialize
@register_stabilize
@register_canonicalize
......@@ -5435,9 +5469,10 @@ def local_reduce_join(node):
return [ret]
@register_canonicalize('fast_compile')
@register_canonicalize('fast_compile', 'local_cut_useless_reduce')
@register_useless('local_cut_useless_reduce')
@gof.local_optimizer(ALL_REDUCE)
def local_cut_useless_reduce(node):
def local_useless_reduce(node):
"""Sum(a, axis=[]) -> a """
if isinstance(node.op, T.CAReduce):
summed, = node.inputs
......@@ -7213,6 +7248,7 @@ def local_grad_clip(node):
return node.inputs
@register_useless
@register_canonicalize
@register_stabilize
@register_specialize
......
......@@ -39,12 +39,12 @@ from theano.tensor.opt import (
local_useless_reshape,
local_reshape_to_dimshuffle,
mul_canonizer,
out2in,
Shape_i,
Assert,
MakeVector,
make_vector,
local_expm1
local_expm1,
local_canonicalize_alloc
)
from theano import tensor
from theano import tensor as T
......@@ -70,7 +70,7 @@ from theano.tensor.elemwise import DimShuffle
from theano.tests import unittest_tools as utt
from theano.compile.mode import optdb
from theano.compile import Mode
from theano.gof.opt import check_stack_trace
from theano.gof.opt import check_stack_trace, out2in
from nose.plugins.attrib import attr
mode_opt = theano.config.mode
......@@ -3175,7 +3175,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
# Exclude local_useless_alloc, since it does not introduce
# assert in all the same cases.
self.fast_run_mode = self.fast_run_mode.excluding(
'local_useless_alloc')
'local_useless_alloc', 'local_canonicalize_alloc')
# No optimization on alloc
func = function(
[self.vec, self.mat],
......@@ -3676,7 +3676,7 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
self.assert_eqs_const(f, 0)
class Test_local_useless_alloc(unittest.TestCase):
class Test_local_canonicalize_alloc(unittest.TestCase):
def setUp(self):
self.rng = numpy.random.RandomState(utt.fetch_seed())
......@@ -3698,11 +3698,11 @@ class Test_local_useless_alloc(unittest.TestCase):
self.assertRaises(ValueError, f)
# No need to check_stack_trace as the optimization
# local_useless_alloc only removes nodes.
# local_canonicalize_alloc only removes nodes.
def test1(self):
# Test that alloc never gets instantiated during optimization
mode = mode_opt.excluding('local_useless_alloc')
mode = mode_opt.excluding('local_canonicalize_alloc')
x = tensor.matrix('x')
xx = tensor.fill(x, x)
......@@ -3714,11 +3714,11 @@ class Test_local_useless_alloc(unittest.TestCase):
assert tensor.Alloc not in op_classes
# No need to check_stack_trace as the optimization
# local_useless_alloc only removes nodes.
# local_canonicalize_alloc only removes nodes.
def test2(self):
# Test that alloc never gets instantiated during optimization
mode = mode_opt.excluding('local_useless_alloc')
mode = mode_opt.excluding('local_canonicalize_alloc')
x = tensor.matrix('x')
y = tensor.tile(x, (1,)*2)
......@@ -3736,7 +3736,7 @@ class Test_local_useless_alloc(unittest.TestCase):
# The correct opt removes nodes, no need for check_stack_trace
def test_useless_alloc_with_shape_one(self):
alloc_lift = out2in(local_useless_alloc)
alloc_lift = out2in(local_canonicalize_alloc)
x = shared(self.rng.randn(2,))
y = shared(self.rng.randn())
z = shared(self.rng.randn(1, 1))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论