Merge pull request #4876 from Sentient07/cgt-opt

Cgt opt

Merge pull request #4876 from Sentient07/cgt-opt
140d0a06 · abergeron · GitHub · c49d23bd · 085b71c8 · 140d0a06
--- a/theano/compile/mode.py
+++ b/theano/compile/mode.py
@@ -150,6 +150,21 @@ optdb = gof.SequenceDB()
 optdb.register('merge1', gof.MergeOptimizer(),
               0, 'fast_run', 'fast_compile', 'merge')
+# After scan1 opt at 0.5 and before ShapeOpt at 1
+# This should only remove nodes.
+# The opts should not do anything that needs shape inference.
+# A new node without infer_shape may only be introduced when the
+# original node also lacks infer_shape.
+local_useless = gof.optdb.LocalGroupDB(apply_all_opts=True, profile=True)
+optdb.register(
+    'useless',
+    gof.optdb.TopoDB(local_useless,
+                     failure_callback=gof.opt.NavigatorOptimizer.warn_inplace),
+    0.6, 'fast_run', 'fast_compile')
+optdb.register('merge1.1', gof.MergeOptimizer(),
+               0.65, 'fast_run', 'fast_compile', 'merge')
 # rearranges elemwise expressions
 optdb.register('canonicalize', gof.EquilibriumDB(ignore_newtrees=False),
               1, 'fast_run', 'fast_compile', 'canonicalize_db')

--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -52,6 +52,7 @@ def _atexit_print_fn():
        destination_file = sys.stdout
    else:
        destination_file = open(config.profiling.destination, 'w')
    # Reverse sort in the order of compile+exec time
    for ps in sorted(_atexit_print_list,
                     key=lambda a:a.compile_time + a.fct_call_time)[::-1]:

--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
--- a/theano/gof/optdb.py
+++ b/theano/gof/optdb.py
@@ -321,6 +321,9 @@ class SequenceDB(DB):
    def register(self, name, obj, position, *tags):
        super(SequenceDB, self).register(name, obj, *tags)
+        if position == 'last':
+            self.__position__[name] = max(self.__position__.values())
+        else:
            assert isinstance(position, (integer_types, float))
            self.__position__[name] = position
@@ -390,7 +393,7 @@ class SequenceDB(DB):
        return sio.getvalue()
-class LocalGroupDB(SequenceDB):
+class LocalGroupDB(DB):
    """
    Generate a local optimizer of type LocalOptGroup instead
    of a global optimizer.
@@ -399,11 +402,41 @@ class LocalGroupDB(SequenceDB):
    """
-    seq_opt = opt.LocalOptGroup
+    def __init__(self, apply_all_opts=False, profile=False):
-    def __init__(self, failure_callback=opt.SeqOptimizer.warn):
        super(LocalGroupDB, self).__init__()
        self.failure_callback = None
+        self.apply_all_opts = apply_all_opts
+        self.profile = profile
+    def query(self, *tags, **kwtags):
+        # For the new `useless` optimizer
+        opts = super(LocalGroupDB, self).query(*tags, **kwtags)
+        ret = opt.LocalOptGroup(*opts,
+                                apply_all_opts=self.apply_all_opts,
+                                profile=self.profile)
+        return ret
+class TopoDB(DB):
+    """
+    Generate a Global Optimizer of type TopoOptimizer.
+    """
+    def __init__(self, db, order='in_to_out', ignore_newtrees=False,
+                 failure_callback=None):
+        super(TopoDB, self).__init__()
+        self.db = db
+        self.order = order
+        self.ignore_newtrees = ignore_newtrees
+        self.failure_callback = failure_callback
+    def query(self, *tags, **kwtags):
+        return opt.TopoOptimizer(self.db.query(*tags, **kwtags),
+                                 self.order,
+                                 self.ignore_newtrees,
+                                 self.failure_callback)
 class ProxyDB(DB):

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -736,7 +736,11 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
    GpuElemwise,
    max_inputs_to_GpuElemwise)
 optdb.register('gpua_elemwise_fusion',
-               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00,
+               # 48.5 move to gpu
+               # 48.6 specialize
+               # 49 cpu fusion
+               # 49.5 add destroy handler
+               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 49,
               'fast_run', 'fusion', 'local_elemwise_fusion', 'gpuarray')
 inplace_gpu_elemwise_opt = tensor.opt.inplace_elemwise_optimizer_op(

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -22,7 +22,7 @@ from theano import gof
 from theano.compat import izip
 from theano.gof import opt, InconsistencyError, TopoOptimizer, graph
 from theano.gof import Variable, Constant
-from theano.gof.opt import copy_stack_trace
+from theano.gof.opt import copy_stack_trace, in2out
 from theano.gof.utils import MethodNotDefined
 from theano.gradient import DisconnectedType
 from theano.configparser import config
@@ -57,44 +57,6 @@ _logger = logging.getLogger('theano.tensor.opt')
 # Utilities
-def out2in(*local_opts, **kwargs):
-    """WRITEME """
-    name = (kwargs and kwargs.pop('name', None))
-    if len(local_opts) > 1:
-        # Don't wrap it uselessly if their is only 1 optimization.
-        local_opts = opt.LocalOptGroup(*local_opts)
-    else:
-        local_opts, = local_opts
-        if not name:
-            name = local_opts.__name__
-    ret = opt.TopoOptimizer(local_opts,
-                            order='out_to_in',
-                            failure_callback=TopoOptimizer.warn_inplace,
-                            **kwargs)
-    if name:
-        ret.__name__ = name
-    return ret
-def in2out(*local_opts, **kwargs):
-    """WRITEME """
-    name = (kwargs and kwargs.pop('name', None))
-    if len(local_opts) > 1:
-        # Don't wrap it uselessly if their is only 1 optimization.
-        local_opts = opt.LocalOptGroup(*local_opts)
-    else:
-        local_opts, = local_opts
-        if not name:
-            name = local_opts.__name__
-    ret = opt.TopoOptimizer(local_opts,
-                            order='in_to_out',
-                            failure_callback=TopoOptimizer.warn_inplace,
-                            **kwargs)
-    if name:
-        ret.__name__ = name
-    return ret
 def _fill_chain(new_out, orig_inputs):
    for i in orig_inputs:
        new_out = T.fill(i, new_out)
@@ -409,6 +371,19 @@ compile.optdb.register('inplace_elemwise_opt', inplace_elemwise_optimizer, 75,
                       'fast_run', 'inplace')
+def register_useless(lopt, *tags, **kwargs):
+    if type(lopt) == str:
+        def register(inner_lopt):
+            return register_useless(inner_lopt, lopt, *tags, **kwargs)
+        return register
+    else:
+        name = kwargs.pop('name', None) or lopt.__name__
+        compile.mode.local_useless.register(name, lopt, 'last', 'fast_run',
+                                            *tags, **kwargs)
+        return lopt
 def register_canonicalize(lopt, *tags, **kwargs):
    if type(lopt) == str:
        def register(inner_lopt):
@@ -1756,6 +1731,7 @@ compile.optdb.register('local_elemwise_alloc',
 @register_canonicalize("fast_compile")
+@register_useless
 @gof.local_optimizer([T.fill])
 def local_useless_fill(node):
    """fill(s,v) -> v
@@ -1776,6 +1752,7 @@ def local_useless_fill(node):
 @register_specialize
 @register_stabilize
 @register_canonicalize
+@register_useless
 @gof.local_optimizer([T.alloc])
 def local_useless_alloc(node):
    """
@@ -1796,6 +1773,35 @@ def local_useless_alloc(node):
        # We don't need to copy over any stack traces here
        return [input]
+@register_specialize
+@register_stabilize
+@register_canonicalize
+@gof.local_optimizer([T.alloc])
+def local_canonicalize_alloc(node):
+    """If the input type is the same as the output type (dtype and broadcast)
+    there is no change in the shape of the input. So this is just a simple copy
+    of the input. This is not needed. (as local_useless_alloc)
+    Also, it will canonicalize alloc by creating Dimshuffle after the
+    alloc to introduce the dimensions of constant size 1.
+    See https://github.com/Theano/Theano/issues/4072 to know why this
+    is needed.
+    """
+    op = node.op
+    if not isinstance(op, Alloc):
+        return False
+    input = node.inputs[0]
+    output = node.outputs[0]
+    # Check if dtype and broadcast remain the same.
+    if input.type == output.type:
+        # We don't need to copy over any stack traces here
+        return [input]
    # Allow local_merge_alloc to do its work first
    clients = getattr(output, 'clients', [])
    for client, i in clients:
@@ -1803,6 +1809,7 @@ def local_useless_alloc(node):
            return
    # Check if alloc adds a broadcastable dimension with shape 1.
    output_shape = node.inputs[1:]
    num_dims_with_size_1_added_to_left = 0
    for i in range(len(output_shape) - input.ndim):
@@ -1925,6 +1932,7 @@ def local_subtensor_remove_broadcastable_index(node):
 @register_specialize
 @register_canonicalize('fast_compile_gpu')
+@register_useless
 @gof.local_optimizer([Subtensor, AdvancedSubtensor1])
 def local_subtensor_make_vector(node):
    """
@@ -2009,6 +2017,7 @@ def local_subtensor_make_vector(node):
 # TODO: the other optimization for and, or, xor, le and ge see ticket #496.
+@register_useless
 @register_canonicalize('fast_compile')
 @register_specialize
 @gof.local_optimizer([T.Elemwise])
@@ -2428,6 +2437,7 @@ def local_upcast_elemwise_constant_inputs(node):
 ##################
+@register_useless
 @register_canonicalize
 @register_specialize
 @gof.local_optimizer([IncSubtensor])
@@ -2518,6 +2528,7 @@ def local_set_to_inc_subtensor(node):
        return [ret]
+@register_useless
 @register_canonicalize
 @register_specialize
 @gof.local_optimizer([Subtensor])
@@ -2558,6 +2569,11 @@ def local_useless_subtensor(node):
    list/vector or the ARange op.
    """
+    # Bail out if the node being optimized is not part of a graph (no fgraph).
+    if not hasattr(node, 'fgraph'):
+        return
    # This optimization needs ShapeOpt and fgraph.shape_feature
    if not hasattr(node.fgraph, 'shape_feature'):
        return
@@ -2988,11 +3004,18 @@ def local_subtensor_merge(node):
            return [out]
+@register_useless
 @register_canonicalize
 @register_specialize
 @gof.local_optimizer([Subtensor])
 def local_subtensor_of_alloc(node):
-    """alloc[x:y] -> alloc"""
+    """
+    alloc(val)[x:y] -> alloc(val[...])
+    alloc(val)[x:y] -> alloc(val)
+    This can be seen as a lift, but it also reduces the amount of computation/memory.
+    """
    if not isinstance(node.op, Subtensor):
        return False
    u = node.inputs[0]
@@ -3373,6 +3396,7 @@ def local_adv_sub1_adv_inc_sub1(node):
 @register_specialize
 @register_stabilize
 @register_canonicalize
+@register_useless
 @gof.local_optimizer([IncSubtensor,
                      AdvancedIncSubtensor,
                      AdvancedIncSubtensor1])
@@ -3484,6 +3508,7 @@ def local_useless_inc_subtensor_alloc(node):
 # Rebroadcast opts #
 ####################
+@register_useless
 @register_canonicalize
 @register_specialize
 @gof.local_optimizer([T.Rebroadcast])
@@ -3611,6 +3636,7 @@ def apply_rebroadcast_opt(rval):
 #############
 @register_specialize
 @register_canonicalize
+@register_useless
 @gof.local_optimizer([T.Join])
 def local_join_1(node):
    """Join(i, x) => x
@@ -3627,6 +3653,8 @@ def local_join_1(node):
        return [tensors[0]]
+# TODO: merge in local_useless_join
+@register_useless
 @register_specialize
 @register_canonicalize
 @gof.local_optimizer([T.Join])
@@ -3683,6 +3711,7 @@ def local_join_empty(node):
 @register_specialize
 @register_canonicalize
+@register_useless
 @gof.local_optimizer([T.Join])
 def local_join_make_vector(node):
    """Join(0, make_vector1, make_vector2, ...) => Join(0, make_vector12, ...)
@@ -3785,6 +3814,7 @@ def local_expm1(node):
 ###############
 # Switch opts #
 ###############
+@register_useless('local_remove_switch_const_cond')
 @register_canonicalize('fast_compile', 'local_remove_switch_const_cond')
 @register_specialize
 @gof.local_optimizer([T.Elemwise])
@@ -4053,6 +4083,7 @@ def local_merge_switch_same_cond(node):
 #############
 # Tile Opts #
 #############
+@register_useless
 @register_canonicalize
 @register_stabilize
 @gof.local_optimizer([T.Tile])
@@ -4099,6 +4130,7 @@ def local_useless_tile(node):
 ##############
 # Split Opts #
 ##############
+@register_useless
 @register_canonicalize
 @register_specialize
 @gof.local_optimizer([T.Split])
@@ -4179,6 +4211,7 @@ register_canonicalize(local_reshape_chain(T.Reshape),
                      name='local_reshape_chain')
+@register_useless
 @register_canonicalize
 @register_stabilize
 @gof.local_optimizer([T.Reshape])
@@ -4987,6 +5020,7 @@ def local_elemwise_sub_zeros(node):
        return [T.zeros_like(node.inputs[0])]
+@register_useless
 @register_specialize
 @register_stabilize
 @register_canonicalize
@@ -5435,9 +5469,10 @@ def local_reduce_join(node):
        return [ret]
-@register_canonicalize('fast_compile')
+@register_canonicalize('fast_compile', 'local_cut_useless_reduce')
+@register_useless('local_cut_useless_reduce')
 @gof.local_optimizer(ALL_REDUCE)
-def local_cut_useless_reduce(node):
+def local_useless_reduce(node):
    """Sum(a, axis=[]) -> a  """
    if isinstance(node.op, T.CAReduce):
        summed, = node.inputs
@@ -7213,6 +7248,7 @@ def local_grad_clip(node):
        return node.inputs
+@register_useless
 @register_canonicalize
 @register_stabilize
 @register_specialize

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -39,12 +39,12 @@ from theano.tensor.opt import (
        local_useless_reshape,
        local_reshape_to_dimshuffle,
        mul_canonizer,
-        out2in,
        Shape_i,
        Assert,
        MakeVector,
        make_vector,
-        local_expm1
+        local_expm1,
+        local_canonicalize_alloc
        )
 from theano import tensor
 from theano import tensor as T
@@ -70,7 +70,7 @@ from theano.tensor.elemwise import DimShuffle
 from theano.tests import unittest_tools as utt
 from theano.compile.mode import optdb
 from theano.compile import Mode
-from theano.gof.opt import check_stack_trace
+from theano.gof.opt import check_stack_trace, out2in
 from nose.plugins.attrib import attr
 mode_opt = theano.config.mode
@@ -3175,7 +3175,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
        # Exclude local_useless_alloc, since it does not introduce
        # assert in all the same cases.
        self.fast_run_mode = self.fast_run_mode.excluding(
-            'local_useless_alloc')
+            'local_useless_alloc', 'local_canonicalize_alloc')
        # No optimization on alloc
        func = function(
            [self.vec, self.mat],
@@ -3676,7 +3676,7 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
        self.assert_eqs_const(f, 0)
-class Test_local_useless_alloc(unittest.TestCase):
+class Test_local_canonicalize_alloc(unittest.TestCase):
    def setUp(self):
        self.rng = numpy.random.RandomState(utt.fetch_seed())
@@ -3698,11 +3698,11 @@ class Test_local_useless_alloc(unittest.TestCase):
            self.assertRaises(ValueError, f)
        # No need to check_stack_trace as the optimization
-        # local_useless_alloc only removes nodes.
+        # local_canonicalize_alloc only removes nodes.
    def test1(self):
        # Test that alloc never gets instantiated during optimization
-        mode = mode_opt.excluding('local_useless_alloc')
+        mode = mode_opt.excluding('local_canonicalize_alloc')
        x = tensor.matrix('x')
        xx = tensor.fill(x, x)
@@ -3714,11 +3714,11 @@ class Test_local_useless_alloc(unittest.TestCase):
        assert tensor.Alloc not in op_classes
        # No need to check_stack_trace as the optimization
-        # local_useless_alloc only removes nodes.
+        # local_canonicalize_alloc only removes nodes.
    def test2(self):
        # Test that alloc never gets instantiated during optimization
-        mode = mode_opt.excluding('local_useless_alloc')
+        mode = mode_opt.excluding('local_canonicalize_alloc')
        x = tensor.matrix('x')
        y = tensor.tile(x, (1,)*2)
@@ -3736,7 +3736,7 @@ class Test_local_useless_alloc(unittest.TestCase):
        # The correct opt removes nodes, no need for check_stack_trace
    def test_useless_alloc_with_shape_one(self):
-        alloc_lift = out2in(local_useless_alloc)
+        alloc_lift = out2in(local_canonicalize_alloc)
        x = shared(self.rng.randn(2,))
        y = shared(self.rng.randn())
        z = shared(self.rng.randn(1, 1))