testgroup / pytensor · Commits · 140d0a06

Commit 140d0a06, authored Sep 13, 2016 by abergeron; committed Sep 13, 2016 via GitHub.

Merge pull request #4876 from Sentient07/cgt-opt

Cgt opt

Parents: c49d23bd, 085b71c8
Showing 7 changed files with 320 additions and 80 deletions.
- theano/compile/mode.py (+15 -0)
- theano/compile/profiling.py (+1 -0)
- theano/gof/opt.py (+172 -21)
- theano/gof/optdb.py (+39 -6)
- theano/gpuarray/opt.py (+5 -1)
- theano/tensor/opt.py (+78 -42)
- theano/tensor/tests/test_opt.py (+10 -10)
theano/compile/mode.py

```diff
@@ -150,6 +150,21 @@ optdb = gof.SequenceDB()
 optdb.register('merge1', gof.MergeOptimizer(),
                0, 'fast_run', 'fast_compile', 'merge')

+# After scan1 opt at 0.5 and before ShapeOpt at 1
+# This should only remove nodes.
+# The opt should not do anything that need shape inference.
+# New nodes that don't have infer_shape need that the original node
+# also don't have infer_shape
+local_useless = gof.optdb.LocalGroupDB(apply_all_opts=True, profile=True)
+optdb.register('useless',
+               gof.optdb.TopoDB(local_useless,
+                                failure_callback=gof.opt.NavigatorOptimizer.warn_inplace),
+               0.6, 'fast_run', 'fast_compile')
+
+optdb.register('merge1.1', gof.MergeOptimizer(),
+               0.65, 'fast_run', 'fast_compile', 'merge')
+
 # rearranges elemwise expressions
 optdb.register('canonicalize', gof.EquilibriumDB(ignore_newtrees=False),
                1, 'fast_run', 'fast_compile', 'canonicalize_db')
```
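The float positions are what slot the new passes into the pipeline: 'useless' lands at 0.6, between the scan optimizations at 0.5 and ShapeOpt at 1, and 'merge1.1' re-merges the graph at 0.65 before canonicalization. A minimal sketch of how `SequenceDB` ordering behaves (the db and entries below are illustrative, not part of this commit):

```python
from theano import gof

# Entries registered in a SequenceDB carry a position; query() returns
# a SeqOptimizer with the matching entries sorted by ascending position.
db = gof.SequenceDB()
db.register('merge1', gof.MergeOptimizer(), 0, 'fast_run')
db.register('merge1.1', gof.MergeOptimizer(), 0.65, 'fast_run')

seq = db.query('+fast_run')  # runs merge1 (0) before merge1.1 (0.65)
print(seq)
```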
theano/compile/profiling.py

```diff
@@ -52,6 +52,7 @@ def _atexit_print_fn():
         destination_file = sys.stdout
     else:
         destination_file = open(config.profiling.destination, 'w')

     # Reverse sort in the order of compile+exec time
     for ps in sorted(_atexit_print_list,
                      key=lambda a: a.compile_time + a.fct_call_time)[::-1]:
```
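The `[::-1]` slice turns the ascending sort into a descending one, so the profiles with the largest compile+exec time print first. An equivalent, arguably more explicit spelling (a sketch, not the committed code):

```python
# Same ordering as sorted(...)[::-1] above, using reverse=True instead.
for ps in sorted(_atexit_print_list,
                 key=lambda a: a.compile_time + a.fct_call_time,
                 reverse=True):
    pass  # print each profile, as _atexit_print_fn() does
```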
theano/gof/opt.py

```diff
@@ -5,7 +5,7 @@ amount of useful generic optimization tools.
 """
 from __future__ import absolute_import, print_function, division
-from collections import deque
+from collections import deque, defaultdict
 import copy
 import inspect
 import logging
@@ -38,7 +38,6 @@ def _list_of_nodes(fgraph):
 class Optimizer(object):
     """
-    WRITEME
     An L{Optimizer} can be applied to an L{FunctionGraph} to transform it.
     It can represent an optimization or in general any kind
@@ -64,7 +63,6 @@ class Optimizer(object):
     def apply(self, fgraph):
         """
-        WRITEME
         Applies the optimization to the provided L{FunctionGraph}. It may
         use all the methods defined by the L{FunctionGraph}. If the
@@ -76,7 +74,6 @@ class Optimizer(object):
     def optimize(self, fgraph, *args, **kwargs):
         """
-        WRITEME
         This is meant as a shortcut to:
         opt.add_requirements(fgraph)
@@ -94,7 +91,6 @@ class Optimizer(object):
     def __call__(self, fgraph):
         """
-        WRITEME
         Same as self.optimize(fgraph).
@@ -103,7 +99,6 @@ class Optimizer(object):
     def add_requirements(self, fgraph):
         """
-        WRITEME
         Add features to the fgraph that are required to apply the optimization.
         For example:
@@ -179,7 +174,6 @@ def inplace_optimizer(f):
 class SeqOptimizer(Optimizer, list):
     # inherit from Optimizer first to get Optimizer.__hash__
     """
-    WRITEME
     Takes a list of L{Optimizer} instances and applies them
     sequentially.
@@ -201,17 +195,23 @@ class SeqOptimizer(Optimizer, list):
     def __init__(self, *opts, **kw):
         """
-        WRITEME
+        Parameters
+        ----------
+        *opts :
+            The List of optimizers to be applied to a node
+        failure_callback : callable or None
+            Keyword only argument. A callback used when a failure
+            happen during optimization.
+
         """
         if len(opts) == 1 and isinstance(opts[0], (list, tuple)):
             opts = opts[0]
         self[:] = opts
         self.failure_callback = kw.pop('failure_callback', None)
+        assert len(kw) == 0

     def apply(self, fgraph):
         """
-        WRITEME
         Applies each L{Optimizer} in self in turn.
@@ -890,6 +890,7 @@ class MergeOptimizer(Optimizer):
     @staticmethod
     def print_profile(stream, prof, level=0):
         (nb_fail, replace_time, validate_time,
          callback_time, callbacks_time, nb_merged, nb_constant) = prof
@@ -1232,21 +1233,56 @@ def local_optimizer(tracks, inplace=False, requirements=()):
 class LocalOptGroup(LocalOptimizer):
-    """
-    WRITEME
+    """Takes a list of LocalOptimizer and applies them to the node.
+
+    Parameters
+    ----------
+    optimizers :
+        The List of optimizers to be applied to a node
+    reentrant : bool (Default True)
+        Keyword only argument. Reentrant information. Some global
+        optimizer like NavigatorOptimizer can use this value to
+        determine if it ignore new nodes during a pass on the
+        nodes. Sometimes, ignore_newtrees is not reentrant.
+    apply_all_opts : bool (Default False)
+        If False, it will return after the new node after the first optimizer
+        applied. Otherwise, it will start again with the new node until no new
+        optimization apply.

     """

-    def __init__(self, *optimizers):
+    def __init__(self, *optimizers, **kwargs):
         if len(optimizers) == 1 and isinstance(optimizers[0], list):
             # This happen when created by LocalGroupDB.
             optimizers = tuple(optimizers[0])
         self.opts = optimizers
+        assert isinstance(self.opts, tuple)
+
         self.reentrant = any(getattr(opt, 'reentrant', True)
                              for opt in optimizers)
         self.retains_inputs = all(getattr(opt, 'retains_inputs', False)
                                   for opt in optimizers)

+        self.apply_all_opts = kwargs.pop('apply_all_opts', False)
+        self.profile = kwargs.pop('profile', False)
+        self.track_map = defaultdict(lambda: [])
+        assert len(kwargs) == 0
+        if self.profile:
+            self.time_opts = {}
+            self.process_count = {}
+            self.applied_true = {}
+            self.node_created = {}
+
+        for o in self.opts:
+            if self.profile:
+                self.time_opts.setdefault(o, 0)
+                self.process_count.setdefault(o, 0)
+                self.applied_true.setdefault(o, 0)
+                self.node_created.setdefault(o, 0)
+            for c in o.tracks():
+                self.track_map[c].append(o)
+
     def __str__(self):
         return getattr(self, '__name__',
                        ('LocalOptGroup(%s)' %
@@ -1261,10 +1297,77 @@ class LocalOptGroup(LocalOptimizer):
         return t

     def transform(self, node):
-        for opt in self.opts:
-            repl = opt.transform(node)
-            if repl:
-                return repl
+        if len(self.opts) == 0:
+            return
+        fgraph = node.fgraph
+        repl = None
+        while True:
+            opts = (self.track_map[type(node.op)] +
+                    self.track_map[node.op] +
+                    self.track_map[None])
+            new_repl = None
+            for opt in opts:
+                opt_start = time.time()
+                new_repl = opt.transform(node)
+                opt_finish = time.time()
+                if self.profile:
+                    self.time_opts[opt] += opt_start - opt_finish
+                    self.process_count[opt] += 1
+                if not new_repl:
+                    continue
+                else:
+                    assert len(new_repl) == 1
+                if self.profile:
+                    self.node_created[opt] += len(graph.ops(fgraph.variables,
+                                                            new_repl))
+                    self.applied_true[opt] += 1
+                break  # break from the for loop over optimization.
+            if not new_repl:  # No optimization applied in the last iteration
+                return repl
+            # only 1 iteration or we are at the start of the graph.
+            if not self.apply_all_opts or not new_repl[0].owner:
+                return new_repl
+            repl = new_repl
+            node = repl[0].owner
+
+    @staticmethod
+    def print_profile(stream, prof, level=0):
+        (time_opts, process_count, applied_true, node_created, profile) = prof
+
+        if not profile:
+            return
+
+        blanc = ('    ' * int(level))
+        print(blanc, "LocalOptGroup", file=stream)
+        print(blanc, "---------------------", file=stream)
+        count_opt = []
+        not_used = []
+        not_used_time = 0
+        for o, count in iteritems(process_count):
+            if count > 0:
+                count_opt.append((time_opts[o], applied_true[o], count,
+                                  o, node_created[o]))
+            else:
+                not_used.append((time_opts[o], o))
+                not_used_time += time_opts[o]
+        if count_opt:
+            print(blanc,
+                  '  time taken - times applied - times tried - name - node_created:',
+                  file=stream)
+            count_opt.sort()
+            for (t, a_t, count, o, n_c) in count_opt[::-1]:
+                print(blanc, '  %.3fs - %d - %d - %s - %d' % (
+                    t, a_t, count, o, n_c), file=stream)
+            print(blanc, '  %.3fs - in %d optimization that were not used (display those with runtime greater than 0)' % (
+                not_used_time, len(not_used)), file=stream)
+            not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
+            for (t, o) in not_used[::-1]:
+                if t > 0:
+                    # Skip opt that have 0 times, they probably wasn't even tried.
+                    print(blanc + "  ", '  %.3fs - %s' % (t, o), file=stream)
+        else:
+            print(blanc, " The Optimizer wasn't successful ", file=stream)
+        print(file=stream)
+
+    def merge_profile(prof1, prof2):
+        raise NotImplementedError

     def print_summary(self, stream=sys.stdout, level=0, depth=-1):
         print("%s%s id=%i" % (
@@ -1281,7 +1384,6 @@ class LocalOptGroup(LocalOptimizer):
 class OpSub(LocalOptimizer):
     """
-    WRITEME
     Replaces the application of a certain op by the application of
     another op that takes the same inputs as what they are replacing.
@@ -1331,7 +1433,6 @@ class OpSub(LocalOptimizer):
 class OpRemove(LocalOptimizer):
     """
-    WRITEME
     Removes all applications of an op by transferring each of its
     outputs to the corresponding input.
@@ -1367,7 +1468,6 @@ class OpRemove(LocalOptimizer):
 class PatternSub(LocalOptimizer):
     """
-    WRITEME
     @todo update
@@ -1887,7 +1987,8 @@ class NavigatorOptimizer(Optimizer):
 class TopoOptimizer(NavigatorOptimizer):
     """
-    WRITEME
+    TopoOptimizer has one local optimizer. It tries to apply to each node, in topological order (or reverse).
+    Each time the local optimizer applies, the node gets replaced, and the topooptimizer moves on to the next one.
     """
@@ -1937,7 +2038,7 @@ class TopoOptimizer(NavigatorOptimizer):
         callback_time = fgraph.execute_callbacks_time - callback_before
         nb_nodes_end = len(fgraph.apply_nodes)
         return (self, nb, nb_nodes_start, nb_nodes_end,
-                io_t, loop_t, callback_time)
+                io_t, loop_t, callback_time, self.local_opt)

     @staticmethod
     def print_profile(stream, prof, level=0):
@@ -1948,7 +2049,7 @@ class TopoOptimizer(NavigatorOptimizer):
             return
         (opt, nb, nb_nodes_start, nb_nodes_end,
-         io_t, loop_t, callback_time) = prof
+         io_t, loop_t, callback_time, lopt) = prof
         print(blanc, "TopoOptimizer ",
               getattr(opt, "name",
                       getattr(opt, "__name__", "")), file=stream)
@@ -1958,12 +2059,62 @@ class TopoOptimizer(NavigatorOptimizer):
         print(blanc, "  init io_toposort", io_t, file=stream)
         print(blanc, "  loop time", loop_t, file=stream)
         print(blanc, "  callback_time", callback_time, file=stream)
+        if isinstance(lopt, LocalOptGroup):
+            if lopt.profile:
+                lopt.print_profile(stream, (lopt.time_opts,
+                                            lopt.process_count,
+                                            lopt.applied_true,
+                                            lopt.node_created,
+                                            lopt.profile),
+                                   level=level + 1)

     def __str__(self):
         return getattr(self, '__name__',
                        '<TopoOptimizer instance>')

+
+def out2in(*local_opts, **kwargs):
+    """
+    Uses the TopoOptimizer from the output nodes to input nodes of the graph.
+    """
+    name = (kwargs and kwargs.pop('name', None))
+    if len(local_opts) > 1:
+        # Don't wrap it uselessly if their is only 1 optimization.
+        local_opts = LocalOptGroup(*local_opts)
+    else:
+        local_opts, = local_opts
+        if not name:
+            name = local_opts.__name__
+    ret = TopoOptimizer(local_opts,
+                        order='out_to_in',
+                        failure_callback=TopoOptimizer.warn_inplace,
+                        **kwargs)
+    if name:
+        ret.__name__ = name
+    return ret
+
+
+def in2out(*local_opts, **kwargs):
+    """
+    Uses the TopoOptimizer from the input nodes to output nodes of the graph.
+    """
+    name = (kwargs and kwargs.pop('name', None))
+    if len(local_opts) > 1:
+        # Don't wrap it uselessly if their is only 1 optimization.
+        local_opts = LocalOptGroup(*local_opts)
+    else:
+        local_opts, = local_opts
+        if not name:
+            name = local_opts.__name__
+    ret = TopoOptimizer(local_opts,
+                        order='in_to_out',
+                        failure_callback=TopoOptimizer.warn_inplace,
+                        **kwargs)
+    if name:
+        ret.__name__ = name
+    return ret
+

 class OpKeyOptimizer(NavigatorOptimizer):
     """
     WRITEME
```
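A minimal sketch of the new `LocalOptGroup` knobs and the relocated `in2out` helper, assuming `opt_a` and `opt_b` are existing `LocalOptimizer` instances (hypothetical names). With `apply_all_opts=True` the group keeps re-applying its members to each replacement node until none fires; with `profile=True` it fills `time_opts`, `process_count`, `applied_true`, and `node_created` for `print_profile`:

```python
from theano.gof.opt import LocalOptGroup, in2out

# opt_a, opt_b: hypothetical local optimizers (e.g. built with
# @gof.local_optimizer). Their tracks() output fills group.track_map,
# so transform() only tries the members registered for a node's op
# (plus the None catch-all bucket).
group = LocalOptGroup(opt_a, opt_b, apply_all_opts=True, profile=True)

# Wrap the group in a TopoOptimizer walking the graph input-to-output.
# Passing name= matters here: a bare LocalOptGroup instance may have no
# __name__ attribute for in2out() to fall back on.
topo_opt = in2out(group, name='my_removal_pass')
# topo_opt.optimize(fgraph)  # fgraph: an existing FunctionGraph
```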
theano/gof/optdb.py

```diff
@@ -321,8 +321,11 @@ class SequenceDB(DB):
     def register(self, name, obj, position, *tags):
         super(SequenceDB, self).register(name, obj, *tags)
-        assert isinstance(position, (integer_types, float))
-        self.__position__[name] = position
+        if position == 'last':
+            self.__position__[name] = max(self.__position__.values())
+        else:
+            assert isinstance(position, (integer_types, float))
+            self.__position__[name] = position

     def query(self, *tags, **kwtags):
         """
@@ -390,7 +393,7 @@ class SequenceDB(DB):
         return sio.getvalue()


-class LocalGroupDB(SequenceDB):
+class LocalGroupDB(DB):
     """
     Generate a local optimizer of type LocalOptGroup instead
     of a global optimizer.
@@ -399,11 +402,41 @@ class LocalGroupDB(SequenceDB):
     """

-    seq_opt = opt.LocalOptGroup
-
-    def __init__(self, failure_callback=opt.SeqOptimizer.warn):
+    def __init__(self, apply_all_opts=False, profile=False):
         super(LocalGroupDB, self).__init__()
         self.failure_callback = None
+        self.apply_all_opts = apply_all_opts
+        self.profile = profile
+
+    def query(self, *tags, **kwtags):
+        # For the new `useless` optimizer
+        opts = super(LocalGroupDB, self).query(*tags, **kwtags)
+        ret = opt.LocalOptGroup(*opts, apply_all_opts=self.apply_all_opts,
+                                profile=self.profile)
+        return ret
+
+
+class TopoDB(DB):
+    """
+    Generate a Global Optimizer of type TopoOptimizer.
+    """
+
+    def __init__(self, db, order='in_to_out', ignore_newtrees=False,
+                 failure_callback=None):
+        super(TopoDB, self).__init__()
+        self.db = db
+        self.order = order
+        self.ignore_newtrees = ignore_newtrees
+        self.failure_callback = failure_callback
+
+    def query(self, *tags, **kwtags):
+        return opt.TopoOptimizer(self.db.query(*tags, **kwtags),
+                                 self.order,
+                                 self.ignore_newtrees,
+                                 self.failure_callback)


 class ProxyDB(DB):
```
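Putting the two new DBs together, as theano/compile/mode.py now does for the 'useless' pass: `LocalGroupDB.query` packs its matching entries into a single `LocalOptGroup`, and `TopoDB.query` wraps whatever its inner db returns into a `TopoOptimizer`. A sketch under those assumptions, with `local_opt_x` standing in for any registered local optimizer (hypothetical name):

```python
from theano import gof

ldb = gof.optdb.LocalGroupDB(apply_all_opts=True, profile=True)
ldb.register('opt_x', local_opt_x, 'fast_run')  # hypothetical entry

tdb = gof.optdb.TopoDB(
    ldb, failure_callback=gof.opt.NavigatorOptimizer.warn_inplace)

topo_opt = tdb.query('+fast_run')  # a TopoOptimizer over a LocalOptGroup
# topo_opt.optimize(fgraph)
```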
theano/gpuarray/opt.py

```diff
@@ -736,7 +736,11 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
     GpuElemwise,
     max_inputs_to_GpuElemwise)
 optdb.register('gpua_elemwise_fusion',
-               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00,
+               # 48.5 move to gpu
+               # 48.6 specialize
+               # 49 cpu fusion
+               # 49.5 add destroy handler
+               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 49,
                'fast_run', 'fusion', 'local_elemwise_fusion', 'gpuarray')

 inplace_gpu_elemwise_opt = tensor.opt.inplace_elemwise_optimizer_op(
```
theano/tensor/opt.py

```diff
@@ -22,7 +22,7 @@ from theano import gof
 from theano.compat import izip
 from theano.gof import opt, InconsistencyError, TopoOptimizer, graph
 from theano.gof import Variable, Constant
-from theano.gof.opt import copy_stack_trace
+from theano.gof.opt import copy_stack_trace, in2out
 from theano.gof.utils import MethodNotDefined
 from theano.gradient import DisconnectedType
 from theano.configparser import config
@@ -57,44 +57,6 @@ _logger = logging.getLogger('theano.tensor.opt')
 # Utilities


-def out2in(*local_opts, **kwargs):
-    """WRITEME """
-    name = (kwargs and kwargs.pop('name', None))
-    if len(local_opts) > 1:
-        # Don't wrap it uselessly if their is only 1 optimization.
-        local_opts = opt.LocalOptGroup(*local_opts)
-    else:
-        local_opts, = local_opts
-        if not name:
-            name = local_opts.__name__
-    ret = opt.TopoOptimizer(local_opts,
-                            order='out_to_in',
-                            failure_callback=TopoOptimizer.warn_inplace,
-                            **kwargs)
-    if name:
-        ret.__name__ = name
-    return ret
-
-
-def in2out(*local_opts, **kwargs):
-    """WRITEME """
-    name = (kwargs and kwargs.pop('name', None))
-    if len(local_opts) > 1:
-        # Don't wrap it uselessly if their is only 1 optimization.
-        local_opts = opt.LocalOptGroup(*local_opts)
-    else:
-        local_opts, = local_opts
-        if not name:
-            name = local_opts.__name__
-    ret = opt.TopoOptimizer(local_opts,
-                            order='in_to_out',
-                            failure_callback=TopoOptimizer.warn_inplace,
-                            **kwargs)
-    if name:
-        ret.__name__ = name
-    return ret
-
-
 def _fill_chain(new_out, orig_inputs):
     for i in orig_inputs:
         new_out = T.fill(i, new_out)
@@ -409,6 +371,19 @@ compile.optdb.register('inplace_elemwise_opt', inplace_elemwise_optimizer, 75,
                        'fast_run', 'inplace')


+def register_useless(lopt, *tags, **kwargs):
+    if type(lopt) == str:
+        def register(inner_lopt):
+            return register_useless(inner_lopt, lopt, *tags, **kwargs)
+        return register
+    else:
+        name = kwargs.pop('name', None) or lopt.__name__
+
+        compile.mode.local_useless.register(name, lopt,
+                                            'last', 'fast_run', *tags, **kwargs)
+        return lopt
+
+
 def register_canonicalize(lopt, *tags, **kwargs):
     if type(lopt) == str:
         def register(inner_lopt):
@@ -1756,6 +1731,7 @@ compile.optdb.register('local_elemwise_alloc',
 @register_canonicalize("fast_compile")
+@register_useless
 @gof.local_optimizer([T.fill])
 def local_useless_fill(node):
     """fill(s,v) -> v
@@ -1776,6 +1752,7 @@ def local_useless_fill(node):
 @register_specialize
 @register_stabilize
 @register_canonicalize
+@register_useless
 @gof.local_optimizer([T.alloc])
 def local_useless_alloc(node):
     """
@@ -1796,6 +1773,35 @@ def local_useless_alloc(node):
         # We don't need to copy over any stack traces here
         return [input]


+@register_specialize
+@register_stabilize
+@register_canonicalize
+@gof.local_optimizer([T.alloc])
+def local_canonicalize_alloc(node):
+    """If the input type is the same as the output type (dtype and broadcast)
+    there is no change in the shape of the input. So this is just a simple copy
+    of the input. This is not needed. (as local_useless_alloc)
+
+    Also, it will canonicalize alloc by creating Dimshuffle after the
+    alloc to introduce the dimensions of constant size 1.
+
+    See https://github.com/Theano/Theano/issues/4072 to know why this
+    is needed.
+
+    """
+    op = node.op
+    if not isinstance(op, Alloc):
+        return False
+
+    input = node.inputs[0]
+    output = node.outputs[0]
+
+    # Check if dtype and broadcast remain the same.
+    if input.type == output.type:
+        # We don't need to copy over any stack traces here
+        return [input]
+
     # Allow local_merge_alloc to do its work first
     clients = getattr(output, 'clients', [])
     for client, i in clients:
@@ -1803,6 +1809,7 @@ def local_useless_alloc(node):
             return

+    # Check if alloc adds a broadcastable dimension with shape 1.
     output_shape = node.inputs[1:]
     num_dims_with_size_1_added_to_left = 0
     for i in range(len(output_shape) - input.ndim):
@@ -1925,6 +1932,7 @@ def local_subtensor_remove_broadcastable_index(node):
 @register_specialize
 @register_canonicalize('fast_compile_gpu')
+@register_useless
 @gof.local_optimizer([Subtensor, AdvancedSubtensor1])
 def local_subtensor_make_vector(node):
     """
@@ -2009,6 +2017,7 @@ def local_subtensor_make_vector(node):
 # TODO: the other optimization for and, or, xor, le and ge see ticket #496.

+@register_useless
 @register_canonicalize('fast_compile')
 @register_specialize
 @gof.local_optimizer([T.Elemwise])
@@ -2428,6 +2437,7 @@ def local_upcast_elemwise_constant_inputs(node):
 ##################

+@register_useless
 @register_canonicalize
 @register_specialize
 @gof.local_optimizer([IncSubtensor])
@@ -2518,6 +2528,7 @@ def local_set_to_inc_subtensor(node):
     return [ret]

+@register_useless
 @register_canonicalize
 @register_specialize
 @gof.local_optimizer([Subtensor])
@@ -2558,6 +2569,11 @@ def local_useless_subtensor(node):
     list/vector or the ARange op.

     """
+    # If the optimization is tried over a node that is not a part of graph before
+    if not hasattr(node, 'fgraph'):
+        return
+
     # This optimization needs ShapeOpt and fgraph.shape_feature
     if not hasattr(node.fgraph, 'shape_feature'):
         return
@@ -2988,11 +3004,18 @@ def local_subtensor_merge(node):
     return [out]

+@register_useless
 @register_canonicalize
 @register_specialize
 @gof.local_optimizer([Subtensor])
 def local_subtensor_of_alloc(node):
-    """alloc[x:y] -> alloc"""
+    """
+
+    alloc(val)[x:y] -> alloc(val[...])
+    alloc(val)[x:y] -> alloc(val)
+
+    This can be seen as a lift, but it also reduce the number of computation/memory.
+
+    """
     if not isinstance(node.op, Subtensor):
         return False
     u = node.inputs[0]
@@ -3373,6 +3396,7 @@ def local_adv_sub1_adv_inc_sub1(node):
 @register_specialize
 @register_stabilize
 @register_canonicalize
+@register_useless
 @gof.local_optimizer([IncSubtensor,
                       AdvancedIncSubtensor,
                       AdvancedIncSubtensor1])
@@ -3484,6 +3508,7 @@ def local_useless_inc_subtensor_alloc(node):
 # Rebroadcast opts #
 ####################

+@register_useless
 @register_canonicalize
 @register_specialize
 @gof.local_optimizer([T.Rebroadcast])
@@ -3611,6 +3636,7 @@ def apply_rebroadcast_opt(rval):
 #############

 @register_specialize
 @register_canonicalize
+@register_useless
 @gof.local_optimizer([T.Join])
 def local_join_1(node):
     """Join(i, x) => x
@@ -3627,6 +3653,8 @@ def local_join_1(node):
     return [tensors[0]]

+# TODO: merge in local_useless_join
+@register_useless
 @register_specialize
 @register_canonicalize
 @gof.local_optimizer([T.Join])
@@ -3683,6 +3711,7 @@ def local_join_empty(node):
 @register_specialize
 @register_canonicalize
+@register_useless
 @gof.local_optimizer([T.Join])
 def local_join_make_vector(node):
     """Join(0, make_vector1, make_vector2, ...) => Join(0, make_vector12, ...)
@@ -3785,6 +3814,7 @@ def local_expm1(node):
 ###############
 # Switch opts #
 ###############

+@register_useless('local_remove_switch_const_cond')
 @register_canonicalize('fast_compile', 'local_remove_switch_const_cond')
 @register_specialize
 @gof.local_optimizer([T.Elemwise])
@@ -4053,6 +4083,7 @@ def local_merge_switch_same_cond(node):
 #############
 # Tile Opts #
 #############

+@register_useless
 @register_canonicalize
 @register_stabilize
 @gof.local_optimizer([T.Tile])
@@ -4099,6 +4130,7 @@ def local_useless_tile(node):
 ##############
 # Split Opts #
 ##############

+@register_useless
 @register_canonicalize
 @register_specialize
 @gof.local_optimizer([T.Split])
@@ -4179,6 +4211,7 @@ register_canonicalize(local_reshape_chain(T.Reshape),
                       name='local_reshape_chain')

+@register_useless
 @register_canonicalize
 @register_stabilize
 @gof.local_optimizer([T.Reshape])
@@ -4987,6 +5020,7 @@ def local_elemwise_sub_zeros(node):
     return [T.zeros_like(node.inputs[0])]

+@register_useless
 @register_specialize
 @register_stabilize
 @register_canonicalize
@@ -5435,9 +5469,10 @@ def local_reduce_join(node):
     return [ret]

-@register_canonicalize('fast_compile')
+@register_canonicalize('fast_compile', 'local_cut_useless_reduce')
+@register_useless('local_cut_useless_reduce')
 @gof.local_optimizer(ALL_REDUCE)
-def local_cut_useless_reduce(node):
+def local_useless_reduce(node):
     """Sum(a, axis=[]) -> a  """
     if isinstance(node.op, T.CAReduce):
         summed, = node.inputs
@@ -7213,6 +7248,7 @@ def local_grad_clip(node):
     return node.inputs

+@register_useless
 @register_canonicalize
 @register_stabilize
 @register_specialize
```
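`register_useless` mirrors the existing `register_canonicalize`/`register_specialize` helpers, but it registers into `compile.mode.local_useless`, so the decorated optimizer also runs in the new remove-only pass at position 0.6. A sketch of both decorator forms on hypothetical no-op optimizers (returning False means "did not apply"):

```python
from theano import gof
from theano import tensor as T
from theano.tensor.opt import register_useless

@register_useless                 # bare form: registered under its own name
@gof.local_optimizer([T.fill])
def local_noop_fill(node):        # hypothetical optimizer, never fires
    return False

@register_useless('extra_tag')    # called form: 'extra_tag' becomes a tag
@gof.local_optimizer([T.alloc])
def local_noop_alloc(node):       # hypothetical optimizer, never fires
    return False
```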
theano/tensor/tests/test_opt.py

```diff
@@ -39,12 +39,12 @@ from theano.tensor.opt import (
     local_useless_reshape,
     local_reshape_to_dimshuffle,
     mul_canonizer,
-    out2in,
     Shape_i,
     Assert,
     MakeVector,
     make_vector,
-    local_expm1
+    local_expm1,
+    local_canonicalize_alloc
     )
 from theano import tensor
 from theano import tensor as T
@@ -70,7 +70,7 @@ from theano.tensor.elemwise import DimShuffle
 from theano.tests import unittest_tools as utt
 from theano.compile.mode import optdb
 from theano.compile import Mode
-from theano.gof.opt import check_stack_trace
+from theano.gof.opt import check_stack_trace, out2in
 from nose.plugins.attrib import attr

 mode_opt = theano.config.mode
@@ -3175,7 +3175,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
         # Exclude local_useless_alloc, since it does not introduce
         # assert in all the same cases.
         self.fast_run_mode = self.fast_run_mode.excluding(
-            'local_useless_alloc')
+            'local_useless_alloc', 'local_canonicalize_alloc')

         # No optimization on alloc
         func = function(
             [self.vec, self.mat],
@@ -3676,7 +3676,7 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
         self.assert_eqs_const(f, 0)


-class Test_local_useless_alloc(unittest.TestCase):
+class Test_local_canonicalize_alloc(unittest.TestCase):

     def setUp(self):
         self.rng = numpy.random.RandomState(utt.fetch_seed())
@@ -3698,11 +3698,11 @@ class Test_local_useless_alloc(unittest.TestCase):
         self.assertRaises(ValueError, f)

         # No need to check_stack_trace as the optimization
-        # local_useless_alloc only removes nodes.
+        # local_canonicalize_alloc only removes nodes.

     def test1(self):
         # Test that alloc never gets instantiated during optimization
-        mode = mode_opt.excluding('local_useless_alloc')
+        mode = mode_opt.excluding('local_canonicalize_alloc')
         x = tensor.matrix('x')
         xx = tensor.fill(x, x)
@@ -3714,11 +3714,11 @@ class Test_local_useless_alloc(unittest.TestCase):
         assert tensor.Alloc not in op_classes

         # No need to check_stack_trace as the optimization
-        # local_useless_alloc only removes nodes.
+        # local_canonicalize_alloc only removes nodes.

     def test2(self):
         # Test that alloc never gets instantiated during optimization
-        mode = mode_opt.excluding('local_useless_alloc')
+        mode = mode_opt.excluding('local_canonicalize_alloc')
         x = tensor.matrix('x')
         y = tensor.tile(x, (1,) * 2)
@@ -3736,7 +3736,7 @@ class Test_local_useless_alloc(unittest.TestCase):
         # The correct opt removes nodes, no need for check_stack_trace

     def test_useless_alloc_with_shape_one(self):
-        alloc_lift = out2in(local_useless_alloc)
+        alloc_lift = out2in(local_canonicalize_alloc)
         x = shared(self.rng.randn(2,))
         y = shared(self.rng.randn())
         z = shared(self.rng.randn(1, 1))
```