Commit f0bd940e authored by Pascal Lamblin

Merge pull request #3477 from nouiz/crash_gpu

Crash gpu and opt speed up
......@@ -212,11 +212,11 @@ optimization you wrote. For example, consider the following:
Nothing happened here. The reason is: ``add(y, z) != add(y,
z)``. That is the case for efficiency reasons. To fix this problem we
first need to merge the parts of the graph that represent the same
computation, using the ``merge_optimizer`` defined in
computation, using the ``MergeOptimizer`` defined in
``theano.gof.opt``.
>>> from theano.gof.opt import merge_optimizer
>>> merge_optimizer.optimize(e) # doctest: +ELLIPSIS
>>> from theano.gof.opt import MergeOptimizer
>>> MergeOptimizer().optimize(e) # doctest: +ELLIPSIS
(0, ..., None, None, {}, 1, 0)
>>> e
[true_div(mul(*1 -> add(y, z), x), *1)]
......
......@@ -198,8 +198,17 @@ optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
# rearranges elemwise expressions
optdb.register('canonicalize', gof.EquilibriumDB(),
optdb.register('canonicalize', gof.EquilibriumDB(ignore_newtrees=False),
1, 'fast_run', 'fast_compile')
# Register the merge optimizer in the canonicalize Equilibrium as a
# clean-up opt. Without this, as the equilibrium has
# ignore_newtrees=False, we won't merge all nodes if it is set as a
# global optimizer with final_opt=True.
# We need a new instance of MergeOptimizer so that its name is not
# changed by other usages of it.
optdb['canonicalize'].register("merge", gof.opt.MergeOptimizer(), 'fast_run',
"fast_compile", cleanup=True)
optdb.register('merge1.2', gof.MergeOptimizer(),
1.2, 'fast_run', 'fast_compile', 'merge')
......
......@@ -547,6 +547,7 @@ class CLinker(link.Linker):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(self.schedule).accept(fgraph, no_recycling)
self.fgraph = fgraph
self.fetch_variables()
......@@ -1750,14 +1751,13 @@ class OpWiseCLinker(link.LocalLinker):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(
fallback_on_perform=self.fallback_on_perform,
allow_gc=self.allow_gc,
nice_errors=self.nice_errors,
schedule=self.schedule,
).accept(fgraph, no_recycling)
# raise Exception("Cannot accept from a Linker that is
# already tied to another FunctionGraph.")
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......
......@@ -873,8 +873,23 @@ class MergeOptimizer(Optimizer):
if i[1] > 0:
print(i)
merge_optimizer = MergeOptimizer()
@staticmethod
def merge_profile(prof1, prof2):
def merge_none_number(v1, v2):
if v1 is None:
return v2
if v2 is None:
return v1
return v1 + v2
nb_fail = prof1[0] + prof2[0]
replace_time = prof1[1] + prof2[1]
validate_time = merge_none_number(prof1[2], prof2[2])
callback_time = merge_none_number(prof1[3], prof2[3])
callbacks_time = merge_dict(prof1[4], prof2[4])
nb_merged = prof1[5] + prof2[5]
nb_constant = prof1[6] + prof2[6]
return (nb_fail, replace_time, validate_time,
callback_time, callbacks_time, nb_merged, nb_constant)
def is_same_graph_with_merge(var1, var2, givens=None):
......@@ -899,7 +914,7 @@ def is_same_graph_with_merge(var1, var2, givens=None):
for to_replace, replace_by in iteritems(givens):
fgraph.replace(to_replace, replace_by)
# Perform merge optimization.
merge_optimizer.optimize(fgraph)
MergeOptimizer().optimize(fgraph)
# When two variables perform the same computations, they will have the same
# owner in the optimized graph.
# We need to be careful with the special case where the owner is None,
......@@ -1165,7 +1180,7 @@ class FromFunctionLocalOptimizer(LocalOptimizer):
id(self)), file=stream)
def local_optimizer(tracks, inplace=False):
def local_optimizer(tracks, inplace=False, requirements=()):
def decorator(f):
"""
WRITEME
......@@ -1177,12 +1192,13 @@ def local_optimizer(tracks, inplace=False):
for t in tracks:
if not (isinstance(t, op.Op) or issubclass(t, op.PureOp)):
raise ValueError("Tracks are op classes or instances", f.__module__, f.__name__)
requirements = ()
req = requirements
if inplace:
dh_handler = dh.DestroyHandler
requirements = (lambda fgraph:
fgraph.attach_feature(dh_handler()),)
rval = FromFunctionLocalOptimizer(f, tracks, requirements)
req = tuple(requirements) + (
lambda fgraph:
fgraph.attach_feature(dh_handler()),)
rval = FromFunctionLocalOptimizer(f, tracks, req)
rval.__name__ = f.__name__
return rval
return decorator
......@@ -1974,19 +1990,41 @@ class ChangeTracker:
fgraph.change_tracker = self
def merge_dict(d1, d2):
    """Return a new dict combining `d1` and `d2`, adding the values
    of keys present in both.

    Neither input dict is modified (a shallow copy of `d1` is taken).
    Used to merge optimizer profiling counters/timings.
    """
    d = d1.copy()
    # Use the native dict iteration instead of six.iteritems: it is
    # equivalent here and removes the reliance on the compat helper.
    for k, v in d2.items():
        if k in d:
            d[k] += v
        else:
            d[k] = v
    return d
class EquilibriumOptimizer(NavigatorOptimizer):
"""
Apply optimizations until equilibrium point.
Parameters
----------
optimizers
List or set of local or global optimizations to apply until equilibrium.
max_use_ratio
optimizers : list or set
Local or global optimizations to apply until equilibrium.
The global optimizer will be run at the start of each iteration before
the local optimizer.
max_use_ratio : int or float
Each optimizer can be applied at most (size of graph * this number)
times.
ignore_newtrees
See EquilibriumDB ignore_newtrees parameter definition.
final_optimizers
Global optimizers that will be run after each iteration.
    cleanup_optimizers
        Global optimizers that apply a list of predetermined optimizations.
        They must not traverse the graph, as they are called very frequently.
        The MergeOptimizer is one example of an optimization that respects this.
        They are applied after all global optimizers, then after each applied
        local optimizer, then after all final optimizers.
"""
......@@ -1995,7 +2033,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
failure_callback=None,
ignore_newtrees=True,
max_use_ratio=None,
final_optimizers=None):
final_optimizers=None,
cleanup_optimizers=None):
super(EquilibriumOptimizer, self).__init__(
None,
ignore_newtrees=ignore_newtrees,
......@@ -2004,6 +2043,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.local_optimizers_all = []
self.global_optimizers = []
self.final_optimizers = []
self.cleanup_optimizers = []
for opt in optimizers:
if isinstance(opt, LocalOptimizer):
......@@ -2016,6 +2056,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.global_optimizers.append(opt)
if final_optimizers:
self.final_optimizers = final_optimizers
if cleanup_optimizers:
self.cleanup_optimizers = cleanup_optimizers
self.max_use_ratio = max_use_ratio
assert self.max_use_ratio is not None, (
'max_use_ratio has to be a number')
......@@ -2039,6 +2081,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
opt.add_requirements(fgraph)
for opt in self.final_optimizers:
opt.add_requirements(fgraph)
for opt in self.cleanup_optimizers:
opt.add_requirements(fgraph)
def apply(self, fgraph, start_from=None):
change_tracker = ChangeTracker()
......@@ -2066,17 +2110,39 @@ class EquilibriumOptimizer(NavigatorOptimizer):
node_created = {}
global_sub_profs = []
final_sub_profs = []
cleanup_sub_profs = []
for opt in (self.global_optimizers +
list(self.get_local_optimizers()) +
self.final_optimizers):
self.final_optimizers +
self.cleanup_optimizers):
global_process_count.setdefault(opt, 0)
time_opts.setdefault(opt, 0)
node_created.setdefault(opt, 0)
def apply_cleanup(profs_dict):
changed = False
for copt in self.cleanup_optimizers:
change_tracker.reset()
nb = change_tracker.nb_imported
t_opt = time.time()
sub_prof = copt.apply(fgraph)
time_opts[copt] += time.time() - t_opt
profs_dict[copt].append(sub_prof)
if change_tracker.changed:
process_count.setdefault(copt, 0)
process_count[copt] += 1
global_process_count[copt] += 1
changed = True
node_created[copt] += change_tracker.nb_imported - nb
return changed
while changed and not max_use_abort:
process_count = {}
t0 = time.time()
changed = False
iter_cleanup_sub_profs = {}
for copt in self.cleanup_optimizers:
iter_cleanup_sub_profs[copt] = []
# apply global optimizers
sub_profs = []
......@@ -2101,6 +2167,10 @@ class EquilibriumOptimizer(NavigatorOptimizer):
global_opt_timing.append(float(time.time() - t0))
            # Apply clean-up optimizers, as the global optimizers may have
            # made changes that require it.
changed |= apply_cleanup(iter_cleanup_sub_profs)
# apply local optimizer
topo_t0 = time.time()
q = deque(graph.io_toposort(fgraph.inputs, start_from))
......@@ -2134,19 +2204,21 @@ class EquilibriumOptimizer(NavigatorOptimizer):
t_opt = time.time()
lopt_change = self.process_node(fgraph, node, lopt)
time_opts[lopt] += time.time() - t_opt
if lopt_change:
process_count.setdefault(lopt, 0)
process_count[lopt] += 1
global_process_count[lopt] += 1
changed = True
node_created[lopt] += change_tracker.nb_imported - nb
if global_process_count[lopt] > max_use:
max_use_abort = True
opt_name = (getattr(lopt, "name", None) or
getattr(lopt, "__name__", ""))
if node not in fgraph.apply_nodes:
# go to next node
break
if not lopt_change:
continue
process_count.setdefault(lopt, 0)
process_count[lopt] += 1
global_process_count[lopt] += 1
changed = True
node_created[lopt] += change_tracker.nb_imported - nb
changed |= apply_cleanup(iter_cleanup_sub_profs)
if global_process_count[lopt] > max_use:
max_use_abort = True
opt_name = (getattr(lopt, "name", None) or
getattr(lopt, "__name__", ""))
if node not in fgraph.apply_nodes:
# go to next node
break
finally:
self.detach_updater(fgraph, u)
......@@ -2173,6 +2245,17 @@ class EquilibriumOptimizer(NavigatorOptimizer):
final_sub_profs.append(sub_profs)
global_opt_timing[-1] += time.time() - t_before_final_opt
            # Apply clean-up optimizers, as the final optimizers may have
            # made changes that require it.
changed |= apply_cleanup(iter_cleanup_sub_profs)
# merge clean up profiles during that iteration.
c_sub_profs = []
for copt, sub_profs in iteritems(iter_cleanup_sub_profs):
sub_prof = sub_profs[0]
for s_p in sub_profs[1:]:
sub_prof = copt.merge_profile(sub_prof, s_p)
c_sub_profs.append(sub_prof)
cleanup_sub_profs.append(c_sub_profs)
loop_process_count.append(process_count)
loop_timing.append(float(time.time() - t0))
......@@ -2188,7 +2271,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
return (self, loop_timing, loop_process_count,
(start_nb_nodes, end_nb_nodes, max_nb_nodes),
global_opt_timing, nb_nodes, time_opts, io_toposort_timing,
node_created, global_sub_profs, final_sub_profs)
node_created, global_sub_profs, final_sub_profs, cleanup_sub_profs)
def print_summary(self, stream=sys.stdout, level=0, depth=-1):
name = getattr(self, 'name', None)
......@@ -2204,7 +2287,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
(opt, loop_timing, loop_process_count,
(start_nb_nodes, end_nb_nodes, max_nb_nodes),
global_opt_timing, nb_nodes, time_opts, io_toposort_timing,
node_created, global_sub_profs, final_sub_profs) = prof
node_created, global_sub_profs, final_sub_profs,
cleanup_sub_profs) = prof
blanc = (' ' * level)
print(blanc, "EquilibriumOptimizer", end=' ', file=stream)
......@@ -2222,6 +2306,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
print(blanc, " time in global optimizers %.3fs" % s, file=stream)
s = sum([time_opts[o] for o in opt.final_optimizers])
print(blanc, " time in final optimizers %.3fs" % s, file=stream)
s = sum([time_opts[o] for o in opt.cleanup_optimizers])
print(blanc, " time in cleanup optimizers %.3fs" % s, file=stream)
for i in range(len(loop_timing)):
lopt = ""
if loop_process_count[i]:
......@@ -2245,7 +2331,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
process_count = {}
for o in (opt.global_optimizers +
list(opt.get_local_optimizers()) +
list(opt.final_optimizers)):
list(opt.final_optimizers) +
list(opt.cleanup_optimizers)):
process_count.setdefault(o, 0)
for count in loop_process_count:
for o, v in iteritems(count):
......@@ -2275,12 +2362,13 @@ class EquilibriumOptimizer(NavigatorOptimizer):
print(blanc + " ", ' %.3fs - %s' % (t, o), file=stream)
print(file=stream)
gf_opts = [o for o in (opt.global_optimizers +
list(opt.final_optimizers))
list(opt.final_optimizers) +
list(opt.cleanup_optimizers))
if o.print_profile.func_code is not
Optimizer.print_profile.func_code]
if not gf_opts:
return
print(blanc, "Global and final optimizer", file=stream)
print(blanc, "Global, final and clean up optimizers", file=stream)
for i in range(len(loop_timing)):
print(blanc, "Iter %d" % i, file=stream)
for o, prof in zip(opt.global_optimizers, global_sub_profs[i]):
......@@ -2293,6 +2381,11 @@ class EquilibriumOptimizer(NavigatorOptimizer):
o.print_profile(stream, prof, level + 2)
except NotImplementedError:
print(blanc, "merge not implemented for ", o)
for o, prof in zip(opt.cleanup_optimizers, cleanup_sub_profs[i]):
try:
o.print_profile(stream, prof, level + 2)
except NotImplementedError:
print(blanc, "merge not implemented for ", o)
@staticmethod
def merge_profile(prof1, prof2):
......@@ -2307,10 +2400,16 @@ class EquilibriumOptimizer(NavigatorOptimizer):
prof2[0].final_optimizers)
else:
final_optimizers = None
if len(prof1[0].cleanup_optimizers) > 0 or len(prof2[0].cleanup_optimizers) > 0:
cleanup_optimizers = OrderedSet(prof1[0].cleanup_optimizers).union(
prof2[0].cleanup_optimizers)
else:
cleanup_optimizers = None
new_opt = EquilibriumOptimizer(
local_optimizers.union(global_optimizers),
max_use_ratio=1,
final_optimizers=final_optimizers)
final_optimizers=final_optimizers,
cleanup_optimizers=cleanup_optimizers)
def merge_list(l1, l2):
l = copy.copy(l1)
......@@ -2321,15 +2420,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
l.append(nb)
return l
def merge_dict(d1, d2):
d = d1.copy()
for k, v in iteritems(d2):
if k in d:
d[k] += v
else:
d[k] = v
return d
loop_timing = merge_list(prof1[1], prof2[1])
loop_process_count = list(prof1[2])
......@@ -2358,6 +2448,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
node_created = merge_dict(prof1[8], prof2[8])
global_sub_profs = merge_list(prof1[9], prof2[9])
final_sub_profs = merge_list(prof1[10], prof2[10])
cleanup_sub_profs = merge_list(prof1[10], prof2[10])
return (new_opt,
loop_timing,
loop_process_count,
......@@ -2368,7 +2459,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
io_toposort_timing,
node_created,
global_sub_profs,
final_sub_profs)
final_sub_profs,
cleanup_sub_profs)
#################
# Utilities #
......
......@@ -268,28 +268,35 @@ class EquilibriumDB(DB):
super(EquilibriumDB, self).__init__()
self.ignore_newtrees = ignore_newtrees
self.__final__ = {}
self.__cleanup__ = {}
def register(self, name, obj, *tags, **kwtags):
if 'final_opt' in kwtags:
final_opt = kwtags['final_opt']
kwtags.pop('final_opt', None)
else:
final_opt = False
final_opt = kwtags.pop('final_opt', False)
cleanup = kwtags.pop('cleanup', False)
        # An opt should not be both a final and a clean-up optimizer.
assert not (final_opt and cleanup)
super(EquilibriumDB, self).register(name, obj, *tags, **kwtags)
self.__final__[name] = final_opt
self.__cleanup__[name] = cleanup
def query(self, *tags, **kwtags):
_opts = super(EquilibriumDB, self).query(*tags, **kwtags)
final_opts = [o for o in _opts if self.__final__.get(o.name, False)]
opts = [o for o in _opts if o not in final_opts]
cleanup_opts = [o for o in _opts if self.__cleanup__.get(o.name,
False)]
opts = [o for o in _opts
if o not in final_opts and o not in cleanup_opts]
if len(final_opts) == 0:
final_opts = None
if len(cleanup_opts) == 0:
cleanup_opts = None
return opt.EquilibriumOptimizer(
opts,
max_use_ratio=config.optdb.max_use_ratio,
ignore_newtrees=self.ignore_newtrees,
failure_callback=opt.NavigatorOptimizer.warn_inplace,
final_optimizers=final_opts)
final_optimizers=final_opts,
cleanup_optimizers=cleanup_opts)
class SequenceDB(DB):
......
......@@ -3622,7 +3622,7 @@ class GpuAllocEmpty(GpuOp):
const_shp = tensor.get_scalar_constant_value(s)
except tensor.NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
output = otype()
return sh, output
......
......@@ -48,7 +48,7 @@ cudnnSetTensorNdDescriptor(
int nbDims,
const int dimA[],
const int strideA[]) {
if (ndDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
if (nbDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetTensor4dDescriptorEx(
tensorDesc, dataType,
dimA[0], dimA[1], dimA[2], dimA[3],
......@@ -204,7 +204,7 @@ cudnnSetPoolingNdDescriptor(
int nbDims,
const int windowDimA[],
const int paddingA[],
const in strideA[]) {
const int strideA[]) {
if (nbDims != 2) return CUDNN_STATUS_NOT_SUPPORTED;
if (paddingA[0] != 0 || paddingA[1] != 0) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetPoolingDescriptor(poolingDesc, mode,
......@@ -223,7 +223,7 @@ cudnnGetPoolingNdDescriptor(
int strideA[]) {
int win0, win1, str0, str1;
cudnnStatus_t err;
if (ndDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
if (nbDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
err = cudnnGetPoolingDescriptor(poolingDesc, mode, &win0, &win1,
&str0, &str1);
if (err != CUDNN_STATUS_SUCCESS) return err;
......
......@@ -1760,7 +1760,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
Subsampling stride (default: (1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad}
pad
(pad_h, pad_w) padding information.
(pad_h, pad_w) padding information.
pad_h is the number of zero-valued pixels added to each of the top and
bottom borders.
pad_w is the number of zero-valued pixels added to each of the left
......
......@@ -104,7 +104,7 @@ optdb.register('gpu_after_fusion',
'gpu')
# Register merge_optimizer as a global opt
gpu_optimizer.register('gpu_merge', theano.gof.opt.merge_optimizer,
gpu_optimizer.register('gpu_merge', theano.gof.opt.MergeOptimizer(),
'fast_run', 'fast_compile', final_opt=True)
......
......@@ -81,7 +81,7 @@ class CudaNdarrayType(Type):
raise TypeError('%s only supports dtype float32 for now. Tried '
'using dtype %s for variable %s' %
(self.__class__.__name__, dtype, name))
self.broadcastable = tuple(broadcastable)
self.broadcastable = tuple(bool(b) for b in broadcastable)
self.name = name
self.dtype_specs() # error checking is done there
......
......@@ -2673,7 +2673,7 @@ class Alloc(gof.Op):
const_shp = get_scalar_constant_value(s)
except NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
return sh, bcast
def make_node(self, value, *shape):
......@@ -6037,7 +6037,7 @@ class AllocEmpty(gof.Op):
const_shp = get_scalar_constant_value(s)
except NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
otype = TensorType(dtype=self.dtype, broadcastable=bcast)
output = otype()
return sh, output
......
......@@ -47,7 +47,6 @@ from theano.tensor.type import (values_eq_approx_remove_inf,
from theano.gof.opt import (Optimizer, pre_constant_merge,
pre_greedy_local_optimizer)
from theano.gof.opt import merge_optimizer
from theano.gof import toolbox
from theano.tensor.basic import get_scalar_constant_value, ShapeError, NotScalarConstantError
from six import StringIO
......@@ -452,8 +451,9 @@ def register_canonicalize(lopt, *tags, **kwargs):
return register_canonicalize(inner_lopt, lopt, *tags, **kwargs)
return register
else:
name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['canonicalize'].register(name, lopt, 'fast_run', *tags)
name = kwargs.pop('name', None) or lopt.__name__
compile.optdb['canonicalize'].register(name, lopt, 'fast_run',
*tags, **kwargs)
return lopt
......@@ -463,8 +463,9 @@ def register_stabilize(lopt, *tags, **kwargs):
return register_stabilize(inner_lopt, lopt, *tags, **kwargs)
return register
else:
name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
name = kwargs.pop('name', None) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run',
*tags, **kwargs)
return lopt
......@@ -474,9 +475,9 @@ def register_specialize(lopt, *tags, **kwargs):
return register_specialize(inner_lopt, lopt, *tags, **kwargs)
return register
else:
name = (kwargs and kwargs.pop('name')) or lopt.__name__
name = kwargs.pop('name', None) or lopt.__name__
compile.optdb['specialize'].register(name, lopt, 'fast_run',
*tags)
*tags, **kwargs)
return lopt
......@@ -502,11 +503,6 @@ def register_specialize_device(lopt, *tags, **kwargs):
return lopt
# Register merge_optimizer as a global opt during canonicalize
compile.optdb['canonicalize'].register('canon_merge', merge_optimizer,
'fast_run', final_opt=True)
#####################
# Dot optimizations #
#####################
......@@ -1414,6 +1410,172 @@ theano.compile.mode.optdb.register('ShapeOpt', ShapeOptimizer(),
0.1, 'fast_run', 'fast_compile')
def local_elemwise_alloc_op(ElemwiseOP, AllocOP, DimShuffleOP):
    # Factory: builds a local optimizer parameterized by the concrete
    # Elemwise/Alloc/DimShuffle op classes, so the same rewrite can be
    # instantiated for both the CPU and GPU variants of these ops.
    def local_elemwise_alloc(node):
        """
        elemwise(alloc(x, shp), ..., y.TensorType(BROADCAST CONDITION))
          -> elemwise(x, y.TensorType(BROADCAST CONDITION))

        elemwise(dimshuffle(alloc(x, shp)), ..., y.TensorType(BROADCAST CONDITION))
          -> elemwise(x.dimshuffle(...), y.TensorType(BROADCAST CONDITION))

        BROADCAST CONDITION: the condition is that one input that is not
        being optimized must have the same broadcast pattern as the
        output.

        We can replace the alloc with a dimshuffle, as the elemwise
        already has the shape info. The dimshuffle will be faster
        to execute.
        """
        if not isinstance(node.op, ElemwiseOP):
            return False

        if len(node.outputs) > 1:
            # Ensure all outputs have the same broadcast pattern.
            # This is a supposition that I'm not sure is always true.
            assert all([o.type.broadcastable ==
                        node.outputs[0].type.broadcastable for o in
                        node.outputs[1:]])

        # The broadcast pattern of the output must match the broadcast
        # pattern of at least one of the inputs.
        if not any([i.type.broadcastable ==
                    node.outputs[0].type.broadcastable for i in node.inputs]):
            return False

        def dimshuffled_alloc(i):
            # True iff `i` is produced by a DimShuffle whose input is
            # itself produced by an Alloc.
            return (isinstance(i.owner.op, DimShuffleOP) and
                    i.owner.inputs[0].owner and
                    isinstance(i.owner.inputs[0].owner.op, AllocOP))

        # At least one input must have an owner that is either a AllocOP or a
        # DimShuffleOP with an owner that is a AllocOP -- otherwise there is
        # nothing to optimize.
        if not any([i.owner and (isinstance(i.owner.op, AllocOP) or
                                 dimshuffled_alloc(i)) for i in node.inputs]):
            return False

        # Search for input that we can use as a baseline for the dimensions.
        assert_op_idx = -1
        for idx, i in enumerate(node.inputs):
            if i.type.broadcastable == node.outputs[0].type.broadcastable:
                # Prefer an input that is not a AllocOP nor a DimShuffleOP of a
                # AllocOP so that all allocs can be optimized.
                if not (i.owner and (isinstance(i.owner.op, AllocOP) or
                                     dimshuffled_alloc(i))):
                    assert_op_idx = idx
                    break

        # It may be the case that only AllocOP and DimShuffleOP of AllocOP exist.
        if assert_op_idx < 0:
            # We want to optimize as many allocs as possible. When
            # there is more than one, do all but one. l2 is the list of
            # inputs with an alloc or a dimshuffle of an alloc.
            l2 = [i for i in node.inputs
                  if (i.owner and (isinstance(i.owner.op, AllocOP) or
                                   dimshuffled_alloc(i)))]
            # If only 1 alloc or dimshuffle alloc, it is the one we
            # will use for the shape. So no alloc would be removed.
            if len(l2) > 1:
                # l contains inputs with alloc or dimshuffle alloc
                # only. Its length will always be at least one, as we
                # checked that before.
                l = [idx for idx, i in enumerate(node.inputs)
                     if i.broadcastable == node.outputs[0].broadcastable]
                assert_op_idx = l[0]  # The first one is as good as any to use.
            else:
                # Nothing would be optimized!
                return False

        assert_op = node.inputs[assert_op_idx]
        cmp_op = assert_op
        new_i = []
        same_shape = node.fgraph.shape_feature.same_shape
        for i in node.inputs:
            # Remove alloc
            if (i.owner and isinstance(i.owner.op, AllocOP) and
                    i.owner.inputs[0].type != i.owner.outputs[0].type):
                # when i.owner.inputs[0].type == i.owner.outputs[0].type we
                # will remove that alloc later
                assert i.type.ndim == cmp_op.ndim
                if (theano.config.experimental.local_alloc_elemwise_assert and
                        not same_shape(i, cmp_op)):
                    # Guard the rewrite with a runtime shape check on the
                    # non-broadcastable dimensions.
                    assert_op = assert_(assert_op,
                                        *[T.eq(i.shape[idx], cmp_op.shape[idx])
                                          for idx in xrange(i.type.ndim)
                                          if not i.type.broadcastable[idx]])
                new_i.append(i.owner.inputs[0])

            # Remove Alloc in DimShuffle
            elif i.owner and dimshuffled_alloc(i):
                assert i.type.ndim == cmp_op.type.ndim
                if theano.config.experimental.local_alloc_elemwise_assert:
                    # Only assert on dims not already proven equal by the
                    # shape feature.
                    assert_cond = [T.eq(i.shape[idx], cmp_op.shape[idx])
                                   for idx in xrange(i.type.ndim)
                                   if not i.type.broadcastable[idx] and
                                   not same_shape(i, cmp_op, idx, idx)]
                    if assert_cond:
                        assert_op = assert_(assert_op, *assert_cond)
                alloc_input = i.owner.inputs[0].owner.inputs[0]
                if alloc_input.ndim != i.owner.inputs[0].ndim:
                    # The alloc can add dimensions to the value.
                    # We add a dimshuffle to add them.
                    # We let later optimizations merge the multiple dimshuffles.
                    nb_dim_to_add = i.owner.inputs[0].ndim - alloc_input.ndim
                    alloc_input = alloc_input.dimshuffle(
                        ['x'] * nb_dim_to_add +
                        list(range(alloc_input.ndim)))

                # We need to keep the dimshuffle. It could swap axes or
                # add dimensions anywhere.
                r_i = i.owner.op(alloc_input)
                # Copy stack trace from i to new_i
                copy_stack_trace(i, r_i)
                new_i.append(r_i)
            else:
                new_i.append(i)
        new_i[assert_op_idx] = assert_op

        ret = node.op(*new_i, return_list=True)

        # Copy over stack trace from previous outputs to new outputs.
        copy_stack_trace(node.outputs, ret)
        return ret

    return local_elemwise_alloc
# TODO, global optimizer that lift the assert to the beginning of the graph.
# TODO, optimize all inputs when possible -- currently when all inputs have
# an alloc all but one is optimized.
local_elemwise_alloc = register_specialize(
gof.local_optimizer([T.Elemwise])(
local_elemwise_alloc_op(T.Elemwise, T.Alloc, T.DimShuffle)),
'local_alloc_elemwise')
# NOTE(review): this flag is deprecated -- is_valid only accepts truthy
# values, so setting it to False raises an error; exclusion via
# optimizer_excluding is the supported way to disable the optimization.
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
                                 "DEPRECATED: If True, enable the experimental"
                                 " optimization local_alloc_elemwise."
                                 " Generates error if not True. Use"
                                 " optimizer_excluding=local_alloc_elemwise"
                                 " to disable.",
                                 theano.configparser.BoolParam(
                                     True,
                                     is_valid=lambda x: x
                                 ),
                                 in_c_key=False)

# False could make the graph faster but not as safe.
theano.configparser.AddConfigVar(
    'experimental.local_alloc_elemwise_assert',
    "When the local_alloc_elemwise is applied, add"
    " an assert to highlight shape errors.",
    theano.configparser.BoolParam(True),
    in_c_key=False)
@gof.local_optimizer([T.Elemwise])
def local_fill_sink(node):
"""
......@@ -1443,7 +1605,6 @@ def local_fill_sink(node):
            # The newly created node c doesn't have 'clients',
            # so this iteration takes place with node.outputs[0]
replacements = {node.outputs[0]: c}
all_clients_replaced = True
for client, cl_idx in node.outputs[0].clients:
if (hasattr(client, 'op') and
isinstance(client.op, T.Elemwise) and
......@@ -1456,13 +1617,8 @@ def local_fill_sink(node):
new_client.owner.outputs[0].clients = client.outputs[0].clients
r = local_fill_sink.transform(new_client.owner)
if not r:
all_clients_replaced = False
continue
replacements.update(r)
else:
all_clients_replaced = False
if all_clients_replaced:
replacements.pop(node.outputs[0], None)
return replacements
register_canonicalize(local_fill_sink)
......@@ -1470,7 +1626,7 @@ register_canonicalize(local_fill_sink)
@register_specialize
@register_stabilize
@register_canonicalize
# @register_canonicalize # We make full pass after the canonizer phase.
@gof.local_optimizer([T.fill])
def local_fill_to_alloc(node):
"""fill(s,v) -> alloc(v, shape(s))
......@@ -1510,7 +1666,18 @@ def local_fill_to_alloc(node):
node,) # theano.printing.debugprint(node.outputs[0], file='str'))
return rval
# Register this after stabilize at 1.5 to make sure stabilize doesn't
# get affected by a less canonicalized graph due to alloc.
compile.optdb.register('local_fill_to_alloc',
in2out(local_fill_to_alloc),
1.51, 'fast_run')
# Needed to clean some extra alloc added by local_fill_to_alloc
compile.optdb.register('local_elemwise_alloc',
in2out(local_elemwise_alloc),
1.52, 'fast_run')
@register_canonicalize("fast_compile")
@gof.local_optimizer([T.fill])
def local_useless_fill(node):
"""fill(s,v) -> v
......@@ -1526,9 +1693,6 @@ def local_useless_fill(node):
# this is a useless fill, erase it.
# also, we don't need to copy over any stack traces here
return [v]
compile.optdb['canonicalize'].register('local_useless_fill',
in2out(local_useless_fill),
1.1, 'fast_compile')
@register_specialize
......@@ -2009,172 +2173,6 @@ compile.optdb['specialize'].register('local_remove_all_assert',
'unsafe',
use_db_name_as_tag=False)
def local_elemwise_alloc_op(ElemwiseOP, AllocOP, DimShuffleOP):
    """Build a local optimizer that removes Alloc inputs of Elemwise nodes.

    Parameterized on the op classes so the same logic can be reused for
    CPU and GPU variants of Elemwise/Alloc/DimShuffle.
    """
    def local_elemwise_alloc(node):
        """
        elemwise(alloc(x, shp), ..., y.TensorType(BROADCAST CONDITION))
          -> elemwise(x, y.TensorType(BROADCAST CONDITION))

        elemwise(dimshuffle(alloc(x, shp)), ..., y.TensorType(BROADCAST CONDITION))
          -> elemwise(x.dimshuffle(...), y.TensorType(BROADCAST CONDITION))

        BROADCAST CONDITION: the condition is that the one input that is
        not to be optimized must have the same broadcast pattern as the
        output.

        We can replace the alloc by a dimshuffle as the elemwise
        already has the shape info. The dimshuffle will be faster
        to execute.
        """
        if not isinstance(node.op, ElemwiseOP):
            return False

        if len(node.outputs) > 1:
            # Ensure all outputs have the same broadcast pattern.
            # This is a supposition that I'm not sure is always true.
            assert all([o.type.broadcastable ==
                        node.outputs[0].type.broadcastable for o in
                        node.outputs[1:]])

        # The broadcast pattern of the output must match the broadcast
        # pattern of at least one of the inputs.
        if not any([i.type.broadcastable ==
                    node.outputs[0].type.broadcastable for i in node.inputs]):
            return False

        def dimshuffled_alloc(i):
            # True when i is DimShuffleOP(AllocOP(...)).
            return (isinstance(i.owner.op, DimShuffleOP) and
                    i.owner.inputs[0].owner and
                    isinstance(i.owner.inputs[0].owner.op, AllocOP))

        # At least one input must have an owner that is either a AllocOP or a
        # DimShuffleOP with an owner that is a AllocOP -- otherwise there is
        # nothing to optimize.
        if not any([i.owner and (isinstance(i.owner.op, AllocOP) or
                                 dimshuffled_alloc(i)) for i in node.inputs]):
            return False

        # Search for an input that we can use as a baseline for the dimensions.
        assert_op_idx = -1
        for idx, i in enumerate(node.inputs):
            if i.type.broadcastable == node.outputs[0].type.broadcastable:
                # Prefer an input that is not a AllocOP nor a DimShuffleOP of a
                # AllocOP so that all allocs can be optimized.
                if not (i.owner and (isinstance(i.owner.op, AllocOP) or
                        dimshuffled_alloc(i))):
                    assert_op_idx = idx
                    break

        # It may be the case that only AllocOP and DimShuffleOP of AllocOP exist.
        if assert_op_idx < 0:
            # We want to optimize as many allocs as possible. When
            # there is more than one then do all but one.  l2 = number of
            # inputs with alloc or dimshuffle-of-alloc.
            l2 = [i for i in node.inputs
                  if (i.owner and (isinstance(i.owner.op, AllocOP) or
                                   dimshuffled_alloc(i)))]
            # If only 1 alloc or dimshuffle alloc, it is the one we
            # will use for the shape. So no alloc would be removed.
            if len(l2) > 1:
                # l contains inputs with alloc or dimshuffle alloc
                # only.  Its length will always be at least one, as we
                # checked that before.
                l = [idx for idx, i in enumerate(node.inputs)
                     if i.broadcastable == node.outputs[0].broadcastable]
                assert_op_idx = l[0]  # The first one is as good as any to use.
            else:
                # Nothing would be optimized!
                return False

        # The baseline input; shape asserts (if enabled) get chained onto it.
        assert_op = node.inputs[assert_op_idx]
        cmp_op = assert_op
        new_i = []
        same_shape = node.fgraph.shape_feature.same_shape
        for i in node.inputs:
            # Remove alloc
            if (i.owner and isinstance(i.owner.op, AllocOP) and
                    i.owner.inputs[0].type != i.owner.outputs[0].type):
                # when i.owner.inputs[0].type == i.owner.outputs[0].type we
                # will remove that alloc later
                assert i.type.ndim == cmp_op.ndim
                if (theano.config.experimental.local_alloc_elemwise_assert and
                        not same_shape(i, cmp_op)):
                    # Guard non-broadcastable dims with runtime shape checks,
                    # unless shape inference already proved them equal.
                    assert_op = assert_(assert_op,
                                        *[T.eq(i.shape[idx], cmp_op.shape[idx])
                                          for idx in xrange(i.type.ndim)
                                          if not i.type.broadcastable[idx]])
                new_i.append(i.owner.inputs[0])

            # Remove Alloc in DimShuffle
            elif i.owner and dimshuffled_alloc(i):
                assert i.type.ndim == cmp_op.type.ndim
                if theano.config.experimental.local_alloc_elemwise_assert:
                    assert_cond = [T.eq(i.shape[idx], cmp_op.shape[idx])
                                   for idx in xrange(i.type.ndim)
                                   if not i.type.broadcastable[idx] and
                                   not same_shape(i, cmp_op, idx, idx)]
                    if assert_cond:
                        assert_op = assert_(assert_op, *assert_cond)
                alloc_input = i.owner.inputs[0].owner.inputs[0]
                if alloc_input.ndim != i.owner.inputs[0].ndim:
                    # The alloc can add dimensions to the value.
                    # We add a dimshuffle to add them.
                    # We let later optimization merge the multiple dimshuffles.
                    nb_dim_to_add = i.owner.inputs[0].ndim - alloc_input.ndim
                    alloc_input = alloc_input.dimshuffle(
                        ['x'] * nb_dim_to_add +
                        list(range(alloc_input.ndim)))

                # We need to keep the dimshuffle. It could swap axes or
                # add dimensions anywhere.
                r_i = i.owner.op(alloc_input)
                # Copy stack trace from i to new r_i
                copy_stack_trace(i, r_i)
                new_i.append(r_i)
            else:
                # Input left untouched.
                new_i.append(i)
        new_i[assert_op_idx] = assert_op

        ret = node.op(*new_i, return_list=True)

        # Copy over stack trace from previous outputs to new outputs.
        copy_stack_trace(node.outputs, ret)
        return ret

    return local_elemwise_alloc
# TODO: global optimizer that lifts the assert to the beginning of the graph.
# TODO: optimize all inputs when possible -- currently when all inputs have
# an alloc, all but one are optimized.

# Instantiate the generic factory for the CPU ops and register it in the
# 'specialize' phase under the tag 'local_alloc_elemwise'.
local_elemwise_alloc = register_specialize(
    gof.local_optimizer([T.Elemwise])(
        local_elemwise_alloc_op(T.Elemwise, T.Alloc, T.DimShuffle)),
    'local_alloc_elemwise')
# Deprecated flag: the optimization is now always enabled; use
# optimizer_excluding=local_alloc_elemwise to turn it off instead.
# is_valid=lambda x: x rejects any falsy value, i.e. setting it to
# False raises an error.
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
                                 "DEPRECATED: If True, enable the experimental"
                                 " optimization local_alloc_elemwise."
                                 " Generates error if not True. Use"
                                 " optimizer_excluding=local_alloc_elemwise"
                                 " to disable.",
                                 theano.configparser.BoolParam(
                                     True,
                                     is_valid=lambda x: x
                                 ),
                                 in_c_key=False)

# False could make the graph faster but not as safe.
theano.configparser.AddConfigVar(
    'experimental.local_alloc_elemwise_assert',
    "When the local_alloc_elemwise is applied, add"
    " an assert to highlight shape errors.",
    theano.configparser.BoolParam(True),
    in_c_key=False)
#######################
# Constant Canonicalization
############################
......@@ -4018,7 +4016,9 @@ class Canonizer(gof.LocalOptimizer):
"""
if isinstance(v, Variable):
try:
return get_scalar_constant_value(v)
# As the constant folding is in the canonicalize phase,
# We don't need to check all the graph each time.
return get_scalar_constant_value(v, only_process_constants=True)
except NotScalarConstantError:
return None
else:
......@@ -5467,9 +5467,6 @@ def local_greedy_distributor(node):
return [rval]
@register_canonicalize('fast_compile')
@register_stabilize('fast_compile')
@register_specialize('fast_compile')
@gof.local_optimizer(None)
def constant_folding(node):
for input in node.inputs:
......@@ -5519,6 +5516,13 @@ def constant_folding(node):
return rval
# Topological (in2out) wrapper around constant_folding; ignore_newtrees=True
# means nodes introduced by the folding itself are not revisited.
topo_constant_folding = in2out(constant_folding, ignore_newtrees=True,
                               name="topo_constant_folding")
# Registered with final_opt=True so it runs as a final pass of each
# equilibrium phase (canonicalize, stabilize, specialize), including
# under fast_compile.
register_canonicalize(topo_constant_folding, 'fast_compile', final_opt=True)
register_stabilize(topo_constant_folding, 'fast_compile', final_opt=True)
register_specialize(topo_constant_folding, 'fast_compile', final_opt=True)
def _is_1(expr):
"""
......@@ -5758,7 +5762,7 @@ def local_log_erfc(node):
# sqrt(pi)*-x/(1-1/(2*x**2)+3/(4*x**4)-15/(8*x**6)))
# for float64: threshold=26.63 see at the end of the fct for the explanation
# for float32: threshold=9.3 see at the end of the fct for the explanation
# TODO: remove the contraint that there are only 2 inputs to mul and exp(x**2)
# TODO: remove the constraint that there are only 2 inputs to exp(x**2)
# is the second.
# TODO: at the test point 10 in float32, there is instability in the original
# value. The original gives -30.0, the stab -20.1 and in float64 -18.1.
......@@ -5779,20 +5783,23 @@ def local_grad_log_erfc_neg(node):
# The mul is optional.
if node.inputs[0].owner.op != T.mul:
mul = None
y = 1
y = []
if not node.inputs[0].owner or node.inputs[0].owner.op != T.exp:
return False
exp = node.inputs[0]
else:
mul = node.inputs[0]
if mul.owner.inputs[0].owner or len(mul.owner.inputs) != 2:
return False
y = mul.owner.inputs[0]
if (not mul.owner.inputs[1].owner or
mul.owner.inputs[1].owner.op != T.exp):
return False
exp = mul.owner.inputs[1]
exp = None
for idx, inp in enumerate(mul.owner.inputs):
if inp.owner and inp.owner.op == T.exp:
exp = inp
break
if len(mul.owner.inputs) == 2:
y = [mul.owner.inputs[1 - idx]]
else:
y = mul.owner.inputs[:]
del y[idx]
del mul
if not exp.owner.inputs[0].owner:
return False
......@@ -5894,9 +5901,10 @@ def local_grad_log_erfc_neg(node):
# threshold = 10.1
elif x.dtype == 'float64':
threshold = 26.641747557
ret = T.switch(x < threshold, true_div_no_mul, stab_value) * y
ret = T.switch(x < threshold, true_div_no_mul, stab_value)
if y:
ret = T.mul(ret, *y)
ret.values_eq_approx = values_eq_approx_remove_inf_nan
return [ret]
"""
The libm used for the test is amdlibm
......
......@@ -256,7 +256,10 @@ class DownsampleFactorMax(Op):
raise TypeError()
# TODO: consider restricting the dtype?
x = tensor.as_tensor_variable(x)
return gof.Apply(self, [x], [x.type()])
# If the input shape are broadcastable we can have 0 in the output shape
broad = x.broadcastable[:2] + (False, False)
out = tensor.TensorType(x.dtype, broad)
return gof.Apply(self, [x], [out()])
def perform(self, node, inp, out):
x, = inp
......
......@@ -801,6 +801,16 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
[image_val, maxout_val, gz_val],
MaxPoolGrad,
warn=False)
# checking with broadcastable input
image = tensor.tensor(dtype='float64',
broadcastable=(False, False, True, True))
image_val = rng.rand(4, 6, 1, 1)
self._compile_and_check(
[image],
[DownsampleFactorMax((2, 2),
ignore_border=True,
padding=(0, 0))(image)],
[image_val], DownsampleFactorMax)
def test_opt_max_to_average(self):
im = theano.tensor.tensor4()
......
......@@ -481,7 +481,7 @@ class test_canonize(unittest.TestCase):
mode = compile.mode.get_default_mode()
opt = gof.Query(["canonicalize"])
opt = opt.including('ShapeOpt')
opt = opt.including('ShapeOpt', 'local_fill_to_alloc')
opt = opt.excluding(
'local_elemwise_fusion')
mode = mode.__class__(linker=mode.linker, optimizer=opt)
......@@ -4021,7 +4021,8 @@ class T_Rebroadcast(unittest.TestCase):
class T_useless_elemwise(unittest.TestCase):
def setUp(self):
self.mode = theano.compile.get_default_mode().including('canonicalize')
self.mode = theano.compile.get_default_mode().including(
'canonicalize', 'local_fill_to_alloc')
def test_eq(self):
x = T.dmatrix()
......@@ -4545,7 +4546,7 @@ class T_local_erfc(unittest.TestCase):
# test that we work without the mul
f = theano.function([x], T.exp(T.neg(T.sqr(x))) / T.erfc(x), mode=mode)
assert len(f.maker.fgraph.apply_nodes) == 23, len(f.maker.fgraph.apply_nodes)
assert len(f.maker.fgraph.apply_nodes) == 22, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert all(numpy.isfinite(f(val)))
......@@ -4558,7 +4559,7 @@ class T_local_erfc(unittest.TestCase):
# test that we work without the sqr and neg
f = theano.function([x], T.exp(T.mul(-1, x, x)) / T.erfc(x), mode=mode)
assert len(f.maker.fgraph.apply_nodes) == 22, len(f.maker.fgraph.apply_nodes)
assert len(f.maker.fgraph.apply_nodes) == 21, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert all(numpy.isfinite(f(val)))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论