Merge pull request #4524 from nouiz/opt

Opt related changes.

Merge pull request #4524 from nouiz/opt
3c70348f · Frédéric Bastien · 0d844076 · 96430899 · 3c70348f · 3c70348f
--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -84,10 +84,15 @@ def _atexit_print_fn():
                    cum_attr[key] = val
            if cum.optimizer_profile and ps.optimizer_profile:
-                merge = cum.optimizer_profile[0].merge_profile(
+                try:
-                    cum.optimizer_profile[1],
+                    merge = cum.optimizer_profile[0].merge_profile(
-                    ps.optimizer_profile[1])
+                        cum.optimizer_profile[1],
-                cum.optimizer_profile = (cum.optimizer_profile[0], merge)
+                        ps.optimizer_profile[1])
+                    cum.optimizer_profile = (cum.optimizer_profile[0], merge)
+                except Exception as e:
+                    print("Got an exception while merging profile")
+                    print(e)
+                    cum.optimizer_profile = None
            else:
                cum.optimizer_profile = None

--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -220,8 +220,10 @@ class SeqOptimizer(Optimizer, list):
        if fgraph.profile:
            validate_before = fgraph.profile.validate_time
            sub_validate_time = [validate_before]
+            callbacks_before = fgraph.execute_callbacks_times.copy()
        else:
            sub_validate_time = []
+            callbacks_before = []
        callback_before = fgraph.execute_callbacks_time
        nb_node_before = len(fgraph.apply_nodes)
        sub_profs = []
@@ -249,12 +251,22 @@ class SeqOptimizer(Optimizer, list):
        if fgraph.profile:
            validate_time = fgraph.profile.validate_time - validate_before
+            callbacks_time = {}
+            for k, v in iteritems(fgraph.execute_callbacks_times):
+                if k in callbacks_before:
+                    t = v - callbacks_before[k]
+                    if t > 0:
+                        callbacks_time[k] = t
+                else:
+                    callbacks_time[k] = v
        else:
            validate_time = None
+            callbacks_time = {}
        callback_time = fgraph.execute_callbacks_time - callback_before
        return (self, l, validate_time, callback_time, nb_node_before,
                len(fgraph.apply_nodes), sub_profs, sub_validate_time,
-                nb_nodes)
+                nb_nodes, callbacks_time)
    def __str__(self):
        return "SeqOpt(%s)" % list.__str__(self)
@@ -274,8 +286,9 @@ class SeqOptimizer(Optimizer, list):
    @staticmethod
    def print_profile(stream, prof, level=0):
-        (opts, prof, validate_time, callback_time, nb_node_before,
+        (opts, prof, validate_time, callback_time,
-         nb_node_after, sub_profs, sub_validate_time, nb_nodes) = prof
+         nb_node_before, nb_node_after, sub_profs, sub_validate_time,
+         nb_nodes, callbacks_time) = prof
        blanc = ('    ' * level)
        print(blanc, "SeqOptimizer", end=' ', file=stream)
@@ -287,9 +300,20 @@ class SeqOptimizer(Optimizer, list):
               " before/after optimization" % (
                   sum(prof), nb_node_before, nb_node_after)), file=stream)
        print(blanc, "  %.3fs for callback" % (callback_time), file=stream)
-        print(blanc, "      %.3fs for fgraph.validate()" % (validate_time), file=stream)
+        print(blanc, "      %.3fs for fgraph.validate()" % (validate_time),
+              file=stream)
+        if callback_time > 1:
+            print(blanc, "  callbacks_time", file=stream)
+            for i in sorted(iteritems(callbacks_time), key=lambda a: -a[1]):
+                if i[1] > 0:
+                    # We want to have the __str__ called, so we can't
+                    # just print i.
+                    print(blanc, "      ", i[0], ',', i[1], file=stream)
        if level == 0:
-            print(blanc, "  time      - (name, class, index, nodes before, nodes after) - validate time", file=stream)
+            print(blanc,
+                  "  time      - (name, class, index, nodes before, nodes after) - validate time",
+                  file=stream)
        ll = []
        for opt in opts:
            if hasattr(opt, "__name__"):
@@ -298,7 +322,7 @@ class SeqOptimizer(Optimizer, list):
                name = opt.name
            idx = opts.index(opt)
            ll.append((name, opt.__class__.__name__,
-                       idx) + nb_nodes[idx])
+                       idx))
        lll = sorted(zip(prof, ll, nb_nodes), key=lambda a: a[0])
        for (t, opt, nb_n) in lll[::-1]:
@@ -375,6 +399,7 @@ class SeqOptimizer(Optimizer, list):
            new_sub_profile.append(p[6][idx])
        new_opt = SeqOptimizer(*new_l)
+        new_callbacks_times = merge_dict(prof1[9], prof2[9])
        # We need to assert based on the name as we merge also based on
        # the name.
        assert set([l.name for l in prof1[0]]).issubset(
@@ -384,7 +409,8 @@ class SeqOptimizer(Optimizer, list):
        assert len(new_t) == len(new_opt) == len(new_sub_profile)
        return (new_opt, new_t, prof1[2] + prof2[2],
                prof1[3] + prof2[3],
-                -1, -1, new_sub_profile, [])
+                -1, -1, new_sub_profile, [],
+                new_callbacks_times)
 class _metadict:
@@ -838,7 +864,9 @@ class MergeOptimizer(Optimizer):
            callbacks_time = {}
            for k, v in iteritems(fgraph.execute_callbacks_times):
                if k in callbacks_before:
-                    callbacks_time[k] = v - callbacks_before[k]
+                    t = v - callbacks_before[k]
+                    if t > 0:
+                        callbacks_time[k] = t
                else:
                    callbacks_time[k] = v
        else:
@@ -868,7 +896,9 @@ class MergeOptimizer(Optimizer):
            print(blanc, "  callbacks_time", file=stream)
            for i in sorted(iteritems(callbacks_time), key=lambda a: a[1]):
                if i[1] > 0:
-                    print(i)
+                    # We want to have the __str__ called, so we can't
+                    # just print i.
+                    print(blanc, "      ", i[0], ',', i[1], file=stream)
    @staticmethod
    def merge_profile(prof1, prof2):
@@ -1591,10 +1621,14 @@ class PatternSub(LocalOptimizer):
 # Use the following classes to apply LocalOptimizers
 class Updater:
-    def __init__(self, importer, pruner, chin):
+    def __init__(self, importer, pruner, chin, name=None):
        self.importer = importer
        self.pruner = pruner
        self.chin = chin
+        self.name = name
+    def __str__(self):
+        return "Updater{%s}" % str(self.name)
    def on_import(self, fgraph, node, reason):
        if self.importer:
@@ -1694,7 +1728,7 @@ class NavigatorOptimizer(Optimizer):
            self.ignore_newtrees = ignore_newtrees
        self.failure_callback = failure_callback
-    def attach_updater(self, fgraph, importer, pruner, chin=None):
+    def attach_updater(self, fgraph, importer, pruner, chin=None, name=None):
        """
        Install some FunctionGraph listeners to help the navigator deal with
        the ignore_trees-related functionality.
@@ -1709,6 +1743,8 @@ class NavigatorOptimizer(Optimizer):
            from the graph.
        chin
            "on change input" called whenever a node's inputs change.
+        name
+            name of the Updater to attach.
        Returns
        -------
@@ -1723,7 +1759,7 @@ class NavigatorOptimizer(Optimizer):
        if importer is None and pruner is None:
            return None
-        u = Updater(importer, pruner, chin)
+        u = Updater(importer, pruner, chin, name=name)
        fgraph.attach_feature(u)
        return u
@@ -1875,8 +1911,8 @@ class TopoOptimizer(NavigatorOptimizer):
                    q.remove(node)
                except ValueError:
                    pass
+        u = self.attach_updater(fgraph, importer, pruner,
-        u = self.attach_updater(fgraph, importer, pruner)
+                                name=getattr(self, 'name', None))
        nb = 0
        try:
            t0 = time.time()
@@ -1888,10 +1924,8 @@ class TopoOptimizer(NavigatorOptimizer):
                current_node = node
                nb += self.process_node(fgraph, node)
            loop_t = time.time() - t0
-        except Exception:
+        finally:
            self.detach_updater(fgraph, u)
-            raise
-        self.detach_updater(fgraph, u)
        callback_time = fgraph.execute_callbacks_time - callback_before
        nb_nodes_end = len(fgraph.apply_nodes)
@@ -1950,16 +1984,15 @@ class OpKeyOptimizer(NavigatorOptimizer):
                    q.remove(node)
                except ValueError:
                    pass
-        u = self.attach_updater(fgraph, importer, pruner)
+        u = self.attach_updater(fgraph, importer, pruner,
+                                name=getattr(self, 'name', None))
        try:
            while q:
                node = q.pop()
                current_node = node
                self.process_node(fgraph, node)
-        except Exception:
+        finally:
            self.detach_updater(fgraph, u)
-            raise
-        self.detach_updater(fgraph, u)
    def add_requirements(self, fgraph):
        """
@@ -1990,6 +2023,9 @@ class ChangeTracker:
    def on_attach(self, fgraph):
        fgraph.change_tracker = self
+    def on_detach(self, fgraph):
+        del fgraph.change_tracker
 def merge_dict(d1, d2):
    """
@@ -2033,6 +2069,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                 optimizers,
                 failure_callback=None,
                 ignore_newtrees=True,
+                 tracks_on_change_inputs=False,
                 max_use_ratio=None,
                 final_optimizers=None,
                 cleanup_optimizers=None):
@@ -2045,6 +2082,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
        self.global_optimizers = []
        self.final_optimizers = []
        self.cleanup_optimizers = []
+        self.tracks_on_change_inputs = tracks_on_change_inputs
        for opt in optimizers:
            if isinstance(opt, LocalOptimizer):
@@ -2191,8 +2229,14 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                        q.remove(node)
                    except ValueError:
                        pass
+            chin = None
-            u = self.attach_updater(fgraph, importer, pruner)
+            if self.tracks_on_change_inputs:
+                def chin(node, i, r, new_r, reason):
+                    if node is not current_node and not isinstance(node, str):
+                        q.append(node)
+            u = self.attach_updater(fgraph, importer, pruner,
+                                    chin=chin,
+                                    name=getattr(self, 'name', None))
            try:
                while q:
                    node = q.pop()

--- a/theano/gof/optdb.py
+++ b/theano/gof/optdb.py
@@ -244,16 +244,26 @@ class EquilibriumDB(DB):
        optimization application. This could result in less fgraph iterations,
        but this doesn't mean it will be faster globally.
+    tracks_on_change_inputs
+        If True, we will re-apply local opt on nodes whose inputs
+        changed during local optimization application. This could
+        result in fewer fgraph iterations, but this doesn't mean it
+        will be faster globally.
    Notes
    -----
    We can put LocalOptimizer and Optimizer as EquilibriumOptimizer
    suppor both.
+    It is probably not a good idea to have ignore_newtrees=False and
+    tracks_on_change_inputs=True.
    """
-    def __init__(self, ignore_newtrees=True):
+    def __init__(self, ignore_newtrees=True, tracks_on_change_inputs=False):
        super(EquilibriumDB, self).__init__()
        self.ignore_newtrees = ignore_newtrees
+        self.tracks_on_change_inputs = tracks_on_change_inputs
        self.__final__ = {}
        self.__cleanup__ = {}
@@ -281,6 +291,7 @@ class EquilibriumDB(DB):
            opts,
            max_use_ratio=config.optdb.max_use_ratio,
            ignore_newtrees=self.ignore_newtrees,
+            tracks_on_change_inputs=self.tracks_on_change_inputs,
            failure_callback=opt.NavigatorOptimizer.warn_inplace,
            final_optimizers=final_opts,
            cleanup_optimizers=cleanup_opts)

--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -1493,7 +1493,7 @@ def local_dnn_convi_output_merge(node, *inputs):
    return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
-@register_opt('cudnn')
+@register_opt('cudnn', 'fast_compile')
 @op_lifter([Pool])
 def local_pool_dnn_alternative(node, ctx_name):
    if not dnn_available(ctx_name):
@@ -1509,7 +1509,7 @@ def local_pool_dnn_alternative(node, ctx_name):
    return dnn_pool(gpu_contiguous(img), ds, stride=stride, pad=pad, mode=mode)
-@register_opt('cudnn')
+@register_opt('cudnn', 'fast_compile')
 @op_lifter([MaxPoolGrad])
 def local_pool_dnn_grad_stride(node, ctx_name):
    if not dnn_available(ctx_name):
@@ -1533,7 +1533,7 @@ def local_pool_dnn_grad_stride(node, ctx_name):
                                     pad)
-@register_opt('cudnn')
+@register_opt('cudnn', 'fast_compile')
 @op_lifter([AveragePoolGrad])
 def local_avg_pool_dnn_grad_stride(node, ctx_name):
    if not dnn_available(ctx_name):
@@ -1556,7 +1556,7 @@ def local_avg_pool_dnn_grad_stride(node, ctx_name):
    return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), cg, cg, ds, st, pad)
-@register_opt('cudnn')
+@register_opt('cudnn', 'fast_compile')
 @local_optimizer([GpuSoftmax])
 def local_softmax_dnn(node):
    if isinstance(node.op, GpuSoftmax):
@@ -1569,7 +1569,7 @@ def local_softmax_dnn(node):
        return [out]
-@register_opt('cudnn')
+@register_opt('cudnn', 'stabilize')
 @local_optimizer([GpuElemwise])
 def local_log_softmax_dnn(node):
    # This looks for GpuDnnSoftmax so we know that we have cudnn.
@@ -1586,7 +1586,7 @@ def local_log_softmax_dnn(node):
        return [new_softmax(softmax_node.inputs[0])]
-@register_opt('cudnn')
+@register_opt('cudnn', 'fast_compile')
 @op_lifter([LogSoftmax])
 def local_logsoftmax_to_dnn(node, ctx_name):
    # Transform the input in the format expected by GpuDnnSoftmax
@@ -1624,7 +1624,7 @@ class NoCuDNNRaise(Optimizer):
 gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
-@register_opt('cudnn')
+@register_opt('cudnn', 'fast_compile')
 @op_lifter([SoftmaxGrad])
 def local_softmax_dnn_grad(node, ctx_name):
    if not dnn_available(ctx_name):

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -711,18 +711,14 @@ def local_gpua_careduce(node, context_name):
                    assert reduce_mask[a] == 0
                    reduce_mask[a] = 1
-            shape_of = node.fgraph.shape_feature.shape_of
+            new_in_shp = [shape_i(x, 0)]
-            x_shape = shape_of[x]
-            new_in_shp = [x_shape[0]]
            new_mask = [reduce_mask[0]]
            for i in xrange(1, x.type.ndim):
                if reduce_mask[i] == reduce_mask[i - 1]:
-                    new_in_shp[-1] *= x_shape[i]
+                    new_in_shp[-1] *= shape_i(x, i)
                else:
                    new_mask.append(reduce_mask[i])
-                    new_in_shp.append(x_shape[i])
+                    new_in_shp.append(shape_i(x, i))
            new_axis = []
            for idx, m in enumerate(new_mask):
                if m == 1:
@@ -744,8 +740,12 @@ def local_gpua_careduce(node, context_name):
                    greduce(gpu_reshaped_x))
                if reduce_reshaped_x.ndim != node.outputs[0].ndim:
+                    out_shp = []
+                    for i in range(x.ndim):
+                        if i not in node.op.axis:
+                            out_shp.append(shape_i(x, i))
                    unreshaped_reduce = reduce_reshaped_x.reshape(
-                        tensor.stack(shape_of[node.outputs[0]]))
+                        tensor.stack(out_shp))
                else:
                    unreshaped_reduce = reduce_reshaped_x
                return [unreshaped_reduce]

--- a/theano/misc/check_blas.py
+++ b/theano/misc/check_blas.py
@@ -249,6 +249,7 @@ if __name__ == "__main__":
        cuda version      7.5    7.0    6.5
        gpu
+        M40               0.47s
        k80               0.96s
        K6000/NOECC              0.69s
        K40                             0.88s

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -2526,7 +2526,8 @@ if True:
            out = as_cuda_ndarray_variable(out.dimshuffle(0, 1))
            return [out]
-    @register_opt('cudnn')
+    @register_opt('cudnn', 'stabilize', 'fast_compile')
+    # We put fast_compile as otherwise it won't be on the GPU.
    @local_optimizer([GpuElemwise, LogSoftmax])
    def local_log_softmax_dnn(node):
        # The log-softmax implementation is only available starting at cuDNN V3

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -14,6 +14,7 @@ from . import dnn
 import theano
 from theano import scalar as scal
 from theano import config, tensor, gof
+from theano.compile.ops import shape_i
 import theano.ifelse
 import theano.tensor.signal.pool
 import theano.tensor.nnet
@@ -900,18 +901,14 @@ def local_gpu_careduce(node):
                    # to make them a single dimension, do the reduction, and
                    # then reshape to get them back.
-                    shape_of = node.fgraph.shape_feature.shape_of
+                    new_in_shp = [shape_i(x, 0)]
-                    x_shape = shape_of[x]
-                    new_in_shp = [x_shape[0]]
                    new_mask = [reduce_mask[0]]
                    for i in xrange(1, x.type.ndim):
                        if reduce_mask[i] == reduce_mask[i - 1]:
-                            new_in_shp[-1] *= x_shape[i]
+                            new_in_shp[-1] *= shape_i(x, i)
                        else:
                            new_mask.append(reduce_mask[i])
-                            new_in_shp.append(x_shape[i])
+                            new_in_shp.append(shape_i(x, i))
                    new_greduce = GpuCAReduce(new_mask, scalar_op)
                    new_x = x.reshape(tensor.stack(new_in_shp))
@@ -936,8 +933,11 @@ def local_gpu_careduce(node):
                    # Restore the expected shape of the output
                    if rval.ndim != out.ndim:
-                        rval = rval.reshape(
+                        out_shp = []
-                            tensor.stack(shape_of[out]))
+                        for i in range(x.ndim):
+                            if i not in node.op.axis:
+                                out_shp.append(shape_i(x, i))
+                        rval = rval.reshape(tensor.stack(out_shp))
                if rval.type == out.type:
                    return [rval]

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -1436,7 +1436,8 @@ class GemmOptimizer(Optimizer):
            if new_node is not node:
                nodelist.append(new_node)
-        u = theano.gof.opt.Updater(on_import, None, None)
+        u = theano.gof.opt.Updater(on_import, None, None,
+                                   name="GemmOptimizer")
        fgraph.attach_feature(u)
        while did_something:
            nb_iter += 1

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -1260,6 +1260,12 @@ class ShapeFeature(object):
        for node in fgraph.toposort():
            self.on_import(fgraph, node, reason='on_attach')
+    def on_detach(self, fgraph):
+        self.shape_of = {}
+        self.scheduled = {}
+        self.shape_of_reverse_index = {}
+        del fgraph.shape_feature
    def on_import(self, fgraph, node, reason):
        if node.outputs[0] in self.shape_of:
            # this is a revert, not really an import
@@ -1436,10 +1442,23 @@ class ShapeOptimizer(Optimizer):
    def apply(self, fgraph):
        pass
+class UnShapeOptimizer(Optimizer):
+    """Optimizer that removes the ShapeFeature from an fgraph's features."""
+    def apply(self, fgraph):
+        for feature in fgraph._features:
+            if isinstance(feature, ShapeFeature):
+                fgraph.remove_feature(feature)
 # Register it after merge1 optimization at 0. We don't want to track
 # the shape of merged node.
 theano.compile.mode.optdb.register('ShapeOpt', ShapeOptimizer(),
                                   0.1, 'fast_run', 'fast_compile')
+# Not enabled by default for now. Some crossentropy opts use the
+# shape_feature. They are at step 2.01. uncanonicalize is at step 3.
+# After it comes 48.5, which moves to the gpu. So 10 seems reasonable.
+theano.compile.mode.optdb.register('UnShapeOpt', UnShapeOptimizer(),
+                                   10)
 def local_elemwise_alloc_op(ElemwiseOP, AllocOP, DimShuffleOP):