提交 bb096349 authored 作者: lamblin's avatar lamblin

Merge pull request #666 from nouiz/time_opt

Time opt
...@@ -994,11 +994,16 @@ class FunctionMaker(object): ...@@ -994,11 +994,16 @@ class FunctionMaker(object):
# we allow ProfileMode to provide a ProfileStats object # we allow ProfileMode to provide a ProfileStats object
# using this somewhat awkward mechanism. # using this somewhat awkward mechanism.
mode_profile = getattr(mode, 'profile', None) mode_profile = getattr(mode, 'profile', None)
if (profile is not None) and (mode_profile is not None): if (profile is not None and
profile is not False and
mode_profile is not None):
raise TypeError( raise TypeError(
'profile passed via both "mode" and "profile" arguments') 'profile passed via both "mode" and "profile" arguments')
self.profile = profile = profile or mode_profile self.profile = profile = profile or mode_profile
if profile:
# We preload the cache here to don't have its timming
# included in optimization that compile function.
theano.gof.cc.get_module_cache()
# Handle the case where inputs and/or outputs is a single Variable (not in a list) # Handle the case where inputs and/or outputs is a single Variable (not in a list)
self.orig_outputs = outputs self.orig_outputs = outputs
unpack_single = False unpack_single = False
...@@ -1030,6 +1035,8 @@ class FunctionMaker(object): ...@@ -1030,6 +1035,8 @@ class FunctionMaker(object):
# make the env (copies the graph, creates NEW INPUT AND OUTPUT VARIABLES) # make the env (copies the graph, creates NEW INPUT AND OUTPUT VARIABLES)
env, additional_outputs = std_env(expanded_inputs, outputs, accept_inplace) env, additional_outputs = std_env(expanded_inputs, outputs, accept_inplace)
env.profile = profile
self.env = env self.env = env
# Fetch the optimizer and linker # Fetch the optimizer and linker
...@@ -1042,13 +1049,15 @@ class FunctionMaker(object): ...@@ -1042,13 +1049,15 @@ class FunctionMaker(object):
theano.config.compute_test_value = "off" theano.config.compute_test_value = "off"
gof.Op.add_stack_trace_on_call = False gof.Op.add_stack_trace_on_call = False
start_optimizer = time.time() start_optimizer = time.time()
optimizer(env) optimizer_profile = optimizer(env)
end_optimizer = time.time() end_optimizer = time.time()
opt_time = end_optimizer - start_optimizer opt_time = end_optimizer - start_optimizer
mode.optimizer_time += opt_time mode.optimizer_time += opt_time
if profile: if profile:
profile.optimizer_time += opt_time profile.optimizer_time += opt_time
if theano.config.profile_optimizer:
profile.optimizer_profile = (optimizer, optimizer_profile)
_logger.debug('Optimizing took %f seconds', opt_time) _logger.debug('Optimizing took %f seconds', opt_time)
#Add deep copy to respect the memory interface #Add deep copy to respect the memory interface
......
...@@ -88,6 +88,10 @@ OPT_FAST_RUN_STABLE = OPT_FAST_RUN.requiring('stable') ...@@ -88,6 +88,10 @@ OPT_FAST_RUN_STABLE = OPT_FAST_RUN.requiring('stable')
OPT_FAST_COMPILE = gof.Query(include=['fast_compile']) OPT_FAST_COMPILE = gof.Query(include=['fast_compile'])
OPT_STABILIZE = gof.Query(include=['fast_run']) OPT_STABILIZE = gof.Query(include=['fast_run'])
OPT_STABILIZE.position_cutoff = 1.5000001 OPT_STABILIZE.position_cutoff = 1.5000001
OPT_FAST_RUN.name = 'OPT_FAST_RUN'
OPT_FAST_RUN_STABLE.name = 'OPT_FAST_RUN_STABLE'
OPT_FAST_COMPILE.name = 'OPT_FAST_COMPILE'
OPT_STABILIZE.name = 'OPT_STABILIZE'
predefined_optimizers = { predefined_optimizers = {
None: (lambda env: None), None: (lambda env: None),
......
...@@ -38,12 +38,14 @@ AddConfigVar('profiling.time_thunks', ...@@ -38,12 +38,14 @@ AddConfigVar('profiling.time_thunks',
def _atexit_print_fn(): def _atexit_print_fn():
"""Print ProfileStat objects in _atexit_print_list to _atexit_print_file """Print ProfileStat objects in _atexit_print_list to _atexit_print_file
""" """
printed = 0
for ps in _atexit_print_list: for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0: if ps.fct_callcount or ps.compile_time > 0:
ps.summary(file=_atexit_print_file) ps.summary(file=_atexit_print_file)
printed += 1
else: else:
print 'Skipping empty Profile' print 'Skipping empty Profile'
if len(_atexit_print_list) > 1: if printed > 1:
# Make a global profile # Make a global profile
cum = copy.copy(_atexit_print_list[0]) cum = copy.copy(_atexit_print_list[0])
cum.message = "Sum of all printed profiles at exit" cum.message = "Sum of all printed profiles at exit"
...@@ -51,14 +53,26 @@ def _atexit_print_fn(): ...@@ -51,14 +53,26 @@ def _atexit_print_fn():
# for ps in [ps for ps in _atexit_print_list[1:] # for ps in [ps for ps in _atexit_print_list[1:]
# if not isinstance(ps, ScanProfileStats)]: # if not isinstance(ps, ScanProfileStats)]:
for attr in ["compile_time", "fct_call_time", "fct_callcount", for attr in ["compile_time", "fct_call_time", "fct_callcount",
"vm_call_time", "optimizer_time", "linker_time"]: "vm_call_time", "optimizer_time", "linker_time",
"validate_time"]:
setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr)) setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr))
#merge dictonary
for attr in ["apply_time", "apply_callcount", for attr in ["apply_time", "apply_callcount",
"apply_cimpl", "outputs_size"]: "apply_cimpl", "outputs_size"]:
cum_attr = getattr(cum, attr) cum_attr = getattr(cum, attr)
for key, val in getattr(ps, attr).iteritems(): for key, val in getattr(ps, attr).iteritems():
assert key not in cum_attr assert key not in cum_attr
cum_attr[key] = val cum_attr[key] = val
if cum.optimizer_profile and ps.optimizer_profile:
merge = cum.optimizer_profile[0].merge_profile(
cum.optimizer_profile[1],
ps.optimizer_profile[1])
cum.optimizer_profile = (cum.optimizer_profile[0], merge)
else:
cum.optimizer_profile = None
cum.summary(file=_atexit_print_file) cum.summary(file=_atexit_print_file)
...@@ -118,11 +132,19 @@ class ProfileStats(object): ...@@ -118,11 +132,19 @@ class ProfileStats(object):
optimizer_time = 0.0 optimizer_time = 0.0
# time spent optimizing graph (FunctionMaker.__init__) # time spent optimizing graph (FunctionMaker.__init__)
validate_time = 0.0
# time spent in env.validate
# This is a subset of optimizer_time that is dominated by toposort()
# when the destorymap feature is included.
linker_time = 0.0 linker_time = 0.0
# time spent linking graph (FunctionMaker.create) # time spent linking graph (FunctionMaker.create)
line_width = 140 line_width = 140
optimizer_profile = None
# None or tuple (the optimizer, the profile it returned)
# param is called flag_time_thunks because most other attributes with time # param is called flag_time_thunks because most other attributes with time
# in the name are times *of* something, rather than configuration flags. # in the name are times *of* something, rather than configuration flags.
def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs): def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
...@@ -390,11 +412,15 @@ class ProfileStats(object): ...@@ -390,11 +412,15 @@ class ProfileStats(object):
local_time, 100*local_time / self.fct_call_time) local_time, 100*local_time / self.fct_call_time)
print >> file, ' Total compile time: %es' % self.compile_time print >> file, ' Total compile time: %es' % self.compile_time
print >> file, ' Theano Optimizer time: %es' % self.optimizer_time print >> file, ' Theano Optimizer time: %es' % self.optimizer_time
print >> file, ' Theano validate time: %es' % self.validate_time
print >> file, (' Theano Linker time (includes C,' print >> file, (' Theano Linker time (includes C,'
' CUDA code generation/compiling): %es' % ' CUDA code generation/compiling): %es' %
self.linker_time) self.linker_time)
print >> file, '' print >> file, ''
# The validation time is a subset of optimizer_time
assert self.validate_time < self.optimizer_time
def summary(self, file=sys.stderr, n_ops_to_print=20, def summary(self, file=sys.stderr, n_ops_to_print=20,
n_applies_to_print=20): n_applies_to_print=20):
self.summary_function(file) self.summary_function(file)
...@@ -402,9 +428,13 @@ class ProfileStats(object): ...@@ -402,9 +428,13 @@ class ProfileStats(object):
if local_time > 0: if local_time > 0:
self.summary_ops(file, n_ops_to_print) self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print) self.summary_nodes(file, n_applies_to_print)
else: elif self.fct_callcount > 0:
print >> file, (" No node time accumulated " print >> file, (" No node time accumulated "
"(hint: try config profiling.time_thunks=1)") "(hint: try config profiling.time_thunks=1)")
if self.optimizer_profile:
print "Optimizer Profile"
print "-----------------"
self.optimizer_profile[0].print_profile(file, self.optimizer_profile[1])
if 0: # old code still to be ported from ProfileMode if 0: # old code still to be ported from ProfileMode
......
...@@ -129,6 +129,7 @@ class Env(utils.object2): ...@@ -129,6 +129,7 @@ class Env(utils.object2):
self.node_locks = {} self.node_locks = {}
self.variable_locks = {} self.variable_locks = {}
self.profile = None
### Setup a Variable ### ### Setup a Variable ###
......
...@@ -567,7 +567,7 @@ def clone(i, o, copy_inputs = True): ...@@ -567,7 +567,7 @@ def clone(i, o, copy_inputs = True):
:type o: list :type o: list
:param o: output L{Variable}s :param o: output L{Variable}s
:type copy_inputs: bool :type copy_inputs: bool
:param copy_inputs: if True, the inputs will be copied (defaults to False) :param copy_inputs: if True, the inputs will be copied (defaults to True)
Returns the inputs and outputs of that copy. Returns the inputs and outputs of that copy.
""" """
......
...@@ -75,7 +75,7 @@ class Optimizer(object): ...@@ -75,7 +75,7 @@ class Optimizer(object):
opt.apply(env) opt.apply(env)
""" """
self.add_requirements(env) self.add_requirements(env)
self.apply(env, *args, **kwargs) return self.apply(env, *args, **kwargs)
def __call__(self, env): def __call__(self, env):
"""WRITEME """WRITEME
...@@ -98,6 +98,12 @@ class Optimizer(object): ...@@ -98,6 +98,12 @@ class Optimizer(object):
print >> stream, "%s%s %s id=%i" % ( print >> stream, "%s%s %s id=%i" % (
(' ' * level), self.__class__.__name__, name, id(self)) (' ' * level), self.__class__.__name__, name, id(self))
def print_profile(self, prof):
    """Print the profiling information `prof` returned by apply().

    The base implementation accepts only ``None`` (i.e. the optimizer
    produced no profile). Any optimizer whose apply() returns profiling
    data must override this method to display it.

    :param prof: profiling object returned by this optimizer's apply(),
        or None when no profile was collected.
    :raises NotImplementedError: if `prof` is not None, since the base
        class does not know how to render subclass-specific profiles.
    """
    if prof is not None:
        # Fixed grammar of the original message ("overrided"/"return").
        raise NotImplementedError(
            "The function print_profile must be overridden if the"
            " optimizer returns profiling information.")
class FromFunctionOptimizer(Optimizer): class FromFunctionOptimizer(Optimizer):
"""WRITEME""" """WRITEME"""
...@@ -154,12 +160,16 @@ class SeqOptimizer(Optimizer, list): ...@@ -154,12 +160,16 @@ class SeqOptimizer(Optimizer, list):
Applies each L{Optimizer} in self in turn. Applies each L{Optimizer} in self in turn.
""" """
l = [] l = []
if env.profile:
validate_before = env.profile.validate_time
nb_node_before = len(env.nodes) nb_node_before = len(env.nodes)
sub_profs = []
for optimizer in self: for optimizer in self:
try: try:
t0 = time.time() t0 = time.time()
optimizer.optimize(env) sub_prof = optimizer.optimize(env)
l.append(float(time.time() - t0)) l.append(float(time.time() - t0))
sub_profs.append(sub_prof)
except AssertionError: except AssertionError:
# do not catch Assertion failures # do not catch Assertion failures
raise raise
...@@ -169,12 +179,14 @@ class SeqOptimizer(Optimizer, list): ...@@ -169,12 +179,14 @@ class SeqOptimizer(Optimizer, list):
continue continue
else: else:
raise raise
if config.time_seq_optimizer: if config.time_seq_optimizer:
print "SeqOptimizer", print "SeqOptimizer",
if hasattr(self,"name"): print self.name, if hasattr(self,"name"): print self.name,
elif hasattr(self,"__name__"): print self.__name__, elif hasattr(self,"__name__"): print self.__name__,
print " time %.3fs for %d/%d nodes before/after optimization"%(sum(l),nb_node_before,len(env.nodes)) print " time %.3fs for %d/%d nodes before/after optimization"%(sum(l),nb_node_before,len(env.nodes))
print " time %.3fs for validate " % (
env.profile.validate_time - validate_before)
ll=[] ll=[]
for opt in self: for opt in self:
if hasattr(opt,"__name__"): if hasattr(opt,"__name__"):
...@@ -191,6 +203,12 @@ class SeqOptimizer(Optimizer, list): ...@@ -191,6 +203,12 @@ class SeqOptimizer(Optimizer, list):
for (t, opt) in lll[::-1]: for (t, opt) in lll[::-1]:
print ' %.6fs - %s' % (t, opt) print ' %.6fs - %s' % (t, opt)
print print
if env.profile:
validate_time = env.profile.validate_time - validate_before
else:
validate_time = None
return (self, l, validate_time, nb_node_before,
len(env.nodes), sub_profs)
def __eq__(self, other): def __eq__(self, other):
#added to override the list's __eq__ implementation #added to override the list's __eq__ implementation
...@@ -216,6 +234,115 @@ class SeqOptimizer(Optimizer, list): ...@@ -216,6 +234,115 @@ class SeqOptimizer(Optimizer, list):
for opt in self: for opt in self:
opt.print_summary(stream, level=(level + 2), depth=depth) opt.print_summary(stream, level=(level + 2), depth=depth)
@staticmethod
def print_profile(stream, prof, level=0):
    # Print the profile tuple returned by SeqOptimizer.apply():
    # (opts, per-opt times, validate_time, nb_node_before,
    #  nb_node_after, sub_profs). Recurses into sub-optimizer
    # profiles via their own print_profile.
    # NOTE: rebinds `prof` to the per-optimizer timing list.
    (opts, prof, validate_time, nb_node_before,
     nb_node_after, sub_profs) = prof
    # Indentation prefix grows with the recursion level.
    blanc = (' ' * level)
    print >> stream, blanc, "SeqOptimizer",
    if hasattr(opts, "name"):
        print >> stream, blanc, opts.name,
    elif hasattr(opts, "__name__"):
        print >> stream, blanc, opts.__name__,
    print >> stream, (" time %.3fs for %d/%d nodes"
                      " before/after optimization" % (
                          sum(prof), nb_node_before, nb_node_after))
    print >> stream, blanc, " %.3fs for env.validate()" % (validate_time)
    if level == 0:
        # Column header only once, at the outermost level.
        print >> stream, blanc, " time - (name, class, index)"
    # Build (name, class name, position) descriptors for each optimizer.
    ll = []
    for opt in opts:
        if hasattr(opt, "__name__"):
            ll.append((opt.__name__, opt.__class__.__name__,
                       opts.index(opt)))
        else:
            ll.append((opt.name, opt.__class__.__name__,
                       opts.index(opt)))
    lll = zip(prof, ll)

    # Python 2 comparator: order by elapsed time only.
    def cmp(a, b):
        if a[0] == b[0]:
            return 0
        elif a[0] < b[0]:
            return -1
        return 1
    lll.sort(cmp)

    # Print slowest-first; descend into sub-profiles when present
    # (opt[-1] is the optimizer's index within `opts`).
    for (t, opt) in lll[::-1]:
        #if t < 1:
        # continue
        print >> stream, blanc, ' %.6fs - %s' % (t, opt)
        if sub_profs[opt[-1]]:
            opts[opt[-1]].print_profile(stream, sub_profs[opt[-1]],
                                        level=level + 1)
    print >> stream
@staticmethod
def merge_profile(prof1, prof2):
    """Merge two profiles returned by SeqOptimizer.apply().

    Each profile is the tuple (seq_opt, times, validate_time,
    nb_node_before, nb_node_after, sub_profs). Per-optimizer times are
    summed, sub-profiles are merged recursively, and validate times are
    added. Node counts cannot be combined meaningfully across separate
    graphs, so -1 placeholders are returned for them.
    """
    import StringIO  # Python 2 stdlib; imported locally as in the original

    new_t = []            # merged per-optimizer times
    new_l = []            # merged optimizer objects
    new_sub_profile = []  # merged sub-profiles (None when absent)

    # Merge optimizers that are the *same object* in both profiles.
    for l in set(prof1[0]).intersection(set(prof2[0])):
        idx1 = prof1[0].index(l)
        idx2 = prof2[0].index(l)
        new_t.append(prof1[1][idx1] +
                     prof2[1][idx2])
        new_l.append(l)
        if hasattr(l, 'merge_profile'):
            # BUG FIX: prof2 must be indexed with its own idx2, not idx1.
            assert len(prof1[5][idx1]) == len(prof2[5][idx2])
            new_sub_profile.append(l.merge_profile(prof1[5][idx1],
                                                   prof2[5][idx2]))
        else:
            new_sub_profile.append(None)

    # Merge optimizers present in only one of the two profiles. The set
    # intersection above matches only identical objects, so equivalent
    # but distinct optimizers are matched here by name and by comparing
    # their printed summaries.
    for l in set(prof1[0]).symmetric_difference(set(prof2[0])):
        new_l_names = [o.name for o in new_l]
        if l.name in new_l_names:
            idx = new_l_names.index(l.name)
            io1 = StringIO.StringIO()
            io2 = StringIO.StringIO()
            l.print_summary(io1)
            new_l[idx].print_summary(io2)
            # BUG FIX: use getvalue() -- read() after writing returns ''
            # because the stream position is at the end, so the old
            # comparison was always true and merged unequal optimizers.
            if io1.getvalue() == io2.getvalue():
                if l in prof1[0]:
                    p = prof1
                else:
                    p = prof2
                new_t[idx] += p[1][p[0].index(l)]
                if hasattr(l, 'merge_profile'):
                    assert (len(p[5][p[0].index(l)]) ==
                            len(new_sub_profile[idx]))
                    new_sub_profile[idx] = l.merge_profile(
                        new_sub_profile[idx], p[5][p[0].index(l)])
                else:
                    new_sub_profile[idx] = None
                continue
        # No equivalent already merged: carry the entry over unchanged.
        if l in prof1[0]:
            p = prof1
        else:
            p = prof2
        new_t.append(p[1][p[0].index(l)])
        idx = p[0].index(l)
        new_l.append(l)
        new_sub_profile.append(p[5][idx])

    new_opt = SeqOptimizer(*new_l)
    assert set(prof1[0]).issubset(set(new_l))
    # assert set(prof2[0]).issubset(set(new_l))
    assert len(new_t) == len(new_opt) == len(new_sub_profile)
    # -1, -1: before/after node counts are undefined for a merged profile.
    return (new_opt, new_t, prof1[2] + prof2[2],
            -1, -1, new_sub_profile)
class _metadict: class _metadict:
"""WRITEME""" """WRITEME"""
...@@ -500,7 +627,9 @@ def MergeOptMerge(opt): ...@@ -500,7 +627,9 @@ def MergeOptMerge(opt):
opt introduced additional similarities. opt introduced additional similarities.
""" """
merger = merge_optimizer merger = merge_optimizer
return SeqOptimizer([merger, opt, merger]) opt = SeqOptimizer([merger, opt, merger])
opt.name = "MergeOptMerge"
return opt
def pre_constant_merge(vars): def pre_constant_merge(vars):
...@@ -1314,7 +1443,12 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1314,7 +1443,12 @@ class EquilibriumOptimizer(NavigatorOptimizer):
loop_timing = [] loop_timing = []
global_opt_timing = [] global_opt_timing = []
time_lopts = {}
io_toposort_timing = []
nb_nodes = [] nb_nodes = []
for lopt in self.local_optimizers:
process_count.setdefault(lopt, 0)
time_lopts.setdefault(lopt, 0)
while changed and not max_use_abort: while changed and not max_use_abort:
t0 = time.time() t0 = time.time()
...@@ -1333,7 +1467,9 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1333,7 +1467,9 @@ class EquilibriumOptimizer(NavigatorOptimizer):
for node in start_from: for node in start_from:
assert node in env.outputs assert node in env.outputs
topo_t0 = time.time()
q = deque(graph.io_toposort(env.inputs, start_from)) q = deque(graph.io_toposort(env.inputs, start_from))
io_toposort_timing.append(time.time() - topo_t0)
nb_nodes.append(len(q)) nb_nodes.append(len(q))
max_nb_nodes = max(max_nb_nodes, len(q)) max_nb_nodes = max(max_nb_nodes, len(q))
...@@ -1355,9 +1491,11 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1355,9 +1491,11 @@ class EquilibriumOptimizer(NavigatorOptimizer):
while q: while q:
node = q.pop() node = q.pop()
current_node = node current_node = node
for lopt in self.local_optimizers: for lopt in self.local_optimizers:
process_count.setdefault(lopt, 0) t_lopt = time.time()
lopt_change = self.process_node(env, node, lopt) lopt_change = self.process_node(env, node, lopt)
time_lopts[lopt] += time.time() - t_lopt
if lopt_change: if lopt_change:
process_count[lopt] += 1 process_count[lopt] += 1
changed = True changed = True
...@@ -1402,6 +1540,9 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1402,6 +1540,9 @@ class EquilibriumOptimizer(NavigatorOptimizer):
print ' %d - %s' % (count, opt) print ' %d - %s' % (count, opt)
print print
return (self, loop_timing, process_count, max_nb_nodes,
global_opt_timing, nb_nodes, time_lopts, io_toposort_timing)
def print_summary(self, stream=sys.stdout, level=0, depth=-1): def print_summary(self, stream=sys.stdout, level=0, depth=-1):
name = getattr(self, 'name', None) name = getattr(self, 'name', None)
print >> stream, "%s%s %s id=%i" % ( print >> stream, "%s%s %s id=%i" % (
...@@ -1411,6 +1552,95 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -1411,6 +1552,95 @@ class EquilibriumOptimizer(NavigatorOptimizer):
lopt.print_summary(stream, level=(level + 2), lopt.print_summary(stream, level=(level + 2),
depth=(depth - 1)) depth=(depth - 1))
@staticmethod
def print_profile(stream, prof, level=0):
    # Print the profile tuple returned by EquilibriumOptimizer.apply():
    # (opt, loop_timing, process_count, max_nb_nodes,
    #  global_opt_timing, nb_nodes, time_lopts, io_toposort_timing)
    (opt, loop_timing, process_count, max_nb_nodes,
     global_opt_timing, nb_nodes, time_lopts, io_toposort_timing) = prof
    # Indentation prefix grows with the recursion level.
    blanc = (' ' * level)
    print >> stream, blanc, "EquilibriumOptimizer",
    print >> stream, blanc, getattr(opt, "name",
                                    getattr(opt, "__name__", ""))
    print >> stream, blanc, " time %.3fs for %d passes, %d nodes max" % (
        sum(loop_timing), len(loop_timing), max_nb_nodes)
    print >> stream, blanc, " time io_toposort %.3fs" % sum(
        io_toposort_timing)
    # One line per equilibrium pass: total, global-opt and toposort time.
    for i in range(len(loop_timing)):
        print >> stream, blanc, ('%d - %.3fs (%.3fs in global opts, '
                                 '%.3fs io_toposort) - %d nodes' % (
                                     i, loop_timing[i],
                                     global_opt_timing[i],
                                     io_toposort_timing[i], nb_nodes[i]))
    # Collect (time, count, optimizer) for local optimizers that fired.
    # NOTE(review): this loop variable shadows the `opt` unpacked above;
    # harmless here since the outer `opt` is no longer used, but fragile.
    count_opt = []
    for opt, count in process_count.iteritems():
        if count > 0:
            count_opt.append((time_lopts[opt], count, opt))
    if count_opt:
        print >> stream, blanc, 'times applied - optimizer (only those applied):'
        count_opt.sort()
        # Slowest / most-applied first.
        for (t, count, opt) in count_opt[::-1]:
            print >> stream, blanc, ' %.3fs - %d - %s' % (
                t, count, opt)
        print >> stream
@staticmethod
def merge_profile(prof1, prof2):
    # Merge two profiles returned by EquilibriumOptimizer.apply().
    # Tuple layout, for reference:
    #(opt, loop_timing, process_count, max_nb_nodes,
    # global_opt_timing, nb_nodes, time_lopts, io_toposort_timing) = prof1
    # Build a new optimizer covering the union of both optimizer sets.
    local_optimizers = set(prof1[0].local_optimizers).union(
        prof2[0].local_optimizers)
    global_optimizers = set(prof1[0].global_optimizers).union(
        prof2[0].global_optimizers)
    new_opt = EquilibriumOptimizer(
        local_optimizers.union(global_optimizers),
        max_use_ratio=1)

    def merge_list(l1, l2):
        # Element-wise sum of two lists that may differ in length;
        # the tail of the longer list is kept as-is.
        l = copy.copy(l1)
        for idx, nb in enumerate(l2):
            if idx < len(l):
                l[idx] += nb
            else:
                l.append(nb)
        return l

    loop_timing = merge_list(prof1[1], prof2[1])
    # Sum per-optimizer application counts across the two profiles.
    process_count = prof1[2].copy()
    for process, count in prof2[2].iteritems():
        if process in process_count:
            process_count[process] += count
        else:
            process_count[process] = count
    max_nb_nodes = max(prof1[3], prof2[3])
    global_opt_timing = merge_list(prof1[4], prof2[4])
    nb_nodes = merge_list(prof1[5], prof2[5])
    # Sum per-local-optimizer times across the two profiles.
    time_lopts = prof1[6].copy()
    for opt, t in prof2[6].iteritems():
        if opt in time_lopts:
            time_lopts[opt] += t
        else:
            time_lopts[opt] = t
    io_toposort_timing = merge_list(prof1[7], prof2[7])
    # Per-pass lists must stay aligned after merging.
    assert (len(loop_timing) == len(global_opt_timing) ==
            len(io_toposort_timing) == len(nb_nodes))
    assert len(loop_timing) == max(len(prof1[1]), len(prof2[1]))
    return (new_opt,
            loop_timing,
            process_count,
            max_nb_nodes,
            global_opt_timing,
            nb_nodes,
            time_lopts,
            io_toposort_timing)
################# #################
### Utilities ### ### Utilities ###
......
...@@ -229,7 +229,10 @@ class SequenceDB(DB): ...@@ -229,7 +229,10 @@ class SequenceDB(DB):
opts = [o for o in opts if self.__position__[o.name] < position_cutoff] opts = [o for o in opts if self.__position__[o.name] < position_cutoff]
opts.sort(key=lambda obj: self.__position__[obj.name]) opts.sort(key=lambda obj: self.__position__[obj.name])
return opt.SeqOptimizer(opts, failure_callback=self.failure_callback) ret = opt.SeqOptimizer(opts, failure_callback=self.failure_callback)
if hasattr(tags[0], 'name'):
ret.name = tags[0].name
return ret
def print_summary(self, stream=sys.stdout): def print_summary(self, stream=sys.stdout):
print >> stream, "SequenceDB (id %i)" % id(self) print >> stream, "SequenceDB (id %i)" % id(self)
......
import sys import sys
import time
from theano.gof.python25 import partial from theano.gof.python25 import partial
...@@ -71,10 +72,20 @@ class History: ...@@ -71,10 +72,20 @@ class History:
class Validator: class Validator:
def on_attach(self, env): def on_attach(self, env):
if hasattr(env, 'validate'): for attr in ('validate', 'validate_time'):
raise AlreadyThere("Validator feature is already present or in" if hasattr(env, attr):
" conflict with another plugin.") raise AlreadyThere("Validator feature is already present or in"
env.validate = lambda: env.execute_callbacks('validate') " conflict with another plugin.")
def validate():
    # Run all registered 'validate' callbacks; when the env carries a
    # profile, accumulate the wall-clock time spent validating into
    # env.profile.validate_time (a subset of the optimizer time).
    t0 = time.time()
    ret = env.execute_callbacks('validate')
    t1 = time.time()
    if env.profile:
        env.profile.validate_time += t1 - t0
    return ret
# Install the timing wrapper as the env's validate entry point.
env.validate = validate
def consistent(): def consistent():
try: try:
......
...@@ -17,6 +17,9 @@ logger = logging.getLogger(__name__) ...@@ -17,6 +17,9 @@ logger = logging.getLogger(__name__)
AddConfigVar('profile', AddConfigVar('profile',
"If VM should collect profile information", "If VM should collect profile information",
BoolParam(False)) BoolParam(False))
AddConfigVar('profile_optimizer',
"If VM should collect optimizer profile information",
BoolParam(False))
raise_with_op = link.raise_with_op raise_with_op = link.raise_with_op
......
...@@ -24,7 +24,8 @@ def test_nvidia_driver1(): ...@@ -24,7 +24,8 @@ def test_nvidia_driver1():
""" """
a = numpy.random.rand(10000).astype("float32") a = numpy.random.rand(10000).astype("float32")
A = cuda.shared_constructor(a) A = cuda.shared_constructor(a)
f = theano.function(inputs=[], outputs=A.sum(), mode=mode_with_gpu) f = theano.function(inputs=[], outputs=A.sum(), mode=mode_with_gpu,
profile=False)
topo = f.maker.env.toposort() topo = f.maker.env.toposort()
assert len(topo) == 2 assert len(topo) == 2
assert sum(isinstance(node.op, B.GpuSum) for node in topo) == 1 assert sum(isinstance(node.op, B.GpuSum) for node in topo) == 1
...@@ -56,7 +57,8 @@ def test_nvidia_driver3(): ...@@ -56,7 +57,8 @@ def test_nvidia_driver3():
of the gpu device of the gpu device
""" """
var = cuda.fvector() var = cuda.fvector()
f = theano.function([var], var + 1, mode=mode_with_gpu) f = theano.function([var], var + 1, mode=mode_with_gpu,
profile=False)
topo = f.maker.env.toposort() topo = f.maker.env.toposort()
assert any([isinstance(node.op, cuda.GpuElemwise) for node in topo]) assert any([isinstance(node.op, cuda.GpuElemwise) for node in topo])
assert theano.sandbox.cuda.use.device_number is not None assert theano.sandbox.cuda.use.device_number is not None
......
...@@ -794,7 +794,8 @@ def scan(fn, ...@@ -794,7 +794,8 @@ def scan(fn,
updates=updates, updates=updates,
mode=compile.mode.Mode(linker='py', mode=compile.mode.Mode(linker='py',
optimizer=None), optimizer=None),
on_unused_input='ignore') on_unused_input='ignore',
profile=False)
## ##
### Step 5. Re-arange inputs of scan into a more strict order ### Step 5. Re-arange inputs of scan into a more strict order
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论