Added print_profile to GraphtoGPU

c396d611 · sentient07 · de536bd5 · c396d611
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -3,14 +3,17 @@ import copy
 import numpy
 import logging
 import pdb
+import time
 from six.moves import xrange
 import theano
+from theano.compat import OrderedDict
 from theano import tensor, scalar, gof, config
 from theano.compile import optdb
 from theano.compile.ops import shape_i
 from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
-                        SequenceDB, Optimizer, DB, toolbox)
+                        SequenceDB, Optimizer, DB, toolbox, graph)
+from gof.opt import ChangeTracker
 from theano.gof.optdb import LocalGroupDB
 from theano.ifelse import IfElse
@@ -251,7 +254,7 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
                    0, 'fast_run', 'fast_compile', 'merge')
-class GraphToGPU(Optimizer):
+class GraphToGPU(Optimizer, NavigatorOptimizer):
    """
    Transfer the graph as a whole to GPU instead of transfering node by node.
    """
@@ -264,8 +267,18 @@ class GraphToGPU(Optimizer):
        fgraph.attach_feature(toolbox.ReplaceValidate())
    def apply(self, fgraph):
+        change_tracker = ChangeTracker()
        mapping = {}
+        global_process_count = {}
+        start_nb_nodes = len(fgraph.apply_nodes)
+        max_nb_nodes = len(fgraph.apply_nodes)
+        loop_timing = []
+        loop_process_count = []
+        local_opt_timing = []
+        io_toposort_timing = []
+        nb_nodes = []
+        node_created = {}
+        process_count = {}
        # Building a new graph
        # Iterating through inputs of graph
        for i in fgraph.inputs:
@@ -277,8 +290,23 @@ class GraphToGPU(Optimizer):
            if isinstance(i, theano.Constant):
                mapping[i] = i
+        for lopt in (self.local_optimizers_all +
+                     self.local_optimizers_map.get(type(node.op), []) +
+                     self.local_optimizers_map.get(node.op, [])):
+                process_count.setdefault(copt, 0)
+                global_process_count.setdefault(opt, 0)
+                time_opts.setdefault(opt, 0)
+                node_created.setdefault(opt, 0)
+        topo_t0 = time.time()
+        q = deque(graph.io_toposort(fgraph.inputs, start_from))
+        io_toposort_timing.append(time.time() - topo_t0)
+        nb_nodes.append(len(q))
+        max_nb_nodes = max(max_nb_nodes, len(q))
        for node in fgraph.toposort():
+            t0 = time.time()
            if isinstance(node.op, HostFromGpu):
                mapping[node.outputs[0]] = node.inputs[0]
                continue
@@ -310,7 +338,14 @@ class GraphToGPU(Optimizer):
            for lopt in (self.local_optimizers_all +
                         self.local_optimizers_map.get(type(node.op), []) +
                         self.local_optimizers_map.get(node.op, [])):
+                nb = change_tracker.nb_imported
+                process_count[lopt] += 1
+                global_process_count[lopt] += 1
+                t_opt = time.time()
+                lopt_change = self.process_node(fgraph, node, lopt)
+                time_opts[lopt] += time.time() - t_opt
+                node_created[lopt] += change_tracker.nb_imported - nb
                if move_to_GPU:
                    try:
                        new_ops = lopt.transform(
@@ -322,6 +357,7 @@ class GraphToGPU(Optimizer):
                            out_clients)
                    if new_ops:
                        break
+            local_opt_timing.append(float(time.time() - t0))
            if not new_ops:
                newnode = node.clone_with_new_inputs([mapping.get(i)
                                                      for i in node.inputs])
@@ -344,6 +380,9 @@ class GraphToGPU(Optimizer):
            for new_o, old_o in zip(outputs, node.outputs):
                mapping[old_o] = new_o
+            loop_process_count.append(process_count)
+            loop_timing.append(float(time.time() - t0))
        new_nodes = []
        for o in fgraph.outputs:
            new_o = mapping[o]
@@ -354,6 +393,143 @@ class GraphToGPU(Optimizer):
            new_nodes.append(new_o)
        fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes))
+    @staticmethod
+    def print_profile(stream, prof, level=0):
+        (opt, loop_timing, loop_process_count,
+         (start_nb_nodes, end_nb_nodes, max_nb_nodes),
+         local_opt_timing, nb_nodes, time_opts, io_toposort_timing,
+         node_created) = prof
+        blanc = ('    ' * level)
+        print(blanc, "GraphToGPUOptimizer", end=' ', file=stream)
+        print(blanc, getattr(opt, "name",
+                             getattr(opt, "__name__", "")), file=stream)
+        print(blanc, "  time %.3fs for %d passes" % (
+            sum(loop_timing), len(loop_timing)), file=stream)
+        print(blanc, "  nb nodes (start, end,  max) %d %d %d" % (
+            start_nb_nodes, end_nb_nodes, max_nb_nodes), file=stream)
+        print(blanc, "  time io_toposort %.3fs" % sum(
+            io_toposort_timing), file=stream)
+        s = sum([time_opts[o] for o in opt.local_optimizers_all])
+        print(blanc, "  time in local optimizers %.3fs" % s, file=stream)
+        for i in range(len(loop_timing)):
+            lopt = ""
+            if loop_process_count[i]:
+                d = list(reversed(sorted(iteritems(loop_process_count[i]),
+                                         key=lambda a: a[1])))
+                lopt = " ".join([str((str(k), v)) for k, v
+                                 in d[:5]])
+                if len(d) > 5:
+                    lopt += " ..."
+            print(blanc, ('  %2d - %.3fs %d (%.3fs in global opts, '
+                          '%.3fs io_toposort) - %d nodes - %s' % (
+                              i, loop_timing[i],
+                              sum(loop_process_count[i].values()),
+                              local_opt_timing[i],
+                              io_toposort_timing[i], nb_nodes[i],
+                              lopt)), file=stream)
+        count_opt = []
+        not_used = []
+        not_used_time = 0
+        process_count = {}
+        for o in (opt.local_optimizers_all +
+                  list(opt.local_optimizers_map.get(type(node.op), [])) +
+                  list(opt.local_optimizers_map.get(node.op, [])) +):
+            process_count.setdefault(o, 0)
+        for count in loop_process_count:
+            for o, v in iteritems(count):
+                process_count[o] += v
+        for o, count in iteritems(process_count):
+            if count > 0:
+                count_opt.append((time_opts[o], count,
+                                  node_created[o], o))
+            else:
+                not_used.append((time_opts[o], o))
+                not_used_time += time_opts[o]
+        if count_opt:
+            print(blanc,
+                  '  times - times applied - nb node created - name:',
+                  file=stream)
+            count_opt.sort()
+            for (t, count, n_created, o) in count_opt[::-1]:
+                print(blanc, '  %.3fs - %d - %d - %s' % (
+                    t, count, n_created, o), file=stream)
+            print(blanc, '  %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
+                not_used_time, len(not_used)), file=stream)
+            not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
+            for (t, o) in not_used[::-1]:
+                if t > 0:
+                    # Skip opt that have 0 times, they probably wasn't even tried.
+                    print(blanc + "  ", '  %.3fs - %s' % (t, o), file=stream)
+            print(file=stream)
+    @staticmethod
+    def merge_profile(prof1, prof2):
+        # (opt, loop_timing, loop_process_count, max_nb_nodes,
+        # global_opt_timing, nb_nodes, time_opts, io_toposort_timing) = prof1
+        local_optimizers = OrderedSet(prof1[0].local_optimizers_all).union(
+            prof2[0].local_optimizers_all)
+        def merge_dict(d1, d2):
+            """
+            merge 2 dicts by adding the values.
+            """
+            d = d1.copy()
+            for k, v in iteritems(d2):
+                if k in d:
+                    d[k] += v
+                else:
+                    d[k] = v
+            return d
+        local_optimizers_map = merge_dict(prof1[0].local_optimizers_map,
+                                          prof2[0].local_optimizers_map)
+        new_opt = GraphToGPU(local_optimizers,local_optimizers_map)
+        def merge_list(l1, l2):
+            l = copy.copy(l1)
+            for idx, nb in enumerate(l2):
+                if idx < len(l):
+                    l[idx] += nb
+                else:
+                    l.append(nb)
+            return l
+        loop_timing = merge_list(prof1[1], prof2[1])
+        loop_process_count = list(prof1[2])
+        for i in range(min(len(loop_process_count), len(prof2[2]))):
+            process_count = loop_process_count[i]
+            for process, count in iteritems(prof2[2][i]):
+                if process in process_count:
+                    process_count[process] += count
+                else:
+                    process_count[process] = count
+        loop_process_count.extend(prof2[2][len(loop_process_count):])
+        max_nb_nodes = max(prof1[3], prof2[3])
+        nb_nodes = merge_list(prof1[4], prof2[4])
+        time_opts = merge_dict(prof1[5], prof2[5])
+        io_toposort_timing = merge_list(prof1[6], prof2[6])
+        assert len(loop_timing) == max(len(prof1[1]), len(prof2[1]))
+        node_created = merge_dict(prof1[7], prof2[7])
+        return (new_opt,
+                loop_timing,
+                loop_process_count,
+                max_nb_nodes,
+                nb_nodes,
+                time_opts,
+                io_toposort_timing,
+                node_created)
 @local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu])
 def local_cut_gpu_transfers(node):