Merge pull request #1934 from RoyXue/GSoC2014_part2

Compute minimum peak

Merge pull request #1934 from RoyXue/GSoC2014_part2
feff4f12 · Frédéric Bastien · 7af47dd8 · 023876a0 · feff4f12 · feff4f12
--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -320,6 +320,15 @@ import theano and print the config variable, as in:
    For the memory profile, do not print Apply nodes if the size
    of their outputs (in bytes) is lower than this.
+.. attribute:: profiling.min_peak_memory
+    Bool value: either True or False
+    Default False
+    Whether the memory profile prints the minimum peak memory usage.
+    It only takes effect when profile=True and profile_memory=True.
 .. attribute:: config.lib.amdlibm
    Bool value: either True or False

--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -19,10 +19,12 @@ import copy
 import os
 import sys
 import time
+from theano.compat.python2x import defaultdict
 import numpy
 import theano
+from theano.gof import graph
 from theano.configparser import AddConfigVar, BoolParam, IntParam
@@ -54,6 +56,11 @@ AddConfigVar('profiling.min_memory_size',
             IntParam(1024, lambda i: i >= 0),
             in_c_key=False)
+AddConfigVar('profiling.min_peak_memory',
+            """The min peak memory usage of the order""",
+            BoolParam(False),
+            in_c_key=False)
 def _atexit_print_fn():
    """Print ProfileStat objects in _atexit_print_list to _atexit_print_file
@@ -641,7 +648,10 @@ class ProfileStats(object):
        new_max_node_memory_saved_by_view = 0
        new_max_node_memory_saved_by_inplace = 0
-        def count_running_memory(order, thunk_old_storage, nodes_mem):
+        # track min peak memory usage
+        min_max_peak = 0
+        def count_running_memory(order, fgraph, nodes_mem):
            """
            Calculate memory with specific node order 
            Return a list including the following values
@@ -658,88 +668,320 @@ class ProfileStats(object):
            5.  node_memory_saved_by_inplace
                The sum of memory saved by reusing the input instead of
                new allocation
            """
            node_memory_size = 0
            running_memory_size = 0
            running_max_memory_size = 0
            node_memory_saved_by_view = 0
            node_memory_saved_by_inplace = 0
+            # This takes only the inputs/outputs dependencies into account.
+            dependencies = fgraph.profile.dependencies
+            # Initialize compute_map, which is used to check if a node is valid
+            compute_map = defaultdict(lambda: [0])
+            for var in fgraph.inputs:
+                compute_map[var][0] = 1
+            # Two data structures used to mimic Python gc
+            viewed_by = {}  # {var1: [vars that view var1]}
+            # The length of the list is the value of the Python ref count. But we use a list, not just the ref count value.
+            # This is safer and helps detect potential bugs in the algorithm.
+            for var in fgraph.variables:
+                viewed_by[var] = []
+            view_of = {}  # {var1: original var viewed by var1}
+            # "Original" means that we don't keep track of all the intermediate relationships in the view.
            for node in order:
-                val = nodes_mem[node]
+                for var in node.outputs:
+                    compute_map[var][0] = 1
+                idx = 0
                dmap = getattr(node.op, 'destroy_map', None)
                vmap = getattr(node.op, 'view_map', None)
+                val = nodes_mem[node]
-                for idx, v in enumerate(val):
+                for v in val:
                    # TODO check the op returned a view
                    if dmap and idx in dmap:
                        node_memory_saved_by_inplace += v
                    # TODO check the op returned a view
                    elif vmap and idx in vmap:
                        node_memory_saved_by_view += v
-                    elif not isinstance(v, str):
+                    idx += 1
-                        node_memory_size += v
-                        running_memory_size += v
+                # Update the Python emulating dicts and add the memory
-                        if running_memory_size > running_max_memory_size:
+                # allocated by the node
-                            running_max_memory_size = running_memory_size
+                idx2 = 0
-                        old_storage = thunk_old_storage[order.index(node)]
+                for out in node.outputs:
-                        for old_s in old_storage:
+                    ins = None
-                            old_v = var_mem[node.inputs[old_s]]
+                    if dmap and idx2 in dmap:
-                            if not isinstance(old_v, str):
+                        vidx = dmap[idx2]
-                                running_memory_size -= old_v
+                        assert len(vidx) == 1, "Here we only support the possibility to destroy one input"
+                        ins = node.inputs[vidx[0]]
-            return [node_memory_size, running_memory_size, running_max_memory_size, node_memory_saved_by_inplace, node_memory_saved_by_view]
+                    if vmap and idx2 in vmap:
+                        assert ins is None
+                        vidx = vmap[idx2]
+                        assert len(vidx) == 1, "Here we only support the possibility to view one input"
+                        ins = node.inputs[vidx[0]]
+                    if ins is not None:
+                        # This is needed for destroy_map in case it
+                        # returns a partial view that is destroyed.  So
+                        # the output could be different than the
+                        # input.
+                        assert isinstance(ins, theano.Variable)
+                        # we keep track of views only against the origin
+                        origin = view_of.get(ins, ins)
+                        view_of[out] = origin
+                        viewed_by[origin].append(out)
+                    else:
+                        running_memory_size += var_mem[out]
+                        node_memory_size += var_mem[out]
+                    idx2 += 1
+                running_max_memory_size = max(running_max_memory_size,
+                                              running_memory_size)
+                # Mimic the combination of Theano and Python gc
+                for ins in node.inputs:
+                    assert not (ins in view_of and viewed_by[ins])
+                    # we track the original var, so this shouldn't happen
+                    if (dependencies[ins] and
+                        ins not in fgraph.outputs and
+                        ins.owner and
+                        all([compute_map[v][0] for v in dependencies[ins]])):
+                        if ins not in view_of and not viewed_by.get(ins, []):
+                            running_memory_size -= var_mem[ins]
+                        elif ins in view_of:
+                            origin = view_of[ins]
+                            viewed_by[origin].remove(ins)
+                            if (not viewed_by[origin] and
+                                origin not in fgraph.inputs and
+                                not isinstance(origin, theano.Constant)):
+                                running_memory_size -= var_mem[origin]
+                    else:
+                        # ins is viewed_by something else, so its
+                        # memory isn't freed
+                        pass
+            return [node_memory_size, running_memory_size,
+                    running_max_memory_size, node_memory_saved_by_inplace,
+                    node_memory_saved_by_view]
+        def count_minimum_peak(node_list, fgraph, nodes_mem):
+            global mem_count, mem_bound, max_mem_count
+            node_list = list(node_list)
+            mem_count = 0
+            max_mem_count = 0
+            mem_bound = numpy.inf
+            # This takes only the inputs/outputs dependencies into account.
+            dependencies = fgraph.profile.dependencies
+            # Initialize compute_map, which is used to check if a node is valid
+            compute_map = defaultdict(lambda: [0])
+            for var in fgraph.inputs:
+                compute_map[var][0] = 1
+            def check_node_state(node):
+                """
+                Check if an Apply node is valid(has inputs).
+                :param node: Apply Node
+                """
+                inputs = node.inputs
+                outputs = node.outputs
+                deps = inputs + node.destroy_dependencies
+                # TODO: Move at compute_map creation to speed things up.
+                for node in inputs:
+                    if isinstance(node, graph.Constant):
+                        compute_map[node][0] = 1
+                computed_ins = all(compute_map[v][0] for v in deps)
+                return computed_ins
+            # Initial executable_nodes
+            executable_nodes = set()
+            for var in fgraph.inputs:
+                for c, _ in var.clients:
+                    if c != "output" and check_node_state(c):
+                        executable_nodes.add(c)
+            def min_memory_generator(executable_nodes, viewed_by, view_of):
+                """
+                Generate all valid node order from node_list
+                and compute its memory peak.
+                :param executable_nodes: Set of executable nodes
+                """
+                global mem_count, mem_bound, max_mem_count
+                for node in executable_nodes:
+                    new_exec_nodes = executable_nodes.copy()
+                    new_exec_nodes.remove(node)
+                    # Check whether to prune this path now
+                    if max_mem_count > mem_bound:
+                        continue
+                    view_of_temp = view_of.copy()
+                    # We want neither a shallow copy nor a deep
+                    # copy. So this does a "middle" copy, where
+                    # we copy the dict and the lists, but not the vars
+                    viewed_by_temp = {}
+                    for k, v in viewed_by.iteritems():
+                        viewed_by_temp[k] = list(v)
+                    for var in node.outputs:
+                        compute_map[var][0] = 1
+                    mem_created = 0
+                    mem_freed = 0
+                    max_storage = max_mem_count
+                    dmap = getattr(node.op, 'destroy_map', None)
+                    vmap = getattr(node.op, 'view_map', None)
+                    idx = 0
+                    # Update the Python emulating dicts and add the
+                    # memory allocated by the node
+                    for out in node.outputs:
+                        ins = None
+                        if dmap and idx in dmap:
+                            vidx = dmap[idx]
+                            assert len(vidx) == 1, "Here we only support the possibility to destroy one input"
+                            ins = node.inputs[vidx[0]]
+                        if vmap and idx in vmap:
+                            assert ins is None
+                            vidx = vmap[idx]
+                            assert len(vidx) == 1, "Here we only support the possibility to destroy one input"
+                            ins = node.inputs[vidx[0]]
+                        if ins is not None:
+                            # This is needed for destroy_map in case it
+                            # returns a partial view that is destroyed.  So
+                            # the output could be different than the
+                            # input.
+                            assert isinstance(ins, theano.Variable)
+                            # We keep track of views only against the original
+                            origin = view_of_temp.get(ins, ins)
+                            view_of_temp[out] = origin
+                            viewed_by_temp[origin].append(out)
+                        else:
+                            mem_created += var_mem[out]
+                        idx += 1
+                    mem_count += mem_created
+                    max_mem_count = max(max_mem_count, mem_count)
+                    # Mimic the combination of Theano and Python gc.
+                    for ins in node.inputs:
+                        assert not (ins in view_of_temp and
+                                    viewed_by_temp[ins])
+                        # We track the original var, so this shouldn't happen
+                        if (dependencies[ins] and
+                            ins not in fgraph.outputs and
+                            ins.owner and
+                            all([compute_map[v][0] for v in dependencies[ins]])):
+                            if ins not in view_of_temp and not viewed_by_temp.get(ins, []):
+                                mem_freed += var_mem[ins]
+                            elif ins in view_of_temp:
+                                origin = view_of_temp[ins]
+                                viewed_by_temp[origin].remove(ins)
+                                if (not viewed_by_temp[origin] and
+                                    origin not in fgraph.inputs and
+                                    not isinstance(origin, theano.Constant)):
+                                    mem_freed += var_mem[origin]
+                        else:
+                            # ins is viewed_by something else, so its
+                            # memory isn't freed
+                            pass
+                    mem_count -= mem_freed
+                    for var in node.outputs:
+                        for c, _ in var.clients:
+                            if c != "output" and check_node_state(c):
+                                new_exec_nodes.add(c)
+                    if not new_exec_nodes:
+                        yield [node]
+                        # Check and Update mem_bound
+                        if max_mem_count < mem_bound:
+                            mem_bound = max_mem_count
+                    else:
+                        for p in min_memory_generator(new_exec_nodes,
+                                                      viewed_by_temp,
+                                                      view_of_temp):
+                            yield [node]+p
+                    # Reset track variables
+                    mem_count -= mem_created
+                    max_mem_count = max_storage
+                    mem_count += mem_freed
+                    for var in node.outputs:
+                        compute_map[var][0] = 0
+            # Two data structures used to mimic Python gc
+            viewed_by = {}  # {var1: [vars that view var1]}
+            # The length of the list is the value of the Python ref count. But we use a list, not just the ref count value.
+            # This is safer and helps detect potential bugs in the algorithm.
+            for var in fgraph.variables:
+                viewed_by[var] = []
+            view_of = {}  # {var1: original var viewed by var1}
+            # "Original" means that we don't keep track of all the intermediate relationships in the view.
+            # Loop over all valid orders and find the min peak (stored in mem_bound)
+            for order in min_memory_generator(executable_nodes,
+                                              viewed_by,
+                                              view_of):
+                continue
+            return mem_bound
        for fgraph, nodes_mem in fct_memory.iteritems():
            # Sum of the size of all variables in bytes
            sum_size = sum([sum([v for v in val if not isinstance(v, str)])
-                            for key, val in nodes_mem.iteritems()])    
+                            for key, val in nodes_mem.iteritems()])
            order = fgraph.toposort()
            # A list of intermediate variable that are not need
            # after the execution of the corresponding node.
            # It mean that after executing the node,
            # the corresponding variable can be gc.
-            post_thunk_old_storage = []
-            computed, last_user = theano.gof.link.gc_helper(order)
-            for node in order:
-                post_thunk_old_storage.append([
-                    input_idx
-                    for input_idx, input in enumerate(node.inputs)
-                    if (input in computed) and
-                    (input not in fgraph.outputs) and
-                    node == last_user[input]])
-            old_running_memory = count_running_memory(order, post_thunk_old_storage, nodes_mem)
+            old_running_memory = count_running_memory(order, fgraph, nodes_mem)
            new_order = fgraph.profile.node_executed_order
            # A list of new executed node order
-            new_storage = fgraph.profile.node_cleared_order
-            # A list of variables that get freed
-            new_running_memory = count_running_memory(new_order, new_storage, nodes_mem)
+            new_running_memory = count_running_memory(new_order,
+                                                      fgraph, nodes_mem)
            # Store the max of some stats by any function in this profile.
            max_sum_size = max(max_sum_size, sum_size)
-            max_node_memory_size = max(max_node_memory_size, old_running_memory[0])
+            max_node_memory_size = max(max_node_memory_size,
+                                       old_running_memory[0])
            max_running_max_memory_size = max(max_running_max_memory_size,
-                                          old_running_memory[2])
+                                              old_running_memory[2])
            max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
                                                old_running_memory[4])
            max_node_memory_saved_by_inplace = max(
                max_node_memory_saved_by_inplace, old_running_memory[3])
            # Store max of some stats with new order
-            new_max_node_memory_size = max(new_max_node_memory_size, new_running_memory[0])
+            new_max_node_memory_size = max(new_max_node_memory_size,
+                                           new_running_memory[0])
            new_max_running_max_memory_size = max(new_max_running_max_memory_size,
-                                        new_running_memory[2])
+                                                  new_running_memory[2])
            new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
-                                                new_running_memory[4])
+                                                    new_running_memory[4])
            new_max_node_memory_saved_by_inplace = max(
                new_max_node_memory_saved_by_inplace, new_running_memory[3])
-            del fgraph, nodes_mem, post_thunk_old_storage, node
+            # Config: whether to print the min memory peak
+            if config.profiling.min_peak_memory:
+                node_list = fgraph.apply_nodes
+                min_peak = count_minimum_peak(node_list, fgraph, nodes_mem)
+                min_max_peak = max(min_max_peak, min_peak)
+            del fgraph, nodes_mem
        if len(fct_memory) > 1:
            print >> file,  ("Memory Profile "
@@ -760,6 +1002,9 @@ class ProfileStats(object):
        print >> file,  "    Max if linker=cvm(default): %dKB (%dKB)" % (int(round(
            new_max_running_max_memory_size / 1024.)), int(round(
            max_running_max_memory_size / 1024.)))
+        if min_max_peak:
+            print >> file,  "    Minimum peak from all valid apply node order is %dKB" % int(round(
+                min_max_peak / 1024.))
        print >> file,  "    Memory saved if views are used: %dKB (%dKB)" % (int(
            round(new_max_node_memory_saved_by_view / 1024.)), int(
            round(max_node_memory_saved_by_view / 1024.)))
@@ -837,6 +1082,7 @@ class ProfileStats(object):
                         " emitted in those cases.")
        print >> file, ''
    def summary(self, file=sys.stderr, n_ops_to_print=20,
                n_apply_to_print=20):
        self.summary_function(file)
@@ -857,6 +1103,8 @@ class ProfileStats(object):
                                                    self.optimizer_profile[1])
 if 0: # old code still to be ported from ProfileMode
    def long_print(self, file=sys.stderr, fct_name=None, message=None,
            n_apply_to_print=15, n_ops_to_print=20, print_apply=False):
@@ -1157,6 +1405,8 @@ if 0: # old code still to be ported from ProfileMode
                n_ops_to_print=n_ops_to_print, print_apply=False)
 class ScanProfileStats(ProfileStats):
    callcount = 0.0
    nbsteps = 0.0

--- a/theano/compile/tests/test_profiling.py
+++ b/theano/compile/tests/test_profiling.py
@@ -2,36 +2,57 @@
 Test of memory profiling
 """
+import StringIO
+import numpy
 import theano
 import theano.tensor as T
-import StringIO
 def test_profiling():
-    old1 = theano.config.profile
+    config1 = theano.config.profile
-    old2 = theano.config.profile_memory
+    config2 = theano.config.profile_memory
+    config3 = theano.config.profiling.min_peak_memory
    try:
        theano.config.profile = True
        theano.config.profile_memory = True
+        theano.config.profiling.min_peak_memory = True
+        x = [T.dvector("val%i" % i) for i in range(3)]
+        z = []
+        z += [T.outer(x[i], x[i+1]).sum(axis=1) for i in range(len(x)-1)]
+        z += [x[i] + x[i+1] for i in range(len(x)-1)]
-        x = T.dvector("x")
-        y = T.dvector("y")
-        z = x + y
        p = theano.ProfileStats(False)
        if theano.config.mode in ["DebugMode", "DEBUG_MODE"]:
            m = "FAST_RUN"
        else:
            m = None
-        f = theano.function([x, y], z, profile=p, name="test_profiling",
+        f = theano.function(x, z, profile=p, name="test_profiling",
                            mode=m)
-        output = f([1, 2, 3, 4], [1, 1, 1, 1])
+        inp = [numpy.arange(1024) + 1 for i in range(len(x))]
+        output = f(*inp)
        buf = StringIO.StringIO()
        f.profile.summary(buf)
+        # regression testing for future algo speed up
+        the_string = buf.getvalue()
+        lines1 = [l for l in the_string.split("\n") if "Max if linker" in l]
+        lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
+        assert "Max if linker=cvm(default): 8224KB (16408KB)" in the_string, (lines1, lines2)
+        assert "Minimum peak from all valid apply node order is 8208KB" in the_string, (lines1, lines2)
    finally:
-        theano.config.profile = old1
+        theano.config.profile = config1
-        theano.config.profile_memory = old2
+        theano.config.profile_memory = config2
+        theano.config.profiling.min_peak_memory = config3
 if __name__ == '__main__':

--- a/theano/gof/vm.py
+++ b/theano/gof/vm.py
@@ -149,6 +149,9 @@ class VM(object):
        if hasattr(self, 'node_cleared_order'):
            profile.node_cleared_order = self.node_cleared_order[:]
+        if hasattr(self, 'dependencies'):
+            profile.dependencies = self.dependencies.copy()
        # clear the timer info out of the buffers
        for i in xrange(len(self.call_times)):
            self.call_times[i] = 0.0