提交 34824493 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1854 from RoyXue/GSoC2014_part1

Add a list store executed node order
...@@ -628,38 +628,38 @@ class ProfileStats(object): ...@@ -628,38 +628,38 @@ class ProfileStats(object):
max_running_max_memory_size = 0 max_running_max_memory_size = 0
max_node_memory_saved_by_view = 0 max_node_memory_saved_by_view = 0
max_node_memory_saved_by_inplace = 0 max_node_memory_saved_by_inplace = 0
for fgraph, nodes_mem in fct_memory.iteritems():
# Sum of the size of all variables in bytes # statistic with the new order
sum_size = sum([sum([v for v in val if not isinstance(v, str)]) new_max_node_memory_size = 0
for key, val in nodes_mem.iteritems()]) new_max_running_max_memory_size = 0
# Sum of the size of all variables that actually allocate new_max_node_memory_saved_by_view = 0
# memory (excluding views, and inplace); new_max_node_memory_saved_by_inplace = 0
def count_running_memory(order, thunk_old_storage, nodes_mem):
"""
Calculate memory with specific node order
Return a list including the following values
1. node_memory_size
Sum of the size of all variables that actually allocate
memory (excluding views, and inplace);
2. running_memory_size
The memory allocated after the current apply node
3. running_max_memory_size
The maximum of running_memory_size during the function
4. node_memory_saved_by_view
The sum of memory saved by returning view instead of new
allocation
5. node_memory_saved_by_inplace
The sum of memory saved by reusing the input instead of
new allocation
"""
node_memory_size = 0 node_memory_size = 0
# The sum of memory saved by returning view instead of new
# allocation
node_memory_saved_by_view = 0
# The sum of memory saved by reusing the input instead of
# new allocation
node_memory_saved_by_inplace = 0
# The memory allocated after the current apply node
running_memory_size = 0 running_memory_size = 0
# The maximum of running_memory_size during the function
running_max_memory_size = 0 running_max_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
order = fgraph.toposort()
# A list of intermediate variable that are not need
# after the execution of the corresponding node.
# It mean that after executing the node,
# the corresponding variable can be gc.
post_thunk_old_storage = []
computed, last_user = theano.gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([
input_idx
for input_idx, input in enumerate(node.inputs)
if (input in computed) and
(input not in fgraph.outputs) and
node == last_user[input]])
for node in order: for node in order:
val = nodes_mem[node] val = nodes_mem[node]
dmap = getattr(node.op, 'destroy_map', None) dmap = getattr(node.op, 'destroy_map', None)
...@@ -677,21 +677,61 @@ class ProfileStats(object): ...@@ -677,21 +677,61 @@ class ProfileStats(object):
running_memory_size += v running_memory_size += v
if running_memory_size > running_max_memory_size: if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)] old_storage = thunk_old_storage[order.index(node)]
for old_s in old_storage: for old_s in old_storage:
old_v = var_mem[node.inputs[old_s]] old_v = var_mem[node.inputs[old_s]]
if not isinstance(old_v, str): if not isinstance(old_v, str):
running_memory_size -= old_v running_memory_size -= old_v
return [node_memory_size, running_memory_size, running_max_memory_size, node_memory_saved_by_inplace, node_memory_saved_by_view]
for fgraph, nodes_mem in fct_memory.iteritems():
# Sum of the size of all variables in bytes
sum_size = sum([sum([v for v in val if not isinstance(v, str)])
for key, val in nodes_mem.iteritems()])
order = fgraph.toposort()
# A list of intermediate variable that are not need
# after the execution of the corresponding node.
# It mean that after executing the node,
# the corresponding variable can be gc.
post_thunk_old_storage = []
computed, last_user = theano.gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([
input_idx
for input_idx, input in enumerate(node.inputs)
if (input in computed) and
(input not in fgraph.outputs) and
node == last_user[input]])
old_running_memory = count_running_memory(order, post_thunk_old_storage, nodes_mem)
new_order = fgraph.profile.node_executed_order
# A list of new executed node order
new_storage = fgraph.profile.node_cleared_order
# A list of variables that get freed
new_running_memory = count_running_memory(new_order, new_storage, nodes_mem)
# Store the max of some stats by any function in this profile. # Store the max of some stats by any function in this profile.
max_sum_size = max(max_sum_size, sum_size) max_sum_size = max(max_sum_size, sum_size)
max_node_memory_size = max(max_node_memory_size, node_memory_size) max_node_memory_size = max(max_node_memory_size, old_running_memory[0])
max_running_max_memory_size = max(max_running_max_memory_size, max_running_max_memory_size = max(max_running_max_memory_size,
running_max_memory_size) old_running_memory[2])
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view, max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
node_memory_saved_by_view) old_running_memory[4])
max_node_memory_saved_by_inplace = max( max_node_memory_saved_by_inplace = max(
max_node_memory_saved_by_inplace, node_memory_saved_by_inplace) max_node_memory_saved_by_inplace, old_running_memory[3])
# Store max of some stats with new order
new_max_node_memory_size = max(new_max_node_memory_size, new_running_memory[0])
new_max_running_max_memory_size = max(new_max_running_max_memory_size,
new_running_memory[2])
new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
new_running_memory[4])
new_max_node_memory_saved_by_inplace = max(
new_max_node_memory_saved_by_inplace, new_running_memory[3])
del fgraph, nodes_mem, post_thunk_old_storage, node del fgraph, nodes_mem, post_thunk_old_storage, node
...@@ -702,21 +742,27 @@ class ProfileStats(object): ...@@ -702,21 +742,27 @@ class ProfileStats(object):
print >> file, "Memory Profile" print >> file, "Memory Profile"
print >> file, "(Sparse variables are ignored)" print >> file, "(Sparse variables are ignored)"
print >> file, "(For values in brackets, it's for linker = c|py"
print >> file, "---" print >> file, "---"
# print >> file, " Max if no gc, inplace and view: %dKB" % int( # print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024)) # round(max_sum_size / 1024))
print >> file, " Max if linker=cvm (default): unknown"
print >> file, " Max if no gc (allow_gc=False): %dKB" % int(round( print >> file, " Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round(
max_node_memory_size / 1024.)) new_max_node_memory_size / 1024.)), int(round(
print >> file, " Max if linker=c|py: %dKB" % int(round( max_node_memory_size / 1024.)))
max_running_max_memory_size / 1024.)) print >> file, " Max if linker=cvm(default): %dKB (%dKB)" % (int(round(
# print >> file, " Memory saved if views are used: %dKB" % int( new_max_running_max_memory_size / 1024.)), int(round(
# round(max_node_memory_saved_by_view / 1024.)) max_running_max_memory_size / 1024.)))
# print >> file, " Memory saved if inplace ops are used: %dKB" % \ print >> file, " Memory saved if views are used: %dKB (%dKB)" % (int(
# int(round(max_node_memory_saved_by_inplace / 1024.)) round(new_max_node_memory_saved_by_view / 1024.)), int(
print >> file, " Memory saved if gc is enabled (linker=c|py): %dKB" % int( round(max_node_memory_saved_by_view / 1024.)))
round(max_node_memory_size - max_running_max_memory_size) / 1024.) print >> file, " Memory saved if inplace ops are used: %dKB (%dKB)" % \
(int(round(new_max_node_memory_saved_by_inplace / 1024.)), int(round(max_node_memory_saved_by_inplace / 1024.)))
print >> file, " Memory saved if gc is enabled: %dKB (%dKB)" % (int(
round(new_max_node_memory_size - new_max_running_max_memory_size) / 1024.), int(
round(max_node_memory_size - max_running_max_memory_size) / 1024.))
if (hasattr(theano, 'sandbox') and if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and hasattr(theano.sandbox, 'cuda') and
hasattr(theano.sandbox.cuda, 'cuda_ndarray') and hasattr(theano.sandbox.cuda, 'cuda_ndarray') and
......
"""
Test of memory profiling
"""
import theano
import theano.tensor as T
import StringIO
def test_profiling():
    """Smoke-test Theano's memory profiler.

    Enables the ``profile`` and ``profile_memory`` config flags, compiles
    and runs a tiny function, and writes the profile summary into an
    in-memory buffer.  The original config flags are restored even if
    profiling raises, so other tests are not affected.
    """
    old_profile = theano.config.profile
    old_profile_memory = theano.config.profile_memory
    theano.config.profile = True
    theano.config.profile_memory = True
    try:
        x = T.dvector("x")
        y = T.dvector("y")
        z = x + y
        f = theano.function([x, y], z, profile=True, name="test_profiling")
        # Run the function once so there is execution data to summarize.
        f([1, 2, 3, 4], [1, 1, 1, 1])
        buf = StringIO.StringIO()
        f.profile.summary(buf)
    finally:
        # Restore the global config regardless of success or failure.
        theano.config.profile = old_profile
        theano.config.profile_memory = old_profile_memory


if __name__ == '__main__':
    test_profiling()
\ No newline at end of file
...@@ -141,6 +141,12 @@ class VM(object): ...@@ -141,6 +141,12 @@ class VM(object):
profile.variable_shape = self.variable_shape.copy() profile.variable_shape = self.variable_shape.copy()
profile.variable_strides = self.variable_strides.copy() profile.variable_strides = self.variable_strides.copy()
if hasattr(self, 'node_executed_order'):
profile.node_executed_order = self.node_executed_order[:]
if hasattr(self, 'node_cleared_order'):
profile.node_cleared_order = self.node_cleared_order[:]
# clear the timer info out of the buffers # clear the timer info out of the buffers
for i in xrange(len(self.call_times)): for i in xrange(len(self.call_times)):
self.call_times[i] = 0.0 self.call_times[i] = 0.0
...@@ -298,6 +304,7 @@ class Stack(VM): ...@@ -298,6 +304,7 @@ class Stack(VM):
idx = self.node_idx[node] idx = self.node_idx[node]
t0 = time.time() t0 = time.time()
rval = self.thunks[idx]() rval = self.thunks[idx]()
self.node_executed_order.append(node)
# Some thunks on some computers run faster than the granularity # Some thunks on some computers run faster than the granularity
# of the time.time clock. # of the time.time clock.
...@@ -318,6 +325,9 @@ class Stack(VM): ...@@ -318,6 +325,9 @@ class Stack(VM):
compute_map = self.compute_map compute_map = self.compute_map
thunks = self.thunks thunks = self.thunks
dependencies = self.dependencies dependencies = self.dependencies
self.node_executed_order = []
self.node_cleared_order = []
for k in self.storage_map: for k in self.storage_map:
compute_map[k][0] = (k.owner is None) compute_map[k][0] = (k.owner is None)
...@@ -404,6 +414,10 @@ class Stack(VM): ...@@ -404,6 +414,10 @@ class Stack(VM):
self.thunks[self.node_idx[current_apply]]) self.thunks[self.node_idx[current_apply]])
for o in current_apply.outputs: for o in current_apply.outputs:
compute_map[o][0] = 1 compute_map[o][0] = 1
input_index = []
# A list store the index of inputs variables
if self.allow_gc: if self.allow_gc:
for i in current_apply.inputs: for i in current_apply.inputs:
# Garbage Collection -> check if anybody else uses # Garbage Collection -> check if anybody else uses
...@@ -414,6 +428,8 @@ class Stack(VM): ...@@ -414,6 +428,8 @@ class Stack(VM):
if all(compute_map[v][0] if all(compute_map[v][0]
for v in dependencies[i]): for v in dependencies[i]):
storage_map[i][0] = None storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i))
#DO NOT set compute_map to 0 #DO NOT set compute_map to 0
#If values become False and the #If values become False and the
...@@ -435,6 +451,8 @@ class Stack(VM): ...@@ -435,6 +451,8 @@ class Stack(VM):
#The stack level is not good when inside a Scan. #The stack level is not good when inside a Scan.
stacklevel=3 stacklevel=3
) )
self.node_cleared_order.append(input_index)
elif not computed_ins: elif not computed_ins:
# -- Non-lazy case, need inputs # -- Non-lazy case, need inputs
apply_stack.append(current_apply) apply_stack.append(current_apply)
...@@ -442,6 +460,7 @@ class Stack(VM): ...@@ -442,6 +460,7 @@ class Stack(VM):
for inp in current_deps for inp in current_deps
if inp.owner) if inp.owner)
elif not computed_outs: elif not computed_outs:
# #
# stack loop: Lazy Evaluation Case # stack loop: Lazy Evaluation Case
...@@ -488,6 +507,8 @@ class Stack(VM): ...@@ -488,6 +507,8 @@ class Stack(VM):
st = 'c' st = 'c'
self.variable_strides[var] = st self.variable_strides[var] = st
input_index = []
if self.allow_gc: if self.allow_gc:
for i in current_apply.inputs: for i in current_apply.inputs:
if (dependencies[i] and i.owner and if (dependencies[i] and i.owner and
...@@ -499,17 +520,29 @@ class Stack(VM): ...@@ -499,17 +520,29 @@ class Stack(VM):
break break
if empty_storage_map: if empty_storage_map:
storage_map[i][0] = None storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i))
#See the not lazy gc code for explanations #See the not lazy gc code for explanations
#of compute_map change #of compute_map change
compute_map[i][0] = 2 compute_map[i][0] = 2
self.node_cleared_order.append(input_index)
# Hacky coarse gc final pass # Hacky coarse gc final pass
# This is required until we have a proper gc algorithm for graphs with # This is required until we have a proper gc algorithm for graphs with
# lazy evaluation. See discussion on theano-dev June 19 2012. # lazy evaluation. See discussion on theano-dev June 19 2012.
final_index = []
if self.allow_gc: if self.allow_gc:
for v in storage_map: for v in storage_map:
if v.owner and not v in self.outputs: if v.owner and not v in self.outputs:
if compute_map[v][0] == 2:
continue
else:
storage_map[v][0] = None storage_map[v][0] = None
final_index.append(v)
compute_map[v][0] = 2
self.node_cleared_order.append(final_index)
try: try:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论