Merge pull request #2243 from RoyXue/fix_crash_and_GPU_support

Fix ifelse crash and gpu support

Merge pull request #2243 from RoyXue/fix_crash_and_GPU_support
b6410099 · Frédéric Bastien · 8971b5ca · e318dc8c · b6410099 · b6410099
--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -642,12 +642,15 @@ class ProfileStats(object):
            fct_shapes[node.fgraph].setdefault(node, [])
            sum_dense = 0
            for out in node.outputs:
-                sh = self.variable_shape[out]
-                if hasattr(out.type, 'get_size'):
-                    v = out.type.get_size(sh)
-                    sum_dense += v
+                if out in self.variable_shape.keys():
+                    sh = self.variable_shape[out]
+                    if hasattr(out.type, 'get_size'):
+                        v = out.type.get_size(sh)
+                        sum_dense += v
+                    else:
+                        v = 'Unknown'
                else:
-                    v = "Unknown"
+                    v = 'Variable isnt created'

                var_mem[out] = v
                fct_memory[node.fgraph][node].append(v)
@@ -656,14 +659,17 @@ class ProfileStats(object):

        # Find the function that used the most of that statistic
        max_sum_size = 0
-        max_node_memory_size = 0
-        max_running_max_memory_size = 0
+
+        # statistics with the old order
+        # TODO: Make list more flexible with multiple GPUs later
+        max_node_memory_size = [0, 0, 0]
+        max_running_max_memory_size = [0, 0, 0]
        max_node_memory_saved_by_view = 0
        max_node_memory_saved_by_inplace = 0

-        # statistic with the new order
-        new_max_node_memory_size = 0
-        new_max_running_max_memory_size = 0
+        # statistics with the new order
+        new_max_node_memory_size = [0, 0, 0]
+        new_max_running_max_memory_size = [0, 0, 0]
        new_max_node_memory_saved_by_view = 0
        new_max_node_memory_saved_by_inplace = 0

@@ -689,10 +695,11 @@ class ProfileStats(object):
                The sum of memory saved by reusing the input instead of
                new allocation
            """
-
-            node_memory_size = 0
-            running_memory_size = 0
-            running_max_memory_size = 0
+            from theano.sandbox.cuda import CudaNdarrayType
+            # Initial Mem info values [CPU, GPU]
+            node_memory_size = [0, 0]
+            running_memory_size = [0, 0]
+            running_max_memory_size = [0, 0]
            node_memory_saved_by_view = 0
            node_memory_saved_by_inplace = 0
            # This take only the inputs/outputs dependencies.
@@ -734,6 +741,10 @@ class ProfileStats(object):
                # allocated by the node
                idx2 = 0
                for out in node.outputs:
+                    if isinstance(out.type, CudaNdarrayType):
+                        cg = 1
+                    else:
+                        cg = 0
                    ins = None
                    if dmap and idx2 in dmap:
                        vidx = dmap[idx2]
@@ -757,30 +768,36 @@ class ProfileStats(object):
                        view_of[out] = origin
                        viewed_by[origin].append(out)
                    else:
-                        running_memory_size += var_mem[out]
-                        node_memory_size += var_mem[out]
+                        running_memory_size[cg] += var_mem[out]
+                        node_memory_size[cg] += var_mem[out]
                    idx2 += 1

-                running_max_memory_size = max(running_max_memory_size,
-                                              running_memory_size)
+                running_max_memory_size[0] = max(running_max_memory_size[0],
+                                                 running_memory_size[0])
+                running_max_memory_size[1] = max(running_max_memory_size[1],
+                                                 running_memory_size[1])

                # Mimic the combination of Theano and Python gc
                for ins in node.inputs:
                    assert not (ins in view_of and viewed_by[ins])
                    # we trac the original var, so this shouldn't happen
+                    if isinstance(ins.type, CudaNdarrayType):
+                        cg = 1
+                    else:
+                        cg = 0
                    if (dependencies[ins] and
                            ins not in fgraph.outputs and
                            ins.owner and
                            all([compute_map[v][0] for v in dependencies[ins]])):
                        if ins not in view_of and not viewed_by.get(ins, []):
-                            running_memory_size -= var_mem[ins]
+                            running_memory_size[cg] -= var_mem[ins]
                        elif ins in view_of:
                            origin = view_of[ins]
                            viewed_by[origin].remove(ins)
                            if (not viewed_by[origin] and
                                    origin not in fgraph.inputs and
                                    not isinstance(origin, theano.Constant)):
-                                running_memory_size -= var_mem[origin]
+                                running_memory_size[cg] -= var_mem[origin]
                    else:
                        # ins is viewed_by something else, so its
                        # memory isn't freed
@@ -994,24 +1011,46 @@ class ProfileStats(object):

            # Store the max of some stats by any function in this profile.
            max_sum_size = max(max_sum_size, sum_size)
-            max_node_memory_size = max(max_node_memory_size,
-                                       old_running_memory[0])
-            max_running_max_memory_size = max(max_running_max_memory_size,
-                                              old_running_memory[2])
-            max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
-                                                old_running_memory[4])
+            max_node_memory_size[0] = max(max_node_memory_size[0],
+                                          sum(old_running_memory[0]))
+            max_running_max_memory_size[0] = max(max_running_max_memory_size[0],
+                                                 sum(old_running_memory[2]))
+
+            # Separate CPU and GPU
+            max_node_memory_size[1] = max(max_node_memory_size[1],
+                                          old_running_memory[0][0])
+            max_node_memory_size[2] = max(max_node_memory_size[2],
+                                          old_running_memory[0][1])
+            max_running_max_memory_size[1] = max(max_running_max_memory_size[1],
+                                                 old_running_memory[2][0])
+            max_running_max_memory_size[2] = max(max_running_max_memory_size[2],
+                                                 old_running_memory[2][1])
+
            max_node_memory_saved_by_inplace = max(
                max_node_memory_saved_by_inplace, old_running_memory[3])
+            max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
+                                                old_running_memory[4])

            # Store max of some stats with new order
-            new_max_node_memory_size = max(new_max_node_memory_size,
-                                           new_running_memory[0])
-            new_max_running_max_memory_size = max(new_max_running_max_memory_size,
-                                                  new_running_memory[2])
-            new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
-                                                    new_running_memory[4])
+            new_max_node_memory_size[0] = max(new_max_node_memory_size[0],
+                                              sum(new_running_memory[0]))
+            new_max_running_max_memory_size[0] = max(new_max_running_max_memory_size[0],
+                                                     sum(new_running_memory[2]))
+
+            # Separate CPU and GPU
+            new_max_node_memory_size[1] = max(new_max_node_memory_size[1],
+                                              new_running_memory[0][0])
+            new_max_node_memory_size[2] = max(new_max_node_memory_size[2],
+                                              new_running_memory[0][1])
+            new_max_running_max_memory_size[1] = max(new_max_running_max_memory_size[1],
+                                                     new_running_memory[2][0])
+            new_max_running_max_memory_size[2] = max(new_max_running_max_memory_size[2],
+                                                     new_running_memory[2][1])
+
            new_max_node_memory_saved_by_inplace = max(
                new_max_node_memory_saved_by_inplace, new_running_memory[3])
+            new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
+                                                    new_running_memory[4])

            # Config: whether print min memory peak
            if config.profiling.min_peak_memory:
@@ -1035,13 +1074,30 @@ class ProfileStats(object):
        print >> file,  "---"
 #        print >> file,  "    Max if no gc, inplace and view: %dKB" % int(
 #            round(max_sum_size / 1024))
-
        print >> file,  "    Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round(
-            new_max_node_memory_size / 1024.)), int(round(
-                max_node_memory_size / 1024.)))
+            new_max_node_memory_size[0] / 1024.)), int(round(
+                max_node_memory_size[0] / 1024.)))
+        print >> file,  "    CPU: %dKB (%dKB)" % ((int(round(
+            new_max_node_memory_size[1] / 1024.)), int(round(
+                max_node_memory_size[1] / 1024.))))
+        print >> file,  "    GPU: %dKB (%dKB)" % ((int(round(
+            new_max_node_memory_size[2] / 1024.)), int(round(
+                max_node_memory_size[2] / 1024.))))
+
+        print >> file,  "---"
+
        print >> file,  "    Max if linker=cvm(default): %dKB (%dKB)" % (int(round(
-            new_max_running_max_memory_size / 1024.)), int(round(
-                max_running_max_memory_size / 1024.)))
+            new_max_running_max_memory_size[0] / 1024.)), int(round(
+                max_running_max_memory_size[0] / 1024.)))
+        print >> file,  "    CPU: %dKB (%dKB)" % ((int(round(
+            new_max_running_max_memory_size[1] / 1024.)), int(round(
+                max_running_max_memory_size[1] / 1024.))))
+        print >> file,  "    GPU: %dKB (%dKB)" % ((int(round(
+            new_max_running_max_memory_size[2] / 1024.)), int(round(
+                max_running_max_memory_size[2] / 1024.))))
+
+        print >> file,  "---"
+
        if min_max_peak:
            print >> file,  "    Minimum peak from all valid apply node order is %dKB(took %.3fs to compute)" % (int(round(
                min_max_peak / 1024.)), min_peak_time)
@@ -1052,8 +1108,10 @@ class ProfileStats(object):
            (int(round(new_max_node_memory_saved_by_inplace / 1024.)),
             int(round(max_node_memory_saved_by_inplace / 1024.)))
        print >> file,  "    Memory saved if gc is enabled: %dKB (%dKB)" % (int(
-            round(new_max_node_memory_size - new_max_running_max_memory_size) / 1024.), int(
-            round(max_node_memory_size - max_running_max_memory_size) / 1024.))
+            round(new_max_node_memory_size[0] - new_max_running_max_memory_size[0]) / 1024.), int(
+            round(max_node_memory_size[0] - max_running_max_memory_size[0]) / 1024.))
+
+        print >> file,  "---"

        if (hasattr(theano, 'sandbox') and
            hasattr(theano.sandbox, 'cuda') and

--- a/theano/compile/tests/test_profiling.py
+++ b/theano/compile/tests/test_profiling.py
@@ -8,6 +8,7 @@ import numpy

 import theano
 import theano.tensor as T
+from theano.ifelse import ifelse


 def test_profiling():
@@ -20,11 +21,11 @@ def test_profiling():
        theano.config.profile_memory = True
        theano.config.profiling.min_peak_memory = True

-        x = [T.dvector("val%i" % i) for i in range(3)]
+        x = [T.fvector("val%i" % i) for i in range(3)]

        z = []
-        z += [T.outer(x[i], x[i+1]).sum(axis=1) for i in range(len(x)-1)]
-        z += [x[i] + x[i+1] for i in range(len(x)-1)]
+        z += [T.outer(x[i], x[i + 1]).sum(axis=1) for i in range(len(x) - 1)]
+        z += [x[i] + x[i + 1] for i in range(len(x) - 1)]

        p = theano.ProfileStats(False)

@@ -36,7 +37,7 @@ def test_profiling():
        f = theano.function(x, z, profile=p, name="test_profiling",
                            mode=m)

-        inp = [numpy.arange(1024) + 1 for i in range(len(x))]
+        inp = [numpy.arange(1024, dtype='float32') + 1 for i in range(len(x))]
        output = f(*inp)

        buf = StringIO.StringIO()
@@ -46,8 +47,16 @@ def test_profiling():
        the_string = buf.getvalue()
        lines1 = [l for l in the_string.split("\n") if "Max if linker" in l]
        lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
-        assert "Max if linker=cvm(default): 8224KB (16408KB)" in the_string, (lines1, lines2)
-        assert "Minimum peak from all valid apply node order is 8208KB" in the_string, (lines1, lines2)
+        if theano.config.device == 'cpu':
+            assert "Max if linker=cvm(default): 4112KB (8204KB)" in the_string, (
+                lines1, lines2)
+            assert "Minimum peak from all valid apply node order is 4104KB" in the_string, (
+                lines1, lines2)
+        else:
+            assert "Max if linker=cvm(default): 8220KB (8220KB)" in the_string, (
+                lines1, lines2)
+            assert "Minimum peak from all valid apply node order is 4116KB" in the_string, (
+                lines1, lines2)

    finally:
        theano.config.profile = config1
@@ -55,5 +64,41 @@ def test_profiling():
        theano.config.profiling.min_peak_memory = config3


+def test_ifelse():
+    config1 = theano.config.profile
+    config2 = theano.config.profile_memory
+
+    try:
+        theano.config.profile = True
+        theano.config.profile_memory = True
+
+        a, b = T.scalars('a', 'b')
+        x, y = T.scalars('x', 'y')
+
+        z = ifelse(T.lt(a, b), x * 2, y * 2)
+
+        p = theano.ProfileStats(False)
+
+        if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
+            m = "FAST_RUN"
+        else:
+            m = None
+
+        f_ifelse = theano.function([a, b, x, y], z, profile=p, name="test_ifelse",
+                                   mode=m)
+
+        val1 = 0.
+        val2 = 1.
+        big_mat1 = 10
+        big_mat2 = 11
+
+        out = f_ifelse(val1, val2, big_mat1, big_mat2)
+
+    finally:
+        theano.config.profile = config1
+        theano.config.profile_memory = config2
+
+
 if __name__ == '__main__':
    test_profiling()
+    test_ifelse()
--- a/theano/gof/vm.py
+++ b/theano/gof/vm.py
@@ -56,6 +56,7 @@ raise_with_op = link.raise_with_op


 class VM(object):
+
    """
    A VM object's __call__ method evaluates a Theano program.

@@ -83,6 +84,7 @@ class VM(object):
        storage. False means it *must not* repeat that feedback.

    """
+
    def __init__(self, nodes, thunks, pre_call_clear):
        """
        Allocate a virtual machine.
@@ -159,10 +161,12 @@ class VM(object):


 class Loop(VM):
+
    """
    Unconditional start-to-finish program execution in Python.
    No garbage collection is allowed on intermediate results.
    """
+
    def __call__(self):
        if self.time_thunks:
            for cont in self.pre_call_clear:
@@ -188,10 +192,12 @@ class Loop(VM):


 class LoopGC(VM):
+
    """
    Unconditional start-to-finish program execution in Python.
    Garbage collection is possible on intermediate results.
    """
+
    def __init__(self, nodes, thunks, pre_call_clear, post_thunk_clear):
        super(LoopGC, self).__init__(nodes, thunks, pre_call_clear)
        self.post_thunk_clear = post_thunk_clear
@@ -231,6 +237,7 @@ class LoopGC(VM):


 class Stack(VM):
+
    """
    Finish-to-start evalution order of thunks.

@@ -340,7 +347,7 @@ class Stack(VM):
        apply_stack = list(self.base_apply_stack)
        last_apply_stack_len = -1

-        #This record all function inputs/shared varibles and constants
+        # This records all function inputs/shared variables and constants
        for var, data in self.storage_map.iteritems():
            if data[0] is None:
                continue
@@ -396,7 +403,7 @@ class Stack(VM):
                            current_idx = self.node_idx[current_apply]
                            self.call_counts[current_idx] += 1
                            self.call_times[current_idx] += dt
-                            ## Computing the memory footprint of the the op
+                            # Computing the memory footprint of the op
                            # ?? What about inplace .. if the op is inplace
                            # you don't actually ask for more memory!
                            for (idx, o) in enumerate(
@@ -411,7 +418,7 @@ class Stack(VM):
                                st = getattr(o[0], 'strides',
                                             'input no strides')
                                if (getattr(o[0], 'flags', False) and
-                                    o[0].flags.c_contiguous):
+                                        o[0].flags.c_contiguous):
                                    st = 'c'
                                elif (hasattr(data[0], 'is_c_contiguous') and
                                      data[0].is_c_contiguous()):
@@ -436,15 +443,16 @@ class Stack(VM):
                                if all(compute_map[v][0]
                                        for v in dependencies[i]):
                                    storage_map[i][0] = None
-                                    input_index.append(current_apply.inputs.index(i))
+                                    input_index.append(
+                                        current_apply.inputs.index(i))

-                                    #DO NOT set compute_map to 0
+                                    # DO NOT set compute_map to 0

-                                    #If values become False and the
+                                    # If values become False and the
                                    #current_apply is still in the
-                                    #stack, this will cause it to be
-                                    #recomputed! This can cause wrong value
-                                    #with some combination of inplace op.
+                                    # stack, this will cause it to be
+                                    # recomputed! This can cause wrong value
+                                    # with some combination of inplace op.
                                    compute_map[i][0] = 2
                                    if (config.warn.vm_gc_bug and
                                        current_apply in apply_stack and
@@ -452,12 +460,13 @@ class Stack(VM):
                                                'destroy_map',
                                                False)):
                                        warnings.warn(
-        "There was a bug that existed in the default Theano configuration,"
-        " only in the development version between July 5th 2012"
-        " and July 30th 2012. This was not in a released version."
-        " The bug was affecting this script.",
-        #The stack level is not good when inside a Scan.
-        stacklevel=3
+                                            "There was a bug that existed in the default Theano configuration,"
+                                            " only in the development version between July 5th 2012"
+                                            " and July 30th 2012. This was not in a released version."
+                                            " The bug was affecting this script.",
+                                            # The stack level is not good when
+                                            # inside a Scan.
+                                            stacklevel=3
                                        )
                    self.node_cleared_order.append(input_index)

@@ -465,9 +474,8 @@ class Stack(VM):
                    # -- Non-lazy case, need inputs
                    apply_stack.append(current_apply)
                    apply_stack.extend(inp.owner
-                            for inp in current_deps
-                            if inp.owner)
-
+                                       for inp in current_deps
+                                       if inp.owner)

            elif not computed_outs:
                #
@@ -511,7 +519,7 @@ class Stack(VM):
                            self.variable_shape[var] = sh
                            st = getattr(o[0], 'strides', 'input no strides')
                            if (getattr(o[0], 'flags', False) and
-                                o[0].flags.c_contiguous):
+                                    o[0].flags.c_contiguous):
                                st = 'c'
                            elif (hasattr(data[0], 'is_c_contiguous') and
                                  data[0].is_c_contiguous()):
@@ -523,7 +531,7 @@ class Stack(VM):
                    if self.allow_gc:
                        for i in current_apply.inputs:
                            if (dependencies[i] and i.owner and
-                                i not in self.outputs):
+                                    i not in self.outputs):
                                empty_storage_map = True
                                for x in dependencies[i]:
                                    if not compute_map[x][0]:
@@ -531,9 +539,10 @@ class Stack(VM):
                                        break
                                if empty_storage_map:
                                    storage_map[i][0] = None
-                                    input_index.append(current_apply.inputs.index(i)) 
-                                    #See the not lazy gc code for explanations
-                                    #of compute_map change
+                                    input_index.append(
+                                        current_apply.inputs.index(i))
+                                    # See the not lazy gc code for explanations
+                                    # of compute_map change
                                    compute_map[i][0] = 2

                    self.node_cleared_order.append(input_index)
@@ -560,6 +569,7 @@ try:
    import lazylinker_c

    class CVM(lazylinker_c.CLazyLinker, VM):
+
        def __init__(self, *args, **kwargs):
            lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs)
            # skip VM.__init__
@@ -576,6 +586,7 @@ except (OSError, theano.gof.cmodule.MissingGXX), e:


 class VM_Linker(link.LocalLinker):
+
    """
    Class that satisfies the Linker interface by acting as a VM factory.
    """
@@ -625,9 +636,9 @@ class VM_Linker(link.LocalLinker):
            associated to self, else, a new VM_Linker associated to fgraph.
        """
        if (config.profile and
-            hasattr(theano, 'sandbox') and
-            hasattr(theano.sandbox, 'cuda') and
-            theano.sandbox.cuda.cuda_enabled):
+                hasattr(theano, 'sandbox') and
+                hasattr(theano.sandbox, 'cuda') and
+                theano.sandbox.cuda.cuda_enabled):
            if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
                raise Exception(
                    "You are running the Theano profiler with CUDA enabled."
@@ -644,12 +655,12 @@ class VM_Linker(link.LocalLinker):
            # Warning: make sure to forward the correct values of
            # all parameters to __init__ here.
            return type(self)(
-                    allow_gc=self.allow_gc,
-                    use_cloop=self.use_cloop,
-                    callback=self.callback,
-                    lazy=self.lazy,
-                    schedule=self.schedule
-                    ).accept(fgraph, no_recycling)
+                allow_gc=self.allow_gc,
+                use_cloop=self.use_cloop,
+                callback=self.callback,
+                lazy=self.lazy,
+                schedule=self.schedule
+            ).accept(fgraph, no_recycling)
        self.fgraph = fgraph
        self.no_recycling = no_recycling
        return self
@@ -700,17 +711,17 @@ class VM_Linker(link.LocalLinker):
        return dependencies

    def make_vm(self, nodes, thunks,
-            input_storage, output_storage, storage_map,
-            post_thunk_clear,
-            computed,
-            compute_map,
-            updated_vars
-            ):
+                input_storage, output_storage, storage_map,
+                post_thunk_clear,
+                computed,
+                compute_map,
+                updated_vars
+                ):

        pre_call_clear = [storage_map[v] for v in self.no_recycling]

        if (self.callback is not None or
-            (config.profile and config.profile_memory)):
+                (config.profile and config.profile_memory)):

            if self.use_cloop and self.callback is not None:
                logger.warn('CVM does not support callback, using Stack VM.')
@@ -721,11 +732,11 @@ class VM_Linker(link.LocalLinker):
            if self.allow_gc:
                deps = self.compute_gc_dependencies(storage_map)
            vm = Stack(
-                    nodes, thunks, pre_call_clear,
-                    storage_map, compute_map,
-                    self.fgraph, self.allow_gc,
-                    dependencies=deps,
-                    callback=self.callback)
+                nodes, thunks, pre_call_clear,
+                storage_map, compute_map,
+                self.fgraph, self.allow_gc,
+                dependencies=deps,
+                callback=self.callback)
        elif self.use_cloop:
            # create a map from nodes to ints and vars to ints
            nodes_idx = {}
@@ -747,9 +758,9 @@ class VM_Linker(link.LocalLinker):
            # put storage_map and compute_map into a int-based scheme
            n_applies = len(nodes)
            storage_map_list = [storage_map[vars_idx_inv[i]]
-                    for i in xrange(len(vars_idx_inv))]
+                                for i in xrange(len(vars_idx_inv))]
            compute_map_list = [compute_map[vars_idx_inv[i]]
-                    for i in xrange(len(vars_idx_inv))]
+                                for i in xrange(len(vars_idx_inv))]
            if nodes:
                assert type(storage_map_list[0]) is list
                assert type(compute_map_list[0]) is list
@@ -796,7 +807,7 @@ class VM_Linker(link.LocalLinker):
                prereq_var_idxs = []
                for prereq_node in ords.get(node, []):
                    prereq_var_idxs.extend(
-                            [vars_idx[v] for v in prereq_node.outputs])
+                        [vars_idx[v] for v in prereq_node.outputs])
                prereq_var_idxs = list(set(prereq_var_idxs))
                prereq_var_idxs.sort()  # TODO: why sort?
                node_prereqs.append(prereq_var_idxs)
@@ -816,27 +827,27 @@ class VM_Linker(link.LocalLinker):

            c0 = sys.getrefcount(node_n_inputs)
            vm = CVM(
-                    nodes,
-                    thunks,
-                    pre_call_clear,
-                    allow_gc=self.allow_gc,
-                    call_counts=[0] * len(nodes),
-                    call_times=[0.0] * len(nodes),
-                    compute_map_list=compute_map_list,
-                    storage_map_list=storage_map_list,
-                    base_input_output_list=base_input_output_list,
-                    node_n_inputs=node_n_inputs,
-                    node_n_outputs=node_n_outputs,
-                    node_input_offset=node_input_offset,
-                    node_output_offset=node_output_offset,
-                    var_owner=var_owner,
-                    is_lazy_list=is_lazy_list,
-                    output_vars=output_vars,
-                    node_prereqs=node_prereqs,
-                    node_output_size=node_output_size,
-                    update_storage=update_storage,
-                    dependencies=dependency_map_list,
-                    )
+                nodes,
+                thunks,
+                pre_call_clear,
+                allow_gc=self.allow_gc,
+                call_counts=[0] * len(nodes),
+                call_times=[0.0] * len(nodes),
+                compute_map_list=compute_map_list,
+                storage_map_list=storage_map_list,
+                base_input_output_list=base_input_output_list,
+                node_n_inputs=node_n_inputs,
+                node_n_outputs=node_n_outputs,
+                node_input_offset=node_input_offset,
+                node_output_offset=node_output_offset,
+                var_owner=var_owner,
+                is_lazy_list=is_lazy_list,
+                output_vars=output_vars,
+                node_prereqs=node_prereqs,
+                node_output_size=node_output_size,
+                update_storage=update_storage,
+                dependencies=dependency_map_list,
+            )
            assert c0 == sys.getrefcount(node_n_inputs)
        else:
            lazy = self.lazy
@@ -848,36 +859,36 @@ class VM_Linker(link.LocalLinker):
                # there is no conditional in the graph
                if self.allow_gc:
                    vm = LoopGC(
-                            nodes,
-                            thunks,
-                            pre_call_clear,
-                            post_thunk_clear)
+                        nodes,
+                        thunks,
+                        pre_call_clear,
+                        post_thunk_clear)
                else:
                    vm = Loop(
-                            nodes,
-                            thunks,
-                            pre_call_clear)
+                        nodes,
+                        thunks,
+                        pre_call_clear)
            else:
                deps = None
                if self.allow_gc:
                    deps = self.compute_gc_dependencies(storage_map)
                vm = Stack(
-                        nodes, thunks, pre_call_clear,
-                        storage_map, compute_map,
-                        self.fgraph, self.allow_gc,
-                        dependencies=deps
-                        )
+                    nodes, thunks, pre_call_clear,
+                    storage_map, compute_map,
+                    self.fgraph, self.allow_gc,
+                    dependencies=deps
+                )
        return vm

    def make_all(self, profiler=None, input_storage=None,
                 output_storage=None,
-                ):
+                 ):
        fgraph = self.fgraph
        order = self.schedule(fgraph)
        no_recycling = self.no_recycling

        input_storage, output_storage, storage_map = link.map_storage(
-                fgraph, order, input_storage, output_storage)
+            fgraph, order, input_storage, output_storage)
        compute_map = {}
        for k in storage_map:
            compute_map[k] = [k.owner is None]
@@ -917,12 +928,12 @@ class VM_Linker(link.LocalLinker):
            post_thunk_clear = None

        vm = self.make_vm(order, thunks,
-                input_storage, output_storage, storage_map,
-                post_thunk_clear,
-                computed,
-                compute_map,
-                self.updated_vars
-                )
+                          input_storage, output_storage, storage_map,
+                          post_thunk_clear,
+                          computed,
+                          compute_map,
+                          self.updated_vars
+                          )

        vm.storage_map = storage_map