Merge pull request #2243 from RoyXue/fix_crash_and_GPU_support

Fix ifelse crash and gpu support

Merge pull request #2243 from RoyXue/fix_crash_and_GPU_support
b6410099 · Frédéric Bastien · 8971b5ca · e318dc8c · b6410099 · b6410099
--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -642,12 +642,15 @@ class ProfileStats(object):
            fct_shapes[node.fgraph].setdefault(node, [])
            sum_dense = 0
            for out in node.outputs:
+                if out in self.variable_shape.keys():
                    sh = self.variable_shape[out]
                    if hasattr(out.type, 'get_size'):
                        v = out.type.get_size(sh)
                        sum_dense += v
                    else:
-                    v = "Unknown"
+                        v = 'Unknown'
+                else:
+                    v = 'Variable isnt created'
                var_mem[out] = v
                fct_memory[node.fgraph][node].append(v)
@@ -656,14 +659,17 @@ class ProfileStats(object):
        # Find the function that used the most of that statistic
        max_sum_size = 0
-        max_node_memory_size = 0
-        max_running_max_memory_size = 0
+        # statistics with the old order
+        # TODO: Make list more flexible with multiple GPUs later
+        max_node_memory_size = [0, 0, 0]
+        max_running_max_memory_size = [0, 0, 0]
        max_node_memory_saved_by_view = 0
        max_node_memory_saved_by_inplace = 0
-        # statistic with the new order
+        # statistics with the new order
-        new_max_node_memory_size = 0
+        new_max_node_memory_size = [0, 0, 0]
-        new_max_running_max_memory_size = 0
+        new_max_running_max_memory_size = [0, 0, 0]
        new_max_node_memory_saved_by_view = 0
        new_max_node_memory_saved_by_inplace = 0
@@ -689,10 +695,11 @@ class ProfileStats(object):
                The sum of memory saved by reusing the input instead of
                new allocation
            """
+            from theano.sandbox.cuda import CudaNdarrayType
-            node_memory_size = 0
+            # Initial Mem info values [CPU, GPU]
-            running_memory_size = 0
+            node_memory_size = [0, 0]
-            running_max_memory_size = 0
+            running_memory_size = [0, 0]
+            running_max_memory_size = [0, 0]
            node_memory_saved_by_view = 0
            node_memory_saved_by_inplace = 0
            # This take only the inputs/outputs dependencies.
@@ -734,6 +741,10 @@ class ProfileStats(object):
                # allocated by the node
                idx2 = 0
                for out in node.outputs:
+                    if isinstance(out.type, CudaNdarrayType):
+                        cg = 1
+                    else:
+                        cg = 0
                    ins = None
                    if dmap and idx2 in dmap:
                        vidx = dmap[idx2]
@@ -757,30 +768,36 @@ class ProfileStats(object):
                        view_of[out] = origin
                        viewed_by[origin].append(out)
                    else:
-                        running_memory_size += var_mem[out]
+                        running_memory_size[cg] += var_mem[out]
-                        node_memory_size += var_mem[out]
+                        node_memory_size[cg] += var_mem[out]
                    idx2 += 1
-                running_max_memory_size = max(running_max_memory_size,
+                running_max_memory_size[0] = max(running_max_memory_size[0],
-                                              running_memory_size)
+                                                 running_memory_size[0])
+                running_max_memory_size[1] = max(running_max_memory_size[1],
+                                                 running_memory_size[1])
                # Mimic the combination of Theano and Python gc
                for ins in node.inputs:
                    assert not (ins in view_of and viewed_by[ins])
                    # we trac the original var, so this shouldn't happen
+                    if isinstance(ins.type, CudaNdarrayType):
+                        cg = 1
+                    else:
+                        cg = 0
                    if (dependencies[ins] and
                            ins not in fgraph.outputs and
                            ins.owner and
                            all([compute_map[v][0] for v in dependencies[ins]])):
                        if ins not in view_of and not viewed_by.get(ins, []):
-                            running_memory_size -= var_mem[ins]
+                            running_memory_size[cg] -= var_mem[ins]
                        elif ins in view_of:
                            origin = view_of[ins]
                            viewed_by[origin].remove(ins)
                            if (not viewed_by[origin] and
                                    origin not in fgraph.inputs and
                                    not isinstance(origin, theano.Constant)):
-                                running_memory_size -= var_mem[origin]
+                                running_memory_size[cg] -= var_mem[origin]
                    else:
                        # ins is viewed_by something else, so its
                        # memory isn't freed
@@ -994,24 +1011,46 @@ class ProfileStats(object):
            # Store the max of some stats by any function in this profile.
            max_sum_size = max(max_sum_size, sum_size)
-            max_node_memory_size = max(max_node_memory_size,
+            max_node_memory_size[0] = max(max_node_memory_size[0],
-                                       old_running_memory[0])
+                                          sum(old_running_memory[0]))
-            max_running_max_memory_size = max(max_running_max_memory_size,
+            max_running_max_memory_size[0] = max(max_running_max_memory_size[0],
-                                              old_running_memory[2])
+                                                 sum(old_running_memory[2]))
-            max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
-                                                old_running_memory[4])
+            # Separate CPU and GPU
+            max_node_memory_size[1] = max(max_node_memory_size[1],
+                                          old_running_memory[0][0])
+            max_node_memory_size[2] = max(max_node_memory_size[2],
+                                          old_running_memory[0][1])
+            max_running_max_memory_size[1] = max(max_running_max_memory_size[1],
+                                                 old_running_memory[2][0])
+            max_running_max_memory_size[2] = max(max_running_max_memory_size[2],
+                                                 old_running_memory[2][1])
            max_node_memory_saved_by_inplace = max(
                max_node_memory_saved_by_inplace, old_running_memory[3])
+            max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
+                                                old_running_memory[4])
            # Store max of some stats with new order
-            new_max_node_memory_size = max(new_max_node_memory_size,
+            new_max_node_memory_size[0] = max(new_max_node_memory_size[0],
-                                           new_running_memory[0])
+                                              sum(new_running_memory[0]))
-            new_max_running_max_memory_size = max(new_max_running_max_memory_size,
+            new_max_running_max_memory_size[0] = max(new_max_running_max_memory_size[0],
-                                                  new_running_memory[2])
+                                                     sum(new_running_memory[2]))
-            new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
-                                                    new_running_memory[4])
+            # Separate CPU and GPU
+            new_max_node_memory_size[1] = max(new_max_node_memory_size[1],
+                                              new_running_memory[0][0])
+            new_max_node_memory_size[2] = max(new_max_node_memory_size[2],
+                                              new_running_memory[0][1])
+            new_max_running_max_memory_size[1] = max(new_max_running_max_memory_size[1],
+                                                     new_running_memory[2][0])
+            new_max_running_max_memory_size[2] = max(new_max_running_max_memory_size[2],
+                                                     new_running_memory[2][1])
            new_max_node_memory_saved_by_inplace = max(
                new_max_node_memory_saved_by_inplace, new_running_memory[3])
+            new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
+                                                    new_running_memory[4])
            # Config: whether print min memory peak
            if config.profiling.min_peak_memory:
@@ -1035,13 +1074,30 @@ class ProfileStats(object):
        print >> file,  "---"
 #        print >> file,  "    Max if no gc, inplace and view: %dKB" % int(
 #            round(max_sum_size / 1024))
        print >> file,  "    Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round(
-            new_max_node_memory_size / 1024.)), int(round(
+            new_max_node_memory_size[0] / 1024.)), int(round(
-                max_node_memory_size / 1024.)))
+                max_node_memory_size[0] / 1024.)))
+        print >> file,  "    CPU: %dKB (%dKB)" % ((int(round(
+            new_max_node_memory_size[1] / 1024.)), int(round(
+                max_node_memory_size[1] / 1024.))))
+        print >> file,  "    GPU: %dKB (%dKB)" % ((int(round(
+            new_max_node_memory_size[2] / 1024.)), int(round(
+                max_node_memory_size[2] / 1024.))))
+        print >> file,  "---"
        print >> file,  "    Max if linker=cvm(default): %dKB (%dKB)" % (int(round(
-            new_max_running_max_memory_size / 1024.)), int(round(
+            new_max_running_max_memory_size[0] / 1024.)), int(round(
-                max_running_max_memory_size / 1024.)))
+                max_running_max_memory_size[0] / 1024.)))
+        print >> file,  "    CPU: %dKB (%dKB)" % ((int(round(
+            new_max_running_max_memory_size[1] / 1024.)), int(round(
+                max_running_max_memory_size[1] / 1024.))))
+        print >> file,  "    GPU: %dKB (%dKB)" % ((int(round(
+            new_max_running_max_memory_size[2] / 1024.)), int(round(
+                max_running_max_memory_size[2] / 1024.))))
+        print >> file,  "---"
        if min_max_peak:
            print >> file,  "    Minimum peak from all valid apply node order is %dKB(took %.3fs to compute)" % (int(round(
                min_max_peak / 1024.)), min_peak_time)
@@ -1052,8 +1108,10 @@ class ProfileStats(object):
            (int(round(new_max_node_memory_saved_by_inplace / 1024.)),
             int(round(max_node_memory_saved_by_inplace / 1024.)))
        print >> file,  "    Memory saved if gc is enabled: %dKB (%dKB)" % (int(
-            round(new_max_node_memory_size - new_max_running_max_memory_size) / 1024.), int(
+            round(new_max_node_memory_size[0] - new_max_running_max_memory_size[0]) / 1024.), int(
-            round(max_node_memory_size - max_running_max_memory_size) / 1024.))
+            round(max_node_memory_size[0] - max_running_max_memory_size[0]) / 1024.))
+        print >> file,  "---"
        if (hasattr(theano, 'sandbox') and
            hasattr(theano.sandbox, 'cuda') and

--- a/theano/compile/tests/test_profiling.py
+++ b/theano/compile/tests/test_profiling.py
@@ -8,6 +8,7 @@ import numpy
 import theano
 import theano.tensor as T
+from theano.ifelse import ifelse
 def test_profiling():
@@ -20,11 +21,11 @@ def test_profiling():
        theano.config.profile_memory = True
        theano.config.profiling.min_peak_memory = True
-        x = [T.dvector("val%i" % i) for i in range(3)]
+        x = [T.fvector("val%i" % i) for i in range(3)]
        z = []
-        z += [T.outer(x[i], x[i+1]).sum(axis=1) for i in range(len(x)-1)]
+        z += [T.outer(x[i], x[i + 1]).sum(axis=1) for i in range(len(x) - 1)]
-        z += [x[i] + x[i+1] for i in range(len(x)-1)]
+        z += [x[i] + x[i + 1] for i in range(len(x) - 1)]
        p = theano.ProfileStats(False)
@@ -36,7 +37,7 @@ def test_profiling():
        f = theano.function(x, z, profile=p, name="test_profiling",
                            mode=m)
-        inp = [numpy.arange(1024) + 1 for i in range(len(x))]
+        inp = [numpy.arange(1024, dtype='float32') + 1 for i in range(len(x))]
        output = f(*inp)
        buf = StringIO.StringIO()
@@ -46,8 +47,16 @@ def test_profiling():
        the_string = buf.getvalue()
        lines1 = [l for l in the_string.split("\n") if "Max if linker" in l]
        lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
-        assert "Max if linker=cvm(default): 8224KB (16408KB)" in the_string, (lines1, lines2)
+        if theano.config.device == 'cpu':
-        assert "Minimum peak from all valid apply node order is 8208KB" in the_string, (lines1, lines2)
+            assert "Max if linker=cvm(default): 4112KB (8204KB)" in the_string, (
+                lines1, lines2)
+            assert "Minimum peak from all valid apply node order is 4104KB" in the_string, (
+                lines1, lines2)
+        else:
+            assert "Max if linker=cvm(default): 8220KB (8220KB)" in the_string, (
+                lines1, lines2)
+            assert "Minimum peak from all valid apply node order is 4116KB" in the_string, (
+                lines1, lines2)
    finally:
        theano.config.profile = config1
@@ -55,5 +64,41 @@ def test_profiling():
        theano.config.profiling.min_peak_memory = config3
+def test_ifelse():
+    config1 = theano.config.profile
+    config2 = theano.config.profile_memory
+    try:
+        theano.config.profile = True
+        theano.config.profile_memory = True
+        a, b = T.scalars('a', 'b')
+        x, y = T.scalars('x', 'y')
+        z = ifelse(T.lt(a, b), x * 2, y * 2)
+        p = theano.ProfileStats(False)
+        if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
+            m = "FAST_RUN"
+        else:
+            m = None
+        f_ifelse = theano.function([a, b, x, y], z, profile=p, name="test_ifelse",
+                                   mode=m)
+        val1 = 0.
+        val2 = 1.
+        big_mat1 = 10
+        big_mat2 = 11
+        out = f_ifelse(val1, val2, big_mat1, big_mat2)
+    finally:
+        theano.config.profile = config1
+        theano.config.profile_memory = config2
 if __name__ == '__main__':
    test_profiling()
+    test_ifelse()
--- a/theano/gof/vm.py
+++ b/theano/gof/vm.py
@@ -56,6 +56,7 @@ raise_with_op = link.raise_with_op
 class VM(object):
    """
    A VM object's __call__ method evaluates a Theano program.
@@ -83,6 +84,7 @@ class VM(object):
        storage. False means it *must not* repeat that feedback.
    """
    def __init__(self, nodes, thunks, pre_call_clear):
        """
        Allocate a virtual machine.
@@ -159,10 +161,12 @@ class VM(object):
 class Loop(VM):
    """
    Unconditional start-to-finish program execution in Python.
    No garbage collection is allowed on intermediate results.
    """
    def __call__(self):
        if self.time_thunks:
            for cont in self.pre_call_clear:
@@ -188,10 +192,12 @@ class Loop(VM):
 class LoopGC(VM):
    """
    Unconditional start-to-finish program execution in Python.
    Garbage collection is possible on intermediate results.
    """
    def __init__(self, nodes, thunks, pre_call_clear, post_thunk_clear):
        super(LoopGC, self).__init__(nodes, thunks, pre_call_clear)
        self.post_thunk_clear = post_thunk_clear
@@ -231,6 +237,7 @@ class LoopGC(VM):
 class Stack(VM):
    """
    Finish-to-start evalution order of thunks.
@@ -340,7 +347,7 @@ class Stack(VM):
        apply_stack = list(self.base_apply_stack)
        last_apply_stack_len = -1
-        #This record all function inputs/shared varibles and constants
+        # This records all function inputs/shared variables and constants
        for var, data in self.storage_map.iteritems():
            if data[0] is None:
                continue
@@ -396,7 +403,7 @@ class Stack(VM):
                            current_idx = self.node_idx[current_apply]
                            self.call_counts[current_idx] += 1
                            self.call_times[current_idx] += dt
-                            ## Computing the memory footprint of the the op
+                            # Computing the memory footprint of the op
                            # ?? What about inplace .. if the op is inplace
                            # you don't actually ask for more memory!
                            for (idx, o) in enumerate(
@@ -436,15 +443,16 @@ class Stack(VM):
                                if all(compute_map[v][0]
                                        for v in dependencies[i]):
                                    storage_map[i][0] = None
-                                    input_index.append(current_apply.inputs.index(i))
+                                    input_index.append(
+                                        current_apply.inputs.index(i))
-                                    #DO NOT set compute_map to 0
+                                    # DO NOT set compute_map to 0
-                                    #If values become False and the
+                                    # If values become False and the
                                    #current_apply is still in the
-                                    #stack, this will cause it to be
+                                    # stack, this will cause it to be
-                                    #recomputed! This can cause wrong value
+                                    # recomputed! This can cause wrong value
-                                    #with some combination of inplace op.
+                                    # with some combination of inplace op.
                                    compute_map[i][0] = 2
                                    if (config.warn.vm_gc_bug and
                                        current_apply in apply_stack and
@@ -456,7 +464,8 @@ class Stack(VM):
                                            " only in the development version between July 5th 2012"
                                            " and July 30th 2012. This was not in a released version."
                                            " The bug was affecting this script.",
-        #The stack level is not good when inside a Scan.
+                                            # The stack level is not good when
+                                            # inside a Scan.
                                            stacklevel=3
                                        )
                    self.node_cleared_order.append(input_index)
@@ -468,7 +477,6 @@ class Stack(VM):
                                       for inp in current_deps
                                       if inp.owner)
            elif not computed_outs:
                #
                # stack loop: Lazy Evaluation Case
@@ -531,9 +539,10 @@ class Stack(VM):
                                        break
                                if empty_storage_map:
                                    storage_map[i][0] = None
-                                    input_index.append(current_apply.inputs.index(i)) 
+                                    input_index.append(
-                                    #See the not lazy gc code for explanations
+                                        current_apply.inputs.index(i))
-                                    #of compute_map change
+                                    # See the not lazy gc code for explanations
+                                    # of compute_map change
                                    compute_map[i][0] = 2
                    self.node_cleared_order.append(input_index)
@@ -560,6 +569,7 @@ try:
    import lazylinker_c
    class CVM(lazylinker_c.CLazyLinker, VM):
        def __init__(self, *args, **kwargs):
            lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs)
            # skip VM.__init__
@@ -576,6 +586,7 @@ except (OSError, theano.gof.cmodule.MissingGXX), e:
 class VM_Linker(link.LocalLinker):
    """
    Class that satisfies the Linker interface by acting as a VM factory.
    """