提交 b6410099 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2243 from RoyXue/fix_crash_and_GPU_support

Fix ifelse crash and gpu support
...@@ -642,12 +642,15 @@ class ProfileStats(object): ...@@ -642,12 +642,15 @@ class ProfileStats(object):
fct_shapes[node.fgraph].setdefault(node, []) fct_shapes[node.fgraph].setdefault(node, [])
sum_dense = 0 sum_dense = 0
for out in node.outputs: for out in node.outputs:
if out in self.variable_shape.keys():
sh = self.variable_shape[out] sh = self.variable_shape[out]
if hasattr(out.type, 'get_size'): if hasattr(out.type, 'get_size'):
v = out.type.get_size(sh) v = out.type.get_size(sh)
sum_dense += v sum_dense += v
else: else:
v = "Unknown" v = 'Unknown'
else:
v = 'Variable isnt created'
var_mem[out] = v var_mem[out] = v
fct_memory[node.fgraph][node].append(v) fct_memory[node.fgraph][node].append(v)
...@@ -656,14 +659,17 @@ class ProfileStats(object): ...@@ -656,14 +659,17 @@ class ProfileStats(object):
# Find the function that used the most of that statistic # Find the function that used the most of that statistic
max_sum_size = 0 max_sum_size = 0
max_node_memory_size = 0
max_running_max_memory_size = 0 # statistics with the old order
# TODO: Make list more flexible with multiple GPUs later
max_node_memory_size = [0, 0, 0]
max_running_max_memory_size = [0, 0, 0]
max_node_memory_saved_by_view = 0 max_node_memory_saved_by_view = 0
max_node_memory_saved_by_inplace = 0 max_node_memory_saved_by_inplace = 0
# statistic with the new order # statistics with the new order
new_max_node_memory_size = 0 new_max_node_memory_size = [0, 0, 0]
new_max_running_max_memory_size = 0 new_max_running_max_memory_size = [0, 0, 0]
new_max_node_memory_saved_by_view = 0 new_max_node_memory_saved_by_view = 0
new_max_node_memory_saved_by_inplace = 0 new_max_node_memory_saved_by_inplace = 0
...@@ -689,10 +695,11 @@ class ProfileStats(object): ...@@ -689,10 +695,11 @@ class ProfileStats(object):
The sum of memory saved by reusing the input instead of The sum of memory saved by reusing the input instead of
new allocation new allocation
""" """
from theano.sandbox.cuda import CudaNdarrayType
node_memory_size = 0 # Initial Mem info values [CPU, GPU]
running_memory_size = 0 node_memory_size = [0, 0]
running_max_memory_size = 0 running_memory_size = [0, 0]
running_max_memory_size = [0, 0]
node_memory_saved_by_view = 0 node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0 node_memory_saved_by_inplace = 0
# This take only the inputs/outputs dependencies. # This take only the inputs/outputs dependencies.
...@@ -734,6 +741,10 @@ class ProfileStats(object): ...@@ -734,6 +741,10 @@ class ProfileStats(object):
# allocated by the node # allocated by the node
idx2 = 0 idx2 = 0
for out in node.outputs: for out in node.outputs:
if isinstance(out.type, CudaNdarrayType):
cg = 1
else:
cg = 0
ins = None ins = None
if dmap and idx2 in dmap: if dmap and idx2 in dmap:
vidx = dmap[idx2] vidx = dmap[idx2]
...@@ -757,30 +768,36 @@ class ProfileStats(object): ...@@ -757,30 +768,36 @@ class ProfileStats(object):
view_of[out] = origin view_of[out] = origin
viewed_by[origin].append(out) viewed_by[origin].append(out)
else: else:
running_memory_size += var_mem[out] running_memory_size[cg] += var_mem[out]
node_memory_size += var_mem[out] node_memory_size[cg] += var_mem[out]
idx2 += 1 idx2 += 1
running_max_memory_size = max(running_max_memory_size, running_max_memory_size[0] = max(running_max_memory_size[0],
running_memory_size) running_memory_size[0])
running_max_memory_size[1] = max(running_max_memory_size[1],
running_memory_size[1])
# Mimic the combination of Theano and Python gc # Mimic the combination of Theano and Python gc
for ins in node.inputs: for ins in node.inputs:
assert not (ins in view_of and viewed_by[ins]) assert not (ins in view_of and viewed_by[ins])
# we track the original var, so this shouldn't happen # we track the original var, so this shouldn't happen
if isinstance(ins.type, CudaNdarrayType):
cg = 1
else:
cg = 0
if (dependencies[ins] and if (dependencies[ins] and
ins not in fgraph.outputs and ins not in fgraph.outputs and
ins.owner and ins.owner and
all([compute_map[v][0] for v in dependencies[ins]])): all([compute_map[v][0] for v in dependencies[ins]])):
if ins not in view_of and not viewed_by.get(ins, []): if ins not in view_of and not viewed_by.get(ins, []):
running_memory_size -= var_mem[ins] running_memory_size[cg] -= var_mem[ins]
elif ins in view_of: elif ins in view_of:
origin = view_of[ins] origin = view_of[ins]
viewed_by[origin].remove(ins) viewed_by[origin].remove(ins)
if (not viewed_by[origin] and if (not viewed_by[origin] and
origin not in fgraph.inputs and origin not in fgraph.inputs and
not isinstance(origin, theano.Constant)): not isinstance(origin, theano.Constant)):
running_memory_size -= var_mem[origin] running_memory_size[cg] -= var_mem[origin]
else: else:
# ins is viewed_by something else, so its # ins is viewed_by something else, so its
# memory isn't freed # memory isn't freed
...@@ -994,24 +1011,46 @@ class ProfileStats(object): ...@@ -994,24 +1011,46 @@ class ProfileStats(object):
# Store the max of some stats by any function in this profile. # Store the max of some stats by any function in this profile.
max_sum_size = max(max_sum_size, sum_size) max_sum_size = max(max_sum_size, sum_size)
max_node_memory_size = max(max_node_memory_size, max_node_memory_size[0] = max(max_node_memory_size[0],
old_running_memory[0]) sum(old_running_memory[0]))
max_running_max_memory_size = max(max_running_max_memory_size, max_running_max_memory_size[0] = max(max_running_max_memory_size[0],
old_running_memory[2]) sum(old_running_memory[2]))
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
old_running_memory[4]) # Separate CPU and GPU
max_node_memory_size[1] = max(max_node_memory_size[1],
old_running_memory[0][0])
max_node_memory_size[2] = max(max_node_memory_size[2],
old_running_memory[0][1])
max_running_max_memory_size[1] = max(max_running_max_memory_size[1],
old_running_memory[2][0])
max_running_max_memory_size[2] = max(max_running_max_memory_size[2],
old_running_memory[2][1])
max_node_memory_saved_by_inplace = max( max_node_memory_saved_by_inplace = max(
max_node_memory_saved_by_inplace, old_running_memory[3]) max_node_memory_saved_by_inplace, old_running_memory[3])
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
old_running_memory[4])
# Store max of some stats with new order # Store max of some stats with new order
new_max_node_memory_size = max(new_max_node_memory_size, new_max_node_memory_size[0] = max(new_max_node_memory_size[0],
new_running_memory[0]) sum(new_running_memory[0]))
new_max_running_max_memory_size = max(new_max_running_max_memory_size, new_max_running_max_memory_size[0] = max(new_max_running_max_memory_size[0],
new_running_memory[2]) sum(new_running_memory[2]))
new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
new_running_memory[4]) # Separate CPU and GPU
new_max_node_memory_size[1] = max(new_max_node_memory_size[1],
new_running_memory[0][0])
new_max_node_memory_size[2] = max(new_max_node_memory_size[2],
new_running_memory[0][1])
new_max_running_max_memory_size[1] = max(new_max_running_max_memory_size[1],
new_running_memory[2][0])
new_max_running_max_memory_size[2] = max(new_max_running_max_memory_size[2],
new_running_memory[2][1])
new_max_node_memory_saved_by_inplace = max( new_max_node_memory_saved_by_inplace = max(
new_max_node_memory_saved_by_inplace, new_running_memory[3]) new_max_node_memory_saved_by_inplace, new_running_memory[3])
new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
new_running_memory[4])
# Config: whether print min memory peak # Config: whether print min memory peak
if config.profiling.min_peak_memory: if config.profiling.min_peak_memory:
...@@ -1035,13 +1074,30 @@ class ProfileStats(object): ...@@ -1035,13 +1074,30 @@ class ProfileStats(object):
print >> file, "---" print >> file, "---"
# print >> file, " Max if no gc, inplace and view: %dKB" % int( # print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024)) # round(max_sum_size / 1024))
print >> file, " Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round( print >> file, " Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round(
new_max_node_memory_size / 1024.)), int(round( new_max_node_memory_size[0] / 1024.)), int(round(
max_node_memory_size / 1024.))) max_node_memory_size[0] / 1024.)))
print >> file, " CPU: %dKB (%dKB)" % ((int(round(
new_max_node_memory_size[1] / 1024.)), int(round(
max_node_memory_size[1] / 1024.))))
print >> file, " GPU: %dKB (%dKB)" % ((int(round(
new_max_node_memory_size[2] / 1024.)), int(round(
max_node_memory_size[2] / 1024.))))
print >> file, "---"
print >> file, " Max if linker=cvm(default): %dKB (%dKB)" % (int(round( print >> file, " Max if linker=cvm(default): %dKB (%dKB)" % (int(round(
new_max_running_max_memory_size / 1024.)), int(round( new_max_running_max_memory_size[0] / 1024.)), int(round(
max_running_max_memory_size / 1024.))) max_running_max_memory_size[0] / 1024.)))
print >> file, " CPU: %dKB (%dKB)" % ((int(round(
new_max_running_max_memory_size[1] / 1024.)), int(round(
max_running_max_memory_size[1] / 1024.))))
print >> file, " GPU: %dKB (%dKB)" % ((int(round(
new_max_running_max_memory_size[2] / 1024.)), int(round(
max_running_max_memory_size[2] / 1024.))))
print >> file, "---"
if min_max_peak: if min_max_peak:
print >> file, " Minimum peak from all valid apply node order is %dKB(took %.3fs to compute)" % (int(round( print >> file, " Minimum peak from all valid apply node order is %dKB(took %.3fs to compute)" % (int(round(
min_max_peak / 1024.)), min_peak_time) min_max_peak / 1024.)), min_peak_time)
...@@ -1052,8 +1108,10 @@ class ProfileStats(object): ...@@ -1052,8 +1108,10 @@ class ProfileStats(object):
(int(round(new_max_node_memory_saved_by_inplace / 1024.)), (int(round(new_max_node_memory_saved_by_inplace / 1024.)),
int(round(max_node_memory_saved_by_inplace / 1024.))) int(round(max_node_memory_saved_by_inplace / 1024.)))
print >> file, " Memory saved if gc is enabled: %dKB (%dKB)" % (int( print >> file, " Memory saved if gc is enabled: %dKB (%dKB)" % (int(
round(new_max_node_memory_size - new_max_running_max_memory_size) / 1024.), int( round(new_max_node_memory_size[0] - new_max_running_max_memory_size[0]) / 1024.), int(
round(max_node_memory_size - max_running_max_memory_size) / 1024.)) round(max_node_memory_size[0] - max_running_max_memory_size[0]) / 1024.))
print >> file, "---"
if (hasattr(theano, 'sandbox') and if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and hasattr(theano.sandbox, 'cuda') and
......
...@@ -8,6 +8,7 @@ import numpy ...@@ -8,6 +8,7 @@ import numpy
import theano import theano
import theano.tensor as T import theano.tensor as T
from theano.ifelse import ifelse
def test_profiling(): def test_profiling():
...@@ -20,11 +21,11 @@ def test_profiling(): ...@@ -20,11 +21,11 @@ def test_profiling():
theano.config.profile_memory = True theano.config.profile_memory = True
theano.config.profiling.min_peak_memory = True theano.config.profiling.min_peak_memory = True
x = [T.dvector("val%i" % i) for i in range(3)] x = [T.fvector("val%i" % i) for i in range(3)]
z = [] z = []
z += [T.outer(x[i], x[i+1]).sum(axis=1) for i in range(len(x)-1)] z += [T.outer(x[i], x[i + 1]).sum(axis=1) for i in range(len(x) - 1)]
z += [x[i] + x[i+1] for i in range(len(x)-1)] z += [x[i] + x[i + 1] for i in range(len(x) - 1)]
p = theano.ProfileStats(False) p = theano.ProfileStats(False)
...@@ -36,7 +37,7 @@ def test_profiling(): ...@@ -36,7 +37,7 @@ def test_profiling():
f = theano.function(x, z, profile=p, name="test_profiling", f = theano.function(x, z, profile=p, name="test_profiling",
mode=m) mode=m)
inp = [numpy.arange(1024) + 1 for i in range(len(x))] inp = [numpy.arange(1024, dtype='float32') + 1 for i in range(len(x))]
output = f(*inp) output = f(*inp)
buf = StringIO.StringIO() buf = StringIO.StringIO()
...@@ -46,8 +47,16 @@ def test_profiling(): ...@@ -46,8 +47,16 @@ def test_profiling():
the_string = buf.getvalue() the_string = buf.getvalue()
lines1 = [l for l in the_string.split("\n") if "Max if linker" in l] lines1 = [l for l in the_string.split("\n") if "Max if linker" in l]
lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l] lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
assert "Max if linker=cvm(default): 8224KB (16408KB)" in the_string, (lines1, lines2) if theano.config.device == 'cpu':
assert "Minimum peak from all valid apply node order is 8208KB" in the_string, (lines1, lines2) assert "Max if linker=cvm(default): 4112KB (8204KB)" in the_string, (
lines1, lines2)
assert "Minimum peak from all valid apply node order is 4104KB" in the_string, (
lines1, lines2)
else:
assert "Max if linker=cvm(default): 8220KB (8220KB)" in the_string, (
lines1, lines2)
assert "Minimum peak from all valid apply node order is 4116KB" in the_string, (
lines1, lines2)
finally: finally:
theano.config.profile = config1 theano.config.profile = config1
...@@ -55,5 +64,41 @@ def test_profiling(): ...@@ -55,5 +64,41 @@ def test_profiling():
theano.config.profiling.min_peak_memory = config3 theano.config.profiling.min_peak_memory = config3
def test_ifelse():
    """Profile a function built around a lazy ``ifelse`` and check its result.

    The test enables ``profile`` and ``profile_memory``, compiles
    ``ifelse(a < b, x * 2, y * 2)`` with an explicit ``ProfileStats``
    object, calls it once, and verifies that the branch selected by the
    condition is the one returned.  Both config flags are restored
    afterwards, even if the test fails.

    NOTE(review): presumably this guards against the memory profiler
    failing on outputs of the branch that is never evaluated -- confirm
    against the ProfileStats change that accompanies this test.
    """
    # Remember the global flags so they can be restored in ``finally``.
    config1 = theano.config.profile
    config2 = theano.config.profile_memory

    try:
        theano.config.profile = True
        theano.config.profile_memory = True

        a, b = T.scalars('a', 'b')
        x, y = T.scalars('x', 'y')

        z = ifelse(T.lt(a, b), x * 2, y * 2)

        p = theano.ProfileStats(False)

        # These modes are replaced by FAST_RUN; any other mode is left to
        # the configured default (mode=None).
        if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
            m = "FAST_RUN"
        else:
            m = None

        f_ifelse = theano.function([a, b, x, y], z, profile=p,
                                   name="test_ifelse", mode=m)

        val1 = 0.
        val2 = 1.
        x_val = 10
        y_val = 11

        out = f_ifelse(val1, val2, x_val, y_val)

        # val1 < val2, so the "then" branch (x * 2) must be the result.
        assert out == 2 * x_val, out

    finally:
        theano.config.profile = config1
        theano.config.profile_memory = config2
if __name__ == '__main__': if __name__ == '__main__':
test_profiling() test_profiling()
test_ifelse()
...@@ -56,6 +56,7 @@ raise_with_op = link.raise_with_op ...@@ -56,6 +56,7 @@ raise_with_op = link.raise_with_op
class VM(object): class VM(object):
""" """
A VM object's __call__ method evaluates a Theano program. A VM object's __call__ method evaluates a Theano program.
...@@ -83,6 +84,7 @@ class VM(object): ...@@ -83,6 +84,7 @@ class VM(object):
storage. False means it *must not* repeat that feedback. storage. False means it *must not* repeat that feedback.
""" """
def __init__(self, nodes, thunks, pre_call_clear): def __init__(self, nodes, thunks, pre_call_clear):
""" """
Allocate a virtual machine. Allocate a virtual machine.
...@@ -159,10 +161,12 @@ class VM(object): ...@@ -159,10 +161,12 @@ class VM(object):
class Loop(VM): class Loop(VM):
""" """
Unconditional start-to-finish program execution in Python. Unconditional start-to-finish program execution in Python.
No garbage collection is allowed on intermediate results. No garbage collection is allowed on intermediate results.
""" """
def __call__(self): def __call__(self):
if self.time_thunks: if self.time_thunks:
for cont in self.pre_call_clear: for cont in self.pre_call_clear:
...@@ -188,10 +192,12 @@ class Loop(VM): ...@@ -188,10 +192,12 @@ class Loop(VM):
class LoopGC(VM): class LoopGC(VM):
""" """
Unconditional start-to-finish program execution in Python. Unconditional start-to-finish program execution in Python.
Garbage collection is possible on intermediate results. Garbage collection is possible on intermediate results.
""" """
def __init__(self, nodes, thunks, pre_call_clear, post_thunk_clear): def __init__(self, nodes, thunks, pre_call_clear, post_thunk_clear):
super(LoopGC, self).__init__(nodes, thunks, pre_call_clear) super(LoopGC, self).__init__(nodes, thunks, pre_call_clear)
self.post_thunk_clear = post_thunk_clear self.post_thunk_clear = post_thunk_clear
...@@ -231,6 +237,7 @@ class LoopGC(VM): ...@@ -231,6 +237,7 @@ class LoopGC(VM):
class Stack(VM): class Stack(VM):
""" """
Finish-to-start evaluation order of thunks. Finish-to-start evaluation order of thunks.
...@@ -340,7 +347,7 @@ class Stack(VM): ...@@ -340,7 +347,7 @@ class Stack(VM):
apply_stack = list(self.base_apply_stack) apply_stack = list(self.base_apply_stack)
last_apply_stack_len = -1 last_apply_stack_len = -1
#This records all function inputs/shared variables and constants # This records all function inputs/shared variables and constants
for var, data in self.storage_map.iteritems(): for var, data in self.storage_map.iteritems():
if data[0] is None: if data[0] is None:
continue continue
...@@ -396,7 +403,7 @@ class Stack(VM): ...@@ -396,7 +403,7 @@ class Stack(VM):
current_idx = self.node_idx[current_apply] current_idx = self.node_idx[current_apply]
self.call_counts[current_idx] += 1 self.call_counts[current_idx] += 1
self.call_times[current_idx] += dt self.call_times[current_idx] += dt
## Computing the memory footprint of the op # Computing the memory footprint of the op
# ?? What about inplace .. if the op is inplace # ?? What about inplace .. if the op is inplace
# you don't actually ask for more memory! # you don't actually ask for more memory!
for (idx, o) in enumerate( for (idx, o) in enumerate(
...@@ -436,15 +443,16 @@ class Stack(VM): ...@@ -436,15 +443,16 @@ class Stack(VM):
if all(compute_map[v][0] if all(compute_map[v][0]
for v in dependencies[i]): for v in dependencies[i]):
storage_map[i][0] = None storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i)) input_index.append(
current_apply.inputs.index(i))
#DO NOT set compute_map to 0 # DO NOT set compute_map to 0
#If values become False and the # If values become False and the
#current_apply is still in the #current_apply is still in the
#stack, this will cause it to be # stack, this will cause it to be
#recomputed! This can cause wrong value # recomputed! This can cause wrong value
#with some combination of inplace op. # with some combination of inplace op.
compute_map[i][0] = 2 compute_map[i][0] = 2
if (config.warn.vm_gc_bug and if (config.warn.vm_gc_bug and
current_apply in apply_stack and current_apply in apply_stack and
...@@ -456,7 +464,8 @@ class Stack(VM): ...@@ -456,7 +464,8 @@ class Stack(VM):
" only in the development version between July 5th 2012" " only in the development version between July 5th 2012"
" and July 30th 2012. This was not in a released version." " and July 30th 2012. This was not in a released version."
" The bug was affecting this script.", " The bug was affecting this script.",
#The stack level is not good when inside a Scan. # The stack level is not good when
# inside a Scan.
stacklevel=3 stacklevel=3
) )
self.node_cleared_order.append(input_index) self.node_cleared_order.append(input_index)
...@@ -468,7 +477,6 @@ class Stack(VM): ...@@ -468,7 +477,6 @@ class Stack(VM):
for inp in current_deps for inp in current_deps
if inp.owner) if inp.owner)
elif not computed_outs: elif not computed_outs:
# #
# stack loop: Lazy Evaluation Case # stack loop: Lazy Evaluation Case
...@@ -531,9 +539,10 @@ class Stack(VM): ...@@ -531,9 +539,10 @@ class Stack(VM):
break break
if empty_storage_map: if empty_storage_map:
storage_map[i][0] = None storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i)) input_index.append(
#See the not lazy gc code for explanations current_apply.inputs.index(i))
#of compute_map change # See the not lazy gc code for explanations
# of compute_map change
compute_map[i][0] = 2 compute_map[i][0] = 2
self.node_cleared_order.append(input_index) self.node_cleared_order.append(input_index)
...@@ -560,6 +569,7 @@ try: ...@@ -560,6 +569,7 @@ try:
import lazylinker_c import lazylinker_c
class CVM(lazylinker_c.CLazyLinker, VM): class CVM(lazylinker_c.CLazyLinker, VM):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs) lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs)
# skip VM.__init__ # skip VM.__init__
...@@ -576,6 +586,7 @@ except (OSError, theano.gof.cmodule.MissingGXX), e: ...@@ -576,6 +586,7 @@ except (OSError, theano.gof.cmodule.MissingGXX), e:
class VM_Linker(link.LocalLinker): class VM_Linker(link.LocalLinker):
""" """
Class that satisfies the Linker interface by acting as a VM factory. Class that satisfies the Linker interface by acting as a VM factory.
""" """
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论