提交 b6410099 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2243 from RoyXue/fix_crash_and_GPU_support

Fix ifelse crash and gpu support
......@@ -642,12 +642,15 @@ class ProfileStats(object):
fct_shapes[node.fgraph].setdefault(node, [])
sum_dense = 0
for out in node.outputs:
if out in self.variable_shape.keys():
sh = self.variable_shape[out]
if hasattr(out.type, 'get_size'):
v = out.type.get_size(sh)
sum_dense += v
else:
v = "Unknown"
v = 'Unknown'
else:
v = 'Variable isnt created'
var_mem[out] = v
fct_memory[node.fgraph][node].append(v)
......@@ -656,14 +659,17 @@ class ProfileStats(object):
# Find the function that used the most of that statistic
max_sum_size = 0
max_node_memory_size = 0
max_running_max_memory_size = 0
# statistics with the old order
# TODO: Make list more flexible with multiple GPUs later
max_node_memory_size = [0, 0, 0]
max_running_max_memory_size = [0, 0, 0]
max_node_memory_saved_by_view = 0
max_node_memory_saved_by_inplace = 0
# statistic with the new order
new_max_node_memory_size = 0
new_max_running_max_memory_size = 0
# statistics with the new order
new_max_node_memory_size = [0, 0, 0]
new_max_running_max_memory_size = [0, 0, 0]
new_max_node_memory_saved_by_view = 0
new_max_node_memory_saved_by_inplace = 0
......@@ -689,10 +695,11 @@ class ProfileStats(object):
The sum of memory saved by reusing the input instead of
new allocation
"""
node_memory_size = 0
running_memory_size = 0
running_max_memory_size = 0
from theano.sandbox.cuda import CudaNdarrayType
# Initial Mem info values [CPU, GPU]
node_memory_size = [0, 0]
running_memory_size = [0, 0]
running_max_memory_size = [0, 0]
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
# This take only the inputs/outputs dependencies.
......@@ -734,6 +741,10 @@ class ProfileStats(object):
# allocated by the node
idx2 = 0
for out in node.outputs:
if isinstance(out.type, CudaNdarrayType):
cg = 1
else:
cg = 0
ins = None
if dmap and idx2 in dmap:
vidx = dmap[idx2]
......@@ -757,30 +768,36 @@ class ProfileStats(object):
view_of[out] = origin
viewed_by[origin].append(out)
else:
running_memory_size += var_mem[out]
node_memory_size += var_mem[out]
running_memory_size[cg] += var_mem[out]
node_memory_size[cg] += var_mem[out]
idx2 += 1
running_max_memory_size = max(running_max_memory_size,
running_memory_size)
running_max_memory_size[0] = max(running_max_memory_size[0],
running_memory_size[0])
running_max_memory_size[1] = max(running_max_memory_size[1],
running_memory_size[1])
# Mimic the combination of Theano and Python gc
for ins in node.inputs:
assert not (ins in view_of and viewed_by[ins])
# we track the original var, so this shouldn't happen
if isinstance(ins.type, CudaNdarrayType):
cg = 1
else:
cg = 0
if (dependencies[ins] and
ins not in fgraph.outputs and
ins.owner and
all([compute_map[v][0] for v in dependencies[ins]])):
if ins not in view_of and not viewed_by.get(ins, []):
running_memory_size -= var_mem[ins]
running_memory_size[cg] -= var_mem[ins]
elif ins in view_of:
origin = view_of[ins]
viewed_by[origin].remove(ins)
if (not viewed_by[origin] and
origin not in fgraph.inputs and
not isinstance(origin, theano.Constant)):
running_memory_size -= var_mem[origin]
running_memory_size[cg] -= var_mem[origin]
else:
# ins is viewed_by something else, so its
# memory isn't freed
......@@ -994,24 +1011,46 @@ class ProfileStats(object):
# Store the max of some stats by any function in this profile.
max_sum_size = max(max_sum_size, sum_size)
max_node_memory_size = max(max_node_memory_size,
old_running_memory[0])
max_running_max_memory_size = max(max_running_max_memory_size,
old_running_memory[2])
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
old_running_memory[4])
max_node_memory_size[0] = max(max_node_memory_size[0],
sum(old_running_memory[0]))
max_running_max_memory_size[0] = max(max_running_max_memory_size[0],
sum(old_running_memory[2]))
# Separate CPU and GPU
max_node_memory_size[1] = max(max_node_memory_size[1],
old_running_memory[0][0])
max_node_memory_size[2] = max(max_node_memory_size[2],
old_running_memory[0][1])
max_running_max_memory_size[1] = max(max_running_max_memory_size[1],
old_running_memory[2][0])
max_running_max_memory_size[2] = max(max_running_max_memory_size[2],
old_running_memory[2][1])
max_node_memory_saved_by_inplace = max(
max_node_memory_saved_by_inplace, old_running_memory[3])
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
old_running_memory[4])
# Store max of some stats with new order
new_max_node_memory_size = max(new_max_node_memory_size,
new_running_memory[0])
new_max_running_max_memory_size = max(new_max_running_max_memory_size,
new_running_memory[2])
new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
new_running_memory[4])
new_max_node_memory_size[0] = max(new_max_node_memory_size[0],
sum(new_running_memory[0]))
new_max_running_max_memory_size[0] = max(new_max_running_max_memory_size[0],
sum(new_running_memory[2]))
# Separate CPU and GPU
new_max_node_memory_size[1] = max(new_max_node_memory_size[1],
new_running_memory[0][0])
new_max_node_memory_size[2] = max(new_max_node_memory_size[2],
new_running_memory[0][1])
new_max_running_max_memory_size[1] = max(new_max_running_max_memory_size[1],
new_running_memory[2][0])
new_max_running_max_memory_size[2] = max(new_max_running_max_memory_size[2],
new_running_memory[2][1])
new_max_node_memory_saved_by_inplace = max(
new_max_node_memory_saved_by_inplace, new_running_memory[3])
new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
new_running_memory[4])
# Config: whether print min memory peak
if config.profiling.min_peak_memory:
......@@ -1035,13 +1074,30 @@ class ProfileStats(object):
print >> file, "---"
# print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024))
print >> file, " Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round(
new_max_node_memory_size / 1024.)), int(round(
max_node_memory_size / 1024.)))
new_max_node_memory_size[0] / 1024.)), int(round(
max_node_memory_size[0] / 1024.)))
print >> file, " CPU: %dKB (%dKB)" % ((int(round(
new_max_node_memory_size[1] / 1024.)), int(round(
max_node_memory_size[1] / 1024.))))
print >> file, " GPU: %dKB (%dKB)" % ((int(round(
new_max_node_memory_size[2] / 1024.)), int(round(
max_node_memory_size[2] / 1024.))))
print >> file, "---"
print >> file, " Max if linker=cvm(default): %dKB (%dKB)" % (int(round(
new_max_running_max_memory_size / 1024.)), int(round(
max_running_max_memory_size / 1024.)))
new_max_running_max_memory_size[0] / 1024.)), int(round(
max_running_max_memory_size[0] / 1024.)))
print >> file, " CPU: %dKB (%dKB)" % ((int(round(
new_max_running_max_memory_size[1] / 1024.)), int(round(
max_running_max_memory_size[1] / 1024.))))
print >> file, " GPU: %dKB (%dKB)" % ((int(round(
new_max_running_max_memory_size[2] / 1024.)), int(round(
max_running_max_memory_size[2] / 1024.))))
print >> file, "---"
if min_max_peak:
print >> file, " Minimum peak from all valid apply node order is %dKB(took %.3fs to compute)" % (int(round(
min_max_peak / 1024.)), min_peak_time)
......@@ -1052,8 +1108,10 @@ class ProfileStats(object):
(int(round(new_max_node_memory_saved_by_inplace / 1024.)),
int(round(max_node_memory_saved_by_inplace / 1024.)))
print >> file, " Memory saved if gc is enabled: %dKB (%dKB)" % (int(
round(new_max_node_memory_size - new_max_running_max_memory_size) / 1024.), int(
round(max_node_memory_size - max_running_max_memory_size) / 1024.))
round(new_max_node_memory_size[0] - new_max_running_max_memory_size[0]) / 1024.), int(
round(max_node_memory_size[0] - max_running_max_memory_size[0]) / 1024.))
print >> file, "---"
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
......
......@@ -8,6 +8,7 @@ import numpy
import theano
import theano.tensor as T
from theano.ifelse import ifelse
def test_profiling():
......@@ -20,11 +21,11 @@ def test_profiling():
theano.config.profile_memory = True
theano.config.profiling.min_peak_memory = True
x = [T.dvector("val%i" % i) for i in range(3)]
x = [T.fvector("val%i" % i) for i in range(3)]
z = []
z += [T.outer(x[i], x[i+1]).sum(axis=1) for i in range(len(x)-1)]
z += [x[i] + x[i+1] for i in range(len(x)-1)]
z += [T.outer(x[i], x[i + 1]).sum(axis=1) for i in range(len(x) - 1)]
z += [x[i] + x[i + 1] for i in range(len(x) - 1)]
p = theano.ProfileStats(False)
......@@ -36,7 +37,7 @@ def test_profiling():
f = theano.function(x, z, profile=p, name="test_profiling",
mode=m)
inp = [numpy.arange(1024) + 1 for i in range(len(x))]
inp = [numpy.arange(1024, dtype='float32') + 1 for i in range(len(x))]
output = f(*inp)
buf = StringIO.StringIO()
......@@ -46,8 +47,16 @@ def test_profiling():
the_string = buf.getvalue()
lines1 = [l for l in the_string.split("\n") if "Max if linker" in l]
lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
assert "Max if linker=cvm(default): 8224KB (16408KB)" in the_string, (lines1, lines2)
assert "Minimum peak from all valid apply node order is 8208KB" in the_string, (lines1, lines2)
if theano.config.device == 'cpu':
assert "Max if linker=cvm(default): 4112KB (8204KB)" in the_string, (
lines1, lines2)
assert "Minimum peak from all valid apply node order is 4104KB" in the_string, (
lines1, lines2)
else:
assert "Max if linker=cvm(default): 8220KB (8220KB)" in the_string, (
lines1, lines2)
assert "Minimum peak from all valid apply node order is 4116KB" in the_string, (
lines1, lines2)
finally:
theano.config.profile = config1
......@@ -55,5 +64,41 @@ def test_profiling():
theano.config.profiling.min_peak_memory = config3
def test_ifelse():
    """Profile a graph that contains an ``ifelse`` node.

    Regression check for the ifelse profiling crash: compiling and
    running the function with ``profile_memory`` enabled simply has to
    complete without raising.  Config flags are restored afterwards.
    """
    saved_profile = theano.config.profile
    saved_profile_memory = theano.config.profile_memory
    try:
        theano.config.profile = True
        theano.config.profile_memory = True

        a, b = T.scalars('a', 'b')
        x, y = T.scalars('x', 'y')
        z = ifelse(T.lt(a, b), x * 2, y * 2)
        p = theano.ProfileStats(False)

        # Profiling needs a real linker: fall back to FAST_RUN when the
        # configured mode cannot provide timing information.
        if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
            m = "FAST_RUN"
        else:
            m = None
        f_ifelse = theano.function([a, b, x, y], z, profile=p,
                                   name="test_ifelse", mode=m)

        # Exercise the compiled function once so the profiler records data.
        out = f_ifelse(0., 1., 10, 11)
    finally:
        theano.config.profile = saved_profile
        theano.config.profile_memory = saved_profile_memory
# Allow running this test module directly, outside a test runner.
if __name__ == '__main__':
    test_profiling()
    test_ifelse()
......@@ -56,6 +56,7 @@ raise_with_op = link.raise_with_op
class VM(object):
"""
A VM object's __call__ method evaluates a Theano program.
......@@ -83,6 +84,7 @@ class VM(object):
storage. False means it *must not* repeat that feedback.
"""
def __init__(self, nodes, thunks, pre_call_clear):
"""
Allocate a virtual machine.
......@@ -159,10 +161,12 @@ class VM(object):
class Loop(VM):
"""
Unconditional start-to-finish program execution in Python.
No garbage collection is allowed on intermediate results.
"""
def __call__(self):
if self.time_thunks:
for cont in self.pre_call_clear:
......@@ -188,10 +192,12 @@ class Loop(VM):
class LoopGC(VM):
"""
Unconditional start-to-finish program execution in Python.
Garbage collection is possible on intermediate results.
"""
def __init__(self, nodes, thunks, pre_call_clear, post_thunk_clear):
super(LoopGC, self).__init__(nodes, thunks, pre_call_clear)
self.post_thunk_clear = post_thunk_clear
......@@ -231,6 +237,7 @@ class LoopGC(VM):
class Stack(VM):
"""
Finish-to-start evalution order of thunks.
......@@ -340,7 +347,7 @@ class Stack(VM):
apply_stack = list(self.base_apply_stack)
last_apply_stack_len = -1
#This record all function inputs/shared varibles and constants
# This records all function inputs/shared variables and constants
for var, data in self.storage_map.iteritems():
if data[0] is None:
continue
......@@ -396,7 +403,7 @@ class Stack(VM):
current_idx = self.node_idx[current_apply]
self.call_counts[current_idx] += 1
self.call_times[current_idx] += dt
## Computing the memory footprint of the the op
# Computing the memory footprint of the op
# ?? What about inplace .. if the op is inplace
# you don't actually ask for more memory!
for (idx, o) in enumerate(
......@@ -436,15 +443,16 @@ class Stack(VM):
if all(compute_map[v][0]
for v in dependencies[i]):
storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i))
input_index.append(
current_apply.inputs.index(i))
#DO NOT set compute_map to 0
# DO NOT set compute_map to 0
#If values become False and the
# If values become False and the
#current_apply is still in the
#stack, this will cause it to be
#recomputed! This can cause wrong value
#with some combination of inplace op.
# stack, this will cause it to be
# recomputed! This can cause wrong value
# with some combination of inplace op.
compute_map[i][0] = 2
if (config.warn.vm_gc_bug and
current_apply in apply_stack and
......@@ -456,7 +464,8 @@ class Stack(VM):
" only in the development version between July 5th 2012"
" and July 30th 2012. This was not in a released version."
" The bug was affecting this script.",
#The stack level is not good when inside a Scan.
# The stack level is not good when
# inside a Scan.
stacklevel=3
)
self.node_cleared_order.append(input_index)
......@@ -468,7 +477,6 @@ class Stack(VM):
for inp in current_deps
if inp.owner)
elif not computed_outs:
#
# stack loop: Lazy Evaluation Case
......@@ -531,9 +539,10 @@ class Stack(VM):
break
if empty_storage_map:
storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i))
#See the not lazy gc code for explanations
#of compute_map change
input_index.append(
current_apply.inputs.index(i))
# See the not lazy gc code for explanations
# of compute_map change
compute_map[i][0] = 2
self.node_cleared_order.append(input_index)
......@@ -560,6 +569,7 @@ try:
import lazylinker_c
class CVM(lazylinker_c.CLazyLinker, VM):
def __init__(self, *args, **kwargs):
lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs)
# skip VM.__init__
......@@ -576,6 +586,7 @@ except (OSError, theano.gof.cmodule.MissingGXX), e:
class VM_Linker(link.LocalLinker):
"""
Class that satisfies the Linker interface by acting as a VM factory.
"""
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论