提交 8276edd7 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #3933 from nouiz/profile

profile_memory, print the expected peak memory usage if excluding inplace opt.
......@@ -737,24 +737,17 @@ class ProfileStats(object):
# Find the function that used the most of that statistic
max_sum_size = 0
# statistics with the old order
# TODO: Make list more flexible with mulitply GPUs later
max_node_memory_size = [0, 0, 0]
max_running_max_memory_size = [0, 0, 0]
max_node_memory_saved_by_view = 0
max_node_memory_saved_by_inplace = 0
# statistics with the new order
new_max_node_memory_size = [0, 0, 0]
new_max_running_max_memory_size = [0, 0, 0]
new_max_node_memory_saved_by_view = 0
new_max_node_memory_saved_by_inplace = 0
# statistics with the old and new order
stats = [[[0, 0, 0], [0, 0, 0], 0, 0], # old, with dmap
[[0, 0, 0], [0, 0, 0], 0, 0], # old, without dmap
[[0, 0, 0], [0, 0, 0], 0, 0], # new, with dmap
[[0, 0, 0], [0, 0, 0], 0, 0]] # new, without dmap
# track min peak memory usage
min_max_peak = 0
min_peak_time = 0
def count_running_memory(order, fgraph, nodes_mem):
def count_running_memory(order, fgraph, nodes_mem, ignore_dmap=False):
"""
Calculate memory with specific node order.
......@@ -804,7 +797,10 @@ class ProfileStats(object):
for var in node.outputs:
compute_map[var][0] = 1
idx = 0
dmap = getattr(node.op, 'destroy_map', None)
if ignore_dmap:
dmap = None
else:
dmap = getattr(node.op, 'destroy_map', None)
vmap = getattr(node.op, 'view_map', None)
val = nodes_mem[node]
......@@ -1095,60 +1091,48 @@ class ProfileStats(object):
# It mean that after executing the node,
# the corresponding variable can be gc.
old_running_memory = count_running_memory(order, fgraph, nodes_mem)
# Store the max of some stats by any function in this profile.
max_sum_size = max(max_sum_size, sum_size)
def compute_max_stats(running_memory, stats):
(max_node_memory_size,
max_running_max_memory_size,
max_node_memory_saved_by_view,
max_node_memory_saved_by_inplace) = stats
max_node_memory_size[0] = max(max_node_memory_size[0],
sum(running_memory[0]))
max_running_max_memory_size[0] = \
max(max_running_max_memory_size[0], sum(running_memory[2]))
# Separate CPU and GPU
max_node_memory_size[1] = max(max_node_memory_size[1],
running_memory[0][0])
max_node_memory_size[2] = max(max_node_memory_size[2],
running_memory[0][1])
max_running_max_memory_size[1] = \
max(max_running_max_memory_size[1], running_memory[2][0])
max_running_max_memory_size[2] = \
max(max_running_max_memory_size[2], running_memory[2][1])
max_node_memory_saved_by_inplace = \
max(max_node_memory_saved_by_inplace, running_memory[3])
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
running_memory[4])
return (max_node_memory_size,
max_running_max_memory_size,
max_node_memory_saved_by_view,
max_node_memory_saved_by_inplace)
new_order = fgraph.profile.node_executed_order
# A list of new executed node order
for i, (ord, ignore_dmap) in enumerate([(order, False),
(order, True),
(new_order, False),
(new_order, True)]):
running_memory = count_running_memory(
ord, fgraph, nodes_mem, ignore_dmap=ignore_dmap)
new_running_memory = count_running_memory(new_order,
fgraph, nodes_mem)
# Store the max of some stats by any function in this profile.
max_sum_size = max(max_sum_size, sum_size)
max_node_memory_size[0] = max(max_node_memory_size[0],
sum(old_running_memory[0]))
max_running_max_memory_size[0] = \
max(max_running_max_memory_size[0], sum(old_running_memory[2]))
# Separate CPU and GPU
max_node_memory_size[1] = max(max_node_memory_size[1],
old_running_memory[0][0])
max_node_memory_size[2] = max(max_node_memory_size[2],
old_running_memory[0][1])
max_running_max_memory_size[1] = \
max(max_running_max_memory_size[1], old_running_memory[2][0])
max_running_max_memory_size[2] = \
max(max_running_max_memory_size[2], old_running_memory[2][1])
max_node_memory_saved_by_inplace = \
max(max_node_memory_saved_by_inplace, old_running_memory[3])
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
old_running_memory[4])
# Store max of some stats with new order
new_max_node_memory_size[0] = max(new_max_node_memory_size[0],
sum(new_running_memory[0]))
new_max_running_max_memory_size[0] = \
max(new_max_running_max_memory_size[0],
sum(new_running_memory[2]))
# Separate CPU and GPU
new_max_node_memory_size[1] = max(new_max_node_memory_size[1],
new_running_memory[0][0])
new_max_node_memory_size[2] = max(new_max_node_memory_size[2],
new_running_memory[0][1])
new_max_running_max_memory_size[1] = \
max(new_max_running_max_memory_size[1],
new_running_memory[2][0])
new_max_running_max_memory_size[2] = \
max(new_max_running_max_memory_size[2],
new_running_memory[2][1])
new_max_node_memory_saved_by_inplace = \
max(new_max_node_memory_saved_by_inplace,
new_running_memory[3])
new_max_node_memory_saved_by_view = \
max(new_max_node_memory_saved_by_view, new_running_memory[4])
stats[i] = compute_max_stats(running_memory, stats[i])
# Config: whether print min memory peak
if config.profiling.min_peak_memory:
......@@ -1169,51 +1153,43 @@ class ProfileStats(object):
print("(Sparse variables are ignored)", file=file)
print("(For values in brackets, it's for linker = c|py", file=file)
print("---", file=file)
# print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024))
print(" Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round(
new_max_node_memory_size[0] / 1024.)), int(round(
max_node_memory_size[0] / 1024.))), file=file)
print(" CPU: %dKB (%dKB)" % ((int(round(
new_max_node_memory_size[1] / 1024.)), int(round(
max_node_memory_size[1] / 1024.)))), file=file)
print(" GPU: %dKB (%dKB)" % ((int(round(
new_max_node_memory_size[2] / 1024.)), int(round(
max_node_memory_size[2] / 1024.)))), file=file)
print("---", file=file)
def print_stats(stats1, stats2):
(_, max_running_max_memory_size, _, _) = stats1
(_, new_max_running_max_memory_size, _, _) = stats2
print(" Max if linker=cvm(default): %dKB (%dKB)" % (int(round(
new_max_running_max_memory_size[0] / 1024.)), int(round(
max_running_max_memory_size[0] / 1024.))), file=file)
print(" CPU: %dKB (%dKB)" % ((int(round(
new_max_running_max_memory_size[1] / 1024.)), int(round(
max_running_max_memory_size[1] / 1024.)))), file=file)
print(" GPU: %dKB (%dKB)" % ((int(round(
new_max_running_max_memory_size[2] / 1024.)), int(round(
max_running_max_memory_size[2] / 1024.)))), file=file)
print(" CPU: %dKB (%dKB)" % ((int(round(
new_max_running_max_memory_size[1] / 1024.)), int(round(
max_running_max_memory_size[1] / 1024.)))), file=file)
print(" GPU: %dKB (%dKB)" % ((int(round(
new_max_running_max_memory_size[2] / 1024.)), int(round(
max_running_max_memory_size[2] / 1024.)))), file=file)
print(" CPU + GPU: %dKB (%dKB)" % (int(round(
new_max_running_max_memory_size[0] / 1024.)), int(round(
max_running_max_memory_size[0] / 1024.))), file=file)
print("---", file=file)
print(" Max peak memory with current setting", file=file)
print_stats(stats[0], stats[2])
print(" Max peak memory with current setting and Theano flag optimizer_excluding=inplace", file=file)
print_stats(stats[1], stats[3])
(max_node_memory_size, _, _, _) = stats[0]
(new_max_node_memory_size, _, _, _) = stats[2]
print(" Max peak memory if allow_gc=False (linker don't make a difference)", file=file)
print(" CPU: %dKB" % int(round(
new_max_node_memory_size[1] / 1024.)), file=file)
print(" GPU: %dKB" % int(round(
new_max_node_memory_size[2] / 1024.)), file=file)
print(" CPU + GPU: %dKB" % int(round(
new_max_node_memory_size[0] / 1024.)), file=file)
print("---", file=file)
if min_max_peak:
print(" Minimum peak from all valid apply node order is "
"%dKB(took %.3fs to compute)" %
(int(round(min_max_peak / 1024.)), min_peak_time), file=file)
print(" Memory saved if views are used: %dKB (%dKB)" %
(int(round(new_max_node_memory_saved_by_view / 1024.)),
int(round(max_node_memory_saved_by_view / 1024.))), file=file)
print(" Memory saved if inplace ops are used: %dKB (%dKB)" %
(int(round(new_max_node_memory_saved_by_inplace / 1024.)),
int(round(max_node_memory_saved_by_inplace / 1024.))),
file=file)
print(" Memory saved if gc is enabled: %dKB (%dKB)" %
(int(round(new_max_node_memory_size[0] -
new_max_running_max_memory_size[0]) / 1024.),
int(round(max_node_memory_size[0] -
max_running_max_memory_size[0]) / 1024.)), file=file)
print("---", file=file)
print("---", file=file)
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
......
......@@ -54,13 +54,16 @@ class Test_profiling(unittest.TestCase):
lines1 = [l for l in the_string.split("\n") if "Max if linker" in l]
lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
if theano.config.device == 'cpu':
assert "Max if linker=cvm(default): 4112KB (8204KB)" in the_string, (
lines1, lines2)
assert "CPU: 4112KB (8204KB)" in the_string, (lines1, lines2)
assert "CPU: 8204KB (12296KB)" in the_string, (lines1, lines2)
assert "CPU: 8208KB" in the_string, (lines1, lines2)
assert "Minimum peak from all valid apply node order is 4104KB" in the_string, (
lines1, lines2)
else:
assert "Max if linker=cvm(default): 8220KB (8220KB)" in the_string, (
lines1, lines2)
assert "CPU: 16KB (16KB)" in the_string, (lines1, lines2)
assert "GPU: 8204KB (8204KB)" in the_string, (lines1, lines2)
assert "GPU: 12300KB (12300KB)" in the_string, (lines1, lines2)
assert "GPU: 8212KB" in the_string, (lines1, lines2)
assert "Minimum peak from all valid apply node order is 4116KB" in the_string, (
lines1, lines2)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论