Commit 34824493 authored by Frédéric Bastien

Merge pull request #1854 from RoyXue/GSoC2014_part1

Add a list that stores the executed node order
......@@ -628,38 +628,38 @@ class ProfileStats(object):
max_running_max_memory_size = 0
max_node_memory_saved_by_view = 0
max_node_memory_saved_by_inplace = 0
for fgraph, nodes_mem in fct_memory.iteritems():
# Sum of the size of all variables in bytes
sum_size = sum([sum([v for v in val if not isinstance(v, str)])
for key, val in nodes_mem.iteritems()])
# Sum of the size of all variables that actually allocate
# memory (excluding views, and inplace);
# statistic with the new order
new_max_node_memory_size = 0
new_max_running_max_memory_size = 0
new_max_node_memory_saved_by_view = 0
new_max_node_memory_saved_by_inplace = 0
def count_running_memory(order, thunk_old_storage, nodes_mem):
"""
Calculate memory with specific node order
Return a list including the following values
1. node_memory_size
Sum of the size of all variables that actually allocate
memory (excluding views, and inplace);
2. running_memory_size
The memory allocated after the current apply node
3. running_max_memory_size
The maximum of running_memory_size during the function
4. node_memory_saved_by_view
The sum of memory saved by returning view instead of new
allocation
5. node_memory_saved_by_inplace
The sum of memory saved by reusing the input instead of
new allocation
"""
node_memory_size = 0
# The sum of memory saved by returning view instead of new
# allocation
node_memory_saved_by_view = 0
# The sum of memory saved by reusing the input instead of
# new allocation
node_memory_saved_by_inplace = 0
# The memory allocated after the current apply node
running_memory_size = 0
# The maximum of running_memory_size during the function
running_max_memory_size = 0
order = fgraph.toposort()
# A list of intermediate variable that are not need
# after the execution of the corresponding node.
# It mean that after executing the node,
# the corresponding variable can be gc.
post_thunk_old_storage = []
computed, last_user = theano.gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([
input_idx
for input_idx, input in enumerate(node.inputs)
if (input in computed) and
(input not in fgraph.outputs) and
node == last_user[input]])
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
for node in order:
val = nodes_mem[node]
dmap = getattr(node.op, 'destroy_map', None)
......@@ -677,21 +677,61 @@ class ProfileStats(object):
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
old_storage = thunk_old_storage[order.index(node)]
for old_s in old_storage:
old_v = var_mem[node.inputs[old_s]]
if not isinstance(old_v, str):
running_memory_size -= old_v
return [node_memory_size, running_memory_size, running_max_memory_size, node_memory_saved_by_inplace, node_memory_saved_by_view]
for fgraph, nodes_mem in fct_memory.iteritems():
# Sum of the size of all variables in bytes
sum_size = sum([sum([v for v in val if not isinstance(v, str)])
for key, val in nodes_mem.iteritems()])
order = fgraph.toposort()
# A list of intermediate variable that are not need
# after the execution of the corresponding node.
# It mean that after executing the node,
# the corresponding variable can be gc.
post_thunk_old_storage = []
computed, last_user = theano.gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([
input_idx
for input_idx, input in enumerate(node.inputs)
if (input in computed) and
(input not in fgraph.outputs) and
node == last_user[input]])
old_running_memory = count_running_memory(order, post_thunk_old_storage, nodes_mem)
new_order = fgraph.profile.node_executed_order
# A list of new executed node order
new_storage = fgraph.profile.node_cleared_order
# A list of variables that get freed
new_running_memory = count_running_memory(new_order, new_storage, nodes_mem)
# Store the max of some stats by any function in this profile.
max_sum_size = max(max_sum_size, sum_size)
max_node_memory_size = max(max_node_memory_size, node_memory_size)
max_node_memory_size = max(max_node_memory_size, old_running_memory[0])
max_running_max_memory_size = max(max_running_max_memory_size,
running_max_memory_size)
old_running_memory[2])
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
node_memory_saved_by_view)
old_running_memory[4])
max_node_memory_saved_by_inplace = max(
max_node_memory_saved_by_inplace, node_memory_saved_by_inplace)
max_node_memory_saved_by_inplace, old_running_memory[3])
# Store max of some stats with new order
new_max_node_memory_size = max(new_max_node_memory_size, new_running_memory[0])
new_max_running_max_memory_size = max(new_max_running_max_memory_size,
new_running_memory[2])
new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
new_running_memory[4])
new_max_node_memory_saved_by_inplace = max(
new_max_node_memory_saved_by_inplace, new_running_memory[3])
del fgraph, nodes_mem, post_thunk_old_storage, node
......@@ -702,21 +742,27 @@ class ProfileStats(object):
print >> file, "Memory Profile"
print >> file, "(Sparse variables are ignored)"
print >> file, "(For values in brackets, it's for linker = c|py"
print >> file, "---"
# print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024))
print >> file, " Max if linker=cvm (default): unknown"
print >> file, " Max if no gc (allow_gc=False): %dKB" % int(round(
max_node_memory_size / 1024.))
print >> file, " Max if linker=c|py: %dKB" % int(round(
max_running_max_memory_size / 1024.))
# print >> file, " Memory saved if views are used: %dKB" % int(
# round(max_node_memory_saved_by_view / 1024.))
# print >> file, " Memory saved if inplace ops are used: %dKB" % \
# int(round(max_node_memory_saved_by_inplace / 1024.))
print >> file, " Memory saved if gc is enabled (linker=c|py): %dKB" % int(
round(max_node_memory_size - max_running_max_memory_size) / 1024.)
print >> file, " Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round(
new_max_node_memory_size / 1024.)), int(round(
max_node_memory_size / 1024.)))
print >> file, " Max if linker=cvm(default): %dKB (%dKB)" % (int(round(
new_max_running_max_memory_size / 1024.)), int(round(
max_running_max_memory_size / 1024.)))
print >> file, " Memory saved if views are used: %dKB (%dKB)" % (int(
round(new_max_node_memory_saved_by_view / 1024.)), int(
round(max_node_memory_saved_by_view / 1024.)))
print >> file, " Memory saved if inplace ops are used: %dKB (%dKB)" % \
(int(round(new_max_node_memory_saved_by_inplace / 1024.)), int(round(max_node_memory_saved_by_inplace / 1024.)))
print >> file, " Memory saved if gc is enabled: %dKB (%dKB)" % (int(
round(new_max_node_memory_size - new_max_running_max_memory_size) / 1024.), int(
round(max_node_memory_size - max_running_max_memory_size) / 1024.))
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
hasattr(theano.sandbox.cuda, 'cuda_ndarray') and
......
"""
Test of memory profiling
"""
import theano
import theano.tensor as T
import StringIO
def test_profiling():
    """Smoke-test Theano memory profiling.

    Enables the ``profile`` and ``profile_memory`` config flags, compiles
    and runs a tiny function with profiling turned on, and renders the
    profile summary into an in-memory buffer (so nothing is printed).

    The original config flag values are restored afterwards, even if the
    compilation or the summary rendering raises, so other tests are not
    affected by leaked global state.
    """
    old1 = theano.config.profile
    old2 = theano.config.profile_memory
    theano.config.profile = True
    theano.config.profile_memory = True
    try:
        x = T.dvector("x")
        y = T.dvector("y")
        z = x + y
        f = theano.function([x, y], z, profile=True, name="test_profiling")
        output = f([1, 2, 3, 4], [1, 1, 1, 1])
        # Render the summary into a buffer instead of stdout; we only
        # check that it runs without error, not its contents.
        buf = StringIO.StringIO()
        f.profile.summary(buf)
    finally:
        # Always restore the global config flags (bug fix: the original
        # skipped this restore when any statement above raised).
        theano.config.profile = old1
        theano.config.profile_memory = old2


if __name__ == '__main__':
    test_profiling()
\ No newline at end of file
......@@ -141,6 +141,12 @@ class VM(object):
profile.variable_shape = self.variable_shape.copy()
profile.variable_strides = self.variable_strides.copy()
if hasattr(self, 'node_executed_order'):
profile.node_executed_order = self.node_executed_order[:]
if hasattr(self, 'node_cleared_order'):
profile.node_cleared_order = self.node_cleared_order[:]
# clear the timer info out of the buffers
for i in xrange(len(self.call_times)):
self.call_times[i] = 0.0
......@@ -298,7 +304,8 @@ class Stack(VM):
idx = self.node_idx[node]
t0 = time.time()
rval = self.thunks[idx]()
self.node_executed_order.append(node)
# Some thunks on some computers run faster than the granularity
# of the time.time clock.
# Profile output looks buggy if a node has run but takes 0 time.
......@@ -318,6 +325,9 @@ class Stack(VM):
compute_map = self.compute_map
thunks = self.thunks
dependencies = self.dependencies
self.node_executed_order = []
self.node_cleared_order = []
for k in self.storage_map:
compute_map[k][0] = (k.owner is None)
......@@ -404,6 +414,10 @@ class Stack(VM):
self.thunks[self.node_idx[current_apply]])
for o in current_apply.outputs:
compute_map[o][0] = 1
input_index = []
# A list store the index of inputs variables
if self.allow_gc:
for i in current_apply.inputs:
# Garbage Collection -> check if anybody else uses
......@@ -414,6 +428,8 @@ class Stack(VM):
if all(compute_map[v][0]
for v in dependencies[i]):
storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i))
#DO NOT set compute_map to 0
#If values become False and the
......@@ -435,6 +451,8 @@ class Stack(VM):
#The stack level is not good when inside a Scan.
stacklevel=3
)
self.node_cleared_order.append(input_index)
elif not computed_ins:
# -- Non-lazy case, need inputs
apply_stack.append(current_apply)
......@@ -442,6 +460,7 @@ class Stack(VM):
for inp in current_deps
if inp.owner)
elif not computed_outs:
#
# stack loop: Lazy Evaluation Case
......@@ -488,6 +507,8 @@ class Stack(VM):
st = 'c'
self.variable_strides[var] = st
input_index = []
if self.allow_gc:
for i in current_apply.inputs:
if (dependencies[i] and i.owner and
......@@ -499,17 +520,29 @@ class Stack(VM):
break
if empty_storage_map:
storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i))
#See the not lazy gc code for explanations
#of compute_map change
compute_map[i][0] = 2
self.node_cleared_order.append(input_index)
# Hacky coarse gc final pass
# This is required until we have a proper gc algorithm for graphs with
# lazy evaluation. See discussion on theano-dev June 19 2012.
final_index = []
if self.allow_gc:
for v in storage_map:
if v.owner and not v in self.outputs:
storage_map[v][0] = None
if compute_map[v][0] == 2:
continue
else:
storage_map[v][0] = None
final_index.append(v)
compute_map[v][0] = 2
self.node_cleared_order.append(final_index)
try:
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment