提交 34824493 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1854 from RoyXue/GSoC2014_part1

Add a list store executed node order
...@@ -628,38 +628,38 @@ class ProfileStats(object): ...@@ -628,38 +628,38 @@ class ProfileStats(object):
max_running_max_memory_size = 0 max_running_max_memory_size = 0
max_node_memory_saved_by_view = 0 max_node_memory_saved_by_view = 0
max_node_memory_saved_by_inplace = 0 max_node_memory_saved_by_inplace = 0
for fgraph, nodes_mem in fct_memory.iteritems():
# Sum of the size of all variables in bytes # statistic with the new order
sum_size = sum([sum([v for v in val if not isinstance(v, str)]) new_max_node_memory_size = 0
for key, val in nodes_mem.iteritems()]) new_max_running_max_memory_size = 0
# Sum of the size of all variables that actually allocate new_max_node_memory_saved_by_view = 0
# memory (excluding views, and inplace); new_max_node_memory_saved_by_inplace = 0
def count_running_memory(order, thunk_old_storage, nodes_mem):
"""
Calculate memory with specific node order
Return a list including the following values
1. node_memory_size
Sum of the size of all variables that actually allocate
memory (excluding views, and inplace);
2. running_memory_size
The memory allocated after the current apply node
3. running_max_memory_size
The maximum of running_memory_size during the function
4. node_memory_saved_by_view
The sum of memory saved by returning view instead of new
allocation
5. node_memory_saved_by_inplace
The sum of memory saved by reusing the input instead of
new allocation
"""
node_memory_size = 0 node_memory_size = 0
# The sum of memory saved by returning view instead of new
# allocation
node_memory_saved_by_view = 0
# The sum of memory saved by reusing the input instead of
# new allocation
node_memory_saved_by_inplace = 0
# The memory allocated after the current apply node
running_memory_size = 0 running_memory_size = 0
# The maximum of running_memory_size during the function
running_max_memory_size = 0 running_max_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
order = fgraph.toposort()
# A list of intermediate variable that are not need
# after the execution of the corresponding node.
# It mean that after executing the node,
# the corresponding variable can be gc.
post_thunk_old_storage = []
computed, last_user = theano.gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([
input_idx
for input_idx, input in enumerate(node.inputs)
if (input in computed) and
(input not in fgraph.outputs) and
node == last_user[input]])
for node in order: for node in order:
val = nodes_mem[node] val = nodes_mem[node]
dmap = getattr(node.op, 'destroy_map', None) dmap = getattr(node.op, 'destroy_map', None)
...@@ -677,21 +677,61 @@ class ProfileStats(object): ...@@ -677,21 +677,61 @@ class ProfileStats(object):
running_memory_size += v running_memory_size += v
if running_memory_size > running_max_memory_size: if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)] old_storage = thunk_old_storage[order.index(node)]
for old_s in old_storage: for old_s in old_storage:
old_v = var_mem[node.inputs[old_s]] old_v = var_mem[node.inputs[old_s]]
if not isinstance(old_v, str): if not isinstance(old_v, str):
running_memory_size -= old_v running_memory_size -= old_v
return [node_memory_size, running_memory_size, running_max_memory_size, node_memory_saved_by_inplace, node_memory_saved_by_view]
for fgraph, nodes_mem in fct_memory.iteritems():
# Sum of the size of all variables in bytes
sum_size = sum([sum([v for v in val if not isinstance(v, str)])
for key, val in nodes_mem.iteritems()])
order = fgraph.toposort()
# A list of intermediate variable that are not need
# after the execution of the corresponding node.
# It mean that after executing the node,
# the corresponding variable can be gc.
post_thunk_old_storage = []
computed, last_user = theano.gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([
input_idx
for input_idx, input in enumerate(node.inputs)
if (input in computed) and
(input not in fgraph.outputs) and
node == last_user[input]])
old_running_memory = count_running_memory(order, post_thunk_old_storage, nodes_mem)
new_order = fgraph.profile.node_executed_order
# A list of new executed node order
new_storage = fgraph.profile.node_cleared_order
# A list of variables that get freed
new_running_memory = count_running_memory(new_order, new_storage, nodes_mem)
# Store the max of some stats by any function in this profile. # Store the max of some stats by any function in this profile.
max_sum_size = max(max_sum_size, sum_size) max_sum_size = max(max_sum_size, sum_size)
max_node_memory_size = max(max_node_memory_size, node_memory_size) max_node_memory_size = max(max_node_memory_size, old_running_memory[0])
max_running_max_memory_size = max(max_running_max_memory_size, max_running_max_memory_size = max(max_running_max_memory_size,
running_max_memory_size) old_running_memory[2])
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view, max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
node_memory_saved_by_view) old_running_memory[4])
max_node_memory_saved_by_inplace = max( max_node_memory_saved_by_inplace = max(
max_node_memory_saved_by_inplace, node_memory_saved_by_inplace) max_node_memory_saved_by_inplace, old_running_memory[3])
# Store max of some stats with new order
new_max_node_memory_size = max(new_max_node_memory_size, new_running_memory[0])
new_max_running_max_memory_size = max(new_max_running_max_memory_size,
new_running_memory[2])
new_max_node_memory_saved_by_view = max(new_max_node_memory_saved_by_view,
new_running_memory[4])
new_max_node_memory_saved_by_inplace = max(
new_max_node_memory_saved_by_inplace, new_running_memory[3])
del fgraph, nodes_mem, post_thunk_old_storage, node del fgraph, nodes_mem, post_thunk_old_storage, node
...@@ -702,21 +742,27 @@ class ProfileStats(object): ...@@ -702,21 +742,27 @@ class ProfileStats(object):
print >> file, "Memory Profile" print >> file, "Memory Profile"
print >> file, "(Sparse variables are ignored)" print >> file, "(Sparse variables are ignored)"
print >> file, "(For values in brackets, it's for linker = c|py"
print >> file, "---" print >> file, "---"
# print >> file, " Max if no gc, inplace and view: %dKB" % int( # print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024)) # round(max_sum_size / 1024))
print >> file, " Max if linker=cvm (default): unknown"
print >> file, " Max if no gc (allow_gc=False): %dKB" % int(round( print >> file, " Max if no gc (allow_gc=False): %dKB (%dKB)" % (int(round(
max_node_memory_size / 1024.)) new_max_node_memory_size / 1024.)), int(round(
print >> file, " Max if linker=c|py: %dKB" % int(round( max_node_memory_size / 1024.)))
max_running_max_memory_size / 1024.)) print >> file, " Max if linker=cvm(default): %dKB (%dKB)" % (int(round(
# print >> file, " Memory saved if views are used: %dKB" % int( new_max_running_max_memory_size / 1024.)), int(round(
# round(max_node_memory_saved_by_view / 1024.)) max_running_max_memory_size / 1024.)))
# print >> file, " Memory saved if inplace ops are used: %dKB" % \ print >> file, " Memory saved if views are used: %dKB (%dKB)" % (int(
# int(round(max_node_memory_saved_by_inplace / 1024.)) round(new_max_node_memory_saved_by_view / 1024.)), int(
print >> file, " Memory saved if gc is enabled (linker=c|py): %dKB" % int( round(max_node_memory_saved_by_view / 1024.)))
round(max_node_memory_size - max_running_max_memory_size) / 1024.) print >> file, " Memory saved if inplace ops are used: %dKB (%dKB)" % \
(int(round(new_max_node_memory_saved_by_inplace / 1024.)), int(round(max_node_memory_saved_by_inplace / 1024.)))
print >> file, " Memory saved if gc is enabled: %dKB (%dKB)" % (int(
round(new_max_node_memory_size - new_max_running_max_memory_size) / 1024.), int(
round(max_node_memory_size - max_running_max_memory_size) / 1024.))
if (hasattr(theano, 'sandbox') and if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and hasattr(theano.sandbox, 'cuda') and
hasattr(theano.sandbox.cuda, 'cuda_ndarray') and hasattr(theano.sandbox.cuda, 'cuda_ndarray') and
......
"""
Test of memory profiling
"""
import theano
import theano.tensor as T
import StringIO
def test_profiling():
    """Smoke-test Theano's memory profiler.

    Enables the ``profile`` and ``profile_memory`` config flags, compiles
    and runs a tiny function, and writes the profile summary into an
    in-memory buffer.  The original config flags are restored even if
    profiling raises, so other tests are not affected.
    """
    old_profile = theano.config.profile
    old_profile_memory = theano.config.profile_memory
    theano.config.profile = True
    theano.config.profile_memory = True
    try:
        x = T.dvector("x")
        y = T.dvector("y")
        z = x + y
        f = theano.function([x, y], z, profile=True, name="test_profiling")
        # Run the function once so there is execution data to summarize.
        f([1, 2, 3, 4], [1, 1, 1, 1])
        buf = StringIO.StringIO()
        f.profile.summary(buf)
    finally:
        # Restore the global config regardless of success or failure.
        theano.config.profile = old_profile
        theano.config.profile_memory = old_profile_memory


if __name__ == '__main__':
    test_profiling()
\ No newline at end of file
...@@ -141,6 +141,12 @@ class VM(object): ...@@ -141,6 +141,12 @@ class VM(object):
profile.variable_shape = self.variable_shape.copy() profile.variable_shape = self.variable_shape.copy()
profile.variable_strides = self.variable_strides.copy() profile.variable_strides = self.variable_strides.copy()
if hasattr(self, 'node_executed_order'):
profile.node_executed_order = self.node_executed_order[:]
if hasattr(self, 'node_cleared_order'):
profile.node_cleared_order = self.node_cleared_order[:]
# clear the timer info out of the buffers # clear the timer info out of the buffers
for i in xrange(len(self.call_times)): for i in xrange(len(self.call_times)):
self.call_times[i] = 0.0 self.call_times[i] = 0.0
...@@ -298,6 +304,7 @@ class Stack(VM): ...@@ -298,6 +304,7 @@ class Stack(VM):
idx = self.node_idx[node] idx = self.node_idx[node]
t0 = time.time() t0 = time.time()
rval = self.thunks[idx]() rval = self.thunks[idx]()
self.node_executed_order.append(node)
# Some thunks on some computers run faster than the granularity # Some thunks on some computers run faster than the granularity
# of the time.time clock. # of the time.time clock.
...@@ -318,6 +325,9 @@ class Stack(VM): ...@@ -318,6 +325,9 @@ class Stack(VM):
compute_map = self.compute_map compute_map = self.compute_map
thunks = self.thunks thunks = self.thunks
dependencies = self.dependencies dependencies = self.dependencies
self.node_executed_order = []
self.node_cleared_order = []
for k in self.storage_map: for k in self.storage_map:
compute_map[k][0] = (k.owner is None) compute_map[k][0] = (k.owner is None)
...@@ -404,6 +414,10 @@ class Stack(VM): ...@@ -404,6 +414,10 @@ class Stack(VM):
self.thunks[self.node_idx[current_apply]]) self.thunks[self.node_idx[current_apply]])
for o in current_apply.outputs: for o in current_apply.outputs:
compute_map[o][0] = 1 compute_map[o][0] = 1
input_index = []
# A list store the index of inputs variables
if self.allow_gc: if self.allow_gc:
for i in current_apply.inputs: for i in current_apply.inputs:
# Garbage Collection -> check if anybody else uses # Garbage Collection -> check if anybody else uses
...@@ -414,6 +428,8 @@ class Stack(VM): ...@@ -414,6 +428,8 @@ class Stack(VM):
if all(compute_map[v][0] if all(compute_map[v][0]
for v in dependencies[i]): for v in dependencies[i]):
storage_map[i][0] = None storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i))
#DO NOT set compute_map to 0 #DO NOT set compute_map to 0
#If values become False and the #If values become False and the
...@@ -435,6 +451,8 @@ class Stack(VM): ...@@ -435,6 +451,8 @@ class Stack(VM):
#The stack level is not good when inside a Scan. #The stack level is not good when inside a Scan.
stacklevel=3 stacklevel=3
) )
self.node_cleared_order.append(input_index)
elif not computed_ins: elif not computed_ins:
# -- Non-lazy case, need inputs # -- Non-lazy case, need inputs
apply_stack.append(current_apply) apply_stack.append(current_apply)
...@@ -442,6 +460,7 @@ class Stack(VM): ...@@ -442,6 +460,7 @@ class Stack(VM):
for inp in current_deps for inp in current_deps
if inp.owner) if inp.owner)
elif not computed_outs: elif not computed_outs:
# #
# stack loop: Lazy Evaluation Case # stack loop: Lazy Evaluation Case
...@@ -488,6 +507,8 @@ class Stack(VM): ...@@ -488,6 +507,8 @@ class Stack(VM):
st = 'c' st = 'c'
self.variable_strides[var] = st self.variable_strides[var] = st
input_index = []
if self.allow_gc: if self.allow_gc:
for i in current_apply.inputs: for i in current_apply.inputs:
if (dependencies[i] and i.owner and if (dependencies[i] and i.owner and
...@@ -499,17 +520,29 @@ class Stack(VM): ...@@ -499,17 +520,29 @@ class Stack(VM):
break break
if empty_storage_map: if empty_storage_map:
storage_map[i][0] = None storage_map[i][0] = None
input_index.append(current_apply.inputs.index(i))
#See the not lazy gc code for explanations #See the not lazy gc code for explanations
#of compute_map change #of compute_map change
compute_map[i][0] = 2 compute_map[i][0] = 2
self.node_cleared_order.append(input_index)
# Hacky coarse gc final pass # Hacky coarse gc final pass
# This is required until we have a proper gc algorithm for graphs with # This is required until we have a proper gc algorithm for graphs with
# lazy evaluation. See discussion on theano-dev June 19 2012. # lazy evaluation. See discussion on theano-dev June 19 2012.
final_index = []
if self.allow_gc: if self.allow_gc:
for v in storage_map: for v in storage_map:
if v.owner and not v in self.outputs: if v.owner and not v in self.outputs:
if compute_map[v][0] == 2:
continue
else:
storage_map[v][0] = None storage_map[v][0] = None
final_index.append(v)
compute_map[v][0] = 2
self.node_cleared_order.append(final_index)
try: try:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论