提交 c800e27c authored 作者: Frederic Bastien's avatar Frederic Bastien

Make the ProfileMode now profile the memory at the same time.

上级 ee0cd0a3
...@@ -37,8 +37,9 @@ class ProfileMode(Mode): ...@@ -37,8 +37,9 @@ class ProfileMode(Mode):
fct_call_time = {}#time passed inside theano fct call including op time. fct_call_time = {}#time passed inside theano fct call including op time.
fct_call = {} fct_call = {}
message="" message=""
outputs_size={}
self.__setstate__((linker, optimizer, apply_time, op_cimpl, self.__setstate__((linker, optimizer, apply_time, op_cimpl,
compile_time, fct_call_time, fct_call, message)) compile_time, fct_call_time, fct_call, message, outputs_size))
def function_maker(self, i,o,m, *args, **kwargs): def function_maker(self, i,o,m, *args, **kwargs):
"""Return an instance of `Profiler_Maker` which init the count""" """Return an instance of `Profiler_Maker` which init the count"""
...@@ -51,10 +52,11 @@ class ProfileMode(Mode): ...@@ -51,10 +52,11 @@ class ProfileMode(Mode):
def __getstate__(self): def __getstate__(self):
#print "__getstate__",self.provided_linker,self.provided_optimizer #print "__getstate__",self.provided_linker,self.provided_optimizer
return (self.provided_linker, self.provided_optimizer, self.apply_time, return (self.provided_linker, self.provided_optimizer, self.apply_time,
self.op_cimpl, self.compile_time, self.fct_call_time, self.fct_call, self.message) self.op_cimpl, self.compile_time, self.fct_call_time,
self.fct_call, self.message, self.outputs_size)
def __setstate__(self, (linker, optimizer, apply_time, op_cimpl, def __setstate__(self, (linker, optimizer, apply_time, op_cimpl,
compile_time, fct_call_time, fct_call, message)): compile_time, fct_call_time, fct_call, message, outputs_size)):
self.apply_time = apply_time self.apply_time = apply_time
self.op_cimpl = op_cimpl self.op_cimpl = op_cimpl
...@@ -64,8 +66,11 @@ class ProfileMode(Mode): ...@@ -64,8 +66,11 @@ class ProfileMode(Mode):
self.call_time = 0 self.call_time = 0
self.fn_time = 0 self.fn_time = 0
self.message = "" self.message = ""
self.outputs_size = outputs_size
def profile_thunk(i, node, th): def profile_thunk(i, node, th):
""" Profile only the execution time
"""
if hasattr(th, 'cthunk'): if hasattr(th, 'cthunk'):
t0 = time.time() t0 = time.time()
failure = run_cthunk(th.cthunk) failure = run_cthunk(th.cthunk)
...@@ -82,12 +87,50 @@ class ProfileMode(Mode): ...@@ -82,12 +87,50 @@ class ProfileMode(Mode):
apply_time[(i,node)] += dt apply_time[(i,node)] += dt
def profile_thunk2(i, node, th):
""" Profile the execution time and the memory size.
"""
if hasattr(th, 'cthunk'):
t0 = time.time()
failure = run_cthunk(th.cthunk)
dt = time.time() - t0
if failure:
raise RuntimeError(('A C Op raised an exception. PROFILE_MODE cannot'
' tell you what it was though. Use a standard mode such as'
' FAST_RUN_NOGC to correct the problem.'))
else:
t0 = time.time()
th()
dt = time.time() - t0
size=[]
for o in th.outputs:
s=o[0].size
#can't use o[0].dtype.itemsize as dtype is a str for CudaNdarray
dtype = str(o[0].dtype)
dtype2=dtype[-2:]
if dtype2 == '32':
s *= 4
elif dtype2 == '64':
s *= 8
elif dtype2 == '16':
s *= 2
elif dtype[-1] == '8':
s *= 1
elif dtype[-3:] == '128':
s *= 16
else:
raise Exception("Can't determine the memory size of dtype",o[0].dtype)
size.append(s)
outputs_size[node]=size
apply_time[(i,node)] += dt
self.provided_linker = linker self.provided_linker = linker
self.provided_optimizer = optimizer self.provided_optimizer = optimizer
if isinstance(linker, str) or linker is None: if isinstance(linker, str) or linker is None:
linker = predefined_linkers[linker] linker = predefined_linkers[linker]
linker = WrapLinker([linker], profile_thunk) linker = WrapLinker([linker], profile_thunk2)
self.linker = linker self.linker = linker
if isinstance(optimizer, str) or optimizer is None: if isinstance(optimizer, str) or optimizer is None:
...@@ -116,9 +159,10 @@ class ProfileMode(Mode): ...@@ -116,9 +159,10 @@ class ProfileMode(Mode):
apply_time = self.apply_time apply_time = self.apply_time
op_cimpl = self.op_cimpl op_cimpl = self.op_cimpl
message = self.message message = self.message
outputs_size = self.outputs_size
self.print_summary_("print_summary", compile_time, fct_call_time, fct_call, self.print_summary_("print_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, apply_time, op_cimpl, message, outputs_size,
n_apply_to_print, n_ops_to_print) n_apply_to_print, n_ops_to_print)
...@@ -154,15 +198,16 @@ class ProfileMode(Mode): ...@@ -154,15 +198,16 @@ class ProfileMode(Mode):
apply_time = diff_dict(self.apply_time, other.apply_time) apply_time = diff_dict(self.apply_time, other.apply_time)
op_cimpl = self.op_cimpl and other.op_cimpl op_cimpl = self.op_cimpl and other.op_cimpl
message = self.message message = self.message
outputs_size = diff_dict(self.outputs_size,other.outputs_size)
self.print_summary_("print_diff_summary", compile_time, fct_call_time, fct_call, self.print_summary_("print_diff_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, apply_time, op_cimpl, message, outputs_size,
n_apply_to_print=n_apply_to_print, n_apply_to_print=n_apply_to_print,
n_ops_to_print=n_ops_to_print, print_apply=False) n_ops_to_print=n_ops_to_print, print_apply=False)
@staticmethod @staticmethod
def print_summary_(fct_name, compile_time, fct_call_time, fct_call, def print_summary_(fct_name, compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, apply_time, op_cimpl, message, outputs_size,
n_apply_to_print=15, n_ops_to_print=20, print_apply=True): n_apply_to_print=15, n_ops_to_print=20, print_apply=True):
""" """
do the actual printing of print_summary and print_diff_summary. do the actual printing of print_summary and print_diff_summary.
...@@ -348,6 +393,80 @@ class ProfileMode(Mode): ...@@ -348,6 +393,80 @@ class ProfileMode(Mode):
if hasattr(i.type, 'dtype') and i.type.dtype=='float64': if hasattr(i.type, 'dtype') and i.type.dtype=='float64':
print fct.name, i.name, i.type, i print fct.name, i.name, i.type, i
if outputs_size:
fct_memory={}#env->dict(node->(outputs size))
var_mem = {}
for node,val in outputs_size.items():
fct_memory.setdefault(node.env,{})
fct_memory[node.env][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
for env,nodes_mem in fct_memory.iteritems():
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.env is env][0].name
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = env.toposort()
computed, last_user = gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in env.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
for key,val in items[:n_apply_to_print]:
code = ['c']*len(node.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
print ' ... (remaining %i Apply account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(nodes_mem)-n_ops_to_print),
sum(sum(val) for key, val in items[n_ops_to_print:]),
sum(sum(val) for key, val in items[n_ops_to_print:])/size_sum)
print print
print "We guess some tips to make your code faster. If you think of new one, suggest them on the mailing list. Test them before use as they are not guaranted to always give a speed up." print "We guess some tips to make your code faster. If you think of new one, suggest them on the mailing list. Test them before use as they are not guaranted to always give a speed up."
from theano import tensor as T from theano import tensor as T
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论