提交 f4402e3b authored 作者: James Bergstra's avatar James Bergstra

adding profiling file

上级 3cefe303
"""ProfileStats object for runtime and memory profiling.
"""
#
# TODO: measure memory usage like ProfileMode did
# TODO: put the optimization tips into a tips section??
# TODO: add tip to use specify_shape (is specify_shape even in library doc?)
# TODO: ensure field width for string fields makes columns line up
# TODO: what to do about 'diff summary'? (ask Fred?)
#
__authors__ = "James Bergstra"
__reviewer__ = "Razvan Pascanu"
__copyright__ = "(c) 2011, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
import atexit
import sys
import theano
config = theano.config
_atexit_print_list = []
_atexit_print_file = sys.stderr
def _atexit_print_fn():
"""Print ProfileStat objects in _atexit_print_list to _atexit_print_file
"""
for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0:
ps.summary(file=_atexit_print_file)
else:
print 'Skipping empty Profile'
atexit.register(_atexit_print_fn)
class ProfileStats(object):
"""
Object to store runtime and memory profiling information for all of
Theano's operations: compilation, optimization, execution.
"""
#
# Note on implementation:
# Class variables are used here so that each one can be
# documented and initialized together.
# dictionary variables are initialized with None.
#
compile_time = 0.0
# Total time spent in body of orig_function,
# dominated by graph optimization and compilation of C
#
fct_call_time = 0.0
# The total time spent in Function.__call__
#
fct_callcount = 0
# Number of calls to Function.__call__
#
vm_call_time = 0.0
# Total time spent in Function.fn.__call__
#
apply_time = None
# dict from node -> float runtime
#
apply_callcount = None
# dict from node -> number of executions
#
apply_cimpl = None
# dict from node -> bool (1 if c, 0 if py)
#
message = None
# pretty string to print in summary, to identify this output
#
outputs_size = None
# node -> size of allocated output
#
optimizer_time = 0.0
# time spent optimizing graph (FunctionMaker.__init__)
linker_time = 0.0
# time spent linking graph (FunctionMaker.create)
def __init__(self, atexit_print=True, **kwargs):
"""
atexit_print - bool. True means that this object will be printed to
stderr (using .summary()) at the end of the program.
**kwargs - misc initializers. These should (but need not) match the
names of the class vars declared in this class.
"""
self.apply_callcount = {}
self.output_size = {}
self.apply_time = {}
self.apply_cimpl = {}
self.__dict__.update(kwargs)
#print >> sys.stderr, "self.message", self.message
if atexit_print:
global _atexit_print_list
_atexit_print_list.append(self)
def op_time(self):
"""dict op -> total time on thunks"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, t in self.apply_time.items():
rval.setdefault(node.op, 0)
rval[node.op] += t
return rval
def op_callcount(self):
"""dict op -> total number of thunk calls"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += count
return rval
def op_nodes(self):
"""dict op -> total number of nodes"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += 1
return rval
def op_impl(self):
"""dict op -> total number of nodes"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node in self.apply_callcount:
if self.apply_cimpl[node]:
rval[node.op] = 'C '
else:
rval[node.op] = 'Py'
return rval
def op_flops(self):
"""dict op -> total number of flops"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
return rval #TODO: continue here
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += 1
return rval
for a,t in op_time.items():
if hasattr(a,'flops'):
op_flops[a]=a.flops*op_call[a]/t/1e6
flops_msg=''
if op_flops:
flops_msg=' <MFlops/s>'
print '\nHACK WARNING: we print the flops for some OP, but the logic don\' always work. You need to know the internal of Theano to make it work correctly. Otherwise don\'t use!'
print '\nOp-wise summary: <%% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> %s <nb_call> <nb apply> <Op name>'%(flops_msg)
def summary_ops(self, file=sys.stderr, N=None):
if self.apply_time:
local_time = sum(self.apply_time.values())
else:
local_time = 0
if local_time == 0:
print >> file, ('ProfileMode.summary_ops: total time 0'
' (did you forget to enable counters?)')
return
op_time = self.op_time()
op_call = self.op_callcount()
op_apply = self.op_nodes()
op_flops = self.op_flops()
op_impl = self.op_impl()
if N is None:
N = len(self.op_flops)
otimes = [(t*100/local_time,
t,
op,
op_impl.get(op, ' '),
op_call.get(op, 0),
op_apply.get(op,0))
for op, t in op_time.items()]
otimes.sort()
otimes.reverse()
tot=0
print >> file, 'Ops'
print >> file, '---'
print >> file, '<% time> <cumulative %%> <apply time> <cumulative seconds> <time per call> <nb_call> <Op name>'
for f,t,a,impl,nb_call,nb_apply in otimes[:N]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
if op_flops:
print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (
f, ftot, t, tot, t/nb_call, impl, op_flops.get(a,-1), nb_call, nb_apply, a)
else:
print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
print >>file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
%(max(0, len(otimes)-N),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
print >> file, ''
def summary_nodes(self, file=sys.stderr, N=None):
if self.apply_time:
local_time = sum(self.apply_time.values())
else:
local_time = 0
if local_time == 0:
print >> file, ('ProfileMode.summary_nodes: total time 0'
' (did you forget to enable counters?)')
return
print >> file, 'Thunks'
print >> file, '------'
print >> file, '<% time> <cumulative %%> <apply time> <cumulative seconds> <time per call> <nb_call> <Apply Op name>'
atimes = [(
t*100/local_time,
t,
a,
self.apply_callcount[a])
for a, t in self.apply_time.items()]
atimes.sort()
atimes.reverse()
tot=0
for (f, t, a, nb_call) in atimes[:N]:
tot+=t
ftot=tot*100/local_time
if nb_call==0:
continue
print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%(
f, ftot, t, tot, t/nb_call,nb_call, str(a))
print >> file, ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(atimes)-N),
sum(f for f, t, a, nb_call in atimes[N:]),
sum(t for f, t, a, nb_call in atimes[N:]))
print >> file, ''
def summary_function(self, file):
print >> file, 'Function profiling'
print >> file, '=================='
print >> file, ' Message: %s'%self.message
print >> file, ' Time in %i calls to Function.__call__: %es' % (
self.fct_callcount, self.fct_call_time)
if self.fct_call_time>0:
print >> file, ' Time in Function.fn.__call__: %es (%.3f%%)' %(
self.vm_call_time, 100*self.vm_call_time / self.fct_call_time)
local_time = sum(self.apply_time.values())
if local_time > 0:
print >> file, ' Time in thunks: %es (%.3f%%)' %(
local_time, 100*local_time / self.fct_call_time)
print >> file, ''
def summary(self, file=sys.stderr, n_ops_to_print=20, n_applies_to_print=20):
self.summary_function(file)
local_time = sum(self.apply_time.values())
if local_time > 0:
self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print)
else:
print >> file, " No node time accumulated (hint: try config profiling.time_thunks=1)"
if 0: # old code still to be ported from ProfileMode
def long_print(self, file=sys.stderr, fct_name=None, message=None,
n_apply_to_print=15, n_ops_to_print=20, print_apply=False):
"""
Print a readable summary of the stats.
param: n_apply_to_print the number of apply to print. Default 15.
param: n_ops_to_print the number of ops to print. Default 20.
"""
local_time = sum(self.apply_time.values())
print ''
print 'ProfileMode.long_print()'
print 'name = %s'%fct_name
print 'msg = %s'%message
print '---------------------------'
print ''
print 'Total time spent running thunks: %.3fs'% local_time
sop_time={}
sop_call={}
sop_op = {}
sop_c={} #map each op class to Bool. True iff all applies were done in c.
for a,t in op_time.items():
typ = type(a)
sop_time.setdefault(typ,0)
sop_time[typ]+=t
sop_op.setdefault(typ,0)
sop_op[typ]+=1
sop_c.setdefault(typ,True)
sop_c[typ]=sop_c[typ] and op_cimpl.get(a, False)
sop_call[typ]=sop_call.get(typ,0)+op_call[a]
print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> <nb_call> <nb_op> <nb_op> <Op name>'
sotimes = [(t*100/local_time, t, a, sop_c[a], sop_call[a], sop_op[a]) for a, t in sop_time.items()]
sotimes.sort()
sotimes.reverse()
tot=0
for f,t,a,ci, nb_call, nb_op in sotimes[:n_ops_to_print]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
if ci:
msg = '*'
else:
msg = ' '
print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_op, a)
print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(sotimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]),
sum(t for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]))
total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values())
total_fct_call = sum(fct_call.values())
other_time = total_time - total_fct_time - compile_time
print
print 'Theano fct summary: <% total fct time> <total time> <time per call> <nb call> <fct name>'
for key in fct_call.keys():
if fct_call[key]>0:
print ' %4.1f%% %.3fs %.2es %d %s'%(fct_call_time[key]/total_fct_time*100 ,fct_call_time[key],
fct_call_time[key]/fct_call[key], fct_call[key], key.name)
else:
print ' NOT CALLED',key.name
if total_fct_time>0:
time_pr_in_fct=local_time/total_fct_time*100
time_per_call=total_fct_time/total_fct_call
else:
time_pr_in_fct=0
time_per_call=0
print
print 'Time since import %.3fs'%(total_time)
print 'Compile time: %.3fs %.1f%%'%(compile_time, compile_time/total_time*100)
print 'Theano fct call %.3fs %.1f%%'%(total_fct_time,total_fct_time/total_time*100)
print ' Theano Op time (included in fct call, Time spent running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)'% (local_time,local_time/total_time*100, time_pr_in_fct)
print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100)
print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call)
print
print "List of apply that don't have float64 as input but have float64 in outputs. Usefull to know if we forgot some cast when using floatX=float32 or gpu code."
print '<Apply> <Apply position> <fct name> <inputs type> <outputs type>'
for fct in fct_call.keys():
for idx, node in enumerate(fct.maker.env.toposort()):
if any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.outputs) and not any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.inputs):
print str(node), idx, fct.name, str([getattr(i,'dtype',None) for i in node.inputs]),str([getattr(i,'dtype',None) for i in node.outputs])
if any([x[2].__name__.startswith("Gpu") for x in sotimes]):
cpu=[]
gpu=[]
trans=[]
for so in sotimes:
if so[2].__name__ in ["HostFromGpu", "GpuFromHost"]:
trans.append(so)
elif so[2].__name__.startswith("Gpu"):
gpu.append(so)
else:
cpu.append(so)
sum_cpu=sum(so[1] for so in cpu)
sum_gpu=sum(so[1] for so in gpu)
sum_trans=sum(so[1] for so in trans)
print
print "Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op"%(
sum_cpu, sum_cpu/local_time*100, sum_gpu, sum_gpu/local_time*100, sum_trans, sum_trans/local_time*100)
print "Theano function input that are float64"
print "<fct name> <input name> <input type> <str input>"
for fct in fct_call.keys():
for i in fct.input_storage:
if hasattr(i.type, 'dtype') and i.type.dtype=='float64':
print fct.name, i.name, i.type, i
if outputs_size:
fct_memory={}#env->dict(node->(outputs size))
var_mem = {}
for node,val in outputs_size.items():
fct_memory.setdefault(node.env,{})
fct_memory[node.env][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
for env,nodes_mem in fct_memory.iteritems():
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.env is env][0].name
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = env.toposort()
computed, last_user = gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in env.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
for key,val in items[:n_apply_to_print]:
code = ['c']*len(node.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
print ' ... (remaining %i Apply account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(nodes_mem)-n_ops_to_print),
sum(sum(val) for key, val in items[n_ops_to_print:]),
sum(sum(val) for key, val in items[n_ops_to_print:])/size_sum)
print
print "Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list). Test them first as they are not guaranteed to always provide a speedup."
from theano import tensor as T
from theano.tensor.raw_random import RandomFunction
import theano
import theano.scalar as scal
scalar_op_amdlibm_no_speed_up = [scal.LT, scal.GT, scal.LE, scal.GE, scal.EQ, scal.NEQ, scal.InRange, scal.Switch, scal.OR, scal.XOR, scal.AND, scal.Invert, scal.Maximum, scal.Minimum, scal.Add, scal.Mul, scal.Sub, scal.TrueDiv, scal.IntDiv, scal.Clip, scal.First, scal.Second, scal.Identity, scal.Cast, scal.Sgn, scal.Neg, scal.Inv, scal.Sqr ]
scalar_op_amdlibm_speed_up = [scal.Mod, scal.Pow, scal.Ceil, scal.Floor, scal.RoundHalfToEven, scal.RoundHalfAwayFromZero, scal.Log, scal.Log2, scal.Log10, scal.Log1p, scal.Exp, scal.Sqrt, scal.Abs, scal.Cos, scal.Sin, scal.Tan, scal.Tanh, scal.Cosh, scal.Sinh, T.nnet.sigm.ScalarSigmoid, T.nnet.sigm.ScalarSoftplus ]#Abs, Mod in float{32,64} only
def get_scalar_ops(s):
if isinstance(s, theano.scalar.Composite):
l = []
for node in s.env.toposort():
l+=get_scalar_ops(node.op)
return l
else: return [s]
def list_scalar_op(op):
if isinstance(op.scalar_op, theano.scalar.Composite):
return get_scalar_ops(op.scalar_op)
else: return [op.scalar_op]
def amdlibm_speed_up(op):
if not isinstance(op, T.Elemwise):
return False
else:
l = list_scalar_op(op)
for s_op in l:
if s_op.__class__ in scalar_op_amdlibm_speed_up:
return True
elif s_op.__class__ not in scalar_op_amdlibm_no_speed_up:
import pdb;pdb.set_trace()
print "We don't know if amdlibm will accelerate this scalar op.", s_op
return False
def exp_float32_op(op):
if not isinstance(op, T.Elemwise):
return False
else:
l = list_scalar_op(op)
return any([s_op.__class__ in [scal.Exp] for s_op in l])
#tip 1
if config.floatX=='float64':
print " - Try the Theano flag floatX=float32"
#tip 2
if not config.lib.amdlibm and any([amdlibm_speed_up(a.op) for i,a in apply_time]):
print " - Try installing amdlibm and set the Theano flag lib.amdlibm=True. This speed up only some Elemwise operation."
#tip 3
if not config.lib.amdlibm and any([exp_float32_op(a.op) and a.inputs[0].dtype=='float32' for i,a in apply_time]):
print " - With the default gcc libm, exp in float32 is slower then in float64! Try Theano flags floatX=float64 or install amdlibm and set the theano flags lib.amdlibm=True"
#tip 4
for a, t in apply_time.iteritems():
node = a
if isinstance(node.op, T.Dot) and all([ len(i.type.broadcastable)==2 for i in node.inputs]):
print " - You have a dot operation that was not optimized to dot22 that is faster. Make sure the inputs are float32 or 64 and are the same for both input. Currently they are:",[i.type for i in node.inputs]
#tip 5
for a, t in apply_time.iteritems():
node = a
if isinstance(node.op, RandomFunction):
print " - Replace the default random number generator by 'from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams' as this is is faster. It is still experimental, but seam to work correctly."
if config.device.startswith("gpu"):
print " - MRG_RandomStreams is the only random number supported on the GPU."
break
def print_summary(self,
n_apply_to_print=config.ProfileMode.n_apply_to_print,
n_ops_to_print=config.ProfileMode.n_ops_to_print):
""" Print 3 summary that show where the time is spend. The first show an Apply-wise summary, the second show an Op-wise summary, the third show an type-Op-wise summary.
The Apply-wise summary print the timing information for the worst offending Apply nodes. This corresponds to individual Op applications within your graph which take the longest to execute (so if you use dot twice, you will see two entries there).
The Op-wise summary print the execution time of all Apply nodes executing the same Op are grouped together and the total execution time per Op is shown (so if you use dot twice, you will see only one entry there corresponding to the sum of the time spent in each of them). If two Op have different hash value, they will be separate.
The type-Op-wise summary group the result by type of op. So event if two Op have different hash value, they will be merged.
Their is an hack with the Op-wise summary. Go see it if you want to know more.
:param n_apply_to_print: the number of apply to print. Default 15, or n_ops_to_print flag.
:param n_ops_to_print: the number of ops to print. Default 20, or n_apply_to_print flag.
"""
fct_call_time = self.mode.fct_call_time
fct_call = self.mode.fct_call
apply_time = self.apply_time
op_cimpl = self.op_cimpl
message = self.message
outputs_size = self.outputs_size
self.print_summary_("print_summary",
None,
None,
None,
apply_time,
op_cimpl,
message,
outputs_size,
n_apply_to_print,
n_ops_to_print)
def print_diff_summary(self, other, n_apply_to_print=15, n_ops_to_print=20):
""" As print_summary, but print the difference on two different profile mode.
TODO: Also we don't print the Apply-wise summary as it don't work for now.
TODO: make comparaison with gpu code.
:param other: the other instance of ProfileMode that we want to be compared to.
:param n_apply_to_print: the number of apply to print. Default 15.
:param n_ops_to_print: the number of ops to print. Default 20.
"""
def diff_dict(a_time,b_time_):
r = {}
b_time = copy.copy(b_time_)
for a,ta in a_time.items():
r.setdefault(a,0)
tb = b_time.pop(a,0)
r[a]+=ta-tb
#they are missing in a
for a,t in b_time.items():
r.setdefault(a,0)
r[a]+=t
return r
compile_time = self.compile_time-other.compile_time
fct_call_time = diff_dict(self.fct_call_time,other.fct_call_time)
fct_call = diff_dict(self.fct_call,other.fct_call)
apply_time = diff_dict(self.apply_time, other.apply_time)
op_cimpl = self.op_cimpl and other.op_cimpl
message = self.message
outputs_size = diff_dict(self.outputs_size,other.outputs_size)
self.print_summary_("print_diff_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size,
n_apply_to_print=n_apply_to_print,
n_ops_to_print=n_ops_to_print, print_apply=False)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论