# timing is stored by node, we compute timing by Op on demand
rval={}
fornode,tinself.apply_time.items():
rval.setdefault(node.op,0)
rval[node.op]+=t
returnrval
def op_callcount(self):
    """dict op -> total number of thunk calls

    Call counts are stored per Apply node (self.apply_callcount); this
    aggregates them per Op on demand.
    """
    rval = {}
    for node, count in self.apply_callcount.items():
        # Apply nodes sharing the same Op are summed into one entry.
        rval.setdefault(node.op, 0)
        rval[node.op] += count
    return rval
def op_nodes(self):
    """dict op -> total number of Apply nodes using that op

    Statistics are stored per Apply node (self.apply_callcount is keyed
    by node); this counts, per Op, how many nodes reference it.
    """
    rval = {}
    # Only the keys (nodes) matter here -- the call counts are not used.
    for node in self.apply_callcount:
        rval.setdefault(node.op, 0)
        rval[node.op] += 1
    return rval
def op_impl(self):
    """dict op -> 'C ' or 'Py', depending on which implementation ran

    For each profiled Apply node, self.apply_cimpl records whether the C
    implementation was used; this tags each Op accordingly.  If the same
    Op appears on several nodes, the last node visited wins.
    """
    rval = {}
    for node in self.apply_callcount:
        # 'C ' is padded to two chars so columns line up with 'Py' in
        # the printed summaries.
        if self.apply_cimpl[node]:
            rval[node.op] = 'C '
        else:
            rval[node.op] = 'Py'
    return rval
def op_flops(self):
    """dict op -> total number of flops

    TODO: unimplemented -- currently always returns an empty dict.  The
    intent (per the unreachable code that used to follow the early
    return) was to aggregate a per-Op flop count from the per-Apply-node
    statistics, like the other op_* summaries here.
    """
    rval = {}
    # TODO: continue here -- accumulate flops per Op from
    # self.apply_callcount once per-node flop counts are available.
    return rval
# NOTE(review): orphaned, whitespace-mangled fragment.  These lines reference
# names (op_time, op_call, op_flops) that are not defined at this scope, the
# tokens have lost their separating whitespace, and the Python-2 print
# statements are torn from some Op-wise summary printer.  Presumably this
# belongs inside a summary method that estimates MFlops/s per Op from
# op.flops, the call count and the measured time -- TODO confirm against
# upstream history before attempting a repair.  Left byte-identical.
fora,tinop_time.items():
ifhasattr(a,'flops'):
op_flops[a]=a.flops*op_call[a]/t/1e6
flops_msg=''
ifop_flops:
flops_msg=' <MFlops/s>'
print'\nHACK WARNING: we print the flops for some OP, but the logic don\' always work. You need to know the internal of Theano to make it work correctly. Otherwise don\'t use!'
print'\nOp-wise summary: <%% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> %s <nb_call> <nb apply> <Op name>'%(flops_msg)
defsummary_ops(self,file=sys.stderr,N=None):
ifself.apply_time:
local_time=sum(self.apply_time.values())
else:
local_time=0
iflocal_time==0:
print>>file,('ProfileMode.summary_ops: total time 0'
print' Theano Op time (included in fct call, Time spent running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)'%(local_time,local_time/total_time*100,time_pr_in_fct)
print'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100)
print'%i Theano fct call, %.3fs per call'%(total_fct_call,time_per_call)
print
print"List of apply that don't have float64 as input but have float64 in outputs. Usefull to know if we forgot some cast when using floatX=float32 or gpu code."
print" <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
print"Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list). Test them first as they are not guaranteed to always provide a speedup."
scalar_op_amdlibm_speed_up=[scal.Mod,scal.Pow,scal.Ceil,scal.Floor,scal.RoundHalfToEven,scal.RoundHalfAwayFromZero,scal.Log,scal.Log2,scal.Log10,scal.Log1p,scal.Exp,scal.Sqrt,scal.Abs,scal.Cos,scal.Sin,scal.Tan,scal.Tanh,scal.Cosh,scal.Sinh,T.nnet.sigm.ScalarSigmoid,T.nnet.sigm.ScalarSoftplus]#Abs, Mod in float{32,64} only
print" - With the default gcc libm, exp in float32 is slower then in float64! Try Theano flags floatX=float64 or install amdlibm and set the theano flags lib.amdlibm=True"
print" - You have a dot operation that was not optimized to dot22 that is faster. Make sure the inputs are float32 or 64 and are the same for both input. Currently they are:",[i.typeforiinnode.inputs]
#tip 5
fora,tinapply_time.iteritems():
node=a
ifisinstance(node.op,RandomFunction):
print" - Replace the default random number generator by 'from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams' as this is is faster. It is still experimental, but seam to work correctly."
ifconfig.device.startswith("gpu"):
print" - MRG_RandomStreams is the only random number supported on the GPU."
""" Print 3 summary that show where the time is spend. The first show an Apply-wise summary, the second show an Op-wise summary, the third show an type-Op-wise summary.
The Apply-wise summary print the timing information for the worst offending Apply nodes. This corresponds to individual Op applications within your graph which take the longest to execute (so if you use dot twice, you will see two entries there).
The Op-wise summary print the execution time of all Apply nodes executing the same Op are grouped together and the total execution time per Op is shown (so if you use dot twice, you will see only one entry there corresponding to the sum of the time spent in each of them). If two Op have different hash value, they will be separate.
The type-Op-wise summary group the result by type of op. So even if two Op have different hash value, they will be merged.
There is a hack in the Op-wise summary. Go see it if you want to know more.
:param n_apply_to_print: the number of apply to print. Default 15, or n_ops_to_print flag.
:param n_ops_to_print: the number of ops to print. Default 20, or n_apply_to_print flag.