"""Return an instance of `Profiler_Maker` which init the count"""
assertmisself
...
...
@@ -128,9 +135,10 @@ class ProfileMode(Mode):
failure=run_cthunk(th.cthunk)
dt=time.time()-t0
iffailure:
raiseRuntimeError(('A C Op raised an exception. ProfileMode cannot'
' tell you what it was though. Use a standard mode such as'
' FAST_RUN to correct the problem.'))
raiseRuntimeError(
('A C Op raised an exception. ProfileMode cannot'
' tell you what it was though. Use a standard mode'
' such as FAST_RUN to correct the problem.'))
else:
t0=time.time()
th()
...
...
@@ -140,7 +148,6 @@ class ProfileMode(Mode):
# insufficient to measure it. So we add an epsilon.
self.apply_time[node]+=max(dt,1e-14)
defprofile_thunk2(i,node,th):
""" Profile the execution time and the memory size.
"""
...
...
@@ -150,25 +157,27 @@ class ProfileMode(Mode):
failure=run_cthunk(th.cthunk)
dt=time.time()-t0
iffailure:
raiseRuntimeError(('A C Op raised an exception. ProfileMode cannot'
' tell you what it was though. Use a standard mode such as'
' FAST_RUN to correct the problem.'))
raiseRuntimeError(
('A C Op raised an exception. ProfileMode cannot'
' tell you what it was though. Use a standard mode'
' such as FAST_RUN to correct the problem.'))
else:
t0=time.time()
th()
dt=time.time()-t0
size=[]
size=[]
forointh.outputs:
ifnothasattr(o[0],'size'):
ifnothasattr(o[0],'size'):
#if the output type doesn't have a size attribute, record -1
#to signify that we can't evaluate it.
#This happens at least for the mtrand.RandomState type (in numpy).
size.append(-1)
continue
s=o[0].size
#can't use o[0].dtype.itemsize as dtype is a str for CudaNdarray
s=o[0].size
#can't use o[0].dtype.itemsize as dtype is a str for
#CudaNdarray
dtype=str(o[0].dtype)
dtype2=dtype[-2:]
dtype2=dtype[-2:]
ifdtype2=='32':
s*=4
elifdtype2=='64':
...
...
@@ -180,12 +189,12 @@ class ProfileMode(Mode):
elifdtype[-3:]=='128':
s*=16
else:
raiseException("Can't determine the memory size of dtype",o[0].dtype)
raiseException("Can't determine the memory size of dtype",
o[0].dtype)
size.append(s)
self.outputs_size[node]=size
self.outputs_size[node]=size
self.apply_time[node]+=max(dt,1e-14)
self.provided_linker=linker
self.provided_optimizer=optimizer
ifisinstance(linker,basestring)orlinkerisNone:
...
...
@@ -207,7 +216,7 @@ class ProfileMode(Mode):
self.optimizer_time=0
self.linker_time=0
defprint_summary(self,**kwargs):
defprint_summary(self,**kwargs):
""" Print 3 summary that show where the time is spend. The first show an Apply-wise summary, the second show an Op-wise summary, the third show an type-Op-wise summary.
The Apply-wise summary prints the timing information for the worst offending Apply nodes. This corresponds to individual Op applications within your graph which take the longest to execute (so if you use dot twice, you will see two entries there).
...
...
@@ -220,7 +229,8 @@ class ProfileMode(Mode):
Currently there are n_apply_to_print, n_ops_to_print and min_memory_size
scalar_op_amdlibm_speed_up=[scal.Mod,scal.Pow,scal.Ceil,scal.Floor,scal.RoundHalfToEven,scal.RoundHalfAwayFromZero,scal.Log,scal.Log2,scal.Log10,scal.Log1p,scal.Exp,scal.Sqrt,scal.Abs,scal.Cos,scal.Sin,scal.Tan,scal.Tanh,scal.Cosh,scal.Sinh,T.nnet.sigm.ScalarSigmoid,T.nnet.sigm.ScalarSoftplus]#Abs, Mod in float{32,64} only
print" - With the default gcc libm, exp in float32 is slower then in float64! Try Theano flag floatX=float64, or install amdlibm and set the theano flags lib.amdlibm=True"
printed_tip=True
...
...
@@ -656,10 +694,12 @@ Test them first, as they are not guaranteed to always provide a speedup."""
ifnotprinted_tip:
print" Sorry, no tip for today."
register_mode('PROFILE_MODE',ProfileMode())
register_mode('PROFILE_MODE',ProfileMode())
#needed to print the profile at the end automatically