"""Return an instance of `Profiler_Maker` which init the count"""
"""Return an instance of `Profiler_Maker` which init the count"""
assertmisself
assertmisself
...
@@ -128,9 +135,10 @@ class ProfileMode(Mode):
...
@@ -128,9 +135,10 @@ class ProfileMode(Mode):
failure=run_cthunk(th.cthunk)
failure=run_cthunk(th.cthunk)
dt=time.time()-t0
dt=time.time()-t0
iffailure:
iffailure:
raiseRuntimeError(('A C Op raised an exception. ProfileMode cannot'
raiseRuntimeError(
' tell you what it was though. Use a standard mode such as'
('A C Op raised an exception. ProfileMode cannot'
' FAST_RUN to correct the problem.'))
' tell you what it was though. Use a standard mode'
' such as FAST_RUN to correct the problem.'))
else:
else:
t0=time.time()
t0=time.time()
th()
th()
...
@@ -140,7 +148,6 @@ class ProfileMode(Mode):
...
@@ -140,7 +148,6 @@ class ProfileMode(Mode):
# insufficient to measure it. So we add an epsilon.
# insufficient to measure it. So we add an epsilon.
self.apply_time[node]+=max(dt,1e-14)
self.apply_time[node]+=max(dt,1e-14)
defprofile_thunk2(i,node,th):
defprofile_thunk2(i,node,th):
""" Profile the execution time and the memory size.
""" Profile the execution time and the memory size.
"""
"""
...
@@ -150,25 +157,27 @@ class ProfileMode(Mode):
...
@@ -150,25 +157,27 @@ class ProfileMode(Mode):
failure=run_cthunk(th.cthunk)
failure=run_cthunk(th.cthunk)
dt=time.time()-t0
dt=time.time()-t0
iffailure:
iffailure:
raiseRuntimeError(('A C Op raised an exception. ProfileMode cannot'
raiseRuntimeError(
' tell you what it was though. Use a standard mode such as'
('A C Op raised an exception. ProfileMode cannot'
' FAST_RUN to correct the problem.'))
' tell you what it was though. Use a standard mode'
' such as FAST_RUN to correct the problem.'))
else:
else:
t0=time.time()
t0=time.time()
th()
th()
dt=time.time()-t0
dt=time.time()-t0
size=[]
size=[]
forointh.outputs:
forointh.outputs:
ifnothasattr(o[0],'size'):
ifnothasattr(o[0],'size'):
#if the output type don't have a size attribute, set -1
#if the output type don't have a size attribute, set -1
#to signify we can't evaluate it.
#to signify we can't evaluate it.
#This happen at least for mtrand.RandomState type(in numpy)
#This happen at least for mtrand.RandomState type(in numpy)
size.append(-1)
size.append(-1)
continue
continue
s=o[0].size
s=o[0].size
#can't use o[0].dtype.itemsize as dtype is a str for CudaNdarray
#can't use o[0].dtype.itemsize as dtype is a str for
#CudaNdarray
dtype=str(o[0].dtype)
dtype=str(o[0].dtype)
dtype2=dtype[-2:]
dtype2=dtype[-2:]
ifdtype2=='32':
ifdtype2=='32':
s*=4
s*=4
elifdtype2=='64':
elifdtype2=='64':
...
@@ -180,12 +189,12 @@ class ProfileMode(Mode):
...
@@ -180,12 +189,12 @@ class ProfileMode(Mode):
elifdtype[-3:]=='128':
elifdtype[-3:]=='128':
s*=16
s*=16
else:
else:
raiseException("Can't determine the memory size of dtype",o[0].dtype)
raiseException("Can't determine the memory size of dtype",
o[0].dtype)
size.append(s)
size.append(s)
self.outputs_size[node]=size
self.outputs_size[node]=size
self.apply_time[node]+=max(dt,1e-14)
self.apply_time[node]+=max(dt,1e-14)
self.provided_linker=linker
self.provided_linker=linker
self.provided_optimizer=optimizer
self.provided_optimizer=optimizer
ifisinstance(linker,basestring)orlinkerisNone:
ifisinstance(linker,basestring)orlinkerisNone:
...
@@ -207,7 +216,7 @@ class ProfileMode(Mode):
...
@@ -207,7 +216,7 @@ class ProfileMode(Mode):
self.optimizer_time=0
self.optimizer_time=0
self.linker_time=0
self.linker_time=0
defprint_summary(self,**kwargs):
defprint_summary(self,**kwargs):
""" Print 3 summary that show where the time is spend. The first show an Apply-wise summary, the second show an Op-wise summary, the third show an type-Op-wise summary.
""" Print 3 summary that show where the time is spend. The first show an Apply-wise summary, the second show an Op-wise summary, the third show an type-Op-wise summary.
The Apply-wise summary print the timing information for the worst offending Apply nodes. This corresponds to individual Op applications within your graph which take the longest to execute (so if you use dot twice, you will see two entries there).
The Apply-wise summary print the timing information for the worst offending Apply nodes. This corresponds to individual Op applications within your graph which take the longest to execute (so if you use dot twice, you will see two entries there).
...
@@ -220,7 +229,8 @@ class ProfileMode(Mode):
...
@@ -220,7 +229,8 @@ class ProfileMode(Mode):
Currently there is n_apply_to_print, n_ops_to_print and min_memory_size
Currently there is n_apply_to_print, n_ops_to_print and min_memory_size
scalar_op_amdlibm_speed_up=[scal.Mod,scal.Pow,scal.Ceil,scal.Floor,scal.RoundHalfToEven,scal.RoundHalfAwayFromZero,scal.Log,scal.Log2,scal.Log10,scal.Log1p,scal.Exp,scal.Sqrt,scal.Abs,scal.Cos,scal.Sin,scal.Tan,scal.Tanh,scal.Cosh,scal.Sinh,T.nnet.sigm.ScalarSigmoid,T.nnet.sigm.ScalarSoftplus]#Abs, Mod in float{32,64} only
print" - With the default gcc libm, exp in float32 is slower then in float64! Try Theano flag floatX=float64, or install amdlibm and set the theano flags lib.amdlibm=True"
print" - With the default gcc libm, exp in float32 is slower then in float64! Try Theano flag floatX=float64, or install amdlibm and set the theano flags lib.amdlibm=True"
printed_tip=True
printed_tip=True
...
@@ -656,10 +694,12 @@ Test them first, as they are not guaranteed to always provide a speedup."""
...
@@ -656,10 +694,12 @@ Test them first, as they are not guaranteed to always provide a speedup."""
ifnotprinted_tip:
ifnotprinted_tip:
print" Sorry, no tip for today."
print" Sorry, no tip for today."
register_mode('PROFILE_MODE',ProfileMode())
register_mode('PROFILE_MODE',ProfileMode())
#needed to print the profile at the end automatically
#needed to print the profile at the end automatically