提交 8b9676e3 authored 作者: Frederic Bastien's avatar Frederic Bastien

ConvOp automatically selects the fastest C code implementation when none is specified.

The timing was done on maggie, so it could differ on other computers.
上级 94c8bc56
...@@ -197,6 +197,69 @@ class ConvOp(Op): ...@@ -197,6 +197,69 @@ class ConvOp(Op):
'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned'] 'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned']
"""These attributes uniquely identify the behaviour of this op for given inputs""" """These attributes uniquely identify the behaviour of this op for given inputs"""
#the value of speed_unroll_batch_kern,speed_unroll_patch_noshape,speed_unroll_patch_shape
#have bean calculated on maggie36 when their is only 1 session logged on and only this was running.
#It is an Intel(R) Xeon(R) CPU E5430 @ 2.66GHz. It is computer with theano/tensor/nnet/tests/speed_test_conv.py
# and took 5 minutes to run.
#TODO: we should compute this table for each computer/os as this can change.
# I saw on one computer that the speed with the shape can be slower then without!
# using the real shape and the same dtype could also help.
#unroll_batch, unroll_kern, valid time, full time
speed_unroll_batch_kern=[(1, 1, 2.4661250114440918, 6.5472931861877441) ,
(1, 2, 1.5869178771972656, 5.1499760150909424) ,
(1, 3, 1.4270510673522949, 3.6593470573425293) ,
(1, 4, 1.3373479843139648, 3.3451821804046631) ,
(1, 5, 1.2818830013275146, 3.1444568634033203) ,
(1, 6, 1.2521560192108154, 3.0256359577178955) ,
(1, 10, 1.2134110927581787, 2.9174180030822754) ,
(2, 1, 1.657214879989624, 4.5261678695678711) ,
(2, 2, 1.2123160362243652, 2.9747390747070312) ,
(2, 3, 1.0758891105651855, 2.5690360069274902) ,
(2, 4, 1.0683329105377197, 2.4233770370483398) ,
(2, 5, 1.0955719947814941, 2.3999948501586914) ,
(2, 6, 1.5935721397399902, 2.6878271102905273) ,
(2, 10, 1.8511250019073486, 3.2417428493499756) ,
(3, 1, 1.5948119163513184, 3.631148099899292) ,
(3, 2, 1.0761330127716064, 2.6011371612548828) ,
(3, 3, 1.0551531314849854, 2.4200370311737061) ,
(3, 4, 1.3930759429931641, 2.5211219787597656) ,
(3, 5, 1.4330689907073975, 2.5704989433288574) ,
(3, 6, 1.362138032913208, 2.5964410305023193) ,
(3, 10, 1.6582000255584717, 2.9907989501953125) ,
(4, 1, 1.4793620109558105, 3.3473429679870605) ,
(4, 2, 1.0671560764312744, 2.4171769618988037) ,
(4, 3, 1.2569692134857178, 2.2807950973510742) ,
(4, 4, 1.3456289768218994, 2.6219108104705811) ,
(4, 5, 1.4055080413818359, 2.4606490135192871) ,
(4, 6, 1.372107982635498, 2.551663875579834) ,
(4, 10, 1.599470853805542, 2.9172940254211426) ,
(5, 1, 1.4115700721740723, 3.2077109813690186) ,
(5, 2, 1.0635769367218018, 2.2648060321807861) ,
(5, 3, 1.3842809200286865, 2.6135518550872803) ,
(5, 4, 1.3470511436462402, 2.3852400779724121) ,
(5, 5, 1.3539440631866455, 2.5245928764343262) ,
(5, 6, 1.4037849903106689, 2.5985310077667236) ,
(5, 10, 1.6120610237121582, 2.8127608299255371) ,
(6, 1, 1.3623628616333008, 3.021122932434082) ,
(6, 2, 1.1697649955749512, 2.6285450458526611) ,
(6, 3, 1.2980999946594238, 2.4746189117431641) ,
(6, 4, 1.3739941120147705, 2.5579929351806641) ,
(6, 5, 1.3967819213867188, 2.5522029399871826) ,
(6, 6, 1.4279270172119141, 2.6127138137817383) ,
(6, 10, 1.605496883392334, 2.864037036895752) ,
(10, 1, 1.6401121616363525, 2.970099925994873) ,
(10, 2, 1.46710205078125, 2.7231831550598145) ,
(10, 3, 1.4193780422210693, 2.6087639331817627) ,
(10, 4, 1.4657118320465088, 2.6246678829193115) ,
(10, 5, 1.5052611827850342, 2.6542458534240723) ,
(10, 6, 1.5214400291442871, 2.7243161201477051) ,
(10, 10, 1.6116268634796143, 2.956165075302124)]
#valid time, full time
speed_unroll_patch_noshape=[2.0109100341796875, 5.8175678253173828]
#valid time, full time
speed_unroll_patch_shape=[1.2967290878295898, 5.5283889770507812]
def c_compile_args(self): def c_compile_args(self):
#when the ksph==(1,1) gcc 4.3.0 segfault during the compilation with -O3. #when the ksph==(1,1) gcc 4.3.0 segfault during the compilation with -O3.
...@@ -232,9 +295,11 @@ class ConvOp(Op): ...@@ -232,9 +295,11 @@ class ConvOp(Op):
def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None, def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None,
dx=None, dy=None, dx=None, dy=None,
output_mode='valid', unroll_batch=0, output_mode='valid',
unroll_kern=0,
unroll_patch=True, unroll_batch=None,
unroll_kern=None,
unroll_patch=None,
imshp_logical=None, imshp_logical=None,
kshp_logical=None, kshp_logical=None,
kshp_logical_top_aligned=True, kshp_logical_top_aligned=True,
...@@ -246,10 +311,16 @@ class ConvOp(Op): ...@@ -246,10 +311,16 @@ class ConvOp(Op):
code. code.
NOTES ON OPTIMIZATION: NOTES ON OPTIMIZATION:
If ALL (imshp, kshp, nkern and bsize) parameters are provided, we can Their is two type of optimization. The first is the selection of the
generate faster c-code. This make a significant difference for the fastest algo when bsize and nkern are probided with imshp and kshp.
'full' output_mode with unroll_patch=True. The current fastest By default we try to select the fastest version. You can specify it
implementation on x86-64 uses {unroll_batch=4, unroll_kern=4, with the unroll_batch, unroll_kern, and unroll_patch parameter.
The second type of optimization is hardcoding some dimensions into the code
when all shape are know.
This make a significant difference for the 'full' output_mode.
Some times, the fastest implementation on x86-64 uses {unroll_batch=4, unroll_kern=4,
unroll_patch=False} with all other shape parameters being provided. unroll_patch=False} with all other shape parameters being provided.
For optimizing other architectures, see: For optimizing other architectures, see:
...@@ -351,6 +422,7 @@ class ConvOp(Op): ...@@ -351,6 +422,7 @@ class ConvOp(Op):
self.unroll_kern=unroll_kern self.unroll_kern=unroll_kern
self.unroll_patch=unroll_patch self.unroll_patch=unroll_patch
#downcast unroll_batch if not a divisor of batch size
if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0: if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:
if self.bsize<=self.unroll_batch: if self.bsize<=self.unroll_batch:
...@@ -364,12 +436,13 @@ class ConvOp(Op): ...@@ -364,12 +436,13 @@ class ConvOp(Op):
warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%i)"\ warnstr = "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%i)"\
"must be 0 or a divisor of bsize(%i). We revert it to %i. This"\ "must be 0 or a divisor of bsize(%i). We revert it to %i. This"\
"won't change the result, but may make it slower." " won't change the result, but may make it slower."
_warn(warnstr % (self.unroll_batch, self.bsize, new)) _warn(warnstr % (self.unroll_batch, self.bsize, new))
self.unroll_batch=new self.unroll_batch=new
if self.unroll_kern>0 and self.nkern % unroll_kern!=0: #downcast unroll_kern if not a divisor of nb of kernel
if self.unroll_kern>0 and self.nkern % self.unroll_kern!=0:
if self.nkern<=self.unroll_kern: if self.nkern<=self.unroll_kern:
self.unroll_kern = self.nkern self.unroll_kern = self.nkern
...@@ -404,6 +477,41 @@ class ConvOp(Op): ...@@ -404,6 +477,41 @@ class ConvOp(Op):
"(Hint: kerns must fit inside image in valid mode)")% "(Hint: kerns must fit inside image in valid mode)")%
(self.imshp_logical,self.kshp_logical)) (self.imshp_logical,self.kshp_logical))
if self.unroll_kern is None and self.unroll_batch is None and self.unroll_patch is None:
#no version specified. Find the faster we have
if self.bsize is None and self.nkern is None:
self.unroll_patch = True
elif self.bsize is not None and self.nkern is not None:
bsize=self.bsize
nkern=self.nkern
if bsize is None:
bsize=1
if nkern is None:
nkern=1
mode_idx=0
if self.out_mode!="valid":
mode_idx=1
if all_shape:
time_unroll_patch = self.speed_unroll_patch_shape[mode_idx]
else:
time_unroll_patch = self.speed_unroll_patch_noshape[mode_idx]
time_unroll_batch_kern = 9999999
for i in range(len(self.speed_unroll_batch_kern)):
if bsize%self.speed_unroll_batch_kern[i][0]==0 and nkern%self.speed_unroll_batch_kern[i][1]==0:
if self.speed_unroll_batch_kern[i][2+mode_idx]<time_unroll_batch_kern:
time_unroll_batch_kern=self.speed_unroll_batch_kern[i][2+mode_idx]
time_unroll_batch_kern_idx=i
if time_unroll_patch < time_unroll_batch_kern:
self.unroll_patch = True
else:
self.unroll_batch=self.speed_unroll_batch_kern[time_unroll_batch_kern_idx][0]
self.unroll_kern=self.speed_unroll_batch_kern[time_unroll_batch_kern_idx][1]
self.unroll_patch = False
print "AUTO FIND VERSION OF C_CODE OF CONV OP"
print self.unroll_batch, self.unroll_kern, self.unroll_patch, self.bsize, self.nkern, time_unroll_patch, time_unroll_batch_kern
self._rehash() self._rehash()
if config.op.set_flops: if config.op.set_flops:
self.set_flops() self.set_flops()
...@@ -673,7 +781,7 @@ class ConvOp(Op): ...@@ -673,7 +781,7 @@ class ConvOp(Op):
_warn("OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\ _warn("OPTIMISATION WARNING: in ConvOp.grad() we can't determine "\
"a good unroll value for the batch. Maybe you can optimize this!") "a good unroll value for the batch. Maybe you can optimize this!")
if un_k!=0 and nkern%un_k!=0: if all_shape and un_k!=0 and nkern%un_k!=0:
if nkern<un_k: if nkern<un_k:
un_k = nkern un_k = nkern
else: else:
...@@ -740,7 +848,7 @@ class ConvOp(Op): ...@@ -740,7 +848,7 @@ class ConvOp(Op):
return ['<numpy/noprefix.h>', '<iostream>', '<sstream>' ] return ['<numpy/noprefix.h>', '<iostream>', '<sstream>' ]
def c_code_cache_version(self): def c_code_cache_version(self):
return (1) return (2)
def c_support_code(self): def c_support_code(self):
return """ return """
......
...@@ -42,7 +42,7 @@ global_rng = N.random.RandomState(3423489) ...@@ -42,7 +42,7 @@ global_rng = N.random.RandomState(3423489)
dmatrix4=T.TensorType('float64', (False, False, False, False)) dmatrix4=T.TensorType('float64', (False, False, False, False))
def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, def exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp, kshps, nkerns,
unroll_batch=0, unroll_kern=0, img=T.dmatrix(), validate=True, unroll_batch=0, unroll_kern=0, img=T.dmatrix(), validate=True,
conv_op_py=False, do_print=True, repeat=1, conv_op_py=False, do_print=True, repeat=1,
unroll_patch=False, unroll_patch_size=False, verbose=0): unroll_patch=False, unroll_patch_size=False, verbose=0):
...@@ -136,15 +136,72 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, ...@@ -136,15 +136,72 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns,
return tctot, tpytot, ntot return tctot, tpytot, ntot
def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns,
unroll_batch=0, unroll_kern=0, img=T.dmatrix(),
do_print=True, repeat=1,
unroll_patch=False, unroll_patch_size=False, verbose=0):
# build actual input images
imgval = global_rng.rand(bsize, imshp[0], imshp[1], imshp[2])
a=T.dmatrix()
kerns = [a for i in nkerns]
inputs4=dmatrix4()
kerns4=dmatrix4()
# for each layer
ntot=0
tctot=0
tpytot=0
for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(nkerns))):
if do_print:
print '************* layer %i ***************' % n_layer
print conv_mode, ss, n_layer, kshp, nkern
# actual values
w = global_rng.random_sample(N.r_[nkern,imshp[0],kshp])
w_flip = flip(w,kshp).reshape(w.shape)
outshp = N.hstack((nkern, ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode)))
time1 = time.time()
outval = N.zeros(N.r_[bsize,outshp])
# ConvOp
if unroll_patch and not unroll_patch_size:
conv_op = ConvOp(dx=ss[0],dy=ss[1], output_mode=conv_mode,
unroll_patch=unroll_patch, verbose=verbose)(inputs4, kerns4)
else:
conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode,
unroll_batch=unroll_batch, unroll_kern=unroll_kern, unroll_patch=unroll_patch, verbose=verbose)(inputs4, kerns4)
l1shp=N.hstack((nkern,
ConvOp.getOutputShape(imshp[1:], kshp, ss, conv_mode)))
propup2 = function([inputs4, kerns4], conv_op)
time1 = time.time()
for i in range(repeat):
hidval2_ = propup2(imgval,w_flip)
hidval2 = hidval2_#[:,:,0::ss[0],0::ss[1]]
tctot += time.time() - time1
imshp = tuple(outshp)
imgval = outval.reshape(bsize,outshp[0],outshp[1],outshp[2])
return tctot, tpytot, ntot
def speed_multilayer_conv(): def speed_multilayer_conv():
# calculate the speed up of different combination of unroll # calculate the speed up of different combination of unroll
# put the paramter to the same you will try. # put the paramter to the same you will try.
validate=False# we don't validate the result to have it much faster! validate=False# we don't validate the result to have it much faster!
repeat = 3
verbose=1 verbose=1
unroll_batch = [1,2,3,4,5,10]#15, 30, 60 always much slower unroll_batch = [1,2,3,4,5,6,10]#15, 30, 60 always much slower
unroll_kern = [1,2,3,4,5,10]#15, 30, 60 always much slower unroll_kern = [1,2,3,4,5,6,10]#15, 30, 60 always much slower
#unroll_batch = [1,4,5] #unroll_batch = [1,4,5]
#unroll_kern = [1,4,5] #unroll_kern = [1,4,5]
#unroll_batch = [1,4] #unroll_batch = [1,4]
...@@ -153,8 +210,8 @@ def speed_multilayer_conv(): ...@@ -153,8 +210,8 @@ def speed_multilayer_conv():
bsize = 60 # batch size bsize = 60 # batch size
imshp_start = (1,48,48)#un square shape to test more corner case. imshp_start = (1,48,48)#un square shape to test more corner case.
kshps = ([11,12],[12,11])#un square shape to test more corner case. kshps = ([11,12],)#un square shape to test more corner case.
nkerns = [60,60] # per output pixel nkerns = [60] # per output pixel
ssizes = [(1,1),]#(1,1)]#(2,2) bugged ssizes = [(1,1),]#(1,1)]#(2,2) bugged
convmodes = ['valid','full'] convmodes = ['valid','full']
do_convolve2=False do_convolve2=False
...@@ -168,9 +225,6 @@ def speed_multilayer_conv(): ...@@ -168,9 +225,6 @@ def speed_multilayer_conv():
#calculate the timing with unrolling #calculate the timing with unrolling
print 'time unroll batch kern' print 'time unroll batch kern'
t_=[[ 7.60572791, 3.95069814, 3.74271464], [ 4.05631089, 2.90384555, 2.93613672], [ 3.90551591, 2.92595196, 3.00102282]]
best=[0.52690219879150391, 2.4266397953033447]
worst=[0.92042708396911621, 6.8822150230407715]
best=[] best=[]
worst=[] worst=[]
t_=[] t_=[]
...@@ -181,7 +235,8 @@ def speed_multilayer_conv(): ...@@ -181,7 +235,8 @@ def speed_multilayer_conv():
tctot, tpytot, ntot=[],[],[] tctot, tpytot, ntot=[],[],[]
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))): for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizes,range(len(ssizes))): for ss, n_ss in zip(ssizes,range(len(ssizes))):
tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, validate=validate, verbose=verbose,do_print=False) # tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, validate=validate, verbose=verbose,do_print=False)
tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_b, unroll_kern=unroll_k, verbose=verbose,do_print=False, repeat=repeat)
tctot+=[tctot_] tctot+=[tctot_]
tpytot+=[tpytot_] tpytot+=[tpytot_]
ntot+=[ntot_] ntot+=[ntot_]
...@@ -199,13 +254,13 @@ def speed_multilayer_conv(): ...@@ -199,13 +254,13 @@ def speed_multilayer_conv():
t=N.asarray(t) t=N.asarray(t)
#calculate the old timing #calculate the old timing
print 'time old version' print 'time old version'
tctot_=[0.52555489540100098, 6.6634182929992676]
tctot,tpytot,ntot=[],[],[] tctot,tpytot,ntot=[],[],[]
tctot_=[] tctot_=[]
if not tctot_: if not tctot_:
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))): for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizes,range(len(ssizes))): for ss, n_ss in zip(ssizes,range(len(ssizes))):
tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate, verbose=verbose,do_print=False) # tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate, verbose=verbose,do_print=False)
tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, verbose=verbose,do_print=False, repeat=repeat)
tctot+=[tctot_] tctot+=[tctot_]
tpytot+=[tpytot_] tpytot+=[tpytot_]
ntot+=[ntot_] ntot+=[ntot_]
...@@ -217,7 +272,7 @@ def speed_multilayer_conv(): ...@@ -217,7 +272,7 @@ def speed_multilayer_conv():
print "unroll_batch/unroll_kern valid_mode full_mode" print "unroll_batch/unroll_kern valid_mode full_mode"
for n_b in range(len(unroll_batch)): for n_b in range(len(unroll_batch)):
for n_k in range(len(unroll_kern)): for n_k in range(len(unroll_kern)):
print unroll_batch[n_b],"/",unroll_kern[n_k], " ",t[n_b,n_k] print (unroll_batch[n_b], unroll_kern[n_k]) + tuple(t[n_b,n_k]),','
t_detail=t t_detail=t
t = t.sum(axis=2) t = t.sum(axis=2)
print "max %.3fs"%t.max(), "max param(batch unloop size/kernel unloop size)", t_b_k[t.argmax()] print "max %.3fs"%t.max(), "max param(batch unloop size/kernel unloop size)", t_b_k[t.argmax()]
...@@ -231,9 +286,11 @@ def speed_multilayer_conv(): ...@@ -231,9 +286,11 @@ def speed_multilayer_conv():
tctot_patch_size = [] tctot_patch_size = []
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))): for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizes,range(len(ssizes))): for ss, n_ss in zip(ssizes,range(len(ssizes))):
tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False) #tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False)
tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, unroll_patch=True,verbose=verbose,do_print=False, repeat=repeat)
tctot_patch += [tctot_] tctot_patch += [tctot_]
tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False,unroll_patch_size=True) #tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet_old(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=True,verbose=verbose,do_print=False,unroll_patch_size=True)
tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, unroll_patch=True,verbose=verbose,do_print=False,unroll_patch_size=True, repeat=repeat)
tctot_patch_size += [tctot_] tctot_patch_size += [tctot_]
t_patch=sum(tctot_patch) t_patch=sum(tctot_patch)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论