提交 32bf9b72 authored 作者: Frederic Bastien's avatar Frederic Bastien

added a new unrolled version of ConvOp (not used by default). It unrolls the batch…

added a new unrolled version of ConvOp (not used by default). It unrolls the batch and the kernel at the same time. This gives the biggest speed up in my tests. Also modified the test_multilayer_conf function to allow testing the different parameters for the unroll.
上级 e9adfb12
...@@ -37,7 +37,7 @@ class ConvOp(Op): ...@@ -37,7 +37,7 @@ class ConvOp(Op):
self.unroll_batch=unroll_batch self.unroll_batch=unroll_batch
self.unroll_kern=unroll_kern self.unroll_kern=unroll_kern
assert not(unroll_batch>0 and unroll_kern>0)
if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0: if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:
raise Exception("unroll_batch(%s) should be 0 or a multiple of bsize(%s)"%(str(self.unroll_batch),str(self.bsize))) raise Exception("unroll_batch(%s) should be 0 or a multiple of bsize(%s)"%(str(self.unroll_batch),str(self.bsize)))
if self.unroll_kern>0 and self.nkern % unroll_kern!=0: if self.unroll_kern>0 and self.nkern % unroll_kern!=0:
...@@ -175,10 +175,14 @@ using namespace std; ...@@ -175,10 +175,14 @@ using namespace std;
if node.inputs[0].type.dtype=="float32": d["type"]="float" if node.inputs[0].type.dtype=="float32": d["type"]="float"
elif node.inputs[0].type.dtype=="float64": d["type"]="double" elif node.inputs[0].type.dtype=="float64": d["type"]="double"
else: raise Exception("Type %s not implemented"%node.inputs[0].type.dtype) else: raise Exception("Type %s not implemented"%node.inputs[0].type.dtype)
if self.unroll_kern>0: if self.unroll_kern>0 and self.unroll_batch>0:
print "return unrolled batch and kern code by",self.unroll_batch, self.unroll_kern
return gen_conv_code_unroll_batch_kern(d, self.unroll_batch,
self.unroll_kern)
elif self.unroll_kern>0:
print "return unrolled kern code by",self.unroll_kern print "return unrolled kern code by",self.unroll_kern
return gen_conv_code_unroll_kern(d, self.unroll_kern) return gen_conv_code_unroll_kern(d, self.unroll_kern)
if self.unroll_batch>0: elif self.unroll_batch>0:
print "return unrolled batch code by",self.unroll_batch print "return unrolled batch code by",self.unroll_batch
return gen_conv_code_unroll_batch(d, self.unroll_batch) return gen_conv_code_unroll_batch(d, self.unroll_batch)
...@@ -1105,3 +1109,251 @@ Py_XDECREF(img2d); ...@@ -1105,3 +1109,251 @@ Py_XDECREF(img2d);
Py_XDECREF(filtersflipped); Py_XDECREF(filtersflipped);
"""%d """%d
return ret return ret
def gen_conv_code_unroll_batch_kern(d, unloop_bsize=1, unloop_ksize=1):
    """Generate C code for ConvOp with the batch loop and the kernel loop
    unrolled at the same time.

    :param d: dict of template substitution values used to fill the C code
        skeleton (e.g. ``type``, ``img2d``, ``filtersflipped``, ``z``,
        ``fail``, the ``self_*`` shape/size entries and ``affectation``).
        NOTE: it is mutated in place — the ``unloop_*`` keys are written
        into it during generation.
    :param unloop_bsize: number of batch iterations unrolled per loop step.
        The caller must ensure the batch size is a multiple of this.
    :param unloop_ksize: number of kernel iterations unrolled per loop step.
        The caller must ensure the number of kernels is a multiple of this.
    :return: the generated C code as one string.
    """
    d["unloop_bsize"] = unloop_bsize
    d["unloop_ksize"] = unloop_ksize

    def my_dup(st, size):
        # Emit `st` `size` times, with %(unloop_iter)s running 0..size-1.
        s = ""
        for i in range(size):
            d["unloop_iter"] = i
            s += st % d
        return s + "\n"

    def my_dup2(st):
        # Emit `st` once per (batch, kernel) unroll pair, exposing
        # %(unloop_biter)s, %(unloop_kiter)s and a flat %(unloop_iter)s index.
        s = ""
        count = 0  # renamed from `iter` to avoid shadowing the builtin
        for i in range(unloop_bsize):
            d["unloop_biter"] = i
            for j in range(unloop_ksize):
                d["unloop_kiter"] = j
                d["unloop_iter"] = count
                count += 1
                s += st % d
        return s + "\n"

    # Fixed preamble: reshape both inputs to 4d, force them contiguous if
    # needed, validate types/mode, and (re)allocate the output array.
    ret = """
int mode=-1,typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
npy_intp dim_ker[2]={%(self_kshp0)s,%(self_kshp1)s};
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
string s="%(self_out_mode)s";
if(%(img2d)s->nd==2){
img2d_dim[3]=%(img2d)s->dimensions[1];
img2d_dim[2]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==3){
img2d_dim[3]=%(img2d)s->dimensions[2];
img2d_dim[2]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==4){
img2d_dim[3]=%(img2d)s->dimensions[3];
img2d_dim[2]=%(img2d)s->dimensions[2];
img2d_dim[1]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
%(fail)s;
}
if(%(filtersflipped)s->nd==3){
kerns_dim[3]=%(filtersflipped)s->dimensions[2];
kerns_dim[2]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else if(%(filtersflipped)s->nd==4){
kerns_dim[3]=%(filtersflipped)s->dimensions[3];
kerns_dim[2]=%(filtersflipped)s->dimensions[2];
kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
PyErr_SetString(PyExc_ValueError, "kernel don't have a good shape");
%(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
if (!PyArray_ISCONTIGUOUS(img2d)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != sizeof(%(type)s))
|| (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped);
filtersflipped = contig;
if (!PyArray_ISCONTIGUOUS(filtersflipped)){
PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
%(fail)s;
}
}
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if(s=="valid") mode=0;
else if(s=="full") mode=2;
else {PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;};
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(%(z)s->dimensions[0] != %(self_bsize)s)
||(%(z)s->dimensions[1] != %(self_nkern)s)
||(%(z)s->dimensions[2] != dim_zz[0])
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
if (%(z)s) Py_DECREF(%(z)s);
npy_intp dims[4] = {0,0,0,0};
if(!dims) %(fail)s;
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
//PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
int Os[2];
if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;}
else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;}
for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unloop_ksize)s){
//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
"""%d
    # One output pointer per unrolled (batch, kernel) pair, zero-initialized.
    ret += my_dup2("%(type)s * __restrict__ out%(unloop_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unloop_biter)s,n_kern+%(unloop_kiter)s));")
    ret += my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unloop_iter)s[i] = 0;", unloop_bsize*unloop_ksize)
    ret += """
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
"""%d
    # Per-batch image pointers and per-kernel filter pointers for this stack.
    ret += my_dup("const %(type)s * __restrict__ in%(unloop_iter)d=(%(type)s *)(PyArray_GETPTR2(img2d,b+%(unloop_iter)s,stack_size));", unloop_bsize)
    ret += my_dup("const %(type)s * __restrict__ hvals%(unloop_iter)s=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern+%(unloop_iter)s,stack_size));", unloop_ksize)
    ret += """
int new_m;
for (int m=0; m < Os[0]; m++) {
// Reposition index into input image based on requested output size
if (mode == FULL) new_m = m ;
else new_m = (m+dim_ker[0]-1);
for (int n=0; n < Os[1]; n++) { // loop over columns
"""%d
    # One accumulator per unrolled (batch, kernel) pair.
    ret += my_dup("%(type)s sum%(unloop_iter)s=0;", unloop_bsize*unloop_ksize)
    ret += """
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker[0]; j++) {
int ind0 = (new_m-j);
if(mode==FULL){
"""%d
    ret += my_dup("const %(type)s * idx_hvals%(unloop_iter)s=&hvals%(unloop_iter)s[j*dim_ker[1]];", unloop_ksize)
    ret += """
if(ind0 < 0 || ind0 >= dim_im[0]){
if(fill_value!=0)
for (int k=0; k < dim_ker[1]; k++) {
"""%d
    ret += my_dup2("sum%(unloop_iter)s += idx_hvals%(unloop_kiter)s[k] * fill_value;")
    ret += """
}
}else{
//do the part where kernel is to the right of the img
int k=0,max_k=max((int)(n-dim_im[1])+1,0);
if(fill_value!=0){
for(k=0;k<max_k;k++){
"""%d
    ret += my_dup2("sum%(unloop_iter)s += idx_hvals%(unloop_kiter)s[k] * fill_value;")
    ret += """
}
}else {k=max_k;}
//do the part where the kernel is on the img
max_k=min(n+1,(int)dim_ker[1]);
"""%d
    ret += my_dup("const %(type)s * idx_in%(unloop_iter)s=&in%(unloop_iter)s[ind0*dim_im[1]];", unloop_bsize)
    ret += """
for (int ind1=n-k; k<max_k; k++,ind1--) {
"""%d
    ret += my_dup2("sum%(unloop_iter)s+= idx_hvals%(unloop_kiter)s[k] * idx_in%(unloop_biter)s[ind1];")
    ret += """
}
//do the part to the left of the img
if(fill_value!=0)
for(;k<dim_ker[1];k++){
"""%d
    ret += my_dup2("sum%(unloop_iter)s += idx_hvals%(unloop_kiter)s[k] * fill_value;")
    ret += """
}
}
}else{
"""%d
    ret += my_dup("const %(type)s* idx_in%(unloop_iter)s=&in%(unloop_iter)s[ind0*dim_im[1]];", unloop_bsize)
    ret += my_dup("const %(type)s* idx_hvals%(unloop_iter)s=&hvals%(unloop_iter)s[j*dim_ker[1]];", unloop_ksize)
    ret += """
int new_n = (n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
"""%d
    ret += my_dup2("sum%(unloop_iter)s+=idx_hvals%(unloop_kiter)s[k]*idx_in%(unloop_biter)s[last];")
    ret += """
}
}
}//for j
"""%d
    # Write back every accumulator (set or accumulate, per %(affectation)s).
    ret += my_dup("out%(unloop_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unloop_iter)s;", unloop_bsize*unloop_ksize)
    ret += """
}//for n
}//for m
}//for stack_size
}//for n_kern
}//for b
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
    return ret
...@@ -207,13 +207,13 @@ class TestConvOp(unittest.TestCase): ...@@ -207,13 +207,13 @@ class TestConvOp(unittest.TestCase):
ssizes = [(1,1),(2,2)]#2,2)] ssizes = [(1,1),(2,2)]#2,2)]
#test speed #test speed
bsize = 10 # batch size # bsize = 10 # batch size
imshp_start = (1,50,49) # imshp_start = (1,50,49)#un square shape to test more corner case.
kshps = ([11,12],[12,11]) # kshps = ([11,12],[12,11])#un square shape to test more corner case.
nkerns = [20,20] # per output pixel # nkerns = [20,20] # per output pixel
ssizes = [(1,1),]#(1,1)]#(2,2) bugged # ssizes = [(1,1),]#(1,1)]#(2,2) bugged
convmodes = ['valid','full'] # convmodes = ['valid','full']
do_theano=False # do_theano=False
N.set_printoptions(threshold=N.nan) N.set_printoptions(threshold=N.nan)
...@@ -221,23 +221,25 @@ class TestConvOp(unittest.TestCase): ...@@ -221,23 +221,25 @@ class TestConvOp(unittest.TestCase):
kerns = [T.matrix(),T.dmatrix()] kerns = [T.matrix(),T.dmatrix()]
img = T.dmatrix() img = T.dmatrix()
rng = N.random.RandomState(3423489) rng = N.random.RandomState(3423489)
tctot, tpytot, t2ctot, t2pytot, ntot, convtot = [], [], [], [], [], [] tctot, tpytot, ntot = [], [], []
dmatrix4=T.TensorType('float64', (False, False, False, False)) dmatrix4=T.TensorType('float64', (False, False, False, False))
inputs4=dmatrix4() inputs4=dmatrix4()
kerns4=dmatrix4() kerns4=dmatrix4()
assert len(kshps)==len(nkerns)==len(kerns) assert len(kshps)==len(nkerns)==len(kerns)
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))): def do_test(conv_mode, ss, unroll_batch=0, unroll_kern=0, img=img):
for ss, n_ss in zip(ssizes,range(len(ssizes))):
# build actual input images # build actual input images
imgval = rng.rand(bsize, imshp_start[0], imshp_start[1], imshp_start[2]) imgval = rng.rand(bsize, imshp_start[0], imshp_start[1], imshp_start[2])
imshp=imshp_start imshp=imshp_start
# for each layer # for each layer
for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(kerns))): ntot=0
tctot=0
tpytot=0
for kshp, kern, nkern, n_layer in zip(kshps, kerns, nkerns, range(len(kerns))):
print '************* layer %i ***************' % n_layer print '************* layer %i ***************' % n_layer
print conv_mode, ss, n_layer, kshp, nkern print conv_mode, ss, n_layer, kshp, nkern
...@@ -266,7 +268,7 @@ class TestConvOp(unittest.TestCase): ...@@ -266,7 +268,7 @@ class TestConvOp(unittest.TestCase):
for i in range(imshp[0]): # loop over input feature maps for i in range(imshp[0]): # loop over input feature maps
outval[b,n,...] += _convolve2d(\ outval[b,n,...] += _convolve2d(\
imgval[b,i,...], w_flip[n,i,...],1,val, bval, 0)[0::ss[0],0::ss[1]] imgval[b,i,...], w_flip[n,i,...],1,val, bval, 0)[0::ss[0],0::ss[1]]
ntot += [time.time() - time1] ntot += time.time() - time1
if do_theano: if do_theano:
####### test with new sp.convolve2 function ###### ####### test with new sp.convolve2 function ######
...@@ -290,14 +292,11 @@ class TestConvOp(unittest.TestCase): ...@@ -290,14 +292,11 @@ class TestConvOp(unittest.TestCase):
else: else:
hid = img #we don't need it, but it make the flow easier flow hid = img #we don't need it, but it make the flow easier flow
convtot+=[-1]
tctot+=[-1]
tpytot+=[-1]
hidval=outval.copy()#to keep the same memory hidval=outval.copy()#to keep the same memory
hidval1=outval.copy() hidval1=outval.copy()
# ConvOp # ConvOp
conv_op = ConvOp(imshp, kshp, nkern, bsize, 1,1, conv_mode, unroll_kern=10)(inputs4, kerns4) conv_op = ConvOp(imshp, kshp, nkern, bsize, 1,1, conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern)(inputs4, kerns4)
l1shp=N.hstack((nkern, l1shp=N.hstack((nkern,
getFilterOutShp(imshp, kshp, ss, conv_mode))) getFilterOutShp(imshp, kshp, ss, conv_mode)))
propup2 = function([inputs4, kerns4], conv_op) propup2 = function([inputs4, kerns4], conv_op)
...@@ -306,12 +305,12 @@ class TestConvOp(unittest.TestCase): ...@@ -306,12 +305,12 @@ class TestConvOp(unittest.TestCase):
time1 = time.time() time1 = time.time()
hidval2_ = propup2(imgval,w_flip) hidval2_ = propup2(imgval,w_flip)
hidval2 = hidval2_[:,:,0::ss[0],0::ss[1]] hidval2 = hidval2_[:,:,0::ss[0],0::ss[1]]
t2ctot += [time.time() - time1] tctot += time.time() - time1
time1 = time.time() time1 = time.time()
# hidval3_ = propup3(imgval,w_flip) # hidval3_ = propup3(imgval,w_flip)
# hidval3 = hidval3_[:,:,0::ss[0],0::ss[1]] # hidval3 = hidval3_[:,:,0::ss[0],0::ss[1]]
t2pytot += [time.time() - time1] tpytot += time.time() - time1
# assert (N.abs(hidval2-hidval3)<1e-5).all() # assert (N.abs(hidval2-hidval3)<1e-5).all()
temp = N.abs(outval - hidval2) temp = N.abs(outval - hidval2)
...@@ -322,14 +321,47 @@ class TestConvOp(unittest.TestCase): ...@@ -322,14 +321,47 @@ class TestConvOp(unittest.TestCase):
img, imshp = hid, tuple(outshp) img, imshp = hid, tuple(outshp)
imgval = outval.reshape(bsize,outshp[0],outshp[1],outshp[2]) imgval = outval.reshape(bsize,outshp[0],outshp[1],outshp[2])
return tctot, tpytot, ntot
if False:
unroll_batch = [0,1,2,5,10]
unroll_kern = [0,1,2,5,10,20]
# calculate the speed up of different combination of unroll
for unroll_b in unroll_batch:
for unroll_k in unroll_kern:
tctot, tpytot, ntot=[],[],[]
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizes,range(len(ssizes))):
tctot_, tpytot_, ntot_ = do_test(conv_mode, ss,unroll_batch=unroll_b, unroll_kern=unroll_k)
tctot+=[tctot_]
tpytot+=[tpytot_]
ntot+=[ntot_]
print '**** Multilayer Convolution Profiling Results ****'
print 'unroll batch', unroll_b, 'unroll kern',unroll_k
print 'Numpy convolve2d processing time: %.3fs'%sum(ntot),ntot
print 'c Theano(ConvOp) processing time: %.3fs'%sum(tctot),tctot
print 'py Theano(ConvOp) processing time: %.3fs'%sum(tpytot),tpytot
d=N.asarray(ntot)/tctot
print 'speed up c theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d
return
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizes,range(len(ssizes))):
tctot_, tpytot_, ntot_ = do_test(conv_mode, ss)
tctot+=[tctot_]
tpytot+=[tpytot_]
ntot+=[ntot_]
print '**** Multilayer Convolution Profiling Results ****' print '**** Multilayer Convolution Profiling Results ****'
print 'Numpy convolve2d processing time: %.3fs'%sum(ntot),ntot print 'Numpy convolve2d processing time: %.3fs'%sum(ntot),ntot
print 'c Theano(ConvOp) processing time: %.3fs'%sum(t2ctot),t2ctot print 'c Theano(ConvOp) processing time: %.3fs'%sum(tctot),tctot
print 'py Theano(ConvOp) processing time: %.3fs'%sum(t2pytot),t2pytot print 'py Theano(ConvOp) processing time: %.3fs'%sum(tpytot),tpytot
print 'convolve processing time: %.3fs'%sum(convtot),convtot d=N.asarray(ntot)/tctot
d=N.asarray(ntot)/t2ctot
print 'speed up c theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d print 'speed up c theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d
d=N.asarray(ntot)/t2pytot d=N.asarray(ntot)/tpytot
print 'speed up py theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d print 'speed up py theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论