提交 dc1aa62a authored 作者: Frederic Bastien's avatar Frederic Bastien

[mq]: unroll_patch

上级 3d8a5430
...@@ -29,9 +29,10 @@ class ConvOp(Op): ...@@ -29,9 +29,10 @@ class ConvOp(Op):
#TODO: make the stacksize its own parameter, and make imshp a pair #TODO: make the stacksize its own parameter, and make imshp a pair
def __init__(self, imshp, kshp, nkern, bsize, dx, dy, output_mode='valid', def __init__(self, imshp=None, kshp=None, nkern=None, bsize=None, dx=None, dy=None, output_mode='valid',
unroll_batch=4, unroll_batch=0,
unroll_kern=4, unroll_kern=0,
unroll_patch=False,
imshp_logical=None, imshp_logical=None,
kshp_logical=None, kshp_logical=None,
kshp_logical_top_aligned=True, kshp_logical_top_aligned=True,
...@@ -47,6 +48,7 @@ class ConvOp(Op): ...@@ -47,6 +48,7 @@ class ConvOp(Op):
dx - patch stride rows dx - patch stride rows
dy - patch stride cols dy - patch stride cols
out_mode - 'valid', 'full' out_mode - 'valid', 'full'
unroll_patch - c code generation option
unroll_batch - c code generation option unroll_batch - c code generation option
unroll_kern - c code generation option unroll_kern - c code generation option
verbose - passed to GpuConv verbose - passed to GpuConv
...@@ -60,6 +62,7 @@ class ConvOp(Op): ...@@ -60,6 +62,7 @@ class ConvOp(Op):
gradient on the filters. gradient on the filters.
unroll_patch. If True will use a version that is faster then without not unroll by unroll the patch loop.
unroll_batch. If >0 will use a version that will unroll the batch loop by the value of the option. By default don't use this version of the code. unroll_batch. If >0 will use a version that will unroll the batch loop by the value of the option. By default don't use this version of the code.
unroll_nkern. idem as unroll_batch but unroll the kernel loop. unroll_nkern. idem as unroll_batch but unroll the kernel loop.
...@@ -95,6 +98,7 @@ class ConvOp(Op): ...@@ -95,6 +98,7 @@ class ConvOp(Op):
self.unroll_batch=unroll_batch self.unroll_batch=unroll_batch
self.unroll_kern=unroll_kern self.unroll_kern=unroll_kern
self.unroll_patch=unroll_patch
if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0: if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:
if self.bsize<=self.unroll_batch: if self.bsize<=self.unroll_batch:
...@@ -407,6 +411,7 @@ using namespace std; ...@@ -407,6 +411,7 @@ using namespace std;
d["self_imshp0"]=self.imshp[0] d["self_imshp0"]=self.imshp[0]
d["self_imshp1"]=self.imshp[1] d["self_imshp1"]=self.imshp[1]
d["self_imshp2"]=self.imshp[2] d["self_imshp2"]=self.imshp[2]
d["mode"]=self.out_mode.upper()
d["self_kshp0"]=self.kshp[0] d["self_kshp0"]=self.kshp[0]
d["self_kshp1"]=self.kshp[1] d["self_kshp1"]=self.kshp[1]
d["self_kshp_logical_r"] = self.kshp_logical[0] d["self_kshp_logical_r"] = self.kshp_logical[0]
...@@ -439,8 +444,12 @@ using namespace std; ...@@ -439,8 +444,12 @@ using namespace std;
#print self.out_mode, d["self_imshp_logical_stride_r"] #print self.out_mode, d["self_imshp_logical_stride_r"]
if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical: if self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
# print "return imshp!=imshp_logical or self.kshp != self.kshp_logical shape version"
return _conv_op_code_a % d return _conv_op_code_a % d
if self.unroll_patch:
# print "return unroll patch version",self.dx,self.dy
return _conv_op_code_unroll_patch%d
if self.unroll_batch>0 or self.unroll_kern>0: if self.unroll_batch>0 or self.unroll_kern>0:
if self.unroll_batch<=0: self.unroll_batch=1 if self.unroll_batch<=0: self.unroll_batch=1
if self.unroll_kern<=0: self.unroll_kern=1 if self.unroll_kern<=0: self.unroll_kern=1
...@@ -1212,3 +1221,295 @@ Py_XDECREF(img2d); ...@@ -1212,3 +1221,295 @@ Py_XDECREF(img2d);
Py_XDECREF(filtersflipped); Py_XDECREF(filtersflipped);
""" """
return ret return ret
_conv_op_code_unroll_patch = """
const int mode=%(mode)s;
int typenum=0, typenum_f=0;
PyArrayObject *ain1=NULL, *ain2=NULL, *filtersflipped_arr=NULL, *img2d_arr=NULL;
const %(type)s fill_value = 0;
int type_im=PyArray_TYPE(%(img2d)s);
int type_ker=PyArray_TYPE(%(filtersflipped)s);
npy_intp dim_zz[2]={%(self_outshp0)s,%(self_outshp1)s};
npy_intp dim_im[2]={%(self_imshp1)s,%(self_imshp2)s};
npy_intp dim_ker[2]={%(self_kshp0)s,%(self_kshp1)s};
PyArray_Dims img2d_shape;
npy_intp img2d_dim[4]={1,1,0,0};
img2d_shape.ptr=img2d_dim;
img2d_shape.len=4;
PyArray_Dims kerns_shape;
npy_intp kerns_dim[4]={1,1,0,0};
kerns_shape.ptr=kerns_dim;
kerns_shape.len=4;
PyObject *img2d=NULL, *contig, *filtersflipped=NULL;
if(%(img2d)s->nd==2){
img2d_dim[3]=%(img2d)s->dimensions[1];
img2d_dim[2]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==3){
img2d_dim[3]=%(img2d)s->dimensions[2];
img2d_dim[2]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else if(%(img2d)s->nd==4){
img2d_dim[3]=%(img2d)s->dimensions[3];
img2d_dim[2]=%(img2d)s->dimensions[2];
img2d_dim[1]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
%(fail)s;
}
if(%(filtersflipped)s->nd==3){
kerns_dim[3]=%(filtersflipped)s->dimensions[2];
kerns_dim[2]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else if(%(filtersflipped)s->nd==4){
kerns_dim[3]=%(filtersflipped)s->dimensions[3];
kerns_dim[2]=%(filtersflipped)s->dimensions[2];
kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
std:stringstream temp;
temp << "nddim="<<%(filtersflipped)s->nd;
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
("kernel don't have a good shape. " + param).c_str());
%(fail)s;
}
img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
img2d_arr = (PyArrayObject*)img2d;
if ((img2d_arr->strides[3] != sizeof(%(type)s))
|| (img2d_arr->strides[2] != img2d_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
Py_DECREF(img2d);
img2d = contig;
if (!PyArray_ISCONTIGUOUS(img2d)){
PyErr_SetString(PyExc_ValueError, "img2d isn't contiguous");
%(fail)s;
}
}
img2d_arr = (PyArrayObject*)img2d;
filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if ((filtersflipped_arr->strides[3] != sizeof(%(type)s))
|| (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*sizeof(%(type)s))){
contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
Py_DECREF(filtersflipped);
filtersflipped = contig;
if (!PyArray_ISCONTIGUOUS(filtersflipped)){
PyErr_SetString(PyExc_ValueError, "filtersflipped isn't contiguous");
%(fail)s;
}
}
filtersflipped_arr = (PyArrayObject*)filtersflipped;
if(mode != VALID && mode != FULL){
PyErr_SetString(PyExc_ValueError, "invalid mode, only full and valid are supported"); %(fail)s;
}
typenum = PyArray_ObjectType((PyObject*)%(img2d)s, 0);
typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(%(z)s->dimensions[0] != %(self_bsize)s)
||(%(z)s->dimensions[1] != %(self_nkern)s)
||(%(z)s->dimensions[2] != dim_zz[0])
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
if (%(z)s) Py_DECREF(%(z)s);
npy_intp dims[4] = {0,0,0,0};
if(!dims) %(fail)s;
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
dims[3]=dim_zz[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(4, dims, typenum,0);
}else{
//PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
int Os[2];
Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
//I keep the formula to calculte Os in case we need it in the futur.
//if (mode == FULL) {Os[0] = (int)ceil((dim_im[0]+dim_ker[0]-1)/float(%(self_dx)s)); Os[1] = ceil((dim_im[1]+dim_ker[1]-1)/float(%(self_dy)s));}
//else {Os[0] = (int)ceil((dim_im[0]-dim_ker[0]+1)/float(%(self_dx)s)); Os[1] = (int)ceil((dim_im[1]-dim_ker[1]+1)/float(%(self_dy)s));}
for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
%(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
const %(type)s * __restrict__ in=(%(type)s *)(PyArray_GETPTR2(img2d,b,stack_size));
const %(type)s * __restrict__ hvals=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern,stack_size));
int new_m;
for (int iter_m=0; iter_m < Os[0]; iter_m++) {
// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0;
%(type)s sum2=0;
%(type)s sum3=0;
%(type)s sum4=0;
int nb_sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker[0]; j++) {
int ind0 = (new_m-j);
if(mode==FULL){
const %(type)s * idx_hvals=&hvals[j*dim_ker[1]];
if(ind0 < 0 || ind0 >= dim_im[0]){
if(fill_value!=0)
for (int k=0; k < dim_ker[1]; k++) {
sum+= idx_hvals[k] * fill_value;
}
}else{
//do the part where kernel is to the right of the img
//TODO: implement unroll patch for fill_value!=0
int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
if(fill_value!=0){
for(k=0;k<max_k;k++){
sum+= idx_hvals[k]*fill_value;
}
}else {k=max_k;}
//do the part where the kernel is on the img
max_k=min(pos_n+1,(int)dim_ker[1]);
const %(type)s * idx_in=&in[ind0*dim_im[1]];
if(iter_n + 4*%(self_dy)s < Os[1]
&& iter_n>dim_ker[1]-1+3
&& iter_n<dim_im[1]-dim_ker[1]+1-3){
nb_sum=4;
//cout<<4<<endl;
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
sum2+=idx_hvals[k]*idx_in[ind1+%(self_dy)s];
sum3+=idx_hvals[k]*idx_in[ind1+2*%(self_dy)s];
sum4+=idx_hvals[k]*idx_in[ind1+3*%(self_dy)s];
}
}else if(iter_n + 2*%(self_dy)s < Os[1]
&& iter_n>dim_ker[1]-1
&& iter_n<dim_im[1]-dim_ker[1]+1){
//cout<<2<<endl;
nb_sum=2;
// if(iter_n==dim_ker[1]-1){//k-1<min(pos_n+%(self_dy)s,(int)dim_ker[1])){
// sum2+=idx_hvals[k-1]*idx_in[pos_n-k-%(self_dy)s];
// }
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
sum2+=idx_hvals[k]*idx_in[ind1+%(self_dy)s];
}
// sum2+=idx_hvals[k]*idx_in[pos_n-k+%(self_dy)s];
// sum+=idx_hvals[k]*idx_in[pos_n-k];
// k++;
}else{
//cout<<1<<endl;
nb_sum=1;
/*
%(type)s sum_=0;
if((k-max_k) & 0x1 != 0){
sum+= idx_hvals[k] * idx_in[pos_n-k];
}
for (int ind1=pos_n-k; k<max_k; k+=2,ind1-=2) {
sum+= idx_hvals[k] * idx_in[ind1];
sum_+= idx_hvals[k+1] * idx_in[ind1-1];
}
sum+=sum_;
*/
for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+=idx_hvals[k]*idx_in[ind1];
}
}
//do the part to the left of the img
if(fill_value!=0)
for(;k<dim_ker[1];k++) sum+= idx_hvals[k]*fill_value;
}
}else{//valid mode
const %(type)s* idx_in=&in[ind0*dim_im[1]];
const %(type)s* idx_hvals=&hvals[j*dim_ker[1]];
if(iter_n + 4*%(self_dy)s < Os[1]){
nb_sum=4;
for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s];
sum3+=idx_hvals[k]*idx_in[im_idx+2*%(self_dy)s];
sum4+=idx_hvals[k]*idx_in[im_idx+3*%(self_dy)s];
}
}else if(iter_n + 2*%(self_dy)s < Os[1]){
nb_sum=2;
for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
sum2+=idx_hvals[k]*idx_in[im_idx+%(self_dy)s];
}
}else{
nb_sum=1;
for (int k=dim_ker[1]-1,im_idx=pos_n; k >=0; k--,im_idx++) {
sum+=idx_hvals[k]*idx_in[im_idx];
}
}
}//else valid mode
}//for j
switch(nb_sum){
case 4: out[iter_m*dim_zz[1]+iter_n+3] %(affectation)s sum4;
case 3: out[iter_m*dim_zz[1]+iter_n+2] %(affectation)s sum3;
case 2: out[iter_m*dim_zz[1]+iter_n+1] %(affectation)s sum2;
case 1: out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}
iter_n+=nb_sum-1;
/*
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
if(nb_sum>=2){
iter_n++;
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum2;
}
if(nb_sum>=3){
iter_n++;
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum3;
}
if(nb_sum>=4){
iter_n++;
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum4;
}
*/
}//for iter_n
}//for iter_m
}//for stack_size
if (0 && (mode==FULL)){
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i)
std::cout << " " << out[i];
std::cout << "\\n";
}
}//for n_kern
}//for b
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
...@@ -41,7 +41,7 @@ def flip(kern, kshp): ...@@ -41,7 +41,7 @@ def flip(kern, kshp):
global_rng = N.random.RandomState(3423489) global_rng = N.random.RandomState(3423489)
dmatrix4=T.TensorType('float64', (False, False, False, False)) dmatrix4=T.TensorType('float64', (False, False, False, False))
def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll_batch=0, unroll_kern=0, img=T.dmatrix(), validate=True, conv_op_py=False, do_convolve2=False, do_print=True, repeat=1): def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll_batch=0, unroll_kern=0, img=T.dmatrix(), validate=True, conv_op_py=False, do_convolve2=False, do_print=True, repeat=1, unroll_patch=0):
# build actual input images # build actual input images
imgval = global_rng.rand(bsize, imshp[0], imshp[1], imshp[2]) imgval = global_rng.rand(bsize, imshp[0], imshp[1], imshp[2])
...@@ -121,7 +121,7 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll ...@@ -121,7 +121,7 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll
hidval1=outval.copy() hidval1=outval.copy()
# ConvOp # ConvOp
conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern)(inputs4, kerns4) conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern, unroll_patch=unroll_patch)(inputs4, kerns4)
l1shp=N.hstack((nkern, l1shp=N.hstack((nkern,
getFilterOutShp(imshp, kshp, ss, conv_mode))) getFilterOutShp(imshp, kshp, ss, conv_mode)))
propup2 = function([inputs4, kerns4], conv_op) propup2 = function([inputs4, kerns4], conv_op)
...@@ -328,7 +328,7 @@ class TestConvOp(unittest.TestCase): ...@@ -328,7 +328,7 @@ class TestConvOp(unittest.TestCase):
ssizess = [[(1,1),(1,2)],[(1,1),(2,2)]] ssizess = [[(1,1),(1,2)],[(1,1),(2,2)]]
convmodes = ['valid','full'] convmodes = ['valid','full']
do_convolve2=True do_convolve2=True
unroll = [(0,0),(1,1),(2,2),(3,2)]#(batch,kern) unroll = [(0,0,False),(0,0,True),(1,1,False),(2,2,False),(3,2,False)]#(batch,kern,patch)
do_speed_test = False do_speed_test = False
# TODO: this version show a bug that was fixed # TODO: this version show a bug that was fixed
...@@ -338,6 +338,11 @@ class TestConvOp(unittest.TestCase): ...@@ -338,6 +338,11 @@ class TestConvOp(unittest.TestCase):
# nkerns = [2,2] # per output pixel # nkerns = [2,2] # per output pixel
# ssizes = [(1,1),(2,2)]#2,2)] # ssizes = [(1,1),(2,2)]#2,2)]
# bsizes = [1,1] # batch size
# imshp_starts = [(1,10,10),(1,5,6)]
# kshpss = ([[2,3],[3,2]],[[2,2],[2,2]])
# nkernss = [[1,1],[1,1]] # per output pixel
N.set_printoptions(threshold=N.nan) N.set_printoptions(threshold=N.nan)
# symbolic stuff # symbolic stuff
...@@ -356,8 +361,8 @@ class TestConvOp(unittest.TestCase): ...@@ -356,8 +361,8 @@ class TestConvOp(unittest.TestCase):
unroll_batch = [1,2,4,5,10,20] unroll_batch = [1,2,4,5,10,20]
unroll_kern = [1,2,4,5,10,20] unroll_kern = [1,2,4,5,10,20]
unroll_batch = [1,2,5] unroll_batch = [1,4,5]
unroll_kern = [1,2,5] unroll_kern = [1,4,5]
bsize = 20 # batch size bsize = 20 # batch size
imshp_start = (1,48,48)#un square shape to test more corner case. imshp_start = (1,48,48)#un square shape to test more corner case.
...@@ -374,9 +379,17 @@ class TestConvOp(unittest.TestCase): ...@@ -374,9 +379,17 @@ class TestConvOp(unittest.TestCase):
timing = N.zeros((len(unroll_batch),len(unroll_kern),3)) timing = N.zeros((len(unroll_batch),len(unroll_kern),3))
t_b_k=[] t_b_k=[]
#calculate the timing with unrolling #calculate the timing with unrolling
t_=[[ 7.60572791, 3.95069814, 3.74271464], [ 4.05631089, 2.90384555, 2.93613672], [ 3.90551591, 2.92595196, 3.00102282]]
best=[]
worst=[]
best=[0.52690219879150391, 2.4266397953033447]
worst=[0.92042708396911621, 6.8822150230407715]
t_=[]
for unroll_b, n_b in zip(unroll_batch,range(len(unroll_batch))): for unroll_b, n_b in zip(unroll_batch,range(len(unroll_batch))):
for unroll_k, n_k in zip(unroll_kern,range(len(unroll_kern))): for unroll_k, n_k in zip(unroll_kern,range(len(unroll_kern))):
t_b_k.append(str(unroll_b)+"/"+str(unroll_k)) t_b_k.append(str(unroll_b)+"/"+str(unroll_k))
if not t_:
tctot, tpytot, ntot=[],[],[] tctot, tpytot, ntot=[],[],[]
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))): for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizes,range(len(ssizes))): for ss, n_ss in zip(ssizes,range(len(ssizes))):
...@@ -384,36 +397,68 @@ class TestConvOp(unittest.TestCase): ...@@ -384,36 +397,68 @@ class TestConvOp(unittest.TestCase):
tctot+=[tctot_] tctot+=[tctot_]
tpytot+=[tpytot_] tpytot+=[tpytot_]
ntot+=[ntot_] ntot+=[ntot_]
if unroll_b==4 and unroll_k==4:
print "unroll 4/4",tctot
best=tctot
if unroll_b==1 and unroll_k==1:
print "unroll 1/1",tctot
worst=tctot
timing[n_b,n_k]=[sum(tctot), sum(tpytot), sum(ntot)] timing[n_b,n_k]=[sum(tctot), sum(tpytot), sum(ntot)]
if not t_:
t=timing[:,:,0]#We select only the c timing.
else:
t=t_
t=N.asarray(t)
#calculate the old timing #calculate the old timing
tctot,tpytot,ntot=0,0,0 tctot_=[0.52555489540100098, 6.6634182929992676]
# tctot_=[]
tctot,tpytot,ntot=[],[],[]
if not tctot_:
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))): for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizes,range(len(ssizes))): for ss, n_ss in zip(ssizes,range(len(ssizes))):
tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate) tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate)
tctot+=tctot_ tctot+=[tctot_]
tpytot+=tpytot_ tpytot+=[tpytot_]
ntot+=ntot_ ntot+=[ntot_]
print "old code timing %.3fs"%tctot else: tctot=N.asarray(tctot_)
print "old code timing %.3fs"%sum(tctot),tctot
# print timing best=N.asarray(best)
t=timing[:,:,0]#We select only the c timing. worst=N.asarray(worst)
print "timing for unrolled version" print "timing for unrolled version"
print t_b_k print t_b_k
print t print t
print "max %.3fs"%t.max(), "max param(batch unloop size/kernel unloop size)", t_b_k[t.argmax()] print "max %.3fs"%t.max(), "max param(batch unloop size/kernel unloop size)", t_b_k[t.argmax()]
print "min %.3fs"%t.min(), "min param(batch unloop size/kernel unloop size)", t_b_k[t.argmin()] print "min %.3fs"%t.min(), "min param(batch unloop size/kernel unloop size)", t_b_k[t.argmin()]
print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t.min(),tctot/t.min()) print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t.min(),sum(tctot)/t.min())
print worst/best,tctot/best
tctot_patch = []
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizes,range(len(ssizes))):
tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp_start, kshps, nkerns, unroll_batch=0, unroll_kern=0, validate=validate,unroll_patch=2)
tctot_patch += [tctot_]
t_patch=sum(tctot_patch)
print "unroll_patch time", tctot_patch
print "speedup vs (1/1)%.3fx, vs old %.3fx"% (t.max()/t_patch,sum(tctot)/t_patch)
print best/tctot_patch, worst/tctot_patch
print best
print worst
print tctot
print tctot_patch
return return
for i in range(len(kshpss)): for i in range(len(kshpss)):
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))): for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizess[i],range(len(ssizess[i]))): for ss, n_ss in zip(ssizess[i],range(len(ssizess[i]))):
for un_b, un_k in unroll: for un_b, un_k, un_p in unroll:
tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet( tctot_, tpytot_, ntot_ = exec_multilayer_conv_nnet(
conv_mode, ss, bsizes[i], imshp_starts[i], conv_mode, ss, bsizes[i], imshp_starts[i],
kshpss[i], nkernss[i], kshpss[i], nkernss[i],
img=img, unroll_batch=un_b, unroll_kern=un_k, img=img, unroll_batch=un_b, unroll_kern=un_k,
unroll_patch=un_p,
validate=True) validate=True)
tctot+=[tctot_] tctot+=[tctot_]
tpytot+=[tpytot_] tpytot+=[tpytot_]
...@@ -428,6 +473,11 @@ class TestConvOp(unittest.TestCase): ...@@ -428,6 +473,11 @@ class TestConvOp(unittest.TestCase):
d=N.asarray(ntot)/tpytot d=N.asarray(ntot)/tpytot
print 'speed up py theano(ConvOp) vs convolve2d: %.3fx'%d.mean(),d print 'speed up py theano(ConvOp) vs convolve2d: %.3fx'%d.mean(),d
def init_data(self, shape):
    """Return the array of the given *shape* used to initialize test data.

    Currently pinned to all-ones (deterministic, easy to eyeball when a
    convolution test fails).  The original had an unreachable
    ``return N.random.random(shape)`` after this line — dead code removed;
    switch the body back to the random version to widen test coverage.
    """
    return N.ones(shape)
def test_ConvOpGrad(self): def test_ConvOpGrad(self):
""" """
test the gradient in float and double test the gradient in float and double
...@@ -442,7 +492,7 @@ class TestConvOp(unittest.TestCase): ...@@ -442,7 +492,7 @@ class TestConvOp(unittest.TestCase):
kshps = [(2,3)] kshps = [(2,3)]
imshps = [(2,3,4)] imshps = [(2,3,4)]
modes = ['valid', 'full'] modes = ['valid', 'full']
unroll = [(0,0),(1,1),(2,3)] unroll = [(0,0,True),(1,1,False),(2,3,False),(1,1,False),(0,0,False)]#(batch,kern,patch)
ssizes = [(1,1),(2,2)] ssizes = [(1,1),(2,2)]
for typ in types: for typ in types:
...@@ -457,12 +507,12 @@ class TestConvOp(unittest.TestCase): ...@@ -457,12 +507,12 @@ class TestConvOp(unittest.TestCase):
imgvals = N.array(N.random.random(N.hstack((bsize,imshp))),dtype=imgs.dtype) imgvals = N.array(N.random.random(N.hstack((bsize,imshp))),dtype=imgs.dtype)
for kshp in kshps: for kshp in kshps:
t=numpy.array([imshp[1]-kshp[0],imshp[2]-kshp[1]]) t=numpy.array([imshp[1]-kshp[0],imshp[2]-kshp[1]])
kernvals = N.array(N.random.rand(nkern,visdim,kshp[0], kernvals = N.array(self.init_data((nkern,visdim,kshp[0],
kshp[1]),dtype=kerns.dtype) kshp[1])),dtype=kerns.dtype)
# 'full' mode should support kernels bigger than the input # 'full' mode should support kernels bigger than the input
if mode == 'valid' and (t<0).any(): if mode == 'valid' and (t<0).any():
continue continue
for un_b,un_k in unroll: for un_b,un_k, un_p in unroll:
for ss in ssizes: for ss in ssizes:
print 'test_ConvOpGrad' print 'test_ConvOpGrad'
print 'mode type:', mode, typ print 'mode type:', mode, typ
...@@ -476,14 +526,14 @@ class TestConvOp(unittest.TestCase): ...@@ -476,14 +526,14 @@ class TestConvOp(unittest.TestCase):
def test_i(imgs): def test_i(imgs):
convop = ConvOp(imshp, kshp, nkern, bsize, ss[0], ss[1], convop = ConvOp(imshp, kshp, nkern, bsize, ss[0], ss[1],
output_mode=mode, unroll_batch=un_b, unroll_kern=un_k) output_mode=mode, unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p)
return convop(imgs, kernvals) return convop(imgs, kernvals)
def test_k(kerns): def test_k(kerns):
convop = ConvOp(imshp, kshp, nkern, bsize, ss[0], ss[1], convop = ConvOp(imshp, kshp, nkern, bsize, ss[0], ss[1],
output_mode=mode, unroll_batch=un_b, unroll_kern=un_k) output_mode=mode, unroll_batch=un_b, unroll_kern=un_k, unroll_patch=un_p)
return convop(imgvals, kerns) return convop(imgvals, kerns)
print mode, imshp, kshp, un_b, un_k, ss
#TODO the tolerance needed to pass is very high for float32(0.17). Is this acceptable? Expected? #TODO the tolerance needed to pass is very high for float32(0.17). Is this acceptable? Expected?
tol = None tol = None
if typ=="float32": if typ=="float32":
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论