提交 4cc3b5f2 作者: Olivier Delalleau

Merged

...@@ -704,12 +704,18 @@ class CLinker(link.Linker): ...@@ -704,12 +704,18 @@ class CLinker(link.Linker):
instantiate.customize.add_support_code(self.struct_code) instantiate.customize.add_support_code(self.struct_code)
instantiate.customize.add_support_code(static) instantiate.customize.add_support_code(static)
for extra_arg in ( for extra_arg in (
"-O2", "-O3",
# "-fno-signaling-nans",
#"-fno-finite-math-only",
#"-fmath-errno", "-fno-unsafe-math-optimizations", "-fno-finite-math-only", "-frounding-math", "-fsignaling-nans","-fno-cx-limited-range","-fno-fast-math",
"-ffast-math", "-ffast-math",
#"-fno-finite-math-only",
# "-fno-signaling-nans",
#"-fmath-errno", "-fno-unsafe-math-optimizations", "-fno-finite-math-only", "-frounding-math", "-fsignaling-nans","-fno-cx-limited-range","-fno-fast-math",
#"-fprefetch-loop-arrays", #"-fprefetch-loop-arrays",
#"-ftree-vect-loop-version", #"-ftree-vect-loop-version",
#"-ftree-loop-optimize", #"-ftree-loop-optimize",
#"-ftree-vectorize"): #"-ftree-vectorize",
"-w" #-w means supress all warnings "-w" #-w means supress all warnings
): ):
instantiate.customize.add_extra_compile_arg(extra_arg) instantiate.customize.add_extra_compile_arg(extra_arg)
......
...@@ -8,7 +8,7 @@ def getFilterOutShp(inshp, kshp, (dx,dy)=(1,1), mode='valid'): ...@@ -8,7 +8,7 @@ def getFilterOutShp(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
s = -1 if mode=='valid' else 1 s = -1 if mode=='valid' else 1
inshp, kshp = N.array(inshp), N.array(kshp) inshp, kshp = N.array(inshp), N.array(kshp)
return N.int64(N.ceil((inshp[1:] + s*kshp - s*1)/\ return N.int64(N.ceil((inshp[1:] + s*kshp - s*1)/\
N.array([dy,dx], dtype='float'))) N.array([dx,dy], dtype='float')))
class ConvOp(Op): class ConvOp(Op):
""" """
...@@ -44,20 +44,19 @@ class ConvOp(Op): ...@@ -44,20 +44,19 @@ class ConvOp(Op):
self.unroll_kern=unroll_kern self.unroll_kern=unroll_kern
if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0: if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:
if self.bsize<self.unroll_batch: if self.bsize<=self.unroll_batch:
self.unroll_batch = self.bsize self.unroll_batch = self.bsize
else: else:
self.unroll_batch=1
print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%s) must be 0 or a multiple of bsize(%s). We revert it to 1. This won't change the result, but may make it slower."%(str(self.unroll_batch),str(self.bsize)) print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%s) must be 0 or a multiple of bsize(%s). We revert it to 1. This won't change the result, but may make it slower."%(str(self.unroll_batch),str(self.bsize))
self.unroll_batch=1
if self.unroll_kern>0 and self.nkern % unroll_kern!=0: if self.unroll_kern>0 and self.nkern % unroll_kern!=0:
if self.nkern<self.unroll_kern: if self.nkern<=self.unroll_kern:
self.unroll_kern = self.nkern self.unroll_kern = self.nkern
else: else:
self.unroll_kern=1
print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%s) should be 0 or a multiple of nkern(%s)We revert it to 1. This won't change the result, but may make it slower."%(str(self.unroll_kern),str(self.nkern)) print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%s) should be 0 or a multiple of nkern(%s)We revert it to 1. This won't change the result, but may make it slower."%(str(self.unroll_kern),str(self.nkern))
if self.dx!=1 or self.dy!=1: self.unroll_kern=1
print "Warning, dx!=1 or dy!=1 only supported in python mode!" if (self.dx!=1 or self.dy!=1):
raise NotImplementedError() print "WARNING: dx(%d)!=1 or dy(%d)!=1. The gradient is not implemented for those case."
self.outshp = getFilterOutShp(self.imshp, kshp, (dx,dy), output_mode) self.outshp = getFilterOutShp(self.imshp, kshp, (dx,dy), output_mode)
self.out_mode = output_mode self.out_mode = output_mode
if not self.out_mode in ["valid", "full"]: if not self.out_mode in ["valid", "full"]:
...@@ -92,7 +91,7 @@ class ConvOp(Op): ...@@ -92,7 +91,7 @@ class ConvOp(Op):
raise Exception("The image and the kernel must have the same type." raise Exception("The image and the kernel must have the same type."
"inputs(%s), kerns(%s)"%(inputs.dtype, kerns.dtype)) "inputs(%s), kerns(%s)"%(inputs.dtype, kerns.dtype))
output = tensor.tensor(dtype=inputs.type.dtype, output = tensor.tensor(dtype=inputs.type.dtype,
broadcastable=[False]*outdim, broadcastable=[False]*outdim,
name="ConvOp_Output"); name="ConvOp_Output");
return gof.Apply(self, [inputs, kerns], [output]) return gof.Apply(self, [inputs, kerns], [output])
...@@ -131,7 +130,9 @@ class ConvOp(Op): ...@@ -131,7 +130,9 @@ class ConvOp(Op):
* inputs needs to be a 4D tensor. Couldn't get 3D to work * inputs needs to be a 4D tensor. Couldn't get 3D to work
* will crash if filter the same size as input image * will crash if filter the same size as input image
""" """
if self.dx!=1 or self.dy!=1:
raise NotImplementedError("I don't know how to implement the grad when dx!=1 or dy!=1! Is this possible?")
####### Determine gradient on kernels ######## ####### Determine gradient on kernels ########
if inputs.ndim == 3: if inputs.ndim == 3:
inputs = tensor.shape_padleft(inputs,1) inputs = tensor.shape_padleft(inputs,1)
...@@ -145,25 +146,27 @@ class ConvOp(Op): ...@@ -145,25 +146,27 @@ class ConvOp(Op):
(bsize, nkern) = (self.imshp[0], self.nkern) (bsize, nkern) = (self.imshp[0], self.nkern)
imshp = N.hstack((self.bsize, self.imshp[1:])) imshp = N.hstack((self.bsize, self.imshp[1:]))
kshp = self.outshp kshp = self.outshp
un_b = self.unroll_batch
un_k = self.unroll_kern
elif self.out_mode == 'full': elif self.out_mode == 'full':
(img, filters) = (newgz, newin) (img, filters) = (newgz, newin)
(bsize, nkern) = (self.nkern, self.imshp[0]) (bsize, nkern) = (self.nkern, self.imshp[0])
imshp = N.hstack((self.bsize, self.outshp)) imshp = N.hstack((self.bsize, self.outshp))
kshp = self.imshp[1:] kshp = self.imshp[1:]
un_b = self.unroll_kern
un_k = self.unroll_batch
else: else:
raise NotImplementedError('Only [full,valid] modes are currently supported.') raise NotImplementedError('Only [full,valid] modes are currently supported.')
filters = filters[:,:,::-1,::-1] filters = filters[:,:,::-1,::-1]
#find good value for the unroll #find good value for the unroll
un_b = self.unroll_batch
un_k = self.unroll_kern
if un_b!=0 and bsize%un_b!=0: if un_b!=0 and bsize%un_b!=0:
if bsize<un_b: if bsize<un_b:
un_b = bsize un_b = bsize
else: else:
un_b = 1 un_b = 1
print "OPTIMISATION WARNING: in ConvOp.grad() we can't determine a good unroll value for the batch. Maybe you can optimize this!" print "OPTIMISATION WARNING: in ConvOp.grad() we can't determine a good unroll value for the batch. Maybe you can optimize this!", bsize, un_b, self.unroll_batch, self.unroll_kern
if un_k!=0 and nkern%un_k!=0: if un_k!=0 and nkern%un_k!=0:
if nkern<un_k: if nkern<un_k:
un_k = nkern un_k = nkern
...@@ -238,7 +241,7 @@ using namespace std; ...@@ -238,7 +241,7 @@ using namespace std;
self.unroll_kern) self.unroll_kern)
#TODO: should we choose the unroll size automatically with the bigger divisor under 5? #TODO: should we choose the unroll size automatically with the bigger divisor under 5?
if self.out_mode == 'valid': if self.out_mode == 'valid' and self.dx==0 and self.dy==0:
# print "return gemm version" # print "return gemm version"
return _conv_op_code_valid_gemm % d return _conv_op_code_valid_gemm % d
else: else:
...@@ -388,8 +391,11 @@ if ((!%(z)s) ...@@ -388,8 +391,11 @@ if ((!%(z)s)
} }
int Os[2]; int Os[2];
if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;} Os[0]=%(self_outshp0)s;
else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;} Os[1]=%(self_outshp1)s;
//I keep the formula to calculte Os in case we need it in the futur.
//if (mode == FULL) {Os[0] = (int)ceil((dim_im[0]+dim_ker[0]-1)/float(%(self_dx)s)); Os[1] = ceil((dim_im[1]+dim_ker[1]-1)/float(%(self_dy)s));}
//else {Os[0] = (int)ceil((dim_im[0]-dim_ker[0]+1)/float(%(self_dx)s)); Os[1] = (int)ceil((dim_im[1]-dim_ker[1]+1)/float(%(self_dy)s));}
for(int b=0;b< %(self_bsize)s;b++){ for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){ for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
...@@ -410,12 +416,14 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -410,12 +416,14 @@ for(int b=0;b< %(self_bsize)s;b++){
int new_m; int new_m;
for (int m=0; m < Os[0]; m++) { for (int iter_m=0; iter_m < Os[0]; iter_m++) {
// Reposition index into input image based on requested output size // Reposition index into input image based on requested output size
if (mode == FULL) new_m = m ; int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
else new_m = (m+dim_ker[0]-1); if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker[0]-1);
for (int n=0; n < Os[1]; n++) { // loop over columns for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0; %(type)s sum=0;
// Sum over kernel, if index into image is out of bounds // Sum over kernel, if index into image is out of bounds
...@@ -433,7 +441,7 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -433,7 +441,7 @@ for(int b=0;b< %(self_bsize)s;b++){
}else{ }else{
//do the part where kernel is to the right of the img //do the part where kernel is to the right of the img
int k=0,max_k=max((int)(n-dim_im[1])+1,0); int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
if(fill_value!=0){ if(fill_value!=0){
for(k=0;k<max_k;k++){ for(k=0;k<max_k;k++){
...@@ -442,9 +450,9 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -442,9 +450,9 @@ for(int b=0;b< %(self_bsize)s;b++){
}else {k=max_k;} }else {k=max_k;}
//do the part where the kernel is on the img //do the part where the kernel is on the img
max_k=min(n+1,(int)dim_ker[1]); max_k=min(pos_n+1,(int)dim_ker[1]);
const %(type)s * idx_in=&in[ind0*dim_im[1]]; const %(type)s * idx_in=&in[ind0*dim_im[1]];
for (int ind1=n-k; k<max_k; k++,ind1--) { for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
sum+= idx_hvals[k] * idx_in[ind1]; sum+= idx_hvals[k] * idx_in[ind1];
} }
//do the part to the left of the img //do the part to the left of the img
...@@ -454,14 +462,13 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -454,14 +462,13 @@ for(int b=0;b< %(self_bsize)s;b++){
}else{ }else{
const %(type)s* idx_in=&in[ind0*dim_im[1]]; //JB: should be dim_im[1] right? (was dim_im[0]) const %(type)s* idx_in=&in[ind0*dim_im[1]]; //JB: should be dim_im[1] right? (was dim_im[0])
const %(type)s* idx_hvals=&hvals[j*dim_ker[1]]; const %(type)s* idx_hvals=&hvals[j*dim_ker[1]];
int new_n = (n+dim_ker[1]-1); int new_n = (pos_n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) { for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
sum+=idx_hvals[k]*idx_in[last]; sum+=idx_hvals[k]*idx_in[last];
} }
} }
}//for j }//for j
out[m*dim_zz[1]+n] %(affectation)s sum; out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}//for n }//for n
}//for m }//for m
}//for stack_size }//for stack_size
...@@ -763,7 +770,11 @@ if(%(img2d)s->nd==2){ ...@@ -763,7 +770,11 @@ if(%(img2d)s->nd==2){
img2d_dim[1]=%(img2d)s->dimensions[1]; img2d_dim[1]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0]; img2d_dim[0]=%(img2d)s->dimensions[0];
}else { }else {
PyErr_SetString(PyExc_ValueError, "img don't have a good shape"); std:stringstream temp;
temp << "nddim="<<%(img2d)s->nd;
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
("img don't have a good shape. " + param).c_str());
%(fail)s; %(fail)s;
} }
...@@ -777,11 +788,7 @@ if(%(filtersflipped)s->nd==3){ ...@@ -777,11 +788,7 @@ if(%(filtersflipped)s->nd==3){
kerns_dim[1]=%(filtersflipped)s->dimensions[1]; kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0]; kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{ }else{
std:stringstream temp; PyErr_SetString(PyExc_ValueError, "kernel don't have a good shape");
temp << "nddim="<<%(filtersflipped)s->nd;
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
("kernel don't have a good shape. " + param).c_str());
%(fail)s; %(fail)s;
} }
...@@ -844,8 +851,12 @@ if ((!%(z)s) ...@@ -844,8 +851,12 @@ if ((!%(z)s)
} }
int Os[2]; int Os[2];
if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;} Os[0]=%(self_outshp0)s;
else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;} Os[1]=%(self_outshp1)s;
//I keep the formula to calculte Os in case we need it in the futur.
//if (mode == FULL) {Os[0] = (int)ceil((dim_im[0]+dim_ker[0]-1)/float(%(self_dx)s)); Os[1] = ceil((dim_im[1]+dim_ker[1]-1)/float(%(self_dy)s));}
//else {Os[0] = (int)ceil((dim_im[0]-dim_ker[0]+1)/float(%(self_dx)s)); Os[1] = (int)ceil((dim_im[1]-dim_ker[1]+1)/float(%(self_dy)s));}
for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){ for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){
...@@ -866,12 +877,14 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){ ...@@ -866,12 +877,14 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
int new_m; int new_m;
for (int m=0; m < Os[0]; m++) { for (int iter_m=0; iter_m < Os[0]; iter_m++) {
// Reposition index into input image based on requested output size // Reposition index into input image based on requested output size
if (mode == FULL) new_m = m ; int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
else new_m = (m+dim_ker[0]-1); if (mode == FULL) new_m = pos_m ;
else new_m = (pos_m+dim_ker[0]-1);
for (int n=0; n < Os[1]; n++) { // loop over columns for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
"""%d """%d
ret+=my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize*unroll_ksize) ret+=my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize*unroll_ksize)
ret+=""" ret+="""
...@@ -895,7 +908,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){ ...@@ -895,7 +908,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
}else{ }else{
//do the part where kernel is to the right of the img //do the part where kernel is to the right of the img
int k=0,max_k=max((int)(n-dim_im[1])+1,0); int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
if(fill_value!=0){ if(fill_value!=0){
for(k=0;k<max_k;k++){ for(k=0;k<max_k;k++){
...@@ -906,11 +919,11 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){ ...@@ -906,11 +919,11 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
}else {k=max_k;} }else {k=max_k;}
//do the part where the kernel is on the img //do the part where the kernel is on the img
max_k=min(n+1,(int)dim_ker[1]); max_k=min(pos_n+1,(int)dim_ker[1]);
"""%d """%d
ret+=my_dup("const %(type)s * idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize) ret+=my_dup("const %(type)s * idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
ret+=""" ret+="""
for (int ind1=n-k; k<max_k; k++,ind1--) { for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
"""%d """%d
ret+=my_dup2("sum%(unroll_iter)s+= idx_hvals%(unroll_kiter)s[k] * idx_in%(unroll_biter)s[ind1];") ret+=my_dup2("sum%(unroll_iter)s+= idx_hvals%(unroll_kiter)s[k] * idx_in%(unroll_biter)s[ind1];")
...@@ -929,7 +942,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){ ...@@ -929,7 +942,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
ret+=my_dup("const %(type)s* idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize) ret+=my_dup("const %(type)s* idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
ret+=my_dup("const %(type)s* idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];",unroll_ksize) ret+=my_dup("const %(type)s* idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];",unroll_ksize)
ret+=""" ret+="""
int new_n = (n+dim_ker[1]-1); int new_n = (pos_n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) { for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
"""%d """%d
...@@ -940,7 +953,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){ ...@@ -940,7 +953,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
}//for j }//for j
"""%d """%d
ret+=my_dup("out%(unroll_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize*unroll_ksize) ret+=my_dup("out%(unroll_iter)s[iter_m*dim_zz[1]+iter_n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize*unroll_ksize)
ret+=""" ret+="""
}//for n }//for n
}//for m }//for m
......
...@@ -90,16 +90,18 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll ...@@ -90,16 +90,18 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll
####### test with new sp.convolve2 function ###### ####### test with new sp.convolve2 function ######
time1 = time.time() time1 = time.time()
hid, outshp2 = convolve2(kern, kshp, nkern, img, imshp, hid, outshp2 = convolve2(kern, kshp, nkern, img, imshp,
bsize, (1,1), mode=conv_mode) bsize, (ss[0],ss[1]), mode=conv_mode)
propup = function([kern, img], hid) propup = function([kern, img], hid)
propup1 = function([kern, img], hid,mode=Mode(linker="py")) propup1 = function([kern, img], hid,mode=Mode(linker="py"))
hidval = propup(w_flip.reshape(nkern,-1), imgval.reshape(bsize,-1)) hidval = propup(w_flip.reshape(nkern,-1), imgval.reshape(bsize,-1))
hidval = hidval.reshape(bsize,nkern,outshp2[-2],outshp2[-1])[:,:,::ss[0],::ss[1]] hidval = hidval.reshape(bsize,nkern,outshp2[-2],outshp2[-1])
# hidval = hidval[:,:,::ss[0],::ss[1]]
hidval = hidval.reshape(bsize, -1) hidval = hidval.reshape(bsize, -1)
for i in range(repeat): for i in range(repeat):
hidval1 = propup1(w_flip.reshape(nkern,-1), imgval.reshape(bsize,-1)) hidval1 = propup1(w_flip.reshape(nkern,-1), imgval.reshape(bsize,-1))
hidval1 = hidval1.reshape(bsize,nkern,outshp2[-2],outshp2[-1])[:,:,::ss[0],::ss[1]] hidval1 = hidval1.reshape(bsize,nkern,outshp2[-2],outshp2[-1])
# hidval1 = hidval1[:,:,::ss[0],::ss[1]]
hidval1 = hidval1.reshape(bsize, -1) hidval1 = hidval1.reshape(bsize, -1)
assert (N.abs(hidval-hidval1)<1e-5).all() assert (N.abs(hidval-hidval1)<1e-5).all()
...@@ -113,7 +115,7 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll ...@@ -113,7 +115,7 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll
hidval1=outval.copy() hidval1=outval.copy()
# ConvOp # ConvOp
conv_op = ConvOp(imshp, kshp, nkern, bsize, 1,1, conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern)(inputs4, kerns4) conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern)(inputs4, kerns4)
l1shp=N.hstack((nkern, l1shp=N.hstack((nkern,
getFilterOutShp(imshp, kshp, ss, conv_mode))) getFilterOutShp(imshp, kshp, ss, conv_mode)))
propup2 = function([inputs4, kerns4], conv_op) propup2 = function([inputs4, kerns4], conv_op)
...@@ -122,14 +124,14 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll ...@@ -122,14 +124,14 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll
time1 = time.time() time1 = time.time()
for i in range(repeat): for i in range(repeat):
hidval2_ = propup2(imgval,w_flip) hidval2_ = propup2(imgval,w_flip)
hidval2 = hidval2_[:,:,0::ss[0],0::ss[1]] hidval2 = hidval2_#[:,:,0::ss[0],0::ss[1]]
tctot += time.time() - time1 tctot += time.time() - time1
if conv_op_py: if conv_op_py:
time1 = time.time() time1 = time.time()
for i in range(repeat): for i in range(repeat):
hidval3_ = propup3(imgval,w_flip) hidval3_ = propup3(imgval,w_flip)
hidval3 = hidval3_[:,:,0::ss[0],0::ss[1]] hidval3 = hidval3_#[:,:,0::ss[0],0::ss[1]]
tpytot += time.time() - time1 tpytot += time.time() - time1
assert (N.abs(hidval2-hidval3)<1e-5).all() assert (N.abs(hidval2-hidval3)<1e-5).all()
else: else:
...@@ -235,7 +237,7 @@ class TestConvOp(unittest.TestCase): ...@@ -235,7 +237,7 @@ class TestConvOp(unittest.TestCase):
# compute with new convolve2 (no timing info) # compute with new convolve2 (no timing info)
output4, outshp4 = convolve2(kerns, kshp, nkern, input,\ output4, outshp4 = convolve2(kerns, kshp, nkern, input,\
imshp, bsize, (1,1), bias=bias, mode=conv_mode) imshp, bsize, (ss[0],ss[1]), bias=bias, mode=conv_mode)
# print 'output4', output4 # print 'output4', output4
ttime1 = time.time() ttime1 = time.time()
...@@ -244,7 +246,7 @@ class TestConvOp(unittest.TestCase): ...@@ -244,7 +246,7 @@ class TestConvOp(unittest.TestCase):
# print 'out4', out4, img1d, filtersflipped # print 'out4', out4, img1d, filtersflipped
tconv2 += [time.time() - ttime1] tconv2 += [time.time() - ttime1]
out4 = out4.reshape(bsize, nkern, outshp4[1], outshp4[2]) out4 = out4.reshape(bsize, nkern, outshp4[1], outshp4[2])
out4 = out4[:,:,0::ss[0],0::ss[1]] out4 = out4#[:,:,0::ss[0],0::ss[1]]
out4 = out4.reshape(bsize, -1) out4 = out4.reshape(bsize, -1)
# compute with ConvOp # compute with ConvOp
...@@ -252,18 +254,18 @@ class TestConvOp(unittest.TestCase): ...@@ -252,18 +254,18 @@ class TestConvOp(unittest.TestCase):
inputs=dmatrix3() inputs=dmatrix3()
kerns3=dmatrix3() kerns3=dmatrix3()
bia=T.dscalar() bia=T.dscalar()
conv_op = ConvOp(imshp, kshp, nkern, bsize, 1,1, conv_mode)(inputs, kerns3) conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode)(inputs, kerns3)
f2 = function([inputs, kerns3], conv_op, mode=Mode(linker="c")) f2 = function([inputs, kerns3], conv_op, mode=Mode(linker="c"))
f3 = function([inputs, kerns3], conv_op, mode=Mode(linker="py")) f3 = function([inputs, kerns3], conv_op, mode=Mode(linker="py"))
ttime1 = time.time() ttime1 = time.time()
out2_ = f2(img2d, filtersflipped) out2_ = f2(img2d, filtersflipped)
out2__ = out2_[:,:,0::ss[0],0::ss[1]] out2__ = out2_#[:,:,0::ss[0],0::ss[1]]
tconvop += [time.time() - ttime1] tconvop += [time.time() - ttime1]
out2___ = out2__.copy() out2___ = out2__.copy()
out2 = out2___ + biasvals.reshape(1,nkern,1,1) out2 = out2___ + biasvals.reshape(1,nkern,1,1)
out3_ = f3(img2d, filtersflipped) out3_ = f3(img2d, filtersflipped)
out3__ = out3_[:,:,0::ss[0],0::ss[1]] out3__ = out3_#[:,:,0::ss[0],0::ss[1]]
out3___ = out3__.copy() out3___ = out3__.copy()
out3 = out3___ + biasvals.reshape(1,nkern,1,1) out3 = out3___ + biasvals.reshape(1,nkern,1,1)
assert (N.abs(out2_-out3_)<1e-5).all() assert (N.abs(out2_-out3_)<1e-5).all()
...@@ -302,15 +304,21 @@ class TestConvOp(unittest.TestCase): ...@@ -302,15 +304,21 @@ class TestConvOp(unittest.TestCase):
print 'speed up ConvOp vs convolve2d: %.3f'%d.mean(),d print 'speed up ConvOp vs convolve2d: %.3f'%d.mean(),d
def test_multilayer_conv(self): def test_multilayer_conv(self):
print '\n\n*************************************************'
print ' TEST MULTILAYER CONVOLUTION'
print '*************************************************'
# fixed parameters # fixed parameters
# test multiple configuration at the same time
bsizes = [6,6] # batch size bsizes = [6,6] # batch size
imshp_starts = [(1,28,28),(1,4,4)] imshp_starts = [(1,13,14),(1,4,5)]
kshpss = ([[5,6],[7,4]],[[2,2],[2,2]]) kshpss = ([[5,6],[7,4]],[[2,2],[2,2]])
nkernss = [[20,40],[2,2]] # per output pixel nkernss = [[20,40],[2,2]] # per output pixel
ssizess = [[(1,1),(2,2)],[(1,1),(2,2)]] ssizess = [[(1,1),(1,2)],[(1,1),(2,2)]]
convmodes = ['valid','full'] convmodes = ['valid','full']
do_convolve2=True do_convolve2=True
unroll = [(0,0),(1,1),(2,2),(3,2)]#(batch,kern) unroll = [(0,0),(1,1),(2,2),(3,2)]#(batch,kern)
do_speed_test = False
# TODO: this version show a bug that was fixed # TODO: this version show a bug that was fixed
# the test is included in the upper test. # the test is included in the upper test.
...@@ -319,15 +327,6 @@ class TestConvOp(unittest.TestCase): ...@@ -319,15 +327,6 @@ class TestConvOp(unittest.TestCase):
# nkerns = [2,2] # per output pixel # nkerns = [2,2] # per output pixel
# ssizes = [(1,1),(2,2)]#2,2)] # ssizes = [(1,1),(2,2)]#2,2)]
#test speed
# bsize = 10 # batch size
# imshp_start = (1,50,49)#un square shape to test more corner case.
# kshps = ([11,12],[12,11])#un square shape to test more corner case.
# nkerns = [20,20] # per output pixel
# ssizes = [(1,1),]#(1,1)]#(2,2) bugged
# convmodes = ['valid','full']
# do_convolve2=False
N.set_printoptions(threshold=N.nan) N.set_printoptions(threshold=N.nan)
# symbolic stuff # symbolic stuff
...@@ -338,7 +337,7 @@ class TestConvOp(unittest.TestCase): ...@@ -338,7 +337,7 @@ class TestConvOp(unittest.TestCase):
for i in range(len(kshpss)): for i in range(len(kshpss)):
assert len(kshpss[i])==len(nkernss[i])==len(kerns) assert len(kshpss[i])==len(nkernss[i])==len(kerns)
if False: if do_speed_test:
# calculate the speed up of different combination of unroll # calculate the speed up of different combination of unroll
# put the paramter to the same you will try. # put the paramter to the same you will try.
...@@ -418,16 +417,19 @@ class TestConvOp(unittest.TestCase): ...@@ -418,16 +417,19 @@ class TestConvOp(unittest.TestCase):
d=N.asarray(ntot)/tpytot d=N.asarray(ntot)/tpytot
print 'speed up py theano(ConvOp) vs convolve2d: %.3fx'%d.mean(),d print 'speed up py theano(ConvOp) vs convolve2d: %.3fx'%d.mean(),d
def test_ConvOpGrad(self): def test_ConvOpGrad(self):
""" """
test the gradient in float and double test the gradient in float and double
""" """
print '\n\n*************************************************'
print ' TEST ConvOp.grad'
print '*************************************************'
nkern = 4 nkern = 4
bsize = 3 bsize = 3
types = ["float32", "float64"] types = ["float32", "float64"]
kshps = [(5,5), (6,7)] kshps = [(5,5), (6,7)]
imshps = [(1,5,5), (2,8,8), (3,8,7)] imshps = [(1,5,5), (2,8,7)]
modes = ['valid', 'full'] modes = ['valid', 'full']
unroll_batch=[0,1,3] unroll_batch=[0,1,3]
unroll_kern=[0,1,4] unroll_kern=[0,1,4]
...@@ -468,19 +470,22 @@ class TestConvOp(unittest.TestCase): ...@@ -468,19 +470,22 @@ class TestConvOp(unittest.TestCase):
tol=None if typ!="float32" else 0.16) tol=None if typ!="float32" else 0.16)
if __name__ == '__main__': if __name__ == '__main__':
# t = TestConvOp("test_convolution") t = TestConvOp("test_convolution")
# t.test_convolution() # t.test_convolution()
# t.test_multilayer_conv() t.test_multilayer_conv()
# from theano.tests import main # from theano.tests import main
# main("test_sp") # main("test_sp")
bsize = 20 # batch size if False:
imshp_start = (1,100,100)#un square shape to test more corner case. #used to lanch 8 jobs at the same time.
kshps = ([11,12],[12,11])#un square shape to test more corner case. bsize = 20 # batch size
nkerns = [20,20] # per output pixel imshp_start = (1,100,100)#un square shape to test more corner case.
ssizes = [(1,1),]#(1,1)]#(2,2) bugged kshps = ([11,12],[12,11])#un square shape to test more corner case.
convmodes = ['valid','full'] nkerns = [20,20] # per output pixel
unroll_batch = 5 ssizes = [(1,1),]#(1,1)]#(2,2) bugged
unroll_kern = 2 convmodes = ['valid','full']
ctot=0 unroll_batch = 5
tctot, tpytot, ntot = exec_multilayer_conv_nnet(convmodes[1], ssizes[0], bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_batch, unroll_kern=unroll_kern, validate=False, do_print=False,repeat=5) unroll_kern = 2
print "total exec time %.3fs"%tctot ctot=0
tctot, tpytot, ntot = exec_multilayer_conv_nnet(convmodes[1], ssizes[0], bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_batch, unroll_kern=unroll_kern, validate=False, do_print=False,repeat=5)
print "total exec time %.3fs"%tctot
...@@ -30,7 +30,7 @@ _mtypes = [sparse.csc_matrix, sparse.csr_matrix] ...@@ -30,7 +30,7 @@ _mtypes = [sparse.csc_matrix, sparse.csr_matrix]
_mtype_to_str = {sparse.csc_matrix: "csc", sparse.csr_matrix: "csr"} _mtype_to_str = {sparse.csc_matrix: "csc", sparse.csr_matrix: "csr"}
import scipy import scipy
if scipy.__version__ != '0.7.0': if not scipy.__version__.startswith('0.7.'):
sys.stderr.write("WARNING: scipy version = %s. We prefer version >=0.7.0 because it has bugs fixed in the sparse matrix code.\n" % scipy.__version__) sys.stderr.write("WARNING: scipy version = %s. We prefer version >=0.7.0 because it has bugs fixed in the sparse matrix code.\n" % scipy.__version__)
def _is_sparse_variable(x): def _is_sparse_variable(x):
......
...@@ -764,8 +764,10 @@ class CrossentropyCategorical1Hot(gof.Op): ...@@ -764,8 +764,10 @@ class CrossentropyCategorical1Hot(gof.Op):
_true_one_of_n = tensor.as_tensor_variable(true_one_of_n) _true_one_of_n = tensor.as_tensor_variable(true_one_of_n)
if _coding_dist.type.ndim != 2: if _coding_dist.type.ndim != 2:
raise TypeError('matrix required for argument: coding_dist') raise TypeError('matrix required for argument: coding_dist')
if _true_one_of_n.type != tensor.lvector: if _true_one_of_n.type not in (tensor.lvector, tensor.ivector):
raise TypeError('integer vector required for argument: true_one_of_n') raise TypeError('integer vector required for argument: true_one_of_n'
'(got type: %s instead of: %s)' % (_true_one_of_n.type,
tensor.lvector))
return gof.Apply(self, [_coding_dist, _true_one_of_n], [tensor.dvector()]) return gof.Apply(self, [_coding_dist, _true_one_of_n], [tensor.dvector()])
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论