提交 9e152f02 作者: Frederic Bastien

Renamed "unloop" to "unroll" for consistency.

上级 32bf9b72
...@@ -638,14 +638,14 @@ Py_XDECREF(img2d); ...@@ -638,14 +638,14 @@ Py_XDECREF(img2d);
""" """
def gen_conv_code_unroll_batch(d,unloop_size=1): def gen_conv_code_unroll_batch(d,unroll_size=1):
""" c_code for ConvOp that unroll the batch size loop """ c_code for ConvOp that unroll the batch size loop
""" """
d["unloop_size"]=unloop_size d["unroll_size"]=unroll_size
def my_dup(st): def my_dup(st):
s="" s=""
for i in range(unloop_size): for i in range(unroll_size):
d["unloop_iter"]=i d["unroll_iter"]=i
s+=st%d s+=st%d
return s return s
ret = """ ret = """
...@@ -764,7 +764,7 @@ if ((!%(z)s) ...@@ -764,7 +764,7 @@ if ((!%(z)s)
int Os[2]; int Os[2];
if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;} if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;}
else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;} else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;}
for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_size)s){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){ for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
//assertions //assertions
...@@ -773,12 +773,12 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){ ...@@ -773,12 +773,12 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){
if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s; if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s; if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
"""%d """%d
ret+=my_dup("%(type)s * __restrict__ out%(unloop_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unloop_iter)s,n_kern));\n") ret+=my_dup("%(type)s * __restrict__ out%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unroll_iter)s,n_kern));\n")
ret+=my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unloop_iter)s[i] = 0;") ret+=my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unroll_iter)s[i] = 0;")
ret+=""" ret+="""
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){ for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
"""%d """%d
ret+=my_dup("const %(type)s * __restrict__ in%(unloop_iter)d=(%(type)s *)(PyArray_GETPTR2(img2d,b+%(unloop_iter)s,stack_size));\n") ret+=my_dup("const %(type)s * __restrict__ in%(unroll_iter)d=(%(type)s *)(PyArray_GETPTR2(img2d,b+%(unroll_iter)s,stack_size));\n")
ret+=""" ret+="""
const %(type)s * __restrict__ hvals=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern,stack_size)); const %(type)s * __restrict__ hvals=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern,stack_size));
...@@ -791,7 +791,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){ ...@@ -791,7 +791,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){
for (int n=0; n < Os[1]; n++) { // loop over columns for (int n=0; n < Os[1]; n++) { // loop over columns
"""%d """%d
ret+=my_dup("%(type)s sum%(unloop_iter)s=0;\n") ret+=my_dup("%(type)s sum%(unroll_iter)s=0;\n")
ret+=""" ret+="""
// Sum over kernel, if index into image is out of bounds // Sum over kernel, if index into image is out of bounds
...@@ -806,7 +806,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){ ...@@ -806,7 +806,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){
for (int k=0; k < dim_ker[1]; k++) { for (int k=0; k < dim_ker[1]; k++) {
%(type)s tmp = idx_hvals[k] * fill_value; %(type)s tmp = idx_hvals[k] * fill_value;
"""%d """%d
ret+=my_dup("sum%(unloop_iter)s += tmp;\n") ret+=my_dup("sum%(unroll_iter)s += tmp;\n")
ret+=""" ret+="""
} }
}else{ }else{
...@@ -818,7 +818,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){ ...@@ -818,7 +818,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){
for(k=0;k<max_k;k++){ for(k=0;k<max_k;k++){
%(type)s tmp = idx_hvals[k] * fill_value; %(type)s tmp = idx_hvals[k] * fill_value;
"""%d """%d
ret+=my_dup("sum%(unloop_iter)s += tmp;\n") ret+=my_dup("sum%(unroll_iter)s += tmp;\n")
ret+=""" ret+="""
} }
}else {k=max_k;} }else {k=max_k;}
...@@ -826,11 +826,11 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){ ...@@ -826,11 +826,11 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){
//do the part where the kernel is on the img //do the part where the kernel is on the img
max_k=min(n+1,(int)dim_ker[1]); max_k=min(n+1,(int)dim_ker[1]);
"""%d """%d
ret+=my_dup("const %(type)s * idx_in%(unloop_iter)s=&in%(unloop_iter)s[ind0*dim_im[1]];\n") ret+=my_dup("const %(type)s * idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];\n")
ret+=""" ret+="""
for (int ind1=n-k; k<max_k; k++,ind1--) { for (int ind1=n-k; k<max_k; k++,ind1--) {
"""%d """%d
ret+=my_dup("sum%(unloop_iter)s+= idx_hvals[k] * idx_in%(unloop_iter)s[ind1];\n") ret+=my_dup("sum%(unroll_iter)s+= idx_hvals[k] * idx_in%(unroll_iter)s[ind1];\n")
ret+=""" ret+="""
} }
//do the part to the left of the img //do the part to the left of the img
...@@ -838,28 +838,28 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){ ...@@ -838,28 +838,28 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_size)s){
for(;k<dim_ker[1];k++){ for(;k<dim_ker[1];k++){
%(type)s tmp = idx_hvals[k] * fill_value; %(type)s tmp = idx_hvals[k] * fill_value;
"""%d """%d
ret+=my_dup("sum%(unloop_iter)s += tmp;\n") ret+=my_dup("sum%(unroll_iter)s += tmp;\n")
ret+=""" ret+="""
} }
} }
}else{ }else{
"""%d """%d
ret+=my_dup("const %(type)s* idx_in%(unloop_iter)s=&in%(unloop_iter)s[ind0*dim_im[1]];\n") ret+=my_dup("const %(type)s* idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];\n")
ret+=""" ret+="""
const %(type)s* idx_hvals=&hvals[j*dim_ker[1]]; const %(type)s* idx_hvals=&hvals[j*dim_ker[1]];
int new_n = (n+dim_ker[1]-1); int new_n = (n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) { for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
"""%d """%d
ret+=my_dup("sum%(unloop_iter)s+=idx_hvals[k]*idx_in%(unloop_iter)s[last];\n") ret+=my_dup("sum%(unroll_iter)s+=idx_hvals[k]*idx_in%(unroll_iter)s[last];\n")
ret+=""" ret+="""
} }
} }
}//for j }//for j
"""%d """%d
ret+=my_dup("out%(unloop_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unloop_iter)s;\n") ret+=my_dup("out%(unroll_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unroll_iter)s;\n")
# ret+=my_dup("cout<<sum%(unloop_iter)s<<endl;") # ret+=my_dup("cout<<sum%(unroll_iter)s<<endl;")
ret+=""" ret+="""
}//for n }//for n
}//for m }//for m
...@@ -878,14 +878,14 @@ Py_XDECREF(filtersflipped); ...@@ -878,14 +878,14 @@ Py_XDECREF(filtersflipped);
def gen_conv_code_unroll_kern(d,unloop_size=1): def gen_conv_code_unroll_kern(d,unroll_size=1):
""" c_code for ConvOp that unroll the batch size loop """ c_code for ConvOp that unroll the batch size loop
""" """
d["unloop_size"]=unloop_size d["unroll_size"]=unroll_size
def my_dup(st): def my_dup(st):
s="" s=""
for i in range(unloop_size): for i in range(unroll_size):
d["unloop_iter"]=i d["unroll_iter"]=i
s+=st%d s+=st%d
return s return s
ret = """ ret = """
...@@ -1006,7 +1006,7 @@ if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]- ...@@ -1006,7 +1006,7 @@ if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-
else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;} else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;}
for(int b=0;b< %(self_bsize)s;b++){ for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unloop_size)s){ for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_size)s){
//assertions //assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s; if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
...@@ -1014,15 +1014,15 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1014,15 +1014,15 @@ for(int b=0;b< %(self_bsize)s;b++){
if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s; if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s; if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
"""%d """%d
ret+=my_dup("%(type)s * __restrict__ out%(unloop_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern+%(unloop_iter)s));") ret+=my_dup("%(type)s * __restrict__ out%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern+%(unroll_iter)s));")
ret+=my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unloop_iter)s[i] = 0;") ret+=my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unroll_iter)s[i] = 0;")
ret+=""" ret+="""
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){ for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
const %(type)s * __restrict__ in=(%(type)s *)(PyArray_GETPTR2(img2d,b,stack_size)); const %(type)s * __restrict__ in=(%(type)s *)(PyArray_GETPTR2(img2d,b,stack_size));
"""%d """%d
ret+=my_dup("const %(type)s * __restrict__ hvals%(unloop_iter)s=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern+%(unloop_iter)s,stack_size));") ret+=my_dup("const %(type)s * __restrict__ hvals%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern+%(unroll_iter)s,stack_size));")
ret+=""" ret+="""
int new_m; int new_m;
...@@ -1034,7 +1034,7 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1034,7 +1034,7 @@ for(int b=0;b< %(self_bsize)s;b++){
for (int n=0; n < Os[1]; n++) { // loop over columns for (int n=0; n < Os[1]; n++) { // loop over columns
"""%d """%d
ret+=my_dup("%(type)s sum%(unloop_iter)s=0;") ret+=my_dup("%(type)s sum%(unroll_iter)s=0;")
ret+=""" ret+="""
// Sum over kernel, if index into image is out of bounds // Sum over kernel, if index into image is out of bounds
...@@ -1044,13 +1044,13 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1044,13 +1044,13 @@ for(int b=0;b< %(self_bsize)s;b++){
if(mode==FULL){ if(mode==FULL){
"""%d """%d
ret+=my_dup("const %(type)s * idx_hvals%(unloop_iter)s=&hvals%(unloop_iter)s[j*dim_ker[1]];") ret+=my_dup("const %(type)s * idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];")
ret+=""" ret+="""
if(ind0 < 0 || ind0 >= dim_im[0]){ if(ind0 < 0 || ind0 >= dim_im[0]){
if(fill_value!=0) if(fill_value!=0)
for (int k=0; k < dim_ker[1]; k++) { for (int k=0; k < dim_ker[1]; k++) {
"""%d """%d
ret+=my_dup("sum%(unloop_iter)s += idx_hvals%(unloop_iter)s[k] * fill_value;") ret+=my_dup("sum%(unroll_iter)s += idx_hvals%(unroll_iter)s[k] * fill_value;")
ret+=""" ret+="""
} }
}else{ }else{
...@@ -1061,7 +1061,7 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1061,7 +1061,7 @@ for(int b=0;b< %(self_bsize)s;b++){
for(k=0;k<max_k;k++){ for(k=0;k<max_k;k++){
"""%d """%d
ret+=my_dup("sum%(unloop_iter)s += idx_hvals%(unloop_iter)s[k]*fill_value;") ret+=my_dup("sum%(unroll_iter)s += idx_hvals%(unroll_iter)s[k]*fill_value;")
ret+=""" ret+="""
} }
...@@ -1072,33 +1072,33 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -1072,33 +1072,33 @@ for(int b=0;b< %(self_bsize)s;b++){
const %(type)s * idx_in=&in[ind0*dim_im[1]]; const %(type)s * idx_in=&in[ind0*dim_im[1]];
for (int ind1=n-k; k<max_k; k++,ind1--) { for (int ind1=n-k; k<max_k; k++,ind1--) {
"""%d """%d
ret+=my_dup("sum%(unloop_iter)s += idx_hvals%(unloop_iter)s[k] * idx_in[ind1];") ret+=my_dup("sum%(unroll_iter)s += idx_hvals%(unroll_iter)s[k] * idx_in[ind1];")
ret+=""" ret+="""
} }
//do the part to the left of the img //do the part to the left of the img
if(fill_value!=0) if(fill_value!=0)
for(;k<dim_ker[1];k++){ for(;k<dim_ker[1];k++){
"""%d """%d
ret+=my_dup("sum%(unloop_iter)s+= idx_hvals%(unloop_iter)s[k]*fill_value;") ret+=my_dup("sum%(unroll_iter)s+= idx_hvals%(unroll_iter)s[k]*fill_value;")
ret+=""" ret+="""
} }
} }
}else{ }else{
const %(type)s* idx_in=&in[ind0*dim_im[1]]; const %(type)s* idx_in=&in[ind0*dim_im[1]];
"""%d """%d
ret+=my_dup("const %(type)s* idx_hvals%(unloop_iter)s=&hvals%(unloop_iter)s[j*dim_ker[1]];") ret+=my_dup("const %(type)s* idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];")
ret+=""" ret+="""
int new_n = (n+dim_ker[1]-1); int new_n = (n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) { for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
"""%d """%d
ret+=my_dup("sum%(unloop_iter)s += idx_hvals%(unloop_iter)s[k]*idx_in[last];") ret+=my_dup("sum%(unroll_iter)s += idx_hvals%(unroll_iter)s[k]*idx_in[last];")
ret+=""" ret+="""
} }
} }
}//for j }//for j
"""%d """%d
ret+=my_dup("out%(unloop_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unloop_iter)s;") ret+=my_dup("out%(unroll_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unroll_iter)s;")
ret+=""" ret+="""
}//for n }//for n
}//for m }//for m
...@@ -1112,25 +1112,25 @@ Py_XDECREF(filtersflipped); ...@@ -1112,25 +1112,25 @@ Py_XDECREF(filtersflipped);
def gen_conv_code_unroll_batch_kern(d,unloop_bsize=1, unloop_ksize=1): def gen_conv_code_unroll_batch_kern(d,unroll_bsize=1, unroll_ksize=1):
""" c_code for ConvOp that unroll the batch size loop """ c_code for ConvOp that unroll the batch size loop
""" """
d["unloop_bsize"]=unloop_bsize d["unroll_bsize"]=unroll_bsize
d["unloop_ksize"]=unloop_ksize d["unroll_ksize"]=unroll_ksize
def my_dup(st,size): def my_dup(st,size):
s="" s=""
for i in range(size): for i in range(size):
d["unloop_iter"]=i d["unroll_iter"]=i
s+=st%d s+=st%d
return s+"\n" return s+"\n"
def my_dup2(st): def my_dup2(st):
s="" s=""
iter=0 iter=0
for i in range(unloop_bsize): for i in range(unroll_bsize):
d["unloop_biter"]=i d["unroll_biter"]=i
for j in range(unloop_ksize): for j in range(unroll_ksize):
d["unloop_kiter"]=j d["unroll_kiter"]=j
d["unloop_iter"]=iter d["unroll_iter"]=iter
iter+=1 iter+=1
s+=st%d s+=st%d
return s+"\n" return s+"\n"
...@@ -1250,8 +1250,8 @@ if ((!%(z)s) ...@@ -1250,8 +1250,8 @@ if ((!%(z)s)
int Os[2]; int Os[2];
if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;} if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;}
else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;} else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;}
for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unloop_ksize)s){ for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){
//assertions //assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s; if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
...@@ -1259,13 +1259,13 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){ ...@@ -1259,13 +1259,13 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){
if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s; if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s; if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
"""%d """%d
ret+=my_dup2("%(type)s * __restrict__ out%(unloop_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unloop_biter)s,n_kern+%(unloop_kiter)s));") ret+=my_dup2("%(type)s * __restrict__ out%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unroll_biter)s,n_kern+%(unroll_kiter)s));")
ret+=my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unloop_iter)s[i] = 0;",unloop_bsize*unloop_ksize) ret+=my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unroll_iter)s[i] = 0;",unroll_bsize*unroll_ksize)
ret+=""" ret+="""
for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){ for(int stack_size=0;stack_size<%(self_imshp0)s;stack_size++){
"""%d """%d
ret+=my_dup("const %(type)s * __restrict__ in%(unloop_iter)d=(%(type)s *)(PyArray_GETPTR2(img2d,b+%(unloop_iter)s,stack_size));", unloop_bsize) ret+=my_dup("const %(type)s * __restrict__ in%(unroll_iter)d=(%(type)s *)(PyArray_GETPTR2(img2d,b+%(unroll_iter)s,stack_size));", unroll_bsize)
ret+=my_dup("const %(type)s * __restrict__ hvals%(unloop_iter)s=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern+%(unloop_iter)s,stack_size));",unloop_ksize) ret+=my_dup("const %(type)s * __restrict__ hvals%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(filtersflipped,n_kern+%(unroll_iter)s,stack_size));",unroll_ksize)
ret+=""" ret+="""
int new_m; int new_m;
...@@ -1277,7 +1277,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){ ...@@ -1277,7 +1277,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){
for (int n=0; n < Os[1]; n++) { // loop over columns for (int n=0; n < Os[1]; n++) { // loop over columns
"""%d """%d
ret+=my_dup("%(type)s sum%(unloop_iter)s=0;", unloop_bsize*unloop_ksize) ret+=my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize*unroll_ksize)
ret+=""" ret+="""
// Sum over kernel, if index into image is out of bounds // Sum over kernel, if index into image is out of bounds
...@@ -1287,13 +1287,13 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){ ...@@ -1287,13 +1287,13 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){
if(mode==FULL){ if(mode==FULL){
"""%d """%d
ret+=my_dup("const %(type)s * idx_hvals%(unloop_iter)s=&hvals%(unloop_iter)s[j*dim_ker[1]];",unloop_ksize) ret+=my_dup("const %(type)s * idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];",unroll_ksize)
ret+=""" ret+="""
if(ind0 < 0 || ind0 >= dim_im[0]){ if(ind0 < 0 || ind0 >= dim_im[0]){
if(fill_value!=0) if(fill_value!=0)
for (int k=0; k < dim_ker[1]; k++) { for (int k=0; k < dim_ker[1]; k++) {
"""%d """%d
ret+=my_dup2("sum%(unloop_iter)s += idx_hvals%(unloop_kiter)s[k] * fill_value;") ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
ret+=""" ret+="""
} }
}else{ }else{
...@@ -1304,7 +1304,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){ ...@@ -1304,7 +1304,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){
for(k=0;k<max_k;k++){ for(k=0;k<max_k;k++){
"""%d """%d
ret+=my_dup2("sum%(unloop_iter)s += idx_hvals%(unloop_kiter)s[k] * fill_value;") ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
ret+=""" ret+="""
} }
}else {k=max_k;} }else {k=max_k;}
...@@ -1312,41 +1312,41 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){ ...@@ -1312,41 +1312,41 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unloop_bsize)s){
//do the part where the kernel is on the img //do the part where the kernel is on the img
max_k=min(n+1,(int)dim_ker[1]); max_k=min(n+1,(int)dim_ker[1]);
"""%d """%d
ret+=my_dup("const %(type)s * idx_in%(unloop_iter)s=&in%(unloop_iter)s[ind0*dim_im[1]];", unloop_bsize) ret+=my_dup("const %(type)s * idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
ret+=""" ret+="""
for (int ind1=n-k; k<max_k; k++,ind1--) { for (int ind1=n-k; k<max_k; k++,ind1--) {
"""%d """%d
ret+=my_dup2("sum%(unloop_iter)s+= idx_hvals%(unloop_kiter)s[k] * idx_in%(unloop_biter)s[ind1];") ret+=my_dup2("sum%(unroll_iter)s+= idx_hvals%(unroll_kiter)s[k] * idx_in%(unroll_biter)s[ind1];")
ret+=""" ret+="""
} }
//do the part to the left of the img //do the part to the left of the img
if(fill_value!=0) if(fill_value!=0)
for(;k<dim_ker[1];k++){ for(;k<dim_ker[1];k++){
"""%d """%d
ret+=my_dup2("sum%(unloop_iter)s += idx_hvals%(unloop_kiter)s[k] * fill_value;") ret+=my_dup2("sum%(unroll_iter)s += idx_hvals%(unroll_kiter)s[k] * fill_value;")
ret+=""" ret+="""
} }
} }
}else{ }else{
"""%d """%d
ret+=my_dup("const %(type)s* idx_in%(unloop_iter)s=&in%(unloop_iter)s[ind0*dim_im[1]];", unloop_bsize) ret+=my_dup("const %(type)s* idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
ret+=my_dup("const %(type)s* idx_hvals%(unloop_iter)s=&hvals%(unloop_iter)s[j*dim_ker[1]];",unloop_ksize) ret+=my_dup("const %(type)s* idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];",unroll_ksize)
ret+=""" ret+="""
int new_n = (n+dim_ker[1]-1); int new_n = (n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) { for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
"""%d """%d
ret+=my_dup2("sum%(unloop_iter)s+=idx_hvals%(unloop_kiter)s[k]*idx_in%(unloop_biter)s[last];") ret+=my_dup2("sum%(unroll_iter)s+=idx_hvals%(unroll_kiter)s[k]*idx_in%(unroll_biter)s[last];")
ret+=""" ret+="""
} }
} }
}//for j }//for j
"""%d """%d
# ret+=my_dup("out%(unloop_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unloop_iter)s;", unloop_bsize) # ret+=my_dup("out%(unroll_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize)
ret+=my_dup("out%(unloop_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unloop_iter)s;", unloop_bsize*unloop_ksize) ret+=my_dup("out%(unroll_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize*unroll_ksize)
# ret+=my_dup("cout<<sum%(unloop_iter)s<<endl;",unloop_bsize) # ret+=my_dup("cout<<sum%(unroll_iter)s<<endl;",unroll_bsize)
ret+=""" ret+="""
}//for n }//for n
}//for m }//for m
......
...@@ -228,7 +228,7 @@ class TestConvOp(unittest.TestCase): ...@@ -228,7 +228,7 @@ class TestConvOp(unittest.TestCase):
kerns4=dmatrix4() kerns4=dmatrix4()
assert len(kshps)==len(nkerns)==len(kerns) assert len(kshps)==len(nkerns)==len(kerns)
def do_test(conv_mode, ss, unroll_batch=0, unroll_kern=0, img=img): def do_test(conv_mode, ss, unroll_batch=0, unroll_kern=0, img=img, validate=True):
# build actual input images # build actual input images
imgval = rng.rand(bsize, imshp_start[0], imshp_start[1], imshp_start[2]) imgval = rng.rand(bsize, imshp_start[0], imshp_start[1], imshp_start[2])
...@@ -261,14 +261,15 @@ class TestConvOp(unittest.TestCase): ...@@ -261,14 +261,15 @@ class TestConvOp(unittest.TestCase):
time1 = time.time() time1 = time.time()
outval = N.zeros(N.r_[bsize,outshp]) outval = N.zeros(N.r_[bsize,outshp])
val = _valfrommode(conv_mode) if validate:
bval = _bvalfromboundary('fill') val = _valfrommode(conv_mode)
for b in range(bsize): # loop over batches bval = _bvalfromboundary('fill')
for n in range(nkern): # loop over filters for b in range(bsize): # loop over batches
for i in range(imshp[0]): # loop over input feature maps for n in range(nkern): # loop over filters
outval[b,n,...] += _convolve2d(\ for i in range(imshp[0]): # loop over input feature maps
imgval[b,i,...], w_flip[n,i,...],1,val, bval, 0)[0::ss[0],0::ss[1]] outval[b,n,...] += _convolve2d(\
ntot += time.time() - time1 imgval[b,i,...], w_flip[n,i,...],1,val, bval, 0)[0::ss[0],0::ss[1]]
ntot += time.time() - time1
if do_theano: if do_theano:
####### test with new sp.convolve2 function ###### ####### test with new sp.convolve2 function ######
...@@ -288,7 +289,8 @@ class TestConvOp(unittest.TestCase): ...@@ -288,7 +289,8 @@ class TestConvOp(unittest.TestCase):
assert (N.abs(hidval-hidval1)<1e-5).all() assert (N.abs(hidval-hidval1)<1e-5).all()
temp = N.abs(outval.reshape(bsize,-1) - hidval) temp = N.abs(outval.reshape(bsize,-1) - hidval)
assert (temp < 1e-5).all() if validate:
assert (temp < 1e-5).all()
else: else:
hid = img #we don't need it, but it make the flow easier flow hid = img #we don't need it, but it make the flow easier flow
...@@ -313,8 +315,9 @@ class TestConvOp(unittest.TestCase): ...@@ -313,8 +315,9 @@ class TestConvOp(unittest.TestCase):
tpytot += time.time() - time1 tpytot += time.time() - time1
# assert (N.abs(hidval2-hidval3)<1e-5).all() # assert (N.abs(hidval2-hidval3)<1e-5).all()
temp = N.abs(outval - hidval2) if validate:
assert (temp < 1e-5).all() temp = N.abs(outval - hidval2)
assert (temp < 1e-5).all()
# temp = N.abs(outval - hidval3) # temp = N.abs(outval - hidval3)
# assert (temp < 1e-5).all() # assert (temp < 1e-5).all()
...@@ -323,22 +326,35 @@ class TestConvOp(unittest.TestCase): ...@@ -323,22 +326,35 @@ class TestConvOp(unittest.TestCase):
return tctot, tpytot, ntot return tctot, tpytot, ntot
if False: if True:
# calculate the speed up of different combination of unroll
# we don't validate the result to have it much faster!
validate=False
unroll_batch = [0,1,2,5,10] unroll_batch = [0,1,2,5,10]
unroll_kern = [0,1,2,5,10,20] unroll_kern = [0,1,2,5,10,20]
# calculate the speed up of different combination of unroll bsize = 10 # batch size
for unroll_b in unroll_batch: imshp_start = (1,50,49)#un square shape to test more corner case.
for unroll_k in unroll_kern: kshps = ([11,12],[12,11])#un square shape to test more corner case.
nkerns = [20,20] # per output pixel
ssizes = [(1,1),]#(1,1)]#(2,2) bugged
convmodes = ['valid','full']
do_theano=False
timing = N.zeros((len(unroll_batch),len(unroll_kern),3))
for unroll_b, n_b in zip(unroll_batch,range(len(unroll_batch))):
for unroll_k, n_k in zip(unroll_kern,range(len(unroll_kern))):
tctot, tpytot, ntot=[],[],[] tctot, tpytot, ntot=[],[],[]
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))): for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
for ss, n_ss in zip(ssizes,range(len(ssizes))): for ss, n_ss in zip(ssizes,range(len(ssizes))):
tctot_, tpytot_, ntot_ = do_test(conv_mode, ss,unroll_batch=unroll_b, unroll_kern=unroll_k) tctot_, tpytot_, ntot_ = do_test(conv_mode, ss,unroll_batch=unroll_b, unroll_kern=unroll_k, validate=validate)
tctot+=[tctot_] tctot+=[tctot_]
tpytot+=[tpytot_] tpytot+=[tpytot_]
ntot+=[ntot_] ntot+=[ntot_]
timing[n_b,n_k]=[sum(tctot), sum(tpytot), sum(ntot)]
print '**** Multilayer Convolution Profiling Results ****' print '**** Multilayer Convolution Profiling Results ****'
print 'unroll batch', unroll_b, 'unroll kern',unroll_k print 'unroll batch', unroll_b, 'unroll kern',unroll_k
print 'Numpy convolve2d processing time: %.3fs'%sum(ntot),ntot print 'Numpy convolve2d processing time: %.3fs'%sum(ntot),ntot
...@@ -346,6 +362,13 @@ class TestConvOp(unittest.TestCase): ...@@ -346,6 +362,13 @@ class TestConvOp(unittest.TestCase):
print 'py Theano(ConvOp) processing time: %.3fs'%sum(tpytot),tpytot print 'py Theano(ConvOp) processing time: %.3fs'%sum(tpytot),tpytot
d=N.asarray(ntot)/tctot d=N.asarray(ntot)/tctot
print 'speed up c theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d print 'speed up c theano(ConvOp) vs convolve2d: %.3f'%d.mean(),d
print timing
t=timing[:,:,0]
for b in unroll_batch:
for k in unroll_kern:
print b,"/",k," ",
print t
print "min", t.min(), "max", t.max(), "speedup", t.max()/t.min()
return return
for conv_mode, n_mode in zip(convmodes,range(len(convmodes))): for conv_mode, n_mode in zip(convmodes,range(len(convmodes))):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论