for (int col=0; col < kern_wid; col++) {//loop over col
sum+=data[col]*kern[col];
}
}
/**
 * Cooperatively fill the first N floats of dst with `value`.
 * The work is strided across nb_thread threads: the thread identified by
 * thread_id writes elements thread_id, thread_id+nb_thread, thread_id+2*nb_thread, ...
 * Callers must ensure every thread_id in [0, nb_thread) participates so the
 * whole range gets covered.
 */
__device__ void fill(float * dst, int N, float value, int thread_id, int nb_thread){
    int idx = thread_id;
    while (idx < N) {
        dst[idx] = value;
        idx += nb_thread;
    }
}
// Ceiling integer division: returns ceil(a / b) for integral T without
// going through floating point (adds 1 when the division has a remainder).
// NOTE(review): 'a %% b' is a doubled percent sign. That is only correct if
// this text is later substituted through a %%-style format string (as the
// surrounding %(kern_len)s placeholders suggest), where '%%' collapses to '%'.
// If this is raw CUDA source it should read 'a % b' -- confirm against the
// enclosing file before changing.
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a %% b) ? 1: 0);
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
 * I keep it separated from conv_patch as we take 19-20 registers, which is more than the 10/16 max for each thread, and thus this could lower the occupancy.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* each thread compute only one value for the output if split is true. Otherwise compute ceil((float)out_len/N) pixel.
* thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
* grid block size=batch_id, nkern
* dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
const int len_to_load=min(%(kern_len)s+nb_rows,%(img_len)s-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
const int empty_row = max(%(kern_len)s-1-out_row_iter*nb_rows,0);//number of empty row at the start
//we need to reload some rows, as when we move to the next out_row we lose the last load due to the stack.
const int previous_row = min(out_row_iter*nb_rows,%(kern_len)s-1);//number of row from last out_row iteration to reload
if True and self.subsample == (1, 1) and self.border_mode == 'full' and self.version in [3, 4, 5, -1] and out_dim_3 <= 512 and ((self.logical_kern_hw[0] + 2 * self.logical_kern_hw[0] - 2) * img_wid_padded * 4 + self.logical_kern_hw[0] * self.logical_kern_hw[1] * 4 < (16 * 1024 - 128)) and out_.dtype == 'float32' and kern_.dtype == 'float32' and img_.dtype == 'float32':  # -128 as this is the amount of shared memory used statically
return"""
CudaNdarray* img = %(img)s;
CudaNdarray* kern = %(kern)s;
CudaNdarray* out_ = %(out)s;
CudaNdarray* out = out_;
int version = %(version)s;
const int verbose = %(verbose)s;
if (!img || img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (! kern || kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
int out_dim[4]={CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((%(kern_len)s+ceil_intdiv(out_len,nb_split)-1)+2*%(kern_len)s-2)*%(img_wid_padded)s*sizeof(float) + kern_size_byte)>16*1024) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per thread, but there is some other stuff that forces the limit to be lower than 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more than 1 split, so that it is always executed.