//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can end up in a situation where we lower the occupancy due to shared memory and/or register usage.
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){
//if we remove the restriction
//img_size_byte+kern_size_byte>8*1024, we can end up in a situation where
//we lower the occupancy due to shared memory and/or register usage.
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
out_contiguous&&
out_size<512&&//Maximum of 512 theads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)),out_size_byte*2)<shared_avail&&//their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version=7;//conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if(!subsample&&c_contiguous&&
(version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
(version==0||version==2||version==-1)&&
out_wid<512&&//Maximum of 512 theads by block
nstack==1&&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail&&//their is only 16k of shared memory
!work_complete)//conv_patch
{
intnb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we go through ceil_intdiv in case out_len is not a multiple of nb_split; we want nb_split to be the number of iterations.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
if(version==2&&out_len>1)nb_split++;//to force the use of split=true when testing.
//we go through ceil_intdiv in case out_len is not a multiple of nb_split; we want nb_split to be the number of iterations.
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
boolimg_batch_stack_contiguous=(img_stride_stack==img_stride_row*img_len)&&(img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memory
//if the lower 2 dims are c_contiguous but flipped, unflipping the
//stride and not flipping the kernel in shared memory
//allows us to use a version that uses fewer registers (so it is faster)
//the unflipped version of a variable has the original value when we don't need to unflip it, but has the new value when we unflip it.
//the unflipped version of a variable has the original value when
//we don't need to unflip it, but has the new value when we unflip it.
//we go through ceil_intdiv in case out_len is not a multiple of nb_split; we want nb_split to be the number of iterations.
//Max of 16k of shared memory
if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 registers
//version 5 will have only 1 block running at a time, so we can use 32 registers per thread, but there is some other stuff that forces the limit to be lower than 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more than 1 split as to be always execute.
assert(version!=3 || nb_split==1);
assert(version!=5 || kern_len>1);
assert(version!=-1);
if((version==4||version==5)&&out_len>1)nb_split++;//to force the use of split=true when testing.
if(kern_len==1&&version==5){
//version 5 doesn't support kern_len==1 as 1%0 returns -1.
version=-1;
if(verbose)fprintf(stderr,"WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
if(false&&!subsample&&//disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
(version==1||version==-1)&&
out_size<512&&//Maximum of 512 theads by block
(nbatch>20||version==1)&&// we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
nstack*img_size_byte+nstack*kern_size_byte<shared_avail&&//there is only 16k of shared memory