//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter a condition where we will lower the occupancy due to shared memory and/or registers.
//if we remove the restriction
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){
//img_size_byte+kern_size_byte>8*1024, we can enter in condition where
//we will lower the occupancy due to shared memory and/or registers.
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
std::max(int(img_size_byte+2*kern_wid*sizeof(float)),out_size_byte*2)<shared_avail&&//their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
version=7;//conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
}
if(!subsample&&c_contiguous&&
if(!subsample&&c_contiguous&&
(version==0||version==2||version==-1) &&
(version==0||version==2||version==-1)&&
out_wid<512 &&//Maximum of 512 theads by block
out_wid<512&&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel.
nstack==1&&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
img_size_byte+kern_size_byte<shared_avail&&//their is only 16k of shared memory
!work_complete) //conv_patch
!work_complete)//conv_patch
{
{
intnb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
intnb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
if(version==2&&out_len>1)nb_split++;//to force the use of split=true when testing.
//we go through ceil_intdiv in case out_len is not a multiple of nb_split; we want nb_split to be the number of iterations.
//we go through ceil_intdiv in case out_len is not a multiple of nb_split; we want nb_split to be the number of iterations.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
boolimg_batch_stack_contiguous=(img_stride_stack==img_stride_row*img_len)&&(img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
boolimg_batch_stack_contiguous=(img_stride_stack==img_stride_row*img_len)&&(img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memory
//if the lower 2 dims are c_contiguous but flipped, unflipping the
//stride and not flipping the kernel in shared memory
//allows using a version that uses fewer registers (so it is faster)
//allows using a version that uses fewer registers (so it is faster)
//the unflipped version of variable have the original value when we don't need to unflip it, but have the new value when we unflip it.
//the unflipped version of variable have the original value when
//we don't need to unflip it, but have the new value when we unflip it.
//version 5 will have only 1 block running at a time, so we can use 32 registers per thread, but there is some other stuff that forces the limit to be lower than 512.
//327 as we use 25 register
int max_thread = (version!=5?327:450);
//version 5 will have only 1 block running at a time, so we
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
//can use 32 registers per thread, but there is some other stuff that
if(false&&!subsample&&//disabled as test fail for this kernel
if(false&&!subsample&&//disabled as test fail for this kernel
(version==1||version==-1) &&
(version==1||version==-1)&&
out_size<512 &&//Maximum of 512 theads by block
out_size<512&&//Maximum of 512 theads by block
(nbatch>20||version==1)&&// we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
(nbatch>20||version==1)&&// we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
nstack*img_size_byte+nstack*kern_size_byte<shared_avail&&//there is only 16k of shared memory