//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter a condition where we lower the occupancy due to shared memory and/or registers.
//if we remove the restriction
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){
//img_size_byte+kern_size_byte>8*1024, we can enter a condition where
//we lower the occupancy due to shared memory and/or registers.
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
boolimg_batch_stack_contiguous=(img_stride_stack==img_stride_row*img_len)&&(img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
boolimg_batch_stack_contiguous=(img_stride_stack==img_stride_row*img_len)&&(img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memory
//if the lower 2 dims are c_contiguous but flipped, unflipping the
//stride and not flipping the kernel in shared memory
//allows using a version that uses fewer registers (so it is faster)
//allows using a version that uses fewer registers (so it is faster)
//the unflipped version of a variable has the original value when we don't need to unflip it, but has the new value when we do unflip it.
//the unflipped version of a variable has the original value when
//we don't need to unflip it, but has the new value when we do unflip it.
//version 5 will have only 1 block running at a time, so we can use 32 registers per thread, but there is some other stuff that forces the limit to be lower than 512.
//version 5 will have only 1 block running at a time, so we
//can use 32 registers per thread, but there is some other stuff that