out_size<=max_threads_dim0&&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)),out_size_byte*2)<shared_avail&&//there is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version=7;//conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if(!subsample&&c_contiguous&&
(version==0||version==2||version==-1)&&
out_wid<=max_threads_dim0&&//Maximum of X threads for block.x
nstack==1&&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail&&//there is only 16k of shared memory
!work_complete)//conv_patch
{
intnb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2&&out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case out_len is not a multiple of nb_split; we want nb_split to be the number of iterations.
boolimg_batch_stack_contiguous=(img_stride_stack==img_stride_row*img_len)&&(img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the
//stride and not flipping the kernel in shared memory
//allows using a version that uses fewer registers (so it is faster)
//the unflipped version of variable have the original value when
//we don't need to unflip it, but have the new value when we unflip it.
if(false&&!subsample&&//disabled as test fail for this kernel
(version==1||version==-1)&&
out_size<=max_threads_dim0&&//Maximum of X threads by block
(nbatch>20||version==1)&&// we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail&&//there is only 16k of shared memory
//template c_contiguous: if true, the img and kern are column and row contiguous, else we use the stride values from the params. The image needs to be c_contiguous in the nbatch and nstack dimensions.
//Needed as not all thread finish at the same time the loop
//And we don't want to overwrite the shared memory.
__syncthreads();
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col]=sum;
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 registers which is more than the 10/16 max for each thread and thus this could lower the occupancy.
* Implementation of the valid convolution that keeps the full image and the full kernel in shared memory
* each thread computes only one value of the output if split is true. Otherwise it computes ceil((float)out_len/N) pixels.
* thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
* grid block size=batch_id, nkern
* dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template img_contiguous: if true, the img is column and row contiguous
* template preload_full_kern: works only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel rows at a time
sum+=sum_;//We pass by an intermediate variable to have more precision.
}
}
}
__syncthreads();
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col]=sum;
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
*
* This implementation uses shared memory for the reduce, so it is limited by the product of stacklen x kern_len
*
* template stack_loop: if true, we accept that blockDim.x < nstack and we add a loop for this (uses 3 more registers, so lower occupancy when true, but accepts nstack*kern_len>512)