out_size<max_threads_dim0&&//Maximum of X threads by block
out_size<=max_threads_dim0&&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)),out_size_byte*2)<shared_avail&&//there is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version=7;//conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
if(false&&!subsample&&//disabled as test fail for this kernel
(version==1||version==-1)&&
out_size<max_threads_dim0&&//Maximum of X threads by block
out_size<=max_threads_dim0&&//Maximum of X threads by block
(nbatch>20||version==1)&&// we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail&&//there is only 16k of shared memory