out_size<=max_threads_dim0&&//Maximum of X threads by block
out_size<=max_threads_dim0&&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)),out_size_byte*2)<shared_avail&&//there is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
std::max(int(img_size_byte+2*kern_wid*sizeof(float)),out_size_byte*2)<shared_avail&&//there is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
!work_complete)
version=7;//conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
version=7;//conv_patch_stack_reduce, switch to version 8/13 automatically if needed.