//template c_contiguous: if true, the img and kern are column and row contiguous; else we use the stride values from the param. The image needs to be c_contiguous in the nbatch and nstack dimensions.
for (; col < max_col; col++, img_col++) {//loop over col
sum+=idx_in[col]*idx_kern[col];
}
}
//Needed as not all threads finish the loop at the same time
//and we don't want to overwrite the shared memory.
__syncthreads();
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum;
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 registers, which is more than the 10/16 max for each thread and thus this could lower the occupancy.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* each thread computes only one value for the output if split is true; otherwise it computes ceil((float)out_len/N) pixels.
* thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
* grid block size=batch_id, nkern
* dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template img_contiguous: if true, the img is column and row contiguous
* template preload_full_kern: works only when split is true. We don't load the full kernel at once; instead we load ceil_intdiv(kern_len/nb_split) kernel rows at a time
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
*
* This implementation uses shared memory for the reduce, so it is limited by the product of stacklen x kern_len
*
* template stack_loop: if true, we accept that blockDim.x < nstack and we add a loop for this (uses 3 more registers, so lower occupancy when true, but accepts nstack*kern_len > 512)
* TODO: explain parameters, preconditions
*/
template<bool stack_loop>
__global__ void
conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols,
const int initial_reduce_boundary)
{
const int outsize = nB * nK * out_len * out_wid;
extern __shared__ float reducebuf[];
for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
{
//figure out what output element we're in charge of computing
int ii = i;
int iB = ii % nB; // output batch index
ii = ii / nB;
int iK = ii % nK; // output kernel index
ii = ii / nK;
int iR_physical = ii % out_len; //output kernel row
int iC_physical = ii / out_len; // output kernel column
int iR_logical = iR_physical * subsample_rows;
int iC_logical = iC_physical * subsample_cols;
int ss = threadIdx.x;
int rr = threadIdx.y;
int img_rr = iR_logical + kern_len - 1 - rr;
int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y;