Commit 0dae8f43 authored by Frederic Bastien

move the convolution from cuda_ndarray to the GpuConvOp.

Parent: ccc01a40
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar
import StringIO
import StringIO, os
import cuda_ndarray.cuda_ndarray as cuda
from theano.sandbox.cuda.type import CudaNdarrayType
......@@ -186,626 +186,57 @@ class GpuConv(Op):
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0], False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def perform(self, node, (img, kern), (out,)):
out[0] = cuda.conv(img, kern,
mode=self.border_mode,
out=out[0],
subsample=self.subsample,
logical_img_shape=self.logical_img_hw,
logical_kern_shape=self.logical_kern_hw,
kern_align=self.logical_kern_align_top,
version=self.version,
verbose=self.verbose)
# Old-version support-code generator: emits the CUDA source as a Python
# %-template specialized on the logical image/kernel shapes.
def c_support_code_apply(self, node, nodename):
# Without compile-time logical shapes we cannot specialize the CUDA
# source, so defer to the generic parent implementation.
if self.logical_img_hw is None or self.logical_kern_hw is None:
return super(GpuConv,self).c_support_code_apply(node, nodename)
# (rows, cols) ordering: index 0 is the length (rows), index 1 the width.
img_wid = self.logical_img_hw[1]
img_len = self.logical_img_hw[0]
kern_wid=self.logical_kern_hw[1]
kern_len=self.logical_kern_hw[0]
# The string below is CUDA source; %(...)s placeholders are substituted
# by the %locals() expansion at the end of the template.
return"""
// Mask that rounds a pointer down to a 256-byte boundary.
const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
// Offset (in floats) of `src` from its preceding 256-byte boundary.
#define MASKED_OFFSET(src) (((int)((unsigned long int)src - (((unsigned long int)src) & COALESCED_ALIGN))) / sizeof(float))
// Copy N contiguous floats from global `src` into shared `dst`, optionally
// reversing element order (`flipped`).  Work is strided over the block's
// threads; callers must __syncthreads() before reading `dst`.
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
if (nb_thread < 64)
{
if(flipped)
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped.
for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=src[N - 1 - i];
//dst[N-1-i]=src[i];
else
for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=src[i];
}
else
{
nb_thread = nb_thread & 0xFFFFFFE0; //make nb_thread a multiple of 32
// Global memory:
// <-------------------------------------->
// A A A A A // points of 128-bit alignment
// dddddddddddddddddddddd // layout of src in global memory
// |--| // masked_src_offset
//
if (thread_id < nb_thread)
{
// Shift the iteration space so each pass starts on an aligned
// address, then skip the iterations falling before `src` (i < 0).
const int masked_src_offset = MASKED_OFFSET(src);
for(int masked_i=thread_id; masked_i<N + masked_src_offset; masked_i+=nb_thread)
{
int i = masked_i - masked_src_offset;
if (i >= 0)
if (flipped)
dst[N-1-i] = src[i];
else
dst[i]=src[i];
}
}
}
}
/*
* We load from global memory to shared memory. The outer if is optimized away at compilation.
*/
// 2D variant: copies an nb_row x nb_col patch with arbitrary row/column
// strides (strides are in float elements).  NOTE(review): given the first
// two branches, the later `else if(flipped)` and `else if(c_contiguous)`
// branches are unreachable (every flipped/c_contiguous combination is
// already covered above), and their inline comments describe the opposite
// of the actual condition.  `%%` is a literal `%` once the enclosing
// Python template is expanded.
__device__ void load_to_shared(float * dst, const float * src, const int thread_id,
int nb_thread, const int nb_col, const int nb_row,
const int stride_col, const int stride_row,
const bool flipped=false, const bool c_contiguous=true){
if(flipped && ! c_contiguous){
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
dst[nb_row*nb_col-1-i]=src[i/nb_col*stride_row+i%%nb_col*stride_col];
}else if(c_contiguous){
// Contiguous case: defer to the 1D copy above.
load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
}else if(flipped){//c_contiguous==true
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped.
int N=nb_col*nb_row;
for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=src[N - 1 - i];
//dst[N-1-i]=src[i];
}else if(c_contiguous){//flipped==false
for(int i=thread_id;i<nb_col*nb_row;i+=nb_thread)
dst[i]=src[i];
}else{ // !flipped && !c_contiguous
/*
for(int i=thread_id;i<nb_row;i+=nb_thread){
float* s=&src[i*stride_row];
float* d=&dst[i*nb_col];
for(int j=thread_id;j<nb_col;i+=nb_thread)
// dst[i*nb_col+j]=src[i*stride_row+j*stride_col];//dst[i]=src[i];
d[j]=s[j*stride_col];
}*/
/* We don't do this if as nvcc 2.3 take 2 more registers when we add the if
Why it do this?
if(stride_col==1 && stride_row==nb_col)
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
dst[i]=src[i];
else*/
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
dst[i]=src[i/nb_col*stride_row+i%%nb_col*stride_col];
}
}
// Copy an nb_row x nb_col patch into shared memory, leaving `wid_pad`
// zero columns of padding on each side of every row (`dst` rows are
// nb_col + 2*wid_pad wide; padding itself is filled elsewhere).
__device__ void load_padded_col_to_shared(float * dst, const float * src,
const int thread_id, const int nb_thread,
const int nb_col, const int nb_row,
const int stride_col, const int stride_row,
const int wid_pad, const bool c_contiguous=true){
if(c_contiguous){//flipped==false
//template nb_col to have better performance!
// NOTE(review): this incremental row/col tracking looks wrong on the
// first iteration — `col-=nb_col` runs unconditionally, so for
// thread_id < nb_col the computed col goes negative, and the `while`
// guard should presumably be `>=`.  The commented-out div/mod version
// below computes the intended indices.  TODO confirm against history.
int row=0;
int col=thread_id;
for(int i=thread_id;i<nb_col*nb_row;i+=nb_thread, col+=nb_thread){
col-=nb_col;row++;
while(col>nb_col){
col-=nb_col;row++;
}
dst[row*(nb_col+2*wid_pad)+col+wid_pad]=src[i];
}
/*
for(int i=thread_id;i<nb_col*nb_row;i+=nb_thread){
int col=i%%nb_col;
int row=i/nb_col;
dst[row*(nb_col+2*wid_pad)+col+wid_pad]=src[i];
}
*/
}else{
// Strided source: recover (row, col) with div/mod and apply strides.
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread){
int col=i%%nb_col;
int row=i/nb_col;
dst[row*(nb_col+2*wid_pad)+col+wid_pad]=src[row*stride_row+col*stride_col];
}
}
}
// Compile-time-unrolled dot product of `i` elements via template
// recursion; the <0> specialization terminates the recursion.
template<int i> __device__ float convolutionRowNoFlip(const float *data,
const float *kern){
return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern);
}
template<> __device__ float convolutionRowNoFlip<0>(const float *data,
const float *kern){
return 0;
}
// Accumulate one kernel row into `sum`.  When KERN_WIDTH > 0 the width is
// a compile-time constant and the fully unrolled template above is used;
// otherwise fall back to a runtime loop over `kern_wid`.
template<int KERN_WIDTH>
__device__ void convolutionRowNoFlip(float& sum,
const float *data,
const float *kern, const int kern_wid){
if(KERN_WIDTH>0)
sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern);
else
#pragma unroll 8
for (int col=0; col < kern_wid; col++) {//loop over col
sum+=data[col]*kern[col];
}
}
// Set N floats of `dst` to `value`, work strided over the block's threads.
__device__ void fill(float * dst, int N, float value, int thread_id, int nb_thread){
for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=value;
}
// Ceiling integer division.  `%%` is a literal `%` after the enclosing
// Python template is expanded.
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a %% b) ? 1: 0);
}
/**
 * As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
 * I keep it separated from conv_patch as we take 19-20 register which is more then the 10/16 max for each thread and thus this could lower the occupency.
 * Implementation of the valid convolution that keep the full image and the full kernel in shared memory
 * each thread compute only one value for the output if split is true. Otherwise compute ceil((float)out_len/N) pixel.
 * thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
 * grid block size=batch_id, nkern
 * dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
 * dynamic shared memory: low mem:((kern_len+nb_row-1)+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
 *
 * nkern: the number of kernel, used to compute the output image to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template c_contiguous: if true, the image and kernel have are c_contiguous.(use less registers)
 * template split: if true, each thread compute more then 1 output pixel.
 * template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
 * if you set split and low_mem to true, we will use the low_mem version!
 */
// NOTE: %(img_len)s, %(img_wid)s, %(kern_len)s and %(kern_wid)s below are
// Python-template placeholders, replaced by compile-time constants when
// this source string is expanded with %locals().
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void
conv_full_patch_stack_padded( float* img, float* kern, float* out,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
// All threads write the same values into these shared scalars, so the
// unsynchronized stores are benign.
int __shared__ out_len, out_wid, nb_thread_id;
out_len = %(img_len)s + %(kern_len)s - 1;
out_wid = %(img_wid)s + %(kern_wid)s - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
__shared__ int batch_id, kern_id, img_wid_valid, nb_rows;
batch_id = blockIdx.x;
kern_id = blockIdx.y;
nb_rows = blockDim.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
int out_col = tx;//output col
const int thread_id = ty*blockDim.x + tx;
// Shared-memory layout: kernel first, then the (padded) image patch.
float * d_kern=&s_data[0];//size of [KERNEL_LEN * KERNEL_WID];
float * d_img=&s_data[%(kern_len)s*%(kern_wid)s];//size of [see fct doc];
kern+=kern_stride_nkern*kern_id;//the good nkern
img+=img_stride_batch*batch_id;//the good batch
img_wid_valid=%(img_wid)s+2*%(kern_wid)s-2;
// Branch 1 (version 3): no split — one output pixel per thread.
if(!split && !low_mem){
fill(d_img,img_wid_valid*(%(img_len)s+2*%(kern_len)s-2), 0, thread_id, nb_thread_id);
const int out_row = ty;//output row
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){
__syncthreads();
load_padded_col_to_shared(d_img+img_wid_valid*(%(kern_len)s-1),img,
thread_id,nb_thread_id,%(img_wid)s,%(img_len)s,
img_stride_col, img_stride_row, %(kern_wid)s-1,
c_contiguous);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, %(kern_wid)s,%(kern_len)s,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for (int row=0; row < %(kern_len)s; row++) {//loop over row
const float* idx_kern=&d_kern[row*%(kern_wid)s];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, %(kern_wid)s);
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
// Branch 2 (version 4): split — each thread computes several output rows.
}else if(split && !low_mem){
fill(d_img,img_wid_valid*(%(img_len)s+2*%(kern_len)s-2), 0, thread_id, nb_thread_id);
//out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len.
__shared__ int out_len_max;
//TODO pass a parameter nb_split
out_len_max = (out_len/blockDim.y+(out_len %% blockDim.y==0?0:1))*blockDim.y;
for(int out_row = ty;out_row<out_len_max;out_row+=nb_rows){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
__syncthreads();
//TODO: load only the part of the image needed or put the partial result in shared memory
load_padded_col_to_shared(d_img+img_wid_valid*(%(kern_len)s-1),
img+img_stride_stack*stack,
thread_id,nb_thread_id,%(img_wid)s,%(img_len)s,
img_stride_col, img_stride_row, %(kern_wid)s-1,
c_contiguous);
load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, %(kern_wid)s,%(kern_len)s,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for (int row=0; row < %(kern_len)s; row++) {//loop over row
const float* idx_kern=&d_kern[row*%(kern_wid)s];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, %(kern_wid)s);
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum;
}
}
// Branch 3 (version 5): split + low memory — only a sliding window of
// image rows is resident in shared memory at a time.
}else{//low_mem version
//don't need to fill the last rows padding as this is done later.
fill(d_img,img_wid_valid*((%(kern_len)s+nb_rows-1)+2*%(kern_len)s-2), 0, thread_id, nb_thread_id);
//out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len.
__shared__ int out_len_max;
//TODO pass a parameter nb_split
if(thread_id==0)
out_len_max = (out_len/nb_rows+(out_len %% nb_rows==0?0:1))*nb_rows;
__syncthreads();
for(int out_row = ty, out_row_iter=0;out_row<out_len_max;
out_row+=nb_rows, out_row_iter++){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
__syncthreads();
const int len_to_load=min(%(kern_len)s+nb_rows,%(img_len)s-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
const int empty_row = max(%(kern_len)s-1-out_row_iter*nb_rows,0);//number of empty row at the start
//we need to reload some row as when we change of out_row we lost the last load du to the stack.
const int previous_row = min(out_row_iter*nb_rows,%(kern_len)s-1);//number of row from last out_row iteration to reload
load_padded_col_to_shared(d_img+(%(kern_len)s-1-previous_row)*img_wid_valid,
img+img_stride_stack*stack//the good stack image
+(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row.
thread_id,nb_thread_id,%(img_wid)s,
len_to_load+previous_row,
img_stride_col, img_stride_row, %(kern_wid)s-1,
c_contiguous);
__syncthreads();
//TODO: fill the last row padding only when needed.
//We always fill the last rows padding event when not needed.
int row_to_fill = 2*%(kern_len)s-2+nb_rows- empty_row - previous_row - len_to_load;
row_to_fill = min(row_to_fill,%(kern_len)s-1);
fill(d_img+(%(kern_len)s-1+len_to_load)*img_wid_valid,
img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id);
load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, %(kern_wid)s,%(kern_len)s,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for (int row=0; row < %(kern_len)s; row++) {//loop over row
const float* idx_kern=&d_kern[row*%(kern_wid)s];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, %(kern_wid)s);
}
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum;
}
}
}
# NOTE(review): the two defs below belong to the new revision of this file;
# the surrounding CUDA text is from the old revision (diff interleaving).
def c_compile_args(self):
# Disable the per-kernel-width loop-unrolling specializations in the
# bundled .cu sources.
return ['-DDONT_UNROLL']
def c_headers(self):
# Headers required by the generated C code.
return ['cuda_ndarray.cuh','<stdio.h>']
// Function-pointer table over the instantiations of
// conv_full_patch_stack_padded.  Template arguments are
// <flipped_kern, KERN_WIDTH, c_contiguous, split, low_mem>; the names
// follow f[_contig]_<version>[_flipped], where versions 3/4/5 stand for
// no-split / split / split+low-mem.  %(kern_wid)s is a Python-template
// placeholder for the compile-time kernel width.
void (*f_contig_3_flipped)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<true,%(kern_wid)s,true,false,false>;
void (*f_contig_4_flipped)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<true,%(kern_wid)s,true,true,false>;
void (*f_contig_5_flipped)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<true,%(kern_wid)s,true,false,true>;
void (*f_3_flipped)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<true,%(kern_wid)s,false,false,false>;
void (*f_4_flipped)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<true,%(kern_wid)s,false,true,false>;
void (*f_5_flipped)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<true,%(kern_wid)s,false,false,true>;
void (*f_contig_3)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<false,%(kern_wid)s,true,false,false>;
void (*f_contig_4)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<false,%(kern_wid)s,true,true,false>;
void (*f_contig_5)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<false,%(kern_wid)s,true,false,true>;
void (*f_3)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<false,%(kern_wid)s,false,false,false>;
void (*f_4)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<false,%(kern_wid)s,false,true,false>;
void (*f_5)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int)=conv_full_patch_stack_padded<false,%(kern_wid)s,false,false,true>;
# New-revision replacement of c_support_code_apply: the CUDA support code
# now lives in .cu files shipped next to this module and is concatenated
# verbatim.
def c_support_code_apply(self, node, nodename):
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv_full_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv.cu')).read()
# Closing of the old revision's template string (diff residue).
"""%locals()
# Generate the C code that performs the GPU convolution for this Apply node.
# NOTE(review): in this captured diff view the body interleaves lines from
# two revisions (there are two C-template openings below, and the C code
# mixes the old inline argument checks with the new mode-string parsing and
# CudaNdarray_Conv call).  It is kept byte-identical here and only annotated.
def c_code(self, node, nodename, (img, kern), (out,), sub):
out_=node.outputs[0]
img_=node.inputs[0]
kern_=node.inputs[1]
subsample_rows=self.subsample[0]
subsample_cols=self.subsample[1]
version=self.version
verbose=self.verbose
# Without compile-time logical shapes, defer to the generic parent code.
if self.logical_img_hw is None or self.logical_kern_hw is None:
return super(GpuConv,self).c_code(node,nodename,(img, kern), (out,),sub)
#todo assert out is ccontiguous
img_wid = self.logical_img_hw[1]
img_len = self.logical_img_hw[0]
kern_wid = self.logical_kern_hw[1]
kern_len=self.logical_kern_hw[0]
# Padded-image dimensions used by the full-convolution kernels.
img_wid_padded=self.logical_img_hw[1]+2*self.logical_kern_hw[1]-2;
img_len_padded=self.logical_img_hw[0]+2*self.logical_kern_hw[0]-2;
img_size_padded=img_len_padded * img_wid_padded;
out_dim_2, out_dim_3 = self.logical_output_shape_2d(self.logical_img_hw,self.logical_kern_hw,self.border_mode)
dx = self.subsample[0]
dy = self.subsample[1]
border_mode = self.border_mode
version = self.version
verbose = self.verbose
# Template substitutions come from `sub` (fail label, etc.) plus locals.
sub = sub.copy()
sub.update(locals())
return """
//Mandatory args
const char *mode_str = "%(border_mode)s";
fail=sub['fail']
if False and self.subsample==(1,1) and self.border_mode=='full' and self.version in [3,4,5,-1] and out_dim_3<=512 and ((self.logical_kern_hw[0]+2*self.logical_kern_hw[0]-2)*img_wid_padded*4 + self.logical_kern_hw[0]*self.logical_kern_hw[1]*4<(16*1024-128)) and out_.dtype=='float32' and kern_.dtype=='float32' and img_.dtype=='float32':#-128 as this is the number of shared memory used statically
return """
//Optional args
int version = %(version)s;
int verbose = %(verbose)s;
int dx = %(dx)s;
int dy = %(dy)s;
CudaNdarray* img = %(img)s;
CudaNdarray* kern = %(kern)s;
CudaNdarray* out_ = %(out)s;
CudaNdarray* out = out;
int version = %(version)s;
const int verbose = %(verbose)s;
// NOTE(review): from here down, old-revision argument checks and
// new-revision mode parsing are interleaved by the diff.
if (!img || img->nd != 4)
int mode;
if (strcmp(mode_str, "full") == 0)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
mode = ConvMode_FULL;
}
if (! kern || kern->nd != 4)
else if (strcmp(mode_str, "valid") == 0)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
int out_dim[4]={CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
%(out_dim_2)s, %(out_dim_3)s};
// Reallocate the output unless it already has the right 4D c-contiguous shape.
if(!(out && out->nd==4 && CudaNdarray_is_c_contiguous(out)
&& CudaNdarray_HOST_DIMS(out)[0]==out_dim[0]
&& CudaNdarray_HOST_DIMS(out)[1]==out_dim[1]
&& CudaNdarray_HOST_DIMS(out)[2]==out_dim[2]
&& CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])){
if (out)
{
Py_DECREF(out);
fprintf(stderr, "Warning: Conv is ignoring 'out' argument with wrong structure.\\n");
}
out = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
%(out)s = out;
mode = ConvMode_VALID;
}
if (! out || out->nd != 4)
else
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (%(subsample_rows)s==1 && %(subsample_cols)s==1)
{
//TODO: rethink these asserts in light of the difference between physical and logical dimensions
assert (CudaNdarray_HOST_DIMS(out)[2] == CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1);
assert (CudaNdarray_HOST_DIMS(out)[3] == CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1);
}
assert (CudaNdarray_HOST_DIMS(out)[0] == CudaNdarray_HOST_DIMS(img)[0]);
assert (CudaNdarray_HOST_DIMS(out)[1] == CudaNdarray_HOST_DIMS(kern)[0]);
assert (CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1]);
//TODO: make separate version as if all fill this is slower.
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
// Cache dimensions and strides (strides are in float elements).
const int nstack=CudaNdarray_HOST_DIMS(kern)[1];
const int nbatch=CudaNdarray_HOST_DIMS(img)[0];
const int nkern=CudaNdarray_HOST_DIMS(kern)[0];
const int img_wid=%(img_wid)s;
const int img_len=%(img_len)s;
const int kern_wid=%(kern_wid)s;
const int kern_len=%(kern_len)s;
const int out_wid=CudaNdarray_HOST_DIMS(out)[3];
const int out_len=CudaNdarray_HOST_DIMS(out)[2];
const int img_stride_col= CudaNdarray_HOST_STRIDES(img)[3];
const int img_stride_row=CudaNdarray_HOST_STRIDES(img)[2];
const int img_stride_stack= CudaNdarray_HOST_STRIDES(img)[1];
const int img_stride_batch=CudaNdarray_HOST_STRIDES(img)[0];
const int kern_stride_col= CudaNdarray_HOST_STRIDES(kern)[3];
const int kern_stride_row=CudaNdarray_HOST_STRIDES(kern)[2];
const int kern_stride_stack= CudaNdarray_HOST_STRIDES(kern)[1];
const int kern_stride_nkern=CudaNdarray_HOST_STRIDES(kern)[0];
const int img_size=img_len*img_wid;
const int kern_size=%(kern_len)s*%(kern_wid)s;
const int out_size=out_len*out_wid;
const int img_size_byte = img_size*sizeof(float);
const int kern_size_byte = kern_size*sizeof(float);
const int out_size_byte = out_size*sizeof(float);
bool subsample = %(subsample_rows)s!=1 || %(subsample_cols)s!=1;
bool img_contiguous = CudaNdarray_is_c_contiguous(img);
bool kern_contiguous = CudaNdarray_is_c_contiguous(kern);
bool out_contiguous = CudaNdarray_is_c_contiguous(out);
bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==%(kern_wid)s);
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster)
//the unflipped version of variable haev the original value when we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata;
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
// Select and launch a conv_full_patch_stack_padded variant when the
// shared-memory and thread-count constraints allow it.
if (!subsample &&
out_contiguous &&
(version==3||version==4||version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
(%(kern_len)s+2*%(kern_len)s-2)*%(img_wid_padded)s*sizeof(float) + kern_size_byte<16*1024 //their is only 16k of shared memory
) //conv_full_patch_stack_padded
{
//version 3 without split
//version 4 with split (more registers)
//version 5 with split (more registers) low mem version(some restriction and still more register)
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing.
if(%(kern_len)s==1 && version==5){
//version 5 don't support %(kern_len)s==1 as 1%%0 return -1.
version=-1;
if(verbose)printf("WARNING:conv full: Asking version 5 with %(kern_len)s==1. Combination not supported!\\n");
}
if(%(img_size_padded)s*4+kern_size_byte>16*1024) version=5;
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((%(kern_len)s+ceil_intdiv(out_len,nb_split)-1)+2*%(kern_len)s-2)*%(img_wid_padded)s*sizeof(float) + kern_size_byte)>16*1024) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per threads, but their is some other stuff that for the limit to bu lower then 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more then 1 split as to be always execute.
assert(version!=3 || nb_split==1);
assert(version!=5 || %(kern_len)s>1);
assert(version!=-1);
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch,nkern);
int shared_size=%(img_size_padded)s*4 + kern_size_byte;
if(version==5)
shared_size=((%(kern_len)s+threads.y-1)+2*%(kern_len)s-2)*%(img_wid_padded)s*sizeof(float) + kern_size_byte;
// Pick the instantiation matching contiguity / version / flip.
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=f_contig_3_flipped;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=f_contig_4_flipped;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=f_contig_5_flipped;
else if(version==3 && kern_flipped) f=f_3_flipped;
else if(version==4 && kern_flipped) f=f_4_flipped;
else if(version==5 && kern_flipped) f=f_5_flipped;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=f_contig_3;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=f_contig_4;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=f_contig_5;
else if(version==3) f=f_3;
else if(version==4) f=f_4;
else if(version==5) f=f_5;
else assert(false);
f<<< grid, threads, shared_size>>>
(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i,shared_size=%%i, nb_threads=%%i, out_len=%%i, nb_split=%%i, version=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, out_len, nb_split, version);
if (verbose) printf("INFO: used 'conv_full_patch_stack_padded' nb_split=%%d low_mem=%%s\\n",nb_split,(version==5?"true":"false"));
}
else
{
if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i,shared_size=%%i, nb_threads=%%i, out_len=%%i, nb_split=%%i, version=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, out_len, nb_split, version);
if (verbose) printf("INFO: impl 'conv_full_patch_stack_padded' %%s %%s failed (%%s), trying next implementation\\n",
version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts));
//TODO: raise an error!
PyErr_Format(PyExc_RuntimeError, "INFO: impl 'conv_full_patch_stack_padded' %%s %%s failed (%%s), trying next implementation\\n",
version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts));
%(fail)s;
}
PyErr_SetString(PyExc_ValueError, "mode must be one of 'full' or 'valid'");
return NULL;
}
// New-revision path: delegate the whole convolution to CudaNdarray_Conv.
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, %(out)s,
mode, dx, dy, version, verbose);
if(%(out)s && %(out)s==out2)
Py_DECREF(out2);//CudaNdarray_Conv incremented the count to out
%(out)s = out2;
"""%sub
"""%locals()
else:
super(GpuConv,self).c_code(node,nodename,(img, kern), (out,),sub)
class GpuDownsampleFactorMax(Op):
def __init__(self, ds, ignore_border=False):
......
// Border modes accepted by the C convolution entry point.
enum { ConvMode_FULL, ConvMode_VALID };
// Forward declaration of the host-side convolution entry point
// (definition not visible in this chunk).
PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);
// One-shot flags — presumably so each per-implementation "kernel width"
// message is printed only once per process; TODO confirm at their use sites.
bool msgdisplayed_conv_patch__kern_width = false;
bool msgdisplayed_conv_patch_stack__kern_width = false;
bool msgdisplayed_conv_rows__kern_width = false;
bool msgdisplayed_conv_rows_stack__kern_width = false;
bool msgdisplayed_conv_rows_stack2__kern_width = false;
bool msgdisplayed_conv_patch_stack_reduce__kern_width = false;
bool msgdisplayed_conv_full_patch_stack__kern_width = false;
/*
* version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation
*/
int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0)
{
int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
if (img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (out->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (subsample_rows==1 && subsample_cols==1)
{
//TODO: rethink these asserts in light of the difference between physical and logical dimensions
assert (CudaNdarray_HOST_DIMS(out)[2] == CudaNdarray_HOST_DIMS(img)[2] - CudaNdarray_HOST_DIMS(kern)[2] + 1);
assert (CudaNdarray_HOST_DIMS(out)[3] == CudaNdarray_HOST_DIMS(img)[3] - CudaNdarray_HOST_DIMS(kern)[3] + 1);
}
assert (CudaNdarray_HOST_DIMS(out)[0] == CudaNdarray_HOST_DIMS(img)[0]);
assert (CudaNdarray_HOST_DIMS(out)[1] == CudaNdarray_HOST_DIMS(kern)[0]);
assert (CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1]);
// we now search through a few implementations until one applies to our arguments.
//TODO: make separate version as if all fill this is slower.
//TODO: Make a switch with power of 2 max size as template
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
const int nstack=CudaNdarray_HOST_DIMS(kern)[1];
const int nbatch=CudaNdarray_HOST_DIMS(img)[0];
const int nkern=CudaNdarray_HOST_DIMS(kern)[0];
const int img_wid=CudaNdarray_HOST_DIMS(img)[3];
const int img_len=CudaNdarray_HOST_DIMS(img)[2];
const int kern_wid=CudaNdarray_HOST_DIMS(kern)[3];
const int kern_len=CudaNdarray_HOST_DIMS(kern)[2];
const int out_wid=CudaNdarray_HOST_DIMS(out)[3];
const int out_len=CudaNdarray_HOST_DIMS(out)[2];
const int img_stride_col= CudaNdarray_HOST_STRIDES(img)[3];
const int img_stride_row=CudaNdarray_HOST_STRIDES(img)[2];
const int img_stride_stack= CudaNdarray_HOST_STRIDES(img)[1];
const int img_stride_batch=CudaNdarray_HOST_STRIDES(img)[0];
const int kern_stride_col= CudaNdarray_HOST_STRIDES(kern)[3];
const int kern_stride_row=CudaNdarray_HOST_STRIDES(kern)[2];
const int kern_stride_stack= CudaNdarray_HOST_STRIDES(kern)[1];
const int kern_stride_nkern=CudaNdarray_HOST_STRIDES(kern)[0];
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
const int out_size=out_len*out_wid;
const int img_size_byte = img_size*sizeof(float);
const int kern_size_byte = kern_size*sizeof(float);
const int out_size_byte = out_size*sizeof(float);
bool subsample = subsample_rows!=1 || subsample_cols!=1;
bool img_contiguous = CudaNdarray_is_c_contiguous(img);
bool kern_contiguous = CudaNdarray_is_c_contiguous(kern);
bool out_contiguous = CudaNdarray_is_c_contiguous(out);
bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster)
//the unflipped version of variable haev the original value when we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata;
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
if (verbose>1)
{
printf("INFO: Running conv_valid version %d with inputs:\n",version);
printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]);
printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]);
}
//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter a condition where we would lower the occupancy due to shared memory and/or registers.
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
{
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int);
#define CONV_PATCH_SPECIAL(kern_wid) \
if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\
else f=conv_patch<true,kern_wid,true>;
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_PATCH_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_PATCH_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_PATCH_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_PATCH_SPECIAL(4); break;
case 5: CONV_PATCH_SPECIAL(5); break;
case 6: CONV_PATCH_SPECIAL(6); break;
case 7: CONV_PATCH_SPECIAL(7); break;
case 10: CONV_PATCH_SPECIAL(10); break;
#endif
default:
if(!msgdisplayed_conv_patch__kern_width) {
printf("OPTIMISATION WARNING: conv_patch template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_patch__kern_width=true;
}
CONV_PATCH_SPECIAL(0);
}
f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split);
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split);
if (verbose) printf("INFO: impl 'conv_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample &&
out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<512) &&//Maximum of 512 theads by block
out_wid<512 &&//Maximum of 512 theads by block
img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch_stack
{
//version 1 is without split and preloads the full kernel
//version 3 is with split and preloads the full kernel
//version 11 is without split and loads only 1 kernel row at a time.
//version 12 is with split and loads only 1 kernel row at a time.
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false;
dim3 grid(nbatch,nkern);
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,false,true>;\
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,false,true>;\
if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,false,true>;\
if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,false,true>;\
if(preload_full_kernel && img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,true,true>;\
if(preload_full_kernel && img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,true,true>;\
if(preload_full_kernel && !img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,true,true>;\
if(preload_full_kernel && !img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,true,true>;\
if(nb_split==1 && img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,false,false>;\
if(nb_split==1 && img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,false,false>;\
if(nb_split==1 && !img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,false,false>;\
if(nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,false,false>;\
if(img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,true,false>;\
if(img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,true,false>;\
if(!img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,true,false>;\
if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,true,false>;
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_PATCH_STACK_SPECIAL(1); break;
case 2: CONV_PATCH_STACK_SPECIAL(2); break;
case 3: CONV_PATCH_STACK_SPECIAL(3); break;
case 4: CONV_PATCH_STACK_SPECIAL(4); break;
case 5: CONV_PATCH_STACK_SPECIAL(5); break;
case 6: CONV_PATCH_STACK_SPECIAL(6); break;
case 7: CONV_PATCH_STACK_SPECIAL(7); break;
case 8: CONV_PATCH_STACK_SPECIAL(8); break;
case 9: CONV_PATCH_STACK_SPECIAL(9); break;
case 10: CONV_PATCH_STACK_SPECIAL(10); break;
//////// Special cases
case 12: CONV_PATCH_STACK_SPECIAL(12); break;//on cifar10
case 21: CONV_PATCH_STACK_SPECIAL(21); break;//on cifar10
case 23: CONV_PATCH_STACK_SPECIAL(23); break;//test_nnet.py:test_lenet_64
case 24: CONV_PATCH_STACK_SPECIAL(24); break;//on cifar10
case 25: CONV_PATCH_STACK_SPECIAL(25); break;//on cifar10
case 28: CONV_PATCH_STACK_SPECIAL(28); break;
case 32: CONV_PATCH_STACK_SPECIAL(32); break;// Alex speed example
case 45: CONV_PATCH_STACK_SPECIAL(45); break;//used by test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_patch_stack__kern_width) {
printf("OPTIMISATION HINT: conv_patch_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_patch_stack__kern_width = true;
}
CONV_PATCH_STACK_SPECIAL(0);
}
f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, nb_split=%i preload_full_kernel=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split, preload_full_kernel);
if (verbose) printf("INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i\n",
nb_split,preload_full_kernel);
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, nb_split=%i preload_full_kernel=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split, preload_full_kernel);
if (verbose) printf("INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows
{
dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_ROWS_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_SPECIAL(5); break;//test_conv.py:test_valid
// case 6: CONV_ROWS_SPECIAL(6); break;
case 7: CONV_ROWS_SPECIAL(7); break;//used by test_nnet.py:test_lenet_108
// case 8: CONV_ROWS_SPECIAL(8); break;
case 9: CONV_ROWS_SPECIAL(9); break;//used by test_nnet.py:test_lenet_256
case 10: CONV_ROWS_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 28: CONV_ROWS_SPECIAL(28); break;
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows__kern_width){
printf("OPTIMISATION HINT: conv_rows template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_rows__kern_width = true;
}
CONV_ROWS_SPECIAL(0);
}
f<<< grid, threads, shared_size >>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose) printf("INFO: used 'conv_rows' version\n");
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_rows' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_rows_stack
{
int nb_row=1;
int max_threads=512;
//TODO: if not c_contiguous, lower max_threads, as we use 22 registers per thread and so cannot execute 2 blocks on one MP.
for(int i=2;i<=out_len;i++){
if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i;
}
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_STACK_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack<kern_wid, false>;\
else f = conv_rows_stack<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_STACK_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_STACK_SPECIAL(2); break;
case 3: CONV_ROWS_STACK_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_STACK_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_STACK_SPECIAL(5); break;//test_conv.py:test_valid
case 6: CONV_ROWS_STACK_SPECIAL(6); break;//test_conv.py:test_valid
case 7: CONV_ROWS_STACK_SPECIAL(7); break;//test_nnet.py:test_lenet_108
case 8: CONV_ROWS_STACK_SPECIAL(8); break;//test_conv.py:test_valid
case 9: CONV_ROWS_STACK_SPECIAL(9); break;//test_nnet.py:test_lenet_256
case 10: CONV_ROWS_STACK_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 23: CONV_ROWS_STACK_SPECIAL(23); break;//test_conv.py:test_valid
case 24: CONV_ROWS_STACK_SPECIAL(24); break;//test_conv.py:test_valid
case 28: CONV_ROWS_STACK_SPECIAL(28); break;//test_conv.py:test_valid
case 45: CONV_ROWS_STACK_SPECIAL(45); break;//test_nnet.py:test_lenet_64
case 102: CONV_ROWS_STACK_SPECIAL(102); break;//test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows_stack__kern_width){
printf("OPTIMISATION HINT: conv_rows_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_rows_stack__kern_width = true;
}
CONV_ROWS_STACK_SPECIAL(0);
}
f<<< grid, threads, shared_size >>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\n");
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2
{
int nb_row=1;
int max_threads=512;
int version_back = version;
//TODO: if not c_contiguous, lower max_threads, as we use 22 registers per thread and so cannot execute 2 blocks on one MP.
if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
version = 9;
else if(version==-1)version = 10;
int k_size = kern_size;
if(version==10)
k_size=kern_wid;
for(int i=2;i<=out_len;i++){
if(i*out_wid<max_threads && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i;
}
//to test the case where we don't have one thread per output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--;
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=(threads.y*img_wid + k_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
else f = conv_rows_stack2<kern_wid, true, false>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_STACK2_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_STACK2_SPECIAL(2); break;
case 3: CONV_ROWS_STACK2_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_STACK2_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_STACK2_SPECIAL(5); break;//test_conv.py:test_valid
case 6: CONV_ROWS_STACK2_SPECIAL(6); break;//test_conv.py:test_valid
case 7: CONV_ROWS_STACK2_SPECIAL(7); break;//test_nnet.py:test_lenet_108
case 8: CONV_ROWS_STACK2_SPECIAL(8); break;//test_conv.py:test_valid
// case 9: CONV_ROWS_STACK2_SPECIAL(9); break;
case 10: CONV_ROWS_STACK2_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 23: CONV_ROWS_STACK2_SPECIAL(23); break;//test_conv.py:test_valid
case 24: CONV_ROWS_STACK2_SPECIAL(24); break;//test_conv.py:test_valid
case 28: CONV_ROWS_STACK2_SPECIAL(28); break;//test_conv.py:test_valid
case 45: CONV_ROWS_STACK2_SPECIAL(45); break;//test_nnet.py:test_lenet_108
case 58: CONV_ROWS_STACK2_SPECIAL(58); break;//test_nnet.py:test_lenet_108
case 70: CONV_ROWS_STACK2_SPECIAL(70); break;//mobahi_2009.py
case 102: CONV_ROWS_STACK2_SPECIAL(102); break;//test_nnet.py:test_lenet_108
case 116: CONV_ROWS_STACK2_SPECIAL(116); break;//test_nnet.py:test_lenet_256
case 248: CONV_ROWS_STACK2_SPECIAL(248); break;//test_nnet.py:test_lenet_256
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows_stack2__kern_width){
printf("OPTIMISATION HINT: conv_rows_stack{2,3} template default add"
" kern_wid=%d in %s at line %i to have an optimized version"
" for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_rows_stack2__kern_width = true;
}
CONV_ROWS_STACK2_SPECIAL(0);
}
f<<< grid, threads, shared_size >>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,(version==9?2:3));
if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
//version 8 is the same but we force the split. The split is needed when we have too many threads. This happens frequently when the kernel length is big. Big kernels are frequent in the gradient.
//version 8 needs a minimum kernel length, as we force the split.
//version 8 exists to test this kernel template parameter more easily.
//version 13 loads only 1 kernel row at a time.
if (!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block
(version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && //version 13 need a minimal kernel length as big as the split.
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
int full_kern=true;
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
while(ceil_intdiv(kern_len,nb_split)>64)nb_split++;//device 1.3 have a max of 64 thread in z
while(out_size*ceil_intdiv(kern_len,nb_split)>512)nb_split++;
int shared_size=(img_size + kern_size + out_size*kern_len)*sizeof(float);
if(shared_size>=shared_avail){
//if we can't fit the kernel in shared memory, we can split it more.
full_kern=false;
assert((img_size+kern_wid*2+out_size*2)*sizeof(float)<=shared_avail);
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
while(shared_size>=shared_avail || ceil_intdiv(kern_len,nb_split)>64){
nb_split++;
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
}
}
int thread_z=ceil_intdiv(kern_len,nb_split);
assert(thread_z>0);//should not happen, but in case...
assert(shared_size<=shared_avail);
if(!full_kern)
assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_PATCH_STACK_REDUCE_SPECIAL(1); break;
case 2: CONV_PATCH_STACK_REDUCE_SPECIAL(2); break;
case 3: CONV_PATCH_STACK_REDUCE_SPECIAL(3); break;
case 4: CONV_PATCH_STACK_REDUCE_SPECIAL(4); break;
case 5: CONV_PATCH_STACK_REDUCE_SPECIAL(5); break;
case 6: CONV_PATCH_STACK_REDUCE_SPECIAL(6); break;
case 7: CONV_PATCH_STACK_REDUCE_SPECIAL(7); break;
case 8: CONV_PATCH_STACK_REDUCE_SPECIAL(8); break;
case 9: CONV_PATCH_STACK_REDUCE_SPECIAL(9); break;
case 10: CONV_PATCH_STACK_REDUCE_SPECIAL(10); break;
//////// Special cases
case 20: CONV_PATCH_STACK_REDUCE_SPECIAL(20); break;
case 23: CONV_PATCH_STACK_REDUCE_SPECIAL(23); break;//test_nnet.py:test_lenet64
case 24: CONV_PATCH_STACK_REDUCE_SPECIAL(24); break;
case 28: CONV_PATCH_STACK_REDUCE_SPECIAL(28); break;
case 32: CONV_PATCH_STACK_REDUCE_SPECIAL(32); break;//Alex speed demonstration
#endif
//////// default case
default:
if(!msgdisplayed_conv_patch_stack_reduce__kern_width) {
printf("OPTIMISATION HINT: conv_patch_stack_reduce template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_patch_stack_reduce__kern_width = true;
}
CONV_PATCH_STACK_REDUCE_SPECIAL(0);
}
if (verbose) printf("INFO: using 'conv_patch_stack_reduce' version nb_split=%d, preload_full_kern=%d\n",
nb_split,full_kern);
if (verbose>1) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z);
if (verbose) printf("INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (1 && (version==6||version==-1) &&
!work_complete) //conv_valid_row_reduce
{
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int block_nstack=nstack;
//Max of 512 threads per blocks.
//On old hardware, we have a max of 356 threads as we have only
//8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--;
dim3 n_threads(block_nstack, kern_len, 1);
int n_reduce_buf = block_nstack * kern_len * sizeof(float);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
*
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
* */
int initial_reduce_boundary = (1 << (int)(log2((double)(n_reduce_buf/sizeof(float)))));
if (initial_reduce_boundary == (n_reduce_buf / sizeof(float)))
initial_reduce_boundary >>= 1;
if (n_reduce_buf == sizeof(float))
assert (initial_reduce_boundary == 0);
else
{
assert (initial_reduce_boundary * 2 >= n_reduce_buf/sizeof(float));
assert (initial_reduce_boundary < n_reduce_buf/sizeof(float));
}
void (*f)(int, int, int, int,
int, int, int, int, int,
float*, int, int, int, int,
float*, int, int, int, int,
float*, int, int, int, int,
int, int, int);
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack)
f=conv_valid_row_reduce<false>;
else
f=conv_valid_row_reduce<true>;
f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],
img_stride_row, img_stride_col,
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols, initial_reduce_boundary);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose) printf("INFO: used 'conv_valid_row_reduce' version\n");
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, shared_size=%i, nb_threads=%i\n", n_threads.x, n_threads.y, n_blocks, n_reduce_buf, n_threads.x * n_threads.y);
if (verbose) printf("INFO: impl 'conv_valid_row_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (1 && !work_complete) //conv_reference_valid
{
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (0)
{
if (verbose) printf("INFO: launching conv_reference_valid\n");
if (verbose) printf(" img : %i %i %i %i %p %i %i %i %i\n",
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose) printf(" kern: %i %i %i %i %p %i %i %i %i\n",
nkern, nstack, kern_len, kern_wid,
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3]
);
if (verbose) printf(" out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose) printf(" launch params: %i %i %i\n", outsize, n_blocks, n_threads);
}
conv_reference_valid<<<n_blocks, n_threads>>>( nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose) printf("INFO: used 'conv_reference_valid' version\n");
}
else
{
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed! (%s)",
cudaGetErrorString(sts));
return -1;
}
}
return 0;
//PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
//return -1;
}
// 2D convolution with "full" border mode (output is img + kern - 1 per
// spatial dim, before subsampling).  Tries a sequence of specialized CUDA
// kernels, from fastest/most-restrictive to the generic reference kernel;
// the first launch that succeeds wins.
//
// img/kern/out: 4D CudaNdarrays (batch, stack, row, col) on the device.
// subsample_rows/cols: output subsampling factors (1 = no subsampling).
// version: -1 = auto-select an implementation, >=0 = force that version
//          (falling back to the reference kernel if it cannot run).
// verbose: 0 silent, 1 report chosen kernel, >1 also dump launch params.
// Returns 0 on success, -1 with a Python exception set on failure.
int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
{
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
int work_complete = 0;
if (img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (out->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
// Disabled: these shape asserts ignore subsampling / logical dimensions.
if (0)
{
//TODO: rethink these to use physical / logical dimensions, subsampling, offsets, etc.
assert (CudaNdarray_HOST_DIMS(out)[2] == CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1);
assert (CudaNdarray_HOST_DIMS(out)[3] == CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1);
}
assert (CudaNdarray_HOST_DIMS(out)[0] == CudaNdarray_HOST_DIMS(img)[0]);
assert (CudaNdarray_HOST_DIMS(out)[1] == CudaNdarray_HOST_DIMS(kern)[0]);
assert (CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1]);
// Cache dimensions and (element, not byte) strides read by every kernel.
const int nstack=CudaNdarray_HOST_DIMS(kern)[1];
const int nbatch=CudaNdarray_HOST_DIMS(img)[0];
const int nkern=CudaNdarray_HOST_DIMS(kern)[0];
const int img_wid=CudaNdarray_HOST_DIMS(img)[3];
const int img_len=CudaNdarray_HOST_DIMS(img)[2];
const int kern_wid=CudaNdarray_HOST_DIMS(kern)[3];
const int kern_len=CudaNdarray_HOST_DIMS(kern)[2];
const int out_wid=CudaNdarray_HOST_DIMS(out)[3];
const int out_len=CudaNdarray_HOST_DIMS(out)[2];
const int img_stride_col= CudaNdarray_HOST_STRIDES(img)[3];
const int img_stride_row=CudaNdarray_HOST_STRIDES(img)[2];
const int img_stride_stack=CudaNdarray_HOST_STRIDES(img)[1];
const int img_stride_batch=CudaNdarray_HOST_STRIDES(img)[0];
const int kern_stride_col= CudaNdarray_HOST_STRIDES(kern)[3];
const int kern_stride_row=CudaNdarray_HOST_STRIDES(kern)[2];
const int kern_stride_stack= CudaNdarray_HOST_STRIDES(kern)[1];
const int kern_stride_nkern=CudaNdarray_HOST_STRIDES(kern)[0];
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
const int out_size=out_len*out_wid;
const int img_size_byte = img_size*sizeof(float);
const int kern_size_byte = kern_size*sizeof(float);
//padded image sizes: the "full" mode image padded by kern-1 on every side
const int img_wid_padded=img_wid+2*kern_wid-2;
const int img_len_padded=img_len+2*kern_len-2;
const int img_size_padded=img_len_padded * img_wid_padded;
const int img_size_padded_byte = img_size_padded*sizeof(float);
//const int out_size_byte = out_size*sizeof(float); // unused
bool subsample = subsample_rows!=1 || subsample_cols!=1;
bool img_contiguous = CudaNdarray_is_c_contiguous(img);
bool kern_contiguous = CudaNdarray_is_c_contiguous(kern);
bool out_contiguous = CudaNdarray_is_c_contiguous(out);
bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memory
//allow to use a version that use less registers(so is faster)
//the unflipped version of variable have the original value when we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata;
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
// Point at the last element so positive strides walk the data backwards.
kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
if (verbose>1)
{
printf("INFO: Running conv_full version %d with inputs:\n",version);
printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]);
printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]);
}
// Attempt 1: conv_full_patch_stack_padded (versions 3/4/5) -- image patch
// padded in shared memory; fastest family when it fits.
if (!subsample &&
out_contiguous &&
(version==3||version==4||version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded
{
//version 3 without split
//version 4 with split (more registers)
//version 5 with split (more registers) low mem version(some restriction and still more register)
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing.
if(kern_len==1 && version==5){
//version 5 don't support kern_len==1 as 1%0 return -1.
version=-1;
if(verbose)printf("WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
}
if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per threads, but their is some other stuff that for the limit to be lower then 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
// NOTE(review): version can no longer be -1 after the two lines above, so
// the next two lines are dead code; the nb_split!=1 case is still handled
// by the trailing else-if. Kept as-is for fidelity -- confirm upstream.
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more then 1 split as to be always execute.
assert(version!=3 || nb_split==1);
assert(version!=5 || kern_len>1);
assert(version!=-1);
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch,nkern);
int shared_size=img_size_padded_byte + kern_size_byte;
if(version==5)
shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
// Select the template instantiation matching the flags (contiguity,
// flipped kernel, split, low-mem) and the compile-time kernel width.
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \
if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
else assert(false);
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(1); break;
case 2: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(2); break;
case 3: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(3); break;
case 4: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(4); break;
case 5: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(5); break;//test_conv.py:test_full
case 6: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(6); break;//test_conv.py:test_full
case 7: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(7); break;//test_nnet.py:test_lenet_64
case 8: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(8); break;//test_conv.py:test_full
case 9: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(9); break;//test_nnet.py:test_lenet_256
case 10: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(10); break;//test_conv.py:test_full
case 12: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(12); break;//test_conv.py:test_full
//////// Special cases
case 28: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(28); break;
#endif
//////// default case
default:
if(!msgdisplayed_conv_full_patch_stack__kern_width){
printf("OPTIMISATION HINT: conv_full_patch_stack_padded template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_full_patch_stack__kern_width = true;
}
CONV_FULL_PATCH_STACK_PADDED_SPECIAL(0);
}
f<<< grid, threads, shared_size>>>
(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, out_len, nb_split, version);
if (verbose) printf("INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false"));
work_complete = true;
}
else
{
// Launch failure is not fatal: fall through to the next implementation.
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, out_len, nb_split, version);
if (verbose) printf("INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n",
version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts));
}
}
// Attempt 2: conv_full_patch (version 0) -- whole image + kernel in shared
// memory; requires fully c-contiguous inputs and nstack == 1.
if (!subsample && c_contiguous &&
(version==0||version==-1) &&
out_size<512 &&//Maximum of 512 threads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_patch
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
conv_full_patch<<< grid, threads, shared_size>>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_patch' version\n");
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
// Attempt 3: conv_full_load_everything (version 1) -- permanently disabled
// ('false &&') because its tests fail; kept for reference.
if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<512 &&//Maximum of 512 threads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch);
int shared_size=(img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//typeof(conv_full_load_everything<0>) f = ;
void (*f)(float*, float*, float*,
int, int, int, int, int, int,
int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
switch(nstack)
{
#ifdef UNROLL_LOOP
case 1: f = conv_full_load_everything<1>; break;
//case 10: f = conv_full_load_everything<10>; break;
//case 30: f = conv_full_load_everything<30>; break; //This is actually slower than the general version??
#endif
default:
printf("OPTIMISATION HINT: conv_full_load_everything template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
f = conv_full_load_everything<0>;
};
f<<< grid, threads, shared_size>>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,
CudaNdarray_HOST_STRIDES(img)[3],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[0]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_load_everything' version\n");
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_load_everything' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
// Attempt 4: conv_full_patch_stack (version 2) -- like conv_full_patch but
// handles the stack dimension; needs contiguous batch/stack layout.
if (!subsample &&
img_batch_stack_contiguous &&
out_contiguous &&
(version==2||version==-1) &&
out_size<512 &&//Maximum of 512 threads by block
img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_patch_stack
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int);
if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\
else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<false,true>;\
else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>;
f<<< grid, threads, shared_size>>>(
img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,img_stride_col, img_stride_row,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_patch_stack' version\n");
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
// Final fallback: conv_reference_full -- slow but works for any strides,
// shapes and subsampling.  If even this fails, raise a Python error.
if (1 && !work_complete) //conv_reference_full
{
if(verbose>1)printf("INFO: will start conv_reference_full\n");
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK);
// Disabled debug dump of all launch arguments.
if (0)
{
if (verbose) printf("INFO: launching conv_reference_valid\n");
if (verbose) printf(" img : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose) printf(" kern: %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3]
);
if (verbose) printf(" out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose) printf(" launch params: %i %i %i\n", outsize, n_blocks, n_threads);
if (verbose) printf(" subsample params: %i %i\n", subsample_rows, subsample_cols);
}
conv_reference_full<<<n_blocks, n_threads>>>(CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_reference_full' version ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d nkern=%d nstack=%d subsample=%d\n",
img_len,img_wid, kern_len, kern_wid,
out_len, out_wid, nbatch, nkern, nstack, subsample);
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads);
if (verbose) printf("INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed! (%s)",
cudaGetErrorString(sts));
return -1;
}
}
return 0;
}
// Top-level convolution dispatcher shared by the Python 'conv' wrapper.
//
// Computes the output shape from 'mode' (valid/full) and the subsampling
// factors, reuses 'out' when it already has the right shape and layout
// (otherwise allocates a fresh CudaNdarray), then delegates to
// CudaNdarray_conv_valid or CudaNdarray_conv_full.
//
// Returns a NEW reference to the result CudaNdarray, or NULL with a Python
// exception set.
// NOTE(review): when 'out' is unusable this function Py_DECREFs it even
// though 'out' is a borrowed argument -- this is the deliberate refcount
// "work-around with Conv_VARARGS" mentioned below; confirm before touching.
PyObject *
CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
CudaNdarray * out, const int mode,
const int subsample_rows, const int subsample_cols,
const int version, const int verbose)
{
if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
// Output shape: batch x nkern x ceil(logical_rows/sub_r) x ceil(logical_cols/sub_c)
int out_dim[4];
out_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
out_dim[1] = CudaNdarray_HOST_DIMS(kern)[0];
int logical_rows, logical_cols;
if (mode == ConvMode_VALID)
{
logical_rows = CudaNdarray_HOST_DIMS(img)[2] - CudaNdarray_HOST_DIMS(kern)[2] + 1;
logical_cols = CudaNdarray_HOST_DIMS(img)[3] - CudaNdarray_HOST_DIMS(kern)[3] + 1;
}
else
{
logical_rows = CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1;
logical_cols = CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1;
}
out_dim[2] = ceil_intdiv(logical_rows, subsample_rows);
out_dim[3] = ceil_intdiv(logical_cols, subsample_cols);
// Reuse 'out' only if it is 4D, c-contiguous and exactly the right shape.
CudaNdarray * rval = out;
if(!(out && out->nd==4 && CudaNdarray_is_c_contiguous(out)
&& CudaNdarray_HOST_DIMS(out)[0]==out_dim[0]
&& CudaNdarray_HOST_DIMS(out)[1]==out_dim[1]
&& CudaNdarray_HOST_DIMS(out)[2]==out_dim[2]
&& CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])){
if (out)
{
fprintf(stderr, "Warning: Conv is ignoring 'out' argument with wrong structure.\n");
Py_DECREF(out);
}
rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
}
if ((rval==NULL)
|| ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
)
{
// if rval is something we just allocated,
// and there was a problem, then we have to free it.
if (rval != out) Py_XDECREF(rval);
return NULL;
}
//TODO: Get refcount story clearer!
// This function does a weird thing as work-around with Conv_VARARGS
if (rval == out) Py_INCREF(rval);
return (PyObject*)rval;
}
#include"conv_kernel.cu"
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len/nb_split
......
......@@ -28,8 +28,6 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
#ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU
#include <stdio.h>
/*
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
......
......@@ -9,19 +9,10 @@
#define UNROLL_LOOP
#endif
#ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024)
#endif
/////////////////////////
// Static helper methods
/////////////////////////
// Integer ceiling division: ceil(a / b) for integral T.
// Implemented as truncating division plus a remainder bump, which matches
// the original behavior exactly (including for negative operands, where
// C++ division truncates toward zero) and avoids the overflow risk of the
// (a + b - 1) / b formulation.
template <typename T>
static T ceil_intdiv(T a, T b)
{
    T quotient = a / b;
    if (a % b)
        ++quotient;
    return quotient;
}
static void
CudaNdarray_null_init(CudaNdarray*self)
{
......@@ -1380,92 +1371,6 @@ CudaNdarray_Dot(PyObject* _unsed, PyObject * args)
Py_XDECREF(rval);
return NULL;
}
// Python-level entry point exposed as cuda_ndarray.conv(...): parses the
// arguments and dispatches to CudaNdarray_Conv.
//
// Mandatory: img (CudaNdarray), kern (CudaNdarray), mode ("full" | "valid").
// Optional : out (CudaNdarray or None), subsample (length-2 int sequence),
//            logical_img_shape / logical_kern_shape / kern_align (parsed for
//            API compatibility but not used by this wrapper),
//            version (int, -1 = auto), verbose (int).
// Returns a new reference to the convolution result, or NULL with a Python
// exception set.
//
// Fix: PySequence_GetItem returns NEW references; the previous code only
// released py_od_0/py_od_1 on the error path, leaking them on every
// successful call with a 'subsample' argument.
static PyObject *
CudaNdarray_Conv_VARARGS(PyObject * _unused, PyObject *args, PyObject * kwargs)
{
    //Mandatory args
    PyObject *img = NULL;
    PyObject *kern = NULL;
    PyObject *mode_str = NULL;

    //Optional args
    PyObject *out = NULL;
    PyObject *subsample = NULL;
    PyObject *logical_img_shape = NULL;
    PyObject *logical_kern_shape = NULL;
    PyObject *kern_align = NULL;
    int version = -1;
    int verbose = 0;

    // the output_downsampling arguments as integers
    const int od_0_orig = 1;
    const int od_1_orig = 1;
    int od_0 = od_0_orig;
    int od_1 = od_1_orig;

    // 'out' forwarded to CudaNdarray_Conv only when it is a real CudaNdarray
    PyObject *out_2 = NULL;

    static char *kwlist[] = {"img", "kern", "mode", "out", "subsample", "logical_img_shape", "logical_kern_shape", "kern_align", "version", "verbose", NULL };
    if (! PyArg_ParseTupleAndKeywords(args, kwargs, "OOS|OOOOOii", kwlist,
                &img, &kern, &mode_str,
                &out, &subsample, &logical_img_shape, &logical_kern_shape, &kern_align, &version, &verbose))
        return NULL;

    int mode;
    if (strcmp(PyString_AsString(mode_str), "full") == 0)
    {
        mode = ConvMode_FULL;
    }
    else if (strcmp(PyString_AsString(mode_str), "valid") == 0)
    {
        mode = ConvMode_VALID;
    }
    else
    {
        PyErr_SetString(PyExc_ValueError, "mode must be one of 'full' or 'valid'");
        return NULL;
    }

    if (!CudaNdarray_Check(img))
    {
        PyErr_SetString(PyExc_TypeError, "img argument must be a CudaNdarray");
        return NULL;
    }
    if (!CudaNdarray_Check(kern))
    {
        PyErr_SetString(PyExc_TypeError, "kern argument must be a CudaNdarray");
        return NULL;
    }
    if (out && CudaNdarray_Check(out))
    {
        out_2 = out;
    }
    else if (out && Py_None != out)
    {
        fprintf(stderr, "Warning: Conv is ignoring 'out' argument that wasn't a CudaNdarray.\n");
    }

    if (subsample)
    {
        if ((!PySequence_Check(subsample))
                || (PySequence_Length(subsample) != 2))
        {
            PyErr_SetString(PyExc_TypeError, "'subsample' argument must be a length-2 sequence of integers");
            return NULL;
        }
        PyObject *py_od_0 = PySequence_GetItem(subsample, 0);
        PyObject *py_od_1 = PySequence_GetItem(subsample, 1);
        od_0 = PyInt_AsLong(py_od_0);
        od_1 = PyInt_AsLong(py_od_1);
        // Release the new references from PySequence_GetItem on BOTH the
        // success and the failure path (previously leaked on success).
        Py_XDECREF(py_od_0);
        Py_XDECREF(py_od_1);
        if (PyErr_Occurred())
        {
            od_0 = od_0_orig;
            od_1 = od_1_orig;
            PyErr_SetString(PyExc_TypeError, "'subsample' argument must be a length-2 sequence of integers");
            return NULL;
        }
    }
    return CudaNdarray_Conv((CudaNdarray*)img, (CudaNdarray*)kern, (CudaNdarray*)out_2, mode, od_0, od_1, version, verbose);
}
static PyObject *
filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict)
......@@ -1552,7 +1457,6 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
static PyMethodDef module_methods[] = {
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"conv", (PyCFunction)CudaNdarray_Conv_VARARGS, METH_VARARGS|METH_KEYWORDS, "Returns the 2D convolution of one CudaNdarray argument with another. WRITEME"},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."},
{"filter", filter, METH_VARARGS, "no doc!"},
{NULL, NULL, NULL, NULL} /* Sentinel */
......@@ -2379,1273 +2283,3 @@ CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern
return 0;
}
#include <conv_kernel.cu>
#include <conv_full_kernel.cu>
// One-shot flags guarding the "OPTIMISATION HINT" printf in each kernel
// family's template-dispatch switch, so the suggestion to add a kern_wid
// specialization is printed at most once per process per kernel family.
bool msgdisplayed_conv_patch__kern_width = false;
bool msgdisplayed_conv_patch_stack__kern_width = false;
bool msgdisplayed_conv_rows__kern_width = false;
bool msgdisplayed_conv_rows_stack__kern_width = false;
bool msgdisplayed_conv_rows_stack2__kern_width = false;
bool msgdisplayed_conv_patch_stack_reduce__kern_width = false;
bool msgdisplayed_conv_full_patch_stack__kern_width = false;
/*
* version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation
*/
int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0)
{
int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
if (img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (out->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (subsample_rows==1 && subsample_cols==1)
{
//TODO: rethink these asserts in light of the difference between physical and logical dimensions
assert (CudaNdarray_HOST_DIMS(out)[2] == CudaNdarray_HOST_DIMS(img)[2] - CudaNdarray_HOST_DIMS(kern)[2] + 1);
assert (CudaNdarray_HOST_DIMS(out)[3] == CudaNdarray_HOST_DIMS(img)[3] - CudaNdarray_HOST_DIMS(kern)[3] + 1);
}
assert (CudaNdarray_HOST_DIMS(out)[0] == CudaNdarray_HOST_DIMS(img)[0]);
assert (CudaNdarray_HOST_DIMS(out)[1] == CudaNdarray_HOST_DIMS(kern)[0]);
assert (CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1]);
// we now search through a few implementations until one applies to our arguments.
//TODO: make separate version as if all fill this is slower.
//TODO: Make a switch with power of 2 max size as template
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
const int nstack=CudaNdarray_HOST_DIMS(kern)[1];
const int nbatch=CudaNdarray_HOST_DIMS(img)[0];
const int nkern=CudaNdarray_HOST_DIMS(kern)[0];
const int img_wid=CudaNdarray_HOST_DIMS(img)[3];
const int img_len=CudaNdarray_HOST_DIMS(img)[2];
const int kern_wid=CudaNdarray_HOST_DIMS(kern)[3];
const int kern_len=CudaNdarray_HOST_DIMS(kern)[2];
const int out_wid=CudaNdarray_HOST_DIMS(out)[3];
const int out_len=CudaNdarray_HOST_DIMS(out)[2];
const int img_stride_col= CudaNdarray_HOST_STRIDES(img)[3];
const int img_stride_row=CudaNdarray_HOST_STRIDES(img)[2];
const int img_stride_stack= CudaNdarray_HOST_STRIDES(img)[1];
const int img_stride_batch=CudaNdarray_HOST_STRIDES(img)[0];
const int kern_stride_col= CudaNdarray_HOST_STRIDES(kern)[3];
const int kern_stride_row=CudaNdarray_HOST_STRIDES(kern)[2];
const int kern_stride_stack= CudaNdarray_HOST_STRIDES(kern)[1];
const int kern_stride_nkern=CudaNdarray_HOST_STRIDES(kern)[0];
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
const int out_size=out_len*out_wid;
const int img_size_byte = img_size*sizeof(float);
const int kern_size_byte = kern_size*sizeof(float);
const int out_size_byte = out_size*sizeof(float);
bool subsample = subsample_rows!=1 || subsample_cols!=1;
bool img_contiguous = CudaNdarray_is_c_contiguous(img);
bool kern_contiguous = CudaNdarray_is_c_contiguous(kern);
bool out_contiguous = CudaNdarray_is_c_contiguous(out);
bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster)
//the unflipped version of variable haev the original value when we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata;
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
if (verbose>1)
{
printf("INFO: Running conv_valid version %d with inputs:\n",version);
printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]);
printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]);
}
//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter in condition where we will lower the occupency due to shared memory and/or registers.
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
{
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int);
#define CONV_PATCH_SPECIAL(kern_wid) \
if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\
else f=conv_patch<true,kern_wid,true>;
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_PATCH_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_PATCH_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_PATCH_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_PATCH_SPECIAL(4); break;
case 5: CONV_PATCH_SPECIAL(5); break;
case 6: CONV_PATCH_SPECIAL(6); break;
case 7: CONV_PATCH_SPECIAL(7); break;
case 10: CONV_PATCH_SPECIAL(10); break;
#endif
default:
if(!msgdisplayed_conv_patch__kern_width) {
printf("OPTIMISATION WARNING: conv_patch template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_patch__kern_width=true;
}
CONV_PATCH_SPECIAL(0);
}
f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split);
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split);
if (verbose) printf("INFO: impl 'conv_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
// ---- conv_patch_stack: one block per (batch, kernel), image + (full kernel
// or one kernel row) cached in shared memory; supports an input stack. ----
if (!subsample &&
out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<512) &&//Maximum of 512 threads by block
out_wid<512 &&//Maximum of 512 threads by block
img_size_byte+kern_wid*sizeof(float)<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_patch_stack
{
//version 1 is without split and preload the full kernel
//version 3 is with split and preload the full kernel
//version 11 is without split and load only 1 kernel row at a time.
//version 12 is with split and load only 1 kernel row at a time.
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
// Grow nb_split until the thread block fits under the 512-thread limit.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
// Preload the whole kernel only if image + kernel both fit in shared memory
// (versions 11/12 explicitly request the one-row-at-a-time path).
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false;
dim3 grid(nbatch,nkern);
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
// NOTE(review): the lines below are plain `if`s, not an `else if` chain, and
// several conditions overlap (e.g. line 5 is a superset of line 1 whenever
// nb_split==1), so the LAST matching assignment wins.  Confirm the overwrite
// toward the split=true variants is intentional.
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,false,true>;\
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,false,true>;\
if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,false,true>;\
if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,false,true>;\
if(preload_full_kernel && img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,true,true>;\
if(preload_full_kernel && img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,true,true>;\
if(preload_full_kernel && !img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,true,true>;\
if(preload_full_kernel && !img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,true,true>;\
if(nb_split==1 && img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,false,false>;\
if(nb_split==1 && img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,false,false>;\
if(nb_split==1 && !img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,false,false>;\
if(nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,false,false>;\
if(img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,true,false>;\
if(img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,true,false>;\
if(!img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,true,false>;\
if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,true,false>;
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_PATCH_STACK_SPECIAL(1); break;
case 2: CONV_PATCH_STACK_SPECIAL(2); break;
case 3: CONV_PATCH_STACK_SPECIAL(3); break;
case 4: CONV_PATCH_STACK_SPECIAL(4); break;
case 5: CONV_PATCH_STACK_SPECIAL(5); break;
case 6: CONV_PATCH_STACK_SPECIAL(6); break;
case 7: CONV_PATCH_STACK_SPECIAL(7); break;
case 8: CONV_PATCH_STACK_SPECIAL(8); break;
case 9: CONV_PATCH_STACK_SPECIAL(9); break;
case 10: CONV_PATCH_STACK_SPECIAL(10); break;
//////// Special cases
case 12: CONV_PATCH_STACK_SPECIAL(12); break;//on cifar10
case 21: CONV_PATCH_STACK_SPECIAL(21); break;//on cifar10
case 23: CONV_PATCH_STACK_SPECIAL(23); break;//test_nnet.py:test_lenet_64
case 24: CONV_PATCH_STACK_SPECIAL(24); break;//on cifar10
case 25: CONV_PATCH_STACK_SPECIAL(25); break;//on cifar10
case 28: CONV_PATCH_STACK_SPECIAL(28); break;
case 32: CONV_PATCH_STACK_SPECIAL(32); break;// Alex speed example
case 45: CONV_PATCH_STACK_SPECIAL(45); break;//used by test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_patch_stack__kern_width) {
printf("OPTIMISATION HINT: conv_patch_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_patch_stack__kern_width = true;
}
CONV_PATCH_STACK_SPECIAL(0);
}
f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
// On failure, fall through to the next implementation in the cascade.
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, nb_split=%i preload_full_kernel=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split, preload_full_kernel);
if (verbose) printf("INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i\n",
nb_split,preload_full_kernel);
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, nb_split=%i preload_full_kernel=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split, preload_full_kernel);
if (verbose) printf("INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
// ---- conv_rows: one block per output row per (batch, kernel); only for
// nstack==1 since this kernel does not implement the stack loop. ----
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//there is only 16k of shared memory
!work_complete) //conv_rows
{
dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern);
// Shared memory caches kern_len image rows plus the whole kernel.
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
// NOTE(review): the last macro line below ends with a line-continuation '\',
// which splices the following `switch(kern_wid){` line into the macro body
// (compare CONV_PATCH_SPECIAL above, which has no trailing '\').  Verify this
// against the compiled file; as written, the `case` labels would have no
// enclosing `switch` emitted at this point.
#define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_ROWS_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_SPECIAL(5); break;//test_conv.py:test_valid
// case 6: CONV_ROWS_SPECIAL(6); break;
case 7: CONV_ROWS_SPECIAL(7); break;//used by test_nnet.py:test_lenet_108
// case 8: CONV_ROWS_SPECIAL(8); break;
case 9: CONV_ROWS_SPECIAL(9); break;//used by test_nnet.py:test_lenet_256
case 10: CONV_ROWS_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 28: CONV_ROWS_SPECIAL(28); break;
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows__kern_width){
printf("OPTIMISATION HINT: conv_rows template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_rows__kern_width = true;
}
CONV_ROWS_SPECIAL(0);
}
f<<< grid, threads, shared_size >>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
// On failure, fall through to the next implementation in the cascade.
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose) printf("INFO: used 'conv_rows' version\n");
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_rows' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
// ---- conv_rows_stack: like conv_rows but with stack support; each block
// computes nb_row output rows, chosen to maximize threads per block. ----
if (!subsample && out_contiguous &&
(version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_rows_stack
{
int nb_row=1;
int max_threads=512;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
// Take the largest nb_row that stays under the thread and shared-memory limits.
for(int i=2;i<=out_len;i++){
if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i;
}
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
// kern_len+nb_row-1 image rows are needed to produce nb_row output rows.
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
// NOTE(review): trailing '\' on the last macro line splices the following
// `switch(kern_wid){` into the macro body — same concern as CONV_ROWS_SPECIAL
// above; confirm against the compiled file.
#define CONV_ROWS_STACK_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack<kern_wid, false>;\
else f = conv_rows_stack<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_STACK_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_STACK_SPECIAL(2); break;
case 3: CONV_ROWS_STACK_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_STACK_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_STACK_SPECIAL(5); break;//test_conv.py:test_valid
case 6: CONV_ROWS_STACK_SPECIAL(6); break;//test_conv.py:test_valid
case 7: CONV_ROWS_STACK_SPECIAL(7); break;//test_nnet.py:test_lenet_108
case 8: CONV_ROWS_STACK_SPECIAL(8); break;//test_conv.py:test_valid
case 9: CONV_ROWS_STACK_SPECIAL(9); break;//test_nnet.py:test_lenet_256
case 10: CONV_ROWS_STACK_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 23: CONV_ROWS_STACK_SPECIAL(23); break;//test_conv.py:test_valid
case 24: CONV_ROWS_STACK_SPECIAL(24); break;//test_conv.py:test_valid
case 28: CONV_ROWS_STACK_SPECIAL(28); break;//test_conv.py:test_valid
case 45: CONV_ROWS_STACK_SPECIAL(45); break;//test_nnet.py:test_lenet_64
case 102: CONV_ROWS_STACK_SPECIAL(102); break;//test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows_stack__kern_width){
printf("OPTIMISATION HINT: conv_rows_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_rows_stack__kern_width = true;
}
CONV_ROWS_STACK_SPECIAL(0);
}
f<<< grid, threads, shared_size >>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
// On failure, fall through to the next implementation in the cascade.
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\n");
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
// ---- conv_rows_stack2: variant of conv_rows_stack with lower shared-memory
// needs.  version 9 preloads the full kernel, version 10 loads one kernel row
// at a time (only img_wid + kern_wid floats required). ----
if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
(img_wid+kern_wid)*sizeof(float)<shared_avail && //there is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2
{
int nb_row=1;
int max_threads=512;
int version_back = version;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
// Auto-select: prefer the full-kernel preload (9) when it fits, else 10.
if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
version = 9;
else if(version==-1)version = 10;
// k_size: how much kernel data is resident in shared memory at once.
int k_size = kern_size;
if(version==10)
k_size=kern_wid;
for(int i=2;i<=out_len;i++){
if(i*out_wid<max_threads && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i;
}
//to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--;
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=(threads.y*img_wid + k_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
// NOTE(review): trailing '\' on the last macro line splices the following
// `switch(kern_wid){` into the macro body — same concern as the two macros
// above; confirm against the compiled file.
#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
else f = conv_rows_stack2<kern_wid, true, false>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_STACK2_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_STACK2_SPECIAL(2); break;
case 3: CONV_ROWS_STACK2_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_STACK2_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_STACK2_SPECIAL(5); break;//test_conv.py:test_valid
case 6: CONV_ROWS_STACK2_SPECIAL(6); break;//test_conv.py:test_valid
case 7: CONV_ROWS_STACK2_SPECIAL(7); break;//test_nnet.py:test_lenet_108
case 8: CONV_ROWS_STACK2_SPECIAL(8); break;//test_conv.py:test_valid
// case 9: CONV_ROWS_STACK2_SPECIAL(9); break;
case 10: CONV_ROWS_STACK2_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 23: CONV_ROWS_STACK2_SPECIAL(23); break;//test_conv.py:test_valid
case 24: CONV_ROWS_STACK2_SPECIAL(24); break;//test_conv.py:test_valid
case 28: CONV_ROWS_STACK2_SPECIAL(28); break;//test_conv.py:test_valid
case 45: CONV_ROWS_STACK2_SPECIAL(45); break;//test_nnet.py:test_lenet_108
case 58: CONV_ROWS_STACK2_SPECIAL(58); break;//test_nnet.py:test_lenet_108
case 70: CONV_ROWS_STACK2_SPECIAL(70); break;//mobahi_2009.py
case 102: CONV_ROWS_STACK2_SPECIAL(102); break;//test_nnet.py:test_lenet_108
case 116: CONV_ROWS_STACK2_SPECIAL(116); break;//test_nnet.py:test_lenet_256
case 248: CONV_ROWS_STACK2_SPECIAL(248); break;//test_nnet.py:test_lenet_256
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows_stack2__kern_width){
printf("OPTIMISATION HINT: conv_rows_stack{2,3} template default add"
" kern_wid=%d in %s at line %i to have an optimized version"
" for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_rows_stack2__kern_width = true;
}
CONV_ROWS_STACK2_SPECIAL(0);
}
f<<< grid, threads, shared_size >>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
// On failure, fall through to the next implementation in the cascade.
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,(version==9?2:3));
if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
// ---- conv_patch_stack_reduce: 3D thread blocks (out_wid x out_len x
// kern_len/nb_split); partial products are reduced across threads.z, hence the
// requirement below that the output fit at least twice in shared memory. ----
//version 8 is the same but we force the split. The split is need in case we have too much threads. This happen frequently if the kernel length is big. Big kernel is frequent in the gradient.
//version 8 need a minimum of kernel length as we force the split.
//version 8 is needed to test more easily this kernel template parameter.
//version 13 load only 1 kernel row at a time.
if (!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 threads by block
(version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && //version 13 need a minimal kernel length as big as the split.
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //there is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
int full_kern=true;
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
while(ceil_intdiv(kern_len,nb_split)>64)nb_split++;//devices of compute capability 1.3 have a max of 64 threads in z
while(out_size*ceil_intdiv(kern_len,nb_split)>512)nb_split++;
int shared_size=(img_size + kern_size + out_size*kern_len)*sizeof(float);
if(shared_size>=shared_avail){
//if we can't fit the kernel in shared memory, we can split it more.
full_kern=false;
assert((img_size+kern_wid*2+out_size*2)*sizeof(float)<=shared_avail);
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
while(shared_size>=shared_avail || ceil_intdiv(kern_len,nb_split)>64){
nb_split++;
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
}
}
int thread_z=ceil_intdiv(kern_len,nb_split);
assert(thread_z>0);//should not happen, but in case...
assert(shared_size<=shared_avail);
if(!full_kern)
assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
// split/ccontig select the template specialization; ccontig additionally
// requires the unflipped-kernel layout computed earlier in this function.
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_PATCH_STACK_REDUCE_SPECIAL(1); break;
case 2: CONV_PATCH_STACK_REDUCE_SPECIAL(2); break;
case 3: CONV_PATCH_STACK_REDUCE_SPECIAL(3); break;
case 4: CONV_PATCH_STACK_REDUCE_SPECIAL(4); break;
case 5: CONV_PATCH_STACK_REDUCE_SPECIAL(5); break;
case 6: CONV_PATCH_STACK_REDUCE_SPECIAL(6); break;
case 7: CONV_PATCH_STACK_REDUCE_SPECIAL(7); break;
case 8: CONV_PATCH_STACK_REDUCE_SPECIAL(8); break;
case 9: CONV_PATCH_STACK_REDUCE_SPECIAL(9); break;
case 10: CONV_PATCH_STACK_REDUCE_SPECIAL(10); break;
//////// Special cases
case 20: CONV_PATCH_STACK_REDUCE_SPECIAL(20); break;
case 23: CONV_PATCH_STACK_REDUCE_SPECIAL(23); break;//test_nnet.py:test_lenet64
case 24: CONV_PATCH_STACK_REDUCE_SPECIAL(24); break;
case 28: CONV_PATCH_STACK_REDUCE_SPECIAL(28); break;
case 32: CONV_PATCH_STACK_REDUCE_SPECIAL(32); break;//Alex speed demonstration
#endif
//////// default case
default:
if(!msgdisplayed_conv_patch_stack_reduce__kern_width) {
printf("OPTIMISATION HINT: conv_patch_stack_reduce template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_patch_stack_reduce__kern_width = true;
}
CONV_PATCH_STACK_REDUCE_SPECIAL(0);
}
if (verbose) printf("INFO: using 'conv_patch_stack_reduce' version nb_split=%d, preload_full_kern=%d\n",
nb_split,full_kern);
if (verbose>1) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
// Uses the unflipped kernel pointer/strides so the ccontig fast path applies.
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
// On failure, fall through to the next implementation in the cascade.
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z);
if (verbose) printf("INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
// ---- conv_valid_row_reduce: general fallback that supports subsampling and
// arbitrary strides; threads are (stack, kern_row) and partial sums are
// reduced in a shared-memory buffer. ----
if (1 && (version==6||version==-1) &&
!work_complete) //conv_valid_row_reduce
{
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int block_nstack=nstack;
//Max of 512 threads per blocks.
//On old hardware, we have a max of 356 threads as we have only
//8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--;
dim3 n_threads(block_nstack, kern_len, 1);
// Shared-memory reduction buffer: one float per thread.
int n_reduce_buf = block_nstack * kern_len * sizeof(float);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
*
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
* */
int initial_reduce_boundary = (1 << (int)(log2((double)(n_reduce_buf/sizeof(float)))));
if (initial_reduce_boundary == (n_reduce_buf / sizeof(float)))
initial_reduce_boundary >>= 1;
if (n_reduce_buf == sizeof(float))
assert (initial_reduce_boundary == 0);
else
{
assert (initial_reduce_boundary * 2 >= n_reduce_buf/sizeof(float));
assert (initial_reduce_boundary < n_reduce_buf/sizeof(float));
}
void (*f)(int, int, int, int,
int, int, int, int, int,
float*, int, int, int, int,
float*, int, int, int, int,
float*, int, int, int, int,
int, int, int);
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
// Template flag tells the kernel whether the stack dimension was truncated
// (block_nstack < nstack) and must be iterated inside the kernel.
if(block_nstack==nstack)
f=conv_valid_row_reduce<false>;
else
f=conv_valid_row_reduce<true>;
f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],
img_stride_row, img_stride_col,
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols, initial_reduce_boundary);
CNDA_THREAD_SYNC;
// On failure, fall through to the reference implementation below.
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose) printf("INFO: used 'conv_valid_row_reduce' version\n");
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, shared_size=%i, nb_threads=%i\n", n_threads.x, n_threads.y, n_blocks, n_reduce_buf, n_threads.x * n_threads.y);
if (verbose) printf("INFO: impl 'conv_valid_row_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
// ---- conv_reference_valid: last-resort naive implementation (no shared
// memory, no restrictions).  If even this fails, the whole call fails. ----
if (1 && !work_complete) //conv_reference_valid
{
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK);
// Disabled debug dump of all launch parameters.
if (0)
{
if (verbose) printf("INFO: launching conv_reference_valid\n");
if (verbose) printf("  img : %i %i %i %i %p  %i %i %i %i\n",
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose) printf("  kern: %i %i %i %i %p  %i %i %i %i\n",
nkern, nstack, kern_len, kern_wid,
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3]
);
if (verbose) printf("  out : %i %i %i %i %p  %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose) printf(" launch params: %i %i %i\n", outsize, n_blocks, n_threads);
}
conv_reference_valid<<<n_blocks, n_threads>>>( nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose) printf("INFO: used 'conv_reference_valid' version\n");
}
else
{
// No more fallbacks: report the failure to Python.
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed! (%s)",
cudaGetErrorString(sts));
return -1;
}
}
// Success: one of the implementations above completed the convolution.
return 0;
//PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
//return -1;
}
int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
{
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
int work_complete = 0;
if (img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (out->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (0)
{
//TODO: rethink these to use physical / logical dimensions, subsampling, offsets, etc.
assert (CudaNdarray_HOST_DIMS(out)[2] == CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1);
assert (CudaNdarray_HOST_DIMS(out)[3] == CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1);
}
assert (CudaNdarray_HOST_DIMS(out)[0] == CudaNdarray_HOST_DIMS(img)[0]);
assert (CudaNdarray_HOST_DIMS(out)[1] == CudaNdarray_HOST_DIMS(kern)[0]);
assert (CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1]);
const int nstack=CudaNdarray_HOST_DIMS(kern)[1];
const int nbatch=CudaNdarray_HOST_DIMS(img)[0];
const int nkern=CudaNdarray_HOST_DIMS(kern)[0];
const int img_wid=CudaNdarray_HOST_DIMS(img)[3];
const int img_len=CudaNdarray_HOST_DIMS(img)[2];
const int kern_wid=CudaNdarray_HOST_DIMS(kern)[3];
const int kern_len=CudaNdarray_HOST_DIMS(kern)[2];
const int out_wid=CudaNdarray_HOST_DIMS(out)[3];
const int out_len=CudaNdarray_HOST_DIMS(out)[2];
const int img_stride_col= CudaNdarray_HOST_STRIDES(img)[3];
const int img_stride_row=CudaNdarray_HOST_STRIDES(img)[2];
const int img_stride_stack=CudaNdarray_HOST_STRIDES(img)[1];
const int img_stride_batch=CudaNdarray_HOST_STRIDES(img)[0];
const int kern_stride_col= CudaNdarray_HOST_STRIDES(kern)[3];
const int kern_stride_row=CudaNdarray_HOST_STRIDES(kern)[2];
const int kern_stride_stack= CudaNdarray_HOST_STRIDES(kern)[1];
const int kern_stride_nkern=CudaNdarray_HOST_STRIDES(kern)[0];
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
const int out_size=out_len*out_wid;
const int img_size_byte = img_size*sizeof(float);
const int kern_size_byte = kern_size*sizeof(float);
//padded image sizes
const int img_wid_padded=img_wid+2*kern_wid-2;
const int img_len_padded=img_len+2*kern_len-2;
const int img_size_padded=img_len_padded * img_wid_padded;
const int img_size_padded_byte = img_size_padded*sizeof(float);
//const int out_size_byte = out_size*sizeof(float); // unused
bool subsample = subsample_rows!=1 || subsample_cols!=1;
bool img_contiguous = CudaNdarray_is_c_contiguous(img);
bool kern_contiguous = CudaNdarray_is_c_contiguous(kern);
bool out_contiguous = CudaNdarray_is_c_contiguous(out);
bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster)
//the unflipped version of variable have the original value when we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata;
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
if (verbose>1)
{
printf("INFO: Running conv_full version %d with inputs:\n",version);
printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]);
printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]);
}
if (!subsample &&
out_contiguous &&
(version==3||version==4||version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded
{
//version 3 without split
//version 4 with split (more registers)
//version 5 with split (more registers) low mem version(some restriction and still more register)
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing.
if(kern_len==1 && version==5){
            //version 5 doesn't support kern_len==1 as 1%0 returns -1.
version=-1;
if(verbose)printf("WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
}
if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
        //327 because we use 25 registers
        //version 5 will have only 1 block running at a time, so we can use 32 registers per thread, but there is other overhead that forces the limit to be lower than 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more then 1 split as to be always execute.
assert(version!=3 || nb_split==1);
assert(version!=5 || kern_len>1);
assert(version!=-1);
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch,nkern);
int shared_size=img_size_padded_byte + kern_size_byte;
if(version==5)
shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \
if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
else assert(false);
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(1); break;
case 2: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(2); break;
case 3: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(3); break;
case 4: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(4); break;
case 5: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(5); break;//test_conv.py:test_full
case 6: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(6); break;//test_conv.py:test_full
case 7: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(7); break;//test_nnet.py:test_lenet_64
case 8: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(8); break;//test_conv.py:test_full
case 9: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(9); break;//test_nnet.py:test_lenet_256
case 10: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(10); break;//test_conv.py:test_full
case 12: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(12); break;//test_conv.py:test_full
//////// Special cases
case 28: CONV_FULL_PATCH_STACK_PADDED_SPECIAL(28); break;
#endif
//////// default case
default:
if(!msgdisplayed_conv_full_patch_stack__kern_width){
printf("OPTIMISATION HINT: conv_full_patch_stack_padded template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
msgdisplayed_conv_full_patch_stack__kern_width = true;
}
CONV_FULL_PATCH_STACK_PADDED_SPECIAL(0);
}
f<<< grid, threads, shared_size>>>
(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, out_len, nb_split, version);
if (verbose) printf("INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false"));
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, out_len, nb_split, version);
if (verbose) printf("INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n",
version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts));
}
}
if (!subsample && c_contiguous &&
(version==0||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
conv_full_patch<<< grid, threads, shared_size>>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_patch' version\n");
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch);
int shared_size=(img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//typeof(conv_full_load_everything<0>) f = ;
void (*f)(float*, float*, float*,
int, int, int, int, int, int,
int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
switch(nstack)
{
#ifdef UNROLL_LOOP
case 1: f = conv_full_load_everything<1>; break;
//case 10: f = conv_full_load_everything<10>; break;
//case 30: f = conv_full_load_everything<30>; break; //This is actually slower than the general version??
#endif
default:
printf("OPTIMISATION HINT: conv_full_load_everything template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
f = conv_full_load_everything<0>;
};
f<<< grid, threads, shared_size>>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,
CudaNdarray_HOST_STRIDES(img)[3],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[0]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_load_everything' version\n");
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_load_everything' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample &&
img_batch_stack_contiguous &&
out_contiguous &&
(version==2||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int);
if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\
else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<false,true>;\
else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>;
f<<< grid, threads, shared_size>>>(
img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,img_stride_col, img_stride_row,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_patch_stack' version\n");
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (1 && !work_complete) //conv_reference_full
{
if(verbose>1)printf("INFO: will start conv_reference_full\n");
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (0)
{
if (verbose) printf("INFO: launching conv_reference_valid\n");
if (verbose) printf(" img : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose) printf(" kern: %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3]
);
if (verbose) printf(" out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose) printf(" launch params: %i %i %i\n", outsize, n_blocks, n_threads);
if (verbose) printf(" subsample params: %i %i\n", subsample_rows, subsample_cols);
}
conv_reference_full<<<n_blocks, n_threads>>>(CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_reference_full' version ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d nkern=%d nstack=%d subsample=%d\n",
img_len,img_wid, kern_len, kern_wid,
out_len, out_wid, nbatch, nkern, nstack, subsample);
work_complete = true;
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads);
if (verbose) printf("INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed! (%s)",
cudaGetErrorString(sts));
return -1;
}
}
return 0;
}
PyObject *
CudaNdarray_Conv(const CudaNdarray *img, const CudaNdarray * kern,
        CudaNdarray * out, const int mode,
        const int subsample_rows, const int subsample_cols,
        const int version, const int verbose)
{
    //Convolve every image in `img` with every filter in `kern`, writing the
    //result into `out` when it already has the required shape, or into a
    //freshly allocated CudaNdarray otherwise.
    //
    //img:  4-d CudaNdarray
    //kern: 4-d CudaNdarray
    //out:  candidate output buffer; ignored (with a warning) unless it is
    //      4-d, c-contiguous and exactly the expected shape.
    //mode: ConvMode_VALID or ConvMode_FULL.
    //subsample_rows/cols: output subsampling factors, must be >= 1.
    //version: implementation selector forwarded to the conv kernels
    //         (-1 = auto); verbose: debug-print level.
    //
    //Returns a new reference on success, NULL with a Python exception set on
    //failure.
    if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
    if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
    //Reject unknown modes up-front: the dispatch below runs no kernel for any
    //other value and would silently return uninitialized memory.
    if (mode != ConvMode_VALID && mode != ConvMode_FULL)
    {
        PyErr_SetString(PyExc_ValueError, "mode must be ConvMode_VALID or ConvMode_FULL");
        return NULL;
    }
    //ceil_intdiv below divides by the subsample factors; 0 would crash and a
    //negative factor makes no sense.
    if (subsample_rows < 1 || subsample_cols < 1)
    {
        PyErr_SetString(PyExc_ValueError, "subsample factors must be >= 1");
        return NULL;
    }

    //Output shape is (nbatch, nkern, out_rows, out_cols).
    int out_dim[4];
    out_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
    out_dim[1] = CudaNdarray_HOST_DIMS(kern)[0];
    int logical_rows, logical_cols;
    if (mode == ConvMode_VALID)
    {
        //NOTE(review): a kernel larger than the image yields non-positive
        //dims here; presumably rejected downstream in NewDims/conv_valid —
        //confirm.
        logical_rows = CudaNdarray_HOST_DIMS(img)[2] - CudaNdarray_HOST_DIMS(kern)[2] + 1;
        logical_cols = CudaNdarray_HOST_DIMS(img)[3] - CudaNdarray_HOST_DIMS(kern)[3] + 1;
    }
    else
    {
        logical_rows = CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1;
        logical_cols = CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1;
    }
    out_dim[2] = ceil_intdiv(logical_rows, subsample_rows);
    out_dim[3] = ceil_intdiv(logical_cols, subsample_cols);

    //Reuse the caller-supplied `out` only when it matches exactly; otherwise
    //allocate a new output (warning the caller that `out` is ignored).
    CudaNdarray * rval = out;
    if(!(out && out->nd==4 && CudaNdarray_is_c_contiguous(out)
	 && CudaNdarray_HOST_DIMS(out)[0]==out_dim[0]
	 && CudaNdarray_HOST_DIMS(out)[1]==out_dim[1]
	 && CudaNdarray_HOST_DIMS(out)[2]==out_dim[2]
	 && CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])){
        if (out)
        {
            fprintf(stderr, "Warning: Conv is ignoring 'out' argument with wrong structure.\n");
        }
        rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
    }
    //conv_valid/conv_full return non-zero on failure (with a Python exception
    //already set).
    if ((rval==NULL)
            || ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
            || ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
            )
    {
        // if rval is something we just allocated,
        // and there was a problem, then we have to free it.
        if (rval != out) Py_XDECREF(rval);
        return NULL;
    }

    //TODO: Get refcount story clearer!
    //      This function does a weird thing as work-around with Conv_VARARGS
    //On success we always hand back a new reference: a freshly allocated rval
    //already owns one, a reused `out` needs an extra INCREF.
    if (rval == out) Py_INCREF(rval);
    return (PyObject*)rval;
}
......@@ -26,6 +26,16 @@ typedef float real;
#endif
#ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024)
#endif
/**
 * Integer ceiling division: for positive operands, the smallest integer q
 * with q*b >= a.
 *
 * Written as truncating division plus a remainder correction, so it cannot
 * overflow the way the (a + b - 1) / b form can for values of `a` near the
 * top of T's range.
 */
template <typename T>
static T ceil_intdiv(T a, T b)
{
    T quotient = a / b;
    if (a % b != 0)
        ++quotient;
    return quotient;
}
/**
* struct CudaNdarray
*
......@@ -408,14 +418,6 @@ int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A);
int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern);
enum { ConvMode_FULL, ConvMode_VALID };
PyObject * CudaNdarray_Conv(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);
// Convenience overload: convolution with no subsampling, automatic
// implementation selection (version = -1) and verbosity off.
// NOTE(review): defined (not just declared) here — presumably this lives in
// exactly one translation unit; confirm linkage if this is a shared header.
PyObject * CudaNdarray_Conv(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, const int mode)
{
    const int no_subsample = 1;
    const int auto_version = -1;
    const int quiet = 0;
    return CudaNdarray_Conv(img, kern, out, mode,
                            no_subsample, no_subsample,
                            auto_version, quiet);
}
int CudaNdarray_conv(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, const int mode);
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{
fprintf(fd, "CudaNdarray <%p, %p> nd=%i \n", self, self->devdata, self->nd);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论