提交 41daf4a8 authored 作者: Sean Lee's avatar Sean Lee

Use the CUDA Driver API for conv operations

上级 0d5cffbe
...@@ -10,12 +10,6 @@ PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern, ...@@ -10,12 +10,6 @@ PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
const size_t subsample_cols, const size_t subsample_cols,
const int version, const int verbose); const int version, const int verbose);
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
/* /*
* version: -1, autodetect, >=0 a specific version to use. * version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation * If it can't be executed, we revert to the reference implementation
...@@ -108,6 +102,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -108,6 +102,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//TODO: make a parameter the number of division //TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead? //TODO: Should we make them in separate grid block instead?
const int stack_len = PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1]; const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0]; const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0]; const int nkern=PyGpuArray_DIMS(kern)[0];
...@@ -126,6 +121,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -126,6 +121,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4; const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4; const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4; const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid; const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid; const int kern_size=kern_len*kern_wid;
...@@ -156,16 +155,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -156,16 +155,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we don't need to unflip it, but have the new value when we unflip it. //we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true; bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d; bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
const float * kern_data_unflipped = cuda_get_ptr(kern); if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
//the last two dimensions are c_contiguous but flipped! //the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false; kern_flipped=false;
kern_contiguous_2d_unflipped = true; kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
} }
//if we remove the restriction //if we remove the restriction
...@@ -195,43 +188,47 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -195,43 +188,47 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++; nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
dim3 grid(nbatch, nkern); (size_t)1};
int shared_size=(img_size + kern_size)*sizeof(float); size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
void (*f)(const float*, const float*, float*,
int, int, int, int, size_t shmem_sz = (img_size + kern_size)*sizeof(float);
int, int);
GpuKernel *k = NULL;
if(threads.y==out_len) f=conv_patch_2; if(threads_per_block[1]==out_len) k=&conv_patch_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else f=conv_patch_3; else k=&conv_patch_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
f<<< grid, threads, shared_size>>> void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out), (void *)kern->ga.data, (void *)&kern->ga.offset,
img_len, img_wid, kern_len, kern_wid, nkern, nstack); (void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
cudaError_t sts = cudaGetLastError(); (void *)&kern_len, (void *)&kern_wid,
if (cudaSuccess == sts) (void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n", "INFO: used 'conv_patch' version %s nb_split=%d\n",
threads.y==out_len ? "no split": "split", nb_split); threads_per_block[1]==out_len ? "no split": "split", nb_split);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i\n", " n_blocks[0]=%i, n_blocks[1]=%i,"
threads.x, threads.y, grid.x, grid.y, " shmem_sz=%i, nb_threads=%i, nb_split=%i\n",
shared_size, threads.x * threads.y, nb_split); threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1], nb_split);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s)," "INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
...@@ -250,75 +247,77 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -250,75 +247,77 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing. if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)ceil_intdiv(out_len,nb_split),
(size_t)1};
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail; bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false; if(version==11 || version==12) preload_full_kernel=false;
dim3 grid(nbatch,nkern); size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float); size_t shmem_sz = (img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(const float*, const float*, float*, GpuKernel *k = NULL;
int, int, int, int, if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_64_node_<<<<HASH_PLACEHOLDER>>>>_0;}
int, int, int, int, else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_65_node_<<<<HASH_PLACEHOLDER>>>>_0;}
int, int, int, int, else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_66_node_<<<<HASH_PLACEHOLDER>>>>_0;}
int, int, int, int, else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_67_node_<<<<HASH_PLACEHOLDER>>>>_0;}
int, int); else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_68_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_69_node_<<<<HASH_PLACEHOLDER>>>>_0;}
if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_64;} else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_70_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_65;} else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_71_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_66;} else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_72_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_67;} else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_73_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_68;} else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_74_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_69;} else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_75_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_70;} else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_76_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_71;} else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_77_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_72;} else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_78_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_73;} else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_79_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_74;} else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_80_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_75;} else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_81_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_76;} else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_82_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_77;} else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_83_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_78;} else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_84_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_79;} else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_85_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_80;} else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_86_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_81;} else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_87_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_82;} else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_88_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_83;} else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_89_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_84;} else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_90_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_85;} else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_91_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_86;} else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_92_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_87;} else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_93_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_88;} else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_94_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_89;} else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_95_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_90;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_91;} void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_92;} (void *)kern->ga.data, (void *)&kern->ga.offset,
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_93;} (void *)out->ga.data, (void *)&out->ga.offset,
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_94;} (void *)&img_len, (void *)&img_wid,
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_95;} (void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
f<<< grid, threads, shared_size>>> (void *)&nkern, (void *)&nstack,
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out), (void *)&img_stride_col, (void *)&img_stride_row,
img_len, img_wid, kern_len, kern_wid, (void *)&img_stride_stack, (void *)&img_stride_batch,
out_len, out_wid, nkern, nstack, (void *)&kern_stride_col, (void *)&kern_stride_row,
img_stride_col, img_stride_row, img_stride_stack, (void *)&kern_stride_stack, (void *)&kern_stride_nkern,
img_stride_batch, kern_stride_col, kern_stride_row, (void *)&subsample_rows, (void *)&subsample_cols};
kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (err == GA_NO_ERROR)
{ {
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i," " shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i," " kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i," " img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i," " kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i," " preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n", " subsample_rows=%llu, subsample_cols=%llu\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y, shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel, nb_split, preload_full_kernel,
(unsigned long long)subsample_rows, (unsigned long long)subsample_rows,
...@@ -337,15 +336,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -337,15 +336,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i," " shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false," " kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i," " kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i," " kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i," " preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n", " subsample_rows=%llu, subsample_cols=%llu\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y, shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel, nb_split, preload_full_kernel,
(unsigned long long)subsample_rows, (unsigned long long)subsample_rows,
...@@ -354,7 +353,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -354,7 +353,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s)," "INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
...@@ -366,28 +365,28 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -366,28 +365,28 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
!work_complete) //conv_rows !work_complete) //conv_rows
{ {
dim3 threads(out_wid); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)1, (size_t)1};
dim3 grid(out_len, nbatch*nkern); size_t n_blocks[3] = {(size_t)out_len, (size_t)nbatch*nkern, (size_t)1};
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float); size_t shmem_sz = (kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int, GpuKernel *k = NULL;
int, int, int, int, if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int, else k=&conv_rows_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int);
void *kernel_params[] = {
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_0; (void *)img->ga.data, (void *)&img->ga.offset,
else f = conv_rows_1; (void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
f<<< grid, threads, shared_size >>> (void *)&img_len, (void *)&img_wid,
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out), (void *)&kern_len, (void *)&kern_wid,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, (void *)&nkern, (void *)&nstack,
img_stride_col, img_stride_row, (void *)&img_stride_col, (void *)&img_stride_row,
img_stride_stack,img_stride_batch, (void *)&img_stride_stack, (void *)&img_stride_batch,
kern_stride_col, kern_stride_row, (void *)&kern_stride_col, (void *)&kern_stride_row,
kern_stride_stack, kern_stride_nkern); (void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (err == GA_NO_ERROR)
{ {
work_complete = true; work_complete = true;
if (verbose) if (verbose)
...@@ -397,15 +396,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -397,15 +396,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y); shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s)," "INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
...@@ -423,52 +422,50 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -423,52 +422,50 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_row=i; nb_row=i;
} }
dim3 threads(out_wid,nb_row); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern); size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float); size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
if (0) if (0)
fprintf(stderr, fprintf(stderr,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n", "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
img_contiguous_2d, kern_contiguous_2d, img_contiguous_2d, kern_contiguous_2d,
threads.x, threads.y, threads.z, threads_per_block[0], threads_per_block[1], threads_per_block[2],
grid.x, grid.y, grid.z); n_blocks[0], n_blocks[1], n_blocks[2]);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) { if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n"); //fprintf(stderr, "using false version\n");
f = conv_rows_stack_0; k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
} else { } else {
//fprintf(stderr, "using true version\n"); //fprintf(stderr, "using true version\n");
f = conv_rows_stack_1; k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
} }
f<<< grid, threads, shared_size >>> void *kernel_params[] = {
(cuda_get_ptr(img), (void *)img->ga.data, (void *)&img->ga.offset,
cuda_get_ptr(kern), (void *)kern->ga.data, (void *)&kern->ga.offset,
cuda_get_ptr(out), (void *)out->ga.data, (void *)&out->ga.offset,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, (void *)&img_len, (void *)&img_wid,
img_stride_col, img_stride_row, (void *)&kern_len, (void *)&kern_wid,
img_stride_stack,img_stride_batch, (void *)&nkern, (void *)&nstack,
kern_stride_col, kern_stride_row, (void *)&img_stride_col, (void *)&img_stride_row,
kern_stride_stack, kern_stride_nkern); (void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
cudaError_t sts = cudaGetLastError(); (void *)&kern_stride_stack, (void *)&kern_stride_nkern};
if (cudaSuccess == sts) int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{ {
work_complete = true; work_complete = true;
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y); shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n"); fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
} }
...@@ -476,15 +473,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -476,15 +473,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y); shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s)," "INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
...@@ -517,42 +514,41 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -517,42 +514,41 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//to test the case when we don't have a thread by output pixel. //to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--; if((version_back!=-1)&& nb_row>1) nb_row--;
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
int shared_size=(threads.y*img_wid + k_size)*sizeof(float); size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*, GpuKernel *k = NULL;
int, int, int, int, if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) k=&conv_rows_stack2_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int, else if(version==9) k=&conv_rows_stack2_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int, else if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_stack2_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int); else k=&conv_rows_stack2_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2_1; void *kernel_params[] = {
else if(version==9) f = conv_rows_stack2_3; (void *)img->ga.data, (void *)&img->ga.offset,
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2_0; (void *)kern->ga.data, (void *)&kern->ga.offset,
else f = conv_rows_stack2_2; (void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
f<<< grid, threads, shared_size >>> (void *)&kern_len, (void *)&kern_wid,
(cuda_get_ptr(img), (void *)&nkern, (void *)&nstack,
cuda_get_ptr(kern), (void *)&img_stride_col, (void *)&img_stride_row,
cuda_get_ptr(out), (void *)&img_stride_stack, (void *)&img_stride_batch,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, (void *)&kern_stride_col, (void *)&kern_stride_row,
img_stride_col, img_stride_row, (void *)&kern_stride_stack, (void *)&kern_stride_nkern};
img_stride_stack,img_stride_batch, int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern); if (err == GA_NO_ERROR)
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y); shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with" "INFO: used 'conv_rows_stack2' version %s with"
...@@ -564,15 +560,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -564,15 +560,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i version=%d\n", " shmem_sz=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y,(version==9?2:3)); shmem_sz, threads_per_block[0] * threads_per_block[1],(version==9?2:3));
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s)," "INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
...@@ -619,18 +615,18 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -619,18 +615,18 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_split++; nb_split++;
// tentative estimates (prior to contraint c) // tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split); size_t thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern size_t shmem_sz = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z) ? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z)); : std::max((size_t)img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c) // constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){ while ((shmem_sz >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more. //if we can't fit the kernel in shared memory, we must split it more.
nb_split++; nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split); thread_z=ceil_intdiv(kern_len,nb_split);
shared_size = sizeof(float)*(full_kern shmem_sz = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z) ? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z)); : std::max(img_size + thread_z*kern_wid, out_size*thread_z));
} }
if (nb_split <= kern_len) if (nb_split <= kern_len)
...@@ -638,15 +634,12 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -638,15 +634,12 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert(thread_z>0);//should not happen, but in case... assert(thread_z>0);//should not happen, but in case...
if(!full_kern) assert(thread_z!=kern_len); if(!full_kern) assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z); size_t threads_per_block[3] = {(size_t)out_wid,
dim3 grid(nbatch,nkern); (size_t)out_len,
(size_t)thread_z};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
void (*f)(const float*, const float*, float*, GpuKernel *k = NULL;
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
const bool split=thread_z!=kern_len; const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped; const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
...@@ -654,40 +647,46 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -654,40 +647,46 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern); //printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel //We will always be split when we don't load the full kernel
/* if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_0;*/ /* if(!kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
/*else*/ if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce_1; /*else*/ if(!kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce_2; else if(!kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce_3; else if(!kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce_4;*/ /*else if(!kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_4_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce_5; else if(!kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce_6; else if(!kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce_7; else if(!kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_7_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_8;*/ /*else if(kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_8_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce_9; else if(kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce_10; else if(kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce_11; else if(kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_11_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce_12;*/ /*else if(kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_12_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce_13; else if(kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce_14; else if(kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce_15; else if(kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_15_node_<<<<HASH_PLACEHOLDER>>>>_0;
f<<< grid, threads, shared_size>>>(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out), void *kernel_params[] = {
img_len, img_wid, kern_len, kern_wid, (void *)img->ga.data, (void *)&img->ga.offset,
nkern, nstack, (void *)kern->ga.data, (void *)&kern->ga.offset,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, (void *)out->ga.data, (void *)&out->ga.offset,
kern_stride_col_unflipped, kern_stride_row_unflipped, (void *)&img_len, (void *)&img_wid,
kern_stride_stack, kern_stride_nkern); (void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
cudaError_t sts = cudaGetLastError(); (void *)&img_stride_col, (void *)&img_stride_row,
if (cudaSuccess == sts) (void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col,
(void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{ {
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i, " "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i, "
"grid.x=%i, grid.y=%i, shared_size=%i," "n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i,"
" nb_threads=%i\n", " nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y, threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y * threads.z); shmem_sz, threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version" "INFO: used 'conv_patch_stack_reduce' version"
...@@ -700,17 +699,17 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -700,17 +699,17 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i," " n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i,"
" nb_threads=%i\n", " nb_threads=%i\n",
threads.x, threads.y, threads.z, threads_per_block[0], threads_per_block[1], threads_per_block[2],
grid.x, grid.y, shared_size, n_blocks[0], n_blocks[1], shmem_sz,
threads.x * threads.y * threads.z); threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s)," "INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} // else no good nb_splits was found } // else no good nb_splits was found
} }
...@@ -719,8 +718,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -719,8 +718,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
kern_len<=320 && kern_len<=320 &&
!work_complete) //conv_valid_row_reduce !work_complete) //conv_valid_row_reduce
{ {
int outsize = PyGpuArray_SIZE(out); size_t outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096); size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
int block_nstack=nstack; int block_nstack=nstack;
//Max of 512 threads per blocks. //Max of 512 threads per blocks.
...@@ -728,9 +728,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -728,9 +728,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//8k registers and the kernel use 23 register //8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register... //TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--; while(block_nstack*kern_len>320)block_nstack--;
dim3 n_threads(block_nstack, kern_len, 1); size_t threads_per_block[3] = {(size_t)block_nstack, (size_t)kern_len, (size_t)1};
int n_reduce_buf = block_nstack * kern_len * sizeof(float); size_t n_reduce_buf = block_nstack * kern_len * sizeof(float);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float) /* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
* *
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0. * if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
...@@ -747,39 +747,34 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -747,39 +747,34 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert (initial_reduce_boundary < n_reduce_buf/sizeof(float)); assert (initial_reduce_boundary < n_reduce_buf/sizeof(float));
} }
GpuKernel *k = NULL;
void (*f)(int, int, int, int,
int, int, int, int, int,
const float*, int, int, int, int,
const float*, int, int, int, int,
float*, int, int, int, int,
int, int, int);
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n"; //std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n"; //std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n"; //std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack) if(block_nstack==nstack)
f=conv_valid_row_reduce_0; k=&conv_valid_row_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else else
f=conv_valid_row_reduce_1; k=&conv_valid_row_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, PyGpuArray_DIMS(img)[1], void *kernel_params[] = {
img_len, img_wid, (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
kern_len, kern_wid, (void *)&img_len, (void *)&img_wid,
out_len, out_wid, (void *)&kern_len, (void *)&kern_wid,
cuda_get_ptr(img), (void *)&out_len, (void *)&out_wid,
PyGpuArray_STRIDES(img)[0]/4, PyGpuArray_STRIDES(img)[1]/4, (void *)img->ga.data, (void *)&img->ga.offset,
img_stride_row, img_stride_col, (void *)&img_stride_batch, (void *)&img_stride_stack,
cuda_get_ptr(kern), (void *)&img_stride_row, (void *)&img_stride_col,
PyGpuArray_STRIDES(kern)[0]/4, PyGpuArray_STRIDES(kern)[1]/4, (void *)kern->ga.data, (void *)&kern->ga.offset,
PyGpuArray_STRIDES(kern)[2]/4, PyGpuArray_STRIDES(kern)[3]/4, (void *)&kern_stride_nkern, (void *)&kern_stride_stack,
cuda_get_ptr(out), (void *)&kern_stride_row, (void *)&kern_stride_col,
PyGpuArray_STRIDES(out)[0]/4, PyGpuArray_STRIDES(out)[1]/4, (void *)out->ga.data, (void *)&out->ga.offset,
PyGpuArray_STRIDES(out)[2]/4, PyGpuArray_STRIDES(out)[3]/4, (void *)&out_stride_batch, (void *)&out_stride_nkern,
subsample_rows, subsample_cols, initial_reduce_boundary); (void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols,
cudaError_t sts = cudaGetLastError(); (void *)&initial_reduce_boundary};
if (cudaSuccess == sts) int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, n_reduce_buf, kernel_params);
if (err == GA_NO_ERROR)
{ {
work_complete = true; work_complete = true;
if (verbose) if (verbose)
...@@ -789,24 +784,27 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -789,24 +784,27 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
n_threads.x, n_threads.y, n_blocks, threads_per_block[0], threads_per_block[1], n_blocks[0],
n_reduce_buf, n_threads.x * n_threads.y); n_reduce_buf, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s)," "INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
if (1 && !work_complete) //conv_reference_valid if (1 && !work_complete) //conv_reference_valid
{ {
int outsize = PyGpuArray_SIZE(out); size_t outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096); size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), (size_t)1, (size_t)1};
256); size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (1) if (1)
{ {
if (verbose) if (verbose)
...@@ -814,61 +812,56 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -814,61 +812,56 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (verbose>1) if (verbose>1)
fprintf(stderr, " img : %i %llu %i %i %p " fprintf(stderr, " img : %i %llu %i %i %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
nbatch, (unsigned long long)PyGpuArray_DIMS(img)[1], nbatch, (unsigned long long)stack_len, img_len, img_wid,
img_len, img_wid, (void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
cuda_get_ptr(img), (long long)img_stride_batch,
(long long)PyGpuArray_STRIDES(img)[0]/4, (long long)img_stride_stack,
(long long)PyGpuArray_STRIDES(img)[1]/4, (long long)img_stride_row,
(long long)PyGpuArray_STRIDES(img)[2]/4, (long long)img_stride_col);
(long long)PyGpuArray_STRIDES(img)[3]/4);
if (verbose>1) if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p " fprintf(stderr, " kern: %i %i %i %i %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
nkern, nstack, kern_len, kern_wid, nkern, nstack, kern_len, kern_wid,
cuda_get_ptr(kern), (void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)PyGpuArray_STRIDES(kern)[0]/4, (long long)kern_stride_nkern,
(long long)PyGpuArray_STRIDES(kern)[1]/4, (long long)kern_stride_stack,
(long long)PyGpuArray_STRIDES(kern)[2]/4, (long long)kern_stride_row,
(long long)PyGpuArray_STRIDES(kern)[3]/4); (long long)kern_stride_col);
if (verbose>1) if (verbose>1)
fprintf(stderr, " out : %llu %llu %i %i %p " fprintf(stderr, " out : %llu %llu %i %i %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0], (unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1], (unsigned long long)PyGpuArray_DIMS(out)[1],
out_len, out_wid, out_len, out_wid,
cuda_get_ptr(out), (void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)PyGpuArray_STRIDES(out)[0]/4, (long long)out_stride_batch,
(long long)PyGpuArray_STRIDES(out)[1]/4, (long long)out_stride_nkern,
(long long)PyGpuArray_STRIDES(out)[2]/4, (long long)out_stride_row,
(long long)PyGpuArray_STRIDES(out)[3]/4); (long long)out_stride_col);
if (verbose>1) if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n", fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads); outsize, n_blocks[0], threads_per_block[0]);
} }
conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
PyGpuArray_DIMS(img)[1], void *kernel_params[] = {
img_len, img_wid, (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
kern_len, kern_wid, (void *)&img_len, (void *)&img_wid,
out_len, out_wid, (void *)&kern_len, (void *)&kern_wid,
cuda_get_ptr(img), (void *)&out_len, (void *)&out_wid,
PyGpuArray_STRIDES(img)[0]/4, (void *)img->ga.data, (void *)&img->ga.offset,
PyGpuArray_STRIDES(img)[1]/4, (void *)&img_stride_batch, (void *)&img_stride_stack,
PyGpuArray_STRIDES(img)[2]/4, (void *)&img_stride_row, (void *)&img_stride_col,
PyGpuArray_STRIDES(img)[3]/4, (void *)kern->ga.data, (void *)&kern->ga.offset,
cuda_get_ptr(kern), (void *)&kern_stride_nkern, (void *)&kern_stride_stack,
PyGpuArray_STRIDES(kern)[0]/4, (void *)&kern_stride_row, (void *)&kern_stride_col,
PyGpuArray_STRIDES(kern)[1]/4, (void *)out->ga.data, (void *)&out->ga.offset,
PyGpuArray_STRIDES(kern)[2]/4, (void *)&out_stride_batch, (void *)&out_stride_nkern,
PyGpuArray_STRIDES(kern)[3]/4, (void *)&out_stride_row, (void *)&out_stride_col,
cuda_get_ptr(out), (void *)&subsample_rows, (void *)&subsample_cols};
PyGpuArray_STRIDES(out)[0]/4, int err = GpuKernel_call(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0,
PyGpuArray_STRIDES(out)[1]/4, 3, threads_per_block, n_blocks, 0, kernel_params);
PyGpuArray_STRIDES(out)[2]/4,
PyGpuArray_STRIDES(out)[3]/4, if (err == GA_NO_ERROR)
subsample_rows, subsample_cols);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) if (verbose)
...@@ -881,7 +874,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -881,7 +874,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for" "ERROR: all implementations failed for"
" PyGpuArray_conv_valid! (%s)", " PyGpuArray_conv_valid! (%s)",
cudaGetErrorString(sts)); GpuKernel_error(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1; return -1;
} }
} }
...@@ -930,6 +923,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -930,6 +923,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]); assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]);
assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]); assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]);
const int stack_len=PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1]; const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0]; const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0]; const int nkern=PyGpuArray_DIMS(kern)[0];
...@@ -948,6 +942,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -948,6 +942,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4; const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4; const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4; const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid; const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid; const int kern_size=kern_len*kern_wid;
...@@ -990,16 +988,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -990,16 +988,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
//we don't need to unflip it, but have the new value when we unflip it. //we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true; bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d; bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
const float * kern_data_unflipped = cuda_get_ptr(kern); if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
//the last two dimensions are c_contiguous but flipped! //the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false; kern_flipped=false;
kern_contiguous_2d_unflipped = true; kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
} }
if (verbose>1) if (verbose>1)
...@@ -1008,34 +1000,34 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1008,34 +1000,34 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID); " MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
printf("INFO: img dim: %llu %llu %llu %llu " printf("INFO: img dim: %llu %llu %llu %llu "
"img stride: %lld %lld %lld %lld\n", "img stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0], (unsigned long long)nbatch,
(unsigned long long)PyGpuArray_DIMS(img)[1], (unsigned long long)stack_len,
(unsigned long long)PyGpuArray_DIMS(img)[2], (unsigned long long)img_len,
(unsigned long long)PyGpuArray_DIMS(img)[3], (unsigned long long)img_wid,
(long long)PyGpuArray_STRIDES(img)[0]/4, (long long)img_stride_batch,
(long long)PyGpuArray_STRIDES(img)[1]/4, (long long)img_stride_stack,
(long long)PyGpuArray_STRIDES(img)[2]/4, (long long)img_stride_row,
(long long)PyGpuArray_STRIDES(img)[3]/4); (long long)img_stride_col);
printf("INFO: kern dim: %llu %llu %llu %llu " printf("INFO: kern dim: %llu %llu %llu %llu "
"kern stride: %lld %lld %lld %lld\n", "kern stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0], (unsigned long long)nkern,
(unsigned long long)PyGpuArray_DIMS(kern)[1], (unsigned long long)nstack,
(unsigned long long)PyGpuArray_DIMS(kern)[2], (unsigned long long)kern_len,
(unsigned long long)PyGpuArray_DIMS(kern)[3], (unsigned long long)kern_wid,
(long long)PyGpuArray_STRIDES(kern)[0]/4, (long long)kern_stride_nkern,
(long long)PyGpuArray_STRIDES(kern)[1]/4, (long long)kern_stride_stack,
(long long)PyGpuArray_STRIDES(kern)[2]/4, (long long)kern_stride_row,
(long long)PyGpuArray_STRIDES(kern)[3]/4); (long long)kern_stride_col);
printf("INFO: out dim: %llu %llu %llu %llu " printf("INFO: out dim: %llu %llu %llu %llu "
"out stride: %lld %lld %lld %lld\n", "out stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0], (unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1], (unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2], (unsigned long long)out_len,
(unsigned long long)PyGpuArray_DIMS(out)[3], (unsigned long long)out_wid,
(long long)PyGpuArray_STRIDES(out)[0]/4, (long long)out_stride_batch,
(long long)PyGpuArray_STRIDES(out)[1]/4, (long long)out_stride_nkern,
(long long)PyGpuArray_STRIDES(out)[2]/4, (long long)out_stride_row,
(long long)PyGpuArray_STRIDES(out)[3]/4); (long long)out_stride_col);
} }
if (!subsample && if (!subsample &&
...@@ -1082,50 +1074,53 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1082,50 +1074,53 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert(version!=5 || kern_len>1); assert(version!=5 || kern_len>1);
assert(version!=-1); assert(version!=-1);
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); size_t threads_per_block[3] = {(size_t)out_wid,
dim3 grid(nbatch,nkern); ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
int shared_size=img_size_padded_byte + kern_size_byte; size_t shmem_sz=img_size_padded_byte + kern_size_byte;
if(version==5) if(version==5)
shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte; shmem_sz=((kern_len+threads_per_block[1]-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
void (*f)(const float*, const float*, float*,
int, int, int, int, GpuKernel *k = NULL;
int, int, int, int, if(version==3) k=&conv_full_patch_stack_padded_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int, else if(version==5) k=&conv_full_patch_stack_padded_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int); else if(version==4) k=&conv_full_patch_stack_padded_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) k=&conv_full_patch_stack_padded_4_node_<<<<HASH_PLACEHOLDER>>>>_0;
if(version==3) f=conv_full_patch_stack_padded_0; else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) k=&conv_full_patch_stack_padded_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5) f=conv_full_patch_stack_padded_1; else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) k=&conv_full_patch_stack_padded_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4) f=conv_full_patch_stack_padded_2; else if(version==3 && kern_flipped) k=&conv_full_patch_stack_padded_8_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded_4; else if(version==5 && kern_flipped)k=&conv_full_patch_stack_padded_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded_5; else if(version==4 && kern_flipped)k=&conv_full_patch_stack_padded_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded_6; else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) k=&conv_full_patch_stack_padded_12_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded_8; else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) k=&conv_full_patch_stack_padded_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded_9; else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) k=&conv_full_patch_stack_padded_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded_10;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded_12;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded_13;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded_14;
else assert(false); else assert(false);
f<<< grid, threads, shared_size>>> void *kernel_params[] = {
(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out), (void *)img->ga.data, (void *)&img->ga.offset,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, (void *)kern->ga.data, (void *)&kern->ga.offset,
img_stride_col, img_stride_row, img_stride_stack, (void *)out->ga.data, (void *)&out->ga.offset,
img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped, (void *)&img_len, (void *)&img_wid,
kern_stride_stack, kern_stride_nkern); (void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
cudaError_t sts = cudaGetLastError(); (void *)&img_stride_col, (void *)&img_stride_row,
if (cudaSuccess == sts) (void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{ {
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," " n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n", " out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z, threads_per_block[0], threads_per_block[1], threads_per_block[2],
grid.x, grid.y, shared_size, n_blocks[0], n_blocks[1], shmem_sz,
threads.x * threads.y * threads.z, threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version); out_len, nb_split, version);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
...@@ -1138,12 +1133,12 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1138,12 +1133,12 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i," " n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n", " out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z, threads_per_block[0], threads_per_block[1], threads_per_block[2],
grid.x, grid.y, shared_size, n_blocks[0], n_blocks[1], shmem_sz,
threads.x * threads.y * threads.z, threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version); out_len, nb_split, version);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
...@@ -1151,7 +1146,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1151,7 +1146,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" failed (%s), trying next implementation\n", " failed (%s), trying next implementation\n",
version==3?"no split": "split", version==3?"no split": "split",
(version==5?"low_mem":"not_low_mem"), (version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
...@@ -1162,21 +1157,22 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1162,21 +1157,22 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch !work_complete) //conv_full_patch
{ {
dim3 threads(out_wid, out_len); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
dim3 grid(nbatch,nkern); size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
int shared_size=(img_size + kern_size)*sizeof(float); size_t shmem_sz = (img_size + kern_size)*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions. //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
conv_full_patch<<< grid, threads, shared_size>>> void *kernel_params[] = {
(cuda_get_ptr(img), (void *)img->ga.data, (void *)&img->ga.offset,
cuda_get_ptr(kern), (void *)kern->ga.data, (void *)&kern->ga.offset,
cuda_get_ptr(out), (void *)out->ga.data, (void *)&out->ga.offset,
img_len, img_wid, (void *)&img_len, (void *)&img_wid,
kern_len, kern_wid, (void *)&kern_len, (void *)&kern_wid,
nkern, nstack); (void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0,
cudaError_t sts = cudaGetLastError(); 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (cudaSuccess == sts)
if (err == GA_NO_ERROR)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n"); if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n");
work_complete = true; work_complete = true;
...@@ -1185,15 +1181,15 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1185,15 +1181,15 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads.x * threads.y); threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s)," "INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
} }
} }
if (false && !subsample && //disabled as test fail for this kernel if (false && !subsample && //disabled as test fail for this kernel
...@@ -1203,35 +1199,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1203,35 +1199,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything !work_complete) //conv_full_load_everything
{ {
dim3 threads(out_wid, out_len); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
dim3 grid(nbatch); size_t n_blocks[3] = {(size_t)nbatch, (size_t)1, (size_t)1};
int shared_size=(img_size + kern_size)*nstack*sizeof(float); size_t shmem_sz = (img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions. //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//typeof(conv_full_load_everything<0>) f = ; void *kernel_params[] = {
void (*f)(const float*, const float*, float*, (void *)img->ga.data, (void *)&img->ga.offset,
int, int, int, int, int, int, (void *)kern->ga.data, (void *)&kern->ga.offset,
int, int, int, int, int, int, int, int) = conv_full_load_everything; (void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
f<<< grid, threads, shared_size>>> (void *)&kern_len, (void *)&kern_wid,
(cuda_get_ptr(img), (void *)&nkern, (void *)&nstack,
cuda_get_ptr(kern), (void *)&img_stride_col, (void *)&img_stride_row,
cuda_get_ptr(out), (void *)&img_stride_stack, (void *)&img_stride_batch,
img_len, img_wid, (void *)&kern_stride_col, (void *)&kern_stride_row,
kern_len, kern_wid, (void *)&kern_stride_stack, (void *)&kern_stride_nkern};
nkern, nstack, int err = GpuKernel_call(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0,
PyGpuArray_STRIDES(img)[3]/4, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
PyGpuArray_STRIDES(img)[2]/4,
PyGpuArray_STRIDES(img)[1]/4, if (err == GA_NO_ERROR)
PyGpuArray_STRIDES(img)[0]/4,
PyGpuArray_STRIDES(kern)[3]/4,
PyGpuArray_STRIDES(kern)[2]/4,
PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[0]/4
);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n"); if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n");
work_complete = true; work_complete = true;
...@@ -1240,14 +1227,14 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1240,14 +1227,14 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads.x * threads.y); threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'" fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n", " failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
} }
} }
...@@ -1259,32 +1246,29 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1259,32 +1246,29 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack !work_complete) //conv_full_patch_stack
{ {
dim3 threads(out_wid, out_len); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
dim3 grid(nbatch,nkern); size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
int shared_size=(img_size + kern_size)*sizeof(float); size_t shmem_sz = (img_size + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*, GpuKernel *k = NULL;
int, int, int, int, if(!img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int, else if(!img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int); else if(img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack_0;
else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack_1; void *kernel_params[] = {
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack_2; (void *)img->ga.data, (void *)&img->ga.offset,
else if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack_3; (void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
f<<< grid, threads, shared_size>>>( (void *)&img_len, (void *)&img_wid,
cuda_get_ptr(img), (void *)&kern_len, (void *)&kern_wid,
cuda_get_ptr(kern), (void *)&nkern, (void *)&nstack,
cuda_get_ptr(out), (void *)&img_stride_col, (void *)&img_stride_row,
img_len, img_wid, (void *)&kern_stride_col, (void *)&kern_stride_row,
kern_len, kern_wid, (void *)&kern_stride_stack, (void *)&kern_stride_nkern};
nkern, nstack,img_stride_col, img_stride_row, int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern); if (err == GA_NO_ERROR)
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n"); fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
...@@ -1294,23 +1278,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1294,23 +1278,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y); shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n", fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
if (1 && !work_complete) //conv_reference_full if (1 && !work_complete) //conv_reference_full
{ {
if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n"); if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");
int outsize = PyGpuArray_SIZE(out); size_t outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096); size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), (size_t)1, (size_t)1};
256); size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (0) if (0)
{ {
if (verbose) if (verbose)
...@@ -1318,70 +1305,67 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1318,70 +1305,67 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
if (verbose) if (verbose)
fprintf(stderr, " img : %llu %llu %llu %llu %p " fprintf(stderr, " img : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0], (unsigned long long)nbatch,
(unsigned long long)PyGpuArray_DIMS(img)[1], (unsigned long long)stack_len,
(unsigned long long)PyGpuArray_DIMS(img)[2], (unsigned long long)img_len,
(unsigned long long)PyGpuArray_DIMS(img)[3], (unsigned long long)img_wid,
cuda_get_ptr(img), (void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)PyGpuArray_STRIDES(img)[0]/4, (long long)img_stride_batch,
(long long)PyGpuArray_STRIDES(img)[1]/4, (long long)img_stride_stack,
(long long)PyGpuArray_STRIDES(img)[2]/4, (long long)img_stride_row,
(long long)PyGpuArray_STRIDES(img)[3]/4); (long long)img_stride_col);
if (verbose) if (verbose)
fprintf(stderr, " kern: %llu %llu %llu %llu %p " fprintf(stderr, " kern: %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0], (unsigned long long)nkern,
(unsigned long long)PyGpuArray_DIMS(kern)[1], (unsigned long long)nstack,
(unsigned long long)PyGpuArray_DIMS(kern)[2], (unsigned long long)kern_len,
(unsigned long long)PyGpuArray_DIMS(kern)[3], (unsigned long long)kern_wid,
cuda_get_ptr(kern), (void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)PyGpuArray_STRIDES(kern)[0]/4, (long long)kern_stride_nkern,
(long long)PyGpuArray_STRIDES(kern)[1]/4, (long long)kern_stride_stack,
(long long)PyGpuArray_STRIDES(kern)[2]/4, (long long)kern_stride_row,
(long long)PyGpuArray_STRIDES(kern)[3]/4 (long long)kern_stride_col);
);
if (verbose) if (verbose)
fprintf(stderr, " out : %llu %llu %llu %llu %p " fprintf(stderr, " out : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0], (unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1], (unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2], (unsigned long long)out_len,
(unsigned long long)PyGpuArray_DIMS(out)[3], (unsigned long long)out_wid,
cuda_get_ptr(out), (void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)PyGpuArray_STRIDES(out)[0]/4, (long long)out_stride_batch,
(long long)PyGpuArray_STRIDES(out)[1]/4, (long long)out_stride_nkern,
(long long)PyGpuArray_STRIDES(out)[2]/4, (long long)out_stride_row,
(long long)PyGpuArray_STRIDES(out)[3]/4); (long long)out_stride_col);
if (verbose) if (verbose)
fprintf(stderr, " launch params: %i %i %i\n", fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads); outsize, n_blocks[0], threads_per_block[0]);
if (verbose) if (verbose)
fprintf(stderr, " subsample params: %llu %llu\n", fprintf(stderr, " subsample params: %llu %llu\n",
(unsigned long long)subsample_rows, (unsigned long long)subsample_rows,
(unsigned long long)subsample_cols); (unsigned long long)subsample_cols);
} }
conv_reference_full<<<n_blocks, n_threads>>>(
PyGpuArray_DIMS(img)[0], PyGpuArray_DIMS(kern)[0], void *kernel_params[] = {
PyGpuArray_DIMS(img)[1], (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
PyGpuArray_DIMS(img)[2], PyGpuArray_DIMS(img)[3], (void *)&img_len, (void *)&img_wid,
PyGpuArray_DIMS(kern)[2], PyGpuArray_DIMS(kern)[3], (void *)&kern_len, (void *)&kern_wid,
PyGpuArray_DIMS(out)[2], PyGpuArray_DIMS(out)[3], (void *)&out_len, (void *)&out_wid,
cuda_get_ptr(img), PyGpuArray_STRIDES(img)[0]/4, (void *)img->ga.data, (void *)&img->ga.offset,
PyGpuArray_STRIDES(img)[1]/4, (void *)&img_stride_batch, (void *)&img_stride_stack,
PyGpuArray_STRIDES(img)[2]/4, (void *)&img_stride_row, (void *)&img_stride_col,
PyGpuArray_STRIDES(img)[3]/4, (void *)kern->ga.data, (void *)&kern->ga.offset,
cuda_get_ptr(kern), PyGpuArray_STRIDES(kern)[0]/4, (void *)&kern_stride_nkern, (void *)&kern_stride_stack,
PyGpuArray_STRIDES(kern)[1]/4, (void *)&kern_stride_row, (void *)&kern_stride_col,
PyGpuArray_STRIDES(kern)[2]/4, (void *)out->ga.data, (void *)&out->ga.offset,
PyGpuArray_STRIDES(kern)[3]/4, (void *)&out_stride_batch, (void *)&out_stride_nkern,
cuda_get_ptr(out), PyGpuArray_STRIDES(out)[0]/4, (void *)&out_stride_row, (void *)&out_stride_col,
PyGpuArray_STRIDES(out)[1]/4, (void *)&subsample_rows, (void *)&subsample_cols};
PyGpuArray_STRIDES(out)[2]/4, int err = GpuKernel_call(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0,
PyGpuArray_STRIDES(out)[3]/4, 3, threads_per_block, n_blocks, 0, kernel_params);
subsample_rows, subsample_cols);
if (err == GA_NO_ERROR)
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_full' version" fprintf(stderr, "INFO: used 'conv_reference_full' version"
...@@ -1394,17 +1378,18 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1394,17 +1378,18 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," fprintf(stderr, "threads_per_block[0]=%i, threads_per_block[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " n_blocks[0]=%i, n_blocks[1]=%i,"
n_threads, 1, n_blocks, 1, 0, n_threads); " shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], 1, n_blocks[0], 1, 0, threads_per_block[0]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s)," fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for" "ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)", " CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts)); GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1; return -1;
} }
} }
......
...@@ -3,13 +3,20 @@ import os ...@@ -3,13 +3,20 @@ import os
import theano import theano
from theano import config, gof from theano import config, gof
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from six.moves import reduce from six.moves import reduce
from .comp import NVCC_compiler from .comp import NVCC_compiler
from .type import GpuArrayType from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable from .basic_ops import (as_gpuarray_variable, GpuKernelBase, HideC, Kernel)
from theano.gof import utils
class GpuConv(gof.Op): class GpuConv(GpuKernelBase, HideC, gof.Op):
""" """
Implement the batched and stacked 2d convolution on the gpu. Implement the batched and stacked 2d convolution on the gpu.
...@@ -223,29 +230,30 @@ class GpuConv(gof.Op): ...@@ -223,29 +230,30 @@ class GpuConv(gof.Op):
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G'] return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
def c_headers(self): def c_headers(self):
return ['<stdio.h>', 'cuda.h', if pygpu.get_default_context().kind == 'opencl':
'<gpuarray/extension.h>', '<numpy_compat.h>'] raise MethodNotDefined('cuda only')
return ['<stdint.h>', '<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 21) return (0, 21)
def c_init_code(self): def c_init_code(self):
return ['cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");'] if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
def c_support_code_apply(self, node, nodename): return ['setup_ext_cuda();']
# REMEMBER TO RAISE c_code_cache_version when changing any of
# these files
files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']
codes = ["CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);",
"float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}",
"const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}"]
codes += [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files]
return reduce(str.__add__, codes)
def c_compiler(self):
return NVCC_compiler
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp img, kern = inp
...@@ -270,8 +278,8 @@ class GpuConv(gof.Op): ...@@ -270,8 +278,8 @@ class GpuConv(gof.Op):
//Optional args //Optional args
int version = %(version)s; int version = %(version)s;
int verbose = %(verbose)s; int verbose = %(verbose)s;
int dx = %(dx)s; size_t dx = %(dx)s;
int dy = %(dy)s; size_t dy = %(dy)s;
int mode; int mode;
if (strcmp(mode_str, "full") == 0) if (strcmp(mode_str, "full") == 0)
...@@ -286,7 +294,7 @@ class GpuConv(gof.Op): ...@@ -286,7 +294,7 @@ class GpuConv(gof.Op):
{ {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"mode must be one of 'full' or 'valid'"); "mode must be one of 'full' or 'valid'");
return NULL; return 0;
} }
// TODO, make out be decref before we alloc out2! // TODO, make out be decref before we alloc out2!
...@@ -303,3 +311,266 @@ class GpuConv(gof.Op): ...@@ -303,3 +311,266 @@ class GpuConv(gof.Op):
%(fail)s %(fail)s
} }
""" % sub """ % sub
def c_support_code_apply(self, node, name):
nb = 0
if self.kshp is not None:
nb = self.kshp[1]
kernels = self.gpu_kernels(node, name)
k = kernels[0]
code = """
#define THEANO_KERN_WID %(nb)d
""" % locals()
code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
kname = "conv_full_load_everything"
gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin)
code = code.replace('\\', '\\\\')
code = code.replace('"', '\\"')
code = code.replace('\n', '\\n')
mod = """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a %% b) ? 1: 0);
}
static const char conv_bcode[] = {%(bcode)s};
static const char *conv_code = "%(code)s";
""" % locals()
for k in kernels:
mod += "static GpuKernel " + k.name + '_' + name + ";\n"
mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read()
return mod
@utils.memoize
def gpu_kernels(self, node, name):
dtypes = [i.dtype for i in node.inputs]
dtypes.extend([o.dtype for o in node.outputs])
flags = Kernel.get_flags(*dtypes)
kernels = self.conv_patch_kernels(name, flags)
kernels.extend(self.conv_patch_stack_kernels(name, flags))
kernels.extend(self.conv_patch_stack_reduce_kernels(name, flags))
kernels.extend(self.conv_rows_kernels(name, flags))
kernels.extend(self.conv_rows_stack_kernels(name, flags))
kernels.extend(self.conv_rows_stack2_kernels(name, flags))
kernels.extend(self.conv_valid_row_reduce_kernels(name, flags))
kernels.extend(self.conv_reference_valid_kernels(name, flags))
kernels.extend(self.conv_reference_full_kernels(name, flags))
kernels.extend(self.conv_full_patch_kernels(name, flags))
kernels.extend(self.conv_full_patch_stack_kernels(name, flags))
kernels.extend(self.conv_full_patch_stack_padded_kernels(name, flags))
kernels.extend(self.conv_full_load_everything_kernels(name, flags))
return kernels
def conv_patch_kernels(self, name, flags):
kname = "conv_patch_%d"
k_var = "conv_patch_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [2, 3]
]
def conv_patch_stack_kernels(self, name, flags):
kname = "conv_patch_stack_%d"
k_var = "conv_patch_stack_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in range(64, 96)
]
def conv_patch_stack_reduce_kernels(self, name, flags):
kname = "conv_patch_stack_reduce_%d"
k_var = "conv_patch_stack_reduce_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15]
]
def conv_rows_kernels(self, name, flags):
kname = "conv_rows_%d"
k_var = "conv_rows_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1]
]
def conv_rows_stack_kernels(self, name, flags):
kname = "conv_rows_stack_%d"
k_var = "conv_rows_stack_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1]
]
def conv_rows_stack2_kernels(self, name, flags):
kname = "conv_rows_stack2_%d"
k_var = "conv_rows_stack2_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1, 2, 3]
]
def conv_valid_row_reduce_kernels(self, name, flags):
kname = "conv_valid_row_reduce_%d"
k_var = "conv_valid_row_reduce_%d_" + name
params = [
'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1]
]
def conv_reference_valid_kernels(self, name, flags):
kname = "conv_reference_valid"
k_var = "conv_reference_valid_" + name
params = [
'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
def conv_reference_full_kernels(self, name, flags):
kname = "conv_reference_full"
k_var = "conv_reference_full_" + name
params = [
'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
def conv_full_patch_kernels(self, name, flags):
kname = "conv_full_patch"
k_var = "conv_full_patch_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
def conv_full_patch_stack_kernels(self, name, flags):
kname = "conv_full_patch_stack_%d"
k_var = "conv_full_patch_stack_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1, 2, 3]
]
def conv_full_patch_stack_padded_kernels(self, name, flags):
kname = "conv_full_patch_stack_padded_%d"
k_var = "conv_full_patch_stack_padded_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14]
]
def conv_full_load_everything_kernels(self, name, flags):
kname = "conv_full_load_everything"
k_var = "conv_full_load_everything_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
extern __shared__ float s_data[];
//we store the full image and the full kernel in the shared memory //we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output //each thread compute only one value for the output
//thread block size=out_wid, out_len/nb_split //thread block size=out_wid, out_len/nb_split
//grid block size=batch_id //grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void extern "C" __global__ void
conv_full_patch_split(const float* img, const float* kern, float* out, conv_full_patch_split(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int nb_split) int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1; out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x; int batch_id = blockIdx.x;
// Thread index // Thread index
...@@ -60,18 +67,23 @@ conv_full_patch_split(const float* img, const float* kern, float* out, ...@@ -60,18 +67,23 @@ conv_full_patch_split(const float* img, const float* kern, float* out,
//thread block size=out_wid, out_len //thread block size=out_wid, out_len
//grid block size=batch_id, nkern //grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void extern "C" __global__ void
conv_full_patch( const float* img, const float* kern, float* out, conv_full_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack) int kern_len, int kern_wid, int nkern, int nstack)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1; out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x; int batch_id = blockIdx.x;
// Thread index // Thread index
...@@ -114,6 +126,8 @@ conv_full_patch( const float* img, const float* kern, float* out, ...@@ -114,6 +126,8 @@ conv_full_patch( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
//we store the full image and the full kernel in the shared memory //we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output //each thread compute only one value for the output
//thread block size=out_wid, out_len //thread block size=out_wid, out_len
...@@ -123,7 +137,9 @@ conv_full_patch( const float* img, const float* kern, float* out, ...@@ -123,7 +137,9 @@ conv_full_patch( const float* img, const float* kern, float* out,
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d> template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__device__ inline void __device__ inline void
conv_full_patch_stack( const float* img, const float* kern, float* out, conv_full_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
...@@ -131,12 +147,15 @@ conv_full_patch_stack( const float* img, const float* kern, float* out, ...@@ -131,12 +147,15 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1; out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;//blockDim.z* nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
const float __shared__ *kern_, *img_; const float __shared__ *kern_, *img_;
extern __shared__ float s_data[];
const int batch_id = blockIdx.x; const int batch_id = blockIdx.x;
const int nkern_id = blockIdx.y; const int nkern_id = blockIdx.y;
...@@ -186,7 +205,9 @@ extern "C" { ...@@ -186,7 +205,9 @@ extern "C" {
#define __INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \ #define __INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \ __global__ void \
conv_full_patch_stack_##suffix( \ conv_full_patch_stack_##suffix( \
const float *img, const float *kern, float *out, \ const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, \ int img_len, int img_wid, \
int kern_len, int kern_wid, int nkern, int nstack, \ int kern_len, int kern_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \ int img_stride_col, int img_stride_row, \
...@@ -194,7 +215,8 @@ conv_full_patch_stack_##suffix( \ ...@@ -194,7 +215,8 @@ conv_full_patch_stack_##suffix( \
int kern_stride_stack, int kern_stride_nkern) \ int kern_stride_stack, int kern_stride_nkern) \
{ \ { \
conv_full_patch_stack<__VA_ARGS__>( \ conv_full_patch_stack<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \ img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \ img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \ kern_stride_stack, kern_stride_nkern); \
} }
...@@ -207,6 +229,8 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true) ...@@ -207,6 +229,8 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK #undef __INSTANTIATE_CONV_FULL_PATCH_STACK
} }
/** /**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory. * As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency. * I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency.
...@@ -227,22 +251,34 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true) ...@@ -227,22 +251,34 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
*/ */
template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem > template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__device__ inline void __device__ inline void
conv_full_patch_stack_padded( const float* img, const float* kern, float* out, conv_full_patch_stack_padded( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int img_len, const int img_wid,
const int kern_len, const int kern_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch, const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row, int kern_stride_col, int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern) const int kern_stride_stack, const int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1; out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
__shared__ int batch_id, kern_id, img_wid_valid, nb_rows; __shared__ int batch_id, kern_id, img_wid_valid, nb_rows;
batch_id = blockIdx.x; batch_id = blockIdx.x;
kern_id = blockIdx.y; kern_id = blockIdx.y;
...@@ -380,7 +416,9 @@ extern "C" { ...@@ -380,7 +416,9 @@ extern "C" {
#define __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \ #define __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \ __global__ void \
conv_full_patch_stack_padded_##suffix( \ conv_full_patch_stack_padded_##suffix( \
const float *img, const float *kern, float *out, \ const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \ const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \ const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \ const int nkern, const int nstack, \
...@@ -390,7 +428,8 @@ conv_full_patch_stack_padded_##suffix( \ ...@@ -390,7 +428,8 @@ conv_full_patch_stack_padded_##suffix( \
const int kern_stride_stack, const int kern_stride_nkern) \ const int kern_stride_stack, const int kern_stride_nkern) \
{ \ { \
conv_full_patch_stack_padded<__VA_ARGS__>( \ conv_full_patch_stack_padded<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \ img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \ img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \ kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \ kern_stride_stack, kern_stride_nkern); \
...@@ -412,6 +451,7 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false) ...@@ -412,6 +451,7 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED #undef __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
} }
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy) template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{ {
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ; return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ;
...@@ -425,8 +465,10 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co ...@@ -425,8 +465,10 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
{ {
return x[0] * y[0]; return x[0] * y[0];
} }
__global__ void extern "C" __global__ void
conv_full_load_everything( const float* img, const float* kern, float* out, conv_full_load_everything( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
...@@ -435,12 +477,15 @@ conv_full_load_everything( const float* img, const float* kern, float* out, ...@@ -435,12 +477,15 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1; out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x; nb_thread_id = blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x; int batch_id = blockIdx.x;
const int out_col = threadIdx.x;//output col const int out_col = threadIdx.x;//output col
...@@ -503,6 +548,8 @@ conv_full_load_everything( const float* img, const float* kern, float* out, ...@@ -503,6 +548,8 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
__syncthreads(); //don't start loading another kernel until we're done here __syncthreads(); //don't start loading another kernel until we're done here
} }
} }
/* /*
Local Variables: Local Variables:
mode:c++ mode:c++
......
...@@ -29,7 +29,6 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -29,7 +29,6 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
*/ */
#ifndef CONV_KERNEL_CU #ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU #define CONV_KERNEL_CU
#include <stdint.h>
/* /*
#define CHECK_BANK_CONFLICTS 0 #define CHECK_BANK_CONFLICTS 0
...@@ -220,11 +219,18 @@ __device__ void store_or_accumulate(float& dst,const float value ){ ...@@ -220,11 +219,18 @@ __device__ void store_or_accumulate(float& dst,const float value ){
*/ */
template<bool flipped_kern, bool split> template<bool flipped_kern, bool split>
__device__ inline void __device__ inline void
conv_patch( const float* img, const float* kern, float* out, conv_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack) int nkern, int nstack)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1; out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1; out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
...@@ -282,11 +288,14 @@ conv_patch( const float* img, const float* kern, float* out, ...@@ -282,11 +288,14 @@ conv_patch( const float* img, const float* kern, float* out,
extern "C" { extern "C" {
#define __INSTANTIATE_CONV_PATCH(suffix, ...) \ #define __INSTANTIATE_CONV_PATCH(suffix, ...) \
__global__ void \ __global__ void \
conv_patch_##suffix(const float *img, const float *kern, float *out, \ conv_patch_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \ int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack) \ int nkern, int nstack) \
{ \ { \
conv_patch<__VA_ARGS__>(img, kern, out, img_len, img_wid, kern_len, \ conv_patch<__VA_ARGS__>(img, img_offset, kern, kern_offset, \
out, out_offset, img_len, img_wid, kern_len, \
kern_wid, nkern, nstack); \ kern_wid, nkern, nstack); \
} }
...@@ -297,6 +306,7 @@ __INSTANTIATE_CONV_PATCH(3, true, true) ...@@ -297,6 +306,7 @@ __INSTANTIATE_CONV_PATCH(3, true, true)
} }
/** /**
* As conv_patch, but implement the stack in the kernel. * As conv_patch, but implement the stack in the kernel.
* I keep it separated from conv_patch as we take more registers and this could lower the occupency. * I keep it separated from conv_patch as we take more registers and this could lower the occupency.
...@@ -320,7 +330,9 @@ __INSTANTIATE_CONV_PATCH(3, true, true) ...@@ -320,7 +330,9 @@ __INSTANTIATE_CONV_PATCH(3, true, true)
*/ */
template<bool flipped_kern, bool accumulate, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample> template<bool flipped_kern, bool accumulate, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__device__ inline void __device__ inline void
conv_patch_stack( const float* img, const float* kern, float* out, conv_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid, int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
...@@ -329,6 +341,11 @@ conv_patch_stack( const float* img, const float* kern, float* out, ...@@ -329,6 +341,11 @@ conv_patch_stack( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern, int dx, int dy) int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{ {
int __shared__ nb_thread_id; int __shared__ nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[]; extern __shared__ float s_data[];
...@@ -459,7 +476,9 @@ conv_patch_stack( const float* img, const float* kern, float* out, ...@@ -459,7 +476,9 @@ conv_patch_stack( const float* img, const float* kern, float* out,
extern "C" { extern "C" {
#define __INSTANTIATE_CONV_PATCH_STACK(suffix, ...) \ #define __INSTANTIATE_CONV_PATCH_STACK(suffix, ...) \
__global__ void \ __global__ void \
conv_patch_stack_##suffix(const float *img, const float *kern, float *out, \ conv_patch_stack_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \ int img_len, int img_wid, int kern_len, int kern_wid, \
int out_len, int out_wid, int nkern, int nstack, \ int out_len, int out_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \ int img_stride_col, int img_stride_row, \
...@@ -469,7 +488,8 @@ conv_patch_stack_##suffix(const float *img, const float *kern, float *out, \ ...@@ -469,7 +488,8 @@ conv_patch_stack_##suffix(const float *img, const float *kern, float *out, \
int dx, int dy) \ int dx, int dy) \
{ \ { \
conv_patch_stack<__VA_ARGS__>( \ conv_patch_stack<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, out_len, \ img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, out_len, \
out_wid, nkern, nstack, img_stride_col, img_stride_row, \ out_wid, nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \ img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \ kern_stride_col, kern_stride_row, \
...@@ -513,6 +533,7 @@ __INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true) ...@@ -513,6 +533,7 @@ __INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)
} }
/** /**
* As conv_patch_stack, but kern_len thread for each output pixel * As conv_patch_stack, but kern_len thread for each output pixel
* I keep it separated as use more register. * I keep it separated as use more register.
...@@ -529,7 +550,9 @@ __INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true) ...@@ -529,7 +550,9 @@ __INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)
*/ */
template<bool flipped_kern, bool c_contiguous, bool split, bool preload_full_kern> template<bool flipped_kern, bool c_contiguous, bool split, bool preload_full_kern>
__device__ inline void __device__ inline void
conv_patch_stack_reduce( const float* img, const float* kern, float* out, conv_patch_stack_reduce( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
...@@ -543,6 +566,17 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -543,6 +566,17 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
const int out_len = blockDim.y; const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x; const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
extern __shared__ float s_data[]; extern __shared__ float s_data[];
int batch_id = blockIdx.x; int batch_id = blockIdx.x;
...@@ -636,7 +670,9 @@ extern "C" { ...@@ -636,7 +670,9 @@ extern "C" {
#define __INSTANTIATE_CONV_PATCH_STACK_REDUCE(suffix, ...) \ #define __INSTANTIATE_CONV_PATCH_STACK_REDUCE(suffix, ...) \
__global__ void \ __global__ void \
conv_patch_stack_reduce_##suffix( \ conv_patch_stack_reduce_##suffix( \
const float *img, const float *kern, float *out, \ const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \ int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, int img_stride_col, int img_stride_row, \ int nkern, int nstack, int img_stride_col, int img_stride_row, \
int img_stride_stack, int img_stride_batch, \ int img_stride_stack, int img_stride_batch, \
...@@ -644,33 +680,35 @@ conv_patch_stack_reduce_##suffix( \ ...@@ -644,33 +680,35 @@ conv_patch_stack_reduce_##suffix( \
int kern_stride_stack, int kern_stride_nkern) \ int kern_stride_stack, int kern_stride_nkern) \
{ \ { \
conv_patch_stack_reduce<__VA_ARGS__>( \ conv_patch_stack_reduce<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \ img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \ img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \ kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \ kern_stride_stack, kern_stride_nkern); \
} }
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(0, false, false, false, false);*/ /*__INSTANTIATE_CONV_PATCH_STACK_REDUCE#(0, false, false, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(1, false, false, false, true); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(1, false, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(2, false, false, true, false); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(2, false, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(3, false, false, true, true); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(3, false, false, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(4, false, true, false, false);*/ /*__INSTANTIATE_CONV_PATCH_STACK_REDUCE#(4, false, true, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(5, false, true, false, true); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(5, false, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(6, false, true, true, false); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(6, false, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(7, false, true, true, true); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(7, false, true, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false);*/ /*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(9, true, false, false, true); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(9, true, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(10, true, false, true, false); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(10, true, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(11, true, false, true, true); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(11, true, false, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false);*/ /*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(13, true, true, false, true); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(13, true, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(14, true, true, true, false); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(14, true, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true); __INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true)
#undef __INSTANTIATE_CONV_PATCH_STACK_REDUCE #undef __INSTANTIATE_CONV_PATCH_STACK_REDUCE
} }
/** /**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* we store kern_len row of the image and the full kernel in the shared memory * we store kern_len row of the image and the full kernel in the shared memory
...@@ -684,7 +722,9 @@ __INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true); ...@@ -684,7 +722,9 @@ __INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true);
*/ */
template<bool c_contiguous> template<bool c_contiguous>
__device__ inline void __device__ inline void
conv_rows( const float* img, const float* kern, float* out, conv_rows( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
...@@ -694,6 +734,11 @@ conv_rows( const float* img, const float* kern, float* out, ...@@ -694,6 +734,11 @@ conv_rows( const float* img, const float* kern, float* out,
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1; out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1; out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
...@@ -735,7 +780,9 @@ conv_rows( const float* img, const float* kern, float* out, ...@@ -735,7 +780,9 @@ conv_rows( const float* img, const float* kern, float* out,
extern "C" { extern "C" {
#define __INSTANTIATE_CONV_ROWS(suffix, ...) \ #define __INSTANTIATE_CONV_ROWS(suffix, ...) \
__global__ void \ __global__ void \
conv_rows_##suffix(const float *img, const float *kern, float *out, \ conv_rows_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \ int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, \ int nkern, int nstack, \
int img_stride_col, int img_stride_row, \ int img_stride_col, int img_stride_row, \
...@@ -744,7 +791,8 @@ conv_rows_##suffix(const float *img, const float *kern, float *out, \ ...@@ -744,7 +791,8 @@ conv_rows_##suffix(const float *img, const float *kern, float *out, \
int kern_stride_stack, int kern_stride_nkern) \ int kern_stride_stack, int kern_stride_nkern) \
{ \ { \
conv_rows<__VA_ARGS__>( \ conv_rows<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, \ img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \ nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \ img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \ kern_stride_col, kern_stride_row, \
...@@ -757,6 +805,8 @@ __INSTANTIATE_CONV_ROWS(1, true) ...@@ -757,6 +805,8 @@ __INSTANTIATE_CONV_ROWS(1, true)
#undef __INSTANTIATE_CONV_ROWS #undef __INSTANTIATE_CONV_ROWS
} }
/** /**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows, but implement the stack. Separate as this use more register. * as conv_rows, but implement the stack. Separate as this use more register.
...@@ -770,7 +820,9 @@ __INSTANTIATE_CONV_ROWS(1, true) ...@@ -770,7 +820,9 @@ __INSTANTIATE_CONV_ROWS(1, true)
*/ */
template<bool c_contiguous> template<bool c_contiguous>
__device__ inline void __device__ inline void
conv_rows_stack( const float* img, const float* kern, float* out, conv_rows_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
...@@ -780,6 +832,11 @@ conv_rows_stack( const float* img, const float* kern, float* out, ...@@ -780,6 +832,11 @@ conv_rows_stack( const float* img, const float* kern, float* out,
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1; out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1; out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
...@@ -859,7 +916,9 @@ extern "C" { ...@@ -859,7 +916,9 @@ extern "C" {
#define __INSTANTIATE_CONV_ROWS_STACK(suffix, ...) \ #define __INSTANTIATE_CONV_ROWS_STACK(suffix, ...) \
__global__ void \ __global__ void \
conv_rows_stack_##suffix( \ conv_rows_stack_##suffix( \
const float *img, const float *kern, float *out, \ const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \ const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \ const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \ const int nkern, const int nstack, \
...@@ -869,7 +928,8 @@ conv_rows_stack_##suffix( \ ...@@ -869,7 +928,8 @@ conv_rows_stack_##suffix( \
const int kern_stride_stack, const int kern_stride_nkern) \ const int kern_stride_stack, const int kern_stride_nkern) \
{ \ { \
conv_rows_stack<__VA_ARGS__>( \ conv_rows_stack<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, \ img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \ nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \ img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \ kern_stride_col, kern_stride_row, \
...@@ -882,6 +942,8 @@ __INSTANTIATE_CONV_ROWS_STACK(1, true) ...@@ -882,6 +942,8 @@ __INSTANTIATE_CONV_ROWS_STACK(1, true)
#undef __INSTANTIATE_CONV_ROWS_STACK #undef __INSTANTIATE_CONV_ROWS_STACK
} }
/** /**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row. * as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row.
...@@ -895,7 +957,9 @@ __INSTANTIATE_CONV_ROWS_STACK(1, true) ...@@ -895,7 +957,9 @@ __INSTANTIATE_CONV_ROWS_STACK(1, true)
*/ */
template<bool c_contiguous, bool preload_full_kern> template<bool c_contiguous, bool preload_full_kern>
__device__ inline void __device__ inline void
conv_rows_stack2(const float* img, const float* kern, float* out, conv_rows_stack2(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
...@@ -905,6 +969,11 @@ conv_rows_stack2(const float* img, const float* kern, float* out, ...@@ -905,6 +969,11 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1; out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1; out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
...@@ -984,7 +1053,9 @@ extern "C" { ...@@ -984,7 +1053,9 @@ extern "C" {
#define __INSTANTIATE_CONV_ROWS_STACK2(suffix, ...) \ #define __INSTANTIATE_CONV_ROWS_STACK2(suffix, ...) \
__global__ void \ __global__ void \
conv_rows_stack2_##suffix( \ conv_rows_stack2_##suffix( \
const float *img, const float *kern, float *out, \ const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \ const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \ const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \ const int nkern, const int nstack, \
...@@ -994,8 +1065,8 @@ conv_rows_stack2_##suffix( \ ...@@ -994,8 +1065,8 @@ conv_rows_stack2_##suffix( \
const int kern_stride_stack, const int kern_stride_nkern) \ const int kern_stride_stack, const int kern_stride_nkern) \
{ \ { \
conv_rows_stack2<__VA_ARGS__>( \ conv_rows_stack2<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, \ img, img_offset, kern, kern_offset, out, out_offset, \
kern_len, kern_wid, nkern, nstack, \ img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \ img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \ kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \ kern_stride_stack, kern_stride_nkern); \
...@@ -1009,6 +1080,8 @@ __INSTANTIATE_CONV_ROWS_STACK2(3, true, true) ...@@ -1009,6 +1080,8 @@ __INSTANTIATE_CONV_ROWS_STACK2(3, true, true)
#undef __INSTANTIATE_CONV_ROWS_STACK2 #undef __INSTANTIATE_CONV_ROWS_STACK2
} }
/** /**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the * Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel. * kernel-image inner-product in parallel.
...@@ -1024,13 +1097,18 @@ conv_valid_row_reduce(int nB, int nK, int stacklen, ...@@ -1024,13 +1097,18 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical int out_len, int out_wid, //physical
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, const size_t img_offset, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, const float *kern, const size_t kern_offset, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C , float *out, const size_t out_offset, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols, int subsample_rows, int subsample_cols,
const int initial_reduce_boundary) const int initial_reduce_boundary)
{ {
const int outsize = nB * nK * out_len * out_wid; const int outsize = nB * nK * out_len * out_wid;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
extern __shared__ float reducebuf[]; extern __shared__ float reducebuf[];
for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x) for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
{ {
...@@ -1110,18 +1188,21 @@ __global__ void \ ...@@ -1110,18 +1188,21 @@ __global__ void \
conv_valid_row_reduce_##suffix( \ conv_valid_row_reduce_##suffix( \
int nB, int nK, int stacklen, int img_len, int img_wid, \ int nB, int nK, int stacklen, int img_len, int img_wid, \
int kern_len, int kern_wid, int out_len, int out_wid, \ int kern_len, int kern_wid, int out_len, int out_wid, \
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, \ const float *img, const size_t img_offset, \
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \ int img_str_B, int img_str_S, int img_str_R, int img_str_C, \
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C, \ const float *kern, const size_t kern_offset, \
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \
float *out, const size_t out_offset, \
int out_str_B, int out_str_K, int out_str_R, int out_str_C, \
int subsample_rows, int subsample_cols, \ int subsample_rows, int subsample_cols, \
const int initial_reduce_boundary) \ const int initial_reduce_boundary) \
{ \ { \
conv_valid_row_reduce<__VA_ARGS__>( \ conv_valid_row_reduce<__VA_ARGS__>( \
nB, nK, stacklen, img_len, img_wid, \ nB, nK, stacklen, img_len, img_wid, \
kern_len, kern_wid, out_len, out_wid, \ kern_len, kern_wid, out_len, out_wid, \
img, img_str_B, img_str_S, img_str_R, img_str_C, \ img, img_offset, img_str_B, img_str_S, img_str_R, img_str_C, \
kern, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \ kern, kern_offset, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
out, out_str_B, out_str_K, out_str_R, out_str_C, \ out, out_offset, out_str_B, out_str_K, out_str_R, out_str_C, \
subsample_rows, subsample_cols, initial_reduce_boundary); \ subsample_rows, subsample_cols, initial_reduce_boundary); \
} }
...@@ -1132,6 +1213,7 @@ __INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true) ...@@ -1132,6 +1213,7 @@ __INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
} }
/** /**
* Reference implementation of 'valid' mode convolution (with stack) * Reference implementation of 'valid' mode convolution (with stack)
* *
...@@ -1139,18 +1221,26 @@ __INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true) ...@@ -1139,18 +1221,26 @@ __INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
* *
* TODO: explain parameters, preconditions * TODO: explain parameters, preconditions
*/ */
__global__ void extern "C" __global__ void
conv_reference_valid(int nB, int nK, int stacklen, conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical int out_len, int out_wid, //physical
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, const size_t img_offset,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C , const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols) int subsample_rows, int subsample_cols)
{ {
const int idx = blockIdx.x * blockDim.x + threadIdx.x; const int idx = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ int numThreads, outsize; __shared__ int numThreads, outsize;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x; numThreads = blockDim.x * gridDim.x;
outsize = nB * nK * out_len * out_wid; outsize = nB * nK * out_len * out_wid;
...@@ -1191,6 +1281,8 @@ conv_reference_valid(int nB, int nK, int stacklen, ...@@ -1191,6 +1281,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
} }
} }
/** /**
* Reference implementation of 'full' mode convolution (with stack) * Reference implementation of 'full' mode convolution (with stack)
* *
...@@ -1198,18 +1290,26 @@ conv_reference_valid(int nB, int nK, int stacklen, ...@@ -1198,18 +1290,26 @@ conv_reference_valid(int nB, int nK, int stacklen,
* *
* TODO: explain parameters, preconditions * TODO: explain parameters, preconditions
*/ */
__global__ void extern "C" __global__ void
conv_reference_full(int nB, int nK, int stacklen, conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions int out_len, int out_wid, //physical dimensions
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, const size_t img_offset,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C, const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols) int subsample_rows, int subsample_cols)
{ {
const int idx = blockIdx.x * blockDim.x + threadIdx.x; const int idx = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ int numThreads, physical_outsize; __shared__ int numThreads, physical_outsize;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x; numThreads = blockDim.x * gridDim.x;
physical_outsize = nB * nK * out_len * out_wid; physical_outsize = nB * nK * out_len * out_wid;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论