Commit 41daf4a8 authored by Sean Lee

Use the CUDA Driver API for conv operations

Parent 0d5cffbe
......@@ -10,12 +10,6 @@ PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
const size_t subsample_cols,
const int version, const int verbose);
template <typename T>
static T ceil_intdiv(T a, T b)
{
    // Integer ceiling division: smallest integer quotient q with q*b >= a,
    // for non-negative a and positive b (the only usage in this file:
    // splitting output rows / kernel rows across thread blocks).
    T quotient = a / b;
    if (a % b)
        ++quotient;
    return quotient;
}
/*
* version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation
......@@ -108,6 +102,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
const int stack_len = PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0];
......@@ -126,6 +121,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
......@@ -156,16 +155,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
const float * kern_data_unflipped = cuda_get_ptr(kern);
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
//if we remove the restriction
......@@ -195,43 +188,47 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int);
if(threads.y==out_len) f=conv_patch_2;
else f=conv_patch_3;
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(threads_per_block[1]==out_len) k=&conv_patch_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_patch_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n",
threads.y==out_len ? "no split": "split", nb_split);
threads_per_block[1]==out_len ? "no split": "split", nb_split);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, nb_split);
"threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i, nb_split=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1], nb_split);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -250,75 +247,77 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)ceil_intdiv(out_len,nb_split),
(size_t)1};
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false;
dim3 grid(nbatch,nkern);
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_64;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_65;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_66;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_67;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_68;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_69;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_70;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_71;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_72;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_73;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_74;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_75;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_76;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_77;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_78;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_79;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_80;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_81;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_82;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_83;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_84;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_85;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_86;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_87;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_88;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_89;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_90;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_91;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_92;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_93;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_94;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_95;}
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid,
out_len, out_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
GpuKernel *k = NULL;
if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_64_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_65_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_66_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_67_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_68_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_69_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_70_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_71_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_72_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_73_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_74_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_75_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_76_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_77_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_78_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_79_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_80_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_81_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_82_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_83_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_84_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_85_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_86_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_87_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_88_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_89_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_90_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_91_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_92_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_93_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_94_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_95_node_<<<<HASH_PLACEHOLDER>>>>_0;}
void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
......@@ -337,15 +336,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
......@@ -354,7 +353,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -366,28 +365,28 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
!work_complete) //conv_rows
{
dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_0;
else f = conv_rows_1;
f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)1, (size_t)1};
size_t n_blocks[3] = {(size_t)out_len, (size_t)nbatch*nkern, (size_t)1};
size_t shmem_sz = (kern_len*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_rows_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
......@@ -397,15 +396,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
if (!subsample && out_contiguous &&
......@@ -423,52 +422,50 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_row=i;
}
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
if (0)
fprintf(stderr,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
img_contiguous_2d, kern_contiguous_2d,
threads.x, threads.y, threads.z,
grid.x, grid.y, grid.z);
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], n_blocks[2]);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n");
f = conv_rows_stack_0;
k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
} else {
//fprintf(stderr, "using true version\n");
f = conv_rows_stack_1;
k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
}
f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
}
......@@ -476,15 +473,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -517,42 +514,41 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--;
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
int shared_size=(threads.y*img_wid + k_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2_1;
else if(version==9) f = conv_rows_stack2_3;
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2_0;
else f = conv_rows_stack2_2;
f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) k=&conv_rows_stack2_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==9) k=&conv_rows_stack2_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_stack2_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_rows_stack2_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with"
......@@ -564,15 +560,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,(version==9?2:3));
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i version=%d\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],(version==9?2:3));
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -619,18 +615,18 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_split++;
// tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
size_t thread_z=ceil_intdiv(kern_len,nb_split);
size_t shmem_sz = sizeof(float)*(full_kern
? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max((size_t)img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
while ((shmem_sz >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more.
nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split);
shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
shmem_sz = sizeof(float)*(full_kern
? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
}
if (nb_split <= kern_len)
......@@ -638,15 +634,12 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert(thread_z>0);//should not happen, but in case...
if(!full_kern) assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)out_len,
(size_t)thread_z};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
GpuKernel *k = NULL;
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
......@@ -654,40 +647,46 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
/* if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_0;*/
/*else*/ if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce_1;
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce_2;
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce_3;
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce_4;*/
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce_5;
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce_6;
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce_7;
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_8;*/
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce_9;
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce_10;
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce_11;
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce_12;*/
else if(kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce_13;
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce_14;
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce_15;
f<<< grid, threads, shared_size>>>(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
/* if(!kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
/*else*/ if(!kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(!kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_4_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(!kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_7_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_8_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_11_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_12_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_15_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col,
(void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i, "
"grid.x=%i, grid.y=%i, shared_size=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i, "
"n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version"
......@@ -700,17 +699,17 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z);
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
} // else no good nb_splits was found
}
......@@ -719,8 +718,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
kern_len<=320 &&
!work_complete) //conv_valid_row_reduce
{
int outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096);
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
int block_nstack=nstack;
//Max of 512 threads per blocks.
......@@ -728,9 +728,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--;
dim3 n_threads(block_nstack, kern_len, 1);
size_t threads_per_block[3] = {(size_t)block_nstack, (size_t)kern_len, (size_t)1};
int n_reduce_buf = block_nstack * kern_len * sizeof(float);
size_t n_reduce_buf = block_nstack * kern_len * sizeof(float);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
*
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
......@@ -747,39 +747,34 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert (initial_reduce_boundary < n_reduce_buf/sizeof(float));
}
void (*f)(int, int, int, int,
int, int, int, int, int,
const float*, int, int, int, int,
const float*, int, int, int, int,
float*, int, int, int, int,
int, int, int);
GpuKernel *k = NULL;
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack)
f=conv_valid_row_reduce_0;
k=&conv_valid_row_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else
f=conv_valid_row_reduce_1;
f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, PyGpuArray_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
cuda_get_ptr(img),
PyGpuArray_STRIDES(img)[0]/4, PyGpuArray_STRIDES(img)[1]/4,
img_stride_row, img_stride_col,
cuda_get_ptr(kern),
PyGpuArray_STRIDES(kern)[0]/4, PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[2]/4, PyGpuArray_STRIDES(kern)[3]/4,
cuda_get_ptr(out),
PyGpuArray_STRIDES(out)[0]/4, PyGpuArray_STRIDES(out)[1]/4,
PyGpuArray_STRIDES(out)[2]/4, PyGpuArray_STRIDES(out)[3]/4,
subsample_rows, subsample_cols, initial_reduce_boundary);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
k=&conv_valid_row_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols,
(void *)&initial_reduce_boundary};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, n_reduce_buf, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
......@@ -789,24 +784,27 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads.x, n_threads.y, n_blocks,
n_reduce_buf, n_threads.x * n_threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0],
n_reduce_buf, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
if (1 && !work_complete) //conv_reference_valid
{
int outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
256);
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (1)
{
if (verbose)
......@@ -814,61 +812,56 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (verbose>1)
fprintf(stderr, " img : %i %llu %i %i %p "
"%lld %lld %lld %lld\n",
nbatch, (unsigned long long)PyGpuArray_DIMS(img)[1],
img_len, img_wid,
cuda_get_ptr(img),
(long long)PyGpuArray_STRIDES(img)[0]/4,
(long long)PyGpuArray_STRIDES(img)[1]/4,
(long long)PyGpuArray_STRIDES(img)[2]/4,
(long long)PyGpuArray_STRIDES(img)[3]/4);
nbatch, (unsigned long long)stack_len, img_len, img_wid,
(void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p "
"%lld %lld %lld %lld\n",
nkern, nstack, kern_len, kern_wid,
cuda_get_ptr(kern),
(long long)PyGpuArray_STRIDES(kern)[0]/4,
(long long)PyGpuArray_STRIDES(kern)[1]/4,
(long long)PyGpuArray_STRIDES(kern)[2]/4,
(long long)PyGpuArray_STRIDES(kern)[3]/4);
(void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
if (verbose>1)
fprintf(stderr, " out : %llu %llu %i %i %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
out_len, out_wid,
cuda_get_ptr(out),
(long long)PyGpuArray_STRIDES(out)[0]/4,
(long long)PyGpuArray_STRIDES(out)[1]/4,
(long long)PyGpuArray_STRIDES(out)[2]/4,
(long long)PyGpuArray_STRIDES(out)[3]/4);
(void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
outsize, n_blocks[0], threads_per_block[0]);
}
conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
PyGpuArray_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
cuda_get_ptr(img),
PyGpuArray_STRIDES(img)[0]/4,
PyGpuArray_STRIDES(img)[1]/4,
PyGpuArray_STRIDES(img)[2]/4,
PyGpuArray_STRIDES(img)[3]/4,
cuda_get_ptr(kern),
PyGpuArray_STRIDES(kern)[0]/4,
PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[2]/4,
PyGpuArray_STRIDES(kern)[3]/4,
cuda_get_ptr(out),
PyGpuArray_STRIDES(out)[0]/4,
PyGpuArray_STRIDES(out)[1]/4,
PyGpuArray_STRIDES(out)[2]/4,
PyGpuArray_STRIDES(out)[3]/4,
subsample_rows, subsample_cols);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, 0, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
......@@ -881,7 +874,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" PyGpuArray_conv_valid! (%s)",
cudaGetErrorString(sts));
GpuKernel_error(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1;
}
}
......@@ -930,6 +923,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]);
assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]);
const int stack_len=PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0];
......@@ -948,6 +942,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
......@@ -990,16 +988,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
const float * kern_data_unflipped = cuda_get_ptr(kern);
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
if (verbose>1)
......@@ -1008,34 +1000,34 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
printf("INFO: img dim: %llu %llu %llu %llu "
"img stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0],
(unsigned long long)PyGpuArray_DIMS(img)[1],
(unsigned long long)PyGpuArray_DIMS(img)[2],
(unsigned long long)PyGpuArray_DIMS(img)[3],
(long long)PyGpuArray_STRIDES(img)[0]/4,
(long long)PyGpuArray_STRIDES(img)[1]/4,
(long long)PyGpuArray_STRIDES(img)[2]/4,
(long long)PyGpuArray_STRIDES(img)[3]/4);
(unsigned long long)nbatch,
(unsigned long long)stack_len,
(unsigned long long)img_len,
(unsigned long long)img_wid,
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
printf("INFO: kern dim: %llu %llu %llu %llu "
"kern stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0],
(unsigned long long)PyGpuArray_DIMS(kern)[1],
(unsigned long long)PyGpuArray_DIMS(kern)[2],
(unsigned long long)PyGpuArray_DIMS(kern)[3],
(long long)PyGpuArray_STRIDES(kern)[0]/4,
(long long)PyGpuArray_STRIDES(kern)[1]/4,
(long long)PyGpuArray_STRIDES(kern)[2]/4,
(long long)PyGpuArray_STRIDES(kern)[3]/4);
(unsigned long long)nkern,
(unsigned long long)nstack,
(unsigned long long)kern_len,
(unsigned long long)kern_wid,
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
printf("INFO: out dim: %llu %llu %llu %llu "
"out stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2],
(unsigned long long)PyGpuArray_DIMS(out)[3],
(long long)PyGpuArray_STRIDES(out)[0]/4,
(long long)PyGpuArray_STRIDES(out)[1]/4,
(long long)PyGpuArray_STRIDES(out)[2]/4,
(long long)PyGpuArray_STRIDES(out)[3]/4);
(unsigned long long)out_len,
(unsigned long long)out_wid,
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
}
if (!subsample &&
......@@ -1082,50 +1074,53 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert(version!=5 || kern_len>1);
assert(version!=-1);
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch,nkern);
size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
int shared_size=img_size_padded_byte + kern_size_byte;
size_t shmem_sz=img_size_padded_byte + kern_size_byte;
if(version==5)
shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
if(version==3) f=conv_full_patch_stack_padded_0;
else if(version==5) f=conv_full_patch_stack_padded_1;
else if(version==4) f=conv_full_patch_stack_padded_2;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded_4;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded_5;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded_6;
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded_8;
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded_9;
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded_10;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded_12;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded_13;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded_14;
shmem_sz=((kern_len+threads_per_block[1]-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
GpuKernel *k = NULL;
if(version==3) k=&conv_full_patch_stack_padded_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5) k=&conv_full_patch_stack_padded_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4) k=&conv_full_patch_stack_padded_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) k=&conv_full_patch_stack_padded_4_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) k=&conv_full_patch_stack_padded_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) k=&conv_full_patch_stack_padded_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==3 && kern_flipped) k=&conv_full_patch_stack_padded_8_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5 && kern_flipped)k=&conv_full_patch_stack_padded_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4 && kern_flipped)k=&conv_full_patch_stack_padded_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) k=&conv_full_patch_stack_padded_12_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) k=&conv_full_patch_stack_padded_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) k=&conv_full_patch_stack_padded_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else assert(false);
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
......@@ -1138,12 +1133,12 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
......@@ -1151,7 +1146,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" failed (%s), trying next implementation\n",
version==3?"no split": "split",
(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -1162,21 +1157,22 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
conv_full_patch<<< grid, threads, shared_size>>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n");
work_complete = true;
......@@ -1185,15 +1181,15 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
}
}
if (false && !subsample && //disabled as test fail for this kernel
......@@ -1203,35 +1199,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch);
int shared_size=(img_size + kern_size)*nstack*sizeof(float);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)1, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//typeof(conv_full_load_everything<0>) f = ;
void (*f)(const float*, const float*, float*,
int, int, int, int, int, int,
int, int, int, int, int, int, int, int) = conv_full_load_everything;
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,
PyGpuArray_STRIDES(img)[3]/4,
PyGpuArray_STRIDES(img)[2]/4,
PyGpuArray_STRIDES(img)[1]/4,
PyGpuArray_STRIDES(img)[0]/4,
PyGpuArray_STRIDES(kern)[3]/4,
PyGpuArray_STRIDES(kern)[2]/4,
PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[0]/4
);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n");
work_complete = true;
......@@ -1240,14 +1227,14 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
}
}
......@@ -1259,32 +1246,29 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int);
if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack_0;
else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack_1;
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack_2;
else if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack_3;
f<<< grid, threads, shared_size>>>(
cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,img_stride_col, img_stride_row,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(!img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
......@@ -1294,23 +1278,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
if (1 && !work_complete) //conv_reference_full
{
if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");
int outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
256);
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (0)
{
if (verbose)
......@@ -1318,70 +1305,67 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
if (verbose)
fprintf(stderr, " img : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0],
(unsigned long long)PyGpuArray_DIMS(img)[1],
(unsigned long long)PyGpuArray_DIMS(img)[2],
(unsigned long long)PyGpuArray_DIMS(img)[3],
cuda_get_ptr(img),
(long long)PyGpuArray_STRIDES(img)[0]/4,
(long long)PyGpuArray_STRIDES(img)[1]/4,
(long long)PyGpuArray_STRIDES(img)[2]/4,
(long long)PyGpuArray_STRIDES(img)[3]/4);
(unsigned long long)nbatch,
(unsigned long long)stack_len,
(unsigned long long)img_len,
(unsigned long long)img_wid,
(void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
if (verbose)
fprintf(stderr, " kern: %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0],
(unsigned long long)PyGpuArray_DIMS(kern)[1],
(unsigned long long)PyGpuArray_DIMS(kern)[2],
(unsigned long long)PyGpuArray_DIMS(kern)[3],
cuda_get_ptr(kern),
(long long)PyGpuArray_STRIDES(kern)[0]/4,
(long long)PyGpuArray_STRIDES(kern)[1]/4,
(long long)PyGpuArray_STRIDES(kern)[2]/4,
(long long)PyGpuArray_STRIDES(kern)[3]/4
);
(unsigned long long)nkern,
(unsigned long long)nstack,
(unsigned long long)kern_len,
(unsigned long long)kern_wid,
(void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
if (verbose)
fprintf(stderr, " out : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2],
(unsigned long long)PyGpuArray_DIMS(out)[3],
cuda_get_ptr(out),
(long long)PyGpuArray_STRIDES(out)[0]/4,
(long long)PyGpuArray_STRIDES(out)[1]/4,
(long long)PyGpuArray_STRIDES(out)[2]/4,
(long long)PyGpuArray_STRIDES(out)[3]/4);
(unsigned long long)out_len,
(unsigned long long)out_wid,
(void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
if (verbose)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
outsize, n_blocks[0], threads_per_block[0]);
if (verbose)
fprintf(stderr, " subsample params: %llu %llu\n",
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
}
conv_reference_full<<<n_blocks, n_threads>>>(
PyGpuArray_DIMS(img)[0], PyGpuArray_DIMS(kern)[0],
PyGpuArray_DIMS(img)[1],
PyGpuArray_DIMS(img)[2], PyGpuArray_DIMS(img)[3],
PyGpuArray_DIMS(kern)[2], PyGpuArray_DIMS(kern)[3],
PyGpuArray_DIMS(out)[2], PyGpuArray_DIMS(out)[3],
cuda_get_ptr(img), PyGpuArray_STRIDES(img)[0]/4,
PyGpuArray_STRIDES(img)[1]/4,
PyGpuArray_STRIDES(img)[2]/4,
PyGpuArray_STRIDES(img)[3]/4,
cuda_get_ptr(kern), PyGpuArray_STRIDES(kern)[0]/4,
PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[2]/4,
PyGpuArray_STRIDES(kern)[3]/4,
cuda_get_ptr(out), PyGpuArray_STRIDES(out)[0]/4,
PyGpuArray_STRIDES(out)[1]/4,
PyGpuArray_STRIDES(out)[2]/4,
PyGpuArray_STRIDES(out)[3]/4,
subsample_rows, subsample_cols);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, 0, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_full' version"
......@@ -1394,17 +1378,18 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
else
{
if (verbose)
fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads, 1, n_blocks, 1, 0, n_threads);
fprintf(stderr, "threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], 1, n_blocks[0], 1, 0, threads_per_block[0]);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts));
GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1;
}
}
......
......@@ -3,13 +3,20 @@ import os
import theano
from theano import config, gof
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from six.moves import reduce
from .comp import NVCC_compiler
from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, HideC, Kernel)
from theano.gof import utils
class GpuConv(gof.Op):
class GpuConv(GpuKernelBase, HideC, gof.Op):
"""
Implement the batched and stacked 2d convolution on the gpu.
......@@ -223,29 +230,30 @@ class GpuConv(gof.Op):
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
def c_headers(self):
return ['<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>']
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['<stdint.h>', '<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 21)
def c_init_code(self):
return ['cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
# these files
files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']
codes = ["CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);",
"float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}",
"const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}"]
codes += [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files]
return reduce(str.__add__, codes)
def c_compiler(self):
return NVCC_compiler
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp
......@@ -270,8 +278,8 @@ class GpuConv(gof.Op):
//Optional args
int version = %(version)s;
int verbose = %(verbose)s;
int dx = %(dx)s;
int dy = %(dy)s;
size_t dx = %(dx)s;
size_t dy = %(dy)s;
int mode;
if (strcmp(mode_str, "full") == 0)
......@@ -286,7 +294,7 @@ class GpuConv(gof.Op):
{
PyErr_SetString(PyExc_ValueError,
"mode must be one of 'full' or 'valid'");
return NULL;
return 0;
}
// TODO, make out be decref before we alloc out2!
......@@ -303,3 +311,266 @@ class GpuConv(gof.Op):
%(fail)s
}
""" % sub
def c_support_code_apply(self, node, name):
    """Generate the per-apply C support code for this op.

    Builds the CUDA device source (conv_kernel.cu + conv_full_kernel.cu)
    with THEANO_KERN_WID defined from self.kshp, compiles it once through
    pygpu to obtain the binary, then emits a C module that embeds both the
    escaped source and the binary blob, declares one static GpuKernel per
    kernel variant, and appends the host-side driver code from conv.cu.

    Fixes vs. previous revision:
    - iterate the compiled binary through bytearray() so the hex dump works
      on both Python 2 (str chars) and Python 3 (bytes yield ints, where
      ord() would raise TypeError);
    - close the .cu source files deterministically;
    - drop the unused local `kname`.
    """
    # REMEMBER TO RAISE c_code_cache_version when changing any of the
    # support code files this reads.
    nb = 0
    if self.kshp is not None:
        nb = self.kshp[1]
    kernels = self.gpu_kernels(node, name)
    # Any kernel's name/params/flags suffice to drive the one-shot compile.
    k = kernels[0]
    code = """
    #define THEANO_KERN_WID %(nb)d
    """ % locals()
    here = os.path.split(__file__)[0]
    parts = []
    for fname in ("conv_kernel.cu", "conv_full_kernel.cu"):
        with open(os.path.join(here, fname)) as src:
            parts.append(src.read())
    code += "\n".join(parts)
    gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
    binary = gk._binary
    # bytearray() yields ints under both py2 and py3, so hex() is safe.
    bcode = ','.join(hex(b) for b in bytearray(binary))
    # Escape the source so it can be embedded as a C string literal.
    code = code.replace('\\', '\\\\')
    code = code.replace('"', '\\"')
    code = code.replace('\n', '\\n')
    # %% keeps a literal '%' through the locals() substitution below.
    mod = """
    template <typename T>
    static T ceil_intdiv(T a, T b)
    {
        return (a/b) + ((a %% b) ? 1: 0);
    }

    static const char conv_bcode[] = {%(bcode)s};
    static const char *conv_code = "%(code)s";
    """ % locals()
    for kern in kernels:
        mod += "static GpuKernel " + kern.name + '_' + name + ";\n"
    with open(os.path.join(here, "conv.cu")) as src:
        mod += src.read()
    return mod
@utils.memoize
def gpu_kernels(self, node, name):
    """Collect every Kernel descriptor this op's generated code needs.

    Kernel flags are derived from the dtypes of all inputs and outputs
    of the node; the result is memoized per (node, name).
    """
    dtypes = [v.dtype for v in node.inputs]
    dtypes += [v.dtype for v in node.outputs]
    flags = Kernel.get_flags(*dtypes)
    builders = [
        self.conv_patch_kernels,
        self.conv_patch_stack_kernels,
        self.conv_patch_stack_reduce_kernels,
        self.conv_rows_kernels,
        self.conv_rows_stack_kernels,
        self.conv_rows_stack2_kernels,
        self.conv_valid_row_reduce_kernels,
        self.conv_reference_valid_kernels,
        self.conv_reference_full_kernels,
        self.conv_full_patch_kernels,
        self.conv_full_patch_stack_kernels,
        self.conv_full_patch_stack_padded_kernels,
        self.conv_full_load_everything_kernels,
    ]
    kernels = []
    for build in builders:
        kernels.extend(build(name, flags))
    return kernels
def conv_patch_kernels(self, name, flags):
    """Descriptors for conv_patch_<i> (i in {2, 3}): three
    (GpuArray, offset) argument pairs followed by six int32 scalars."""
    sig = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 6
    out = []
    for variant in [2, 3]:
        out.append(Kernel(None, sig, "conv_patch_%d" % variant, flags,
                          'conv_code', 'conv_bcode',
                          ("conv_patch_%d_" + name) % variant))
    return out
def conv_patch_stack_kernels(self, name, flags):
    """Descriptors for conv_patch_stack_<i>, i in [64, 96): three
    (GpuArray, offset) argument pairs plus 18 int32 scalars."""
    sig = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 18
    return [Kernel(None, sig, "conv_patch_stack_%d" % i, flags,
                   'conv_code', 'conv_bcode',
                   ("conv_patch_stack_%d_" + name) % i)
            for i in range(64, 96)]
def conv_patch_stack_reduce_kernels(self, name, flags):
    """Descriptors for the twelve instantiated conv_patch_stack_reduce_<i>
    variants: three (GpuArray, offset) pairs plus 14 int32 scalars."""
    variants = [1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15]
    sig = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 14
    return [Kernel(None, sig, "conv_patch_stack_reduce_%d" % i, flags,
                   'conv_code', 'conv_bcode',
                   ("conv_patch_stack_reduce_%d_" + name) % i)
            for i in variants]
def conv_rows_kernels(self, name, flags):
    """Descriptors for conv_rows_0 and conv_rows_1: three
    (GpuArray, offset) argument pairs plus 14 int32 scalars."""
    sig = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 14
    out = []
    for i in [0, 1]:
        out.append(Kernel(None, sig, "conv_rows_%d" % i, flags,
                          'conv_code', 'conv_bcode',
                          ("conv_rows_%d_" + name) % i))
    return out
def conv_rows_stack_kernels(self, name, flags):
    """Descriptors for conv_rows_stack_0 and conv_rows_stack_1: three
    (GpuArray, offset) argument pairs plus 14 int32 scalars."""
    sig = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 14
    return [Kernel(None, sig, "conv_rows_stack_%d" % i, flags,
                   'conv_code', 'conv_bcode',
                   ("conv_rows_stack_%d_" + name) % i)
            for i in [0, 1]]
def conv_rows_stack2_kernels(self, name, flags):
    """Descriptors for conv_rows_stack2_<i> (i in 0..3): three
    (GpuArray, offset) argument pairs plus 14 int32 scalars."""
    sig = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 14
    out = []
    for i in [0, 1, 2, 3]:
        out.append(Kernel(None, sig, "conv_rows_stack2_%d" % i, flags,
                          'conv_code', 'conv_bcode',
                          ("conv_rows_stack2_%d_" + name) % i))
    return out
def conv_valid_row_reduce_kernels(self, name, flags):
    """Descriptors for conv_valid_row_reduce_0/_1: nine leading int32
    scalars, then three (GpuArray, offset) pairs each followed by four
    int32 strides, then three trailing int32 scalars."""
    buf = [gpuarray.GpuArray, 'uintp']
    sig = (['intc'] * 9 +
           buf + ['intc'] * 4 +
           buf + ['intc'] * 4 +
           buf + ['intc'] * 4 +
           ['intc'] * 3)
    return [Kernel(None, sig, "conv_valid_row_reduce_%d" % i, flags,
                   'conv_code', 'conv_bcode',
                   ("conv_valid_row_reduce_%d_" + name) % i)
            for i in [0, 1]]
def conv_reference_valid_kernels(self, name, flags):
    """Descriptor for the single conv_reference_valid kernel: nine
    leading int32 scalars, three (GpuArray, offset) pairs each followed
    by four int32 strides, and two trailing int32 scalars."""
    buf = [gpuarray.GpuArray, 'uintp']
    sig = (['intc'] * 9 +
           buf + ['intc'] * 4 +
           buf + ['intc'] * 4 +
           buf + ['intc'] * 4 +
           ['intc'] * 2)
    return [Kernel(None, sig, "conv_reference_valid", flags,
                   'conv_code', 'conv_bcode',
                   "conv_reference_valid_" + name)]
def conv_reference_full_kernels(self, name, flags):
    """Descriptor for the single conv_reference_full kernel; same
    signature shape as conv_reference_valid: nine leading int32 scalars,
    three (GpuArray, offset) pairs each with four int32 strides, and two
    trailing int32 scalars."""
    buf = [gpuarray.GpuArray, 'uintp']
    sig = (['intc'] * 9 +
           buf + ['intc'] * 4 +
           buf + ['intc'] * 4 +
           buf + ['intc'] * 4 +
           ['intc'] * 2)
    return [Kernel(None, sig, "conv_reference_full", flags,
                   'conv_code', 'conv_bcode',
                   "conv_reference_full_" + name)]
def conv_full_patch_kernels(self, name, flags):
    """Descriptor for the single conv_full_patch kernel: three
    (GpuArray, offset) argument pairs plus six int32 scalars."""
    sig = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 6
    return [Kernel(None, sig, "conv_full_patch", flags,
                   'conv_code', 'conv_bcode',
                   "conv_full_patch_" + name)]
def conv_full_patch_stack_kernels(self, name, flags):
    """Descriptors for conv_full_patch_stack_<i> (i in 0..3): three
    (GpuArray, offset) argument pairs plus 12 int32 scalars."""
    sig = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 12
    out = []
    for i in [0, 1, 2, 3]:
        out.append(Kernel(None, sig, "conv_full_patch_stack_%d" % i, flags,
                          'conv_code', 'conv_bcode',
                          ("conv_full_patch_stack_%d_" + name) % i))
    return out
def conv_full_patch_stack_padded_kernels(self, name, flags):
    """Descriptors for the twelve instantiated
    conv_full_patch_stack_padded_<i> variants: three (GpuArray, offset)
    argument pairs plus 14 int32 scalars."""
    variants = [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14]
    sig = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 14
    return [Kernel(None, sig, "conv_full_patch_stack_padded_%d" % i, flags,
                   'conv_code', 'conv_bcode',
                   ("conv_full_patch_stack_padded_%d_" + name) % i)
            for i in variants]
def conv_full_load_everything_kernels(self, name, flags):
    """Descriptor for the single conv_full_load_everything kernel: three
    (GpuArray, offset) argument pairs plus 14 int32 scalars."""
    sig = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 14
    return [Kernel(None, sig, "conv_full_load_everything", flags,
                   'conv_code', 'conv_bcode',
                   "conv_full_load_everything_" + name)]
extern __shared__ float s_data[];
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len/nb_split
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch_split(const float* img, const float* kern, float* out,
extern "C" __global__ void
conv_full_patch_split(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
// Thread index
......@@ -60,18 +67,23 @@ conv_full_patch_split(const float* img, const float* kern, float* out,
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch( const float* img, const float* kern, float* out,
extern "C" __global__ void
conv_full_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
// Thread index
......@@ -114,6 +126,8 @@ conv_full_patch( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len
......@@ -123,7 +137,9 @@ conv_full_patch( const float* img, const float* kern, float* out,
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__device__ inline void
conv_full_patch_stack( const float* img, const float* kern, float* out,
conv_full_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -131,12 +147,15 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
const float __shared__ *kern_, *img_;
extern __shared__ float s_data[];
const int batch_id = blockIdx.x;
const int nkern_id = blockIdx.y;
......@@ -186,7 +205,9 @@ extern "C" {
#define __INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \
conv_full_patch_stack_##suffix( \
const float *img, const float *kern, float *out, \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, \
int kern_len, int kern_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
......@@ -194,7 +215,8 @@ conv_full_patch_stack_##suffix( \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_full_patch_stack<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
......@@ -207,6 +229,8 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency.
......@@ -227,22 +251,34 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
*/
template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__device__ inline void
conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
conv_full_patch_stack_padded( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
int kern_stride_col, int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
__shared__ int batch_id, kern_id, img_wid_valid, nb_rows;
batch_id = blockIdx.x;
kern_id = blockIdx.y;
......@@ -380,7 +416,9 @@ extern "C" {
#define __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \
conv_full_patch_stack_padded_##suffix( \
const float *img, const float *kern, float *out, \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
......@@ -390,7 +428,8 @@ conv_full_patch_stack_padded_##suffix( \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_full_patch_stack_padded<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
......@@ -412,6 +451,7 @@ __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ;
......@@ -425,8 +465,10 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
{
return x[0] * y[0];
}
__global__ void
conv_full_load_everything( const float* img, const float* kern, float* out,
extern "C" __global__ void
conv_full_load_everything( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -435,12 +477,15 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
const int out_col = threadIdx.x;//output col
......@@ -503,6 +548,8 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
__syncthreads(); //don't start loading another kernel until we're done here
}
}
/*
Local Variables:
mode:c++
......
......@@ -29,7 +29,6 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
*/
#ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU
#include <stdint.h>
/*
#define CHECK_BANK_CONFLICTS 0
......@@ -220,11 +219,18 @@ __device__ void store_or_accumulate(float& dst,const float value ){
*/
template<bool flipped_kern, bool split>
__device__ inline void
conv_patch( const float* img, const float* kern, float* out,
conv_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -282,11 +288,14 @@ conv_patch( const float* img, const float* kern, float* out,
extern "C" {
#define __INSTANTIATE_CONV_PATCH(suffix, ...) \
__global__ void \
conv_patch_##suffix(const float *img, const float *kern, float *out, \
conv_patch_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack) \
{ \
conv_patch<__VA_ARGS__>(img, kern, out, img_len, img_wid, kern_len, \
conv_patch<__VA_ARGS__>(img, img_offset, kern, kern_offset, \
out, out_offset, img_len, img_wid, kern_len, \
kern_wid, nkern, nstack); \
}
......@@ -297,6 +306,7 @@ __INSTANTIATE_CONV_PATCH(3, true, true)
}
/**
* As conv_patch, but implement the stack in the kernel.
* I keep it separated from conv_patch as we take more registers and this could lower the occupency.
......@@ -320,7 +330,9 @@ __INSTANTIATE_CONV_PATCH(3, true, true)
*/
template<bool flipped_kern, bool accumulate, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__device__ inline void
conv_patch_stack( const float* img, const float* kern, float* out,
conv_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
......@@ -329,6 +341,11 @@ conv_patch_stack( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{
int __shared__ nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
......@@ -459,7 +476,9 @@ conv_patch_stack( const float* img, const float* kern, float* out,
extern "C" {
#define __INSTANTIATE_CONV_PATCH_STACK(suffix, ...) \
__global__ void \
conv_patch_stack_##suffix(const float *img, const float *kern, float *out, \
conv_patch_stack_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int out_len, int out_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
......@@ -469,7 +488,8 @@ conv_patch_stack_##suffix(const float *img, const float *kern, float *out, \
int dx, int dy) \
{ \
conv_patch_stack<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, out_len, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, out_len, \
out_wid, nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
......@@ -513,6 +533,7 @@ __INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)
}
/**
* As conv_patch_stack, but kern_len thread for each output pixel
* I keep it separated as use more register.
......@@ -529,7 +550,9 @@ __INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)
*/
template<bool flipped_kern, bool c_contiguous, bool split, bool preload_full_kern>
__device__ inline void
conv_patch_stack_reduce( const float* img, const float* kern, float* out,
conv_patch_stack_reduce( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch,
......@@ -543,6 +566,17 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
......@@ -636,7 +670,9 @@ extern "C" {
#define __INSTANTIATE_CONV_PATCH_STACK_REDUCE(suffix, ...) \
__global__ void \
conv_patch_stack_reduce_##suffix( \
const float *img, const float *kern, float *out, \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, int img_stride_col, int img_stride_row, \
int img_stride_stack, int img_stride_batch, \
......@@ -644,33 +680,35 @@ conv_patch_stack_reduce_##suffix( \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_patch_stack_reduce<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(0, false, false, false, false);*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(1, false, false, false, true);
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(2, false, false, true, false);
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(3, false, false, true, true);
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(4, false, true, false, false);*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(5, false, true, false, true);
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(6, false, true, true, false);
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(7, false, true, true, true);
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false);*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(9, true, false, false, true);
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(10, true, false, true, false);
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(11, true, false, true, true);
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false);*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(13, true, true, false, true);
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(14, true, true, true, false);
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true);
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE#(0, false, false, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(1, false, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(2, false, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(3, false, false, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE#(4, false, true, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(5, false, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(6, false, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(7, false, true, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(9, true, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(10, true, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(11, true, false, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(13, true, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(14, true, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true)
#undef __INSTANTIATE_CONV_PATCH_STACK_REDUCE
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* we store kern_len row of the image and the full kernel in the shared memory
......@@ -684,7 +722,9 @@ __INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true);
*/
template<bool c_contiguous>
__device__ inline void
conv_rows( const float* img, const float* kern, float* out,
conv_rows( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -694,6 +734,11 @@ conv_rows( const float* img, const float* kern, float* out,
{
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -735,7 +780,9 @@ conv_rows( const float* img, const float* kern, float* out,
extern "C" {
#define __INSTANTIATE_CONV_ROWS(suffix, ...) \
__global__ void \
conv_rows_##suffix(const float *img, const float *kern, float *out, \
conv_rows_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
......@@ -744,7 +791,8 @@ conv_rows_##suffix(const float *img, const float *kern, float *out, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_rows<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
......@@ -757,6 +805,8 @@ __INSTANTIATE_CONV_ROWS(1, true)
#undef __INSTANTIATE_CONV_ROWS
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows, but implement the stack. Separate as this use more register.
......@@ -770,7 +820,9 @@ __INSTANTIATE_CONV_ROWS(1, true)
*/
template<bool c_contiguous>
__device__ inline void
conv_rows_stack( const float* img, const float* kern, float* out,
conv_rows_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
......@@ -780,6 +832,11 @@ conv_rows_stack( const float* img, const float* kern, float* out,
{
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -859,7 +916,9 @@ extern "C" {
#define __INSTANTIATE_CONV_ROWS_STACK(suffix, ...) \
__global__ void \
conv_rows_stack_##suffix( \
const float *img, const float *kern, float *out, \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
......@@ -869,7 +928,8 @@ conv_rows_stack_##suffix( \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_rows_stack<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
......@@ -882,6 +942,8 @@ __INSTANTIATE_CONV_ROWS_STACK(1, true)
#undef __INSTANTIATE_CONV_ROWS_STACK
}
/**
 * WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY
 * Like conv_rows_stack, but loads only block_len rows of the image at a time, and either one kernel row or the full kernel.
......@@ -895,7 +957,9 @@ __INSTANTIATE_CONV_ROWS_STACK(1, true)
*/
template<bool c_contiguous, bool preload_full_kern>
__device__ inline void
conv_rows_stack2(const float* img, const float* kern, float* out,
conv_rows_stack2(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
......@@ -905,6 +969,11 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
{
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -984,7 +1053,9 @@ extern "C" {
#define __INSTANTIATE_CONV_ROWS_STACK2(suffix, ...) \
__global__ void \
conv_rows_stack2_##suffix( \
const float *img, const float *kern, float *out, \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
......@@ -994,8 +1065,8 @@ conv_rows_stack2_##suffix( \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_rows_stack2<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, \
kern_len, kern_wid, nkern, nstack, \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
......@@ -1009,6 +1080,8 @@ __INSTANTIATE_CONV_ROWS_STACK2(3, true, true)
#undef __INSTANTIATE_CONV_ROWS_STACK2
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
......@@ -1024,13 +1097,18 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
const float *img, const size_t img_offset, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols,
const int initial_reduce_boundary)
{
const int outsize = nB * nK * out_len * out_wid;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
extern __shared__ float reducebuf[];
for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
{
......@@ -1110,18 +1188,21 @@ __global__ void \
conv_valid_row_reduce_##suffix( \
int nB, int nK, int stacklen, int img_len, int img_wid, \
int kern_len, int kern_wid, int out_len, int out_wid, \
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, \
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C, \
const float *img, const size_t img_offset, \
int img_str_B, int img_str_S, int img_str_R, int img_str_C, \
const float *kern, const size_t kern_offset, \
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \
float *out, const size_t out_offset, \
int out_str_B, int out_str_K, int out_str_R, int out_str_C, \
int subsample_rows, int subsample_cols, \
const int initial_reduce_boundary) \
{ \
conv_valid_row_reduce<__VA_ARGS__>( \
nB, nK, stacklen, img_len, img_wid, \
kern_len, kern_wid, out_len, out_wid, \
img, img_str_B, img_str_S, img_str_R, img_str_C, \
kern, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
out, out_str_B, out_str_K, out_str_R, out_str_C, \
img, img_offset, img_str_B, img_str_S, img_str_R, img_str_C, \
kern, kern_offset, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
out, out_offset, out_str_B, out_str_K, out_str_R, out_str_C, \
subsample_rows, subsample_cols, initial_reduce_boundary); \
}
......@@ -1132,6 +1213,7 @@ __INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
}
/**
* Reference implementation of 'valid' mode convolution (with stack)
*
......@@ -1139,18 +1221,26 @@ __INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
*
* TODO: explain parameters, preconditions
*/
__global__ void
extern "C" __global__ void
conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
const float *img, const size_t img_offset,
int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ int numThreads, outsize;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x;
outsize = nB * nK * out_len * out_wid;
......@@ -1191,6 +1281,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
}
}
/**
* Reference implementation of 'full' mode convolution (with stack)
*
......@@ -1198,18 +1290,26 @@ conv_reference_valid(int nB, int nK, int stacklen,
*
* TODO: explain parameters, preconditions
*/
__global__ void
extern "C" __global__ void
conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C,
const float *img, const size_t img_offset,
int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ int numThreads, physical_outsize;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x;
physical_outsize = nB * nK * out_len * out_wid;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论