Commit 5ecbbde2, authored by abergeron

Merge pull request #3364 from seanprime7/drvapi

Use the libgpuarray APIs to manage GPU code compilation, execution, etc.
......@@ -144,6 +144,15 @@ class GpuKernelBase(object):
def _generate_kernel_vars(self, k):
return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
def c_support_code(self):
    # Returns C support code shared by all generated GPU kernels: a
    # templated ceil_intdiv helper (ceiling integer division, a/b rounded
    # up) used when computing launch/block dimensions.
    # NOTE(review): the string contents are emitted verbatim into the
    # generated C file, so their exact bytes (including leading
    # whitespace as shown) must be preserved.
    return """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
def c_support_code_apply(self, node, name):
kernels = self.gpu_kernels(node, name)
bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
......
......@@ -10,12 +10,6 @@ PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
const size_t subsample_cols,
const int version, const int verbose);
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
/*
* version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation
......@@ -108,6 +102,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
const int stack_len = PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0];
......@@ -126,6 +121,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
......@@ -156,16 +155,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
const float * kern_data_unflipped = cuda_get_ptr(kern);
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
//if we remove the restriction
......@@ -195,46 +188,47 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int);
#define CONV_PATCH_SPECIAL(kern_wid) \
if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\
else f=conv_patch<true,kern_wid,true>;
CONV_PATCH_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(threads_per_block[1]==out_len) k=&conv_patch_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_patch_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n",
threads.y==out_len ? "no split": "split", nb_split);
threads_per_block[1]==out_len ? "no split": "split", nb_split);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, nb_split);
"threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i, nb_split=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1], nb_split);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -253,77 +247,77 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)ceil_intdiv(out_len,nb_split),
(size_t)1};
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false;
dim3 grid(nbatch,nkern);
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid,
out_len, out_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
GpuKernel *k = NULL;
if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_64_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_65_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_66_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_67_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_68_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_69_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_70_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_71_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_72_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_73_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_74_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_75_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_76_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_77_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_78_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_79_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_80_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_81_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_82_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_83_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_84_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_85_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_86_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_87_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_88_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_89_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_90_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_91_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_92_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_93_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_94_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_95_node_<<<<HASH_PLACEHOLDER>>>>_0;}
void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
......@@ -342,15 +336,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
......@@ -359,7 +353,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -371,30 +365,28 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
!work_complete) //conv_rows
{
dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\
CONV_ROWS_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)1, (size_t)1};
size_t n_blocks[3] = {(size_t)out_len, (size_t)nbatch*nkern, (size_t)1};
size_t shmem_sz = (kern_len*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_rows_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
......@@ -404,15 +396,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
if (!subsample && out_contiguous &&
......@@ -430,52 +422,50 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_row=i;
}
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
if (0)
fprintf(stderr,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
img_contiguous_2d, kern_contiguous_2d,
threads.x, threads.y, threads.z,
grid.x, grid.y, grid.z);
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], n_blocks[2]);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n");
f = conv_rows_stack<THEANO_KERN_WID, false>;
k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
} else {
//fprintf(stderr, "using true version\n");
f = conv_rows_stack<THEANO_KERN_WID, true>;
k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
}
f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
}
......@@ -483,15 +473,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -524,45 +514,41 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--;
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
int shared_size=(threads.y*img_wid + k_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
else f = conv_rows_stack2<kern_wid, true, false>;
CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) k=&conv_rows_stack2_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==9) k=&conv_rows_stack2_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_stack2_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_rows_stack2_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with"
......@@ -574,15 +560,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,(version==9?2:3));
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i version=%d\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],(version==9?2:3));
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -629,18 +615,18 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_split++;
// tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
size_t thread_z=ceil_intdiv(kern_len,nb_split);
size_t shmem_sz = sizeof(float)*(full_kern
? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max((size_t)img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
while ((shmem_sz >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more.
nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split);
shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
shmem_sz = sizeof(float)*(full_kern
? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
}
if (nb_split <= kern_len)
......@@ -648,57 +634,59 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert(thread_z>0);//should not happen, but in case...
if(!full_kern) assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)out_len,
(size_t)thread_z};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
GpuKernel *k = NULL;
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
/* if(!kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
/*else*/ if(!kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(!kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_4_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(!kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_7_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_8_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_11_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_12_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_15_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col,
(void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i, "
"grid.x=%i, grid.y=%i, shared_size=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i, "
"n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version"
......@@ -711,17 +699,17 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z);
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
} // else no good nb_splits was found
}
......@@ -730,8 +718,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
kern_len<=320 &&
!work_complete) //conv_valid_row_reduce
{
int outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096);
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
int block_nstack=nstack;
//Max of 512 threads per blocks.
......@@ -739,9 +728,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--;
dim3 n_threads(block_nstack, kern_len, 1);
size_t threads_per_block[3] = {(size_t)block_nstack, (size_t)kern_len, (size_t)1};
int n_reduce_buf = block_nstack * kern_len * sizeof(float);
size_t n_reduce_buf = block_nstack * kern_len * sizeof(float);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
*
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
......@@ -758,39 +747,34 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert (initial_reduce_boundary < n_reduce_buf/sizeof(float));
}
void (*f)(int, int, int, int,
int, int, int, int, int,
const float*, int, int, int, int,
const float*, int, int, int, int,
float*, int, int, int, int,
int, int, int);
GpuKernel *k = NULL;
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack)
f=conv_valid_row_reduce<false>;
k=&conv_valid_row_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else
f=conv_valid_row_reduce<true>;
f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, PyGpuArray_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
cuda_get_ptr(img),
PyGpuArray_STRIDES(img)[0]/4, PyGpuArray_STRIDES(img)[1]/4,
img_stride_row, img_stride_col,
cuda_get_ptr(kern),
PyGpuArray_STRIDES(kern)[0]/4, PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[2]/4, PyGpuArray_STRIDES(kern)[3]/4,
cuda_get_ptr(out),
PyGpuArray_STRIDES(out)[0]/4, PyGpuArray_STRIDES(out)[1]/4,
PyGpuArray_STRIDES(out)[2]/4, PyGpuArray_STRIDES(out)[3]/4,
subsample_rows, subsample_cols, initial_reduce_boundary);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
k=&conv_valid_row_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols,
(void *)&initial_reduce_boundary};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, n_reduce_buf, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
......@@ -800,24 +784,27 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads.x, n_threads.y, n_blocks,
n_reduce_buf, n_threads.x * n_threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0],
n_reduce_buf, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
if (1 && !work_complete) //conv_reference_valid
{
int outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
256);
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (1)
{
if (verbose)
......@@ -825,61 +812,56 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (verbose>1)
fprintf(stderr, " img : %i %llu %i %i %p "
"%lld %lld %lld %lld\n",
nbatch, (unsigned long long)PyGpuArray_DIMS(img)[1],
img_len, img_wid,
cuda_get_ptr(img),
(long long)PyGpuArray_STRIDES(img)[0]/4,
(long long)PyGpuArray_STRIDES(img)[1]/4,
(long long)PyGpuArray_STRIDES(img)[2]/4,
(long long)PyGpuArray_STRIDES(img)[3]/4);
nbatch, (unsigned long long)stack_len, img_len, img_wid,
(void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p "
"%lld %lld %lld %lld\n",
nkern, nstack, kern_len, kern_wid,
cuda_get_ptr(kern),
(long long)PyGpuArray_STRIDES(kern)[0]/4,
(long long)PyGpuArray_STRIDES(kern)[1]/4,
(long long)PyGpuArray_STRIDES(kern)[2]/4,
(long long)PyGpuArray_STRIDES(kern)[3]/4);
(void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
if (verbose>1)
fprintf(stderr, " out : %llu %llu %i %i %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
out_len, out_wid,
cuda_get_ptr(out),
(long long)PyGpuArray_STRIDES(out)[0]/4,
(long long)PyGpuArray_STRIDES(out)[1]/4,
(long long)PyGpuArray_STRIDES(out)[2]/4,
(long long)PyGpuArray_STRIDES(out)[3]/4);
(void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
outsize, n_blocks[0], threads_per_block[0]);
}
conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
PyGpuArray_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
cuda_get_ptr(img),
PyGpuArray_STRIDES(img)[0]/4,
PyGpuArray_STRIDES(img)[1]/4,
PyGpuArray_STRIDES(img)[2]/4,
PyGpuArray_STRIDES(img)[3]/4,
cuda_get_ptr(kern),
PyGpuArray_STRIDES(kern)[0]/4,
PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[2]/4,
PyGpuArray_STRIDES(kern)[3]/4,
cuda_get_ptr(out),
PyGpuArray_STRIDES(out)[0]/4,
PyGpuArray_STRIDES(out)[1]/4,
PyGpuArray_STRIDES(out)[2]/4,
PyGpuArray_STRIDES(out)[3]/4,
subsample_rows, subsample_cols);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, 0, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
......@@ -892,7 +874,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" PyGpuArray_conv_valid! (%s)",
cudaGetErrorString(sts));
GpuKernel_error(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1;
}
}
......@@ -941,6 +923,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]);
assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]);
const int stack_len=PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0];
......@@ -959,6 +942,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
......@@ -1001,16 +988,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
const float * kern_data_unflipped = cuda_get_ptr(kern);
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
if (verbose>1)
......@@ -1019,34 +1000,34 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
printf("INFO: img dim: %llu %llu %llu %llu "
"img stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0],
(unsigned long long)PyGpuArray_DIMS(img)[1],
(unsigned long long)PyGpuArray_DIMS(img)[2],
(unsigned long long)PyGpuArray_DIMS(img)[3],
(long long)PyGpuArray_STRIDES(img)[0]/4,
(long long)PyGpuArray_STRIDES(img)[1]/4,
(long long)PyGpuArray_STRIDES(img)[2]/4,
(long long)PyGpuArray_STRIDES(img)[3]/4);
(unsigned long long)nbatch,
(unsigned long long)stack_len,
(unsigned long long)img_len,
(unsigned long long)img_wid,
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
printf("INFO: kern dim: %llu %llu %llu %llu "
"kern stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0],
(unsigned long long)PyGpuArray_DIMS(kern)[1],
(unsigned long long)PyGpuArray_DIMS(kern)[2],
(unsigned long long)PyGpuArray_DIMS(kern)[3],
(long long)PyGpuArray_STRIDES(kern)[0]/4,
(long long)PyGpuArray_STRIDES(kern)[1]/4,
(long long)PyGpuArray_STRIDES(kern)[2]/4,
(long long)PyGpuArray_STRIDES(kern)[3]/4);
(unsigned long long)nkern,
(unsigned long long)nstack,
(unsigned long long)kern_len,
(unsigned long long)kern_wid,
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
printf("INFO: out dim: %llu %llu %llu %llu "
"out stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2],
(unsigned long long)PyGpuArray_DIMS(out)[3],
(long long)PyGpuArray_STRIDES(out)[0]/4,
(long long)PyGpuArray_STRIDES(out)[1]/4,
(long long)PyGpuArray_STRIDES(out)[2]/4,
(long long)PyGpuArray_STRIDES(out)[3]/4);
(unsigned long long)out_len,
(unsigned long long)out_wid,
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
}
if (!subsample &&
......@@ -1093,53 +1074,53 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert(version!=5 || kern_len>1);
assert(version!=-1);
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch,nkern);
size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
int shared_size=img_size_padded_byte + kern_size_byte;
size_t shmem_sz=img_size_padded_byte + kern_size_byte;
if(version==5)
shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \
if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
shmem_sz=((kern_len+threads_per_block[1]-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
GpuKernel *k = NULL;
if(version==3) k=&conv_full_patch_stack_padded_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5) k=&conv_full_patch_stack_padded_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4) k=&conv_full_patch_stack_padded_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) k=&conv_full_patch_stack_padded_4_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) k=&conv_full_patch_stack_padded_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) k=&conv_full_patch_stack_padded_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==3 && kern_flipped) k=&conv_full_patch_stack_padded_8_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5 && kern_flipped)k=&conv_full_patch_stack_padded_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4 && kern_flipped)k=&conv_full_patch_stack_padded_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) k=&conv_full_patch_stack_padded_12_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) k=&conv_full_patch_stack_padded_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) k=&conv_full_patch_stack_padded_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else assert(false);
CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
......@@ -1152,12 +1133,12 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
......@@ -1165,7 +1146,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" failed (%s), trying next implementation\n",
version==3?"no split": "split",
(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -1176,21 +1157,22 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
conv_full_patch<<< grid, threads, shared_size>>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n");
work_complete = true;
......@@ -1199,15 +1181,15 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
}
}
if (false && !subsample && //disabled as test fail for this kernel
......@@ -1217,37 +1199,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch);
int shared_size=(img_size + kern_size)*nstack*sizeof(float);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)1, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//typeof(conv_full_load_everything<0>) f = ;
void (*f)(const float*, const float*, float*,
int, int, int, int, int, int,
int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
f = conv_full_load_everything<THEANO_KERN_WID>;
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,
PyGpuArray_STRIDES(img)[3]/4,
PyGpuArray_STRIDES(img)[2]/4,
PyGpuArray_STRIDES(img)[1]/4,
PyGpuArray_STRIDES(img)[0]/4,
PyGpuArray_STRIDES(kern)[3]/4,
PyGpuArray_STRIDES(kern)[2]/4,
PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[0]/4
);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n");
work_complete = true;
......@@ -1256,14 +1227,14 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
}
}
......@@ -1275,32 +1246,29 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int);
if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\
else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<false,true>;\
else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>;
f<<< grid, threads, shared_size>>>(
cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,img_stride_col, img_stride_row,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(!img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
......@@ -1310,23 +1278,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
if (1 && !work_complete) //conv_reference_full
{
if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");
int outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
256);
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (0)
{
if (verbose)
......@@ -1334,70 +1305,67 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
if (verbose)
fprintf(stderr, " img : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0],
(unsigned long long)PyGpuArray_DIMS(img)[1],
(unsigned long long)PyGpuArray_DIMS(img)[2],
(unsigned long long)PyGpuArray_DIMS(img)[3],
cuda_get_ptr(img),
(long long)PyGpuArray_STRIDES(img)[0]/4,
(long long)PyGpuArray_STRIDES(img)[1]/4,
(long long)PyGpuArray_STRIDES(img)[2]/4,
(long long)PyGpuArray_STRIDES(img)[3]/4);
(unsigned long long)nbatch,
(unsigned long long)stack_len,
(unsigned long long)img_len,
(unsigned long long)img_wid,
(void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
if (verbose)
fprintf(stderr, " kern: %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0],
(unsigned long long)PyGpuArray_DIMS(kern)[1],
(unsigned long long)PyGpuArray_DIMS(kern)[2],
(unsigned long long)PyGpuArray_DIMS(kern)[3],
cuda_get_ptr(kern),
(long long)PyGpuArray_STRIDES(kern)[0]/4,
(long long)PyGpuArray_STRIDES(kern)[1]/4,
(long long)PyGpuArray_STRIDES(kern)[2]/4,
(long long)PyGpuArray_STRIDES(kern)[3]/4
);
(unsigned long long)nkern,
(unsigned long long)nstack,
(unsigned long long)kern_len,
(unsigned long long)kern_wid,
(void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
if (verbose)
fprintf(stderr, " out : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2],
(unsigned long long)PyGpuArray_DIMS(out)[3],
cuda_get_ptr(out),
(long long)PyGpuArray_STRIDES(out)[0]/4,
(long long)PyGpuArray_STRIDES(out)[1]/4,
(long long)PyGpuArray_STRIDES(out)[2]/4,
(long long)PyGpuArray_STRIDES(out)[3]/4);
(unsigned long long)out_len,
(unsigned long long)out_wid,
(void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
if (verbose)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
outsize, n_blocks[0], threads_per_block[0]);
if (verbose)
fprintf(stderr, " subsample params: %llu %llu\n",
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
}
conv_reference_full<<<n_blocks, n_threads>>>(
PyGpuArray_DIMS(img)[0], PyGpuArray_DIMS(kern)[0],
PyGpuArray_DIMS(img)[1],
PyGpuArray_DIMS(img)[2], PyGpuArray_DIMS(img)[3],
PyGpuArray_DIMS(kern)[2], PyGpuArray_DIMS(kern)[3],
PyGpuArray_DIMS(out)[2], PyGpuArray_DIMS(out)[3],
cuda_get_ptr(img), PyGpuArray_STRIDES(img)[0]/4,
PyGpuArray_STRIDES(img)[1]/4,
PyGpuArray_STRIDES(img)[2]/4,
PyGpuArray_STRIDES(img)[3]/4,
cuda_get_ptr(kern), PyGpuArray_STRIDES(kern)[0]/4,
PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[2]/4,
PyGpuArray_STRIDES(kern)[3]/4,
cuda_get_ptr(out), PyGpuArray_STRIDES(out)[0]/4,
PyGpuArray_STRIDES(out)[1]/4,
PyGpuArray_STRIDES(out)[2]/4,
PyGpuArray_STRIDES(out)[3]/4,
subsample_rows, subsample_cols);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, 0, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_full' version"
......@@ -1410,17 +1378,18 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
else
{
if (verbose)
fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads, 1, n_blocks, 1, 0, n_threads);
fprintf(stderr, "threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], 1, n_blocks[0], 1, 0, threads_per_block[0]);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts));
GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1;
}
}
......
......@@ -3,13 +3,20 @@ import os
import theano
from theano import config, gof
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from six.moves import reduce
from .comp import NVCC_compiler
from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from theano.gof import utils
class GpuConv(gof.Op):
class GpuConv(GpuKernelBase, gof.Op):
"""
Implement the batched and stacked 2d convolution on the gpu.
......@@ -223,29 +230,29 @@ class GpuConv(gof.Op):
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
def c_headers(self):
return ['<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>']
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['<stdint.h>', '<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 21)
def c_init_code(self):
return ['cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
# these files
files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']
codes = ["CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);",
"float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}",
"const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}"]
codes += [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files]
return reduce(str.__add__, codes)
def c_compiler(self):
return NVCC_compiler
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp
......@@ -270,8 +277,8 @@ class GpuConv(gof.Op):
//Optional args
int version = %(version)s;
int verbose = %(verbose)s;
int dx = %(dx)s;
int dy = %(dy)s;
size_t dx = %(dx)s;
size_t dy = %(dy)s;
int mode;
if (strcmp(mode_str, "full") == 0)
......@@ -286,7 +293,7 @@ class GpuConv(gof.Op):
{
PyErr_SetString(PyExc_ValueError,
"mode must be one of 'full' or 'valid'");
return NULL;
return 0;
}
// TODO, make out be decref before we alloc out2!
......@@ -303,3 +310,261 @@ class GpuConv(gof.Op):
%(fail)s
}
""" % sub
def c_support_code_apply(self, node, name):
nb = 0
if self.kshp is not None:
nb = self.kshp[1]
kernels = self.gpu_kernels(node, name)
k = kernels[0]
code = """
#define THEANO_KERN_WID %(nb)d
""" % locals()
code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
kname = "conv_full_load_everything"
gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin)
code = code.replace('\\', '\\\\')
code = code.replace('"', '\\"')
code = code.replace('\n', '\\n')
mod = """
static const char conv_bcode[] = {%(bcode)s};
static const char *conv_code = "%(code)s";
""" % locals()
for k in kernels:
mod += "static GpuKernel " + k.name + '_' + name + ";\n"
mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read()
return mod
@utils.memoize
def gpu_kernels(self, node, name):
dtypes = [i.dtype for i in node.inputs]
dtypes.extend([o.dtype for o in node.outputs])
flags = Kernel.get_flags(*dtypes)
kernels = self.conv_patch_kernels(name, flags)
kernels.extend(self.conv_patch_stack_kernels(name, flags))
kernels.extend(self.conv_patch_stack_reduce_kernels(name, flags))
kernels.extend(self.conv_rows_kernels(name, flags))
kernels.extend(self.conv_rows_stack_kernels(name, flags))
kernels.extend(self.conv_rows_stack2_kernels(name, flags))
kernels.extend(self.conv_valid_row_reduce_kernels(name, flags))
kernels.extend(self.conv_reference_valid_kernels(name, flags))
kernels.extend(self.conv_reference_full_kernels(name, flags))
kernels.extend(self.conv_full_patch_kernels(name, flags))
kernels.extend(self.conv_full_patch_stack_kernels(name, flags))
kernels.extend(self.conv_full_patch_stack_padded_kernels(name, flags))
kernels.extend(self.conv_full_load_everything_kernels(name, flags))
return kernels
def conv_patch_kernels(self, name, flags):
kname = "conv_patch_%d"
k_var = "conv_patch_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [2, 3]
]
def conv_patch_stack_kernels(self, name, flags):
kname = "conv_patch_stack_%d"
k_var = "conv_patch_stack_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in range(64, 96)
]
def conv_patch_stack_reduce_kernels(self, name, flags):
kname = "conv_patch_stack_reduce_%d"
k_var = "conv_patch_stack_reduce_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15]
]
def conv_rows_kernels(self, name, flags):
kname = "conv_rows_%d"
k_var = "conv_rows_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1]
]
def conv_rows_stack_kernels(self, name, flags):
kname = "conv_rows_stack_%d"
k_var = "conv_rows_stack_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1]
]
def conv_rows_stack2_kernels(self, name, flags):
kname = "conv_rows_stack2_%d"
k_var = "conv_rows_stack2_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1, 2, 3]
]
def conv_valid_row_reduce_kernels(self, name, flags):
kname = "conv_valid_row_reduce_%d"
k_var = "conv_valid_row_reduce_%d_" + name
params = [
'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1]
]
def conv_reference_valid_kernels(self, name, flags):
kname = "conv_reference_valid"
k_var = "conv_reference_valid_" + name
params = [
'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
def conv_reference_full_kernels(self, name, flags):
kname = "conv_reference_full"
k_var = "conv_reference_full_" + name
params = [
'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
def conv_full_patch_kernels(self, name, flags):
kname = "conv_full_patch"
k_var = "conv_full_patch_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
def conv_full_patch_stack_kernels(self, name, flags):
kname = "conv_full_patch_stack_%d"
k_var = "conv_full_patch_stack_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1, 2, 3]
]
def conv_full_patch_stack_padded_kernels(self, name, flags):
kname = "conv_full_patch_stack_padded_%d"
k_var = "conv_full_patch_stack_padded_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14]
]
def conv_full_load_everything_kernels(self, name, flags):
kname = "conv_full_load_everything"
k_var = "conv_full_load_everything_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
extern __shared__ float s_data[];
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len/nb_split
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch_split(const float* img, const float* kern, float* out,
extern "C" __global__ void
conv_full_patch_split(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
// Thread index
......@@ -60,18 +67,23 @@ conv_full_patch_split(const float* img, const float* kern, float* out,
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch( const float* img, const float* kern, float* out,
extern "C" __global__ void
conv_full_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
// Thread index
......@@ -114,6 +126,8 @@ conv_full_patch( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len
......@@ -122,8 +136,10 @@ conv_full_patch( const float* img, const float* kern, float* out,
//template c_contiguous: if true, the img and kern are column and row contiguous; else we use the stride values from the params. The image needs to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void
conv_full_patch_stack( const float* img, const float* kern, float* out,
__device__ inline void
conv_full_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -131,12 +147,15 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
const float __shared__ *kern_, *img_;
extern __shared__ float s_data[];
const int batch_id = blockIdx.x;
const int nkern_id = blockIdx.y;
......@@ -182,6 +201,36 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
/* C-linkage entry points for the conv_full_patch_stack template.  The
 * host code looks kernels up by name, so each template instantiation is
 * wrapped in an extern "C" __global__ function with a stable, unmangled
 * name.  The numeric suffix encodes the template arguments
 * <img_c_contiguous_2d, kern_c_contiguous_2d> as
 * suffix = img_c_contiguous_2d*2 + kern_c_contiguous_2d.
 * NOTE: no comments inside the macro body — a '//' before a line
 * continuation would swallow the backslash. */
#define __INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \
conv_full_patch_stack_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, \
int kern_len, int kern_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_full_patch_stack<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_FULL_PATCH_STACK(0, false, false) // img strided,    kern strided
__INSTANTIATE_CONV_FULL_PATCH_STACK(1, false, true) // img strided,    kern contiguous
__INSTANTIATE_CONV_FULL_PATCH_STACK(2, true, false) // img contiguous, kern strided
__INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true) // img contiguous, kern contiguous
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
 * I keep it separated from conv_patch as we take 19-20 registers, which is more than the 10/16 max for each thread, and thus this could lower the occupancy.
......@@ -200,24 +249,36 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
* template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
* if you set split and low_mem to true, we will use the low_mem version!
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void
conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__device__ inline void
conv_full_patch_stack_padded( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
int kern_stride_col, int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
__shared__ int batch_id, kern_id, img_wid_valid, nb_rows;
batch_id = blockIdx.x;
kern_id = blockIdx.y;
......@@ -257,7 +318,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -292,7 +353,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -340,7 +401,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
if(out_row<out_len)
......@@ -351,6 +412,46 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
}
}
extern "C" {
/* C-linkage entry points for the conv_full_patch_stack_padded template,
 * so each instantiation has a stable, unmangled name the host code can
 * reference.  The numeric suffix is a bit mask of the template
 * arguments <flipped_kern, c_contiguous, split, low_mem>:
 * suffix = flipped_kern*8 + c_contiguous*4 + split*2 + low_mem.
 * Suffixes 3, 7, 11 and 15 (split && low_mem) are intentionally not
 * instantiated: as documented on the template, the low_mem variant is
 * used when both are requested. */
#define __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \
conv_full_patch_stack_padded_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
const int img_stride_col, const int img_stride_row, \
const int img_stride_stack, const int img_stride_batch, \
const int kern_stride_col, const int kern_stride_row, \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_full_patch_stack_padded<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(0, false, false, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(1, false, false, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(2, false, false, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(4, false, true, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(5, false, true, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(6, false, true, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(8, true, false, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(9, true, false, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(10, true, false, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(12, true, true, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(13, true, true, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ;
......@@ -364,9 +465,10 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
{
return x[0] * y[0];
}
template<int NSTACK>
__global__ void
conv_full_load_everything( const float* img, const float* kern, float* out,
extern "C" __global__ void
conv_full_load_everything( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -375,12 +477,15 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
const int out_col = threadIdx.x;//output col
......@@ -423,9 +528,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
{
int icol = out_col - kern_wid+1+col;
if (icol < 0 || icol > img_wid) continue;
if (NSTACK > 0)
if (THEANO_KERN_WID > 0)
{
sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid,
sum += everything_dot<THEANO_KERN_WID>(d_img + irow*img_wid + icol, img_len*img_wid,
d_kern + row*kern_wid+col, kern_len*kern_wid);
}
else
......@@ -443,6 +548,8 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
__syncthreads(); //don't start loading another kernel until we're done here
}
}
/*
Local Variables:
mode:c++
......
......@@ -29,7 +29,6 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
*/
#ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU
#include <stdint.h>
/*
#define CHECK_BANK_CONFLICTS 0
......@@ -182,12 +181,11 @@ template<> __device__ float convolutionRowNoFlip<0>(const float *data,
return 0;
}
template<int KERN_WIDTH>
__device__ void convolutionRowNoFlip(float& sum,
const float *data,
const float *kern, const int kern_wid){
if(KERN_WIDTH>0)
sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern);
if(THEANO_KERN_WID>0)
sum+=convolutionRowNoFlip<THEANO_KERN_WID>(data,kern);
else
#pragma unroll 8
for (int col=0; col < kern_wid; col++) {//loop over col
......@@ -219,13 +217,20 @@ __device__ void store_or_accumulate(float& dst,const float value ){
* When true, allow for output image bigger then 512 pixel.
* Use more registers.
*/
template<bool flipped_kern, int KERN_WIDTH, bool split>
__global__ void
conv_patch( const float* img, const float* kern, float* out,
template<bool flipped_kern, bool split>
__device__ inline void
conv_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -260,7 +265,7 @@ conv_patch( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
blockIdx.y*out_wid*out_len+//the output image
......@@ -271,7 +276,7 @@ conv_patch( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
......@@ -280,6 +285,28 @@ conv_patch( const float* img, const float* kern, float* out,
}
}
extern "C" {
/* C-linkage entry points for the conv_patch template so each
 * instantiation has a stable, unmangled name for host-side lookup.
 * The numeric suffix encodes the <flipped_kern, split> template
 * arguments as suffix = flipped_kern*2 + split; only the
 * flipped-kernel variants (2 and 3) are instantiated here. */
#define __INSTANTIATE_CONV_PATCH(suffix, ...) \
__global__ void \
conv_patch_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack) \
{ \
conv_patch<__VA_ARGS__>(img, img_offset, kern, kern_offset, \
out, out_offset, img_len, img_wid, kern_len, \
kern_wid, nkern, nstack); \
}
__INSTANTIATE_CONV_PATCH(2, true, false) // flipped kernel, no split
__INSTANTIATE_CONV_PATCH(3, true, true) // flipped kernel, split
#undef __INSTANTIATE_CONV_PATCH
}
/**
* As conv_patch, but implement the stack in the kernel.
 * I keep it separated from conv_patch as we take more registers and this could lower the occupancy.
......@@ -295,16 +322,17 @@ conv_patch( const float* img, const float* kern, float* out,
* dy: patch stride cols(1 for normal convolution)
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
 * template img_c_contiguous_2d: if true, the img's columns and rows are contiguous
 * template kern_c_contiguous_2d: if true, the kernel's columns and rows are contiguous
* template split: if true, each thread generate more than 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
* template subsample: if false, remove some computation needed when dx or dy!=1.
*/
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void
conv_patch_stack( const float* img, const float* kern, float* out,
template<bool flipped_kern, bool accumulate, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__device__ inline void
conv_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
......@@ -313,6 +341,11 @@ conv_patch_stack( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{
int __shared__ nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
......@@ -365,7 +398,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
......@@ -425,7 +458,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
//as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len)
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
......@@ -440,6 +473,67 @@ conv_patch_stack( const float* img, const float* kern, float* out,
}
extern "C" {
/* Instantiate the conv_patch_stack device template as extern "C" __global__
 * entry points so the host side can look the kernels up by name.  All 32
 * instantiations fix the first two template arguments (flipped_kern=true,
 * accumulate=false); the suffix appears to be 64 plus the binary encoding of
 * the remaining five booleans (img_c_contiguous_2d, kern_c_contiguous_2d,
 * split, preload_full_kern, subsample) -- generated symbol names must not
 * change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_PATCH_STACK: identifiers
 * starting with a double underscore are reserved for the implementation. */
#define INSTANTIATE_CONV_PATCH_STACK(suffix, ...)                       \
__global__ void                                                         \
conv_patch_stack_##suffix(const float *img, const size_t img_offset,    \
                          const float *kern, const size_t kern_offset,  \
                          float *out, const size_t out_offset,          \
                          int img_len, int img_wid, int kern_len, int kern_wid, \
                          int out_len, int out_wid, int nkern, int nstack, \
                          int img_stride_col, int img_stride_row,       \
                          int img_stride_stack, int img_stride_batch,   \
                          int kern_stride_col, int kern_stride_row,     \
                          int kern_stride_stack, int kern_stride_nkern, \
                          int dx, int dy)                               \
{                                                                       \
  conv_patch_stack<__VA_ARGS__>(                                        \
      img, img_offset, kern, kern_offset, out, out_offset,              \
      img_len, img_wid, kern_len, kern_wid, out_len,                    \
      out_wid, nkern, nstack, img_stride_col, img_stride_row,           \
      img_stride_stack, img_stride_batch,                               \
      kern_stride_col, kern_stride_row,                                 \
      kern_stride_stack, kern_stride_nkern, dx, dy);                    \
}
INSTANTIATE_CONV_PATCH_STACK(64, true, false, false, false, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(65, true, false, false, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(66, true, false, false, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(67, true, false, false, false, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(68, true, false, false, false, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(69, true, false, false, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(70, true, false, false, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(71, true, false, false, false, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(72, true, false, false, true, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(73, true, false, false, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(74, true, false, false, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(75, true, false, false, true, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(76, true, false, false, true, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(77, true, false, false, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(78, true, false, false, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(79, true, false, false, true, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(80, true, false, true, false, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(81, true, false, true, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(82, true, false, true, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(83, true, false, true, false, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(84, true, false, true, false, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(85, true, false, true, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(86, true, false, true, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(87, true, false, true, false, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(88, true, false, true, true, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(89, true, false, true, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(90, true, false, true, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(91, true, false, true, true, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(92, true, false, true, true, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(93, true, false, true, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(94, true, false, true, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)
#undef INSTANTIATE_CONV_PATCH_STACK
}
/**
* As conv_patch_stack, but kern_len thread for each output pixel
 * I keep it separated as it uses more registers.
......@@ -454,9 +548,11 @@ conv_patch_stack( const float* img, const float* kern, float* out,
 * template img_contiguous: if true, the img's columns and rows are contiguous
* template preload_full_kern: work only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel row at a time
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void
conv_patch_stack_reduce( const float* img, const float* kern, float* out,
template<bool flipped_kern, bool c_contiguous, bool split, bool preload_full_kern>
__device__ inline void
conv_patch_stack_reduce( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch,
......@@ -470,6 +566,17 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
......@@ -521,7 +628,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0;
if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2;
}
}else if(split){
......@@ -531,7 +638,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
}else{
int row = tz;//The row of the kernel.
......@@ -540,7 +647,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
......@@ -559,6 +666,49 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
}
}
extern "C" {
/* Instantiate the conv_patch_stack_reduce device template as extern "C"
 * __global__ entry points so the host side can look the kernels up by name.
 * The suffix is the 4-bit encoding of the template booleans
 * <flipped_kern, c_contiguous, split, preload_full_kern>; the combinations
 * with split=false and preload_full_kern=false are intentionally left
 * disabled (commented out).  Generated symbol names must not change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_PATCH_STACK_REDUCE: identifiers
 * starting with a double underscore are reserved for the implementation.
 * The stray '#' that appeared in two of the commented-out lines was also
 * removed for consistency. */
#define INSTANTIATE_CONV_PATCH_STACK_REDUCE(suffix, ...)                \
__global__ void                                                         \
conv_patch_stack_reduce_##suffix(                                       \
    const float *img, const size_t img_offset,                          \
    const float *kern, const size_t kern_offset,                        \
    float *out, const size_t out_offset,                                \
    int img_len, int img_wid, int kern_len, int kern_wid,               \
    int nkern, int nstack, int img_stride_col, int img_stride_row,      \
    int img_stride_stack, int img_stride_batch,                         \
    int kern_stride_col, int kern_stride_row,                           \
    int kern_stride_stack, int kern_stride_nkern)                       \
{                                                                       \
  conv_patch_stack_reduce<__VA_ARGS__>(                                 \
      img, img_offset, kern, kern_offset, out, out_offset,              \
      img_len, img_wid, kern_len, kern_wid, nkern, nstack,              \
      img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
      kern_stride_col, kern_stride_row,                                 \
      kern_stride_stack, kern_stride_nkern);                            \
}
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(0, false, false, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(1, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(2, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(3, false, false, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(4, false, true, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(5, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(6, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(7, false, true, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(9, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(10, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(11, true, false, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(13, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(14, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true)
#undef INSTANTIATE_CONV_PATCH_STACK_REDUCE
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* we store kern_len row of the image and the full kernel in the shared memory
......@@ -570,9 +720,11 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows( const float* img, const float* kern, float* out,
template<bool c_contiguous>
__device__ inline void
conv_rows( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -582,6 +734,11 @@ conv_rows( const float* img, const float* kern, float* out,
{
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -612,7 +769,7 @@ conv_rows( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -620,6 +777,36 @@ conv_rows( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
/* Instantiate the conv_rows device template as extern "C" __global__ entry
 * points so the host side can look the kernels up by name.  The suffix is
 * the value of the single template boolean <c_contiguous> (0=false, 1=true);
 * generated symbol names must not change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_ROWS: identifiers starting with
 * a double underscore are reserved for the implementation. */
#define INSTANTIATE_CONV_ROWS(suffix, ...)                              \
__global__ void                                                         \
conv_rows_##suffix(const float *img, const size_t img_offset,           \
                   const float *kern, const size_t kern_offset,         \
                   float *out, const size_t out_offset,                 \
                   int img_len, int img_wid, int kern_len, int kern_wid, \
                   int nkern, int nstack,                               \
                   int img_stride_col, int img_stride_row,              \
                   int img_stride_stack, int img_stride_batch,          \
                   int kern_stride_col, int kern_stride_row,            \
                   int kern_stride_stack, int kern_stride_nkern)        \
{                                                                       \
  conv_rows<__VA_ARGS__>(                                               \
      img, img_offset, kern, kern_offset, out, out_offset,              \
      img_len, img_wid, kern_len, kern_wid,                             \
      nkern, nstack, img_stride_col, img_stride_row,                    \
      img_stride_stack, img_stride_batch,                               \
      kern_stride_col, kern_stride_row,                                 \
      kern_stride_stack, kern_stride_nkern);                            \
}
INSTANTIATE_CONV_ROWS(0, false)
INSTANTIATE_CONV_ROWS(1, true)
#undef INSTANTIATE_CONV_ROWS
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows, but implement the stack. Separate as this use more register.
......@@ -631,9 +818,11 @@ conv_rows( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows_stack( const float* img, const float* kern, float* out,
template<bool c_contiguous>
__device__ inline void
conv_rows_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
......@@ -643,6 +832,11 @@ conv_rows_stack( const float* img, const float* kern, float* out,
{
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -708,7 +902,7 @@ conv_rows_stack( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads();//to be sure all thread have finished before we modif the shared memory.
}
......@@ -718,6 +912,38 @@ conv_rows_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
/* Instantiate the conv_rows_stack device template as extern "C" __global__
 * entry points so the host side can look the kernels up by name.  The suffix
 * is the value of the single template boolean <c_contiguous> (0=false,
 * 1=true); generated symbol names must not change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_ROWS_STACK: identifiers starting
 * with a double underscore are reserved for the implementation. */
#define INSTANTIATE_CONV_ROWS_STACK(suffix, ...)                        \
__global__ void                                                         \
conv_rows_stack_##suffix(                                               \
    const float *img, const size_t img_offset,                          \
    const float *kern, const size_t kern_offset,                        \
    float *out, const size_t out_offset,                                \
    const int img_len, const int img_wid,                               \
    const int kern_len, const int kern_wid,                             \
    const int nkern, const int nstack,                                  \
    const int img_stride_col, const int img_stride_row,                 \
    const int img_stride_stack, const int img_stride_batch,             \
    const int kern_stride_col, const int kern_stride_row,               \
    const int kern_stride_stack, const int kern_stride_nkern)           \
{                                                                       \
  conv_rows_stack<__VA_ARGS__>(                                         \
      img, img_offset, kern, kern_offset, out, out_offset,              \
      img_len, img_wid, kern_len, kern_wid,                             \
      nkern, nstack, img_stride_col, img_stride_row,                    \
      img_stride_stack, img_stride_batch,                               \
      kern_stride_col, kern_stride_row,                                 \
      kern_stride_stack, kern_stride_nkern);                            \
}
INSTANTIATE_CONV_ROWS_STACK(0, false)
INSTANTIATE_CONV_ROWS_STACK(1, true)
#undef INSTANTIATE_CONV_ROWS_STACK
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row.
......@@ -729,9 +955,11 @@ conv_rows_stack( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image and kernel in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
__global__ void
conv_rows_stack2(const float* img, const float* kern, float* out,
template<bool c_contiguous, bool preload_full_kern>
__device__ inline void
conv_rows_stack2(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
......@@ -741,6 +969,11 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
{
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -804,7 +1037,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass by an intermediate variable to have more precission.
}
}
......@@ -816,6 +1049,39 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
/* Instantiate the conv_rows_stack2 device template as extern "C" __global__
 * entry points so the host side can look the kernels up by name.  The suffix
 * is the 2-bit encoding of the template booleans
 * <c_contiguous, preload_full_kern>; generated symbol names must not change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_ROWS_STACK2: identifiers
 * starting with a double underscore are reserved for the implementation. */
#define INSTANTIATE_CONV_ROWS_STACK2(suffix, ...)                       \
__global__ void                                                         \
conv_rows_stack2_##suffix(                                              \
    const float *img, const size_t img_offset,                          \
    const float *kern, const size_t kern_offset,                        \
    float *out, const size_t out_offset,                                \
    const int img_len, const int img_wid,                               \
    const int kern_len, const int kern_wid,                             \
    const int nkern, const int nstack,                                  \
    const int img_stride_col, const int img_stride_row,                 \
    const int img_stride_stack, const int img_stride_batch,             \
    const int kern_stride_col, const int kern_stride_row,               \
    const int kern_stride_stack, const int kern_stride_nkern)           \
{                                                                       \
  conv_rows_stack2<__VA_ARGS__>(                                        \
      img, img_offset, kern, kern_offset, out, out_offset,              \
      img_len, img_wid, kern_len, kern_wid, nkern, nstack,              \
      img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
      kern_stride_col, kern_stride_row,                                 \
      kern_stride_stack, kern_stride_nkern);                            \
}
INSTANTIATE_CONV_ROWS_STACK2(0, false, false)
INSTANTIATE_CONV_ROWS_STACK2(1, false, true)
INSTANTIATE_CONV_ROWS_STACK2(2, true, false)
INSTANTIATE_CONV_ROWS_STACK2(3, true, true)
#undef INSTANTIATE_CONV_ROWS_STACK2
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
......@@ -826,18 +1092,23 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
* TODO: explain parameters, preconditions
*/
template<bool stack_loop>
__global__ void
__device__ inline void
conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
const float *img, const size_t img_offset, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols,
const int initial_reduce_boundary)
{
const int outsize = nB * nK * out_len * out_wid;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
extern __shared__ float reducebuf[];
for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
{
......@@ -911,6 +1182,36 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
}
}
extern "C" {
/* Instantiate the conv_valid_row_reduce device template as extern "C"
 * __global__ entry points so the host side can look the kernels up by name.
 * The suffix is the value of the single template boolean <stack_loop>
 * (0=false, 1=true); generated symbol names must not change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_VALID_ROW_REDUCE: identifiers
 * starting with a double underscore are reserved for the implementation. */
#define INSTANTIATE_CONV_VALID_ROW_REDUCE(suffix, ...)                  \
__global__ void                                                         \
conv_valid_row_reduce_##suffix(                                         \
    int nB, int nK, int stacklen, int img_len, int img_wid,             \
    int kern_len, int kern_wid, int out_len, int out_wid,               \
    const float *img, const size_t img_offset,                          \
    int img_str_B, int img_str_S, int img_str_R, int img_str_C,         \
    const float *kern, const size_t kern_offset,                        \
    int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,     \
    float *out, const size_t out_offset,                                \
    int out_str_B, int out_str_K, int out_str_R, int out_str_C,         \
    int subsample_rows, int subsample_cols,                             \
    const int initial_reduce_boundary)                                  \
{                                                                       \
  conv_valid_row_reduce<__VA_ARGS__>(                                   \
      nB, nK, stacklen, img_len, img_wid,                               \
      kern_len, kern_wid, out_len, out_wid,                             \
      img, img_offset, img_str_B, img_str_S, img_str_R, img_str_C,      \
      kern, kern_offset, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
      out, out_offset, out_str_B, out_str_K, out_str_R, out_str_C,      \
      subsample_rows, subsample_cols, initial_reduce_boundary);         \
}
INSTANTIATE_CONV_VALID_ROW_REDUCE(0, false)
INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
#undef INSTANTIATE_CONV_VALID_ROW_REDUCE
}
/**
......@@ -920,18 +1221,26 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
*
* TODO: explain parameters, preconditions
*/
__global__ void
extern "C" __global__ void
conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
const float *img, const size_t img_offset,
int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ int numThreads, outsize;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x;
outsize = nB * nK * out_len * out_wid;
......@@ -972,6 +1281,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
}
}
/**
* Reference implementation of 'full' mode convolution (with stack)
*
......@@ -979,18 +1290,26 @@ conv_reference_valid(int nB, int nK, int stacklen,
*
* TODO: explain parameters, preconditions
*/
__global__ void
extern "C" __global__ void
conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C,
const float *img, const size_t img_offset,
int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ int numThreads, physical_outsize;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x;
physical_outsize = nB * nK * out_len * out_wid;
......
from __future__ import print_function
import copy
import os
from theano.compat import izip
import numpy
......@@ -8,6 +9,7 @@ from theano import Apply, scalar, config
from theano import scalar as scal
from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined
from theano.gof.cmodule import GCC_compiler
from theano.scalar import Scalar
from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
......@@ -23,7 +25,6 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase, Kernel)
from .comp import NVCC_compiler
from .type import GpuArrayType
from .fp16_help import load_w, write_w
......@@ -57,7 +58,7 @@ def as_C_string_const(s):
for l in s.split('\n'))
class GpuElemwise(HideC, Elemwise):
class GpuElemwise(GpuKernelBase, HideC, Elemwise):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
_f16_ok = True
......@@ -150,39 +151,7 @@ class GpuElemwise(HideC, Elemwise):
code.append('}')
kop = '\n'.join(code)
# Translate types for scalar composite ops (except complex).
# NB: OpenCL implicitly has 'stdint' defs at the kernel
# compilation stage
support_code = "" if pygpu.get_default_context().kind == 'opencl' else """
#ifdef _MSC_VER
#define signed __int8 int8_t
#define unsigned __int8 uint8_t
#define signed __int16 int16_t
#define unsigned __int16 uint16_t
#define signed __int32 int32_t
#define unsigned __int32 uint32_t
#define signed __int64 int64_t
#define unsigned __int64 uint64_t
#else
#include <stdint.h>
#endif
"""
# Translate ga_ pseudo-types into their specific realizations
support_code += """
#define ga_bool uint8_t
#define ga_byte int8_t
#define ga_ubyte uint8_t
#define ga_short int16_t
#define ga_ushort uint16_t
#define ga_int int32_t
#define ga_uint uint32_t
#define ga_long int64_t
#define ga_ulong uint64_t
#define ga_float float
#define ga_double double
#define ga_half uint16_t
"""
support_code = ""
try:
# We accept only some c_support_code().
# This filter is done in the make_node()
......@@ -204,60 +173,64 @@ class GpuElemwise(HideC, Elemwise):
kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_headers(self):
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_compiler(self):
return GCC_compiler
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return NVCC_compiler
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_support_code(self):
    # GpuElemwise applies a scalar op elementwise, so any C support code the
    # scalar op requires (helper functions, constants, ...) must be emitted
    # for this op as well; delegate directly to the scalar op.
    return self.scalar_op.c_support_code()
def c_support_code_apply(self, node, nodename):
def _gpu_kernel_code(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
# This is useless by itself, but will serve an eventual c_code
# implementation
k = self.generate_kernel(node, nodename)
nd = node.inputs[0].type.ndim
CLUDA_PREAMBLE = """
#define local_barrier() __syncthreads();
#define WITHIN_KERNEL __device__
#define KERNEL extern "C" __global__
#define GLOBAL_MEM /* empty */
#define LOCAL_MEM __shared__
#define LOCAL_MEM_ARG /* empty */
#define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y*Z, 1)
#define LID_0 threadIdx.x
#define LID_1 threadIdx.y
#define LID_2 threadIdx.z
#define GID_0 blockIdx.x
#define GID_1 blockIdx.y
#define GID_2 blockIdx.z
#define LDIM_0 blockDim.x
#define LDIM_1 blockDim.y
#define LDIM_2 blockDim.z
#define GDIM_0 gridDim.x
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
"""
res = [CLUDA_PREAMBLE]
res = []
for i in range(0, nd + 1):
res.append(k.render_basic(i, name="elem_" + str(i)) + ';')
res.append(k.contig_src + ';')
return '\n'.join(res)
def gpu_kernels(self, node, nodename):
    # Build the Kernel object describing the generated elementwise CUDA
    # kernel for this node, so GpuKernelBase can compile/register it via
    # libgpuarray.  Returns a one-element list.
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    # Full kernel source (all arities 0..nd plus the contiguous variant).
    src = self._gpu_kernel_code(node, nodename)
    nd = node.outputs[0].ndim
    # Parameter signature must mirror the argument order used at call time
    # in c_code: numEls, then one dimension per axis...
    params = ['uintp']
    params.extend('uintp' for _ in range(nd))
    num_inputs = len(node.inputs)
    num_outputs = len(node.outputs)
    # ...then, for each non-inplace input/output: the GpuArray data pointer,
    # its offset, and one stride per axis.  Outputs that alias an input via
    # inplace_pattern are skipped (they reuse the input's buffer).
    for n in range(num_inputs + num_outputs):
        if (n - len(node.inputs)) in self.inplace_pattern:
            continue
        params.extend([gpuarray.GpuArray, 'uintp'])
        params.extend('intp' for _ in range(nd))
    # acc_dtype is optional on the op; fall back to the output dtype when
    # it is absent or unset.
    acc_dtype = getattr(self, 'acc_dtype', None)
    if acc_dtype is None:
        acc_dtype = node.outputs[0].type.dtype
    # objvar must match the kernel variable name referenced from c_code
    # ('elem_<nd>_<nodename>').
    return [Kernel(code=src, name="elem_%d" % nd, params=params,
                   flags=Kernel.get_flags(node.inputs[0].type.dtype,
                                          acc_dtype,
                                          node.outputs[0].type.dtype),
                   objvar='elem_%d_%s' % (nd, nodename))]
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
......@@ -273,11 +246,15 @@ class GpuElemwise(HideC, Elemwise):
# check that all inputs have valid dimensions
emitted_inames = {}
num_kernel_params = 1 + nd + len(inputs + outputs) * (2 + nd)
code = """
int n_blocks = 0;
int threads_per_block = 0;
size_t n_blocks = 0;
size_t threads_per_block = 0;
size_t numEls = 0;
"""
const ssize_t zero = 0;
void *kernel_params[%(num_kernel_params)d] = {0};
int err;
""" % locals()
if nd > 0:
code += """
size_t dims[%(nd)s] = {%(initial_dims)s};
......@@ -416,23 +393,41 @@ class GpuElemwise(HideC, Elemwise):
//std::cerr << "calling callkernel returned\\n";
""" % locals()
code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals()
param = []
kname = 'elem_%d_%s' % (nd, name)
param = ["(void *)&numEls"]
for i in range(nd):
param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern:
continue
dtype = dtype_to_ctype(var.dtype)
param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
param.append("%(name)s->ga.offset" % locals())
param.append("(void *)%(name)s->ga.data" % locals())
param.append("(void *)&%(name)s->ga.offset" % locals())
for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
code += ',\n'.join(param) + ");\n"
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? (void *)&zero: (void *)&PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
for n, p in enumerate(param):
code += "kernel_params[%(n)d] = %(p)s;\n" % locals()
code += """
err = GpuKernel_call(&%(kname)s, 1, &threads_per_block, &n_blocks, 0, kernel_params);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % dict(kname=kname,fail=fail)
if config.gpuarray.sync:
code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z)
code += """
err = GpuArray_sync(&%(z)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % locals()
return str(code)
def perform(self, node, inputs, output_storage):
......@@ -573,7 +568,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return (4,)
class GpuCAReduceCuda(HideC, CAReduceDtype):
class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"""
GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
......@@ -737,12 +732,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
return False
return True
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
......@@ -840,7 +837,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
# \begin bracket the reduction in a check that there is
# actually work to do
if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals()
zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
# TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else:
scalar_op = self.scalar_op
......@@ -891,28 +888,24 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
.. code-block:: c
ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
if (verbose)
printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s)
);
[
if config.gpuarray.sync:
code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z)
]
if (cudaSuccess != cudaGetLastError())
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: ... );
%(fail)s;
}
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0,
(void *)&stride_A1,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
"""
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
......@@ -923,64 +916,66 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
ndim = len(self.reduce_mask)
nd_out = ndim - sum(self.reduce_mask)
shapes_format = "shape=(%s)" % ",".join(["%llu"] * node.inputs[0].ndim)
shapes_data = ",".join(["(unsigned long long) PyGpuArray_DIMS(%s)[%d]" % (x, i)
shapes_data = ",".join(["(size_t) PyGpuArray_DIMS(%s)[%d]" % (x, i)
for i in range(node.inputs[0].ndim)])
k_var = "kernel_reduce_%(pattern)s_%(name)s" % locals()
params = []
print("""
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
" nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d,"
" nb_block=%%d, n_shared=%%d, %(shapes_format)s\\n",
n_threads.x,n_threads.y,n_threads.z,
n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared, %(shapes_data)s);
kernel_reduce_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" % locals(), file=sio)
for i in xrange(ndim):
print("""
PyGpuArray_DIMS(%(x)s)[%(i)s],
""" % locals(), file=sio)
print("""
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset)
""" % locals(), file=sio)
params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals())
params.append("(void *)%(x)s->ga.data" % locals())
params.append("(void *)&%(x)s->ga.offset" % locals())
for i in xrange(ndim):
print("""
,PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s)
""" % locals(), file=sio)
print("""
,(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset)
ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s);
""" % locals(), file=sio)
params.append("(void *)&stride_A%(i)d" % locals())
params.append("(void *)%(z)s->ga.data" % locals())
params.append("(void *)&%(z)s->ga.offset" % locals())
for i in xrange(nd_out):
print("""
,PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s)
ssize_t stride_Z%(i)d = PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s);
""" % locals(), file=sio)
params.append("(void *)&stride_Z%(i)d" % locals())
kernel_params = ', '.join(params)
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
print("""
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
void *kernel_params[] = { %(kernel_params)s };
if (verbose>1)
printf("n_threads[0]=%%lu, n_threads[1]=%%lu, "
"n_threads[2]=%%lu, n_threads=%%lu, "
"n_blocks[0]=%%lu, n_blocks[1]=%%lu, n_blocks[2]=%%lu, "
"n_blocks=%%lu, n_shared=%%d, %(shapes_format)s\\n",
n_threads[0],n_threads[1],
n_threads[2],
n_threads[0]*n_threads[1]*
n_threads[2],
n_blocks[0],n_blocks[1],n_blocks[2],
n_blocks[0]*n_blocks[1]*n_blocks[2],
n_shared, %(shapes_data)s);
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
""" % locals(), file=sio)
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
);
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)"
" %(shapes_format)s \\n",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
%(shapes_data)s);
%(fail)s;
}
""" % locals(), file=sio)
return sio.getvalue()
......@@ -993,66 +988,86 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
.. code-block:: c
static __global__ void kernel_reduce_110_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A,
const int sA0,
const int sA1,
const int sA2,
%(out_dtype)s * Z,
const int sZ0)
KERNEL void kernel_reduce_110_%(nodename)s(
const ga_size d0,
const ga_size d1,
const ga_size d2,
const %(in_type)s *A,
const ga_size offset_A,
const ga_ssize sA0,
const ga_ssize sA1,
const ga_ssize sA2,
%(out_type)s * Z,
const ga_size offset_Z,
const ga_ssize sZ0)
Since the nodename is unique, we don't need to put the name
of the scalar_op in here.
"""
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
if reduce_mask is None:
reduce_mask = self.reduce_mask
if ndim is None:
ndim = len(reduce_mask)
if pattern is None:
pattern = ''.join(str(i) for i in reduce_mask)
kname = "kernel_reduce_%(pattern)s" % locals()
k_var = "kernel_reduce_%(pattern)s_%(nodename)s" % locals()
params = []
sio = StringIO()
print("""
static __global__ void kernel_reduce_%(pattern)s_%(nodename)s(
KERNEL void %(kname)s(
""" % locals(), file=sio)
for i in xrange(ndim):
params.append('uintp')
print("""
const int d%(i)s,
const ga_size d%(i)s,
""" % locals(), file=sio)
params.append(gpuarray.GpuArray)
params.append('uintp')
print("""
const %(in_dtype)s *A,
const %(in_type)s *A, const ga_size offset_A,
""" % locals(), file=sio)
for i in xrange(ndim):
params.append('intp')
print("""
const int sA%(i)s,
const ga_ssize sA%(i)s,
""" % locals(), file=sio)
params.append(gpuarray.GpuArray)
params.append('uintp')
print("""
%(out_dtype)s * Z
%(out_type)s * Z, const ga_size offset_Z
""" % locals(), file=sio)
for i in xrange(ndim - sum(reduce_mask)):
params.append('intp')
print("""
, const int sZ%(i)s
, const ga_ssize sZ%(i)s
""" % locals(), file=sio)
print(")", file=sio)
return sio.getvalue()
return sio.getvalue(), kname, params, k_var
def _k_init(self, node, nodename):
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype)
# We need to use theano_complex* and not npy_complex*
acc_dtype = theano.scalar.basic.Scalar(acc_dtype).dtype_specs()[1]
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
return """
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = 0;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = 0;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
......@@ -1315,7 +1330,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals()
zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
# TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else:
zero_shp = """
......@@ -1325,44 +1340,43 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
""" % locals()
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
k_var = "kernel_reduce_ccontig_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
if(PyGpuArray_SIZE(%(x)s)==0){
%(zero_shp)s;
}else{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_SIZE(%(x)s),
(size_t) 256));
dim3 n_blocks(1);
size_t numEls = PyGpuArray_SIZE(%(x)s);
size_t n_threads = std::min(numEls, (size_t) 256);
size_t n_blocks = 1;
void *kernel_params[] = {(void *)&numEls,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset};
if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
" n_threads.x=%%d, size=%%d, ndim=%%d\\n",
n_threads.x,PyGpuArray_SIZE(%(x)s),
" n_threads=%%lu, size=%%lu, ndim=%%d\\n",
n_threads,numEls,
PyGpuArray_NDIM(%(x)s));
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x;
kernel_reduce_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
PyGpuArray_SIZE(%(x)s),
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset));
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
%(err_check)s
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_ccontig_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
}
""" % locals(), file=sio)
......@@ -1372,10 +1386,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 256));
dim3 n_blocks(1);
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1385,15 +1397,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 256));
while (n_threads.y * n_threads.x <= 256) ++n_threads.y;
n_threads.y -= 1;
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0])
n_threads.y = PyGpuArray_DIMS(%(x)s)[0];
dim3 n_blocks(1);
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
while (n_threads[1] * n_threads[0] <= 256) ++n_threads[1];
n_threads[1] -= 1;
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
n_threads[1] = PyGpuArray_DIMS(%(x)s)[0];
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1421,25 +1432,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
threads_y = """
//get as many y threads as we can fit
while (n_threads.x * (n_threads.y+1) <= 256)
while (n_threads[0] * (n_threads[1]+1) <= 256)
{
if (n_threads.y < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
n_threads.y += 1;
if (n_threads[1] < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
n_threads[1] += 1;
else
break;
}""" % locals()
threads_z = """
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256)
while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
{
if (n_threads.z < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
n_threads.z += 1;
if (n_threads[2] < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
n_threads[2] += 1;
else
break;
}
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
n_threads[2] = std::min(n_threads[2], (size_t)64);
""" % locals()
if len(self.reduce_mask) == 2:
......@@ -1452,13 +1463,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[%(N)s],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[%(N)s], (size_t) 256), 1, 1};
%(threads_y)s
%(threads_z)s
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1476,9 +1484,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
k_var = "kernel_reduce_10_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
int verbose = 0;
......@@ -1491,95 +1511,71 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
// we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1);
size_t n_threads[3] = {32, 1, 1};
// We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32
// where E==32
int A = 1;
int B = PyGpuArray_DIMS(%(x)s)[0];
int C = PyGpuArray_DIMS(%(x)s)[1];
int D = C/32;
GpuKernel *%(k_var)s = &kernel_reduce_010_AD_%(name)s;
size_t A = 1;
size_t B = PyGpuArray_DIMS(%(x)s)[0];
size_t C = PyGpuArray_DIMS(%(x)s)[1];
size_t D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
if (n_blocks.x > 4096) n_blocks.x = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads>>>(
A,B,C,D,
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
1,
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
1,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s)
);
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_10_AD%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
size_t n_blocks[3] = {A, D, 1};
if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
ssize_t stride_A0 = 1;
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = 1;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&A, (void *)&B, (void *)&C, (void *)&D,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}else{
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 256));
dim3 n_blocks(1,
std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 4096));
GpuKernel *%(k_var)s = &kernel_reduce_010_%(name)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 4096), 1};
if (verbose) {
fprintf(stderr,
"running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x,
n_blocks.y);
n_blocks[0],
n_blocks[1]);
}
assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x;
kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1,
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
1,
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
1,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s)
);
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0];
size_t dim_0 = 1;
ssize_t stride_A0 = 1;
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = 1;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&dim_0,
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
}
""" % locals(), file=sio)
......@@ -1591,9 +1587,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
pattern = ''.join(str(i) for i in self.reduce_mask)
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
k_var = "kernel_reduce_010_AD_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
//int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
......@@ -1608,108 +1616,82 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
// we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1);
size_t n_threads[3] = {32, 1, 1};
// We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32
// where E==32
int A = PyGpuArray_DIMS(%(x)s)[0];
int B = PyGpuArray_DIMS(%(x)s)[1];
int C = PyGpuArray_DIMS(%(x)s)[2];
int D = C/32;
size_t A = PyGpuArray_DIMS(%(x)s)[0];
size_t B = PyGpuArray_DIMS(%(x)s)[1];
size_t C = PyGpuArray_DIMS(%(x)s)[2];
size_t D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
if (n_blocks.x > 4096) n_blocks.x = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x;
int n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s),
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s)
);
size_t n_blocks[3] = {A, D, 1};
if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&A, (void *)&B, (void *)&C, (void *)&D,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
else
{
int verbose = 2;
dim3 n_threads(std::min((size_t) 32,
PyGpuArray_DIMS(%(x)s)[2]));
while( (n_threads.x*(n_threads.y+1)<=256)
&& (n_threads.y<PyGpuArray_DIMS(%(x)s)[1])){
n_threads.y++;
size_t n_threads[3] = {std::min((size_t) 32, PyGpuArray_DIMS(%(x)s)[2]), 1, 1};
while( (n_threads[0]*(n_threads[1]+1)<=256)
&& (n_threads[1]<PyGpuArray_DIMS(%(x)s)[1])){
n_threads[1]++;
}
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t)4096));
n_blocks.y = std::min(
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096), 1, 1};
n_blocks[1] = std::min(
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x),
(size_t)(4096 / n_blocks.x)
(size_t)n_threads[0]),
(size_t)(4096 / n_blocks[0])
);
if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s)),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s))
==PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s)
&& n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x)){
&& n_blocks[1]==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads[0])){
if(verbose>1)
printf("n_block.x.1=%%d, n_block.x.2=%%d, n_block.y.1=%%d, n_block.y.2=%%d,\\n",
PyGpuArray_DIMS(%(x)s)[0],4096,
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads.x),
(size_t)(4096 / n_blocks.x));
assert(n_threads.x<=32);
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads[0]),
(size_t)(4096 / n_blocks[0]));
assert(n_threads[0]<=32);
%(makecall_inner)s
}else{
n_threads.x = std::min(PyGpuArray_DIMS(%(x)s)[1],
n_threads[0] = std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 256);
n_blocks.x = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
n_blocks.y = std::min(
n_blocks[0] = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
n_blocks[1] = std::min(
PyGpuArray_DIMS(%(x)s)[2],
(size_t)(4096 / n_blocks.x)
(size_t)(4096 / n_blocks[0])
);
%(makecall)s
}
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
}
""" % locals(), file=sio)
......@@ -1719,16 +1701,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[3],
(size_t) 256));
while (n_threads.x * n_threads.y <= 256)
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads[0] * n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1]) break;
n_threads.y += 1;
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1]) break;
n_threads[1] += 1;
}
n_threads.y -= 1;
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2]);
n_threads[1] -= 1;
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2], 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1738,7 +1718,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
sync = bool(config.gpuarray.sync)
k_var = "kernel_reduce_010_AD_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
# use threadIdx.x for i0
# use blockIdx.x for i1
# use blockIdx.y for i2
......@@ -1747,15 +1741,12 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
int verbose = 0;
if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){
printf("slow\\n");
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 256));
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t)4096));
while (n_blocks.x * (n_blocks.y+1) <= 4096 &&
n_blocks.y <= PyGpuArray_DIMS(%(x)s)[2])
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096), 1, 1};
while (n_blocks[0] * (n_blocks[1]+1) <= 4096 &&
n_blocks[1] <= PyGpuArray_DIMS(%(x)s)[2])
{
n_blocks.y += 1;
n_blocks[1] += 1;
}
%(makecall)s
}
......@@ -1763,50 +1754,38 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ // reuse 010_AD kernel, we transpose the 2 first dim
// See the reduction for the real 010_AD kernel for
// explanation. We do this to get coalesced read.
dim3 n_threads(32,1,1);
size_t n_threads[3] = {32, 1, 1};
int A = PyGpuArray_DIMS(%(x)s)[1];
int B = PyGpuArray_DIMS(%(x)s)[0];
int C = PyGpuArray_DIMS(%(x)s)[2];
int D = C/32;
size_t A = PyGpuArray_DIMS(%(x)s)[1];
size_t B = PyGpuArray_DIMS(%(x)s)[0];
size_t C = PyGpuArray_DIMS(%(x)s)[2];
size_t D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
if (n_blocks.x > 4096) n_blocks.x = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x;
int n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s),
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s)
);
if (%(sync)d)
GpuArray_sync(&%(z)s->ga);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
size_t n_blocks[3] = {A, D, 1};
if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
size_t n_shared = 0;
ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&A, (void *)&B, (void *)&C, (void *)&D,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}
}
""" % locals(), file=sio)
......@@ -1815,18 +1794,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 256));
while (n_threads.x*n_threads.y <= 256)
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
while (n_threads[0]*n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0])
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
break;
n_threads.y += 1;
n_threads[1] += 1;
}
n_threads.y -= 1;
n_threads[1] -= 1;
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[2]);
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[2], 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1836,19 +1813,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
dim3 n_blocks(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
while (n_blocks.x * n_blocks.y <= 4096)
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
while (n_blocks[0] * n_blocks[1] <= 4096)
{
if (n_blocks.y > PyGpuArray_DIMS(%(x)s)[1])
if (n_blocks[1] > PyGpuArray_DIMS(%(x)s)[1])
break;
n_blocks.y += 1;
n_blocks[1] += 1;
}
n_blocks.y -= 1;
n_blocks[1] -= 1;
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1858,31 +1831,29 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
//get as many y threads as we can fit
while (n_threads.x * n_threads.y <= 256)
while (n_threads[0] * n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1])
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
break;
n_threads.y += 1;
n_threads[1] += 1;
}
n_threads.y -= 1;
n_threads[1] -= 1;
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= 256)
while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
{
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
break;
n_threads.z += 1;
n_threads[2] += 1;
}
n_threads.z -= 1;
n_threads[2] -= 1;
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
n_threads[2] = std::min(n_threads[2], (size_t)64);
dim3 n_blocks(1,1,1);
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1896,24 +1867,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
int verbose = 0;
dim3 n_blocks(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
while (n_blocks.x * n_blocks.y <= 4096 &&
n_blocks.y < PyGpuArray_DIMS(%(x)s)[1])
while (n_blocks[0] * n_blocks[1] <= 4096 &&
n_blocks[1] < PyGpuArray_DIMS(%(x)s)[1])
{
n_blocks.y += 1;
n_blocks[1] += 1;
}
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[3],
(size_t) 256));
while (n_threads.x * n_threads.y <= 256
&& n_threads.y < PyGpuArray_DIMS(%(x)s)[2]
&& n_threads.x * n_threads.y * sizeof(%(acc_dtype)s) <=(15*1024-200))
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads[0] * n_threads[1] <= 256
&& n_threads[1] < PyGpuArray_DIMS(%(x)s)[2]
&& n_threads[0] * n_threads[1] * sizeof(%(acc_dtype)s) <=(15*1024-200))
{
n_threads.y += 1;
n_threads[1] += 1;
}
%(makecall)s
......@@ -1925,32 +1892,30 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
//get as many y threads as we can fit
while (n_threads.x * n_threads.y <= 256)
while (n_threads[0] * n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1])
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
break;
n_threads.y += 1;
n_threads[1] += 1;
}
n_threads.y -= 1;
n_threads[1] -= 1;
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= 256)
while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
{
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
break;
n_threads.z += 1;
n_threads[2] += 1;
}
n_threads.z -= 1;
n_threads[2] -= 1;
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
n_threads[2] = std::min(n_threads[2], (size_t)64);
dim3 n_blocks(1,1,1);
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1960,27 +1925,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[3],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads.x * (n_threads.y+1) <= 256) ++n_threads.y;
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[2])
n_threads.y = PyGpuArray_DIMS(%(x)s)[2];
while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];
while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256) ++n_threads.z;
if (n_threads.z > 64)
n_threads.z = 64;
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
n_threads.z = PyGpuArray_DIMS(%(x)s)[0];
while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256) ++n_threads[2];
if (n_threads[2] > 64)
n_threads[2] = 64;
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[1]);
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [15] # the version corresponding to the c code in this Op
version = [16] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
......@@ -1994,14 +1957,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
else:
return ()
def c_support_code_apply(self, node, nodename):
sio = StringIO()
def gpu_kernels(self, node, nodename):
nd_in = len(self.reduce_mask)
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
load_in = load_w(node.inputs[0].dtype)
write_out = write_w(node.outputs[0].dtype)
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype)
flags=Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
load_in = load_w(in_dtype)
write_out = write_w(out_dtype)
kernels = []
if all(i == 1 for i in self.reduce_mask):
# this kernel is ok for up to a few thousand elements, but
......@@ -2011,16 +1978,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_ccontig"
k_var = "kernel_reduce_ccontig_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_ccontig_%(nodename)s(
const unsigned int d0,
const %(in_dtype)s *A,
%(out_dtype)s * Z)
KERNEL void %(kname)s(
const ga_size d0,
const %(in_type)s *A, const ga_size offset_A,
%(out_type)s *Z, const ga_size offset_Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2034,6 +2006,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp',
gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1,):
# this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
......@@ -2042,16 +2021,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_1"
k_var = "kernel_reduce_1_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_1_%(nodename)s(
const unsigned int d0,
const %(in_dtype)s *A, const int sA0,
%(out_dtype)s * Z)
KERNEL void %(kname)s(
const ga_size d0,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0,
%(out_type)s * Z, const ga_size offset_Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2065,6 +2050,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp',
gpuarray.GpuArray, 'uintp',
'intp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1):
# this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
......@@ -2073,17 +2066,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_11"
k_var = "kernel_reduce_11_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_11_%(nodename)s(
const int d0,
const int d1,
const %(in_dtype)s *A, const int sA0, const int sA1,
%(out_dtype)s * Z)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1,
%(out_type)s * Z, const ga_size offset_Z)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2100,6 +2098,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
#01, 011, 0111
if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and
......@@ -2144,17 +2150,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]', node,
nodename, sub={})
param_dim = ",".join(["const int d%d" % i
param_dim = ",".join(["const ga_size d%d" % i
for i in xrange(nd_in)])
param_strides = ",".join(["const int sA%d" % i
param_strides = ",".join(["const ga_ssize sA%d" % i
for i in xrange(nd_in)])
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals())
reduce_fct = self._assign_reduce(
node, nodename, "myresult",
load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
{}, True)
sio = StringIO()
print("""
%(decl)s{
%(init)s
......@@ -2171,6 +2178,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
# this kernel uses one block for each column,
# threads per block for each element per column.
......@@ -2184,18 +2193,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])")
kname = "kernel_reduce_010"
k_var = "kernel_reduce_010_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_010_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A, const int sA0,
const int sA1, const int sA2,
%(out_dtype)s * Z, const int sZ0, const int sZ1)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0, const ga_ssize sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
extern __shared__ %(acc_type)s buf[];
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2207,7 +2220,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
%(reduce_fct)s;
......@@ -2218,25 +2231,36 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask in [(0, 1, 0), (1, 0), (1, 0, 0)]:
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(X[a * sX0 + b * sX1 + c * sX2])",
{}, True)
reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])")
kname = "kernel_reduce_010_AD"
k_var = "kernel_reduce_010_AD_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_010_AD_%(nodename)s(
const int A,
const int B,
const int C,
const int D,
//const int E, // THIS is 32
const %(in_dtype)s *X, const int sX0,
const int sX1, const int sX2,
%(out_dtype)s * Z, const int sZ0, const int sZ1)
KERNEL void %(kname)s(
const ga_size A, const ga_size B, const ga_size C, const ga_size D,
const %(in_type)s *X, const ga_size offset_X,
const ga_ssize sX0, const ga_ssize sX1, const ga_ssize sX2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0, const ga_ssize sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
%(acc_dtype)s myresult = 0;
%(acc_type)s myresult = 0;
X = (const %(in_type)s *)(((char *)X)+offset_X);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2262,6 +2286,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0):
#
# This kernel is optimized when the inner most dimensions
......@@ -2275,7 +2308,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
# block.x = dim 0
# block.y = dim 1 rest
init = self._k_init(node, nodename)
decl = self._k_decl(node, nodename, pattern="010_inner")
decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
node, nodename,
'blockDim.x')
......@@ -2283,6 +2316,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2307,6 +2341,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 0):
# this kernel uses one block for each column,
# threads per block for each element per column.
......@@ -2319,19 +2355,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])")
kname = "kernel_reduce_110"
k_var = "kernel_reduce_110_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_110_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A, const int sA0,
const int sA1, const int sA2,
%(out_dtype)s * Z, const int sZ0)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2351,15 +2391,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 0):
reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]',
node, nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2378,15 +2428,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]', node,
nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2405,6 +2458,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1):
# this kernel uses one block for each row,
# threads per block for each element per row.
......@@ -2414,18 +2469,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
kname = "kernel_reduce_001"
k_var = "kernel_reduce_001_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_001_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A, const int sA0,
const int sA1, const int sA2,
%(out_dtype)s * Z, const int sZ0, const int sZ1)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0, const ga_ssize sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
extern __shared__ %(acc_type)s buf[];
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2436,7 +2495,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
%(reduce_fct)s;
......@@ -2446,17 +2505,27 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1, 1):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2466,7 +2535,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
......@@ -2479,17 +2548,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0, 1):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]',
node, nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2499,7 +2571,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
......@@ -2512,15 +2584,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]', node, nodename,
sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2540,6 +2615,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 1, 1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
node, nodename, sub={})
......@@ -2547,20 +2624,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
kname = "kernel_reduce_1011"
k_var= "kernel_reduce_1011_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_1011_%(nodename)s(
const unsigned int d0,
const unsigned int d1,
const unsigned int d2,
const unsigned int d3,
const %(in_dtype)s *A, const int sA0, const int sA1,
const int sA2, const int sA3,
%(out_dtype)s * Z, const int sZ0)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2, const ga_ssize sA3,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0)
{
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2580,14 +2660,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
print("""
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
""", file=sio)
return sio.getvalue()
params = [
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
return kernels
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
......@@ -2820,8 +2902,15 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(output)s = tmp;
}
if (%(sync)d)
GpuArray_sync(&%(output)s->ga);
if (%(sync)d) {
err = GpuArray_sync(&%(output)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: GpuCAReduceCPY: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s
}
}
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
......
......@@ -3,6 +3,12 @@ Helper routines for generating gpu kernels for nvcc.
"""
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
......@@ -26,7 +32,7 @@ def nvcc_kernel(name, params, body):
else:
yield b
bodystr = ';\n'.join(flatbody())
return """__global__ void %(name)s (%(paramstr)s)
return """KERNEL void %(name)s (%(paramstr)s)
{
%(bodystr)s;
}
......@@ -167,11 +173,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
We use __i as an int variable in a loop.
"""
ctype = gpuarray.dtype_to_ctype(dtype)
return [
# get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype,
('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
......@@ -181,7 +188,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype,
('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()',
# divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N +
......@@ -259,11 +266,12 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
ctype = gpuarray.dtype_to_ctype(dtype)
return """
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
npy_%(dtype)s red = %(init)s;
%(ctype)s red = %(init)s;
#pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
red = %(loop_line)s;
......@@ -356,6 +364,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
We use tx as an int variable in a loop.
"""
ctype = gpuarray.dtype_to_ctype(dtype)
ret = [
# get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
......@@ -363,7 +372,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
b, stride_b, load_b,
dtype),
'__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype,
('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
threadPos, threadCount,
......@@ -371,7 +380,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
lambda a: "exp(%s - row_max)" % a,
b, stride_b, load_b, dtype),
'__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype,
('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()',
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
]
......
import os
import numpy
from theano import Op, Apply, config
......@@ -12,13 +13,14 @@ except ImportError:
pass
from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host)
host_from_gpu, gpu_from_host,
GpuKernelBase, Kernel)
from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType
from .comp import NVCC_compiler
class GpuImages2Neibs(Images2Neibs, Op):
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
def __init__(self, mode='valid'):
if mode not in ['valid', 'ignore_borders', 'wrap_centered']:
raise NotImplementedError("Only the mode valid, ignore_borders"
......@@ -43,25 +45,41 @@ class GpuImages2Neibs(Images2Neibs, Op):
dtype=ten4.type.dtype)()])
def c_code_cache_version(self):
return (9, 1)
return (10,1)
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype
dtype_z = node.outputs[0].dtype
flags = Kernel.get_flags(dtype_ten4, dtype_z)
type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
type_z = gpuarray.dtype_to_ctype(dtype_z)
mode = self.mode
return """
kernels = []
kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename
code = """
//a version that use less register but don't work in all case.
static __global__ void k_multi_warp_less_%(nodename)s(
KERNEL void %(kname)s(
const int nb_batch,
const int nb_stack,
const int height,
......@@ -72,15 +90,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
const int step_y,
const int grid_c,
const int grid_d,
const int stride0, const int stride1,
const int stride2, const int stride3,
npy_%(dtype_ten4)s * global_ten4,
const int out_s0, const int out_s1,
npy_%(dtype_z)s * global_out
const size_t stride0, const size_t stride1,
const size_t stride2, const size_t stride3,
const %(type_ten4)s * global_ten4, const size_t offset_ten4,
const size_t out_s0, const size_t out_s1,
%(type_z)s * global_out, const size_t offset_out
)
{
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d;
......@@ -131,9 +151,22 @@ class GpuImages2Neibs(Images2Neibs, Op):
}
}
}
}
static __global__ void k_multi_warp_%(nodename)s(
}""" % locals()
params = [
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename
code = """
KERNEL void %(kname)s(
const int nb_batch,
const int nb_stack,
const int height,
......@@ -144,15 +177,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
const int step_y,
const int grid_c,
const int grid_d,
const int stride0, const int stride1,
const int stride2, const int stride3,
npy_%(dtype_ten4)s * global_ten4,
const int out_s0, const int out_s1,
npy_%(dtype_z)s * global_out
const size_t stride0, const size_t stride1,
const size_t stride2, const size_t stride3,
const %(type_ten4)s * global_ten4, const size_t offset_ten4,
const size_t out_s0, const size_t out_s1,
%(type_z)s * global_out, const size_t offset_out
)
{
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d;
......@@ -207,6 +242,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
}
}
""" % locals()
params = [
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
def c_code(self, node, name, inp, out, sub):
dtype_ten4 = node.inputs[0].dtype
......@@ -220,15 +266,21 @@ class GpuImages2Neibs(Images2Neibs, Op):
z, = out
fail = sub['fail']
mode = self.mode
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: *fptr: %%s.",
GpuKernel_error(fptr, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(z)s->ga);" % dict(z=z)
else:
cnda_thread_sync = ""
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return """
#ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
#endif
int grid_c = -1;
int grid_d = -1;
......@@ -281,10 +333,10 @@ class GpuImages2Neibs(Images2Neibs, Op):
PyGpuArray_DIMS(%(ten4)s)[3]);
%(fail)s;
}
grid_c = CEIL_INTDIV(((PyGpuArray_DIMS(%(ten4)s))[2]),
step_x);
grid_d = CEIL_INTDIV(((PyGpuArray_DIMS(%(ten4)s))[3]),
step_y);
grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]),
(size_t)step_x);
grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]),
(size_t)step_y);
}else if ( "%(mode)s" == "valid") {
......@@ -367,75 +419,57 @@ class GpuImages2Neibs(Images2Neibs, Op):
const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(d,c,1);
size_t threads_per_block[3] = {d, c, 1};
//Their is a max of 512 threads per blocks
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;
while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[1]>1)threads_per_block[1]--;
while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[0]>1)threads_per_block[0]--;
//Make bigger block to have better memory access pattern and
//a higher core utilisation. for smaller patch size
while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 &&
n_threads.z<PyGpuArray_DIMS(%(z)s)[0]){
n_threads.z++;
while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 &&
threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){
threads_per_block[2]++;
}
int nb_block;
if (PyGpuArray_DIMS(%(z)s)[0] %% n_threads.z == 0)
nb_block = PyGpuArray_DIMS(%(z)s)[0] / n_threads.z;
if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0)
nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2];
else
nb_block = (PyGpuArray_DIMS(%(z)s)[0] / n_threads.z) + 1;
dim3 n_blocks(std::min(32*1024,nb_block));
int n_shared = 0;
void (*f)(int, int, int ,int,
int, int, int ,int,
int, int,
int, int, int, int,
npy_%(dtype_ten4)s*,
int, int,
npy_%(dtype_z)s*);
if(n_threads.x==d && n_threads.y==c){
f = k_multi_warp_less_%(name)s;
}else{
f = k_multi_warp_%(name)s;
}
nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1;
size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1};
f<<<n_blocks, n_threads, n_shared>>>(
nb_batch,
nb_stack,
height, width,
c, d, step_x, step_y,
grid_c, grid_d,
PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s,
(npy_%(dtype_ten4)s*)(
((char *)cuda_get_ptr(%(ten4)s->ga.data)) +
%(ten4)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s,
(npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset)
);
%(cnda_thread_sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "GpuImages2Neibs:"
" Cuda error: %%s: %%s. (grid: %%i x %%i;"
" block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
n_shared);
%(fail)s;
GpuKernel *fptr;
if(threads_per_block[0]==d && threads_per_block[1]==c){
fptr = &k_multi_warp_less_%(name)s;
}else{
fptr = &k_multi_warp_%(name)s;
}
size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s;
size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s;
size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s;
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s;
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
void *kernel_params[] = {(void *)&nb_batch,
(void *)&nb_stack,
(void *)&height, (void *)&width,
(void *)&c, (void *)&d,
(void *)&step_x, (void *)&step_y,
(void *)&grid_c, (void *)&grid_d,
(void *)&stride_A0,
(void *)&stride_A1,
(void *)&stride_A2,
(void *)&stride_A3,
(void *)%(ten4)s->ga.data,
(void *)&%(ten4)s->ga.offset,
(void *)&stride_Z0,
(void *)&stride_Z1,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset};
int err = GpuKernel_call(fptr, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
} // END NESTED SCOPE
""" % locals()
......
from __future__ import print_function
import numpy
import os
from theano import Op, Apply, config
from six import StringIO
......@@ -10,16 +11,15 @@ try:
except ImportError:
pass
from .basic_ops import as_gpuarray_variable
from .comp import NVCC_compiler
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
inline_softmax,
inline_softmax_fixed_shared)
from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
"""
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
......@@ -41,10 +41,18 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am])
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>']
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
......@@ -54,28 +62,48 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
load_b = load_w(dtype_b)
write_x = write_w(dtype_x)
write_b = write_w(dtype_b)
return """
__global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N,
const npy_%(dtype_x)s* x_data, int xs0, int xs1,
const npy_%(dtype_b)s* b, int bs0,
const npy_%(dtype_y_idx)s* y_idx_data, int y_idxs0,
npy_%(dtype_x)s* nll_data, int nlls0,
npy_%(dtype_x)s* sm_data, int sms0, int sms1,
npy_%(dtype_y_idx)s* am_data, int ams0)
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
type_x = gpuarray.dtype_to_ctype(work_x)
type_b = gpuarray.dtype_to_ctype(work_b)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
kname = "k_xent_sm_1hot_bias"
k_var = "k_xent_sm_1hot_bias_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(const ga_size M, const ga_size N,
const %(type_x)s* x_data, const ga_size offset_x,
const ga_ssize xs0, const ga_ssize xs1,
const %(type_b)s* b, const ga_size offset_b,
const ga_ssize bs0,
const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
const ga_ssize y_idxs0,
%(type_x)s* nll_data, const ga_size offset_nll,
const ga_ssize nlls0,
%(type_x)s* sm_data, const ga_size offset_sm,
const ga_ssize sms0, const ga_ssize sms1,
%(type_y_idx)s* am_data, const ga_size offset_am,
const ga_ssize ams0)
{
x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
b = (const %(type_b)s *)(((char *)b)+offset_b);
y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);
for (int row = blockIdx.x; row < M; row += gridDim.x){
const npy_%(dtype_x)s* x = x_data + xs0 * row;
const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0];
npy_%(dtype_x)s* sm = sm_data + sms0 * row;
const %(type_x)s* x = x_data + xs0 * row;
const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0];
%(type_x)s* sm = sm_data + sms0 * row;
npy_%(work_x)s sum = 0.0;
%(type_x)s sum = 0.0;
int row_max_j = 0;
npy_%(work_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
%(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
for (int j = 1; j < N; ++j)
{
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
%(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
//todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
......@@ -83,16 +111,16 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
//compute the exp
for (int j = 0; j < N; ++j)
{
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
npy_%(work_x)s sm_ij = exp(row_ij - row_max);
%(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
%(type_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij;
sm[j * sms1] = %(write_x)s(sm_ij);
}
npy_%(work_x)s sum_inv = 1.0 / sum;
%(type_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j)
{
npy_%(work_x)s __tmp = %(load_x)s(sm[j * sms1]);
%(type_x)s __tmp = %(load_x)s(sm[j * sms1]);
__tmp *= sum_inv;
sm[j * sms1] = %(write_x)s(__tmp);
}
......@@ -111,12 +139,18 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
am_data[row*ams0] = row_max_j;
}
}
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
""" % locals()
def c_init_code(self):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
def c_code(self, node, nodename, inp, out, sub):
typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
......@@ -138,6 +172,21 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
dtype_am = node.outputs[2].dtype
classname = self.__class__.__name__
fail = sub['fail']
k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
sio = StringIO()
print("""
if (PyGpuArray_NDIM(%(y_idx)s) != 1)
......@@ -219,62 +268,47 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
}
}
{
int n_blocks = PyGpuArray_DIMS(%(x)s)[0] < 256 ? PyGpuArray_DIMS(%(x)s)[0] : 256;
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)256), 1, 1};
size_t threads_per_block[3] = {1, 1, 1};
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_NLL0 = PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_AM0 = PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s;
//TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1;
int n_shared_bytes = 0; //n_threads * sizeof(dtype);
k_xent_sm_1hot_bias_%(nodename)s<<<n_blocks, n_threads, n_shared_bytes>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(npy_%(dtype_nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) +
%(nll)s->ga.offset),
PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(npy_%(dtype_am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) +
%(am)s->ga.offset),
PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s);
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %(classname)s %(nodename)s: %%s.\\n"
"The kernel was launched with %%d threads,"
" %%d blocks and %%d shared memory\\n",
cudaGetErrorString(err),
n_threads, n_blocks, n_shared_bytes);
// no need to decref output vars the cleanup code will do it
%(fail)s;
}
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
(void *)&stride_B0,
(void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
(void *)&stride_YIDX0,
(void *)%(nll)s->ga.data, (void *)&%(nll)s->ga.offset,
(void *)&stride_NLL0,
(void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
(void *)&stride_SM0, (void *)&stride_SM1,
(void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset,
(void *)&stride_AM0};
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}
""" % locals(), file=sio)
return sio.getvalue()
def c_code_cache_version(self):
return (6,)
def c_compiler(self):
return NVCC_compiler
return (7,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(Op):
class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
"""
Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
......@@ -294,13 +328,18 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self):
return (9,)
return (10,)
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_compiler(self):
return NVCC_compiler
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
......@@ -312,20 +351,36 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype
type_intp = gpuarray.dtype_to_ctype(numpy.intp)
dnll, sm, y_idx = inp
dx, = out
fail = sub['fail']
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return """
// Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
const npy_intp %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
PyGpuArray_DIMS(%(dnll)s)[0] :
(npy_intp) 0);
const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
PyGpuArray_DIMS(%(dnll)s)[0] :
(ssize_t) 0);
// Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
// or a vector with just one element.
const npy_intp %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
PyGpuArray_STRIDES(%(dnll)s)[0] :
(npy_intp) 0);
const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
PyGpuArray_STRIDES(%(dnll)s)[0] :
(ssize_t) 0);
if ((PyGpuArray_NDIM(%(dnll)s) > 1)
|| (PyGpuArray_NDIM(%(sm)s) != 2)
......@@ -373,48 +428,33 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
}
}
{
int n_blocks = PyGpuArray_DIMS(%(dx)s)[0] < 256 ? PyGpuArray_DIMS(%(dx)s)[0] : 256;
int n_threads = PyGpuArray_DIMS(%(dx)s)[1] < 256 ? PyGpuArray_DIMS(%(dx)s)[1] : 256;
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<n_blocks, n_threads>>>(
PyGpuArray_DIMS(%(dx)s)[0],
PyGpuArray_DIMS(%(dx)s)[1],
(npy_%(dtype_dnll)s*)(((char *)cuda_get_ptr(%(dnll)s->ga.data)) +
%(dnll)s->ga.offset),
%(dnll)s_strides0 / %(itemsize_dnll)s,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(npy_%(dtype_dx)s*)(((char *)cuda_get_ptr(%(dx)s->ga.data)) +
%(dx)s->ga.offset),
PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s,
PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s
);
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n"
"The kernel was launched with %%d threads and"
" %%d blocks\\n",
"kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
cudaGetErrorString(err), n_threads, n_blocks);
%(fail)s;
}
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1};
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1};
ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s;
ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(dx)s)[0],
(void *)&PyGpuArray_DIMS(%(dx)s)[1],
(void *)%(dnll)s->ga.data, (void *)&%(dnll)s->ga.offset,
(void *)&stride_DNLL0,
(void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
(void *)&stride_SM0, (void *)&stride_SM1,
(void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
(void *)&stride_YIDX0,
(void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset,
(void *)&stride_DX0, (void *)&stride_DX1};
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}
assert(%(dx)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_dnll = node.inputs[0].dtype
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
......@@ -423,18 +463,35 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
load_dnll = load_w(dtype_dnll)
load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx)
return """
__global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
int N, int K,
const npy_%(dtype_dnll)s* dnll, const int dnll_s0,
const npy_%(dtype_sm)s* sm, const int sm_s0, const int sm_s1,
const npy_%(dtype_y_idx)s* y_idx, const int y_idx_s0,
npy_%(dtype_dx)s* dx, const int dx_s0, const int dx_s1)
flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
type_dnll = gpuarray.dtype_to_ctype(work_dnll)
type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
type_dx = gpuarray.dtype_to_ctype(dtype_dx)
kname = "kCrossEntropySoftmax1HotWithBiasDx"
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(
const ga_size N, const ga_size K,
const %(type_dnll)s* dnll, const ga_size offset_dnll,
const ga_ssize dnll_s0,
const %(type_sm)s* sm, const ga_size offset_sm,
const ga_ssize sm_s0, const ga_ssize sm_s1,
const %(type_y_idx)s* y_idx, const ga_size offset_y_idx,
const ga_ssize y_idx_s0,
%(type_dx)s* dx, const ga_size offset_dx,
const ga_ssize dx_s0, const ga_ssize dx_s1)
{
dnll = (const %(type_dnll)s *)(((char *)dnll)+offset_dnll);
sm = (const %(type_sm)s *)(((char *)sm)+offset_sm);
y_idx = (const %(type_y_idx)s *)(((char *)y_idx)+offset_y_idx);
dx = (%(type_dx)s *)(((char *)dx)+offset_dx);
for (int i = blockIdx.x; i < N; i += gridDim.x)
{
npy_%(work_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0];
%(type_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
%(type_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x)
{
......@@ -453,17 +510,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
}
}
}
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
""" % locals()
def c_init_code(self):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax (Op):
class GpuSoftmax (GpuKernelBase, Op):
"""
Implement Softmax on the gpu.
......@@ -482,12 +543,16 @@ class GpuSoftmax (Op):
def c_code_cache_version(self):
return (13,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
......@@ -502,10 +567,21 @@ class GpuSoftmax (Op):
x, = inp
z, = out
fail = sub['fail']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
else:
cnda_thread_sync = ""
sync = ""
return """
if (PyGpuArray_NDIM(%(x)s) != 2)
{
......@@ -528,97 +604,82 @@ class GpuSoftmax (Op):
}
}
{
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t)(32 * 1024));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32 * 1024)), 1, 1};
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512), 1, 1};
size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
const char *fmt_str, *msg;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GA_NO_ERROR;
if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{
//Those numbers are based on not too recent GPU
//to make them compatible with more GPU.
//TODO: read the information from the card.
if(n_shared_bytes < (32 * 1024 - 500)){
kSoftmax_%(nodename)s
<<<
n_blocks,
n_threads,
n_shared_bytes
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
if(shmem_sz < (32 * 1024 - 500)){
err = GpuKernel_call(&kSoftmax_%(nodename)s, 3,
threads_per_block, n_blocks, shmem_sz,
kernel_params);
fmt_str = "gpuarray error: kSoftmax_%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmax_%(nodename)s, err);
}else{
kSoftmax_fixed_shared%(nodename)s
<<<
n_blocks,
n_threads,
n_threads * sizeof(npy_%(work_x)s)
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
}
%(cnda_thread_sync)s
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n Used %%d blocks,"
" %%d threads %%d bytes of shared memory",
"kSoftmax[_fixed_shared]%(nodename)s",
cudaGetErrorString(err),
n_blocks, n_threads, n_shared_bytes);
%(fail)s;
err = GpuKernel_call(&kSoftmax_fixed_shared%(nodename)s, 3,
threads_per_block, n_blocks,
threads_per_block[0] * sizeof(npy_%(work_x)s),
kernel_params);
fmt_str = "gpuarray error: kSoftmax_fixed_shared%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmax_fixed_shared%(nodename)s, err);
}
%(err_check)s
%(sync)s
}
}
assert(%(z)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_sm = node.outputs[0].dtype
load_x = load_w(node.inputs[0].dtype)
load_x = load_w(dtype_x)
write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype)
ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
work_sm = work_dtype(dtype_sm)
flags = Kernel.get_flags(dtype_x, dtype_sm)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_sm = gpuarray.dtype_to_ctype(work_sm)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
kernels = []
kname = "kSoftmax"
k_var= "kSoftmax_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"npy_%(work_sm)s * buf2 = buf + N",
"extern __shared__ %s buf[]" % type_sm,
"%s * buf2 = buf + N" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
......@@ -626,21 +687,29 @@ class GpuSoftmax (Op):
'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmax_fixed_shared"
k_var= "kSoftmax_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"extern __shared__ %s buf[]" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
......@@ -649,12 +718,14 @@ class GpuSoftmax (Op):
"__syncthreads()",
"}",
])
return (ret1 + "\n" + ret2) % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (Op):
class GpuSoftmaxWithBias (GpuKernelBase, Op):
"""
Implement SoftmaxWithBias on the gpu.
......@@ -676,12 +747,18 @@ class GpuSoftmaxWithBias (Op):
def c_code_cache_version(self):
return (12,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
......@@ -698,10 +775,19 @@ class GpuSoftmaxWithBias (Op):
x, b = inp
z, = out
fail = sub['fail']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
else:
cnda_thread_sync = ""
sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
return """
if (PyGpuArray_NDIM(%(x)s) != 2)
{
......@@ -739,82 +825,51 @@ class GpuSoftmaxWithBias (Op):
}
}
{
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1};
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512), 1, 1};
size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
const char *fmt_str, *msg;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
(void *)&stride_B0,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GA_NO_ERROR;
if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{
if(n_shared_bytes < (32 * 1024 - 500)){
kSoftmaxWithBias_%(nodename)s
<<<
n_blocks,
n_threads,
n_shared_bytes
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
if(shmem_sz < (32 * 1024 - 500)){
err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3,
threads_per_block, n_blocks, shmem_sz,
kernel_params);
fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err);
}else{
kSoftmaxWithBias_fixed_shared%(nodename)s
<<<
n_blocks,
n_threads,
n_threads * sizeof(npy_%(work_x)s)
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(
((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s,
3, threads_per_block, n_blocks,
threads_per_block[0] * sizeof(npy_%(work_x)s),
kernel_params);
fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err);
}
%(cnda_thread_sync)s
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"kSoftmaxWithBias_%(nodename)s",
cudaGetErrorString(err));
%(fail)s;
}
%(err_check)s
%(sync)s
}
}
assert(%(z)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_sm = node.outputs[0].dtype
......@@ -822,55 +877,80 @@ class GpuSoftmaxWithBias (Op):
load_b = load_w(node.inputs[1].dtype)
write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype)
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'const npy_%(dtype_b)s * b', 'const int sb0',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_b = gpuarray.dtype_to_ctype(dtype_b)
type_sm = gpuarray.dtype_to_ctype(work_sm)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
kernels = []
kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"npy_%(work_sm)s * buf2 = buf + N",
"extern __shared__ %s buf[]" % type_sm,
"%s * buf2 = buf + N" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])",
"buf[tx] += %(load_b)s(b[tx * sb0])",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf[tx] += %s(b[tx * sb0])" % load_b,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x',
'const int sx0', 'const int sx1',
'const npy_%(dtype_b)s * b', 'const int sb0',
'npy_%(dtype_sm)s * sm',
'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf',
'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1',
write_sm,
'threadIdx.x',
'blockDim.x',
'b', 'sb0', load_b,
work_sm),
"__syncthreads()",
"}",
])
return (ret1 + "\n" + ret2) % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmaxWithBias_fixed_shared"
k_var = "kSoftmaxWithBias_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ %s buf[]" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
'b', 'sb0', load_b, work_sm),
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
# Pre-instantiated op; presumably reused by graph rewrites elsewhere in the
# codebase rather than constructing a new GpuSoftmaxWithBias each time —
# TODO confirm against callers.
gpu_softmax_with_bias = GpuSoftmaxWithBias()
from __future__ import print_function
import copy
import numpy
import os
import theano
from theano import tensor, gof, Op
from theano import tensor, gof, Op, config
from six.moves import StringIO
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace
......@@ -15,7 +16,7 @@ except ImportError:
pass
from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable, HideC
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel)
from .elemwise import GpuElemwise
from .comp import NVCC_compiler
......@@ -159,7 +160,7 @@ class GpuSubtensor(HideC, Subtensor):
return (6,)
class GpuIncSubtensor(IncSubtensor):
class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
"""
Implement IncSubtensor on the gpu.
......@@ -177,6 +178,13 @@ class GpuIncSubtensor(IncSubtensor):
def _f16_ok(self):
return self.iadd_node.op._f16_ok
def c_header_dirs(self):
    """Extra include directories: the CUDA toolkit's include/ dir when
    ``config.cuda.root`` is configured, otherwise nothing."""
    root = config.cuda.root
    if not root:
        return []
    return [os.path.join(root, 'include')]
def c_headers(self):
    # Delegate: the headers needed are those of the wrapped add node's op.
    inner_op = self.iadd_node.op
    return inner_op.c_headers()
......@@ -186,6 +194,10 @@ class GpuIncSubtensor(IncSubtensor):
def c_init_code(self):
    # Delegate: module-init C code comes from the wrapped add node's op.
    inner_op = self.iadd_node.op
    return inner_op.c_init_code()
def gpu_kernels(self, node, nodename):
    """Reuse the kernels of the wrapped add node's op, compiled under a
    name derived from this node's name so they do not collide."""
    inner_name = "%s_add_to_zview" % nodename
    return self.iadd_node.op.gpu_kernels(self.iadd_node, inner_name)
def make_node(self, x, y, *inputs):
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
......@@ -486,7 +498,7 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
k(x[i], reshaped_y, broadcast=True)
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
"""
Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent.
......@@ -525,16 +537,24 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self):
    """Version tag for the C code cache.

    Bumped to (5,) for the libgpuarray-API rewrite so previously cached
    modules are recompiled.  The original block contained both
    ``return (4,)`` and ``return (5,)`` (diff residue); the second return
    was unreachable dead code — only the merged value is kept.
    """
    return (5,)
def c_headers(self):
    """Headers required by the generated C code.

    This op is CUDA-only: building on an OpenCL context is refused.
    The original block ended the returned list twice (pre- and post-merge
    lines left by diff residue), which is a SyntaxError; only the merged
    list — including <gpuarray/types.h> — is kept.
    """
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
            '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self):
    # Compile the generated code with nvcc (via NVCC_compiler) rather than
    # the default host compiler: the support code for this op is
    # CUDA-specific (see the 'cuda only' checks and setup_ext_cuda() in the
    # sibling methods).
    return NVCC_compiler
def c_header_dirs(self):
    """CUDA toolkit include directory for the generated code.

    CUDA-only op: raises MethodNotDefined on OpenCL contexts.  Returns
    ``[]`` (not the original implicit ``None``) when no CUDA root is
    configured, so callers that iterate over the directories do not crash;
    this also matches GpuIncSubtensor.c_header_dirs elsewhere in this file.
    """
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    cuda_root = config.cuda.root
    if cuda_root:
        return [os.path.join(cuda_root, 'include')]
    return []
def c_init_code(self):
    """Module-init C code; CUDA-only (raises MethodNotDefined on OpenCL)."""
    ctx_kind = pygpu.get_default_context().kind
    if ctx_kind == 'opencl':
        raise MethodNotDefined('cuda only')
    return ['setup_ext_cuda();']
def c_code(self, node, name, inputs, outputs, sub):
......@@ -569,7 +589,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
}
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
......@@ -578,7 +598,14 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize
return """
flags=Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind)
type_out = gpuarray.dtype_to_ctype(dtype_out)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """
/*
* This is a version of atomicAdd that works for half-floats. It may
......@@ -587,37 +614,43 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
* will not be modified.
*/
__device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
npy_uint32 *base = (npy_uint32 *)((size_t)addr & ~2);
npy_uint32 old, assumed, sum, new_;
__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((size_t)addr & 2) ? 0x5410 : 0x3254);
__half2float((ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410);
return (ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
}
__global__ void k_vector_add_fast(int numRowsX,
int numColsX,
int stridesX0,
int stridesX1,
npy_%(dtype_x)s *X,
int numRowsY,
int numColsY,
int stridesY0,
int stridesY1,
npy_%(dtype_y)s *Y,
int numIndices,
int stridesIndices,
npy_%(dtype_ind)s *indices_arr)
KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX,
const ga_ssize stridesX0,
const ga_ssize stridesX1,
%(type_x)s *X,
const ga_size offset_X,
const ga_size numRowsY,
const ga_size numColsY,
const ga_ssize stridesY0,
const ga_ssize stridesY1,
%(type_y)s *Y,
const ga_size offset_Y,
const ga_size numIndices,
const ga_ssize stridesIndices,
%(type_ind)s *indices_arr,
const ga_size offset_indices_arr)
{
X = (%(type_x)s *)(((char *)X)+offset_X);
Y = (%(type_y)s *)(((char *)Y)+offset_Y);
indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr);
for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
{
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
......@@ -631,41 +664,71 @@ __device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
}
return;
}
""" % locals()
params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'intp', gpuarray.GpuArray, 'uintp'
]
return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)]
def c_support_code_apply(self, node, nodename):
    """Emit a host-side C helper, GpuArray_vector_add_fast(), appended to the
    parent class's support code.

    The helper launches the k_vector_add_fast kernel (built by gpu_kernels,
    compiled under the name %(k_var)s) via GpuKernel_call, presumably to
    accumulate rows of py_other into rows of py_self selected by
    indices_arr — the kernel body lives in gpu_kernels; confirm there.

    Fixes applied to the original block:
    - Diff residue had left the pre-merge ``k_vector_add_fast<<<...>>>``
      launch (including an early ``return;``) inside the C string alongside
      the new GpuKernel_call path, producing duplicate/dead and invalid C;
      only the merged GpuKernel_call path is kept.
    - The ``sync`` template referenced ``%(z)s``, which is not defined in
      this method's ``locals()`` and would raise KeyError when
      ``config.gpuarray.sync`` is enabled; the array the kernel writes to
      is the C parameter ``py_self``, so it syncs on that instead.
    """
    dtype_x = node.inputs[0].dtype
    dtype_y = node.inputs[1].dtype
    dtype_ind = node.inputs[2].dtype
    dtype_out = node.outputs[0].dtype
    # libgpuarray strides are in bytes; the kernel expects element strides,
    # so the template divides by each dtype's itemsize.
    itemsize_x = numpy.dtype(dtype_x).itemsize
    itemsize_y = numpy.dtype(dtype_y).itemsize
    itemsize_ind = numpy.dtype(dtype_ind).itemsize
    # Computed for parity with gpu_kernels; not referenced in the template.
    itemsize_out = numpy.dtype(dtype_out).itemsize
    k_var = "k_vector_add_fast_" + nodename
    err_check = """
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "gpuarray error: %(k_var)s: %%s.",
                         GpuKernel_error(&%(k_var)s, err));
        }
    """ % locals()
    sync = ""
    if config.gpuarray.sync:
        sync = """
        err = GpuArray_sync(&py_self->ga);
        %(err_check)s
        """ % locals()
    return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
    void GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
                                  PyGpuArrayObject* py_other,
                                  PyGpuArrayObject *indices_arr)
    {
        size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
        size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
        if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
            ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s;
            ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s;
            ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s;
            ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s;
            ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s;
            void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
                                     (void *)&PyGpuArray_DIMS(py_self)[1],
                                     (void *)&stride_X0,
                                     (void *)&stride_X1,
                                     (void *)py_self->ga.data,
                                     (void *)&py_self->ga.offset,
                                     (void *)&PyGpuArray_DIMS(py_other)[0],
                                     (void *)&PyGpuArray_DIMS(py_other)[1],
                                     (void *)&stride_Y0,
                                     (void *)&stride_Y1,
                                     (void *)py_other->ga.data,
                                     (void *)&py_other->ga.offset,
                                     (void *)&PyGpuArray_DIMS(indices_arr)[0],
                                     (void *)&stride_ind,
                                     (void *)indices_arr->ga.data,
                                     (void *)&indices_arr->ga.offset};
            int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
            %(err_check)s
            %(sync)s
        }
    }
    """ % locals()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论