提交 5ecbbde2 authored 作者: abergeron's avatar abergeron

Merge pull request #3364 from seanprime7/drvapi

Use the libgpuarray APIs to manage GPU code compilation, execution, etc.
...@@ -144,6 +144,15 @@ class GpuKernelBase(object): ...@@ -144,6 +144,15 @@ class GpuKernelBase(object):
def _generate_kernel_vars(self, k): def _generate_kernel_vars(self, k):
return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar) return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
def c_support_code(self):
return """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
def c_support_code_apply(self, node, name): def c_support_code_apply(self, node, name):
kernels = self.gpu_kernels(node, name) kernels = self.gpu_kernels(node, name)
bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels) bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
......
...@@ -10,12 +10,6 @@ PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern, ...@@ -10,12 +10,6 @@ PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
const size_t subsample_cols, const size_t subsample_cols,
const int version, const int verbose); const int version, const int verbose);
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
/* /*
* version: -1, autodetect, >=0 a specific version to use. * version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation * If it can't be executed, we revert to the reference implementation
...@@ -108,6 +102,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -108,6 +102,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//TODO: make a parameter the number of division //TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead? //TODO: Should we make them in separate grid block instead?
const int stack_len = PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1]; const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0]; const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0]; const int nkern=PyGpuArray_DIMS(kern)[0];
...@@ -126,6 +121,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -126,6 +121,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4; const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4; const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4; const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid; const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid; const int kern_size=kern_len*kern_wid;
...@@ -156,16 +155,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -156,16 +155,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we don't need to unflip it, but have the new value when we unflip it. //we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true; bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d; bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
const float * kern_data_unflipped = cuda_get_ptr(kern); if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
//the last two dimensions are c_contiguous but flipped! //the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false; kern_flipped=false;
kern_contiguous_2d_unflipped = true; kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
} }
//if we remove the restriction //if we remove the restriction
...@@ -195,46 +188,47 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -195,46 +188,47 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++; nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
dim3 grid(nbatch, nkern); (size_t)1};
int shared_size=(img_size + kern_size)*sizeof(float); size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
void (*f)(const float*, const float*, float*,
int, int, int, int, size_t shmem_sz = (img_size + kern_size)*sizeof(float);
int, int);
GpuKernel *k = NULL;
#define CONV_PATCH_SPECIAL(kern_wid) \ if(threads_per_block[1]==out_len) k=&conv_patch_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\ else k=&conv_patch_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
else f=conv_patch<true,kern_wid,true>;
void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
CONV_PATCH_SPECIAL(THEANO_KERN_WID); (void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
f<<< grid, threads, shared_size>>> (void *)&img_len, (void *)&img_wid,
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out), (void *)&kern_len, (void *)&kern_wid,
img_len, img_wid, kern_len, kern_wid, nkern, nstack); (void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (err == GA_NO_ERROR)
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n", "INFO: used 'conv_patch' version %s nb_split=%d\n",
threads.y==out_len ? "no split": "split", nb_split); threads_per_block[1]==out_len ? "no split": "split", nb_split);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i\n", " n_blocks[0]=%i, n_blocks[1]=%i,"
threads.x, threads.y, grid.x, grid.y, " shmem_sz=%i, nb_threads=%i, nb_split=%i\n",
shared_size, threads.x * threads.y, nb_split); threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1], nb_split);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s)," "INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
...@@ -253,77 +247,77 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -253,77 +247,77 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing. if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)ceil_intdiv(out_len,nb_split),
(size_t)1};
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail; bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false; if(version==11 || version==12) preload_full_kernel=false;
dim3 grid(nbatch,nkern); size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float); size_t shmem_sz = (img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(const float*, const float*, float*, GpuKernel *k = NULL;
int, int, int, int, if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_64_node_<<<<HASH_PLACEHOLDER>>>>_0;}
int, int, int, int, else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_65_node_<<<<HASH_PLACEHOLDER>>>>_0;}
int, int, int, int, else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_66_node_<<<<HASH_PLACEHOLDER>>>>_0;}
int, int, int, int, else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_67_node_<<<<HASH_PLACEHOLDER>>>>_0;}
int, int); else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_68_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_69_node_<<<<HASH_PLACEHOLDER>>>>_0;}
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_70_node_<<<<HASH_PLACEHOLDER>>>>_0;}
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_71_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_72_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_73_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_74_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_75_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_76_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_77_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_78_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_79_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_80_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_81_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_82_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_83_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_84_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_85_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_86_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_87_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_88_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_89_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_90_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_91_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_92_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_93_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_94_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_95_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\ void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\ (void *)kern->ga.data, (void *)&kern->ga.offset,
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \ (void *)out->ga.data, (void *)&out->ga.offset,
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \ (void *)&img_len, (void *)&img_wid,
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \ (void *)&kern_len, (void *)&kern_wid,
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;} (void *)&out_len, (void *)&out_wid,
(void *)&nkern, (void *)&nstack,
CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID); (void *)&img_stride_col, (void *)&img_stride_row,
f<<< grid, threads, shared_size>>> (void *)&img_stride_stack, (void *)&img_stride_batch,
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out), (void *)&kern_stride_col, (void *)&kern_stride_row,
img_len, img_wid, kern_len, kern_wid, (void *)&kern_stride_stack, (void *)&kern_stride_nkern,
out_len, out_wid, nkern, nstack, (void *)&subsample_rows, (void *)&subsample_cols};
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col, kern_stride_row, int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
if (err == GA_NO_ERROR)
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i," " shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i," " kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i," " img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i," " kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i," " preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n", " subsample_rows=%llu, subsample_cols=%llu\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y, shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel, nb_split, preload_full_kernel,
(unsigned long long)subsample_rows, (unsigned long long)subsample_rows,
...@@ -342,15 +336,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -342,15 +336,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i," " shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false," " kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i," " kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i," " kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i," " preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n", " subsample_rows=%llu, subsample_cols=%llu\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y, shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel, nb_split, preload_full_kernel,
(unsigned long long)subsample_rows, (unsigned long long)subsample_rows,
...@@ -359,7 +353,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -359,7 +353,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s)," "INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
...@@ -371,30 +365,28 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -371,30 +365,28 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
!work_complete) //conv_rows !work_complete) //conv_rows
{ {
dim3 threads(out_wid); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)1, (size_t)1};
dim3 grid(out_len, nbatch*nkern); size_t n_blocks[3] = {(size_t)out_len, (size_t)nbatch*nkern, (size_t)1};
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float); size_t shmem_sz = (kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int, GpuKernel *k = NULL;
int, int, int, int, if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int, else k=&conv_rows_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int);
void *kernel_params[] = {
#define CONV_ROWS_SPECIAL(kern_wid) \ (void *)img->ga.data, (void *)&img->ga.offset,
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\ (void *)kern->ga.data, (void *)&kern->ga.offset,
else f = conv_rows<kern_wid, true>;\ (void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
CONV_ROWS_SPECIAL(THEANO_KERN_WID); (void *)&kern_len, (void *)&kern_wid,
f<<< grid, threads, shared_size >>> (void *)&nkern, (void *)&nstack,
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out), (void *)&img_stride_col, (void *)&img_stride_row,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, (void *)&img_stride_stack, (void *)&img_stride_batch,
img_stride_col, img_stride_row, (void *)&kern_stride_col, (void *)&kern_stride_row,
img_stride_stack,img_stride_batch, (void *)&kern_stride_stack, (void *)&kern_stride_nkern};
kern_stride_col, kern_stride_row, int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
kern_stride_stack, kern_stride_nkern);
if (err == GA_NO_ERROR)
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) if (verbose)
...@@ -404,15 +396,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -404,15 +396,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y); shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s)," "INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
...@@ -430,52 +422,50 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -430,52 +422,50 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_row=i; nb_row=i;
} }
dim3 threads(out_wid,nb_row); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern); size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*, size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
if (0) if (0)
fprintf(stderr, fprintf(stderr,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n", "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
img_contiguous_2d, kern_contiguous_2d, img_contiguous_2d, kern_contiguous_2d,
threads.x, threads.y, threads.z, threads_per_block[0], threads_per_block[1], threads_per_block[2],
grid.x, grid.y, grid.z); n_blocks[0], n_blocks[1], n_blocks[2]);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) { if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n"); //fprintf(stderr, "using false version\n");
f = conv_rows_stack<THEANO_KERN_WID, false>; k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
} else { } else {
//fprintf(stderr, "using true version\n"); //fprintf(stderr, "using true version\n");
f = conv_rows_stack<THEANO_KERN_WID, true>; k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
} }
f<<< grid, threads, shared_size >>> void *kernel_params[] = {
(cuda_get_ptr(img), (void *)img->ga.data, (void *)&img->ga.offset,
cuda_get_ptr(kern), (void *)kern->ga.data, (void *)&kern->ga.offset,
cuda_get_ptr(out), (void *)out->ga.data, (void *)&out->ga.offset,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, (void *)&img_len, (void *)&img_wid,
img_stride_col, img_stride_row, (void *)&kern_len, (void *)&kern_wid,
img_stride_stack,img_stride_batch, (void *)&nkern, (void *)&nstack,
kern_stride_col, kern_stride_row, (void *)&img_stride_col, (void *)&img_stride_row,
kern_stride_stack, kern_stride_nkern); (void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
cudaError_t sts = cudaGetLastError(); (void *)&kern_stride_stack, (void *)&kern_stride_nkern};
if (cudaSuccess == sts) int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{ {
work_complete = true; work_complete = true;
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y); shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n"); fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
} }
...@@ -483,15 +473,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -483,15 +473,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y); shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s)," "INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
...@@ -524,45 +514,41 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -524,45 +514,41 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//to test the case when we don't have a thread by output pixel. //to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--; if((version_back!=-1)&& nb_row>1) nb_row--;
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
int shared_size=(threads.y*img_wid + k_size)*sizeof(float); size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*, GpuKernel *k = NULL;
int, int, int, int, if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) k=&conv_rows_stack2_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int, else if(version==9) k=&conv_rows_stack2_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int, else if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_stack2_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int); else k=&conv_rows_stack2_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \ void *kernel_params[] = {
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\ (void *)img->ga.data, (void *)&img->ga.offset,
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\ (void *)kern->ga.data, (void *)&kern->ga.offset,
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\ (void *)out->ga.data, (void *)&out->ga.offset,
else f = conv_rows_stack2<kern_wid, true, false>; (void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID); (void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
f<<< grid, threads, shared_size >>> (void *)&img_stride_stack, (void *)&img_stride_batch,
(cuda_get_ptr(img), (void *)&kern_stride_col, (void *)&kern_stride_row,
cuda_get_ptr(kern), (void *)&kern_stride_stack, (void *)&kern_stride_nkern};
cuda_get_ptr(out), int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, if (err == GA_NO_ERROR)
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y); shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with" "INFO: used 'conv_rows_stack2' version %s with"
...@@ -574,15 +560,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -574,15 +560,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i version=%d\n", " shmem_sz=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y,(version==9?2:3)); shmem_sz, threads_per_block[0] * threads_per_block[1],(version==9?2:3));
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s)," "INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
...@@ -629,18 +615,18 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -629,18 +615,18 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_split++; nb_split++;
// tentative estimates (prior to contraint c) // tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split); size_t thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern size_t shmem_sz = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z) ? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z)); : std::max((size_t)img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c) // constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){ while ((shmem_sz >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more. //if we can't fit the kernel in shared memory, we must split it more.
nb_split++; nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split); thread_z=ceil_intdiv(kern_len,nb_split);
shared_size = sizeof(float)*(full_kern shmem_sz = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z) ? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z)); : std::max(img_size + thread_z*kern_wid, out_size*thread_z));
} }
if (nb_split <= kern_len) if (nb_split <= kern_len)
...@@ -648,57 +634,59 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -648,57 +634,59 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert(thread_z>0);//should not happen, but in case... assert(thread_z>0);//should not happen, but in case...
if(!full_kern) assert(thread_z!=kern_len); if(!full_kern) assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z); size_t threads_per_block[3] = {(size_t)out_wid,
dim3 grid(nbatch,nkern); (size_t)out_len,
(size_t)thread_z};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
void (*f)(const float*, const float*, float*, GpuKernel *k = NULL;
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
const bool split=thread_z!=kern_len; const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped; const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern); //printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel //We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\ /* if(!kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\ /*else*/ if(!kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\ else if(!kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\ else if(!kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\ /*else if(!kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_4_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\ else if(!kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\ else if(!kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\ else if(!kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_7_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\ /*else if(kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_8_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\ else if(kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\ else if(kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\ else if(kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_11_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\ /*else if(kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_12_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\ else if(kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\ else if(kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>; else if(kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_15_node_<<<<HASH_PLACEHOLDER>>>>_0;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
void *kernel_params[] = {
f<<< grid, threads, shared_size>>>(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out), (void *)img->ga.data, (void *)&img->ga.offset,
img_len, img_wid, kern_len, kern_wid, (void *)kern->ga.data, (void *)&kern->ga.offset,
nkern, nstack, (void *)out->ga.data, (void *)&out->ga.offset,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, (void *)&img_len, (void *)&img_wid,
kern_stride_col_unflipped, kern_stride_row_unflipped, (void *)&kern_len, (void *)&kern_wid,
kern_stride_stack, kern_stride_nkern); (void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
cudaError_t sts = cudaGetLastError(); (void *)&img_stride_stack, (void *)&img_stride_batch,
if (cudaSuccess == sts) (void *)&kern_stride_col,
(void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{ {
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i, " "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i, "
"grid.x=%i, grid.y=%i, shared_size=%i," "n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i,"
" nb_threads=%i\n", " nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y, threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y * threads.z); shmem_sz, threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version" "INFO: used 'conv_patch_stack_reduce' version"
...@@ -711,17 +699,17 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -711,17 +699,17 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i," " n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i,"
" nb_threads=%i\n", " nb_threads=%i\n",
threads.x, threads.y, threads.z, threads_per_block[0], threads_per_block[1], threads_per_block[2],
grid.x, grid.y, shared_size, n_blocks[0], n_blocks[1], shmem_sz,
threads.x * threads.y * threads.z); threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s)," "INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} // else no good nb_splits was found } // else no good nb_splits was found
} }
...@@ -730,8 +718,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -730,8 +718,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
kern_len<=320 && kern_len<=320 &&
!work_complete) //conv_valid_row_reduce !work_complete) //conv_valid_row_reduce
{ {
int outsize = PyGpuArray_SIZE(out); size_t outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096); size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
int block_nstack=nstack; int block_nstack=nstack;
//Max of 512 threads per blocks. //Max of 512 threads per blocks.
...@@ -739,9 +728,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -739,9 +728,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//8k registers and the kernel use 23 register //8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register... //TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--; while(block_nstack*kern_len>320)block_nstack--;
dim3 n_threads(block_nstack, kern_len, 1); size_t threads_per_block[3] = {(size_t)block_nstack, (size_t)kern_len, (size_t)1};
int n_reduce_buf = block_nstack * kern_len * sizeof(float); size_t n_reduce_buf = block_nstack * kern_len * sizeof(float);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float) /* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
* *
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0. * if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
...@@ -758,39 +747,34 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -758,39 +747,34 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert (initial_reduce_boundary < n_reduce_buf/sizeof(float)); assert (initial_reduce_boundary < n_reduce_buf/sizeof(float));
} }
GpuKernel *k = NULL;
void (*f)(int, int, int, int,
int, int, int, int, int,
const float*, int, int, int, int,
const float*, int, int, int, int,
float*, int, int, int, int,
int, int, int);
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n"; //std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n"; //std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n"; //std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack) if(block_nstack==nstack)
f=conv_valid_row_reduce<false>; k=&conv_valid_row_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else else
f=conv_valid_row_reduce<true>; k=&conv_valid_row_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, PyGpuArray_DIMS(img)[1], void *kernel_params[] = {
img_len, img_wid, (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
kern_len, kern_wid, (void *)&img_len, (void *)&img_wid,
out_len, out_wid, (void *)&kern_len, (void *)&kern_wid,
cuda_get_ptr(img), (void *)&out_len, (void *)&out_wid,
PyGpuArray_STRIDES(img)[0]/4, PyGpuArray_STRIDES(img)[1]/4, (void *)img->ga.data, (void *)&img->ga.offset,
img_stride_row, img_stride_col, (void *)&img_stride_batch, (void *)&img_stride_stack,
cuda_get_ptr(kern), (void *)&img_stride_row, (void *)&img_stride_col,
PyGpuArray_STRIDES(kern)[0]/4, PyGpuArray_STRIDES(kern)[1]/4, (void *)kern->ga.data, (void *)&kern->ga.offset,
PyGpuArray_STRIDES(kern)[2]/4, PyGpuArray_STRIDES(kern)[3]/4, (void *)&kern_stride_nkern, (void *)&kern_stride_stack,
cuda_get_ptr(out), (void *)&kern_stride_row, (void *)&kern_stride_col,
PyGpuArray_STRIDES(out)[0]/4, PyGpuArray_STRIDES(out)[1]/4, (void *)out->ga.data, (void *)&out->ga.offset,
PyGpuArray_STRIDES(out)[2]/4, PyGpuArray_STRIDES(out)[3]/4, (void *)&out_stride_batch, (void *)&out_stride_nkern,
subsample_rows, subsample_cols, initial_reduce_boundary); (void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols,
cudaError_t sts = cudaGetLastError(); (void *)&initial_reduce_boundary};
if (cudaSuccess == sts) int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, n_reduce_buf, kernel_params);
if (err == GA_NO_ERROR)
{ {
work_complete = true; work_complete = true;
if (verbose) if (verbose)
...@@ -800,24 +784,27 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -800,24 +784,27 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
n_threads.x, n_threads.y, n_blocks, threads_per_block[0], threads_per_block[1], n_blocks[0],
n_reduce_buf, n_threads.x * n_threads.y); n_reduce_buf, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s)," "INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
if (1 && !work_complete) //conv_reference_valid if (1 && !work_complete) //conv_reference_valid
{ {
int outsize = PyGpuArray_SIZE(out); size_t outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096); size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), (size_t)1, (size_t)1};
256); size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (1) if (1)
{ {
if (verbose) if (verbose)
...@@ -825,61 +812,56 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -825,61 +812,56 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (verbose>1) if (verbose>1)
fprintf(stderr, " img : %i %llu %i %i %p " fprintf(stderr, " img : %i %llu %i %i %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
nbatch, (unsigned long long)PyGpuArray_DIMS(img)[1], nbatch, (unsigned long long)stack_len, img_len, img_wid,
img_len, img_wid, (void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
cuda_get_ptr(img), (long long)img_stride_batch,
(long long)PyGpuArray_STRIDES(img)[0]/4, (long long)img_stride_stack,
(long long)PyGpuArray_STRIDES(img)[1]/4, (long long)img_stride_row,
(long long)PyGpuArray_STRIDES(img)[2]/4, (long long)img_stride_col);
(long long)PyGpuArray_STRIDES(img)[3]/4);
if (verbose>1) if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p " fprintf(stderr, " kern: %i %i %i %i %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
nkern, nstack, kern_len, kern_wid, nkern, nstack, kern_len, kern_wid,
cuda_get_ptr(kern), (void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)PyGpuArray_STRIDES(kern)[0]/4, (long long)kern_stride_nkern,
(long long)PyGpuArray_STRIDES(kern)[1]/4, (long long)kern_stride_stack,
(long long)PyGpuArray_STRIDES(kern)[2]/4, (long long)kern_stride_row,
(long long)PyGpuArray_STRIDES(kern)[3]/4); (long long)kern_stride_col);
if (verbose>1) if (verbose>1)
fprintf(stderr, " out : %llu %llu %i %i %p " fprintf(stderr, " out : %llu %llu %i %i %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0], (unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1], (unsigned long long)PyGpuArray_DIMS(out)[1],
out_len, out_wid, out_len, out_wid,
cuda_get_ptr(out), (void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)PyGpuArray_STRIDES(out)[0]/4, (long long)out_stride_batch,
(long long)PyGpuArray_STRIDES(out)[1]/4, (long long)out_stride_nkern,
(long long)PyGpuArray_STRIDES(out)[2]/4, (long long)out_stride_row,
(long long)PyGpuArray_STRIDES(out)[3]/4); (long long)out_stride_col);
if (verbose>1) if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n", fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads); outsize, n_blocks[0], threads_per_block[0]);
} }
conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
PyGpuArray_DIMS(img)[1], void *kernel_params[] = {
img_len, img_wid, (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
kern_len, kern_wid, (void *)&img_len, (void *)&img_wid,
out_len, out_wid, (void *)&kern_len, (void *)&kern_wid,
cuda_get_ptr(img), (void *)&out_len, (void *)&out_wid,
PyGpuArray_STRIDES(img)[0]/4, (void *)img->ga.data, (void *)&img->ga.offset,
PyGpuArray_STRIDES(img)[1]/4, (void *)&img_stride_batch, (void *)&img_stride_stack,
PyGpuArray_STRIDES(img)[2]/4, (void *)&img_stride_row, (void *)&img_stride_col,
PyGpuArray_STRIDES(img)[3]/4, (void *)kern->ga.data, (void *)&kern->ga.offset,
cuda_get_ptr(kern), (void *)&kern_stride_nkern, (void *)&kern_stride_stack,
PyGpuArray_STRIDES(kern)[0]/4, (void *)&kern_stride_row, (void *)&kern_stride_col,
PyGpuArray_STRIDES(kern)[1]/4, (void *)out->ga.data, (void *)&out->ga.offset,
PyGpuArray_STRIDES(kern)[2]/4, (void *)&out_stride_batch, (void *)&out_stride_nkern,
PyGpuArray_STRIDES(kern)[3]/4, (void *)&out_stride_row, (void *)&out_stride_col,
cuda_get_ptr(out), (void *)&subsample_rows, (void *)&subsample_cols};
PyGpuArray_STRIDES(out)[0]/4, int err = GpuKernel_call(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0,
PyGpuArray_STRIDES(out)[1]/4, 3, threads_per_block, n_blocks, 0, kernel_params);
PyGpuArray_STRIDES(out)[2]/4,
PyGpuArray_STRIDES(out)[3]/4, if (err == GA_NO_ERROR)
subsample_rows, subsample_cols);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) if (verbose)
...@@ -892,7 +874,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -892,7 +874,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for" "ERROR: all implementations failed for"
" PyGpuArray_conv_valid! (%s)", " PyGpuArray_conv_valid! (%s)",
cudaGetErrorString(sts)); GpuKernel_error(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1; return -1;
} }
} }
...@@ -941,6 +923,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -941,6 +923,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]); assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]);
assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]); assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]);
const int stack_len=PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1]; const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0]; const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0]; const int nkern=PyGpuArray_DIMS(kern)[0];
...@@ -959,6 +942,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -959,6 +942,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4; const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4; const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4; const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid; const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid; const int kern_size=kern_len*kern_wid;
...@@ -1001,16 +988,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1001,16 +988,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
//we don't need to unflip it, but have the new value when we unflip it. //we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true; bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d; bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
const float * kern_data_unflipped = cuda_get_ptr(kern); if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
//the last two dimensions are c_contiguous but flipped! //the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false; kern_flipped=false;
kern_contiguous_2d_unflipped = true; kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
} }
if (verbose>1) if (verbose>1)
...@@ -1019,34 +1000,34 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1019,34 +1000,34 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID); " MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
printf("INFO: img dim: %llu %llu %llu %llu " printf("INFO: img dim: %llu %llu %llu %llu "
"img stride: %lld %lld %lld %lld\n", "img stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0], (unsigned long long)nbatch,
(unsigned long long)PyGpuArray_DIMS(img)[1], (unsigned long long)stack_len,
(unsigned long long)PyGpuArray_DIMS(img)[2], (unsigned long long)img_len,
(unsigned long long)PyGpuArray_DIMS(img)[3], (unsigned long long)img_wid,
(long long)PyGpuArray_STRIDES(img)[0]/4, (long long)img_stride_batch,
(long long)PyGpuArray_STRIDES(img)[1]/4, (long long)img_stride_stack,
(long long)PyGpuArray_STRIDES(img)[2]/4, (long long)img_stride_row,
(long long)PyGpuArray_STRIDES(img)[3]/4); (long long)img_stride_col);
printf("INFO: kern dim: %llu %llu %llu %llu " printf("INFO: kern dim: %llu %llu %llu %llu "
"kern stride: %lld %lld %lld %lld\n", "kern stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0], (unsigned long long)nkern,
(unsigned long long)PyGpuArray_DIMS(kern)[1], (unsigned long long)nstack,
(unsigned long long)PyGpuArray_DIMS(kern)[2], (unsigned long long)kern_len,
(unsigned long long)PyGpuArray_DIMS(kern)[3], (unsigned long long)kern_wid,
(long long)PyGpuArray_STRIDES(kern)[0]/4, (long long)kern_stride_nkern,
(long long)PyGpuArray_STRIDES(kern)[1]/4, (long long)kern_stride_stack,
(long long)PyGpuArray_STRIDES(kern)[2]/4, (long long)kern_stride_row,
(long long)PyGpuArray_STRIDES(kern)[3]/4); (long long)kern_stride_col);
printf("INFO: out dim: %llu %llu %llu %llu " printf("INFO: out dim: %llu %llu %llu %llu "
"out stride: %lld %lld %lld %lld\n", "out stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0], (unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1], (unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2], (unsigned long long)out_len,
(unsigned long long)PyGpuArray_DIMS(out)[3], (unsigned long long)out_wid,
(long long)PyGpuArray_STRIDES(out)[0]/4, (long long)out_stride_batch,
(long long)PyGpuArray_STRIDES(out)[1]/4, (long long)out_stride_nkern,
(long long)PyGpuArray_STRIDES(out)[2]/4, (long long)out_stride_row,
(long long)PyGpuArray_STRIDES(out)[3]/4); (long long)out_stride_col);
} }
if (!subsample && if (!subsample &&
...@@ -1093,53 +1074,53 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1093,53 +1074,53 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert(version!=5 || kern_len>1); assert(version!=5 || kern_len>1);
assert(version!=-1); assert(version!=-1);
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); size_t threads_per_block[3] = {(size_t)out_wid,
dim3 grid(nbatch,nkern); ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
int shared_size=img_size_padded_byte + kern_size_byte; size_t shmem_sz=img_size_padded_byte + kern_size_byte;
if(version==5) if(version==5)
shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte; shmem_sz=((kern_len+threads_per_block[1]-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
void (*f)(const float*, const float*, float*,
int, int, int, int, GpuKernel *k = NULL;
int, int, int, int, if(version==3) k=&conv_full_patch_stack_padded_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int, else if(version==5) k=&conv_full_patch_stack_padded_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int); else if(version==4) k=&conv_full_patch_stack_padded_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) k=&conv_full_patch_stack_padded_4_node_<<<<HASH_PLACEHOLDER>>>>_0;
#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) k=&conv_full_patch_stack_padded_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) k=&conv_full_patch_stack_padded_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\ else if(version==3 && kern_flipped) k=&conv_full_patch_stack_padded_8_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\ else if(version==5 && kern_flipped)k=&conv_full_patch_stack_padded_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\ else if(version==4 && kern_flipped)k=&conv_full_patch_stack_padded_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) k=&conv_full_patch_stack_padded_12_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) k=&conv_full_patch_stack_padded_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) k=&conv_full_patch_stack_padded_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
else assert(false); else assert(false);
CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID); void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
f<<< grid, threads, shared_size>>> (void *)kern->ga.data, (void *)&kern->ga.offset,
(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out), (void *)out->ga.data, (void *)&out->ga.offset,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, (void *)&img_len, (void *)&img_wid,
img_stride_col, img_stride_row, img_stride_stack, (void *)&kern_len, (void *)&kern_wid,
img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped, (void *)&nkern, (void *)&nstack,
kern_stride_stack, kern_stride_nkern); (void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
cudaError_t sts = cudaGetLastError(); (void *)&kern_stride_col, (void *)&kern_stride_row,
if (cudaSuccess == sts) (void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{ {
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," " n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n", " out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z, threads_per_block[0], threads_per_block[1], threads_per_block[2],
grid.x, grid.y, shared_size, n_blocks[0], n_blocks[1], shmem_sz,
threads.x * threads.y * threads.z, threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version); out_len, nb_split, version);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
...@@ -1152,12 +1133,12 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1152,12 +1133,12 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i," " n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n", " out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z, threads_per_block[0], threads_per_block[1], threads_per_block[2],
grid.x, grid.y, shared_size, n_blocks[0], n_blocks[1], shmem_sz,
threads.x * threads.y * threads.z, threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version); out_len, nb_split, version);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
...@@ -1165,7 +1146,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1165,7 +1146,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" failed (%s), trying next implementation\n", " failed (%s), trying next implementation\n",
version==3?"no split": "split", version==3?"no split": "split",
(version==5?"low_mem":"not_low_mem"), (version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
...@@ -1176,21 +1157,22 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1176,21 +1157,22 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch !work_complete) //conv_full_patch
{ {
dim3 threads(out_wid, out_len); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
dim3 grid(nbatch,nkern); size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
int shared_size=(img_size + kern_size)*sizeof(float); size_t shmem_sz = (img_size + kern_size)*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions. //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
conv_full_patch<<< grid, threads, shared_size>>> void *kernel_params[] = {
(cuda_get_ptr(img), (void *)img->ga.data, (void *)&img->ga.offset,
cuda_get_ptr(kern), (void *)kern->ga.data, (void *)&kern->ga.offset,
cuda_get_ptr(out), (void *)out->ga.data, (void *)&out->ga.offset,
img_len, img_wid, (void *)&img_len, (void *)&img_wid,
kern_len, kern_wid, (void *)&kern_len, (void *)&kern_wid,
nkern, nstack); (void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0,
cudaError_t sts = cudaGetLastError(); 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (cudaSuccess == sts)
if (err == GA_NO_ERROR)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n"); if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n");
work_complete = true; work_complete = true;
...@@ -1199,15 +1181,15 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1199,15 +1181,15 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads.x * threads.y); threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s)," "INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
} }
} }
if (false && !subsample && //disabled as test fail for this kernel if (false && !subsample && //disabled as test fail for this kernel
...@@ -1217,37 +1199,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1217,37 +1199,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything !work_complete) //conv_full_load_everything
{ {
dim3 threads(out_wid, out_len); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
dim3 grid(nbatch); size_t n_blocks[3] = {(size_t)nbatch, (size_t)1, (size_t)1};
int shared_size=(img_size + kern_size)*nstack*sizeof(float); size_t shmem_sz = (img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions. //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//typeof(conv_full_load_everything<0>) f = ; void *kernel_params[] = {
void (*f)(const float*, const float*, float*, (void *)img->ga.data, (void *)&img->ga.offset,
int, int, int, int, int, int, (void *)kern->ga.data, (void *)&kern->ga.offset,
int, int, int, int, int, int, int, int) = conv_full_load_everything<0>; (void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
f = conv_full_load_everything<THEANO_KERN_WID>; (void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
f<<< grid, threads, shared_size>>> (void *)&img_stride_col, (void *)&img_stride_row,
(cuda_get_ptr(img), (void *)&img_stride_stack, (void *)&img_stride_batch,
cuda_get_ptr(kern), (void *)&kern_stride_col, (void *)&kern_stride_row,
cuda_get_ptr(out), (void *)&kern_stride_stack, (void *)&kern_stride_nkern};
img_len, img_wid, int err = GpuKernel_call(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0,
kern_len, kern_wid, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
nkern, nstack,
PyGpuArray_STRIDES(img)[3]/4, if (err == GA_NO_ERROR)
PyGpuArray_STRIDES(img)[2]/4,
PyGpuArray_STRIDES(img)[1]/4,
PyGpuArray_STRIDES(img)[0]/4,
PyGpuArray_STRIDES(kern)[3]/4,
PyGpuArray_STRIDES(kern)[2]/4,
PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[0]/4
);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n"); if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n");
work_complete = true; work_complete = true;
...@@ -1256,14 +1227,14 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1256,14 +1227,14 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads.x * threads.y); threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'" fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n", " failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
} }
} }
...@@ -1275,32 +1246,29 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1275,32 +1246,29 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack !work_complete) //conv_full_patch_stack
{ {
dim3 threads(out_wid, out_len); size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
dim3 grid(nbatch,nkern); size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
int shared_size=(img_size + kern_size)*sizeof(float); size_t shmem_sz = (img_size + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*, GpuKernel *k = NULL;
int, int, int, int, if(!img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int, else if(!img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
int, int, int, int); else if(img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\ void *kernel_params[] = {
else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<false,true>;\ (void *)img->ga.data, (void *)&img->ga.offset,
else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>; (void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
f<<< grid, threads, shared_size>>>( (void *)&img_len, (void *)&img_wid,
cuda_get_ptr(img), (void *)&kern_len, (void *)&kern_wid,
cuda_get_ptr(kern), (void *)&nkern, (void *)&nstack,
cuda_get_ptr(out), (void *)&img_stride_col, (void *)&img_stride_row,
img_len, img_wid, (void *)&kern_stride_col, (void *)&kern_stride_row,
kern_len, kern_wid, (void *)&kern_stride_stack, (void *)&kern_stride_nkern};
nkern, nstack,img_stride_col, img_stride_row, int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern); if (err == GA_NO_ERROR)
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n"); fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
...@@ -1310,23 +1278,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1310,23 +1278,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " shmem_sz=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shared_size, threads.x * threads.y); shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n", fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(k, err));
} }
} }
if (1 && !work_complete) //conv_reference_full if (1 && !work_complete) //conv_reference_full
{ {
if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n"); if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");
int outsize = PyGpuArray_SIZE(out); size_t outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096); size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), (size_t)1, (size_t)1};
256); size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (0) if (0)
{ {
if (verbose) if (verbose)
...@@ -1334,70 +1305,67 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1334,70 +1305,67 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
if (verbose) if (verbose)
fprintf(stderr, " img : %llu %llu %llu %llu %p " fprintf(stderr, " img : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0], (unsigned long long)nbatch,
(unsigned long long)PyGpuArray_DIMS(img)[1], (unsigned long long)stack_len,
(unsigned long long)PyGpuArray_DIMS(img)[2], (unsigned long long)img_len,
(unsigned long long)PyGpuArray_DIMS(img)[3], (unsigned long long)img_wid,
cuda_get_ptr(img), (void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)PyGpuArray_STRIDES(img)[0]/4, (long long)img_stride_batch,
(long long)PyGpuArray_STRIDES(img)[1]/4, (long long)img_stride_stack,
(long long)PyGpuArray_STRIDES(img)[2]/4, (long long)img_stride_row,
(long long)PyGpuArray_STRIDES(img)[3]/4); (long long)img_stride_col);
if (verbose) if (verbose)
fprintf(stderr, " kern: %llu %llu %llu %llu %p " fprintf(stderr, " kern: %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0], (unsigned long long)nkern,
(unsigned long long)PyGpuArray_DIMS(kern)[1], (unsigned long long)nstack,
(unsigned long long)PyGpuArray_DIMS(kern)[2], (unsigned long long)kern_len,
(unsigned long long)PyGpuArray_DIMS(kern)[3], (unsigned long long)kern_wid,
cuda_get_ptr(kern), (void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)PyGpuArray_STRIDES(kern)[0]/4, (long long)kern_stride_nkern,
(long long)PyGpuArray_STRIDES(kern)[1]/4, (long long)kern_stride_stack,
(long long)PyGpuArray_STRIDES(kern)[2]/4, (long long)kern_stride_row,
(long long)PyGpuArray_STRIDES(kern)[3]/4 (long long)kern_stride_col);
);
if (verbose) if (verbose)
fprintf(stderr, " out : %llu %llu %llu %llu %p " fprintf(stderr, " out : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n", "%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0], (unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1], (unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2], (unsigned long long)out_len,
(unsigned long long)PyGpuArray_DIMS(out)[3], (unsigned long long)out_wid,
cuda_get_ptr(out), (void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)PyGpuArray_STRIDES(out)[0]/4, (long long)out_stride_batch,
(long long)PyGpuArray_STRIDES(out)[1]/4, (long long)out_stride_nkern,
(long long)PyGpuArray_STRIDES(out)[2]/4, (long long)out_stride_row,
(long long)PyGpuArray_STRIDES(out)[3]/4); (long long)out_stride_col);
if (verbose) if (verbose)
fprintf(stderr, " launch params: %i %i %i\n", fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads); outsize, n_blocks[0], threads_per_block[0]);
if (verbose) if (verbose)
fprintf(stderr, " subsample params: %llu %llu\n", fprintf(stderr, " subsample params: %llu %llu\n",
(unsigned long long)subsample_rows, (unsigned long long)subsample_rows,
(unsigned long long)subsample_cols); (unsigned long long)subsample_cols);
} }
conv_reference_full<<<n_blocks, n_threads>>>(
PyGpuArray_DIMS(img)[0], PyGpuArray_DIMS(kern)[0], void *kernel_params[] = {
PyGpuArray_DIMS(img)[1], (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
PyGpuArray_DIMS(img)[2], PyGpuArray_DIMS(img)[3], (void *)&img_len, (void *)&img_wid,
PyGpuArray_DIMS(kern)[2], PyGpuArray_DIMS(kern)[3], (void *)&kern_len, (void *)&kern_wid,
PyGpuArray_DIMS(out)[2], PyGpuArray_DIMS(out)[3], (void *)&out_len, (void *)&out_wid,
cuda_get_ptr(img), PyGpuArray_STRIDES(img)[0]/4, (void *)img->ga.data, (void *)&img->ga.offset,
PyGpuArray_STRIDES(img)[1]/4, (void *)&img_stride_batch, (void *)&img_stride_stack,
PyGpuArray_STRIDES(img)[2]/4, (void *)&img_stride_row, (void *)&img_stride_col,
PyGpuArray_STRIDES(img)[3]/4, (void *)kern->ga.data, (void *)&kern->ga.offset,
cuda_get_ptr(kern), PyGpuArray_STRIDES(kern)[0]/4, (void *)&kern_stride_nkern, (void *)&kern_stride_stack,
PyGpuArray_STRIDES(kern)[1]/4, (void *)&kern_stride_row, (void *)&kern_stride_col,
PyGpuArray_STRIDES(kern)[2]/4, (void *)out->ga.data, (void *)&out->ga.offset,
PyGpuArray_STRIDES(kern)[3]/4, (void *)&out_stride_batch, (void *)&out_stride_nkern,
cuda_get_ptr(out), PyGpuArray_STRIDES(out)[0]/4, (void *)&out_stride_row, (void *)&out_stride_col,
PyGpuArray_STRIDES(out)[1]/4, (void *)&subsample_rows, (void *)&subsample_cols};
PyGpuArray_STRIDES(out)[2]/4, int err = GpuKernel_call(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0,
PyGpuArray_STRIDES(out)[3]/4, 3, threads_per_block, n_blocks, 0, kernel_params);
subsample_rows, subsample_cols);
if (err == GA_NO_ERROR)
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{ {
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_full' version" fprintf(stderr, "INFO: used 'conv_reference_full' version"
...@@ -1410,17 +1378,18 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1410,17 +1378,18 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," fprintf(stderr, "threads_per_block[0]=%i, threads_per_block[1]=%i,"
" shared_size=%i, nb_threads=%i\n", " n_blocks[0]=%i, n_blocks[1]=%i,"
n_threads, 1, n_blocks, 1, 0, n_threads); " shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], 1, n_blocks[0], 1, 0, threads_per_block[0]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s)," fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for" "ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)", " CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts)); GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1; return -1;
} }
} }
......
...@@ -3,13 +3,20 @@ import os ...@@ -3,13 +3,20 @@ import os
import theano import theano
from theano import config, gof from theano import config, gof
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from six.moves import reduce from six.moves import reduce
from .comp import NVCC_compiler from .comp import NVCC_compiler
from .type import GpuArrayType from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from theano.gof import utils
class GpuConv(gof.Op): class GpuConv(GpuKernelBase, gof.Op):
""" """
Implement the batched and stacked 2d convolution on the gpu. Implement the batched and stacked 2d convolution on the gpu.
...@@ -223,29 +230,29 @@ class GpuConv(gof.Op): ...@@ -223,29 +230,29 @@ class GpuConv(gof.Op):
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G'] return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
def c_headers(self): def c_headers(self):
return ['<stdio.h>', 'cuda.h', if pygpu.get_default_context().kind == 'opencl':
'<gpuarray/extension.h>', '<numpy_compat.h>'] raise MethodNotDefined('cuda only')
return ['<stdint.h>', '<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 21) return (0, 21)
def c_init_code(self): def c_init_code(self):
return ['cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");'] if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
def c_support_code_apply(self, node, nodename): return ['setup_ext_cuda();']
# REMEMBER TO RAISE c_code_cache_version when changing any of
# these files
files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']
codes = ["CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);",
"float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}",
"const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}"]
codes += [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files]
return reduce(str.__add__, codes)
def c_compiler(self):
return NVCC_compiler
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp img, kern = inp
...@@ -270,8 +277,8 @@ class GpuConv(gof.Op): ...@@ -270,8 +277,8 @@ class GpuConv(gof.Op):
//Optional args //Optional args
int version = %(version)s; int version = %(version)s;
int verbose = %(verbose)s; int verbose = %(verbose)s;
int dx = %(dx)s; size_t dx = %(dx)s;
int dy = %(dy)s; size_t dy = %(dy)s;
int mode; int mode;
if (strcmp(mode_str, "full") == 0) if (strcmp(mode_str, "full") == 0)
...@@ -286,7 +293,7 @@ class GpuConv(gof.Op): ...@@ -286,7 +293,7 @@ class GpuConv(gof.Op):
{ {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"mode must be one of 'full' or 'valid'"); "mode must be one of 'full' or 'valid'");
return NULL; return 0;
} }
// TODO, make out be decref before we alloc out2! // TODO, make out be decref before we alloc out2!
...@@ -303,3 +310,261 @@ class GpuConv(gof.Op): ...@@ -303,3 +310,261 @@ class GpuConv(gof.Op):
%(fail)s %(fail)s
} }
""" % sub """ % sub
def c_support_code_apply(self, node, name):
    """Return the per-apply C support code for this op.

    Builds the CUDA source by prepending a ``#define THEANO_KERN_WID``
    (kernel width specialization) to the concatenated contents of
    conv_kernel.cu and conv_full_kernel.cu, compiles it once through
    pygpu to obtain the kernel binary, and emits:

    - ``conv_bcode``: the compiled binary as a C byte array,
    - ``conv_code``: the escaped source as a C string literal,
    - one ``static GpuKernel`` variable per kernel descriptor,
    - the host-side driver code from conv.cu.
    """
    # Specialize for a fixed kernel width when it is known at
    # graph-construction time (0 means unknown/dynamic width).
    nb = 0
    if self.kshp is not None:
        nb = self.kshp[1]
    kernels = self.gpu_kernels(node, name)
    k = kernels[0]
    code = """
#define THEANO_KERN_WID %(nb)d
""" % locals()
    code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
                       for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
    # Compile once to get the binary; any single kernel descriptor works
    # here since all kernels live in the same source module.
    # NOTE(review): gk._binary is a private pygpu attribute — confirm it
    # is still exposed by the installed pygpu version.
    gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
    binary = gk._binary
    bcode = ','.join(hex(ord(c)) for c in binary)
    # Escape the source so it can be embedded as a C string literal.
    code = code.replace('\\', '\\\\')
    code = code.replace('"', '\\"')
    code = code.replace('\n', '\\n')
    mod = """
static const char conv_bcode[] = {%(bcode)s};
static const char *conv_code = "%(code)s";
""" % locals()
    for k in kernels:
        mod += "static GpuKernel " + k.name + '_' + name + ";\n"
    mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read()
    return mod
@utils.memoize
def gpu_kernels(self, node, name):
    """Collect every kernel descriptor used by this op.

    Memoized so the descriptor list (and its embedded variable names)
    is built only once per (node, name) pair.
    """
    dtypes = [i.dtype for i in node.inputs]
    dtypes.extend(o.dtype for o in node.outputs)
    flags = Kernel.get_flags(*dtypes)
    # Each builder returns the descriptors for one kernel family;
    # keep this order stable since c_support_code_apply relies on it.
    builders = (self.conv_patch_kernels,
                self.conv_patch_stack_kernels,
                self.conv_patch_stack_reduce_kernels,
                self.conv_rows_kernels,
                self.conv_rows_stack_kernels,
                self.conv_rows_stack2_kernels,
                self.conv_valid_row_reduce_kernels,
                self.conv_reference_valid_kernels,
                self.conv_reference_full_kernels,
                self.conv_full_patch_kernels,
                self.conv_full_patch_stack_kernels,
                self.conv_full_patch_stack_padded_kernels,
                self.conv_full_load_everything_kernels)
    kernels = []
    for build in builders:
        kernels.extend(build(name, flags))
    return kernels
def conv_patch_kernels(self, name, flags):
    """Kernel descriptors for the conv_patch_%d family (variants 2 and 3)."""
    # 3 (buffer, offset) pairs followed by 6 integer arguments.
    arg_types = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 6
    kernels = []
    for variant in (2, 3):
        kernels.append(Kernel(None, arg_types,
                              "conv_patch_%d" % variant, flags,
                              'conv_code', 'conv_bcode',
                              "conv_patch_%d_" % variant + name))
    return kernels
def conv_patch_stack_kernels(self, name, flags):
    """Kernel descriptors for conv_patch_stack_%d (variants 64 through 95)."""
    # 3 (buffer, offset) pairs followed by 18 integer arguments.
    arg_types = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 18
    kernels = []
    for variant in range(64, 96):
        kernels.append(Kernel(None, arg_types,
                              "conv_patch_stack_%d" % variant, flags,
                              'conv_code', 'conv_bcode',
                              "conv_patch_stack_%d_" % variant + name))
    return kernels
def conv_patch_stack_reduce_kernels(self, name, flags):
    """Kernel descriptors for the conv_patch_stack_reduce_%d family."""
    # 3 (buffer, offset) pairs followed by 14 integer arguments.
    arg_types = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 14
    variants = (1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15)
    kernels = []
    for variant in variants:
        kernels.append(Kernel(None, arg_types,
                              "conv_patch_stack_reduce_%d" % variant, flags,
                              'conv_code', 'conv_bcode',
                              "conv_patch_stack_reduce_%d_" % variant + name))
    return kernels
def conv_rows_kernels(self, name, flags):
    """Kernel descriptors for conv_rows_%d (variants 0 and 1)."""
    # 3 (buffer, offset) pairs followed by 14 integer arguments.
    arg_types = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 14
    kernels = []
    for variant in (0, 1):
        kernels.append(Kernel(None, arg_types,
                              "conv_rows_%d" % variant, flags,
                              'conv_code', 'conv_bcode',
                              "conv_rows_%d_" % variant + name))
    return kernels
def conv_rows_stack_kernels(self, name, flags):
    """Kernel descriptors for conv_rows_stack_%d (variants 0 and 1)."""
    # 3 (buffer, offset) pairs followed by 14 integer arguments.
    arg_types = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 14
    kernels = []
    for variant in (0, 1):
        kernels.append(Kernel(None, arg_types,
                              "conv_rows_stack_%d" % variant, flags,
                              'conv_code', 'conv_bcode',
                              "conv_rows_stack_%d_" % variant + name))
    return kernels
def conv_rows_stack2_kernels(self, name, flags):
    """Kernel descriptors for conv_rows_stack2_%d (variants 0 through 3)."""
    # 3 (buffer, offset) pairs followed by 14 integer arguments.
    arg_types = [gpuarray.GpuArray, 'uintp'] * 3 + ['intc'] * 14
    kernels = []
    for variant in (0, 1, 2, 3):
        kernels.append(Kernel(None, arg_types,
                              "conv_rows_stack2_%d" % variant, flags,
                              'conv_code', 'conv_bcode',
                              "conv_rows_stack2_%d_" % variant + name))
    return kernels
def conv_valid_row_reduce_kernels(self, name, flags):
    """Kernel descriptors for conv_valid_row_reduce_%d (variants 0 and 1)."""
    # Each array argument is a (buffer, offset) pair trailed by 4 ints;
    # 9 leading ints, 3 such array groups, 3 trailing ints.
    array_group = [gpuarray.GpuArray, 'uintp'] + ['intc'] * 4
    arg_types = ['intc'] * 9 + array_group * 3 + ['intc'] * 3
    kernels = []
    for variant in (0, 1):
        kernels.append(Kernel(None, arg_types,
                              "conv_valid_row_reduce_%d" % variant, flags,
                              'conv_code', 'conv_bcode',
                              "conv_valid_row_reduce_%d_" % variant + name))
    return kernels
def conv_reference_valid_kernels(self, name, flags):
    """Descriptor for the single conv_reference_valid kernel."""
    # Each array argument is a (buffer, offset) pair trailed by 4 ints;
    # 9 leading ints, 3 such array groups, 2 trailing ints.
    array_group = [gpuarray.GpuArray, 'uintp'] + ['intc'] * 4
    arg_types = ['intc'] * 9 + array_group * 3 + ['intc'] * 2
    return [Kernel(None, arg_types, "conv_reference_valid", flags,
                   'conv_code', 'conv_bcode',
                   "conv_reference_valid_" + name)]
def conv_reference_full_kernels(self, name, flags):
    """Build the single Kernel object for the conv_reference_full kernel.

    Same parameter layout as conv_reference_valid_kernels: 9 'intc', then
    three (GpuArray, 'uintp' offset) pairs each followed by 4 'intc'
    strides, and 2 trailing 'intc' arguments.
    """
    param_types = (
        ['intc'] * 9 +
        [gpuarray.GpuArray, 'uintp'] + ['intc'] * 4 +
        [gpuarray.GpuArray, 'uintp'] + ['intc'] * 4 +
        [gpuarray.GpuArray, 'uintp'] + ['intc'] * 4 +
        ['intc'] * 2
    )
    kernel = Kernel(None, param_types, "conv_reference_full", flags,
                    'conv_code', 'conv_bcode',
                    "conv_reference_full_" + name)
    return [kernel]
def conv_full_patch_kernels(self, name, flags):
    """Build the single Kernel object for the conv_full_patch kernel.

    Layout: three (GpuArray, 'uintp' offset) pairs for img/kern/out,
    then 6 'intc' size arguments.
    """
    param_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
    ] + ['intc'] * 6
    kernel = Kernel(None, param_types, "conv_full_patch", flags,
                    'conv_code', 'conv_bcode',
                    "conv_full_patch_" + name)
    return [kernel]
def conv_full_patch_stack_kernels(self, name, flags):
    """Build the Kernel objects for the four conv_full_patch_stack_{0..3} kernels.

    The numeric suffix selects a template instantiation of the CUDA-side
    conv_full_patch_stack (contiguity flags).  Layout: three
    (GpuArray, 'uintp' offset) pairs, then 12 'intc' size/stride arguments.
    """
    param_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
    ] + ['intc'] * 12
    kernels = []
    for variant in (0, 1, 2, 3):
        kernels.append(
            Kernel(None, param_types, "conv_full_patch_stack_%d" % variant,
                   flags, 'conv_code', 'conv_bcode',
                   ("conv_full_patch_stack_%d_" + name) % variant))
    return kernels
def conv_full_patch_stack_padded_kernels(self, name, flags):
    """Build the Kernel objects for the conv_full_patch_stack_padded_* kernels.

    The suffix encodes the CUDA template flags as bits; suffixes 3, 7, 11
    and 15 are absent because the split/low_mem combination is not
    instantiated on the CUDA side.  Layout: three (GpuArray, 'uintp'
    offset) pairs, then 14 'intc' size/stride arguments.
    """
    param_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
    ] + ['intc'] * 14
    kernels = []
    for variant in (0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14):
        kernels.append(
            Kernel(None, param_types,
                   "conv_full_patch_stack_padded_%d" % variant, flags,
                   'conv_code', 'conv_bcode',
                   ("conv_full_patch_stack_padded_%d_" + name) % variant))
    return kernels
def conv_full_load_everything_kernels(self, name, flags):
    """Build the single Kernel object for the conv_full_load_everything kernel.

    Layout: three (GpuArray, 'uintp' offset) pairs for img/kern/out,
    then 14 'intc' size/stride arguments.
    """
    param_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
    ] + ['intc'] * 14
    kernel = Kernel(None, param_types, "conv_full_load_everything", flags,
                    'conv_code', 'conv_bcode',
                    "conv_full_load_everything_" + name)
    return [kernel]
extern __shared__ float s_data[];
//we store the full image and the full kernel in the shared memory //we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output //each thread compute only one value for the output
//thread block size=out_wid, out_len/nb_split //thread block size=out_wid, out_len/nb_split
//grid block size=batch_id //grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void extern "C" __global__ void
conv_full_patch_split(const float* img, const float* kern, float* out, conv_full_patch_split(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int nb_split) int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1; out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x; int batch_id = blockIdx.x;
// Thread index // Thread index
...@@ -60,18 +67,23 @@ conv_full_patch_split(const float* img, const float* kern, float* out, ...@@ -60,18 +67,23 @@ conv_full_patch_split(const float* img, const float* kern, float* out,
//thread block size=out_wid, out_len //thread block size=out_wid, out_len
//grid block size=batch_id, nkern //grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void extern "C" __global__ void
conv_full_patch( const float* img, const float* kern, float* out, conv_full_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack) int kern_len, int kern_wid, int nkern, int nstack)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1; out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x; int batch_id = blockIdx.x;
// Thread index // Thread index
...@@ -114,6 +126,8 @@ conv_full_patch( const float* img, const float* kern, float* out, ...@@ -114,6 +126,8 @@ conv_full_patch( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
//we store the full image and the full kernel in the shared memory //we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output //each thread compute only one value for the output
//thread block size=out_wid, out_len //thread block size=out_wid, out_len
...@@ -122,8 +136,10 @@ conv_full_patch( const float* img, const float* kern, float* out, ...@@ -122,8 +136,10 @@ conv_full_patch( const float* img, const float* kern, float* out,
//template c_contiguous: if true, the img and kern have are column and row contiguous else we use the stride value from the param. The image need to be c_contiguous in the nbatch and nstack dimensions. //template c_contiguous: if true, the img and kern have are column and row contiguous else we use the stride value from the param. The image need to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d> template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void __device__ inline void
conv_full_patch_stack( const float* img, const float* kern, float* out, conv_full_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
...@@ -131,12 +147,15 @@ conv_full_patch_stack( const float* img, const float* kern, float* out, ...@@ -131,12 +147,15 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1; out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;//blockDim.z* nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
const float __shared__ *kern_, *img_; const float __shared__ *kern_, *img_;
extern __shared__ float s_data[];
const int batch_id = blockIdx.x; const int batch_id = blockIdx.x;
const int nkern_id = blockIdx.y; const int nkern_id = blockIdx.y;
...@@ -182,6 +201,36 @@ conv_full_patch_stack( const float* img, const float* kern, float* out, ...@@ -182,6 +201,36 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/* C-linkage __global__ entry points for the conv_full_patch_stack template.
 *
 * Each generated kernel simply forwards to
 * conv_full_patch_stack<img_c_contiguous_2d, kern_c_contiguous_2d>.
 * The numeric suffix encodes the two template flags as bits
 * (bit 1 = img contiguous, bit 0 = kern contiguous), matching the
 * "conv_full_patch_stack_%d" names built on the Python side.
 *
 * NOTE(review): the helper macro was renamed from __INSTANTIATE_... to
 * INSTANTIATE_...: identifiers containing a double underscore are reserved
 * for the implementation in C++ ([lex.name]).  The macro is defined and
 * #undef'd locally, so the rename is not observable outside this block.
 */
extern "C" {

#define INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \
conv_full_patch_stack_##suffix( \
        const float *img, const size_t img_offset, \
        const float *kern, const size_t kern_offset, \
        float *out, const size_t out_offset, \
        int img_len, int img_wid, \
        int kern_len, int kern_wid, int nkern, int nstack, \
        int img_stride_col, int img_stride_row, \
        int kern_stride_col, int kern_stride_row, \
        int kern_stride_stack, int kern_stride_nkern) \
{ \
    conv_full_patch_stack<__VA_ARGS__>( \
        img, img_offset, kern, kern_offset, out, out_offset, \
        img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}

INSTANTIATE_CONV_FULL_PATCH_STACK(0, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK(1, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK(2, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)

#undef INSTANTIATE_CONV_FULL_PATCH_STACK
}
/** /**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory. * As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency. * I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency.
...@@ -200,24 +249,36 @@ conv_full_patch_stack( const float* img, const float* kern, float* out, ...@@ -200,24 +249,36 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
* template low_mem: if true, as split but with use less dynamic shared memory but use more registers. * template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
* if you set split and low_mem to true, we will use the low_mem version! * if you set split and low_mem to true, we will use the low_mem version!
*/ */
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem > template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__global__ void __device__ inline void
conv_full_patch_stack_padded( const float* img, const float* kern, float* out, conv_full_patch_stack_padded( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int img_len, const int img_wid,
const int kern_len, const int kern_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch, const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row, int kern_stride_col, int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern) const int kern_stride_stack, const int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1; out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
__shared__ int batch_id, kern_id, img_wid_valid, nb_rows; __shared__ int batch_id, kern_id, img_wid_valid, nb_rows;
batch_id = blockIdx.x; batch_id = blockIdx.x;
kern_id = blockIdx.y; kern_id = blockIdx.y;
...@@ -257,7 +318,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -257,7 +318,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
} }
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
...@@ -292,7 +353,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -292,7 +353,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
} }
if(out_row<out_len) if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
...@@ -340,7 +401,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -340,7 +401,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
} }
} }
if(out_row<out_len) if(out_row<out_len)
...@@ -351,6 +412,46 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -351,6 +412,46 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
} }
} }
/* C-linkage __global__ entry points for the conv_full_patch_stack_padded
 * template.
 *
 * Each generated kernel forwards to
 * conv_full_patch_stack_padded<flipped_kern, c_contiguous, split, low_mem>.
 * The suffix encodes the four template flags as bits
 * (bit 3 = flipped_kern, bit 2 = c_contiguous, bit 1 = split,
 * bit 0 = low_mem); suffixes 3, 7, 11 and 15 are deliberately absent
 * because the split+low_mem combination falls back to the low_mem path
 * (see the template's doc comment) and is never requested.
 *
 * NOTE(review): the helper macro was renamed from __INSTANTIATE_... to
 * INSTANTIATE_...: double-underscore identifiers are reserved for the
 * implementation in C++ ([lex.name]).  The macro is #undef'd below, so the
 * rename is purely local.
 */
extern "C" {

#define INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \
conv_full_patch_stack_padded_##suffix( \
        const float *img, const size_t img_offset, \
        const float *kern, const size_t kern_offset, \
        float *out, const size_t out_offset, \
        const int img_len, const int img_wid, \
        const int kern_len, const int kern_wid, \
        const int nkern, const int nstack, \
        const int img_stride_col, const int img_stride_row, \
        const int img_stride_stack, const int img_stride_batch, \
        const int kern_stride_col, const int kern_stride_row, \
        const int kern_stride_stack, const int kern_stride_nkern) \
{ \
    conv_full_patch_stack_padded<__VA_ARGS__>( \
        img, img_offset, kern, kern_offset, out, out_offset, \
        img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}

INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(0, false, false, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(1, false, false, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(2, false, false, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(4, false, true, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(5, false, true, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(6, false, true, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(8, true, false, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(9, true, false, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(10, true, false, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(12, true, true, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(13, true, true, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)

#undef INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy) template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{ {
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ; return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ;
...@@ -364,9 +465,10 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co ...@@ -364,9 +465,10 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
{ {
return x[0] * y[0]; return x[0] * y[0];
} }
template<int NSTACK> extern "C" __global__ void
__global__ void conv_full_load_everything( const float* img, const size_t img_offset,
conv_full_load_everything( const float* img, const float* kern, float* out, const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
...@@ -375,12 +477,15 @@ conv_full_load_everything( const float* img, const float* kern, float* out, ...@@ -375,12 +477,15 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1; out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x; nb_thread_id = blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x; int batch_id = blockIdx.x;
const int out_col = threadIdx.x;//output col const int out_col = threadIdx.x;//output col
...@@ -423,9 +528,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out, ...@@ -423,9 +528,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
{ {
int icol = out_col - kern_wid+1+col; int icol = out_col - kern_wid+1+col;
if (icol < 0 || icol > img_wid) continue; if (icol < 0 || icol > img_wid) continue;
if (NSTACK > 0) if (THEANO_KERN_WID > 0)
{ {
sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid, sum += everything_dot<THEANO_KERN_WID>(d_img + irow*img_wid + icol, img_len*img_wid,
d_kern + row*kern_wid+col, kern_len*kern_wid); d_kern + row*kern_wid+col, kern_len*kern_wid);
} }
else else
...@@ -443,6 +548,8 @@ conv_full_load_everything( const float* img, const float* kern, float* out, ...@@ -443,6 +548,8 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
__syncthreads(); //don't start loading another kernel until we're done here __syncthreads(); //don't start loading another kernel until we're done here
} }
} }
/* /*
Local Variables: Local Variables:
mode:c++ mode:c++
......
...@@ -29,7 +29,6 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -29,7 +29,6 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
*/ */
#ifndef CONV_KERNEL_CU #ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU #define CONV_KERNEL_CU
#include <stdint.h>
/* /*
#define CHECK_BANK_CONFLICTS 0 #define CHECK_BANK_CONFLICTS 0
...@@ -182,12 +181,11 @@ template<> __device__ float convolutionRowNoFlip<0>(const float *data, ...@@ -182,12 +181,11 @@ template<> __device__ float convolutionRowNoFlip<0>(const float *data,
return 0; return 0;
} }
template<int KERN_WIDTH>
__device__ void convolutionRowNoFlip(float& sum, __device__ void convolutionRowNoFlip(float& sum,
const float *data, const float *data,
const float *kern, const int kern_wid){ const float *kern, const int kern_wid){
if(KERN_WIDTH>0) if(THEANO_KERN_WID>0)
sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern); sum+=convolutionRowNoFlip<THEANO_KERN_WID>(data,kern);
else else
#pragma unroll 8 #pragma unroll 8
for (int col=0; col < kern_wid; col++) {//loop over col for (int col=0; col < kern_wid; col++) {//loop over col
...@@ -219,13 +217,20 @@ __device__ void store_or_accumulate(float& dst,const float value ){ ...@@ -219,13 +217,20 @@ __device__ void store_or_accumulate(float& dst,const float value ){
* When true, allow for output image bigger then 512 pixel. * When true, allow for output image bigger then 512 pixel.
* Use more registers. * Use more registers.
*/ */
template<bool flipped_kern, int KERN_WIDTH, bool split> template<bool flipped_kern, bool split>
__global__ void __device__ inline void
conv_patch( const float* img, const float* kern, float* out, conv_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack) int nkern, int nstack)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1; out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1; out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
...@@ -260,7 +265,7 @@ conv_patch( const float* img, const float* kern, float* out, ...@@ -260,7 +265,7 @@ conv_patch( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
blockIdx.y*out_wid*out_len+//the output image blockIdx.y*out_wid*out_len+//the output image
...@@ -271,7 +276,7 @@ conv_patch( const float* img, const float* kern, float* out, ...@@ -271,7 +276,7 @@ conv_patch( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
...@@ -280,6 +285,28 @@ conv_patch( const float* img, const float* kern, float* out, ...@@ -280,6 +285,28 @@ conv_patch( const float* img, const float* kern, float* out,
} }
} }
/* C-linkage __global__ entry points for the conv_patch template.
 *
 * Each generated kernel forwards to conv_patch<flipped_kern, split>.
 * The suffix encodes the flags as bits (bit 1 = flipped_kern,
 * bit 0 = split); only the flipped_kern=true variants (2 and 3) are
 * instantiated here.
 *
 * NOTE(review): the helper macro was renamed from __INSTANTIATE_... to
 * INSTANTIATE_...: double-underscore identifiers are reserved for the
 * implementation in C++ ([lex.name]).  The macro is #undef'd below, so the
 * rename is purely local.
 */
extern "C" {

#define INSTANTIATE_CONV_PATCH(suffix, ...) \
__global__ void \
conv_patch_##suffix(const float *img, const size_t img_offset, \
        const float *kern, const size_t kern_offset, \
        float *out, const size_t out_offset, \
        int img_len, int img_wid, int kern_len, int kern_wid, \
        int nkern, int nstack) \
{ \
    conv_patch<__VA_ARGS__>(img, img_offset, kern, kern_offset, \
        out, out_offset, img_len, img_wid, kern_len, \
        kern_wid, nkern, nstack); \
}

INSTANTIATE_CONV_PATCH(2, true, false)
INSTANTIATE_CONV_PATCH(3, true, true)

#undef INSTANTIATE_CONV_PATCH
}
/** /**
* As conv_patch, but implement the stack in the kernel. * As conv_patch, but implement the stack in the kernel.
* I keep it separated from conv_patch as we take more registers and this could lower the occupency. * I keep it separated from conv_patch as we take more registers and this could lower the occupency.
...@@ -295,16 +322,17 @@ conv_patch( const float* img, const float* kern, float* out, ...@@ -295,16 +322,17 @@ conv_patch( const float* img, const float* kern, float* out,
* dy: patch stride cols(1 for normal convolution) * dy: patch stride cols(1 for normal convolution)
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result * template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
* template img_c_contiguous_2d: if true, the img have are collon and row contiguous * template img_c_contiguous_2d: if true, the img have are collon and row contiguous
* template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous * template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
* template split: if true, each thread generate more than 1 output pixel, but use more registers. * template split: if true, each thread generate more than 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time. * template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
* template subsample: if false, remove some computation needed when dx or dy!=1. * template subsample: if false, remove some computation needed when dx or dy!=1.
*/ */
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample> template<bool flipped_kern, bool accumulate, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void __device__ inline void
conv_patch_stack( const float* img, const float* kern, float* out, conv_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid, int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
...@@ -313,6 +341,11 @@ conv_patch_stack( const float* img, const float* kern, float* out, ...@@ -313,6 +341,11 @@ conv_patch_stack( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern, int dx, int dy) int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{ {
int __shared__ nb_thread_id; int __shared__ nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[]; extern __shared__ float s_data[];
...@@ -365,7 +398,7 @@ conv_patch_stack( const float* img, const float* kern, float* out, ...@@ -365,7 +398,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
else else
idx_in=&d_img[(row+out_row)*img_wid+out_col]; idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
...@@ -425,7 +458,7 @@ conv_patch_stack( const float* img, const float* kern, float* out, ...@@ -425,7 +458,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
//as we store the result of only the good thread. //as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card. //This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len) if(out_row<out_len)
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
...@@ -440,6 +473,67 @@ conv_patch_stack( const float* img, const float* kern, float* out, ...@@ -440,6 +473,67 @@ conv_patch_stack( const float* img, const float* kern, float* out,
} }
/* C-linkage __global__ entry points for the conv_patch_stack template.
 *
 * Each generated kernel forwards to
 * conv_patch_stack<flipped_kern, accumulate, img_c_contiguous_2d,
 *                  kern_c_contiguous_2d, split, preload_full_kern,
 *                  subsample>.
 * The suffix is the 7 template flags packed as bits, most significant
 * first (bit 6 = flipped_kern ... bit 0 = subsample).  This block covers
 * suffixes 64-95, i.e. the flipped_kern=true, accumulate=false subset;
 * other combinations may be instantiated elsewhere in the file.
 *
 * NOTE(review): the helper macro was renamed from __INSTANTIATE_... to
 * INSTANTIATE_...: double-underscore identifiers are reserved for the
 * implementation in C++ ([lex.name]).  The macro is #undef'd below, so the
 * rename is purely local.
 */
extern "C" {

#define INSTANTIATE_CONV_PATCH_STACK(suffix, ...) \
__global__ void \
conv_patch_stack_##suffix(const float *img, const size_t img_offset, \
        const float *kern, const size_t kern_offset, \
        float *out, const size_t out_offset, \
        int img_len, int img_wid, int kern_len, int kern_wid, \
        int out_len, int out_wid, int nkern, int nstack, \
        int img_stride_col, int img_stride_row, \
        int img_stride_stack, int img_stride_batch, \
        int kern_stride_col, int kern_stride_row, \
        int kern_stride_stack, int kern_stride_nkern, \
        int dx, int dy) \
{ \
    conv_patch_stack<__VA_ARGS__>( \
        img, img_offset, kern, kern_offset, out, out_offset, \
        img_len, img_wid, kern_len, kern_wid, out_len, \
        out_wid, nkern, nstack, img_stride_col, img_stride_row, \
        img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern, dx, dy); \
}

INSTANTIATE_CONV_PATCH_STACK(64, true, false, false, false, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(65, true, false, false, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(66, true, false, false, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(67, true, false, false, false, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(68, true, false, false, false, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(69, true, false, false, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(70, true, false, false, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(71, true, false, false, false, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(72, true, false, false, true, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(73, true, false, false, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(74, true, false, false, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(75, true, false, false, true, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(76, true, false, false, true, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(77, true, false, false, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(78, true, false, false, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(79, true, false, false, true, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(80, true, false, true, false, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(81, true, false, true, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(82, true, false, true, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(83, true, false, true, false, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(84, true, false, true, false, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(85, true, false, true, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(86, true, false, true, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(87, true, false, true, false, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(88, true, false, true, true, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(89, true, false, true, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(90, true, false, true, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(91, true, false, true, true, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(92, true, false, true, true, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(93, true, false, true, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(94, true, false, true, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)

#undef INSTANTIATE_CONV_PATCH_STACK
}
/** /**
* As conv_patch_stack, but kern_len thread for each output pixel * As conv_patch_stack, but kern_len thread for each output pixel
* I keep it separated as use more register. * I keep it separated as use more register.
...@@ -454,9 +548,11 @@ conv_patch_stack( const float* img, const float* kern, float* out, ...@@ -454,9 +548,11 @@ conv_patch_stack( const float* img, const float* kern, float* out,
* template img_contiguous: if true, the img have are collon and row contiguous * template img_contiguous: if true, the img have are collon and row contiguous
* template preload_full_kern: work only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel row at a time * template preload_full_kern: work only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel row at a time
*/ */
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern> template<bool flipped_kern, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void __device__ inline void
conv_patch_stack_reduce( const float* img, const float* kern, float* out, conv_patch_stack_reduce( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
...@@ -470,6 +566,17 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -470,6 +566,17 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
const int out_len = blockDim.y; const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x; const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
extern __shared__ float s_data[]; extern __shared__ float s_data[];
int batch_id = blockIdx.x; int batch_id = blockIdx.x;
...@@ -521,7 +628,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -521,7 +628,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0; float sum2 = 0;
if(tz<len3) if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2; sum+=sum2;
} }
}else if(split){ }else if(split){
...@@ -531,7 +638,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -531,7 +638,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
for(int row=tz;row<kern_len;row+=blockDim.z){ for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
}else{ }else{
int row = tz;//The row of the kernel. int row = tz;//The row of the kernel.
...@@ -540,7 +647,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -540,7 +647,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
...@@ -559,6 +666,49 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -559,6 +666,49 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
} }
} }
extern "C" {
#define __INSTANTIATE_CONV_PATCH_STACK_REDUCE(suffix, ...) \
__global__ void \
conv_patch_stack_reduce_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, int img_stride_col, int img_stride_row, \
int img_stride_stack, int img_stride_batch, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_patch_stack_reduce<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE#(0, false, false, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(1, false, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(2, false, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(3, false, false, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE#(4, false, true, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(5, false, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(6, false, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(7, false, true, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(9, true, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(10, true, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(11, true, false, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(13, true, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(14, true, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true)
#undef __INSTANTIATE_CONV_PATCH_STACK_REDUCE
}
/** /**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* we store kern_len row of the image and the full kernel in the shared memory * we store kern_len row of the image and the full kernel in the shared memory
...@@ -570,9 +720,11 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -570,9 +720,11 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image in the shared memory. * Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>. * I.E. work for bigger image then conv_patch<split=true,...>.
*/ */
template<int KERN_WIDTH, bool c_contiguous> template<bool c_contiguous>
__global__ void __device__ inline void
conv_rows( const float* img, const float* kern, float* out, conv_rows( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
...@@ -582,6 +734,11 @@ conv_rows( const float* img, const float* kern, float* out, ...@@ -582,6 +734,11 @@ conv_rows( const float* img, const float* kern, float* out,
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1; out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1; out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
...@@ -612,7 +769,7 @@ conv_rows( const float* img, const float* kern, float* out, ...@@ -612,7 +769,7 @@ conv_rows( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row)*img_wid+out_col]; const float* idx_in=&d_img[(row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
...@@ -620,6 +777,36 @@ conv_rows( const float* img, const float* kern, float* out, ...@@ -620,6 +777,36 @@ conv_rows( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
extern "C" {
#define __INSTANTIATE_CONV_ROWS(suffix, ...) \
__global__ void \
conv_rows_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
int img_stride_stack, int img_stride_batch, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_rows<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_ROWS(0, false)
__INSTANTIATE_CONV_ROWS(1, true)
#undef __INSTANTIATE_CONV_ROWS
}
/** /**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows, but implement the stack. Separate as this use more register. * as conv_rows, but implement the stack. Separate as this use more register.
...@@ -631,9 +818,11 @@ conv_rows( const float* img, const float* kern, float* out, ...@@ -631,9 +818,11 @@ conv_rows( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image in the shared memory. * Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>. * I.E. work for bigger image then conv_patch<split=true,...>.
*/ */
template<int KERN_WIDTH, bool c_contiguous> template<bool c_contiguous>
__global__ void __device__ inline void
conv_rows_stack( const float* img, const float* kern, float* out, conv_rows_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
...@@ -643,6 +832,11 @@ conv_rows_stack( const float* img, const float* kern, float* out, ...@@ -643,6 +832,11 @@ conv_rows_stack( const float* img, const float* kern, float* out,
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1; out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1; out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
...@@ -708,7 +902,7 @@ conv_rows_stack( const float* img, const float* kern, float* out, ...@@ -708,7 +902,7 @@ conv_rows_stack( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads();//to be sure all thread have finished before we modif the shared memory. __syncthreads();//to be sure all thread have finished before we modif the shared memory.
} }
...@@ -718,6 +912,38 @@ conv_rows_stack( const float* img, const float* kern, float* out, ...@@ -718,6 +912,38 @@ conv_rows_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
extern "C" {
#define __INSTANTIATE_CONV_ROWS_STACK(suffix, ...) \
__global__ void \
conv_rows_stack_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
const int img_stride_col, const int img_stride_row, \
const int img_stride_stack, const int img_stride_batch, \
const int kern_stride_col, const int kern_stride_row, \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_rows_stack<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_ROWS_STACK(0, false)
__INSTANTIATE_CONV_ROWS_STACK(1, true)
#undef __INSTANTIATE_CONV_ROWS_STACK
}
/** /**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row. * as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row.
...@@ -729,9 +955,11 @@ conv_rows_stack( const float* img, const float* kern, float* out, ...@@ -729,9 +955,11 @@ conv_rows_stack( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image and kernel in the shared memory. * Diff with conv_patch: don't store the full image and kernel in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>. * I.E. work for bigger image then conv_patch<split=true,...>.
*/ */
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern> template<bool c_contiguous, bool preload_full_kern>
__global__ void __device__ inline void
conv_rows_stack2(const float* img, const float* kern, float* out, conv_rows_stack2(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
...@@ -741,6 +969,11 @@ conv_rows_stack2(const float* img, const float* kern, float* out, ...@@ -741,6 +969,11 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1; out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1; out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
...@@ -804,7 +1037,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out, ...@@ -804,7 +1037,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col]; const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
float sum_ =0.0f; float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass by an intermediate variable to have more precission. sum+=sum_;//We pass by an intermediate variable to have more precission.
} }
} }
...@@ -816,6 +1049,39 @@ conv_rows_stack2(const float* img, const float* kern, float* out, ...@@ -816,6 +1049,39 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
extern "C" {
#define __INSTANTIATE_CONV_ROWS_STACK2(suffix, ...) \
__global__ void \
conv_rows_stack2_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
const int img_stride_col, const int img_stride_row, \
const int img_stride_stack, const int img_stride_batch, \
const int kern_stride_col, const int kern_stride_row, \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_rows_stack2<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_ROWS_STACK2(0, false, false)
__INSTANTIATE_CONV_ROWS_STACK2(1, false, true)
__INSTANTIATE_CONV_ROWS_STACK2(2, true, false)
__INSTANTIATE_CONV_ROWS_STACK2(3, true, true)
#undef __INSTANTIATE_CONV_ROWS_STACK2
}
/** /**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the * Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel. * kernel-image inner-product in parallel.
...@@ -826,18 +1092,23 @@ conv_rows_stack2(const float* img, const float* kern, float* out, ...@@ -826,18 +1092,23 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
* TODO: explain parameters, preconditions * TODO: explain parameters, preconditions
*/ */
template<bool stack_loop> template<bool stack_loop>
__global__ void __device__ inline void
conv_valid_row_reduce(int nB, int nK, int stacklen, conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical int out_len, int out_wid, //physical
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, const size_t img_offset, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, const float *kern, const size_t kern_offset, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C , float *out, const size_t out_offset, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols, int subsample_rows, int subsample_cols,
const int initial_reduce_boundary) const int initial_reduce_boundary)
{ {
const int outsize = nB * nK * out_len * out_wid; const int outsize = nB * nK * out_len * out_wid;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
extern __shared__ float reducebuf[]; extern __shared__ float reducebuf[];
for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x) for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
{ {
...@@ -911,6 +1182,36 @@ conv_valid_row_reduce(int nB, int nK, int stacklen, ...@@ -911,6 +1182,36 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
} }
} }
extern "C" {
#define __INSTANTIATE_CONV_VALID_ROW_REDUCE(suffix, ...) \
__global__ void \
conv_valid_row_reduce_##suffix( \
int nB, int nK, int stacklen, int img_len, int img_wid, \
int kern_len, int kern_wid, int out_len, int out_wid, \
const float *img, const size_t img_offset, \
int img_str_B, int img_str_S, int img_str_R, int img_str_C, \
const float *kern, const size_t kern_offset, \
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \
float *out, const size_t out_offset, \
int out_str_B, int out_str_K, int out_str_R, int out_str_C, \
int subsample_rows, int subsample_cols, \
const int initial_reduce_boundary) \
{ \
conv_valid_row_reduce<__VA_ARGS__>( \
nB, nK, stacklen, img_len, img_wid, \
kern_len, kern_wid, out_len, out_wid, \
img, img_offset, img_str_B, img_str_S, img_str_R, img_str_C, \
kern, kern_offset, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
out, out_offset, out_str_B, out_str_K, out_str_R, out_str_C, \
subsample_rows, subsample_cols, initial_reduce_boundary); \
}
__INSTANTIATE_CONV_VALID_ROW_REDUCE(0, false)
__INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
#undef __INSTANTIATE_CONV_VALID_ROW_REDUCE
}
/** /**
...@@ -920,18 +1221,26 @@ conv_valid_row_reduce(int nB, int nK, int stacklen, ...@@ -920,18 +1221,26 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
* *
* TODO: explain parameters, preconditions * TODO: explain parameters, preconditions
*/ */
__global__ void extern "C" __global__ void
conv_reference_valid(int nB, int nK, int stacklen, conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical int out_len, int out_wid, //physical
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, const size_t img_offset,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C , const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols) int subsample_rows, int subsample_cols)
{ {
const int idx = blockIdx.x * blockDim.x + threadIdx.x; const int idx = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ int numThreads, outsize; __shared__ int numThreads, outsize;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x; numThreads = blockDim.x * gridDim.x;
outsize = nB * nK * out_len * out_wid; outsize = nB * nK * out_len * out_wid;
...@@ -972,6 +1281,8 @@ conv_reference_valid(int nB, int nK, int stacklen, ...@@ -972,6 +1281,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
} }
} }
/** /**
* Reference implementation of 'full' mode convolution (with stack) * Reference implementation of 'full' mode convolution (with stack)
* *
...@@ -979,18 +1290,26 @@ conv_reference_valid(int nB, int nK, int stacklen, ...@@ -979,18 +1290,26 @@ conv_reference_valid(int nB, int nK, int stacklen,
* *
* TODO: explain parameters, preconditions * TODO: explain parameters, preconditions
*/ */
__global__ void extern "C" __global__ void
conv_reference_full(int nB, int nK, int stacklen, conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions int out_len, int out_wid, //physical dimensions
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, const float *img, const size_t img_offset,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C, const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols) int subsample_rows, int subsample_cols)
{ {
const int idx = blockIdx.x * blockDim.x + threadIdx.x; const int idx = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ int numThreads, physical_outsize; __shared__ int numThreads, physical_outsize;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x; numThreads = blockDim.x * gridDim.x;
physical_outsize = nB * nK * out_len * out_wid; physical_outsize = nB * nK * out_len * out_wid;
......
from __future__ import print_function from __future__ import print_function
import copy import copy
import os
from theano.compat import izip from theano.compat import izip
import numpy import numpy
...@@ -8,6 +9,7 @@ from theano import Apply, scalar, config ...@@ -8,6 +9,7 @@ from theano import Apply, scalar, config
from theano import scalar as scal from theano import scalar as scal
from six.moves import StringIO, xrange from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.gof.cmodule import GCC_compiler
from theano.scalar import Scalar from theano.scalar import Scalar
from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype) from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
...@@ -23,7 +25,6 @@ except ImportError: ...@@ -23,7 +25,6 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, HideC, from .basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase, Kernel) GpuKernelBase, Kernel)
from .comp import NVCC_compiler
from .type import GpuArrayType from .type import GpuArrayType
from .fp16_help import load_w, write_w from .fp16_help import load_w, write_w
...@@ -57,7 +58,7 @@ def as_C_string_const(s): ...@@ -57,7 +58,7 @@ def as_C_string_const(s):
for l in s.split('\n')) for l in s.split('\n'))
class GpuElemwise(HideC, Elemwise): class GpuElemwise(GpuKernelBase, HideC, Elemwise):
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
_f16_ok = True _f16_ok = True
...@@ -150,39 +151,7 @@ class GpuElemwise(HideC, Elemwise): ...@@ -150,39 +151,7 @@ class GpuElemwise(HideC, Elemwise):
code.append('}') code.append('}')
kop = '\n'.join(code) kop = '\n'.join(code)
# Translate types for scalar composite ops (except complex). support_code = ""
# NB: OpenCL implicitly has 'stdint' defs at the kernel
# compilation stage
support_code = "" if pygpu.get_default_context().kind == 'opencl' else """
#ifdef _MSC_VER
#define signed __int8 int8_t
#define unsigned __int8 uint8_t
#define signed __int16 int16_t
#define unsigned __int16 uint16_t
#define signed __int32 int32_t
#define unsigned __int32 uint32_t
#define signed __int64 int64_t
#define unsigned __int64 uint64_t
#else
#include <stdint.h>
#endif
"""
# Translate ga_ pseudo-types into their specific realizations
support_code += """
#define ga_bool uint8_t
#define ga_byte int8_t
#define ga_ubyte uint8_t
#define ga_short int16_t
#define ga_ushort uint16_t
#define ga_int int32_t
#define ga_uint uint32_t
#define ga_long int64_t
#define ga_ulong uint64_t
#define ga_float float
#define ga_double double
#define ga_half uint16_t
"""
try: try:
# We accept only some c_support_code(). # We accept only some c_support_code().
# This filter is done in the make_node() # This filter is done in the make_node()
...@@ -204,60 +173,64 @@ class GpuElemwise(HideC, Elemwise): ...@@ -204,60 +173,64 @@ class GpuElemwise(HideC, Elemwise):
kop = kop.replace(npy, ga) kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code) return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_headers(self): def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', cuda_root = config.cuda.root
'<gpuarray/ext_cuda.h>'] if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_compiler(self): def c_compiler(self):
return GCC_compiler
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
return NVCC_compiler return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_support_code(self): def c_support_code(self):
return self.scalar_op.c_support_code() return self.scalar_op.c_support_code()
def c_support_code_apply(self, node, nodename): def _gpu_kernel_code(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
# This is useless by itself, but will serve an eventual c_code # This is useless by itself, but will serve an eventual c_code
# implementation # implementation
k = self.generate_kernel(node, nodename) k = self.generate_kernel(node, nodename)
nd = node.inputs[0].type.ndim nd = node.inputs[0].type.ndim
CLUDA_PREAMBLE = """ res = []
#define local_barrier() __syncthreads();
#define WITHIN_KERNEL __device__
#define KERNEL extern "C" __global__
#define GLOBAL_MEM /* empty */
#define LOCAL_MEM __shared__
#define LOCAL_MEM_ARG /* empty */
#define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y*Z, 1)
#define LID_0 threadIdx.x
#define LID_1 threadIdx.y
#define LID_2 threadIdx.z
#define GID_0 blockIdx.x
#define GID_1 blockIdx.y
#define GID_2 blockIdx.z
#define LDIM_0 blockDim.x
#define LDIM_1 blockDim.y
#define LDIM_2 blockDim.z
#define GDIM_0 gridDim.x
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
"""
res = [CLUDA_PREAMBLE]
for i in range(0, nd + 1): for i in range(0, nd + 1):
res.append(k.render_basic(i, name="elem_" + str(i)) + ';') res.append(k.render_basic(i, name="elem_" + str(i)) + ';')
res.append(k.contig_src + ';') res.append(k.contig_src + ';')
return '\n'.join(res) return '\n'.join(res)
def gpu_kernels(self, node, nodename):
    """Describe the GPU kernels this node needs (CUDA back-end only).

    Returns a single Kernel whose parameter list is: the total element
    count, one size per output dimension, then for every non-inplace
    input/output a (buffer, byte offset, per-dimension strides) group.
    """
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    src = self._gpu_kernel_code(node, nodename)
    nd = node.outputs[0].ndim
    # numEls followed by the nd output dimensions.
    params = ['uintp'] + ['uintp'] * nd
    num_inputs = len(node.inputs)
    num_outputs = len(node.outputs)
    for n in range(num_inputs + num_outputs):
        # Outputs that are computed inplace reuse an input's buffer,
        # so they get no parameters of their own.
        if (n - num_inputs) in self.inplace_pattern:
            continue
        params.append(gpuarray.GpuArray)
        params.append('uintp')
        params += ['intp'] * nd
    acc_dtype = getattr(self, 'acc_dtype', None)
    if acc_dtype is None:
        acc_dtype = node.outputs[0].type.dtype
    kernel = Kernel(code=src, name="elem_%d" % nd, params=params,
                    flags=Kernel.get_flags(node.inputs[0].type.dtype,
                                           acc_dtype,
                                           node.outputs[0].type.dtype),
                    objvar='elem_%d_%s' % (nd, nodename))
    return [kernel]
def c_init_code(self): def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
...@@ -273,11 +246,15 @@ class GpuElemwise(HideC, Elemwise): ...@@ -273,11 +246,15 @@ class GpuElemwise(HideC, Elemwise):
# check that all inputs have valid dimensions # check that all inputs have valid dimensions
emitted_inames = {} emitted_inames = {}
num_kernel_params = 1 + nd + len(inputs + outputs) * (2 + nd)
code = """ code = """
int n_blocks = 0; size_t n_blocks = 0;
int threads_per_block = 0; size_t threads_per_block = 0;
size_t numEls = 0; size_t numEls = 0;
""" const ssize_t zero = 0;
void *kernel_params[%(num_kernel_params)d] = {0};
int err;
""" % locals()
if nd > 0: if nd > 0:
code += """ code += """
size_t dims[%(nd)s] = {%(initial_dims)s}; size_t dims[%(nd)s] = {%(initial_dims)s};
...@@ -416,23 +393,41 @@ class GpuElemwise(HideC, Elemwise): ...@@ -416,23 +393,41 @@ class GpuElemwise(HideC, Elemwise):
//std::cerr << "calling callkernel returned\\n"; //std::cerr << "calling callkernel returned\\n";
""" % locals() """ % locals()
code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals() kname = 'elem_%d_%s' % (nd, name)
param = [] param = ["(void *)&numEls"]
for i in range(nd): for i in range(nd):
param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0], param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i)) i=i))
for n, (name, var) in enumerate(zip(inputs + outputs, for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)): node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern: if (n - len(inputs)) in self.inplace_pattern:
continue continue
dtype = dtype_to_ctype(var.dtype) dtype = dtype_to_ctype(var.dtype)
param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals()) param.append("(void *)%(name)s->ga.data" % locals())
param.append("%(name)s->ga.offset" % locals()) param.append("(void *)&%(name)s->ga.offset" % locals())
for i in range(nd): for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals()) param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? (void *)&zero: (void *)&PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
code += ',\n'.join(param) + ");\n" for n, p in enumerate(param):
code += "kernel_params[%(n)d] = %(p)s;\n" % locals()
code += """
err = GpuKernel_call(&%(kname)s, 1, &threads_per_block, &n_blocks, 0, kernel_params);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % dict(kname=kname,fail=fail)
if config.gpuarray.sync: if config.gpuarray.sync:
code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z) code += """
err = GpuArray_sync(&%(z)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % locals()
return str(code) return str(code)
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
...@@ -573,7 +568,7 @@ class GpuDimShuffle(HideC, DimShuffle): ...@@ -573,7 +568,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return (4,) return (4,)
class GpuCAReduceCuda(HideC, CAReduceDtype): class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" """
GpuCAReduceCuda is a Reduction along some dimensions by a scalar op. GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
...@@ -737,12 +732,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -737,12 +732,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
return False return False
return True return True
def c_header_dirs(self):
    """Directories to search for the headers listed by ``c_headers``.

    Returns the CUDA toolkit's include directory when ``config.cuda.root``
    is set, otherwise an empty list.
    """
    cuda_root = config.cuda.root
    if cuda_root:
        return [os.path.join(cuda_root, 'include')]
    # Fix: previously fell through and returned None when no CUDA root was
    # configured, which breaks list concatenation during C-code assembly.
    # Return an empty list instead, matching the other c_header_dirs
    # implementations in this file.
    return []
def c_headers(self):
    """Headers required by the generated reduction C code."""
    headers = ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
               '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
    return headers
def c_compiler(self):
return NVCC_compiler
def c_init_code(self):
    """C statements run once at module load: initialize the CUDA extension."""
    return ['setup_ext_cuda();']
...@@ -840,7 +837,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -840,7 +837,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
# \begin bracket the reduction in a check that there is # \begin bracket the reduction in a check that there is
# actually work to do # actually work to do
if getattr(self.scalar_op, 'identity', None) == 0: if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals() zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
# TODO: elif getattr(self.scalar_op, 'identity', None) == 1: # TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else: else:
scalar_op = self.scalar_op scalar_op = self.scalar_op
...@@ -891,28 +888,24 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -891,28 +888,24 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
.. code-block:: c .. code-block:: c
ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
if (verbose) if (verbose)
printf("running kernel_reduce_10_%(name)s\\n"); printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z; size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
kernel_reduce_10_%(name)s<<<n_blocks, n_threads, void *kernel_params[] = {
n_shared>>>( (void *)&PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[0], (void *)&PyGpuArray_DIMS(%(x)s)[1],
PyGpuArray_DIMS(%(x)s)[1], (void *)%(x)s->ga.data,
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset), (void *)&%(x)s->ga.offset,
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), (void *)&stride_A0,
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s), (void *)&stride_A1,
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), (void *)%(z)s->ga.data,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s) (void *)&%(z)s->ga.offset,
); (void *)&stride_Z0};
[ int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
if config.gpuarray.sync: %(err_check)s
code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z)
]
if (cudaSuccess != cudaGetLastError())
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: ... );
%(fail)s;
}
""" """
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
...@@ -923,64 +916,66 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -923,64 +916,66 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
ndim = len(self.reduce_mask) ndim = len(self.reduce_mask)
nd_out = ndim - sum(self.reduce_mask) nd_out = ndim - sum(self.reduce_mask)
shapes_format = "shape=(%s)" % ",".join(["%llu"] * node.inputs[0].ndim) shapes_format = "shape=(%s)" % ",".join(["%llu"] * node.inputs[0].ndim)
shapes_data = ",".join(["(unsigned long long) PyGpuArray_DIMS(%s)[%d]" % (x, i) shapes_data = ",".join(["(size_t) PyGpuArray_DIMS(%s)[%d]" % (x, i)
for i in range(node.inputs[0].ndim)]) for i in range(node.inputs[0].ndim)])
k_var = "kernel_reduce_%(pattern)s_%(name)s" % locals()
params = []
print("""
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
" nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d,"
" nb_block=%%d, n_shared=%%d, %(shapes_format)s\\n",
n_threads.x,n_threads.y,n_threads.z,
n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared, %(shapes_data)s);
kernel_reduce_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" % locals(), file=sio)
for i in xrange(ndim): for i in xrange(ndim):
print(""" params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals())
PyGpuArray_DIMS(%(x)s)[%(i)s], params.append("(void *)%(x)s->ga.data" % locals())
""" % locals(), file=sio) params.append("(void *)&%(x)s->ga.offset" % locals())
print("""
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset)
""" % locals(), file=sio)
for i in xrange(ndim): for i in xrange(ndim):
print(""" print("""
,PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s) ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s);
""" % locals(), file=sio)
print("""
,(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset)
""" % locals(), file=sio) """ % locals(), file=sio)
params.append("(void *)&stride_A%(i)d" % locals())
params.append("(void *)%(z)s->ga.data" % locals())
params.append("(void *)&%(z)s->ga.offset" % locals())
for i in xrange(nd_out): for i in xrange(nd_out):
print(""" print("""
,PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s) ssize_t stride_Z%(i)d = PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s);
""" % locals(), file=sio) """ % locals(), file=sio)
params.append("(void *)&stride_Z%(i)d" % locals())
kernel_params = ', '.join(params)
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
print("""
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
void *kernel_params[] = { %(kernel_params)s };
if (verbose>1)
printf("n_threads[0]=%%lu, n_threads[1]=%%lu, "
"n_threads[2]=%%lu, n_threads=%%lu, "
"n_blocks[0]=%%lu, n_blocks[1]=%%lu, n_blocks[2]=%%lu, "
"n_blocks=%%lu, n_shared=%%d, %(shapes_format)s\\n",
n_threads[0],n_threads[1],
n_threads[2],
n_threads[0]*n_threads[1]*
n_threads[2],
n_blocks[0],n_blocks[1],n_blocks[2],
n_blocks[0]*n_blocks[1]*n_blocks[2],
n_shared, %(shapes_data)s);
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
""" % locals(), file=sio)
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals() sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print(""" print("""
);
%(sync)s %(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)"
" %(shapes_format)s \\n",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
%(shapes_data)s);
%(fail)s;
}
""" % locals(), file=sio) """ % locals(), file=sio)
return sio.getvalue() return sio.getvalue()
...@@ -993,66 +988,86 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -993,66 +988,86 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
.. code-block:: c .. code-block:: c
static __global__ void kernel_reduce_110_%(nodename)s( KERNEL void kernel_reduce_110_%(nodename)s(
const int d0, const ga_size d0,
const int d1, const ga_size d1,
const int d2, const ga_size d2,
const %(in_dtype)s *A, const %(in_type)s *A,
const int sA0, const ga_size offset_A,
const int sA1, const ga_ssize sA0,
const int sA2, const ga_ssize sA1,
%(out_dtype)s * Z, const ga_ssize sA2,
const int sZ0) %(out_type)s * Z,
const ga_size offset_Z,
const ga_ssize sZ0)
Since the nodename is unique, we don't need to put the name Since the nodename is unique, we don't need to put the name
of the scalar_op in here. of the scalar_op in here.
""" """
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = node.outputs[0].dtype
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
if reduce_mask is None: if reduce_mask is None:
reduce_mask = self.reduce_mask reduce_mask = self.reduce_mask
if ndim is None: if ndim is None:
ndim = len(reduce_mask) ndim = len(reduce_mask)
if pattern is None: if pattern is None:
pattern = ''.join(str(i) for i in reduce_mask) pattern = ''.join(str(i) for i in reduce_mask)
kname = "kernel_reduce_%(pattern)s" % locals()
k_var = "kernel_reduce_%(pattern)s_%(nodename)s" % locals()
params = []
sio = StringIO() sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_%(pattern)s_%(nodename)s( KERNEL void %(kname)s(
""" % locals(), file=sio) """ % locals(), file=sio)
for i in xrange(ndim): for i in xrange(ndim):
params.append('uintp')
print(""" print("""
const int d%(i)s, const ga_size d%(i)s,
""" % locals(), file=sio) """ % locals(), file=sio)
params.append(gpuarray.GpuArray)
params.append('uintp')
print(""" print("""
const %(in_dtype)s *A, const %(in_type)s *A, const ga_size offset_A,
""" % locals(), file=sio) """ % locals(), file=sio)
for i in xrange(ndim): for i in xrange(ndim):
params.append('intp')
print(""" print("""
const int sA%(i)s, const ga_ssize sA%(i)s,
""" % locals(), file=sio) """ % locals(), file=sio)
params.append(gpuarray.GpuArray)
params.append('uintp')
print(""" print("""
%(out_dtype)s * Z %(out_type)s * Z, const ga_size offset_Z
""" % locals(), file=sio) """ % locals(), file=sio)
for i in xrange(ndim - sum(reduce_mask)): for i in xrange(ndim - sum(reduce_mask)):
params.append('intp')
print(""" print("""
, const int sZ%(i)s , const ga_ssize sZ%(i)s
""" % locals(), file=sio) """ % locals(), file=sio)
print(")", file=sio) print(")", file=sio)
return sio.getvalue() return sio.getvalue(), kname, params, k_var
def _k_init(self, node, nodename): def _k_init(self, node, nodename):
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype) acc_dtype = self._acc_dtype(node.inputs[0].dtype)
# We need to use theano_complex* and not npy_complex* # We need to use theano_complex* and not npy_complex*
acc_dtype = theano.scalar.basic.Scalar(acc_dtype).dtype_specs()[1] in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
return """ return """
const int threadCount = blockDim.x * blockDim.y * blockDim.z; const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x; + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = 0; %(acc_type)s myresult = 0;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
//This is caught in cuda/init.py when we init the gpu. I keep //This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this. //it here to ease finding code that rely on this.
...@@ -1315,7 +1330,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1315,7 +1330,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
if getattr(self.scalar_op, 'identity', None) == 0: if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals() zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
# TODO: elif getattr(self.scalar_op, 'identity', None) == 1: # TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else: else:
zero_shp = """ zero_shp = """
...@@ -1325,44 +1340,43 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1325,44 +1340,43 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
""" % locals() """ % locals()
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
k_var = "kernel_reduce_ccontig_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals() sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print(""" print("""
{ {
if(PyGpuArray_SIZE(%(x)s)==0){ if(PyGpuArray_SIZE(%(x)s)==0){
%(zero_shp)s; %(zero_shp)s;
}else{ }else{
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t numEls = PyGpuArray_SIZE(%(x)s);
std::min(PyGpuArray_SIZE(%(x)s), size_t n_threads = std::min(numEls, (size_t) 256);
(size_t) 256)); size_t n_blocks = 1;
dim3 n_blocks(1); void *kernel_params[] = {(void *)&numEls,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset};
if (verbose) printf("running kernel_reduce_ccontig_%(name)s" if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
" n_threads.x=%%d, size=%%d, ndim=%%d\\n", " n_threads=%%lu, size=%%lu, ndim=%%d\\n",
n_threads.x,PyGpuArray_SIZE(%(x)s), n_threads,numEls,
PyGpuArray_NDIM(%(x)s)); PyGpuArray_NDIM(%(x)s));
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x; size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
kernel_reduce_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>( int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
PyGpuArray_SIZE(%(x)s), %(err_check)s
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset));
%(sync)s %(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_ccontig_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1372,10 +1386,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1372,10 +1386,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_blocks[3] = {1, 1, 1};
(size_t) 256));
dim3 n_blocks(1);
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1385,15 +1397,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1385,15 +1397,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[1], size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
(size_t) 256)); while (n_threads[1] * n_threads[0] <= 256) ++n_threads[1];
while (n_threads.y * n_threads.x <= 256) ++n_threads.y; n_threads[1] -= 1;
n_threads.y -= 1; if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0]) n_threads[1] = PyGpuArray_DIMS(%(x)s)[0];
n_threads.y = PyGpuArray_DIMS(%(x)s)[0];
size_t n_blocks[3] = {1, 1, 1};
dim3 n_blocks(1);
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1421,25 +1432,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1421,25 +1432,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
threads_y = """ threads_y = """
//get as many y threads as we can fit //get as many y threads as we can fit
while (n_threads.x * (n_threads.y+1) <= 256) while (n_threads[0] * (n_threads[1]+1) <= 256)
{ {
if (n_threads.y < PyGpuArray_DIMS(%(x)s)[%(N)s-1]) if (n_threads[1] < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
n_threads.y += 1; n_threads[1] += 1;
else else
break; break;
}""" % locals() }""" % locals()
threads_z = """ threads_z = """
//get as many z threads as we can fit //get as many z threads as we can fit
while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256) while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
{ {
if (n_threads.z < PyGpuArray_DIMS(%(x)s)[%(N)s-2]) if (n_threads[2] < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
n_threads.z += 1; n_threads[2] += 1;
else else
break; break;
} }
//Maximum for Fermi GPU on that dimensions. //Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64); n_threads[2] = std::min(n_threads[2], (size_t)64);
""" % locals() """ % locals()
if len(self.reduce_mask) == 2: if len(self.reduce_mask) == 2:
...@@ -1452,13 +1463,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1452,13 +1463,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[%(N)s], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[%(N)s],
(size_t) 256));
%(threads_y)s %(threads_y)s
%(threads_z)s %(threads_z)s
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
(size_t) 4096));
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1476,9 +1484,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1476,9 +1484,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
k_var = "kernel_reduce_10_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals() sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
...@@ -1491,95 +1511,71 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1491,95 +1511,71 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
// we could schedule more threads if we were maxing out the gridsize below, but // we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads // the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware. // on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1); size_t n_threads[3] = {32, 1, 1};
// We kindof reshape the input implicitly to something 4D: // We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E // the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32 // where C <= D*E < C+32
// where E==32 // where E==32
int A = 1; GpuKernel *%(k_var)s = &kernel_reduce_010_AD_%(name)s;
int B = PyGpuArray_DIMS(%(x)s)[0]; size_t A = 1;
int C = PyGpuArray_DIMS(%(x)s)[1]; size_t B = PyGpuArray_DIMS(%(x)s)[0];
int D = C/32; size_t C = PyGpuArray_DIMS(%(x)s)[1];
size_t D = C/32;
if (32*D < C) D+= 1; if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32)); assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make // The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big. // sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D); size_t n_blocks[3] = {A, D, 1};
if (n_blocks.x > 4096) n_blocks.x = 4096; if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x; if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads>>>( ssize_t stride_A0 = 1;
A,B,C,D, ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset), ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
1, ssize_t stride_Z0 = 1;
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s), void *kernel_params[] = {
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), (void *)&A, (void *)&B, (void *)&C, (void *)&D,
1, (void *)%(x)s->ga.data,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s) (void *)&%(x)s->ga.offset,
); (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data,
%(sync)s (void *)&%(z)s->ga.offset,
cudaError_t sts = cudaGetLastError(); (void *)&stride_Z0, (void *)&stride_Z1};
if (cudaSuccess != sts) int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
{ %(err_check)s
PyErr_Format(PyExc_RuntimeError, %(sync)s
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_10_AD%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}else{ }else{
dim3 n_threads( GpuKernel *%(k_var)s = &kernel_reduce_010_%(name)s;
std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
(size_t) 256)); size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 4096), 1};
dim3 n_blocks(1,
std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 4096));
if (verbose) { if (verbose) {
fprintf(stderr, fprintf(stderr,
"running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n", "running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x, n_blocks[0],
n_blocks.y); n_blocks[1]);
} }
assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]); assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x; size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0];
kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>( size_t dim_0 = 1;
1, ssize_t stride_A0 = 1;
PyGpuArray_DIMS(%(x)s)[0], ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
PyGpuArray_DIMS(%(x)s)[1], ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset), ssize_t stride_Z0 = 1;
1, ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), void *kernel_params[] = {
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s), (void *)&dim_0,
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), (void *)&PyGpuArray_DIMS(%(x)s)[0],
1, (void *)&PyGpuArray_DIMS(%(x)s)[1],
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s) (void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
); (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
%(sync)s %(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1591,9 +1587,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1591,9 +1587,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
pattern = ''.join(str(i) for i in self.reduce_mask) pattern = ''.join(str(i) for i in self.reduce_mask)
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
k_var = "kernel_reduce_010_AD_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = "" sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals() sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print(""" print("""
{ {
//int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2]; //int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
...@@ -1608,108 +1616,82 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1608,108 +1616,82 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
// we could schedule more threads if we were maxing out the gridsize below, but // we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads // the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware. // on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1); size_t n_threads[3] = {32, 1, 1};
// We kindof reshape the input implicitly to something 4D: // We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E // the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32 // where C <= D*E < C+32
// where E==32 // where E==32
int A = PyGpuArray_DIMS(%(x)s)[0]; size_t A = PyGpuArray_DIMS(%(x)s)[0];
int B = PyGpuArray_DIMS(%(x)s)[1]; size_t B = PyGpuArray_DIMS(%(x)s)[1];
int C = PyGpuArray_DIMS(%(x)s)[2]; size_t C = PyGpuArray_DIMS(%(x)s)[2];
int D = C/32; size_t D = C/32;
if (32*D < C) D+= 1; if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32)); assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make // The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big. // sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D); size_t n_blocks[3] = {A, D, 1};
if (n_blocks.x > 4096) n_blocks.x = 4096; if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x; if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
int n_shared = 0; ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>( ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
A,B,C,D, ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset), ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s), void *kernel_params[] = {
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s), (void *)&A, (void *)&B, (void *)&C, (void *)&D,
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), (void *)%(x)s->ga.data,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s), (void *)&%(x)s->ga.offset,
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s) (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
); (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s %(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
} }
else else
{ {
int verbose = 2; int verbose = 2;
dim3 n_threads(std::min((size_t) 32, size_t n_threads[3] = {std::min((size_t) 32, PyGpuArray_DIMS(%(x)s)[2]), 1, 1};
PyGpuArray_DIMS(%(x)s)[2])); while( (n_threads[0]*(n_threads[1]+1)<=256)
while( (n_threads.x*(n_threads.y+1)<=256) && (n_threads[1]<PyGpuArray_DIMS(%(x)s)[1])){
&& (n_threads.y<PyGpuArray_DIMS(%(x)s)[1])){ n_threads[1]++;
n_threads.y++;
} }
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096), 1, 1};
(size_t)4096)); n_blocks[1] = std::min(
n_blocks.y = std::min(
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2], ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x), (size_t)n_threads[0]),
(size_t)(4096 / n_blocks.x) (size_t)(4096 / n_blocks[0])
); );
if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s)), PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s)),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s)) PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s))
==PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s) ==PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s)
&& n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2], && n_blocks[1]==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x)){ (size_t)n_threads[0])){
if(verbose>1) if(verbose>1)
printf("n_block.x.1=%%d, n_block.x.2=%%d, n_block.y.1=%%d, n_block.y.2=%%d,\\n", printf("n_block.x.1=%%d, n_block.x.2=%%d, n_block.y.1=%%d, n_block.y.2=%%d,\\n",
PyGpuArray_DIMS(%(x)s)[0],4096, PyGpuArray_DIMS(%(x)s)[0],4096,
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads.x), ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads[0]),
(size_t)(4096 / n_blocks.x)); (size_t)(4096 / n_blocks[0]));
assert(n_threads.x<=32); assert(n_threads[0]<=32);
%(makecall_inner)s %(makecall_inner)s
}else{ }else{
n_threads.x = std::min(PyGpuArray_DIMS(%(x)s)[1], n_threads[0] = std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 256); (size_t) 256);
n_blocks.x = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096); n_blocks[0] = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
n_blocks.y = std::min( n_blocks[1] = std::min(
PyGpuArray_DIMS(%(x)s)[2], PyGpuArray_DIMS(%(x)s)[2],
(size_t)(4096 / n_blocks.x) (size_t)(4096 / n_blocks[0])
); );
%(makecall)s %(makecall)s
} }
%(sync)s %(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1719,16 +1701,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1719,16 +1701,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[3], while (n_threads[0] * n_threads[1] <= 256)
(size_t) 256));
while (n_threads.x * n_threads.y <= 256)
{ {
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1]) break; if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1]) break;
n_threads.y += 1; n_threads[1] += 1;
} }
n_threads.y -= 1; n_threads[1] -= 1;
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2]); size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2], 1};
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1738,7 +1718,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1738,7 +1718,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
sync = bool(config.gpuarray.sync) k_var = "kernel_reduce_010_AD_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
# use threadIdx.x for i0 # use threadIdx.x for i0
# use blockIdx.x for i1 # use blockIdx.x for i1
# use blockIdx.y for i2 # use blockIdx.y for i2
...@@ -1747,15 +1741,12 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1747,15 +1741,12 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
int verbose = 0; int verbose = 0;
if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){ if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){
printf("slow\\n"); printf("slow\\n");
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096), 1, 1};
(size_t) 256)); while (n_blocks[0] * (n_blocks[1]+1) <= 4096 &&
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[1], n_blocks[1] <= PyGpuArray_DIMS(%(x)s)[2])
(size_t)4096));
while (n_blocks.x * (n_blocks.y+1) <= 4096 &&
n_blocks.y <= PyGpuArray_DIMS(%(x)s)[2])
{ {
n_blocks.y += 1; n_blocks[1] += 1;
} }
%(makecall)s %(makecall)s
} }
...@@ -1763,50 +1754,38 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1763,50 +1754,38 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ // reuse 010_AD kernel, we transpose the 2 first dim { // reuse 010_AD kernel, we transpose the 2 first dim
// See the reduction for the real 010_AD kernel for // See the reduction for the real 010_AD kernel for
// explanation. We do this to get coalesced read. // explanation. We do this to get coalesced read.
dim3 n_threads(32,1,1); size_t n_threads[3] = {32, 1, 1};
int A = PyGpuArray_DIMS(%(x)s)[1]; size_t A = PyGpuArray_DIMS(%(x)s)[1];
int B = PyGpuArray_DIMS(%(x)s)[0]; size_t B = PyGpuArray_DIMS(%(x)s)[0];
int C = PyGpuArray_DIMS(%(x)s)[2]; size_t C = PyGpuArray_DIMS(%(x)s)[2];
int D = C/32; size_t D = C/32;
if (32*D < C) D+= 1; if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32)); assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make // The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big. // sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D); size_t n_blocks[3] = {A, D, 1};
if (n_blocks.x > 4096) n_blocks.x = 4096; if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x; if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
int n_shared = 0; size_t n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>( ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
A,B,C,D, ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset), ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s), ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s), ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s), void *kernel_params[] = {
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), (void *)&A, (void *)&B, (void *)&C, (void *)&D,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s), (void *)%(x)s->ga.data,
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s) (void *)&%(x)s->ga.offset,
); (void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
if (%(sync)d) (void *)%(z)s->ga.data,
GpuArray_sync(&%(z)s->ga); (void *)&%(z)s->ga.offset,
cudaError_t sts = cudaGetLastError(); (void *)&stride_Z0, (void *)&stride_Z1};
if (cudaSuccess != sts) int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
{ %(err_check)s
PyErr_Format(PyExc_RuntimeError, %(sync)s
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1815,18 +1794,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1815,18 +1794,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[1], while (n_threads[0]*n_threads[1] <= 256)
(size_t) 256));
while (n_threads.x*n_threads.y <= 256)
{ {
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0]) if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
break; break;
n_threads.y += 1; n_threads[1] += 1;
} }
n_threads.y -= 1; n_threads[1] -= 1;
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[2]); size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[2], 1, 1};
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1836,19 +1813,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1836,19 +1813,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[2], size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
(size_t) 256)); while (n_blocks[0] * n_blocks[1] <= 4096)
dim3 n_blocks(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
while (n_blocks.x * n_blocks.y <= 4096)
{ {
if (n_blocks.y > PyGpuArray_DIMS(%(x)s)[1]) if (n_blocks[1] > PyGpuArray_DIMS(%(x)s)[1])
break; break;
n_blocks.y += 1; n_blocks[1] += 1;
} }
n_blocks.y -= 1; n_blocks[1] -= 1;
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1858,31 +1831,29 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1858,31 +1831,29 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
//get as many y threads as we can fit //get as many y threads as we can fit
while (n_threads.x * n_threads.y <= 256) while (n_threads[0] * n_threads[1] <= 256)
{ {
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1]) if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
break; break;
n_threads.y += 1; n_threads[1] += 1;
} }
n_threads.y -= 1; n_threads[1] -= 1;
//get as many z threads as we can fit //get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= 256) while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
{ {
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0]) if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
break; break;
n_threads.z += 1; n_threads[2] += 1;
} }
n_threads.z -= 1; n_threads[2] -= 1;
//Maximum for Fermi GPU on that dimensions. //Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64); n_threads[2] = std::min(n_threads[2], (size_t)64);
dim3 n_blocks(1,1,1); size_t n_blocks[3] = {1, 1, 1};
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1896,24 +1867,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1896,24 +1867,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ {
int verbose = 0; int verbose = 0;
dim3 n_blocks( size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
while (n_blocks.x * n_blocks.y <= 4096 && while (n_blocks[0] * n_blocks[1] <= 4096 &&
n_blocks.y < PyGpuArray_DIMS(%(x)s)[1]) n_blocks[1] < PyGpuArray_DIMS(%(x)s)[1])
{ {
n_blocks.y += 1; n_blocks[1] += 1;
} }
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[3], while (n_threads[0] * n_threads[1] <= 256
(size_t) 256)); && n_threads[1] < PyGpuArray_DIMS(%(x)s)[2]
while (n_threads.x * n_threads.y <= 256 && n_threads[0] * n_threads[1] * sizeof(%(acc_dtype)s) <=(15*1024-200))
&& n_threads.y < PyGpuArray_DIMS(%(x)s)[2]
&& n_threads.x * n_threads.y * sizeof(%(acc_dtype)s) <=(15*1024-200))
{ {
n_threads.y += 1; n_threads[1] += 1;
} }
%(makecall)s %(makecall)s
...@@ -1925,32 +1892,30 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1925,32 +1892,30 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
//get as many y threads as we can fit //get as many y threads as we can fit
while (n_threads.x * n_threads.y <= 256) while (n_threads[0] * n_threads[1] <= 256)
{ {
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1]) if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
break; break;
n_threads.y += 1; n_threads[1] += 1;
} }
n_threads.y -= 1; n_threads[1] -= 1;
//get as many z threads as we can fit //get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= 256) while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
{ {
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0]) if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
break; break;
n_threads.z += 1; n_threads[2] += 1;
} }
n_threads.z -= 1; n_threads[2] -= 1;
//Maximum for Fermi GPU on that dimensions. //Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64); n_threads[2] = std::min(n_threads[2], (size_t)64);
dim3 n_blocks(1,1,1); size_t n_blocks[3] = {1, 1, 1};
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1960,27 +1925,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1960,27 +1925,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
dim3 n_threads( size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
std::min(PyGpuArray_DIMS(%(x)s)[3],
(size_t) 256));
while (n_threads.x * (n_threads.y+1) <= 256) ++n_threads.y; while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[2]) if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
n_threads.y = PyGpuArray_DIMS(%(x)s)[2]; n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];
while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256) ++n_threads.z; while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256) ++n_threads[2];
if (n_threads.z > 64) if (n_threads[2] > 64)
n_threads.z = 64; n_threads[2] = 64;
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0]) if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
n_threads.z = PyGpuArray_DIMS(%(x)s)[0]; n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[1]); size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
%(makecall)s %(makecall)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [15] # the version corresponding to the c code in this Op version = [16] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
...@@ -1994,14 +1957,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -1994,14 +1957,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
else: else:
return () return ()
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
sio = StringIO()
nd_in = len(self.reduce_mask) nd_in = len(self.reduce_mask)
in_dtype = "npy_" + node.inputs[0].dtype in_dtype = node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype out_dtype = node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype) acc_dtype = self._acc_dtype(node.inputs[0].dtype)
load_in = load_w(node.inputs[0].dtype) flags=Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
write_out = write_w(node.outputs[0].dtype) in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
load_in = load_w(in_dtype)
write_out = write_w(out_dtype)
kernels = []
if all(i == 1 for i in self.reduce_mask): if all(i == 1 for i in self.reduce_mask):
# this kernel is ok for up to a few thousand elements, but # this kernel is ok for up to a few thousand elements, but
...@@ -2011,16 +1978,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2011,16 +1978,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0])", load_in + "(A[i0])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_ccontig"
k_var = "kernel_reduce_ccontig_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_ccontig_%(nodename)s( KERNEL void %(kname)s(
const unsigned int d0, const ga_size d0,
const %(in_dtype)s *A, const %(in_type)s *A, const ga_size offset_A,
%(out_dtype)s * Z) %(out_type)s *Z, const ga_size offset_Z)
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2034,6 +2006,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2034,6 +2006,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp',
gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1,): if self.reduce_mask == (1,):
# this kernel is ok for up to a few thousand elements, but # this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
...@@ -2042,16 +2021,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2042,16 +2021,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0])", load_in + "(A[i0 * sA0])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_1"
k_var = "kernel_reduce_1_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_1_%(nodename)s( KERNEL void %(kname)s(
const unsigned int d0, const ga_size d0,
const %(in_dtype)s *A, const int sA0, const %(in_type)s *A, const ga_size offset_A,
%(out_dtype)s * Z) const ga_ssize sA0,
%(out_type)s * Z, const ga_size offset_Z)
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2065,6 +2050,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2065,6 +2050,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp',
gpuarray.GpuArray, 'uintp',
'intp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1): if self.reduce_mask == (1, 1):
# this kernel is ok for up to a few thousand elements, but # this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
...@@ -2073,17 +2066,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2073,17 +2066,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1])", load_in + "(A[i0 * sA0 + i1 * sA1])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_11"
k_var = "kernel_reduce_11_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_11_%(nodename)s( KERNEL void %(kname)s(
const int d0, const ga_size d0, const ga_size d1,
const int d1, const %(in_type)s *A, const ga_size offset_A,
const %(in_dtype)s *A, const int sA0, const int sA1, const ga_ssize sA0, const ga_ssize sA1,
%(out_dtype)s * Z) %(out_type)s * Z, const ga_size offset_Z)
{ {
const int threadCount = blockDim.x * blockDim.y; const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x; const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2100,6 +2098,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2100,6 +2098,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
#01, 011, 0111 #01, 011, 0111
if (0 == self.reduce_mask[0] and if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and all(self.reduce_mask[1:]) and
...@@ -2144,17 +2150,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2144,17 +2150,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]', node, reducebuf = self._k_reduce_buf('Z[i0 * sZ0]', node,
nodename, sub={}) nodename, sub={})
param_dim = ",".join(["const int d%d" % i param_dim = ",".join(["const ga_size d%d" % i
for i in xrange(nd_in)]) for i in xrange(nd_in)])
param_strides = ",".join(["const int sA%d" % i param_strides = ",".join(["const ga_ssize sA%d" % i
for i in xrange(nd_in)]) for i in xrange(nd_in)])
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals()) reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals())
reduce_fct = self._assign_reduce( reduce_fct = self._assign_reduce(
node, nodename, "myresult", node, nodename, "myresult",
load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])", load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
{}, True) {}, True)
sio = StringIO()
print(""" print("""
%(decl)s{ %(decl)s{
%(init)s %(init)s
...@@ -2171,6 +2178,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2171,6 +2178,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0): if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
# this kernel uses one block for each column, # this kernel uses one block for each column,
# threads per block for each element per column. # threads per block for each element per column.
...@@ -2184,18 +2193,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2184,18 +2193,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])")
kname = "kernel_reduce_010"
k_var = "kernel_reduce_010_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_010_%(nodename)s( KERNEL void %(kname)s(
const int d0, const ga_size d0, const ga_size d1, const ga_size d2,
const int d1, const %(in_type)s *A, const ga_size offset_A,
const int d2, const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
const %(in_dtype)s *A, const int sA0, %(out_type)s * Z, const ga_size offset_Z,
const int sA1, const int sA2, const ga_ssize sZ0, const ga_ssize sZ1)
%(out_dtype)s * Z, const int sZ0, const int sZ1)
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2207,7 +2220,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2207,7 +2220,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ {
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{ {
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{ {
%(reduce_fct)s; %(reduce_fct)s;
...@@ -2218,25 +2231,36 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2218,25 +2231,36 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask in [(0, 1, 0), (1, 0), (1, 0, 0)]: if self.reduce_mask in [(0, 1, 0), (1, 0), (1, 0, 0)]:
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(X[a * sX0 + b * sX1 + c * sX2])", load_in + "(X[a * sX0 + b * sX1 + c * sX2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])") reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])")
kname = "kernel_reduce_010_AD"
k_var = "kernel_reduce_010_AD_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_010_AD_%(nodename)s( KERNEL void %(kname)s(
const int A, const ga_size A, const ga_size B, const ga_size C, const ga_size D,
const int B, const %(in_type)s *X, const ga_size offset_X,
const int C, const ga_ssize sX0, const ga_ssize sX1, const ga_ssize sX2,
const int D, %(out_type)s * Z, const ga_size offset_Z,
//const int E, // THIS is 32 const ga_ssize sZ0, const ga_ssize sZ1)
const %(in_dtype)s *X, const int sX0,
const int sX1, const int sX2,
%(out_dtype)s * Z, const int sZ0, const int sZ1)
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
%(acc_dtype)s myresult = 0; %(acc_type)s myresult = 0;
X = (const %(in_type)s *)(((char *)X)+offset_X);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2262,6 +2286,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2262,6 +2286,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0): if self.reduce_mask == (0, 1, 0):
# #
# This kernel is optimized when the inner most dimensions # This kernel is optimized when the inner most dimensions
...@@ -2275,7 +2308,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2275,7 +2308,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
# block.x = dim 0 # block.x = dim 0
# block.y = dim 1 rest # block.y = dim 1 rest
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
decl = self._k_decl(node, nodename, pattern="010_inner") decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
node, nodename, node, nodename,
'blockDim.x') 'blockDim.x')
...@@ -2283,6 +2316,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2283,6 +2316,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2307,6 +2341,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2307,6 +2341,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 0): if self.reduce_mask == (1, 1, 0):
# this kernel uses one block for each column, # this kernel uses one block for each column,
# threads per block for each element per column. # threads per block for each element per column.
...@@ -2319,19 +2355,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2319,19 +2355,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])") reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])")
kname = "kernel_reduce_110"
k_var = "kernel_reduce_110_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_110_%(nodename)s( KERNEL void %(kname)s(
const int d0, const ga_size d0, const ga_size d1, const ga_size d2,
const int d1, const %(in_type)s *A, const ga_size offset_A,
const int d2, const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
const %(in_dtype)s *A, const int sA0, %(out_type)s * Z, const ga_size offset_Z,
const int sA1, const int sA2, const ga_ssize sZ0)
%(out_dtype)s * Z, const int sZ0)
{ {
const int threadCount = blockDim.x * blockDim.y; const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x; const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2351,15 +2391,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2351,15 +2391,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 0): if self.reduce_mask == (1, 0, 0):
reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]', reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]',
node, nodename, sub={}) node, nodename, sub={})
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2378,15 +2428,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2378,15 +2428,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 1): if self.reduce_mask == (1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]', node, reducebuf = self._k_reduce_buf('Z[0]', node,
nodename, sub={}) nodename, sub={})
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2405,6 +2458,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2405,6 +2458,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1): if self.reduce_mask == (0, 0, 1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
...@@ -2414,18 +2469,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2414,18 +2469,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
kname = "kernel_reduce_001"
k_var = "kernel_reduce_001_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_001_%(nodename)s( KERNEL void %(kname)s(
const int d0, const ga_size d0, const ga_size d1, const ga_size d2,
const int d1, const %(in_type)s *A, const ga_size offset_A,
const int d2, const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
const %(in_dtype)s *A, const int sA0, %(out_type)s * Z, const ga_size offset_Z,
const int sA1, const int sA2, const ga_ssize sZ0, const ga_ssize sZ1)
%(out_dtype)s * Z, const int sZ0, const int sZ1)
{ {
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2436,7 +2495,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2436,7 +2495,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ {
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y) for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{ {
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x) for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{ {
%(reduce_fct)s; %(reduce_fct)s;
...@@ -2446,17 +2505,27 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2446,17 +2505,27 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1, 1): if self.reduce_mask == (0, 0, 1, 1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]', reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub={}) node, nodename, sub={})
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2466,7 +2535,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2466,7 +2535,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ {
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y) for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{ {
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y) for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{ {
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
...@@ -2479,17 +2548,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2479,17 +2548,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0, 1): if self.reduce_mask == (0, 1, 0, 1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]', reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]',
node, nodename, sub={}) node, nodename, sub={})
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])") reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2499,7 +2571,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2499,7 +2571,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ {
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{ {
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y) for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{ {
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
...@@ -2512,15 +2584,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2512,15 +2584,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 1, 1): if self.reduce_mask == (1, 1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]', node, nodename, reducebuf = self._k_reduce_buf('Z[0]', node, nodename,
sub={}) sub={})
decl = self._k_decl(node, nodename) decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[0])") reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print(""" print("""
%(decl)s %(decl)s
{ {
...@@ -2540,6 +2615,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2540,6 +2615,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 1, 1): if self.reduce_mask == (1, 0, 1, 1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]', reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
node, nodename, sub={}) node, nodename, sub={})
...@@ -2547,20 +2624,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2547,20 +2624,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])", load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])",
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])") reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
kname = "kernel_reduce_1011"
k_var= "kernel_reduce_1011_" + nodename
sio = StringIO()
print(""" print("""
static __global__ void kernel_reduce_1011_%(nodename)s( KERNEL void %(kname)s(
const unsigned int d0, const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
const unsigned int d1, const %(in_type)s *A, const ga_size offset_A,
const unsigned int d2, const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2, const ga_ssize sA3,
const unsigned int d3, %(out_type)s * Z, const ga_size offset_Z,
const %(in_dtype)s *A, const int sA0, const int sA1, const ga_ssize sZ0)
const int sA2, const int sA3,
%(out_dtype)s * Z, const int sZ0)
{ {
const int threadCount = blockDim.x * blockDim.y * blockDim.z; const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[]; extern __shared__ %(acc_type)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s; %(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -2580,14 +2660,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype): ...@@ -2580,14 +2660,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
print(""" params = [
template <typename T> 'uintp', 'uintp', 'uintp', 'uintp',
static T ceil_intdiv(T a, T b) gpuarray.GpuArray, 'uintp',
{ 'intp', 'intp', 'intp', 'intp',
return (a/b) + ((a % b) ? 1: 0); gpuarray.GpuArray, 'uintp',
} 'intp'
""", file=sio) ]
return sio.getvalue() kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
return kernels
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...@@ -2820,8 +2902,15 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2820,8 +2902,15 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(output)s = tmp; %(output)s = tmp;
} }
if (%(sync)d) if (%(sync)d) {
GpuArray_sync(&%(output)s->ga); err = GpuArray_sync(&%(output)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: GpuCAReduceCPY: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s
}
}
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync), """ % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input, ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype)) cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
......
...@@ -3,6 +3,12 @@ Helper routines for generating gpu kernels for nvcc. ...@@ -3,6 +3,12 @@ Helper routines for generating gpu kernels for nvcc.
""" """
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body): def nvcc_kernel(name, params, body):
""" """
Return the c code of a kernel function. Return the c code of a kernel function.
...@@ -26,7 +32,7 @@ def nvcc_kernel(name, params, body): ...@@ -26,7 +32,7 @@ def nvcc_kernel(name, params, body):
else: else:
yield b yield b
bodystr = ';\n'.join(flatbody()) bodystr = ';\n'.join(flatbody())
return """__global__ void %(name)s (%(paramstr)s) return """KERNEL void %(name)s (%(paramstr)s)
{ {
%(bodystr)s; %(bodystr)s;
} }
...@@ -167,11 +173,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -167,11 +173,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
We use __i as an int variable in a loop. We use __i as an int variable in a loop.
""" """
ctype = gpuarray.dtype_to_ctype(dtype)
return [ return [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount), inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype, ('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){', '; __i+=' + threadCount + '){',
...@@ -181,7 +188,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -181,7 +188,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()', '__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount), inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype, ('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
# divide each exp() result by the sum to complete the job. # divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
...@@ -259,11 +266,12 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, ...@@ -259,11 +266,12 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos)) r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos)) r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
ctype = gpuarray.dtype_to_ctype(dtype)
return """ return """
{ {
// This function trashes buf[1..n_threads], // This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0]. // leaving the reduction result in buf[0].
npy_%(dtype)s red = %(init)s; %(ctype)s red = %(init)s;
#pragma unroll 16 #pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){ for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
red = %(loop_line)s; red = %(loop_line)s;
...@@ -356,6 +364,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -356,6 +364,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
We use tx as an int variable in a loop. We use tx as an int variable in a loop.
""" """
ctype = gpuarray.dtype_to_ctype(dtype)
ret = [ ret = [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x, inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
...@@ -363,7 +372,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -363,7 +372,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
b, stride_b, load_b, b, stride_b, load_b,
dtype), dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype, ('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
threadPos, threadCount, threadPos, threadCount,
...@@ -371,7 +380,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -371,7 +380,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
lambda a: "exp(%s - row_max)" % a, lambda a: "exp(%s - row_max)" % a,
b, stride_b, load_b, dtype), b, stride_b, load_b, dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype, ('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
] ]
......
import os
import numpy import numpy
from theano import Op, Apply, config from theano import Op, Apply, config
...@@ -12,13 +13,14 @@ except ImportError: ...@@ -12,13 +13,14 @@ except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host) host_from_gpu, gpu_from_host,
GpuKernelBase, Kernel)
from .opt import register_opt as register_gpu_opt, op_lifter from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType from .type import GpuArrayType
from .comp import NVCC_compiler from .comp import NVCC_compiler
class GpuImages2Neibs(Images2Neibs, Op): class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
def __init__(self, mode='valid'): def __init__(self, mode='valid'):
if mode not in ['valid', 'ignore_borders', 'wrap_centered']: if mode not in ['valid', 'ignore_borders', 'wrap_centered']:
raise NotImplementedError("Only the mode valid, ignore_borders" raise NotImplementedError("Only the mode valid, ignore_borders"
...@@ -43,25 +45,41 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -43,25 +45,41 @@ class GpuImages2Neibs(Images2Neibs, Op):
dtype=ten4.type.dtype)()]) dtype=ten4.type.dtype)()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (9, 1) return (10,1)
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>'] '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self): def c_header_dirs(self):
return NVCC_compiler if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_init_code(self): def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();'] return ['setup_ext_cuda();']
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype dtype_ten4 = node.inputs[0].dtype
dtype_z = node.outputs[0].dtype dtype_z = node.outputs[0].dtype
flags = Kernel.get_flags(dtype_ten4, dtype_z)
type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
type_z = gpuarray.dtype_to_ctype(dtype_z)
mode = self.mode mode = self.mode
return """ kernels = []
kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename
code = """
//a version that use less register but don't work in all case. //a version that use less register but don't work in all case.
static __global__ void k_multi_warp_less_%(nodename)s( KERNEL void %(kname)s(
const int nb_batch, const int nb_batch,
const int nb_stack, const int nb_stack,
const int height, const int height,
...@@ -72,15 +90,17 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -72,15 +90,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
const int step_y, const int step_y,
const int grid_c, const int grid_c,
const int grid_d, const int grid_d,
const int stride0, const int stride1, const size_t stride0, const size_t stride1,
const int stride2, const int stride3, const size_t stride2, const size_t stride3,
npy_%(dtype_ten4)s * global_ten4, const %(type_ten4)s * global_ten4, const size_t offset_ten4,
const int out_s0, const int out_s1, const size_t out_s0, const size_t out_s1,
npy_%(dtype_z)s * global_out %(type_z)s * global_out, const size_t offset_out
) )
{ {
const int wrap_centered_idx_shift_x = c/2; const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2; const int wrap_centered_idx_shift_y = d/2;
global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z; for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d; tblock<nb_batch*nb_stack*grid_c*grid_d;
...@@ -131,9 +151,22 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -131,9 +151,22 @@ class GpuImages2Neibs(Images2Neibs, Op):
} }
} }
} }
} }""" % locals()
params = [
static __global__ void k_multi_warp_%(nodename)s( 'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename
code = """
KERNEL void %(kname)s(
const int nb_batch, const int nb_batch,
const int nb_stack, const int nb_stack,
const int height, const int height,
...@@ -144,15 +177,17 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -144,15 +177,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
const int step_y, const int step_y,
const int grid_c, const int grid_c,
const int grid_d, const int grid_d,
const int stride0, const int stride1, const size_t stride0, const size_t stride1,
const int stride2, const int stride3, const size_t stride2, const size_t stride3,
npy_%(dtype_ten4)s * global_ten4, const %(type_ten4)s * global_ten4, const size_t offset_ten4,
const int out_s0, const int out_s1, const size_t out_s0, const size_t out_s1,
npy_%(dtype_z)s * global_out %(type_z)s * global_out, const size_t offset_out
) )
{ {
const int wrap_centered_idx_shift_x = c/2; const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2; const int wrap_centered_idx_shift_y = d/2;
global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z; for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d; tblock<nb_batch*nb_stack*grid_c*grid_d;
...@@ -207,6 +242,17 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -207,6 +242,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
} }
} }
""" % locals() """ % locals()
params = [
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
dtype_ten4 = node.inputs[0].dtype dtype_ten4 = node.inputs[0].dtype
...@@ -220,15 +266,21 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -220,15 +266,21 @@ class GpuImages2Neibs(Images2Neibs, Op):
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
mode = self.mode mode = self.mode
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: *fptr: %%s.",
GpuKernel_error(fptr, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(z)s->ga);" % dict(z=z) sync = """
else: err = GpuArray_sync(&%(z)s->ga);
cnda_thread_sync = "" %(err_check)s
""" % locals()
return """ return """
#ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
#endif
int grid_c = -1; int grid_c = -1;
int grid_d = -1; int grid_d = -1;
...@@ -281,10 +333,10 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -281,10 +333,10 @@ class GpuImages2Neibs(Images2Neibs, Op):
PyGpuArray_DIMS(%(ten4)s)[3]); PyGpuArray_DIMS(%(ten4)s)[3]);
%(fail)s; %(fail)s;
} }
grid_c = CEIL_INTDIV(((PyGpuArray_DIMS(%(ten4)s))[2]), grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]),
step_x); (size_t)step_x);
grid_d = CEIL_INTDIV(((PyGpuArray_DIMS(%(ten4)s))[3]), grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]),
step_y); (size_t)step_y);
}else if ( "%(mode)s" == "valid") { }else if ( "%(mode)s" == "valid") {
...@@ -367,75 +419,57 @@ class GpuImages2Neibs(Images2Neibs, Op): ...@@ -367,75 +419,57 @@ class GpuImages2Neibs(Images2Neibs, Op):
const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*) const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
PyArray_GETPTR1(%(neib_step)s, 1); PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(d,c,1); size_t threads_per_block[3] = {d, c, 1};
//Their is a max of 512 threads per blocks //Their is a max of 512 threads per blocks
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--; while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[1]>1)threads_per_block[1]--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--; while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[0]>1)threads_per_block[0]--;
//Make bigger block to have better memory access pattern and //Make bigger block to have better memory access pattern and
//a higher core utilisation. for smaller patch size //a higher core utilisation. for smaller patch size
while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 && while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 &&
n_threads.z<PyGpuArray_DIMS(%(z)s)[0]){ threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){
n_threads.z++; threads_per_block[2]++;
} }
int nb_block; int nb_block;
if (PyGpuArray_DIMS(%(z)s)[0] %% n_threads.z == 0) if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0)
nb_block = PyGpuArray_DIMS(%(z)s)[0] / n_threads.z; nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2];
else else
nb_block = (PyGpuArray_DIMS(%(z)s)[0] / n_threads.z) + 1; nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1;
dim3 n_blocks(std::min(32*1024,nb_block)); size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1};
int n_shared = 0;
void (*f)(int, int, int ,int,
int, int, int ,int,
int, int,
int, int, int, int,
npy_%(dtype_ten4)s*,
int, int,
npy_%(dtype_z)s*);
if(n_threads.x==d && n_threads.y==c){
f = k_multi_warp_less_%(name)s;
}else{
f = k_multi_warp_%(name)s;
}
f<<<n_blocks, n_threads, n_shared>>>( GpuKernel *fptr;
nb_batch, if(threads_per_block[0]==d && threads_per_block[1]==c){
nb_stack, fptr = &k_multi_warp_less_%(name)s;
height, width, }else{
c, d, step_x, step_y, fptr = &k_multi_warp_%(name)s;
grid_c, grid_d,
PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s,
(npy_%(dtype_ten4)s*)(
((char *)cuda_get_ptr(%(ten4)s->ga.data)) +
%(ten4)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s,
(npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset)
);
%(cnda_thread_sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "GpuImages2Neibs:"
" Cuda error: %%s: %%s. (grid: %%i x %%i;"
" block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
n_shared);
%(fail)s;
} }
size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s;
size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s;
size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s;
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s;
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
void *kernel_params[] = {(void *)&nb_batch,
(void *)&nb_stack,
(void *)&height, (void *)&width,
(void *)&c, (void *)&d,
(void *)&step_x, (void *)&step_y,
(void *)&grid_c, (void *)&grid_d,
(void *)&stride_A0,
(void *)&stride_A1,
(void *)&stride_A2,
(void *)&stride_A3,
(void *)%(ten4)s->ga.data,
(void *)&%(ten4)s->ga.offset,
(void *)&stride_Z0,
(void *)&stride_Z1,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset};
int err = GpuKernel_call(fptr, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % locals()
......
from __future__ import print_function from __future__ import print_function
import numpy import numpy
import os
from theano import Op, Apply, config from theano import Op, Apply, config
from six import StringIO from six import StringIO
...@@ -10,16 +11,15 @@ try: ...@@ -10,16 +11,15 @@ try:
except ImportError: except ImportError:
pass pass
from .basic_ops import as_gpuarray_variable from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from .comp import NVCC_compiler
from .type import GpuArrayType from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel, from .kernel_codegen import (nvcc_kernel,
inline_softmax, inline_softmax,
inline_softmax_fixed_shared) inline_softmax_fixed_shared)
from .fp16_help import work_dtype, load_w, write_w from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
""" """
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu. Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
...@@ -41,10 +41,18 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -41,10 +41,18 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
am = y_idx.type() am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am]) return Apply(self, [x, b, y_idx], [nll, sm, am])
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>'] return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
...@@ -54,28 +62,48 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -54,28 +62,48 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
load_b = load_w(dtype_b) load_b = load_w(dtype_b)
write_x = write_w(dtype_x) write_x = write_w(dtype_x)
write_b = write_w(dtype_b) write_b = write_w(dtype_b)
return """ flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
__global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N, type_x = gpuarray.dtype_to_ctype(work_x)
const npy_%(dtype_x)s* x_data, int xs0, int xs1, type_b = gpuarray.dtype_to_ctype(work_b)
const npy_%(dtype_b)s* b, int bs0, type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
const npy_%(dtype_y_idx)s* y_idx_data, int y_idxs0, kname = "k_xent_sm_1hot_bias"
npy_%(dtype_x)s* nll_data, int nlls0, k_var = "k_xent_sm_1hot_bias_" + nodename
npy_%(dtype_x)s* sm_data, int sms0, int sms1, sio = StringIO()
npy_%(dtype_y_idx)s* am_data, int ams0) print("""
KERNEL void %(kname)s(const ga_size M, const ga_size N,
const %(type_x)s* x_data, const ga_size offset_x,
const ga_ssize xs0, const ga_ssize xs1,
const %(type_b)s* b, const ga_size offset_b,
const ga_ssize bs0,
const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
const ga_ssize y_idxs0,
%(type_x)s* nll_data, const ga_size offset_nll,
const ga_ssize nlls0,
%(type_x)s* sm_data, const ga_size offset_sm,
const ga_ssize sms0, const ga_ssize sms1,
%(type_y_idx)s* am_data, const ga_size offset_am,
const ga_ssize ams0)
{ {
x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
b = (const %(type_b)s *)(((char *)b)+offset_b);
y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);
for (int row = blockIdx.x; row < M; row += gridDim.x){ for (int row = blockIdx.x; row < M; row += gridDim.x){
const npy_%(dtype_x)s* x = x_data + xs0 * row; const %(type_x)s* x = x_data + xs0 * row;
const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0]; const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0];
npy_%(dtype_x)s* sm = sm_data + sms0 * row; %(type_x)s* sm = sm_data + sms0 * row;
npy_%(work_x)s sum = 0.0; %(type_x)s sum = 0.0;
int row_max_j = 0; int row_max_j = 0;
npy_%(work_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]); %(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
for (int j = 1; j < N; ++j) for (int j = 1; j < N; ++j)
{ {
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) + %(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]); %(load_b)s(b[j*bs0]);
//todo: store to shared memory //todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j; row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max; row_max = (row_ij > row_max) ? row_ij : row_max;
...@@ -83,16 +111,16 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -83,16 +111,16 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
//compute the exp //compute the exp
for (int j = 0; j < N; ++j) for (int j = 0; j < N; ++j)
{ {
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) + %(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]); %(load_b)s(b[j*bs0]);
npy_%(work_x)s sm_ij = exp(row_ij - row_max); %(type_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij; sum += sm_ij;
sm[j * sms1] = %(write_x)s(sm_ij); sm[j * sms1] = %(write_x)s(sm_ij);
} }
npy_%(work_x)s sum_inv = 1.0 / sum; %(type_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j) for (int j = 0; j < N; ++j)
{ {
npy_%(work_x)s __tmp = %(load_x)s(sm[j * sms1]); %(type_x)s __tmp = %(load_x)s(sm[j * sms1]);
__tmp *= sum_inv; __tmp *= sum_inv;
sm[j * sms1] = %(write_x)s(__tmp); sm[j * sms1] = %(write_x)s(__tmp);
} }
...@@ -111,12 +139,18 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -111,12 +139,18 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
am_data[row*ams0] = row_max_j; am_data[row*ams0] = row_max_j;
} }
} }
""" % locals(), file=sio)
CUdeviceptr (*cuda_get_ptr)(gpudata *g); params = [
""" % locals() 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
def c_init_code(self): gpuarray.GpuArray, 'uintp', 'intp',
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");'] gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype) typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
...@@ -138,6 +172,21 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -138,6 +172,21 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
dtype_am = node.outputs[2].dtype dtype_am = node.outputs[2].dtype
classname = self.__class__.__name__ classname = self.__class__.__name__
fail = sub['fail'] fail = sub['fail']
k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
sio = StringIO() sio = StringIO()
print(""" print("""
if (PyGpuArray_NDIM(%(y_idx)s) != 1) if (PyGpuArray_NDIM(%(y_idx)s) != 1)
...@@ -219,62 +268,47 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -219,62 +268,47 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
} }
} }
{ {
int n_blocks = PyGpuArray_DIMS(%(x)s)[0] < 256 ? PyGpuArray_DIMS(%(x)s)[0] : 256; size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)256), 1, 1};
size_t threads_per_block[3] = {1, 1, 1};
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_NLL0 = PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_AM0 = PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s;
//TODO: launch more threads per row and do parallel sum and max reductions //TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1; void *kernel_params[] = {
int n_shared_bytes = 0; //n_threads * sizeof(dtype); (void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
k_xent_sm_1hot_bias_%(nodename)s<<<n_blocks, n_threads, n_shared_bytes>>>( (void *)&stride_X0, (void *)&stride_X1,
PyGpuArray_DIMS(%(x)s)[0], (void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
PyGpuArray_DIMS(%(x)s)[1], (void *)&stride_B0,
(npy_%(dtype_x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + (void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
%(x)s->ga.offset), (void *)&stride_YIDX0,
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s, (void *)%(nll)s->ga.data, (void *)&%(nll)s->ga.offset,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s, (void *)&stride_NLL0,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) + (void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
%(b)s->ga.offset), (void *)&stride_SM0, (void *)&stride_SM1,
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s, (void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) + (void *)&stride_AM0};
%(y_idx)s->ga.offset), int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s, %(err_check)s
(npy_%(dtype_nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) + %(sync)s
%(nll)s->ga.offset),
PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(npy_%(dtype_am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) +
%(am)s->ga.offset),
PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s);
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %(classname)s %(nodename)s: %%s.\\n"
"The kernel was launched with %%d threads,"
" %%d blocks and %%d shared memory\\n",
cudaGetErrorString(err),
n_threads, n_blocks, n_shared_bytes);
// no need to decref output vars the cleanup code will do it
%(fail)s;
}
} }
""" % locals(), file=sio) """ % locals(), file=sio)
return sio.getvalue() return sio.getvalue()
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (7,)
def c_compiler(self):
return NVCC_compiler
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(Op): class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
""" """
Implement CrossentropySoftmax1HotWithBiasDx on the gpu. Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
...@@ -294,13 +328,18 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -294,13 +328,18 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()]) return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (9,) return (10,)
def c_headers(self): def c_header_dirs(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>'] if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_compiler(self): def c_headers(self):
return NVCC_compiler return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
...@@ -312,20 +351,36 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -312,20 +351,36 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
dtype_sm = node.inputs[1].dtype dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype dtype_dx = node.outputs[0].dtype
type_intp = gpuarray.dtype_to_ctype(numpy.intp)
dnll, sm, y_idx = inp dnll, sm, y_idx = inp
dx, = out dx, = out
fail = sub['fail'] fail = sub['fail']
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return """ return """
// Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar. // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
const npy_intp %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ? const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
PyGpuArray_DIMS(%(dnll)s)[0] : PyGpuArray_DIMS(%(dnll)s)[0] :
(npy_intp) 0); (ssize_t) 0);
// Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
// or a vector with just one element. // or a vector with just one element.
const npy_intp %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ? const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
PyGpuArray_STRIDES(%(dnll)s)[0] : PyGpuArray_STRIDES(%(dnll)s)[0] :
(npy_intp) 0); (ssize_t) 0);
if ((PyGpuArray_NDIM(%(dnll)s) > 1) if ((PyGpuArray_NDIM(%(dnll)s) > 1)
|| (PyGpuArray_NDIM(%(sm)s) != 2) || (PyGpuArray_NDIM(%(sm)s) != 2)
...@@ -373,48 +428,33 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -373,48 +428,33 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
} }
} }
{ {
int n_blocks = PyGpuArray_DIMS(%(dx)s)[0] < 256 ? PyGpuArray_DIMS(%(dx)s)[0] : 256; size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1};
int n_threads = PyGpuArray_DIMS(%(dx)s)[1] < 256 ? PyGpuArray_DIMS(%(dx)s)[1] : 256; size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1};
ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s;
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
<<<n_blocks, n_threads>>>( ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
PyGpuArray_DIMS(%(dx)s)[0], ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
PyGpuArray_DIMS(%(dx)s)[1], ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s;
ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s;
(npy_%(dtype_dnll)s*)(((char *)cuda_get_ptr(%(dnll)s->ga.data)) + void *kernel_params[] = {
%(dnll)s->ga.offset), (void *)&PyGpuArray_DIMS(%(dx)s)[0],
%(dnll)s_strides0 / %(itemsize_dnll)s, (void *)&PyGpuArray_DIMS(%(dx)s)[1],
(void *)%(dnll)s->ga.data, (void *)&%(dnll)s->ga.offset,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) + (void *)&stride_DNLL0,
%(sm)s->ga.offset), (void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s, (void *)&stride_SM0, (void *)&stride_SM1,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s, (void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
(void *)&stride_YIDX0,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) + (void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset,
%(y_idx)s->ga.offset), (void *)&stride_DX0, (void *)&stride_DX1};
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s, int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
(npy_%(dtype_dx)s*)(((char *)cuda_get_ptr(%(dx)s->ga.data)) + %(sync)s
%(dx)s->ga.offset),
PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s,
PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s
);
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n"
"The kernel was launched with %%d threads and"
" %%d blocks\\n",
"kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
cudaGetErrorString(err), n_threads, n_blocks);
%(fail)s;
}
} }
assert(%(dx)s); assert(%(dx)s);
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_dnll = node.inputs[0].dtype dtype_dnll = node.inputs[0].dtype
dtype_sm = node.inputs[1].dtype dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
...@@ -423,18 +463,35 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -423,18 +463,35 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
load_dnll = load_w(dtype_dnll) load_dnll = load_w(dtype_dnll)
load_sm = load_w(dtype_sm) load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx) write_dx = write_w(dtype_dx)
return """ flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
__global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s( type_dnll = gpuarray.dtype_to_ctype(work_dnll)
int N, int K, type_sm = gpuarray.dtype_to_ctype(dtype_sm)
const npy_%(dtype_dnll)s* dnll, const int dnll_s0, type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
const npy_%(dtype_sm)s* sm, const int sm_s0, const int sm_s1, type_dx = gpuarray.dtype_to_ctype(dtype_dx)
const npy_%(dtype_y_idx)s* y_idx, const int y_idx_s0, kname = "kCrossEntropySoftmax1HotWithBiasDx"
npy_%(dtype_dx)s* dx, const int dx_s0, const int dx_s1) k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(
const ga_size N, const ga_size K,
const %(type_dnll)s* dnll, const ga_size offset_dnll,
const ga_ssize dnll_s0,
const %(type_sm)s* sm, const ga_size offset_sm,
const ga_ssize sm_s0, const ga_ssize sm_s1,
const %(type_y_idx)s* y_idx, const ga_size offset_y_idx,
const ga_ssize y_idx_s0,
%(type_dx)s* dx, const ga_size offset_dx,
const ga_ssize dx_s0, const ga_ssize dx_s1)
{ {
dnll = (const %(type_dnll)s *)(((char *)dnll)+offset_dnll);
sm = (const %(type_sm)s *)(((char *)sm)+offset_sm);
y_idx = (const %(type_y_idx)s *)(((char *)y_idx)+offset_y_idx);
dx = (%(type_dx)s *)(((char *)dx)+offset_dx);
for (int i = blockIdx.x; i < N; i += gridDim.x) for (int i = blockIdx.x; i < N; i += gridDim.x)
{ {
npy_%(work_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]); %(type_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0]; %(type_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x) for (int j = threadIdx.x; j < K; j += blockDim.x)
{ {
...@@ -453,17 +510,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -453,17 +510,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
} }
} }
} }
""" % locals(), file=sio)
CUdeviceptr (*cuda_get_ptr)(gpudata *g); params = [
""" % locals() 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp',
def c_init_code(self): gpuarray.GpuArray, 'uintp', 'intp', 'intp',
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");'] gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx() gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax (Op): class GpuSoftmax (GpuKernelBase, Op):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
...@@ -482,12 +543,16 @@ class GpuSoftmax (Op): ...@@ -482,12 +543,16 @@ class GpuSoftmax (Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return (13,) + inline_softmax.code_version return (13,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>'] '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self):
return NVCC_compiler
def c_init_code(self): def c_init_code(self):
return ['setup_ext_cuda();'] return ['setup_ext_cuda();']
...@@ -502,10 +567,21 @@ class GpuSoftmax (Op): ...@@ -502,10 +567,21 @@ class GpuSoftmax (Op):
x, = inp x, = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz) sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
else: else:
cnda_thread_sync = "" sync = ""
return """ return """
if (PyGpuArray_NDIM(%(x)s) != 2) if (PyGpuArray_NDIM(%(x)s) != 2)
{ {
...@@ -528,97 +604,82 @@ class GpuSoftmax (Op): ...@@ -528,97 +604,82 @@ class GpuSoftmax (Op):
} }
} }
{ {
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32 * 1024)), 1, 1};
(size_t)(32 * 1024));
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512); size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512), 1, 1};
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] * size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s); 2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
const char *fmt_str, *msg;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GA_NO_ERROR;
if (PyGpuArray_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
//Those numbers are based on not too recent GPU //Those numbers are based on not too recent GPU
//to make them compatible with more GPU. //to make them compatible with more GPU.
//TODO: read the information from the card. //TODO: read the information from the card.
if(n_shared_bytes < (32 * 1024 - 500)){ if(shmem_sz < (32 * 1024 - 500)){
kSoftmax_%(nodename)s err = GpuKernel_call(&kSoftmax_%(nodename)s, 3,
<<< threads_per_block, n_blocks, shmem_sz,
n_blocks, kernel_params);
n_threads, fmt_str = "gpuarray error: kSoftmax_%(nodename)s: %%s";
n_shared_bytes msg = GpuKernel_error(&kSoftmax_%(nodename)s, err);
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
}else{ }else{
kSoftmax_fixed_shared%(nodename)s err = GpuKernel_call(&kSoftmax_fixed_shared%(nodename)s, 3,
<<< threads_per_block, n_blocks,
n_blocks, threads_per_block[0] * sizeof(npy_%(work_x)s),
n_threads, kernel_params);
n_threads * sizeof(npy_%(work_x)s) fmt_str = "gpuarray error: kSoftmax_fixed_shared%(nodename)s: %%s";
>>>( msg = GpuKernel_error(&kSoftmax_fixed_shared%(nodename)s, err);
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
}
%(cnda_thread_sync)s
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n Used %%d blocks,"
" %%d threads %%d bytes of shared memory",
"kSoftmax[_fixed_shared]%(nodename)s",
cudaGetErrorString(err),
n_blocks, n_threads, n_shared_bytes);
%(fail)s;
} }
%(err_check)s
%(sync)s
} }
} }
assert(%(z)s); assert(%(z)s);
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_sm = node.outputs[0].dtype dtype_sm = node.outputs[0].dtype
load_x = load_w(node.inputs[0].dtype) load_x = load_w(dtype_x)
write_sm = write_w(node.outputs[0].dtype) write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype) work_sm = work_dtype(dtype_sm)
ret1 = nvcc_kernel("kSoftmax_%s" % nodename, flags = Kernel.get_flags(dtype_x, dtype_sm)
params=['int M', 'int N', type_x = gpuarray.dtype_to_ctype(dtype_x)
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', type_sm = gpuarray.dtype_to_ctype(work_sm)
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'], params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
kernels = []
kname = "kSoftmax"
k_var= "kSoftmax_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=[
"extern __shared__ npy_%(work_sm)s buf[]", "extern __shared__ %s buf[]" % type_sm,
"npy_%(work_sm)s * buf2 = buf + N", "%s * buf2 = buf + N" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])", "buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf2[tx] = buf[tx]", "buf2[tx] = buf[tx]",
"}", "}",
"__syncthreads()", "__syncthreads()",
...@@ -626,21 +687,29 @@ class GpuSoftmax (Op): ...@@ -626,21 +687,29 @@ class GpuSoftmax (Op):
'threadIdx.x', 'blockDim.x', work_sm), 'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly # This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])", "sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}", "}",
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename, kernels.append(Kernel(code=code, name=kname, params=params,
params=['int M', 'int N', flags=flags, objvar=k_var))
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', kname = "kSoftmax_fixed_shared"
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'], k_var= "kSoftmax_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=[
"extern __shared__ npy_%(work_sm)s buf[]", "extern __shared__ %s buf[]" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]", "const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]", "%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1', inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x, load_x,
'sm_ptr', 'sm_s1', write_sm, 'sm_ptr', 'sm_s1', write_sm,
...@@ -649,12 +718,14 @@ class GpuSoftmax (Op): ...@@ -649,12 +718,14 @@ class GpuSoftmax (Op):
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
return (ret1 + "\n" + ret2) % locals() kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (Op): class GpuSoftmaxWithBias (GpuKernelBase, Op):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
...@@ -676,12 +747,18 @@ class GpuSoftmaxWithBias (Op): ...@@ -676,12 +747,18 @@ class GpuSoftmaxWithBias (Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return (12,) + inline_softmax.code_version return (12,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>'] '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self):
return NVCC_compiler
def c_init_code(self): def c_init_code(self):
return ['setup_ext_cuda();'] return ['setup_ext_cuda();']
...@@ -698,10 +775,19 @@ class GpuSoftmaxWithBias (Op): ...@@ -698,10 +775,19 @@ class GpuSoftmaxWithBias (Op):
x, b = inp x, b = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync: if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz) sync = """
else: err = GpuArray_sync(&%(z)s->ga);
cnda_thread_sync = "" msg = "sync error";
%(err_check)s
""" % locals()
return """ return """
if (PyGpuArray_NDIM(%(x)s) != 2) if (PyGpuArray_NDIM(%(x)s) != 2)
{ {
...@@ -739,82 +825,51 @@ class GpuSoftmaxWithBias (Op): ...@@ -739,82 +825,51 @@ class GpuSoftmaxWithBias (Op):
} }
} }
{ {
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)); size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1};
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512); size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512), 1, 1};
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] * size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s); 2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
const char *fmt_str, *msg;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
(void *)&stride_B0,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GA_NO_ERROR;
if (PyGpuArray_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
if(n_shared_bytes < (32 * 1024 - 500)){ if(shmem_sz < (32 * 1024 - 500)){
kSoftmaxWithBias_%(nodename)s err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3,
<<< threads_per_block, n_blocks, shmem_sz,
n_blocks, kernel_params);
n_threads, fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s";
n_shared_bytes msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err);
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
}else{ }else{
kSoftmaxWithBias_fixed_shared%(nodename)s err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s,
<<< 3, threads_per_block, n_blocks,
n_blocks, threads_per_block[0] * sizeof(npy_%(work_x)s),
n_threads, kernel_params);
n_threads * sizeof(npy_%(work_x)s) fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s";
>>>( msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err);
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(
((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
} }
%(cnda_thread_sync)s %(err_check)s
cudaError_t err = cudaGetLastError(); %(sync)s
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"kSoftmaxWithBias_%(nodename)s",
cudaGetErrorString(err));
%(fail)s;
}
} }
} }
assert(%(z)s); assert(%(z)s);
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype dtype_b = node.inputs[1].dtype
dtype_sm = node.outputs[0].dtype dtype_sm = node.outputs[0].dtype
...@@ -822,55 +877,80 @@ class GpuSoftmaxWithBias (Op): ...@@ -822,55 +877,80 @@ class GpuSoftmaxWithBias (Op):
load_b = load_w(node.inputs[1].dtype) load_b = load_w(node.inputs[1].dtype)
write_sm = write_w(node.outputs[0].dtype) write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype) work_sm = work_dtype(node.outputs[0].dtype)
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename, flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
params=['int M', 'int N', type_x = gpuarray.dtype_to_ctype(dtype_x)
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', type_b = gpuarray.dtype_to_ctype(dtype_b)
'const npy_%(dtype_b)s * b', 'const int sb0', type_sm = gpuarray.dtype_to_ctype(work_sm)
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'], params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
kernels = []
kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=[
"extern __shared__ npy_%(work_sm)s buf[]", "extern __shared__ %s buf[]" % type_sm,
"npy_%(work_sm)s * buf2 = buf + N", "%s * buf2 = buf + N" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])", "buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf[tx] += %(load_b)s(b[tx * sb0])", "buf[tx] += %s(b[tx * sb0])" % load_b,
"buf2[tx] = buf[tx]", "buf2[tx] = buf[tx]",
"}", "}",
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', work_sm), 'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])", "sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}", "}",
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename, kernels.append(Kernel(code=code, name=kname, params=params,
params=['int M', 'int N', flags=flags, objvar=k_var))
'const npy_%(dtype_x)s * x', kname = "kSoftmaxWithBias_fixed_shared"
'const int sx0', 'const int sx1', k_var = "kSoftmaxWithBias_fixed_shared" + nodename
'const npy_%(dtype_b)s * b', 'const int sb0', code = nvcc_kernel(kname,
'npy_%(dtype_sm)s * sm', params=['const ga_size M', 'const ga_size N',
'const int sm_s0', 'const int sm_s1'], 'const %s * x' % type_x, 'const ga_size offset_x',
body=[ 'const ga_ssize sx0', 'const ga_ssize sx1',
"extern __shared__ npy_%(work_sm)s buf[]", 'const %s * b' % type_b, 'const ga_size offset_b',
"for (int blockIDX = blockIdx.x; blockIDX < M;" 'const ga_ssize sb0',
" blockIDX += gridDim.x){", '%s * sm' % type_sm, 'const ga_size offset_sm',
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]", 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]", body=[
inline_softmax_fixed_shared('N', 'buf', "extern __shared__ %s buf[]" % type_sm,
'x_ptr', 'sx1', "x = (const %s *)(((char *)x)+offset_x)" % type_x,
load_x, "b = (const %s *)(((char *)b)+offset_b)" % type_b,
'sm_ptr', 'sm_s1', "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
write_sm, "for (int blockIDX = blockIdx.x; blockIDX < M;"
'threadIdx.x', " blockIDX += gridDim.x){",
'blockDim.x', "const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
'b', 'sb0', load_b, "%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
work_sm), inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
"__syncthreads()", load_x,
"}", 'sm_ptr', 'sm_s1', write_sm,
]) 'threadIdx.x', 'blockDim.x',
return (ret1 + "\n" + ret2) % locals() 'b', 'sb0', load_b, work_sm),
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
gpu_softmax_with_bias = GpuSoftmaxWithBias() gpu_softmax_with_bias = GpuSoftmaxWithBias()
from __future__ import print_function from __future__ import print_function
import copy import copy
import numpy import numpy
import os
import theano import theano
from theano import tensor, gof, Op from theano import tensor, gof, Op, config
from six.moves import StringIO from six.moves import StringIO
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace import theano.tensor.inplace
...@@ -15,7 +16,7 @@ except ImportError: ...@@ -15,7 +16,7 @@ except ImportError:
pass pass
from .type import GpuArrayType from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable, HideC from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
from .comp import NVCC_compiler from .comp import NVCC_compiler
...@@ -159,7 +160,7 @@ class GpuSubtensor(HideC, Subtensor): ...@@ -159,7 +160,7 @@ class GpuSubtensor(HideC, Subtensor):
return (6,) return (6,)
class GpuIncSubtensor(IncSubtensor): class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
""" """
Implement IncSubtensor on the gpu. Implement IncSubtensor on the gpu.
...@@ -177,6 +178,13 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -177,6 +178,13 @@ class GpuIncSubtensor(IncSubtensor):
def _f16_ok(self): def _f16_ok(self):
return self.iadd_node.op._f16_ok return self.iadd_node.op._f16_ok
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self): def c_headers(self):
return self.iadd_node.op.c_headers() return self.iadd_node.op.c_headers()
...@@ -186,6 +194,10 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -186,6 +194,10 @@ class GpuIncSubtensor(IncSubtensor):
def c_init_code(self): def c_init_code(self):
return self.iadd_node.op.c_init_code() return self.iadd_node.op.c_init_code()
def gpu_kernels(self, node, nodename):
subname = nodename + "_add_to_zview"
return self.iadd_node.op.gpu_kernels(self.iadd_node, subname)
def make_node(self, x, y, *inputs): def make_node(self, x, y, *inputs):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y)
...@@ -486,7 +498,7 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): ...@@ -486,7 +498,7 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
k(x[i], reshaped_y, broadcast=True) k(x[i], reshaped_y, broadcast=True)
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
""" """
Implement AdvancedIncSubtensor1 on the gpu, but use function Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent. only avail on compute capability 2.0 and more recent.
...@@ -525,16 +537,24 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -525,16 +537,24 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (5,)
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>'] '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self): def c_header_dirs(self):
return NVCC_compiler if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_init_code(self): def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();'] return ['setup_ext_cuda();']
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
...@@ -569,7 +589,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -569,7 +589,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
} }
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype dtype_ind = node.inputs[2].dtype
...@@ -578,7 +598,14 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -578,7 +598,14 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
itemsize_y = numpy.dtype(dtype_y).itemsize itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize itemsize_out = numpy.dtype(dtype_out).itemsize
return """ flags=Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind)
type_out = gpuarray.dtype_to_ctype(dtype_out)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """
/* /*
* This is a version of atomicAdd that works for half-floats. It may * This is a version of atomicAdd that works for half-floats. It may
...@@ -587,37 +614,43 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -587,37 +614,43 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
* will not be modified. * will not be modified.
*/ */
__device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) { __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
npy_uint32 *base = (npy_uint32 *)((size_t)addr & ~2); ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
npy_uint32 old, assumed, sum, new_; ga_uint old, assumed, sum, new_;
old = *base; old = *base;
do { do {
assumed = old; assumed = old;
sum = __float2half_rn( sum = __float2half_rn(
__half2float(val) + __half2float(val) +
__half2float((npy_float16)__byte_perm(old, 0, __half2float((ga_half)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410))); ((ga_size)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((size_t)addr & 2) ? 0x5410 : 0x3254); new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_); old = atomicCAS(base, assumed, new_);
} while (assumed != old); } while (assumed != old);
return (npy_float16)__byte_perm(old, 0, return (ga_half)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410); ((ga_size)addr & 2) ? 0x4432 : 0x4410);
} }
__global__ void k_vector_add_fast(int numRowsX, KERNEL void k_vector_add_fast(const ga_size numRowsX,
int numColsX, const ga_size numColsX,
int stridesX0, const ga_ssize stridesX0,
int stridesX1, const ga_ssize stridesX1,
npy_%(dtype_x)s *X, %(type_x)s *X,
int numRowsY, const ga_size offset_X,
int numColsY, const ga_size numRowsY,
int stridesY0, const ga_size numColsY,
int stridesY1, const ga_ssize stridesY0,
npy_%(dtype_y)s *Y, const ga_ssize stridesY1,
int numIndices, %(type_y)s *Y,
int stridesIndices, const ga_size offset_Y,
npy_%(dtype_ind)s *indices_arr) const ga_size numIndices,
const ga_ssize stridesIndices,
%(type_ind)s *indices_arr,
const ga_size offset_indices_arr)
{ {
X = (%(type_x)s *)(((char *)X)+offset_X);
Y = (%(type_y)s *)(((char *)Y)+offset_Y);
indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr);
for (int i = (blockIdx.x); i < numIndices; i += gridDim.x) for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
{ {
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x) for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
...@@ -631,41 +664,71 @@ __device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) { ...@@ -631,41 +664,71 @@ __device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
} }
return; return;
} }
""" % locals()
params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'intp', gpuarray.GpuArray, 'uintp'
]
return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)]
def c_support_code_apply(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = numpy.dtype(dtype_x).itemsize
itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
void GpuArray_vector_add_fast(PyGpuArrayObject* py_self, void GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other, PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr) PyGpuArrayObject *indices_arr)
{ {
int num_threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1], size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
(size_t)256); size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
int num_blocks = std::min(PyGpuArray_SIZE(indices_arr), if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
(size_t)4096); ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s;
dim3 n_blocks(num_blocks); ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s;
dim3 n_threads(num_threads_per_block); ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s;
k_vector_add_fast<<<n_blocks, n_threads>>>( void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
PyGpuArray_DIM(py_self, 0), (void *)&PyGpuArray_DIMS(py_self)[1],
PyGpuArray_DIM(py_self, 1), (void *)&stride_X0,
PyGpuArray_STRIDE(py_self, 0) / %(itemsize_x)s, (void *)&stride_X1,
PyGpuArray_STRIDE(py_self, 1) / %(itemsize_x)s, (void *)py_self->ga.data,
(npy_%(dtype_x)s*)( (void *)&py_self->ga.offset,
((char *)cuda_get_ptr(py_self->ga.data)) + (void *)&PyGpuArray_DIMS(py_other)[0],
py_self->ga.offset), (void *)&PyGpuArray_DIMS(py_other)[1],
PyGpuArray_DIM(py_other, 0), (void *)&stride_Y0,
PyGpuArray_DIM(py_other, 1), (void *)&stride_Y1,
PyGpuArray_DIM(py_other, 0) == 1 ? 0 : PyGpuArray_STRIDE(py_other, 0) / %(itemsize_y)s, (void *)py_other->ga.data,
PyGpuArray_DIM(py_other, 1) == 1 ? 0 : PyGpuArray_STRIDE(py_other, 1) / %(itemsize_y)s, (void *)&py_other->ga.offset,
(npy_%(dtype_x)s*)( (void *)&PyGpuArray_DIMS(indices_arr)[0],
((char *)cuda_get_ptr(py_other->ga.data)) + (void *)&stride_ind,
py_other->ga.offset), (void *)indices_arr->ga.data,
PyGpuArray_DIMS(indices_arr)[0], (void *)&indices_arr->ga.offset};
PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s, int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
(npy_%(dtype_ind)s*)( %(err_check)s
((char *)cuda_get_ptr(indices_arr->ga.data)) + %(sync)s
indices_arr->ga.offset) }
);
return;
} }
""" % locals() """ % locals()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论