提交 dcd37993 authored 作者: Frederic Bastien's avatar Frederic Bastien 提交者: notoraptor

Partial update to use the new libgpuarray interface

上级 d28490dc
...@@ -9,14 +9,19 @@ extern "C" __global__ void k_topk_dense( ...@@ -9,14 +9,19 @@ extern "C" __global__ void k_topk_dense(
// size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM} // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
$dstv $dstv
// INPUT_TYPE *dstv // INPUT_TYPE *dstv
$dstv_offset
// size_t offset
$dstv_strides $dstv_strides
// ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM} // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
$dsti $dsti
// INDEX_TYPE *dsti // INDEX_TYPE *dsti
$dsti_offset
// size_t offset
$dsti_strides $dsti_strides
// ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM} // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
ssize_t k, ssize_t k,
INPUT_TYPE* src, INPUT_TYPE* src,
size_t src_offset
$src_strides $src_strides
// ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM} // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
size_t size) { size_t size) {
...@@ -28,6 +33,9 @@ extern "C" __global__ void k_topk_dense( ...@@ -28,6 +33,9 @@ extern "C" __global__ void k_topk_dense(
size_t out_idx; size_t out_idx;
const unsigned char warp_id = idx / GA_WARP_SIZE; const unsigned char warp_id = idx / GA_WARP_SIZE;
dstv = ptr_add(dstv, dstv_offset);
dsti = ptr_add(dsti, dsti_offset);
src = ptr_add(src, src_offset);
// 0. get the slice for thread block to work on // 0. get the slice for thread block to work on
......
...@@ -199,14 +199,19 @@ extern "C" __global__ void KERNEL_NAME( ...@@ -199,14 +199,19 @@ extern "C" __global__ void KERNEL_NAME(
// size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM} // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
$dstv $dstv
// INPUT_TYPE *dstv // INPUT_TYPE *dstv
$dstv_offset
// size_t offset
$dstv_strides $dstv_strides
// ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM} // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
$dsti $dsti
// INDEX_TYPE *dsti // INDEX_TYPE *dsti
$dsti_offset
// size_t offset
$dsti_strides $dsti_strides
// ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM} // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
ssize_t k, ssize_t k,
INPUT_TYPE* src, INPUT_TYPE* src,
size_t src_offset
$src_strides $src_strides
// ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM} // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
size_t size) { size_t size) {
...@@ -217,6 +222,9 @@ extern "C" __global__ void KERNEL_NAME( ...@@ -217,6 +222,9 @@ extern "C" __global__ void KERNEL_NAME(
k = (order ? k : -k); k = (order ? k : -k);
const int idx = threadIdx.x; const int idx = threadIdx.x;
const int warp_id = idx / GA_WARP_SIZE; const int warp_id = idx / GA_WARP_SIZE;
dstv = ptr_add(dstv, dstv_offset);
dsti = ptr_add(dsti, dsti_offset);
src = ptr_add(src, src_offset);
// get the slice for thread block to work on // get the slice for thread block to work on
// size <- the axis to work on // size <- the axis to work on
......
...@@ -53,7 +53,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -53,7 +53,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
pygpu.get_include()] pygpu.get_include()]
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (3,)
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
# load kernel source # load kernel source
...@@ -83,7 +83,9 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -83,7 +83,9 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
out_t=ga.dtype_to_ctype(self.idx_dtype), out_t=ga.dtype_to_ctype(self.idx_dtype),
dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)), dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)),
dstv='INPUT_TYPE *dstv,' if self.return_values else '', dstv='INPUT_TYPE *dstv,' if self.return_values else '',
dstv_offset='size_t dstv_offset,' if self.return_values else '',
dsti='INDEX_TYPE *dsti,' if self.return_indices else '', dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
dsti_offset='size_t dsti_offset,' if self.return_indices else '',
dstv_strides=dstv_strides_code if self.return_values else '', dstv_strides=dstv_strides_code if self.return_values else '',
dsti_strides=dsti_strides_code if self.return_indices else '', dsti_strides=dsti_strides_code if self.return_indices else '',
src_strides=src_strides_code, src_strides=src_strides_code,
...@@ -100,9 +102,11 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -100,9 +102,11 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
param_types = [ga.SIZE] * (ndim - 1) # dims param_types = [ga.SIZE] * (ndim - 1) # dims
for _ in range(self.return_values + self.return_indices): for _ in range(self.return_values + self.return_indices):
param_types.append(ga.GpuArray) # dst* param_types.append(ga.GpuArray) # dst*
param_types.extend([ga.SIZE] * ndim) # offset
param_types.extend([ga.SSIZE] * ndim) # dst*_strides param_types.extend([ga.SSIZE] * ndim) # dst*_strides
param_types.append(ga.SIZE) # k param_types.append(ga.SIZE) # k
param_types.append(ga.GpuArray) # src param_types.append(ga.GpuArray) # src
param_types.append(ga.SIZE) # offset
param_types.extend([ga.SSIZE] * ndim) # src_strides param_types.extend([ga.SSIZE] * ndim) # src_strides
param_types.append(ga.SIZE) # size param_types.append(ga.SIZE) # size
...@@ -174,7 +178,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -174,7 +178,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
prep_output = '' prep_output = ''
if self.return_values: if self.return_values:
def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv
params_dv = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % (yv, yv) params_dv = '(void*)((char*)(%s->ga.data), (%s->ga.offset)),\n' % (yv, yv)
params_dv += ''.join('(void*)(dvstrides+%d), ' % i for i in reordered_axes) params_dv += ''.join('(void*)(dvstrides+%d), ' % i for i in reordered_axes)
prep_output += ''' prep_output += '''
if (0 != theano_prep_output( if (0 != theano_prep_output(
...@@ -187,7 +191,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -187,7 +191,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
if self.return_indices: if self.return_indices:
def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi
params_di = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % yi params_di = '(void*)((char*)(%s->ga.data), (%s->ga.offset)),\n' % (yi, yi)
params_di += ''.join('(void*)(distrides+%d), ' % i for i in reordered_axes) params_di += ''.join('(void*)(distrides+%d), ' % i for i in reordered_axes)
prep_output += ''' prep_output += '''
if (0 != theano_prep_output( if (0 != theano_prep_output(
...@@ -246,7 +250,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -246,7 +250,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
%(params_dv)s %(params_dv)s
%(params_di)s %(params_di)s
(void*)(&k_), (void*)(&k_),
(void*)((char*)(%(x)s->ga.data) + (%(x)s->ga.offset)), (void*)((char*)(%(x)s->ga.data), (%(x)s->ga.offset)),
%(sstrides)s, %(sstrides)s,
(void*)(dims+%(axis)d), (void*)(dims+%(axis)d),
}; };
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论