提交 48d1e683 authored 作者: Adam Becker's avatar Adam Becker

mixed changes / fixes

上级 a3624d6f
...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, ...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor, GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined) reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg, ctc from . import fft, dnn, opt, extra_ops, multinomial, reduction, sort, rng_mrg, ctc
def transfer(x, target): def transfer(x, target):
......
...@@ -48,6 +48,9 @@ POSSIBILITY OF SUCH DAMAGE. ...@@ -48,6 +48,9 @@ POSSIBILITY OF SUCH DAMAGE.
#endif #endif
typedef ptrdiff_t ssize_t;
__device__ __forceinline__ int lane_id() { __device__ __forceinline__ int lane_id() {
int id; int id;
asm("mov.s32 %0, %laneid;" : "=r"(id) ); asm("mov.s32 %0, %laneid;" : "=r"(id) );
......
...@@ -7,7 +7,7 @@ from theano.tensor import as_tensor_variable ...@@ -7,7 +7,7 @@ from theano.tensor import as_tensor_variable
from theano.tensor.sort import TopKOp from theano.tensor.sort import TopKOp
from .basic_ops import (GpuKernelBase, Kernel, infer_context_name, from .basic_ops import (GpuKernelBase, Kernel, infer_context_name,
as_gpuarray_variable) as_gpuarray_variable, gpuarray_helper_inc_dir)
from .opt import register_opt, op_lifter, register_opt2 from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType from .type import GpuArrayType
...@@ -19,8 +19,6 @@ except ImportError as e: ...@@ -19,8 +19,6 @@ except ImportError as e:
pass pass
# TODO GPU sort / argsort # TODO GPU sort / argsort
# TODO support when k >= 2^31
class GpuTopKOp(GpuKernelBase, TopKOp): class GpuTopKOp(GpuKernelBase, TopKOp):
''' '''
...@@ -49,7 +47,10 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -49,7 +47,10 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h'] return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']
def c_header_dirs(self): def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include()] return [
os.path.dirname(__file__),
gpuarray_helper_inc_dir(),
pygpu.get_include()]
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (1,)
...@@ -129,17 +130,18 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -129,17 +130,18 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
subs['count_t'] = 'int' subs['count_t'] = 'int'
kernels.append( kernels.append(
build_kernel('topk_dense' + kernel_ext, 'k_topk_dense', subs)) build_kernel('topk_dense' + kernel_ext, 'k_topk_dense', subs))
subs['kname'] = 'topk_dense_large' subs['kname'] = 'k_topk_dense_large'
kernels.append( kernels.append(
build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_large', subs)) build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_large', subs))
subs['count_t'] = 'long long' subs['count_t'] = 'long long'
subs['kname'] = 'topk_dense_xlarge' subs['kname'] = 'k_topk_dense_xlarge'
kernels.append( kernels.append(
build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_xlarge', subs)) build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_xlarge', subs))
return kernels return kernels
def c_code(self, node, nodename, inps, outs, sub): def c_code(self, node, nodename, inps, outs, sub):
if node.inputs[0].type.context.kind != b'cuda': context = node.inputs[0].type.context
if context.kind != b'cuda':
raise NotImplementedError( raise NotImplementedError(
'%s: We only have CUDA ' '%s: We only have CUDA '
'implementation so far.' % self.__class__.__name__) 'implementation so far.' % self.__class__.__name__)
...@@ -156,7 +158,10 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -156,7 +158,10 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
fail = sub['fail'] fail = sub['fail']
ctx = sub['params'] ctx = sub['params']
k_dtype = node.inputs[1].type.dtype_specs()[1] k_dtype = node.inputs[1].type.dtype_specs()[1]
MAX_TPB = 1024 # max threads per block # max threads per block
MAX_TPB = context.maxlsize
# max blocks per grid
MAX_BPG = context.maxgsize0
WARP_SIZE = 32 WARP_SIZE = 32
ndim = node.inputs[0].ndim ndim = node.inputs[0].ndim
...@@ -215,18 +220,22 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -215,18 +220,22 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
} }
%(prep_output)s %(prep_output)s
size_t blk[6]; size_t grid_size=1, block_size=1;
size_t *grd = blk+3;
blk[0] = blk[1] = blk[2] = 1;
grd[0] = grd[1] = grd[2] = 1;
for (int i=0; i<%(ndim)d; ++i) { for (int i=0; i<%(ndim)d; ++i) {
if (i!=%(axis)d) if (i!=%(axis)d)
grd[0] *= dims[i]; grid_size *= dims[i];
else else
blk[0] = dims[i]; block_size = dims[i];
} }
// round up to multiples of warp size // round up to multiples of warp size
blk[0] = ((blk[0] + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d; block_size = ((block_size + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;
if (grid_size > %(MAX_BPG)d) {
PyErr_SetString(
PyExc_ValueError,
"topk: too many slices to work with, expected <= %(MAX_BPG)d");
%(fail)s;
}
%(def_dvstrides)s; %(def_dvstrides)s;
%(def_distrides)s; %(def_distrides)s;
...@@ -242,25 +251,20 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -242,25 +251,20 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
}; };
int err; int err;
if (dims[%(axis)d] > PY_SSIZE_T_MAX) { if (dims[%(axis)d] > (1u << 31)) {
PyErr_SetString( block_size = %(MAX_TPB)d;
PyExc_ValueError,
"topk: array size on specified axis is too large, should be less than PY_SSIZE_T_MAX.");
%(fail)s;
} else if (dims[%(axis)d] > (1u << 31)) {
blk[0] = %(MAX_TPB)d;
err = GpuKernel_call( err = GpuKernel_call(
&k_topk_dense_xlarge%(nodename)s, 3, &k_topk_dense_xlarge%(nodename)s, 1,
grd, blk, 0, args); &grid_size, &block_size, 0, args);
} else if (blk[0] > %(MAX_TPB)d) { } else if (block_size > %(MAX_TPB)d) {
blk[0] = %(MAX_TPB)d; block_size = %(MAX_TPB)d;
err = GpuKernel_call( err = GpuKernel_call(
&k_topk_dense_large%(nodename)s, 3, &k_topk_dense_large%(nodename)s, 1,
grd, blk, 0, args); &grid_size, &block_size, 0, args);
} else { } else {
err = GpuKernel_call( err = GpuKernel_call(
&k_topk_dense%(nodename)s, 3, &k_topk_dense%(nodename)s, 1,
grd, blk, 0, args); &grid_size, &block_size, 0, args);
} }
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_SetString( PyErr_SetString(
...@@ -295,9 +299,6 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -295,9 +299,6 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
@op_lifter([TopKOp], cuda_only=True) @op_lifter([TopKOp], cuda_only=True)
@register_opt2([TopKOp], 'fast_compile') @register_opt2([TopKOp], 'fast_compile')
def local_gpua_topkop(op, ctx_name, inputs, outputs): def local_gpua_topkop(op, ctx_name, inputs, outputs):
if isinstance(op, GpuTopKOp):
return False
axis = op.axis axis = op.axis
rv = op.return_values rv = op.return_values
ri = op.return_indices ri = op.return_indices
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论