提交 c89cab31 authored 作者: Adam Becker's avatar Adam Becker

mixed changes

- add float16 support - fix 64 bit input bug - fix int8 bug due to integer overflow - more checking on k - add license from pytorch
上级 98c9ead1
// modified from pytorch
// https://github.com/pytorch/pytorch/blob/master/torch/lib/THC/THCTensorTopK.cuh
//
// Converts a type (maybe float) to an integer representation with the same
// sorting; i.e., for floats f1, f2:
// if f1 < f2 then convert(f1) < convert(f2)
// We use this to enable radix selection of floating-point values.
// This also gives a relative order for NaNs, but that's ok, as they
// will all be adjacent
// original license below:
/*
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#if __CUDA_ARCH__ < 350
#define __ldg(ptr) (*(ptr))
......@@ -15,7 +50,13 @@
template <typename T>
struct RadixConfig {
typedef T RadixType;
// Converts a type (maybe float) to an integer representation with the same
// sorting; i.e., for floats f1, f2:
// if f1 < f2 then convert(f1) < convert(f2)
// We use this to enable radix selection of floating-point values.
// This also gives a relative order for NaNs, but that's ok, as they
// will all be adjacent
typedef T RadixType;
static inline __device__ RadixType convert(T v) {
return v;
}
......@@ -45,7 +86,7 @@ struct RadixConfig<ga_float> {
template <>
struct RadixConfig<ga_double> {
typedef unsigned long long int RadixType;
typedef ga_ulong RadixType;
static inline __device__ RadixType convert(ga_double v) {
RadixType x = __double_as_longlong(v);
......@@ -60,22 +101,9 @@ struct RadixConfig<ga_double> {
};
template <>
struct RadixConfig<ga_ubyte> {
typedef ga_uint RadixType;
// Unsigned bytes already compare the same way as their integer bit
// patterns, so the radix representation is just the value itself.
static inline __device__ RadixType convert(ga_ubyte x) {
return (RadixType)x;
}
// Inverse of convert(): truncate the radix word back to a byte.
static inline __device__ ga_ubyte deconvert(RadixType x) {
return (ga_ubyte)x;
}
};
template <>
struct RadixConfig<ga_byte> {
typedef ga_uint RadixType;
typedef ga_ubyte RadixType;
static inline __device__ RadixType convert(ga_byte v) {
return 128u + v;
......@@ -88,7 +116,7 @@ struct RadixConfig<ga_byte> {
template <>
struct RadixConfig<ga_short> {
typedef ga_uint RadixType;
typedef ga_ushort RadixType;
static inline __device__ RadixType convert(ga_short v) {
assert(sizeof(ga_short) == 2);
......@@ -101,61 +129,52 @@ struct RadixConfig<ga_short> {
};
template <>
struct RadixConfig<int> {
struct RadixConfig<ga_int> {
typedef ga_uint RadixType;
static inline __device__ RadixType convert(int v) {
static inline __device__ RadixType convert(ga_int v) {
assert(sizeof(int) == 4);
return (1u << 31) ^ v;
return 2147483648u + v;
}
static inline __device__ int deconvert(RadixType v) {
return (1u << 31) ^ v;
static inline __device__ ga_int deconvert(RadixType v) {
return v - 2147483648u;
}
};
template <>
struct RadixConfig<ga_long> {
typedef unsigned long long int RadixType;
typedef ga_ulong RadixType;
static inline __device__ RadixType convert(ga_long v) {
assert(sizeof(long) == 8);
return (1ull << 63) ^ v;
assert(sizeof(ga_long) == 8);
return 9223372036854775808ull + v;
}
static inline __device__ ga_long deconvert(RadixType v) {
return (1ull << 63) ^ v;
static inline __device__ long long deconvert(RadixType v) {
return v - 9223372036854775808ull;
}
};
#ifdef USE_HALF
// TODO: make this work
#define USE_HALF $use_half
#if USE_HALF == 1
// since ga_half is ushort, use macro to protect this part is necessary
template <>
struct RadixConfig<half> {
typedef ga_uint RadixType;
struct RadixConfig<ga_half> {
typedef ga_ushort RadixType;
static inline __device__ RadixType convert(half v) {
#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
RadixType x = __half_as_ushort(v);
RadixType mask = -((x >> 15)) | 0x8000;
return (x ^ mask);
#else
assert(false);
return 0u;
#endif
static inline __device__ RadixType convert(ga_half v) {
RadixType mask = -(((RadixType)v >> 15)) | 0x8000;
return (v ^ mask);
}
static inline __device__ half deconvert(RadixType v) {
#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
static inline __device__ ga_half deconvert(RadixType v) {
RadixType mask = ((v >> 15) - 1) | 0x8000;
return __ushort_as_half(v ^ mask);
#else
assert(false);
return ScalarConvert<int, half>::to(0);
#endif
return (ga_half)(v ^ mask);
}
};
#endif
#endif // USE_HALF
// $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it
......@@ -175,16 +194,15 @@ struct RadixConfig<half> {
#error "RADIX_SIZE must be smaller than warp size (32)"
#endif
template <typename T>
static inline __device__ T binary_cumsum(
int idx, int warp_id, int lane_id, T* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
static inline __device__ ga_size binary_cumsum(
int idx, int warp_id, int lane_id, ga_size* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *no greater than* the current thread
// binary_cumsum(1, 0, 1, 0, 1) -> (1, 1, 2, 2, 3)
// cumsum within warp
ga_uint warp_bits = __ballot(value);
T warp_sum = __popc(((2<<lane_id)-1) & warp_bits);
ga_size warp_sum = __popc(((2<<lane_id)-1) & warp_bits);
if (lane_id == 0)
smem[warp_id] = __popc(warp_bits);
......@@ -195,7 +213,7 @@ static inline __device__ T binary_cumsum(
if (idx == 0) {
int current = 0;
for (int i = 0; i < LDIM_0 / GA_WARP_SIZE; ++i) {
T v = smem[i];
ga_size v = smem[i];
smem[i] = smem[i]+current;
current = current+v;
}
......@@ -211,16 +229,15 @@ static inline __device__ T binary_cumsum(
return warp_sum;
}
template <typename T>
static inline __device__ T binary_cumsum_exclusive(
int idx, int warp_id, int lane_id, T* smem, bool value) {
static inline __device__ ga_size binary_cumsum_exclusive(
int idx, int warp_id, int lane_id, ga_size* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *less than* the current thread
// binary_cumsum(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)
// binary_cumsum_excl(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)
// cumsum within warp
ga_uint warp_bits = __ballot(value);
T warp_sum = __popc(((1<<lane_id)-1) & warp_bits);
ga_size warp_sum = __popc(((1<<lane_id)-1) & warp_bits);
if (lane_id == 0)
smem[warp_id] = __popc(warp_bits);
......@@ -231,7 +248,7 @@ static inline __device__ T binary_cumsum_exclusive(
if (idx == 0) {
int current = 0;
for (int i = 0; i < LDIM_0 / GA_WARP_SIZE; ++i) {
T v = smem[i];
ga_size v = smem[i];
smem[i] = smem[i]+current;
current = current+v;
}
......
......@@ -15,10 +15,10 @@ KERNEL void k_topk_dense(
$src_strides
// ga_ssize src_strides_0, ga_ssize src_strides_1, ... , src_strides_$${NDIM}
ga_size size) {
LOCAL_MEM radix_t smem[32 * RADIX_SIZE];
LOCAL_MEM ga_size smem[32 * RADIX_SIZE];
ga_ssize LOCAL_MEM bins[RADIX_SIZE+1]; // TODO: does using 32-bit gives good speedup?
bool is_topk=true, is_topkth=true;
radix_t out_idx;
ga_size out_idx;
const ga_ushort idx = LID_0;
ga_size LOCAL_MEM k2, exceed;
......@@ -118,12 +118,12 @@ KERNEL void k_topk_dense(
if (exceed != 0) {
// top_kth value may not be unique, so we need to
// perform binary cumsum on is_topkth to drop exceeding top-kth values
out_idx = binary_cumsum_exclusive<radix_t>(idx, warp_id, lane_id, smem, is_topkth);
out_idx = binary_cumsum_exclusive(idx, warp_id, lane_id, smem, is_topkth);
is_topk &= ((!is_topkth) || out_idx>=exceed);
}
// perform binary cumsum on is_topk to determine the indices to put result
out_idx = binary_cumsum_exclusive<radix_t>(idx, warp_id, lane_id, smem, is_topk);
out_idx = binary_cumsum_exclusive(idx, warp_id, lane_id, smem, is_topk);
local_barrier();
if (is_topk) {
......
......@@ -15,9 +15,9 @@ KERNEL void k_topk_dense_large(
$src_strides
// ga_ssize src_strides_0, ga_ssize src_strides_1, ... , src_strides_$${NDIM}
ga_size size, ga_ushort inp_per_thread) {
LOCAL_MEM radix_t smem[32 * RADIX_SIZE];
LOCAL_MEM ga_size smem[32 * RADIX_SIZE];
LOCAL_MEM radix_t known_bits, known_bits_mask;
radix_t out_idx;
ga_size out_idx;
ga_size LOCAL_MEM write_base;
INPUT_TYPE xval;
radix_t x;
......@@ -109,21 +109,13 @@ KERNEL void k_topk_dense_large(
local_barrier();
}
/*
if (idx < RADIX_SIZE) {
long long int xxx = 1337;
ptr_at(dstv, idx*dstv_strides_0) = xxx;
}
return;
*/
// 2. write values smaller than top-kth
for (i=0; i<inp_per_thread; ++i) {
in_range = (idx*inp_per_thread+i) < size;
xval = in_range ? ptr_read(src, i*src_strides_0) : (INPUT_TYPE)0;
x = inv_bits ^ RadixConfig<INPUT_TYPE>::convert(xval);
is_topk = (x > known_bits) && in_range;
out_idx = binary_cumsum<radix_t>(idx, warp_id, lane_id, smem, is_topk);
out_idx = binary_cumsum(idx, warp_id, lane_id, smem, is_topk);
if (is_topk) {
#if WRITE_VALUE == 1
ptr_at(dstv, (out_idx+write_base-1) * dstv_strides_0) = xval;
......@@ -144,7 +136,7 @@ KERNEL void k_topk_dense_large(
xval = in_range ? ptr_read(src, i*src_strides_0) : (INPUT_TYPE)0;
x = inv_bits ^ RadixConfig<INPUT_TYPE>::convert(xval);
is_topk = (x == known_bits) && in_range;
out_idx = binary_cumsum<radix_t>(idx, warp_id, lane_id, smem, is_topk);
out_idx = binary_cumsum(idx, warp_id, lane_id, smem, is_topk);
is_topk = ((out_idx+write_base) <= abs(k)) && is_topk;
if (is_topk) {
#if WRITE_VALUE == 1
......
......@@ -19,12 +19,14 @@ except ImportError as e:
# To make sure theano is importable
pass
# TODO sort / argsort
# TODO add runtime opt, if k==1, use max/min reduce
# also if k is axis size, just copy input tensor
# TODO add opt to merge argtopk / topk, or split topk_and_argtopk when only
# one result is needed
# TODO add grad
# TODO sort / argsort
class GpuTopKOp(GpuKernelBase, TopKOp):
'''
......@@ -32,6 +34,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
'''
__props__ = TopKOp.__props__
_f16_ok = True
def __init__(self, axis=-1, return_values=True, return_indices=False, idx_dtype='int64'):
GpuKernelBase.__init__(self)
......@@ -99,7 +102,9 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
set_slice=set_slice_code,
write_value=int(self.return_values),
write_index=int(self.return_indices),
ndim=str(ndim))
ndim=str(ndim),
use_half=int(node.inputs[0].dtype == 'float16')
)
elif device_type == b'opencl':
raise NotImplementedError()
......@@ -201,6 +206,11 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
PyExc_ValueError,
"topk: k must not be zero");
%(fail)s;
} else if (dims[%(axis)d] < odims[%(axis)d]){
PyErr_SetString(
PyExc_ValueError,
"topk: k cannot larger than size on specified axis %(axis)d");
%(fail)s;
}
%(prep_output)s
......@@ -236,7 +246,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
int err;
if (blk[0] > %(MAX_TPB)d) {
// CUDA_OUT_OF_RESOURCE if a max sized block is used
// LAUNCH_OUT_OF_RESOURCE if a 1024 sized block is used
blk[0] = %(MAX_TPB)d / 2;
err = GpuKernel_call(
&k_topk_dense_large_%(nodename)s, 3,
......@@ -291,5 +301,8 @@ def local_gpua_topkop(op, ctx_name, inputs, outputs):
x = as_gpuarray_variable(x, ctx_name)
rets = GpuTopKOp(
axis=axis, return_values=rv, return_indices=ri, idx_dtype=op.idx_dtype)(x, k)
axis=axis,
return_values=rv,
return_indices=ri,
idx_dtype=op.idx_dtype)(x, k)
return rets
......@@ -426,6 +426,10 @@ def topk(x, k, axis=-1):
Upon which axis shall the operation be performed on. If ``None``,
works on flattened array.
Returns
-------
Tensor variable with same dtype as `x`.
Notes
-----
- The returned values may not be sorted.
......@@ -456,6 +460,10 @@ def argtopk(x, k, axis=-1, idx_dtype='int64'):
idx_dtype: string
Specify output dtype, defaults to ``int64``, must be integer type.
Returns
-------
Tensor variable with dtype specified in `idx_dtype`.
Notes
-----
- The corresponding values of returned indices may not be sorted.
......
......@@ -275,7 +275,7 @@ class Test_TopK(unittest.TestCase):
product(
(16, 61, 257),
(1, -1, 10, -10, 'n//2', 'n-1', '-n', '1-n'),
('float64', 'int16', 'int8')),
('float64', 'float16', 'int16', 'int8')),
((2049, 1337, 'float64'),)))
def test_topk_1d(self, size, k, dtype):
if isinstance(k, str):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论