Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
c89cab31
提交
c89cab31
authored
5月 18, 2017
作者:
Adam Becker
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
mixed changes
- add float16 suppport - fix 64 bit input bug - fix int8 bug due to integer overflow - more checking on k - add license from pytorch
上级
98c9ead1
隐藏空白字符变更
内嵌
并排
正在显示
6 个修改的文件
包含
118 行增加
和
88 行删除
+118
-88
k_topk_common.cuh
theano/gpuarray/k_topk_common.cuh
+84
-67
k_topk_dense.cu
theano/gpuarray/k_topk_dense.cu
+4
-4
k_topk_dense_large.cu
theano/gpuarray/k_topk_dense_large.cu
+4
-12
sort.py
theano/gpuarray/sort.py
+17
-4
sort.py
theano/tensor/sort.py
+8
-0
test_sort.py
theano/tensor/tests/test_sort.py
+1
-1
没有找到文件。
theano/gpuarray/k_topk_common.cuh
浏览文件 @
c89cab31
// modified from pytorch
// https://github.com/pytorch/pytorch/master/blob/torch/lib/THC/THCTensorTopK.cuh
//
// Converts a type (maybe float) to an integer representation with the same
// sorting; i.e., for floats f1, f2:
// if f1 < f2 then convert(f1) < convert(f2)
// We use this to enable radix selection of floating-point values.
// This also gives a relative order for NaNs, but that's ok, as they
// will all be adjacent
// original license below:
/*
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#if __CUDA_ARCH__ < 350
#define __ldg(ptr) (*(ptr))
...
...
@@ -15,7 +50,13 @@
template <typename T>
struct RadixConfig {
typedef T RadixType;
// Converts a type (maybe float) to an integer representation with the same
// sorting; i.e., for floats f1, f2:
// if f1 < f2 then convert(f1) < convert(f2)
// We use this to enable radix selection of floating-point values.
// This also gives a relative order for NaNs, but that's ok, as they
// will all be adjacent
typedef T RadixType;
static inline __device__ RadixType convert(T v) {
return v;
}
...
...
@@ -45,7 +86,7 @@ struct RadixConfig<ga_float> {
template <>
struct RadixConfig<ga_double> {
typedef
unsigned long long int
RadixType;
typedef
ga_ulong
RadixType;
static inline __device__ RadixType convert(ga_double v) {
RadixType x = __double_as_longlong(v);
...
...
@@ -60,22 +101,9 @@ struct RadixConfig<ga_double> {
};
template <>
struct RadixConfig<ga_ubyte> {
typedef ga_uint RadixType;
static inline __device__ RadixType convert(ga_ubyte v) {
return v;
}
static inline __device__ ga_ubyte deconvert(RadixType v) {
return v;
}
};
template <>
struct RadixConfig<ga_byte> {
typedef ga_u
int
RadixType;
typedef ga_u
byte
RadixType;
static inline __device__ RadixType convert(ga_byte v) {
return 128u + v;
...
...
@@ -88,7 +116,7 @@ struct RadixConfig<ga_byte> {
template <>
struct RadixConfig<ga_short> {
typedef ga_u
in
t RadixType;
typedef ga_u
shor
t RadixType;
static inline __device__ RadixType convert(ga_short v) {
assert(sizeof(ga_short) == 2);
...
...
@@ -101,61 +129,52 @@ struct RadixConfig<ga_short> {
};
template <>
struct RadixConfig<int> {
struct RadixConfig<
ga_
int> {
typedef ga_uint RadixType;
static inline __device__ RadixType convert(int v) {
static inline __device__ RadixType convert(
ga_
int v) {
assert(sizeof(int) == 4);
return
(1u << 31) ^
v;
return
2147483648u +
v;
}
static inline __device__ int deconvert(RadixType v) {
return
(1u << 31) ^ v
;
static inline __device__
ga_
int deconvert(RadixType v) {
return
v - 2147483648u
;
}
};
template <>
struct RadixConfig<ga_long> {
typedef
unsigned long long int
RadixType;
typedef
ga_ulong
RadixType;
static inline __device__ RadixType convert(ga_long v) {
assert(sizeof(long) == 8);
return
(1ull << 63) ^
v;
assert(sizeof(
ga_
long) == 8);
return
9223372036854775808ull +
v;
}
static inline __device__
ga_
long deconvert(RadixType v) {
return
(1ull << 63) ^ v
;
static inline __device__
long
long deconvert(RadixType v) {
return
v - 9223372036854775808ull
;
}
};
#ifdef USE_HALF
// TODO: make this work
#define USE_HALF $use_half
#if USE_HALF == 1
// since ga_half is ushort, use macro to protect this part is necessary
template <>
struct RadixConfig<half> {
typedef ga_u
in
t RadixType;
struct RadixConfig<
ga_
half> {
typedef ga_u
shor
t RadixType;
static inline __device__ RadixType convert(half v) {
#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
RadixType x = __half_as_ushort(v);
RadixType mask = -((x >> 15)) | 0x8000;
return (x ^ mask);
#else
assert(false);
return 0u;
#endif
static inline __device__ RadixType convert(ga_half v) {
RadixType mask = -(((RadixType)v >> 15)) | 0x8000;
return (v ^ mask);
}
static inline __device__ half deconvert(RadixType v) {
#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
static inline __device__ ga_half deconvert(RadixType v) {
RadixType mask = ((v >> 15) - 1) | 0x8000;
return __ushort_as_half(v ^ mask);
#else
assert(false);
return ScalarConvert<int, half>::to(0);
#endif
return (ga_half)(v ^ mask);
}
};
#endif
#endif
// USE_HALF
// $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it
...
...
@@ -175,16 +194,15 @@ struct RadixConfig<half> {
#error "RADIX_SIZE must be smaller than warp size (32)"
#endif
template <typename T>
static inline __device__ T binary_cumsum(
int idx, int warp_id, int lane_id, T* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
static inline __device__ ga_size binary_cumsum(
int idx, int warp_id, int lane_id, ga_size* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *no greater than* the current thread
// binary_cumsum(1, 0, 1, 0, 1) -> (1, 1, 2, 2, 3)
// cumsum within warp
ga_uint warp_bits = __ballot(value);
T
warp_sum = __popc(((2<<lane_id)-1) & warp_bits);
ga_size
warp_sum = __popc(((2<<lane_id)-1) & warp_bits);
if (lane_id == 0)
smem[warp_id] = __popc(warp_bits);
...
...
@@ -195,7 +213,7 @@ static inline __device__ T binary_cumsum(
if (idx == 0) {
int current = 0;
for (int i = 0; i < LDIM_0 / GA_WARP_SIZE; ++i) {
T
v = smem[i];
ga_size
v = smem[i];
smem[i] = smem[i]+current;
current = current+v;
}
...
...
@@ -211,16 +229,15 @@ static inline __device__ T binary_cumsum(
return warp_sum;
}
template <typename T>
static inline __device__ T binary_cumsum_exclusive(
int idx, int warp_id, int lane_id, T* smem, bool value) {
static inline __device__ ga_size binary_cumsum_exclusive(
int idx, int warp_id, int lane_id, ga_size* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *less than* the current thread
// binary_cumsum(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)
// binary_cumsum
_excl
(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)
// cumsum within warp
ga_uint warp_bits = __ballot(value);
T
warp_sum = __popc(((1<<lane_id)-1) & warp_bits);
ga_size
warp_sum = __popc(((1<<lane_id)-1) & warp_bits);
if (lane_id == 0)
smem[warp_id] = __popc(warp_bits);
...
...
@@ -231,7 +248,7 @@ static inline __device__ T binary_cumsum_exclusive(
if (idx == 0) {
int current = 0;
for (int i = 0; i < LDIM_0 / GA_WARP_SIZE; ++i) {
T
v = smem[i];
ga_size
v = smem[i];
smem[i] = smem[i]+current;
current = current+v;
}
...
...
theano/gpuarray/k_topk_dense.cu
浏览文件 @
c89cab31
...
...
@@ -15,10 +15,10 @@ KERNEL void k_topk_dense(
$src_strides
// ga_ssize src_strides_0, ga_ssize src_strides_1, ... , src_strides_$${NDIM}
ga_size size) {
LOCAL_MEM
radix_t
smem[32 * RADIX_SIZE];
LOCAL_MEM
ga_size
smem[32 * RADIX_SIZE];
ga_ssize LOCAL_MEM bins[RADIX_SIZE+1]; // TODO: does using 32-bit gives good speedup?
bool is_topk=true, is_topkth=true;
radix_t
out_idx;
ga_size
out_idx;
const ga_ushort idx = LID_0;
ga_size LOCAL_MEM k2, exceed;
...
...
@@ -118,12 +118,12 @@ KERNEL void k_topk_dense(
if (exceed != 0) {
// top_kth value may not be unique, so we need to
// perform binary cumsum on is_topkth to drop exceeding top-kth values
out_idx = binary_cumsum_exclusive
<radix_t>
(idx, warp_id, lane_id, smem, is_topkth);
out_idx = binary_cumsum_exclusive(idx, warp_id, lane_id, smem, is_topkth);
is_topk &= ((!is_topkth) || out_idx>=exceed);
}
// perform binary cumsum on is_topk to determine the indices to put result
out_idx = binary_cumsum_exclusive
<radix_t>
(idx, warp_id, lane_id, smem, is_topk);
out_idx = binary_cumsum_exclusive(idx, warp_id, lane_id, smem, is_topk);
local_barrier();
if (is_topk) {
...
...
theano/gpuarray/k_topk_dense_large.cu
浏览文件 @
c89cab31
...
...
@@ -15,9 +15,9 @@ KERNEL void k_topk_dense_large(
$src_strides
// ga_ssize src_strides_0, ga_ssize src_strides_1, ... , src_strides_$${NDIM}
ga_size size, ga_ushort inp_per_thread) {
LOCAL_MEM
radix_t
smem[32 * RADIX_SIZE];
LOCAL_MEM
ga_size
smem[32 * RADIX_SIZE];
LOCAL_MEM radix_t known_bits, known_bits_mask;
radix_t
out_idx;
ga_size
out_idx;
ga_size LOCAL_MEM write_base;
INPUT_TYPE xval;
radix_t x;
...
...
@@ -109,21 +109,13 @@ KERNEL void k_topk_dense_large(
local_barrier();
}
/*
if (idx < RADIX_SIZE) {
long long int xxx = 1337;
ptr_at(dstv, idx*dstv_strides_0) = xxx;
}
return;
*/
// 2. write values smaller than top-kth
for (i=0; i<inp_per_thread; ++i) {
in_range = (idx*inp_per_thread+i) < size;
xval = in_range ? ptr_read(src, i*src_strides_0) : (INPUT_TYPE)0;
x = inv_bits ^ RadixConfig<INPUT_TYPE>::convert(xval);
is_topk = (x > known_bits) && in_range;
out_idx = binary_cumsum
<radix_t>
(idx, warp_id, lane_id, smem, is_topk);
out_idx = binary_cumsum(idx, warp_id, lane_id, smem, is_topk);
if (is_topk) {
#if WRITE_VALUE == 1
ptr_at(dstv, (out_idx+write_base-1) * dstv_strides_0) = xval;
...
...
@@ -144,7 +136,7 @@ KERNEL void k_topk_dense_large(
xval = in_range ? ptr_read(src, i*src_strides_0) : (INPUT_TYPE)0;
x = inv_bits ^ RadixConfig<INPUT_TYPE>::convert(xval);
is_topk = (x == known_bits) && in_range;
out_idx = binary_cumsum
<radix_t>
(idx, warp_id, lane_id, smem, is_topk);
out_idx = binary_cumsum(idx, warp_id, lane_id, smem, is_topk);
is_topk = ((out_idx+write_base) <= abs(k)) && is_topk;
if (is_topk) {
#if WRITE_VALUE == 1
...
...
theano/gpuarray/sort.py
浏览文件 @
c89cab31
...
...
@@ -19,12 +19,14 @@ except ImportError as e:
# To make sure theano is importable
pass
# TODO sort / argsort
# TODO add runtime opt, if k==1, use max/min reduce
# also if k is axis size, just copy input tensor
# TODO add opt to merge argtopk / topk, or split topk_and_argtopk when only
# one result is needed
# TODO add grad
# TODO sort / argsort
class
GpuTopKOp
(
GpuKernelBase
,
TopKOp
):
'''
...
...
@@ -32,6 +34,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
'''
__props__
=
TopKOp
.
__props__
_f16_ok
=
True
def
__init__
(
self
,
axis
=-
1
,
return_values
=
True
,
return_indices
=
False
,
idx_dtype
=
'int64'
):
GpuKernelBase
.
__init__
(
self
)
...
...
@@ -99,7 +102,9 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
set_slice
=
set_slice_code
,
write_value
=
int
(
self
.
return_values
),
write_index
=
int
(
self
.
return_indices
),
ndim
=
str
(
ndim
))
ndim
=
str
(
ndim
),
use_half
=
int
(
node
.
inputs
[
0
]
.
dtype
==
'float16'
)
)
elif
device_type
==
b
'opencl'
:
raise
NotImplementedError
()
...
...
@@ -201,6 +206,11 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
PyExc_ValueError,
"topk: k must not be zero");
%(fail)
s;
} else if (dims[
%(axis)
d] < odims[
%(axis)
d]){
PyErr_SetString(
PyExc_ValueError,
"topk: k cannot larger than size on specified axis
%(axis)
d");
%(fail)
s;
}
%(prep_output)
s
...
...
@@ -236,7 +246,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
int err;
if (blk[0] >
%(MAX_TPB)
d) {
//
CUDA_OUT_OF_RESOURCE if a max
sized block is used
//
LAUNCH_OUT_OF_RESOURCE if a 1024
sized block is used
blk[0] =
%(MAX_TPB)
d / 2;
err = GpuKernel_call(
&k_topk_dense_large_
%(nodename)
s, 3,
...
...
@@ -291,5 +301,8 @@ def local_gpua_topkop(op, ctx_name, inputs, outputs):
x
=
as_gpuarray_variable
(
x
,
ctx_name
)
rets
=
GpuTopKOp
(
axis
=
axis
,
return_values
=
rv
,
return_indices
=
ri
,
idx_dtype
=
op
.
idx_dtype
)(
x
,
k
)
axis
=
axis
,
return_values
=
rv
,
return_indices
=
ri
,
idx_dtype
=
op
.
idx_dtype
)(
x
,
k
)
return
rets
theano/tensor/sort.py
浏览文件 @
c89cab31
...
...
@@ -426,6 +426,10 @@ def topk(x, k, axis=-1):
Upon which axis shall the operation be performed on. If ``None``,
works on flattened array.
Returns
-------
Tensor variable with same dtype as `x`.
Notes
-----
- The returned values may not be sorted.
...
...
@@ -456,6 +460,10 @@ def argtopk(x, k, axis=-1, idx_dtype='int64'):
idx_dtype: string
Specify output dtype, defaults to ``int64``, must be integer type.
Returns
-------
Tensor variable with dtype specified in `idx_dtype`.
Notes
-----
- The corresponding values of returned indices may not be sorted.
...
...
theano/tensor/tests/test_sort.py
浏览文件 @
c89cab31
...
...
@@ -275,7 +275,7 @@ class Test_TopK(unittest.TestCase):
product
(
(
16
,
61
,
257
),
(
1
,
-
1
,
10
,
-
10
,
'n//2'
,
'n-1'
,
'-n'
,
'1-n'
),
(
'float64'
,
'int16'
,
'int8'
)),
(
'float64'
,
'
float16'
,
'
int16'
,
'int8'
)),
((
2049
,
1337
,
'float64'
),)))
def
test_topk_1d
(
self
,
size
,
k
,
dtype
):
if
isinstance
(
k
,
str
):
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论