Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
670c324b
提交
670c324b
authored
5月 15, 2017
作者:
Adam Becker
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
mixed changes
- fix thread count calculation bug
- turn write_value and write_index into C macros
上级
13095b4b
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
37 行增加
和
33 行删除
+37
-33
sort.py
theano/gpuarray/sort.py
+6
-6
topk_kernel.cu
theano/gpuarray/topk_kernel.cu
+31
-27
没有找到文件。
theano/gpuarray/sort.py
浏览文件 @
670c324b
...
...
@@ -82,8 +82,6 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
set_slice_code
=
''
.
join
(
set_slice_code
%
dict
(
i
=
j
)
for
j
in
range
(
1
,
ndim
))
flags
=
Kernel
.
get_flags
(
node
.
inputs
[
0
]
.
dtype
)
write_value
=
'ptr_at(dstv, out_idx * dstv_strides_0) = xval'
if
self
.
return_values
else
''
write_index
=
'ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx'
if
self
.
return_indices
else
''
subs
=
dict
(
inp_t
=
ga
.
dtype_to_ctype
(
node
.
inputs
[
0
]
.
dtype
),
out_t
=
ga
.
dtype_to_ctype
(
node
.
outputs
[
0
]
.
dtype
),
...
...
@@ -94,8 +92,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
dsti_strides
=
dsti_strides_code
if
self
.
return_indices
else
''
,
src_strides
=
src_strides_code
,
set_slice
=
set_slice_code
,
write_value
=
write_value
,
write_index
=
write_index
,
write_value
=
int
(
self
.
return_values
)
,
write_index
=
int
(
self
.
return_indices
)
,
ndim
=
str
(
ndim
))
# substitute "$" macros in kernel code
...
...
@@ -192,14 +190,16 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
// TODO better scheduling?
size_t blk[6];
size_t *grd = blk+3;
blk[1] = blk[2] = 1;
blk[
0] = blk[
1] = blk[2] = 1;
grd[0] = grd[1] = grd[2] = 1;
// round up to multiples of warp size
blk[0] = ((dims[0] +
%(WARP_SIZE)
d - 1) /
%(WARP_SIZE)
d) *
%(WARP_SIZE)
d;
for(int i=0; i<
%(ndim)
d; ++i) {
if (i!=
%(axis)
d)
grd[0] *= dims[i];
else
blk[0] = dims[i];
}
blk[0] = ((blk[0] +
%(WARP_SIZE)
d - 1) /
%(WARP_SIZE)
d) *
%(WARP_SIZE)
d;
%(def_dvstrides)
s;
%(def_distrides)
s;
...
...
theano/gpuarray/topk_kernel.cu
浏览文件 @
670c324b
...
...
@@ -15,14 +15,14 @@ template <>
struct RadixConfig<float> {
typedef ga_uint RadixType;
static inline
WITHIN_KERNEL
RadixType convert(float v) {
static inline
__device__
RadixType convert(float v) {
RadixType x = __float_as_int(v);
RadixType mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
return (x ^ mask);
}
static inline
WITHIN_KERNEL
float deconvert(RadixType v) {
static inline
__device__
float deconvert(RadixType v) {
RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
return __int_as_float(v ^ mask);
...
...
@@ -30,14 +30,14 @@ struct RadixConfig<float> {
};
template <>
struct RadixConfig<ga_u
char
> {
struct RadixConfig<ga_u
byte
> {
typedef ga_uint RadixType;
static inline
WITHIN_KERNEL RadixType convert(ga_uchar
v) {
static inline
__device__ RadixType convert(ga_ubyte
v) {
return v;
}
static inline
WITHIN_KERNEL ga_uchar
deconvert(RadixType v) {
static inline
__device__ ga_ubyte
deconvert(RadixType v) {
return v;
}
};
...
...
@@ -46,11 +46,11 @@ template <>
struct RadixConfig<char> {
typedef ga_uint RadixType;
static inline
WITHIN_KERNEL
RadixType convert(char v) {
static inline
__device__
RadixType convert(char v) {
return 128u + v;
}
static inline
WITHIN_KERNEL
char deconvert(RadixType v) {
static inline
__device__
char deconvert(RadixType v) {
return v - 128;
}
};
...
...
@@ -59,12 +59,12 @@ template <>
struct RadixConfig<ga_short> {
typedef ga_uint RadixType;
static inline
WITHIN_KERNEL
RadixType convert(ga_short v) {
static inline
__device__
RadixType convert(ga_short v) {
assert(sizeof(ga_short) == 2);
return 32768u + v;
}
static inline
WITHIN_KERNEL
ga_short deconvert(RadixType v) {
static inline
__device__
ga_short deconvert(RadixType v) {
return v - 32768;
}
};
...
...
@@ -73,12 +73,12 @@ template <>
struct RadixConfig<int> {
typedef ga_uint RadixType;
static inline
WITHIN_KERNEL
RadixType convert(int v) {
static inline
__device__
RadixType convert(int v) {
assert(sizeof(int) == 4);
return 2147483648u + v;
}
static inline
WITHIN_KERNEL
int deconvert(RadixType v) {
static inline
__device__
int deconvert(RadixType v) {
return v - 2147483648u;
}
};
...
...
@@ -87,12 +87,12 @@ template <>
struct RadixConfig<long> {
typedef unsigned long long int RadixType;
static inline
WITHIN_KERNEL
RadixType convert(long v) {
static inline
__device__
RadixType convert(long v) {
assert(sizeof(long) == 8);
return 9223372036854775808ull + v;
}
static inline
WITHIN_KERNEL
long deconvert(RadixType v) {
static inline
__device__
long deconvert(RadixType v) {
return v - 9223372036854775808ull;
}
};
...
...
@@ -101,13 +101,13 @@ template <>
struct RadixConfig<double> {
typedef unsigned long long int RadixType;
static inline
WITHIN_KERNEL
RadixType convert(double v) {
static inline
__device__
RadixType convert(double v) {
RadixType x = __double_as_longlong(v);
RadixType mask = -((x >> 63)) | 0x8000000000000000;
return (x ^ mask);
}
static inline
WITHIN_KERNEL
double deconvert(RadixType v) {
static inline
__device__
double deconvert(RadixType v) {
RadixType mask = ((v >> 63) - 1) | 0x8000000000000000;
return __longlong_as_double(v ^ mask);
}
...
...
@@ -118,7 +118,7 @@ template <>
struct RadixConfig<half> {
typedef ga_uint RadixType;
static inline
WITHIN_KERNEL
RadixType convert(half v) {
static inline
__device__
RadixType convert(half v) {
#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
RadixType x = __half_as_ushort(v);
RadixType mask = -((x >> 15)) | 0x8000;
...
...
@@ -129,7 +129,7 @@ struct RadixConfig<half> {
#endif
}
static inline
WITHIN_KERNEL
half deconvert(RadixType v) {
static inline
__device__
half deconvert(RadixType v) {
#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
RadixType mask = ((v >> 15) - 1) | 0x8000;
return __ushort_as_half(v ^ mask);
...
...
@@ -152,13 +152,15 @@ struct RadixConfig<half> {
#define RADIX_MASK(n) ((RADIX_SIZE-1) << (n*RADIX_BITS))
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
#define radix_t RadixConfig<INPUT_TYPE>::RadixType
#define WRITE_VALUE $write_value
#define WRITE_INDEX $write_index
#if RADIX_SIZE >
GA_WARP_SIZE
#error "RADIX_SIZE must be smaller than warp size"
#if RADIX_SIZE >
32
#error "RADIX_SIZE must be smaller than warp size
(32)
"
#endif
template <typename T>
static inline
WITHIN_KERNEL
T binary_cumsum(
static inline
__device__
T binary_cumsum(
int idx, int warp_id, int lane_id, T* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *no greater than* the current thread
...
...
@@ -194,7 +196,7 @@ static inline WITHIN_KERNEL T binary_cumsum(
}
template <typename T>
static inline
WITHIN_KERNEL
T binary_cumsum_exclusive(
static inline
__device__
T binary_cumsum_exclusive(
int idx, int warp_id, int lane_id, T* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *less than* the current thread
...
...
@@ -230,13 +232,13 @@ static inline WITHIN_KERNEL T binary_cumsum_exclusive(
// apply raw(byte) offset to pointer
template <typename T>
static
WITHIN_KERNEL
inline T* ptr_add(T *ptr, ga_ssize offset) {
static
__device__
inline T* ptr_add(T *ptr, ga_ssize offset) {
return (T*)((char*)ptr + offset);
}
// get array element using raw(byte) offset
template <typename T>
static
WITHIN_KERNEL
inline T& ptr_at(T *ptr, ga_ssize offset) {
static
__device__
inline T& ptr_at(T *ptr, ga_ssize offset) {
return *((T*)((char*)ptr + offset));
}
...
...
@@ -366,9 +368,11 @@ KERNEL void k_topk_dense(
local_barrier();
if (is_topk) {
$write_value;
// ptr_at(dstv, out_idx * dstv_strides_0) = xval;
$write_index;
// ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx;
#if WRITE_VALUE == 1
ptr_at(dstv, out_idx * dstv_strides_0) = xval;
#endif
#if WRITE_INDEX == 1
ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx;
#endif
}
}
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论