Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
5cc9c326
提交
5cc9c326
authored
9月 21, 2017
作者:
abergeron
提交者:
GitHub
9月 21, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #5959 from khaotik/topk
TopKOp implementation
上级
0d47e204
61de1573
隐藏空白字符变更
内嵌
并排
正在显示
11 个修改的文件
包含
1771 行增加
和
38 行删除
+1771
-38
opt.py
theano/gof/opt.py
+14
-8
__init__.py
theano/gpuarray/__init__.py
+1
-1
topk_common.cuh
theano/gpuarray/c_code/topk_common.cuh
+358
-0
topk_dense.cu
theano/gpuarray/c_code/topk_dense.cu
+133
-0
topk_dense_large.cu
theano/gpuarray/c_code/topk_dense_large.cu
+309
-0
sort.py
theano/gpuarray/sort.py
+315
-0
__init__.py
theano/tensor/__init__.py
+1
-1
opt.py
theano/tensor/opt.py
+33
-0
sort.py
theano/tensor/sort.py
+356
-6
test_sort.py
theano/tensor/tests/test_sort.py
+250
-21
unittest_tools.py
theano/tests/unittest_tools.py
+1
-1
没有找到文件。
theano/gof/opt.py
浏览文件 @
5cc9c326
...
@@ -1367,21 +1367,27 @@ class LocalOptGroup(LocalOptimizer):
...
@@ -1367,21 +1367,27 @@ class LocalOptGroup(LocalOptimizer):
self
.
process_count
[
opt
]
+=
1
self
.
process_count
[
opt
]
+=
1
if
not
new_repl
:
if
not
new_repl
:
continue
continue
else
:
if
isinstance
(
new_repl
,
(
tuple
,
list
)):
if
self
.
profile
:
new_vars
=
new_repl
self
.
node_created
[
opt
]
+=
len
(
graph
.
ops
(
fgraph
.
variables
,
new_repl
))
else
:
# It must be a dict
self
.
applied_true
[
opt
]
+=
1
new_vars
=
list
(
new_repl
.
values
())
break
# break from the for loop over optimization.
if
self
.
profile
:
self
.
node_created
[
opt
]
+=
len
(
graph
.
ops
(
fgraph
.
variables
,
new_vars
))
self
.
applied_true
[
opt
]
+=
1
break
# break from the for loop over optimization.
if
not
new_repl
:
# No optimization applied in the last iteration
if
not
new_repl
:
# No optimization applied in the last iteration
return
repl
return
repl
# only 1 iteration or we are at the start of the graph.
# only 1 iteration
if
not
self
.
apply_all_opts
or
not
new_repl
[
0
]
.
owner
:
if
not
self
.
apply_all_opts
:
return
new_repl
if
not
new_vars
[
0
]
.
owner
:
# We are at the start of the graph.
return
new_repl
return
new_repl
if
len
(
new_repl
)
>
1
:
if
len
(
new_repl
)
>
1
:
s
=
set
([
v
.
owner
for
v
in
new_repl
])
s
=
set
([
v
.
owner
for
v
in
new_repl
])
assert
len
(
s
)
==
1
assert
len
(
s
)
==
1
repl
=
new_repl
repl
=
new_repl
node
=
repl
[
0
]
.
owner
node
=
new_vars
[
0
]
.
owner
@staticmethod
@staticmethod
def
print_profile
(
stream
,
prof
,
level
=
0
):
def
print_profile
(
stream
,
prof
,
level
=
0
):
...
...
theano/gpuarray/__init__.py
浏览文件 @
5cc9c326
...
@@ -28,7 +28,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
...
@@ -28,7 +28,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable
,
gpuarray_shared_constructor
,
GpuArraySharedVariable
,
gpuarray_shared_constructor
,
reg_context
,
get_context
,
ContextNotDefined
)
reg_context
,
get_context
,
ContextNotDefined
)
from
.basic_ops
import
as_gpuarray_variable
from
.basic_ops
import
as_gpuarray_variable
from
.
import
fft
,
dnn
,
opt
,
extra_ops
,
multinomial
,
reduction
,
rng_mrg
,
ctc
from
.
import
fft
,
dnn
,
opt
,
extra_ops
,
multinomial
,
reduction
,
sort
,
rng_mrg
,
ctc
def
transfer
(
x
,
target
):
def
transfer
(
x
,
target
):
...
...
theano/gpuarray/c_code/topk_common.cuh
0 → 100644
浏览文件 @
5cc9c326
// modified from pytorch
// https://github.com/pytorch/pytorch/master/blob/torch/lib/THC/THCTensorTopK.cuh
// original license below:
/*
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
// __ldg (load via read-only data cache) only exists on sm_35 and newer;
// degrade to a plain dereference on older architectures.
#if __CUDA_ARCH__ < 350
#define __ldg(ptr) (*(ptr))
#endif

// ssize_t is POSIX, not C/C++; define it for the kernel translation unit.
// NOTE(review): presumably the gpuarray compilation environment does not
// provide it — confirm this does not clash on platforms that do.
typedef ptrdiff_t ssize_t;
// Return this thread's lane index within its warp (0..31), read from the
// PTX %laneid special register.
__device__ __forceinline__ int lane_id() {
  int id;
  asm("mov.s32 %0, %laneid;" : "=r"(id) );
  return id;
}
// Bitmask with a 1 for every lane strictly below the calling lane
// (PTX %lanemask_lt special register).
__device__ __forceinline__ unsigned lane_mask_lt() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
  return mask;
}
// Bitmask with a 1 for every lane at or below the calling lane
// (PTX %lanemask_le special register).
__device__ __forceinline__ unsigned lane_mask_le() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
  return mask;
}
// Bitmask with a 1 for every lane strictly above the calling lane
// (PTX %lanemask_gt special register).
__device__ __forceinline__ unsigned lane_mask_gt() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
  return mask;
}
// Bitmask with a 1 for every lane at or above the calling lane
// (PTX %lanemask_ge special register).
__device__ __forceinline__ unsigned lane_mask_ge() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
  return mask;
}
// Bit-field extract/insert helpers built on the PTX bfe/bfi instructions.
// get(val, pos, len): extract `len` bits of `val` starting at bit `pos`.
// set(val, toInsert, pos, len): return `val` with those bits replaced by
// the low `len` bits of `toInsert`. Only the 32- and 64-bit unsigned
// specializations below are usable; the primary template is empty.
template <typename T>
struct Bitfield {};

template <>
struct Bitfield<unsigned int> {
  static __device__ __forceinline__
  unsigned int get(unsigned int val, int pos, int len) {
    unsigned int ret;
    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
    return ret;
  }

  static __device__ __forceinline__
  unsigned int set(unsigned int val, unsigned int toInsert, int pos, int len) {
    unsigned int ret;
    asm("bfi.b32 %0, %1, %2, %3, %4;" :
        "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
    return ret;
  }
};

template <>
struct Bitfield<unsigned long long int> {
  static __device__ __forceinline__
  unsigned long long int get(unsigned long long int val, int pos, int len) {
    unsigned long long int ret;
    asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
    return ret;
  }

  static __device__ __forceinline__
  unsigned long long int set(unsigned long long int val, unsigned long long int toInsert, int pos, int len) {
    unsigned long long int ret;
    asm("bfi.b64 %0, %1, %2, %3, %4;" :
        "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
    return ret;
  }
};
template <typename T>
struct RadixConfig {
  // Converts a type (maybe float) to an integer representation with the same
  // sorting; i.e., for floats f1, f2:
  //   if f1 < f2 then convert(f1) < convert(f2)
  // We use this to enable radix selection of floating-point values.
  // This also gives a relative order for NaNs, but that's ok, as they
  // will all be adjacent
  typedef unsigned int RadixType;
  static inline __device__ RadixType convert(T v) {
    return (RadixType)v;
  }
  // BUG FIX: the fallback deconvert() was declared to return `float`
  // regardless of T, so any type relying on this primary template would be
  // silently converted through float. deconvert must return T so that
  // deconvert(convert(v)) round-trips (all specializations below do so).
  static inline __device__ T deconvert(RadixType v) {
    return (T)v;
  }
};
// Order-preserving bijection float <-> uint32: flip all bits of negative
// floats, flip only the sign bit of non-negative ones; unsigned comparison
// of the result then matches IEEE-754 total ordering.
template <>
struct RadixConfig<float> {
  typedef unsigned int RadixType;

  static inline __device__ RadixType convert(float v) {
    RadixType x = __float_as_int(v);
    RadixType mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
    return (x ^ mask);
  }

  static inline __device__ float deconvert(RadixType v) {
    // inverse mapping: sign bit set means the value was non-negative
    RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
    return __int_as_float(v ^ mask);
  }
};
// Same order-preserving trick as the float specialization, on 64 bits;
// the mask is derived branchlessly from the sign bit.
template <>
struct RadixConfig<double> {
  typedef unsigned long long RadixType;

  static inline __device__ RadixType convert(double v) {
    RadixType x = __double_as_longlong(v);
    RadixType mask = -((x >> 63)) | 0x8000000000000000;
    return (x ^ mask);
  }

  static inline __device__ double deconvert(RadixType v) {
    RadixType mask = ((v >> 63) - 1) | 0x8000000000000000;
    return __longlong_as_double(v ^ mask);
  }
};
// Signed char -> unsigned by shifting the range: -128..127 maps to 0..255,
// preserving order.
template <>
struct RadixConfig<char> {
  typedef unsigned int RadixType;

  static inline __device__ RadixType convert(char v) {
    return 128u + v;
  }

  static inline __device__ char deconvert(RadixType v) {
    return v - 128;
  }
};
// Signed short -> unsigned by flipping the sign bit. XOR with 0x8000 equals
// adding 32768 modulo 2^16; only the low 16 bits are ever scanned by the
// radix loop (bitsof(short) == 16), so the high bits of the promoted int do
// not matter. NOTE(review): convert() uses XOR while deconvert() uses
// subtraction — equivalent on the low 16 bits, but asymmetric; confirm no
// caller inspects the upper 16 bits.
template <>
struct RadixConfig<short> {
  typedef unsigned int RadixType;

  static inline __device__ RadixType convert(short v) {
    assert(sizeof(short) == 2);
    return 32768u ^ v;
  }

  static inline __device__ short deconvert(RadixType v) {
    return v - 32768;
  }
};
// Signed int -> unsigned by shifting the range (add 2^31), preserving order.
template <>
struct RadixConfig<int> {
  typedef unsigned int RadixType;

  static inline __device__ RadixType convert(int v) {
    assert(sizeof(int) == 4);
    return 2147483648u + v;
  }

  static inline __device__ int deconvert(RadixType v) {
    return v - 2147483648u;
  }
};
// Signed long long -> unsigned 64-bit by shifting the range (add 2^63),
// preserving order.
template <>
struct RadixConfig<long long> {
  typedef unsigned long long RadixType;

  static inline __device__ RadixType convert(long long v) {
    assert(sizeof(long long) == 8);
    return 9223372036854775808ull + v;
  }

  static inline __device__ long long deconvert(RadixType v) {
    return v - 9223372036854775808ull;
  }
};
// $use_half is substituted by the host code (1 iff the input dtype is
// float16, which is stored as unsigned short on the gpuarray side).
#define USE_HALF $use_half

#if USE_HALF == 1
// since half is ushort, using macro to protect this part is necessary
// (otherwise this specialization would clash with genuine ushort data).
// Same sign-flip trick as float/double, applied to the 16-bit IEEE layout.
template <>
struct RadixConfig<unsigned short> {
  typedef unsigned int RadixType;

  static inline __device__ RadixType convert(unsigned short v) {
    RadixType mask = -(((RadixType)v >> 15)) | 0x8000;
    return (v ^ mask);
  }

  static inline __device__ unsigned short deconvert(RadixType v) {
    RadixType mask = ((v >> 15) - 1) | 0x8000;
    return (unsigned short)(v ^ mask);
  }
};
#endif // USE_HALF
// $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it
// The following $-placeholders are filled in by GpuTopKOp.gpu_kernels via
// string.Template before compilation.
#define NDIM $ndim
#define INPUT_TYPE $inp_t
#define INDEX_TYPE $out_t
#define bitsof(T) (sizeof(T)*8)
#define radix_t RadixConfig<INPUT_TYPE>::RadixType
#define WRITE_VALUE $write_value
#define WRITE_INDEX $write_index

// NOTE(review): RADIX_SIZE is only #defined in the kernel source that is
// appended *after* this header, so it is undefined (0) here and this guard
// can never fire — confirm whether it should move into the .cu files.
#if RADIX_SIZE > 32
#error "RADIX_SIZE must be smaller than warp size (32)"
#endif
// CUDA provides no signed 64-bit atomicAdd overload; reinterpret both sides
// as unsigned 64-bit (two's-complement addition is bit-identical).
void __device__ atomicAdd(long long *dst, long long &src) {
  atomicAdd(
    reinterpret_cast<unsigned long long*>(dst),
    reinterpret_cast<unsigned long long&>(src));
}
// Inclusive cumulative sum of a per-thread boolean over a 1D thread block.
// `smem` needs one slot per warp; `idx` is threadIdx.x, `warp_id` the
// warp index of the calling thread.
// NOTE(review): the cross-warp loop iterates blockDim.x / GA_WARP_SIZE
// times — assumes blockDim.x is a multiple of the warp size (the host code
// rounds block sizes up to a warp multiple); confirm for any new caller.
template <typename T>
static inline __device__ T binary_cumsum(
    int idx, int warp_id, T* smem, bool value) {
  // cumsum within 1D thread block, which adds up `value` of all threads
  // whose id is *no greater than* the current thread
  // binary_cumsum(1, 0, 1, 0, 1) -> (1, 1, 2, 2, 3)

  // cumsum within warp: ballot collects the warp's bits, popc of the
  // lanes at-or-below this one gives the inclusive intra-warp sum
  unsigned int warp_bits = __ballot(value);
  T warp_sum = __popc(lane_mask_le() & warp_bits);

  if (lane_id() == 0)
    smem[warp_id] = __popc(warp_bits);

  local_barrier();

  // cumsum across warps in one thread
  if (idx == 0) {
    T sum = smem[0];
    for (int i = 1; i < blockDim.x / GA_WARP_SIZE; ++i) {
      sum += smem[i];
      smem[i] = sum;
    }
  }

  local_barrier();

  // load the carry from the preceding warp
  if (warp_id >= 1) {
    warp_sum = warp_sum+smem[warp_id - 1];
  }

  return warp_sum;
}
// Exclusive variant of binary_cumsum: counts `value` of all threads with a
// strictly smaller id (uses lane_mask_lt instead of lane_mask_le). After the
// call, smem[nwarps-1] holds the total count over the whole block.
template <typename T>
static inline __device__ T binary_cumsum_exclusive(
    int idx, int warp_id, T* smem, bool value) {
  // cumsum within 1D thread block, which adds up `value` of all threads
  // whose id is *less than* the current thread
  // binary_cumsum_excl(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)

  // cumsum within warp
  unsigned int warp_bits = __ballot(value);
  T warp_sum = __popc(lane_mask_lt() & warp_bits);

  if (lane_id() == 0)
    smem[warp_id] = __popc(warp_bits);

  local_barrier();

  // cumsum across warps in one thread
  if (idx == 0) {
    T sum = smem[0];
    for (int i = 1; i < blockDim.x / GA_WARP_SIZE; ++i) {
      sum += smem[i];
      smem[i] = sum;
    }
  }

  local_barrier();

  // load the carry from the preceding warp
  if (warp_id >= 1)
    warp_sum += smem[warp_id - 1];

  return warp_sum;
}
// apply raw(byte) offset to pointer (strides from gpuarray are in bytes,
// not elements, hence the char* round-trip)
template <typename T>
static __device__ inline T* ptr_add(T *ptr, ssize_t offset) {
  return (T*)((char*)ptr + offset);
}
// get array element using raw(byte) offset; returns an lvalue reference so
// it can appear on either side of an assignment
template <typename T>
static __device__ inline T& ptr_at(T *ptr, ssize_t offset) {
  return *((T*)((char*)ptr + offset));
}
// read array element using raw(byte) offset, going through the read-only
// data cache (__ldg; plain load on pre-sm_35, see the #define above)
template <typename T>
static __device__ inline T ptr_read_cached(T *ptr, ssize_t offset) {
  return __ldg(((T*)((char*)ptr + offset)));
}
theano/gpuarray/c_code/topk_dense.cu
0 → 100644
浏览文件 @
5cc9c326
#define RADIX_BITS 4
#define RADIX_SIZE (1<<RADIX_BITS)
#define RADIX_MASK(n) ((RADIX_SIZE-1) << (n*RADIX_BITS))
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)

// works when length on axis is within max allowed threads in block (1024)
//
// One thread block processes one slice along the top-k axis; each thread
// owns exactly one input element (threadIdx.x == position on the axis).
// A most-significant-first radix scan narrows down which elements belong
// to the top k, then exclusive cumsums compact them into the output.
// Negative k selects the k smallest (by bit-flipping the radix form).
KERNEL void k_topk_dense(
    $dims
    // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
    $dstv
    // INPUT_TYPE *dstv
    $dstv_strides
    // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
    $dsti
    // INDEX_TYPE *dsti
    $dsti_strides
    // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
    ssize_t k,
    INPUT_TYPE* src,
    $src_strides
    // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
    size_t size) {
  // per-warp radix histograms (up to 32 warps x RADIX_SIZE bins);
  // also reused as scratch by binary_cumsum_exclusive below
  __shared__ int smem[32 * RADIX_SIZE];
  // running count of top-k slots still to be resolved
  __shared__ int k2;
  const unsigned int idx = threadIdx.x;
  bool is_topk = (idx < size);    // candidate for the top-k set
  bool is_topkth = is_topk;       // candidate for the k-th (boundary) value
  size_t out_idx;
  const unsigned char warp_id = idx / GA_WARP_SIZE;

  // 0. get the slice for thread block to work on
  size_t gid = blockIdx.x, gidx;
  $set_slice
  // $$set_slice expands into:
  //for(int i=1; i<NDIM; i++) {
  //  gidx = gid % dims_$${i};
  //  gid /= dims_$${i};
  //  dsti = ptr_add(dsti, gidx*dsti_strides_$${i};
  //  dstv = ptr_add(dstv, gidx*dstv_strides_$${i};
  //  src = ptr_add(src, gidx*src_strides_$${i});
  //}

  // get input and its radix friendly form
  const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : (INPUT_TYPE)0;
  radix_t x = RadixConfig<INPUT_TYPE>::convert(xval);
  // resolve negative k: flipping all bits reverses the radix order, so the
  // same "largest first" scan then finds the k smallest
  if (k<0) { x = ~x; k = -k; }
  if (idx==0)
    k2 = k;

  // 1. filter is_topk and is_topkth using radix select,
  // scanning digits from most to least significant
  #pragma unroll
  for (int i=bitsof(INPUT_TYPE)-RADIX_BITS; i>=0; i-=RADIX_BITS) {
    const int digit = Bitfield<radix_t>::get(x, i, RADIX_BITS);
    /*int digit = (x>>i) & (RADIX_SIZE-1);*/

    // count within warp
    #pragma unroll
    for (int bin=0; bin<RADIX_SIZE; ++bin) {
      bool vote = (bin == digit) && is_topkth;
      unsigned int votes = __ballot(vote);
      if (lane_id()==0)
        smem[bin + RADIX_SIZE*warp_id] = __popc(votes);
    }
    local_barrier();

    // sum counts across all warps
    if (idx < RADIX_SIZE) {
      int sum = smem[idx];
      #pragma unroll
      for(int w=RADIX_SIZE; w<blockDim.x*RADIX_SIZE / GA_WARP_SIZE; w+=RADIX_SIZE)
        sum += smem[idx + w];
      smem[idx] = sum;
    }
    local_barrier();

    // find the bucket and update k2
    // smem[:RADIX_SIZE:-1] = k2 - cumsum(smem[:RADIX_SIZE-1:-1])
    // After this, smem[bin] <= 0 < smem[bin+1] identifies the bin that
    // contains the k-th value; smem[RADIX_SIZE] acts as the top sentinel.
    if (idx == 0) {
      int sum = k2;
      #pragma unroll
      for (int bin=RADIX_SIZE-1; bin>=0; --bin) {
        sum -= smem[bin];
        smem[bin] = sum;
        k2 = (sum > 0) ? sum : k2;
      }
      smem[RADIX_SIZE] = 1;
    }
    local_barrier();

    if (is_topkth) {
      is_topk &= (smem[digit+1] > 0);
      is_topkth &= (smem[digit] <= 0) && (smem[digit+1] > 0);
    }
    local_barrier();
  }

  // set k2 as number of exceeding values
  // (how many elements tie on the k-th value beyond the k slots)
  if (idx==0) {
    #pragma unroll
    for (int bin=RADIX_SIZE-1; bin>=0; --bin) {
      if (smem[bin] <= 0)
        break;
      k2 = smem[bin];
    }
  }
  local_barrier();

  // 2. find the index of output array, if exists
  if (k2 != 0) {
    // top_kth value may not be unique, so we need to
    // perform binary cumsum on is_topkth to drop exceeding top-kth values
    out_idx = binary_cumsum_exclusive(idx, warp_id, smem, is_topkth);
    if ((out_idx >= k2) && is_topkth)
      is_topk = false;
    local_barrier();
  }

  // perform binary cumsum on is_topk to determine the indices to put result
  out_idx = binary_cumsum_exclusive(idx, warp_id, smem, is_topk);
  if (is_topk) {
#if WRITE_VALUE == 1
    ptr_at(dstv, out_idx * dstv_strides_0) = xval;
#endif
#if WRITE_INDEX == 1
    ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx;
#endif
  }
}
theano/gpuarray/c_code/topk_dense_large.cu
0 → 100644
浏览文件 @
5cc9c326
#define RADIX_BITS 2
#define RADIX_SIZE (1<<RADIX_BITS)
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
#define COUNT_TYPE $count_t
#define KERNEL_NAME $kname
// if count_t is int, work for array size within [1025, 2^31-1]
// if count_t is long long, work for array size within [2^31, 2^63-1]

// Scan the slice for the (unique) element whose radix form matches
// `known_bits` under `known_bits_mask`, and broadcast it to every thread
// via shared memory. All threads must call this together; they all loop in
// lockstep so the barriers line up, and all return the same value.
// `smem` must provide at least 2 DataType slots (flag + value).
template <typename DataType, typename RadixType, typename CountType>
__device__ DataType find_pattern(DataType* smem,
                                 DataType* data,
                                 CountType slice_size,
                                 CountType stride,
                                 RadixType known_bits,
                                 RadixType known_bits_mask) {
  if (threadIdx.x < 32)
    smem[threadIdx.x] = 0;
  local_barrier();

  // All threads participate in the loop, in order to sync on the flag
  // (bound is rounded up to a multiple of blockDim.x)
  for (CountType i = threadIdx.x; i < (slice_size + (CountType)blockDim.x-1); i += blockDim.x) {
    bool in_range = (i < slice_size);
    DataType v = in_range ? ptr_read_cached(data, i*stride) : 0;

    if (in_range && ((RadixConfig<DataType>::convert(v) & known_bits_mask) == known_bits)) {
      // There should not be conflicts if we are using find_pattern,
      // since the result is unique
      smem[0] = 1;
      smem[1] = v; // can't use val as the flag, since it could be 0
    }

    local_barrier();

    DataType found = smem[0];
    DataType val = smem[1];

    local_barrier();

    // Check to see if a thread found the value
    if (found != 0)
      return val;
  }

  // should not get here (the pattern is known to exist in the data)
  return 0;
}
// This function counts the distribution of all input values in a
// slice we are selecting by radix digit at `radix_digit_pos`, but only
// those that pass the filter `((v & known_bits_mask) == known_bits)`.
// This produces and broadcasts the seen counts for a single block only.
// `smem` must have at least `RADIX_SIZE` elements.
template <typename DataType, typename RadixType, typename CountType>
__device__ void count_radix_masked(CountType counts[RADIX_SIZE],
                                   CountType* smem,
                                   RadixType known_bits,
                                   RadixType known_bits_mask,
                                   int radix_digit_pos,
                                   CountType slice_size,
                                   CountType stride,
                                   DataType* data) {
  // Clear out per-thread counts from a previous round
  #pragma unroll
  for (int i = 0; i < RADIX_SIZE; ++i)
    counts[i] = 0;

  if (threadIdx.x < RADIX_SIZE)
    smem[threadIdx.x] = 0;
  local_barrier();

  // Scan over all the data. Upon a read, the warp will accumulate
  // counts per each digit in the radix using warp voting.
  // Every lane of a warp ends up holding the same per-warp totals.
  for (CountType i = threadIdx.x; i < slice_size; i += blockDim.x) {
    RadixType val = RadixConfig<DataType>::convert(ptr_read_cached(data, i*stride));

    bool has_val = ((val & known_bits_mask) == known_bits);
    RadixType digit_in_radix = Bitfield<RadixType>::get(val, radix_digit_pos, RADIX_BITS);

    #pragma unroll
    for (int j = 0; j < RADIX_SIZE; ++j) {
      bool vote = has_val && (digit_in_radix == j);
      counts[j] += __popc(__ballot(vote));
    }
  }

  // Now, for each warp, sum values: only lane 0 contributes, since every
  // lane already holds the warp-wide totals
  if (lane_id() == 0) {
    for (int i=0; i<RADIX_SIZE; ++i)
      atomicAdd(&smem[i], counts[i]);
  }
  /*
  // not sure why, but this just give wrong results
  if (lane_id() < RADIX_SIZE)
    atomicAdd(&smem[lane_id()], counts[lane_id()]);
  */

  local_barrier();

  // For each thread, read in the total counts
  #pragma unroll
  for (unsigned int i = 0; i < RADIX_SIZE; ++i)
    counts[i] = smem[i];

  local_barrier();
}
// Block-cooperative radix selection: find the |k|-th largest (order=true)
// or smallest (order=false) value of the slice and write it to *top_kth.
// All threads of the block must call this together; `smem` is the shared
// scratch also handed down to count_radix_masked / find_pattern.
template <typename DataType, typename RadixType, typename CountType>
__device__ void radix_select(DataType* data,
                             CountType k,
                             bool order,
                             CountType slice_size,
                             CountType stride,
                             CountType* smem,
                             DataType* top_kth) {
  // Per-thread buckets into which we accumulate digit counts in our
  // radix
  register CountType counts[RADIX_SIZE];

  // We only consider elements x such that (x & known_bits_mask) == known_bits
  // Initially, we consider all elements of the array, so the above
  // statement is true regardless of input.
  RadixType known_bits = 0, known_bits_mask = 0;

  // We are looking for the top k_to_find-th element when iterating over
  // digits; this count gets reduced by elimination when counting
  // successive digits
  CountType k_to_find = abs(k);

  // We start at the most significant digit in our radix, scanning
  // through to the least significant digit
  #pragma unroll
  for (int digit_pos = bitsof(DataType) - RADIX_BITS;
       digit_pos >= 0; digit_pos -= RADIX_BITS) {

    // Count radix distribution for the current position and reduce
    // across all threads
    count_radix_masked<DataType, RadixType, CountType>(
      counts, smem,
      known_bits, known_bits_mask, digit_pos,
      slice_size, stride, data);

    // All threads participate in the comparisons below to know the
    // final result
#define CHECK_RADIX(i) \
    int count = counts[i]; \
    /* All threads have the same value in counts here, so all */ \
    /* threads will return from the function. */ \
    if (count == 1 && k_to_find == 1) { \
      /* There is a unique answer. */ \
      known_bits = Bitfield<RadixType>::set( \
        known_bits, i, digit_pos, RADIX_BITS); \
      known_bits_mask = Bitfield<RadixType>::set( \
        known_bits_mask, RADIX_SIZE-1, digit_pos, RADIX_BITS); \
      /* The answer is now the unique element v such that: */ \
      /* (v & known_bits_mask) == known_bits */ \
      /* However, we do not yet know what the actual element is. We */ \
      /* need to perform a search through the data to find the */ \
      /* element that matches this pattern. */ \
      *top_kth = find_pattern<DataType, RadixType, CountType>( \
        (DataType*) smem, data, slice_size, \
        stride, known_bits, known_bits_mask); \
      return; \
    } \
    if (count >= k_to_find) { \
      known_bits = Bitfield<RadixType>::set(known_bits, i, digit_pos, RADIX_BITS); \
      known_bits_mask = Bitfield<RadixType>::set( \
        known_bits_mask, RADIX_SIZE-1, digit_pos, RADIX_BITS); \
      /* The top-Kth element v must now be one such that: */ \
      /* (v & known_bits_mask == known_bits) */ \
      /* but we haven't narrowed it down; we must check the next */ \
      /* least-significant digit */ \
      break; \
    } \
    k_to_find -= count

    // order=true scans bins from the largest digit downwards (k largest);
    // order=false scans upwards (k smallest)
    if (order) {
      #pragma unroll
      for (int i=RADIX_SIZE - 1; i >= 0; --i) {
        CHECK_RADIX(i);
      }
    } else {
      #pragma unroll
      for (int i=0; i < RADIX_SIZE; ++i) {
        CHECK_RADIX(i);
      }
    }
#undef CHECK_RADIX
  } // end digit_pos for

  // There is no unique result, but there is a non-unique result
  // matching `known_bits` exactly
  *top_kth = RadixConfig<DataType>::deconvert(known_bits);
}
// Top-k kernel for slices longer than the maximum thread-block size
// (the "large" / "xlarge" variants; KERNEL_NAME and COUNT_TYPE are
// substituted by the host code).
// Phase 1: radix_select finds the k-th value; phase 2 writes out every
// element strictly better than it; phase 3 fills the remaining slots with
// elements equal to the k-th value.
KERNEL void KERNEL_NAME(
    $dims
    // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
    $dstv
    // INPUT_TYPE *dstv
    $dstv_strides
    // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
    $dsti
    // INDEX_TYPE *dsti
    $dsti_strides
    // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
    ssize_t k,
    INPUT_TYPE* src,
    $src_strides
    // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
    size_t size) {
  __shared__ COUNT_TYPE smem[32];
  INPUT_TYPE topkth_value;
  // negative k selects the k smallest instead of the k largest
  const bool order = (k>0);
  k = (order ? k : -k);
  const int idx = threadIdx.x;
  const int warp_id = idx / GA_WARP_SIZE;

  // get the slice for thread block to work on
  // size <- the axis to work on
  // dims_1+ <- batched dimensions
  unsigned int gid = blockIdx.x, gidx;
  $set_slice
  // $$set_slice expands into:
  //for(int i=1; i<NDIM; i++) {
  //  gidx = gid % dims_$${i};
  //  gid /= dims_$${i};
  //  dsti = ptr_add(dsti, gidx*dsti_strides_$${i});
  //  dstv = ptr_add(dstv, gidx*dstv_strides_$${i});
  //  src = ptr_add(src, gidx*src_strides_$${i});
  //}

  radix_select<INPUT_TYPE, radix_t, COUNT_TYPE>(
    src, k, order, size, src_strides_0,
    smem, &topkth_value);

  // Every value that is strictly less/greater than `pattern`
  // (depending on sort dir) in sorted int format is in the top-K.
  // The top-K value itself might not be unique.
  //
  // Since there are a variable number of elements that we see that
  // are within the top-k, we don't know at what index to write out
  // the resulting values.
  // In order to get this, we perform an exclusive cumsum of
  // `has_topk`. This will return the resulting index into which we
  // need to write the result, if a thread has a result.

  // All threads need to participate in the loop and the cumsum
  // but not necessarily in the load; hence loop bounds being rounded
  // up to a multiple of the block dim.
  COUNT_TYPE iter_bound = size + blockDim.x-1;
  INDEX_TYPE write_base = 0;
  // BUG FIX: the loop counter must be COUNT_TYPE, not int. The xlarge
  // instantiation uses COUNT_TYPE = long long precisely because the slice
  // size can exceed 2^31-1, and an int counter would overflow before
  // reaching iter_bound (the equal-values loop below already uses
  // COUNT_TYPE correctly).
  for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
    bool in_range = (i < size);
    INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0;
    bool has_topk;
    if (order) {
      has_topk = in_range && (v > topkth_value);
    } else {
      has_topk = in_range && (v < topkth_value);
    }

    // index within this round; carry = number written by the whole block
    // (smem's last warp slot holds the block total after the cumsum)
    int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
    int carry = smem[blockDim.x / 32 - 1];

    if (has_topk) {
      COUNT_TYPE write_idx = write_base + index;
#if WRITE_VALUE == 1
      ptr_at(dstv, write_idx * dstv_strides_0) = v;
#endif
#if WRITE_INDEX == 1
      ptr_at(dsti, write_idx * dsti_strides_0) = (INDEX_TYPE)i;
#endif
    }

    write_base += carry;
  }

  // fill the remaining slots with elements equal to the k-th value
  // (ties on the boundary), stopping once k outputs have been produced
  COUNT_TYPE topk_remaining = (k - write_base);

  for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
    bool in_range = (i < size);
    INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0;
    bool has_topk = in_range && (v == topkth_value);

    int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
    int carry = smem[blockDim.x / 32 - 1];

    if (has_topk && index < topk_remaining) {
      COUNT_TYPE write_idx = write_base + index;
#if WRITE_VALUE == 1
      ptr_at(dstv, write_idx * dstv_strides_0) = v;
#endif
#if WRITE_INDEX == 1
      ptr_at(dsti, write_idx * dsti_strides_0) = (INDEX_TYPE)i;
#endif
    }

    if (carry >= topk_remaining)
      break;

    topk_remaining -= carry;
    write_base += carry;
  }
}
theano/gpuarray/sort.py
0 → 100644
浏览文件 @
5cc9c326
from
__future__
import
absolute_import
,
print_function
,
division
import
os
from
string
import
Template
from
theano
import
Apply
from
theano.tensor
import
as_tensor_variable
from
theano.tensor.sort
import
TopKOp
from
.basic_ops
import
(
GpuKernelBase
,
Kernel
,
infer_context_name
,
as_gpuarray_variable
,
gpuarray_helper_inc_dir
)
from
.opt
import
register_opt
,
op_lifter
,
register_opt2
from
.type
import
GpuArrayType
# pygpu is an optional dependency: swallow the ImportError so that theano
# itself stays importable on machines without GPU support (any actual use
# of `ga` below will then fail loudly).
try:
    import pygpu
    import pygpu.gpuarray as ga
except ImportError as e:
    # To make sure theano is importable
    pass

# TODO GPU sort / argsort
class
GpuTopKOp
(
GpuKernelBase
,
TopKOp
):
'''
Implements TopKOp on gpu
'''
__props__
=
TopKOp
.
__props__
_f16_ok
=
True
def __init__(self, axis=-1, sorted=True, idx_dtype='int64',
             return_values=True, return_indices=True):
    """Initialize both bases with the same top-k configuration.

    Parameters mirror ``theano.tensor.sort.TopKOp``: ``axis`` is the axis
    to select along, ``idx_dtype`` the dtype of the returned indices, and
    ``return_values`` / ``return_indices`` choose which outputs to emit.
    NOTE(review): the ``sorted`` parameter shadows the builtin but is part
    of the inherited interface, so it is kept as-is.
    """
    GpuKernelBase.__init__(self)
    TopKOp.__init__(self, axis=axis, sorted=sorted,
                    idx_dtype=idx_dtype,
                    return_values=return_values,
                    return_indices=return_indices)
def c_headers(self):
    """Headers required by the generated C code (gpuarray + numpy glue)."""
    return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']
def c_header_dirs(self):
    """Search paths for the headers above: this module's directory,
    theano's gpuarray helper dir, and pygpu's include dir."""
    return [os.path.dirname(__file__), gpuarray_helper_inc_dir(),
            pygpu.get_include()]
def c_code_cache_version(self):
    # bump this whenever the generated C code or the kernel sources change
    return (1,)
def gpu_kernels(self, node, nodename):
    """Build the three dense top-k kernels (dense / large / xlarge).

    The kernel sources under ``c_code/`` are ``string.Template`` files;
    every ``$name`` placeholder is substituted here because the gpuarray
    API cannot launch C++ templated kernels directly. Only CUDA is
    implemented; the opencl branch raises NotImplementedError.
    """
    # load kernel source
    device_type = node.inputs[0].type.context.kind
    kernel_ext = {b'cuda': '.cu', b'opencl': '.cl'}[device_type]
    common_ext = {b'cuda': '.cuh', b'opencl': '.h'}[device_type]

    # prepare "$" macros
    if device_type == b'cuda':
        ndim = node.inputs[0].ndim
        dstv_strides_code = ''.join('ssize_t dstv_strides_%d, ' % i
                                    for i in range(ndim))
        dsti_strides_code = ''.join('ssize_t dsti_strides_%d, ' % i
                                    for i in range(ndim))
        src_strides_code = ''.join('ssize_t src_strides_%d, ' % i
                                   for i in range(ndim))
        # per-batch-dimension pointer adjustment; expanded once per
        # non-axis dimension (dims_1 .. dims_{ndim-1}) below.
        # ``%%`` survives the %-formatting as a literal '%' (C modulo).
        set_slice_code = '''
        gidx = gid %% dims_%(i)d;
        gid /= dims_%(i)d;
        {dstv};
        {dsti};
        src = ptr_add(src, gidx*src_strides_%(i)d);\n'''.format(
            dstv='dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)' if self.return_values else '',
            dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '')
        set_slice_code = ''.join(
            set_slice_code % dict(i=j) for j in range(1, ndim))
        flags = Kernel.get_flags(node.inputs[0].dtype)
        subs = dict(
            inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
            out_t=ga.dtype_to_ctype(self.idx_dtype),
            dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)),
            dstv='INPUT_TYPE *dstv,' if self.return_values else '',
            dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
            dstv_strides=dstv_strides_code if self.return_values else '',
            dsti_strides=dsti_strides_code if self.return_indices else '',
            src_strides=src_strides_code,
            set_slice=set_slice_code,
            write_value=int(self.return_values),
            write_index=int(self.return_indices),
            ndim=str(ndim),
            use_half=int(node.inputs[0].dtype == 'float16'))
    elif device_type == b'opencl':
        raise NotImplementedError()

    # setup parameters
    # (must match the kernel signatures in the .cu files, in order)
    param_types = [ga.SIZE] * (ndim - 1)  # dims
    for _ in range(self.return_values + self.return_indices):
        param_types.append(ga.GpuArray)  # dst*
        param_types.extend([ga.SSIZE] * ndim)  # dst*_strides
    # NOTE(review): the kernels declare ``ssize_t k`` but SIZE is passed
    # here — confirm this is intentional on the gpuarray side.
    param_types.append(ga.SIZE)  # k
    param_types.append(ga.GpuArray)  # src
    param_types.extend([ga.SSIZE] * ndim)  # src_strides
    param_types.append(ga.SIZE)  # size

    # load and compile kernels
    with open(os.path.join(
            os.path.dirname(__file__), 'c_code',
            'topk_common' + common_ext)) as f:
        common_src = f.read()

    kernels = []

    def build_kernel(fname, kname, subs):
        # concatenate common header + kernel file, substitute the "$"
        # placeholders, and wrap the result in a Kernel object
        with open(os.path.join(
                os.path.dirname(__file__), 'c_code', fname)) as f:
            kernel_src = f.read()
        ker = Kernel(
            code=Template(common_src + kernel_src).substitute(**subs),
            name=kname,
            params=param_types,
            flags=flags,
            objvar=kname + nodename)
        return ker

    # small: one thread per element, axis length <= max block size
    subs['count_t'] = 'int'
    kernels.append(build_kernel('topk_dense' + kernel_ext,
                                'k_topk_dense', subs))
    # large: strided loops, 32-bit counters
    subs['kname'] = 'k_topk_dense_large'
    kernels.append(build_kernel('topk_dense_large' + kernel_ext,
                                'k_topk_dense_large', subs))
    # xlarge: same source, 64-bit counters for axis length >= 2^31
    subs['count_t'] = 'long long'
    subs['kname'] = 'k_topk_dense_xlarge'
    kernels.append(build_kernel('topk_dense_large' + kernel_ext,
                                'k_topk_dense_xlarge', subs))
    return kernels
def c_code(self, node, nodename, inps, outs, sub):
    """Emit the C code that validates k, allocates outputs and dispatches
    to one of the three kernels built in ``gpu_kernels``.

    Dispatch: axis length > 2^31 -> xlarge kernel; axis length > max
    threads per block -> large kernel; otherwise the dense one-thread-
    per-element kernel. Only CUDA is supported.
    """
    context = node.inputs[0].type.context
    if context.kind != b'cuda':
        raise NotImplementedError(
            '%s: We only have CUDA '
            'implementation so far.' % self.__class__.__name__)
    x, k = inps
    inp_dtc = ga.dtype_to_typecode(node.inputs[0].dtype)
    # unpack the output variable names according to the op configuration
    if not self.return_indices:
        yv, = outs
    elif self.return_values:
        yv, yi = outs
    else:
        yi, = outs
    out_dtype_s = self.idx_dtype
    out_dtc = ga.dtype_to_typecode(out_dtype_s)
    fail = sub['fail']
    ctx = sub['params']
    k_dtype = node.inputs[1].type.dtype_specs()[1]
    # max threads per block
    MAX_TPB = context.maxlsize
    # max blocks per grid
    MAX_BPG = context.maxgsize0
    WARP_SIZE = 32

    ndim = node.inputs[0].ndim
    # move the top-k axis to the front; the remaining axes are the batched
    # dimensions iterated by the grid
    reordered_axes = list(range(ndim))
    axis = self.axis % ndim
    del(reordered_axes[axis])
    reordered_axes = [axis] + reordered_axes
    dims = ''.join('(void*)(dims+%d), ' % i for i in reordered_axes[1:])
    prep_output = ''

    if self.return_values:
        def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv
        params_dv = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)), \n' % (yv, yv)
        params_dv += ''.join(
            '(void*)(dvstrides+%d), ' % i for i in reordered_axes)
        prep_output += '''
    if (0 != theano_prep_output(
        &%(yv)s, %(ndim)d, odims,
        %(inp_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
    else:
        def_dvstrides = params_dv = ''

    if self.return_indices:
        def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi
        # BUG FIX: the format string has two %s placeholders but was
        # formatted with the bare string ``yi`` instead of ``(yi, yi)``,
        # which raises "not enough arguments for format string" whenever
        # return_indices is enabled (the params_dv twin above is correct).
        params_di = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)), \n' % (yi, yi)
        params_di += ''.join(
            '(void*)(distrides+%d), ' % i for i in reordered_axes)
        prep_output += '''
    if (0 != theano_prep_output(
        &%(yi)s, %(ndim)d, odims,
        %(out_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
    else:
        def_distrides = params_di = ''

    sstrides = ', '.join(
        '(void*)(sstrides+%d)' % i for i in reordered_axes)

    code = '''
{
    const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
    const size_t *dims = PyGpuArray_DIMS(%(x)s);
    size_t odims[%(ndim)d];
    for (int i=0; i<%(ndim)d; i++)
        odims[i] = dims[i];

    odims[%(axis)d] = k_>=0 ? k_ : -k_;

    if (0 == odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth must not be zero");
        %(fail)s;
    } else if (dims[%(axis)d] < odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth cannot be larger than the size of specified axis %(axis)d");
        %(fail)s;
    }
    %(prep_output)s

    size_t grid_size=1, block_size=1;
    for (int i=0; i<%(ndim)d; ++i) {
        if (i!=%(axis)d)
            grid_size *= dims[i];
        else
            block_size = dims[i];
    }
    // round up to multiples of warp size
    block_size = ((block_size + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;

    if (grid_size > %(MAX_BPG)d) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: too many slices to work with, expected <= %(MAX_BPG)d");
        %(fail)s;
    }

    %(def_dvstrides)s;
    %(def_distrides)s;
    const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);

    void* args[] = {
        %(dims)s
        %(params_dv)s
        %(params_di)s
        (void*)(&k_),
        (void*)((char*)(%(x)s->ga.data) + (%(x)s->ga.offset)),
        %(sstrides)s,
        (void*)(dims+%(axis)d),
    };

    int err;
    if (dims[%(axis)d] > (1u << 31)) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_xlarge%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    } else if (block_size > %(MAX_TPB)d) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_large%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    } else {
        err = GpuKernel_call(
            &k_topk_dense%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    }
    if (err != GA_NO_ERROR) {
        PyErr_SetString(
            PyExc_RuntimeError,
            "topk: gpu kernel failed to execute");
        %(fail)s;
    }
}
'''
    return code % locals()
def make_node(self, inp, kth):
    """Build the Apply node for the GPU top-k op.

    Outputs mirror the CPU ``TopKOp``: the values output (same GPU type
    as the input) and/or the indices output (``GpuArrayType`` with
    ``self.idx_dtype``), in that order, depending on
    ``return_values`` / ``return_indices``.
    """
    ctx_name = infer_context_name(inp)
    inp = as_gpuarray_variable(inp, ctx_name)
    # NOTE(review): unlike the CPU TopKOp.make_node, this does not check
    # that `kth` is 0-d nor that `self.axis` is in range — presumably the
    # CPU op validated before GPU lifting; confirm for direct use.
    kth = as_tensor_variable(kth)
    bcast = inp.type.broadcastable
    outs = []
    if self.return_values:
        outs.append(inp.type())
    if self.return_indices:
        outs.append(GpuArrayType(
            dtype=self.idx_dtype,
            broadcastable=bcast,
            context_name=ctx_name)())
    return Apply(self, [inp, kth], outs)
def get_params(self, node):
    # The GPU context of the input is handed to c_code as `sub['params']`.
    return node.inputs[0].type.context
@register_opt('fast_compile')
@op_lifter([TopKOp], cuda_only=True)
@register_opt2([TopKOp], 'fast_compile')
def local_gpua_topkop(op, ctx_name, inputs, outputs):
    """Lift a CPU ``TopKOp`` onto the GPU as a ``GpuTopKOp``.

    Registered for CUDA contexts only (the GPU kernels are CUDA-specific).
    Every property of the CPU op is forwarded so the replacement op is
    configuration-equivalent.
    """
    axis = op.axis
    rv = op.return_values
    ri = op.return_indices
    x, k = inputs
    x = as_gpuarray_variable(x, ctx_name)
    gpu_op = GpuTopKOp(
        axis=axis,
        sorted=op.sorted,
        idx_dtype=op.idx_dtype,
        return_values=rv,
        return_indices=ri)
    rets = gpu_op(x, k)
    return rets
theano/tensor/__init__.py
浏览文件 @
5cc9c326
...
@@ -40,7 +40,7 @@ from theano.tensor import nnet # used for softmax, sigmoid, etc.
...
@@ -40,7 +40,7 @@ from theano.tensor import nnet # used for softmax, sigmoid, etc.
from
theano.gradient
import
Rop
,
Lop
,
grad
,
numeric_grad
,
verify_grad
,
\
from
theano.gradient
import
Rop
,
Lop
,
grad
,
numeric_grad
,
verify_grad
,
\
jacobian
,
hessian
,
consider_constant
jacobian
,
hessian
,
consider_constant
from
theano.tensor.sort
import
sort
,
argsort
from
theano.tensor.sort
import
sort
,
argsort
,
topk
,
argtopk
,
topk_and_argtopk
from
theano.tensor.extra_ops
import
(
DiffOp
,
bincount
,
squeeze
,
from
theano.tensor.extra_ops
import
(
DiffOp
,
bincount
,
squeeze
,
repeat
,
bartlett
,
fill_diagonal
,
fill_diagonal_offset
,
repeat
,
bartlett
,
fill_diagonal
,
fill_diagonal_offset
,
cumsum
,
cumprod
,
unravel_index
,
ravel_multi_index
)
cumsum
,
cumprod
,
unravel_index
,
ravel_multi_index
)
...
...
theano/tensor/opt.py
浏览文件 @
5cc9c326
...
@@ -35,6 +35,7 @@ from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice,
...
@@ -35,6 +35,7 @@ from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice,
advanced_subtensor
,
advanced_subtensor
,
advanced_subtensor1
,
advanced_subtensor1
,
advanced_inc_subtensor1
)
advanced_inc_subtensor1
)
from
theano.tensor.sort
import
TopKOp
from
theano
import
scalar
from
theano
import
scalar
from
theano.scalar
import
basic
from
theano.scalar
import
basic
from
theano.tensor
import
basic
as
T
from
theano.tensor
import
basic
as
T
...
@@ -7548,3 +7549,35 @@ def local_merge_alloc(node):
...
@@ -7548,3 +7549,35 @@ def local_merge_alloc(node):
dim_outer
,
T
.
eq
(
dim_outer
,
dim_inner
))
dim_outer
,
T
.
eq
(
dim_outer
,
dim_inner
))
i
+=
1
i
+=
1
return
[
T
.
alloc
(
inputs_inner
[
0
],
*
dims_outer
)]
return
[
T
.
alloc
(
inputs_inner
[
0
],
*
dims_outer
)]
@register_useless('fast_compile')
@gof.local_optimizer([TopKOp])
def local_useless_topk(node):
    """
    TopKOp generates two outputs by default
    This opt removes the useless ones

    When exactly one of the two outputs (values / indices) has clients,
    the node is replaced by a single-output TopKOp computing only the
    needed result.
    """
    op = node.op
    if not isinstance(op, TopKOp):
        return
    if not (op.return_values and op.return_indices):
        return False

    x, k = node.inputs
    ret_val = bool(node.outputs[0].clients)
    ret_idx = bool(node.outputs[1].clients)
    if not (ret_val ^ ret_idx):
        # both true -> nothing to remove
        # both false -> let pruner handle
        return False

    # outputs[0] is values, outputs[1] is indices; when ret_idx is True
    # the used output is the indices one.
    old_output = node.outputs[ret_idx]
    # FIX: forward `sorted` from the original op.  The old code silently
    # reset it to the default (True), so the replacement op was not
    # property-equivalent and GPU lifting (which forwards op.sorted)
    # could see a different configuration than the user requested.
    new_output = TopKOp(
        axis=op.axis,
        sorted=op.sorted,
        idx_dtype=op.idx_dtype,
        return_values=ret_val,
        return_indices=ret_idx)(x, k)
    return {old_output: new_output}
theano/tensor/sort.py
浏览文件 @
5cc9c326
...
@@ -2,6 +2,22 @@ from __future__ import absolute_import, print_function, division
...
@@ -2,6 +2,22 @@ from __future__ import absolute_import, print_function, division
import
numpy
as
np
import
numpy
as
np
import
theano
import
theano
from
theano.tensor.basic
import
mul
,
arange
from
theano.tensor.basic
import
mul
,
arange
from
theano.gradient
import
grad_undefined
from
theano.tensor.subtensor
import
set_subtensor
def _variable_is_none(var):
    """Return True iff `var` is a theano Constant holding the value None.

    Used to detect a symbolic ``axis=None`` argument.
    """
    if not isinstance(var, theano.Constant):
        return False
    return var.data is None
def
_check_tensor_is_scalar
(
var
):
'''
Checks if a tensor variable is scalar, raise ValueError otherwise
'''
msg
=
'
%(var)
s is expected to be 0d tensor, got
%(ndim)
d'
if
var
.
ndim
!=
0
:
raise
ValueError
(
msg
%
(
var
,
var
.
ndim
))
class
SortOp
(
theano
.
Op
):
class
SortOp
(
theano
.
Op
):
...
@@ -33,8 +49,7 @@ class SortOp(theano.Op):
...
@@ -33,8 +49,7 @@ class SortOp(theano.Op):
z
[
0
]
=
np
.
sort
(
a
,
axis
,
self
.
kind
,
self
.
order
)
z
[
0
]
=
np
.
sort
(
a
,
axis
,
self
.
kind
,
self
.
order
)
def
infer_shape
(
self
,
node
,
inputs_shapes
):
def
infer_shape
(
self
,
node
,
inputs_shapes
):
if
(
isinstance
(
node
.
inputs
[
1
],
theano
.
Constant
)
and
if
_variable_is_none
(
node
.
inputs
[
1
]):
node
.
inputs
[
1
]
.
data
is
None
):
# That means axis = None,
# That means axis = None,
# So the array is flattened before being sorted
# So the array is flattened before being sorted
return
[(
mul
(
*
inputs_shapes
[
0
]),)]
return
[(
mul
(
*
inputs_shapes
[
0
]),)]
...
@@ -49,7 +64,7 @@ class SortOp(theano.Op):
...
@@ -49,7 +64,7 @@ class SortOp(theano.Op):
a
,
axis
=
inputs
a
,
axis
=
inputs
indices
=
self
.
__get_argsort_indices
(
a
,
axis
)
indices
=
self
.
__get_argsort_indices
(
a
,
axis
)
inp_grad
=
output_grads
[
0
][
tuple
(
indices
)]
inp_grad
=
output_grads
[
0
][
tuple
(
indices
)]
axis_grad
=
theano
.
gradient
.
grad_undefined
(
axis_grad
=
grad_undefined
(
self
,
1
,
axis
,
self
,
1
,
axis
,
"The gradient of sort is not defined "
"The gradient of sort is not defined "
"with respect to the integer axes itself"
)
"with respect to the integer axes itself"
)
...
@@ -162,8 +177,7 @@ class ArgSortOp(theano.Op):
...
@@ -162,8 +177,7 @@ class ArgSortOp(theano.Op):
dtype
=
node
.
outputs
[
0
]
.
dtype
)
dtype
=
node
.
outputs
[
0
]
.
dtype
)
def
infer_shape
(
self
,
node
,
inputs_shapes
):
def
infer_shape
(
self
,
node
,
inputs_shapes
):
if
(
isinstance
(
node
.
inputs
[
1
],
theano
.
Constant
)
and
if
_variable_is_none
(
node
.
inputs
[
1
]):
node
.
inputs
[
1
]
.
data
is
None
):
return
[(
mul
(
*
inputs_shapes
[
0
]),)]
return
[(
mul
(
*
inputs_shapes
[
0
]),)]
# axis should not be None, so there should be the same number of
# axis should not be None, so there should be the same number of
# dimensions in the input and output
# dimensions in the input and output
...
@@ -175,7 +189,7 @@ class ArgSortOp(theano.Op):
...
@@ -175,7 +189,7 @@ class ArgSortOp(theano.Op):
# No grad defined for intergers.
# No grad defined for intergers.
inp
,
axis
=
inputs
inp
,
axis
=
inputs
inp_grad
=
inp
.
zeros_like
()
inp_grad
=
inp
.
zeros_like
()
axis_grad
=
theano
.
gradient
.
grad_undefined
(
axis_grad
=
grad_undefined
(
self
,
1
,
axis
,
self
,
1
,
axis
,
"argsort is not defined for non-integer axes so"
"argsort is not defined for non-integer axes so"
" argsort(x, axis+eps) is undefined"
)
" argsort(x, axis+eps) is undefined"
)
...
@@ -206,3 +220,339 @@ def argsort(a, axis=-1, kind='quicksort', order=None):
...
@@ -206,3 +220,339 @@ def argsort(a, axis=-1, kind='quicksort', order=None):
a
=
a
.
flatten
()
a
=
a
.
flatten
()
axis
=
0
axis
=
0
return
ArgSortOp
(
kind
,
order
)(
a
,
axis
)
return
ArgSortOp
(
kind
,
order
)(
a
,
axis
)
def
_topk_py_impl
(
op
,
x
,
k
,
axis
,
idx_dtype
):
ndim
=
x
.
ndim
assert
-
ndim
<=
axis
<
ndim
axis
%=
ndim
if
k
==
0
:
raise
ValueError
(
'topk: kth cannot be zero'
)
elif
k
>
x
.
shape
[
axis
]:
raise
ValueError
(
'topk: kth cannot be larger than the size of specified axis
%
d'
%
axis
)
if
abs
(
k
)
==
1
:
# negative k means min instead of max
fn_max
=
[
None
,
np
.
max
,
np
.
min
][
k
]
fn_argmax
=
[
None
,
np
.
argmax
,
np
.
argmin
][
k
]
if
not
op
.
return_indices
:
return
np
.
expand_dims
(
fn_max
(
x
,
axis
=
axis
),
axis
)
elif
op
.
return_values
:
zi
=
np
.
expand_dims
(
fn_argmax
(
x
,
axis
=
axis
),
axis
)
idx2
=
tuple
(
np
.
arange
(
s
)
.
reshape
(
(
s
,)
+
(
1
,)
*
(
ndim
-
i
-
1
)
)
if
i
!=
axis
else
zi
for
i
,
s
in
enumerate
(
x
.
shape
))
zv
=
x
[
idx2
]
return
zv
,
zi
.
astype
(
idx_dtype
)
else
:
zi
=
np
.
expand_dims
(
fn_argmax
(
x
,
axis
=
axis
),
axis
)
return
zi
.
astype
(
idx_dtype
)
if
x
.
shape
[
axis
]
==
abs
(
k
):
if
not
op
.
return_indices
:
return
x
.
copy
()
else
:
l
=
axis
r
=
ndim
-
l
reps
=
list
(
x
.
shape
)
reps
[
axis
]
=
1
zi
=
np
.
arange
(
abs
(
k
),
dtype
=
idx_dtype
)
zi
=
zi
.
reshape
((
1
,)
*
l
+
(
k
,)
+
(
1
,)
*
(
r
-
1
))
zi
=
np
.
tile
(
zi
,
reps
)
if
op
.
return_values
:
return
x
.
copy
(),
zi
else
:
return
zi
idx
=
[
slice
(
None
)]
*
ndim
idx
[
axis
]
=
(
slice
(
-
k
,
None
)
if
k
>
0
else
slice
(
-
k
))
if
not
op
.
return_indices
:
zv
=
np
.
partition
(
x
,
-
k
,
axis
=
axis
)[
idx
]
return
zv
elif
op
.
return_values
:
zi
=
np
.
argpartition
(
x
,
-
k
,
axis
=
axis
)[
idx
]
idx2
=
tuple
(
np
.
arange
(
s
)
.
reshape
(
(
s
,)
+
(
1
,)
*
(
ndim
-
i
-
1
)
)
if
i
!=
axis
else
zi
for
i
,
s
in
enumerate
(
x
.
shape
))
zv
=
x
[
idx2
]
return
zv
,
zi
.
astype
(
idx_dtype
)
else
:
zi
=
np
.
argpartition
(
x
,
-
k
,
axis
=
axis
)[
idx
]
return
zi
.
astype
(
idx_dtype
)
class TopKOp(theano.Op):
    """
    Operations related to finding k-largest elements.

    Parameters
    ----------
    axis: integer
        Defaults to ``-1``.
        The axis to perform the operation. Must be in range ``[-ndim, ndim)``, where
        ``ndim`` is the dimensionality of input tensor.

    idx_dtype: string
        Specify output dtype for indices, defaults to ``int64``, must be integer type.

    sorted: bool
        NOTE: NOT IMPLEMENTED YET
        Defaults to ``True``

        If True, the result array would be sorted in descending order.


    Notes
    -----
    - By default, this Op give two outputs: values and indices. However optimizer may
      remove a certain output if not needed.

    - Computing gradient is only possible when both values and indices are computed in
      forward pass.

    - If the top-k-th value is not unique, we cannot guarantee the output indices being
      deterministically chosen.

    See Also
    --------
    topk
    argtopk
    argtopk_and_topk

    """

    # TODO more params
    '''
    only_top_kth: bool
        Defaults to ``False``

        If ``True``, will only find one exact top k-th element on given axis.

    '''
    # TODO c_code
    # TODO add opt, if k==1, use max/min reduce
    #      also if k is axis size, just copy input tensor
    # TODO add opt, to merge argtopk / topk
    __props__ = ('axis', 'sorted', 'return_values', 'return_indices', 'idx_dtype')

    def __init__(self, axis=-1, sorted=True, idx_dtype='int64',
                 return_values=True, return_indices=True):
        # numpy always uses int64 as output dtype for arg*() routines
        # however, we add "idx_dtype" param as memory is more precious on gpu
        if not isinstance(axis, int):
            raise TypeError(
                '"axis" parameter must be integer, got "%s"' % type(axis))
        if idx_dtype not in theano.tensor.integer_dtypes:
            raise TypeError(
                '"idx_dtype" parameter must be an integer dtype, got "%s"'
                % idx_dtype)
        if not (return_indices or return_values):
            raise ValueError(
                "Neither return_values nor return_indices is True, this isn't allowed")
        self.axis = axis
        self.sorted = sorted
        self.return_values = return_values
        self.return_indices = return_indices
        self.idx_dtype = idx_dtype

    def __str__(self):
        return '%(op)s{axis=%(axis)d, sorted=%(sorted)s}' % dict(
            op=self.__class__.__name__,
            axis=self.axis,
            sorted=self.sorted)

    def make_node(self, inp, kth):
        """Validate axis/kth and build outputs per return_* flags."""
        inp = theano.tensor.as_tensor_variable(inp)
        ndim = inp.ndim
        if ndim == 0:
            raise ValueError('Cannot take scalar as input')
        if not -ndim <= self.axis < ndim:
            raise IndexError(
                '"axis" parameter out of range,'
                ' expected integer within [%d, %d]' % (-ndim, ndim - 1))

        kth = theano.tensor.as_tensor_variable(kth)
        _check_tensor_is_scalar(kth)
        bcast = inp.type.broadcastable
        outs = []
        if self.return_values:
            outs.append(inp.type())
        if self.return_indices:
            outs.append(theano.tensor.TensorType(
                dtype=self.idx_dtype, broadcastable=bcast)())
        return theano.Apply(self, [inp, kth], outs)

    def perform(self, node, inputs, output_storage):
        # Delegate to the shared NumPy implementation; map its return
        # value(s) onto whichever output slots this op instance has.
        x, k = inputs
        axis = self.axis
        if not self.return_indices:
            pzv = output_storage[0]
            pzv[0] = _topk_py_impl(self, x, k, axis, None)
        elif self.return_values:
            pzv = output_storage[0]
            pzi = output_storage[1]
            pzv[0], pzi[0] = _topk_py_impl(
                self, x, k, axis, node.outputs[1].dtype)
        else:
            pzi = output_storage[0]
            pzi[0] = _topk_py_impl(self, x, k, axis, node.outputs[0].dtype)

    def infer_shape(self, node, inp_shapes):
        # Output shape equals input shape except |k| along the reduced axis.
        shp = list(inp_shapes[0])
        shp[self.axis] = np.abs(node.inputs[1])
        shp = tuple(shp)
        return [shp for i in [self.return_values, self.return_indices] if i]

    def L_op(self, inputs, outputs, out_grads):
        x, k = inputs
        k_grad = grad_undefined(self, 1, k, 'topk: k is not differentiable')

        # FIX: the gradient scatter below needs outputs[-1] to be the
        # *indices* output, which only holds when BOTH outputs exist —
        # the old `or` condition was never True (the constructor forbids
        # both flags False), so a values-only op would silently use its
        # values as indices.  Per the class docstring, require both.
        if not (self.return_indices and self.return_values):
            x_grad = grad_undefined(
                self, 0, x, 'topk: cannot get gradient'
                ' without both indices and values')
        else:
            x_shp = theano.tensor.shape(x)
            z_grad = out_grads[0]
            ndim = x.ndim
            axis = self.axis % ndim
            # One index per dimension: broadcasted aranges everywhere
            # except the reduced axis, where the computed indices go.
            grad_indices = [
                arange(x_shp[i]).dimshuffle([0] + ['x'] * (ndim - i - 1))
                if i != axis else outputs[-1] for i in range(ndim)]
            x_grad = x.zeros_like(dtype=z_grad.dtype)
            x_grad = set_subtensor(x_grad[tuple(grad_indices)], z_grad)
        return [x_grad, k_grad]
def topk(x, kth, axis=-1, sorted=True, idx_dtype='int64'):
    """
    Return the k-largest elements of `x` along an axis.

    Parameters
    ----------
    x: tensor instance

    kth: integer constant/variable
        Must not be 0. If negative, gives k-smallest elements instead.

    axis: integer or ``None``
        Axis to operate on; ``None`` flattens the input first.

    sorted: bool
        NOTE: NOT IMPLEMENTED YET, USE ``False`` FOR NOW.
        Defaults to ``True``. If True, the result would be sorted in
        descending order.

    idx_dtype: string
        dtype used for the (internal) indices output, defaults to
        ``int64``; must be an integer type. Exposed because indices are
        needed for the gradient.

    Returns
    -------
    Tensor variable with same dtype as `x`.

    Notes
    -----
    - ``sorted=True`` is not supported yet.
    """
    if sorted:
        raise NotImplementedError(
            "We are still working on sorted topk. Use sorted=False for now.")
    if axis is None:
        # operate on the flattened input along its only axis
        axis = 0
        x = theano.tensor.flatten(x)
    op = TopKOp(axis=axis, sorted=sorted, idx_dtype=idx_dtype)
    values, _indices = op(x, kth)
    return values
def argtopk(x, kth, axis=-1, sorted=True, idx_dtype='int64'):
    """
    Return the indices of the k-largest elements of `x` along an axis.

    Parameters
    ----------
    x: tensor instance

    kth: integer constant/variable
        Must not be 0. If negative, gives k-smallest elements instead.

    sorted: bool
        NOTE: NOT IMPLEMENTED YET, USE ``False`` FOR NOW.
        Defaults to ``True``. If True, the corresponding indices would
        be sorted in descending order of value.

    axis: integer, tuple/list of integers, or ``None``
        Axis to operate on; ``None`` flattens the input first.

    idx_dtype: string
        Output dtype, defaults to ``int64``; must be an integer type.

    Returns
    -------
    Tensor variable with dtype specified in `idx_dtype`.

    Notes
    -----
    - ``sorted=True`` is not supported yet.
    - If the top-k-th value is not unique, the chosen indices are not
      guaranteed to be deterministic.
    """
    if sorted:
        raise NotImplementedError(
            "We are still working on sorted topk. Use sorted=False for now.")
    if axis is None:
        # operate on the flattened input along its only axis
        axis = 0
        x = theano.tensor.flatten(x)
    op = TopKOp(axis=axis, sorted=sorted, idx_dtype=idx_dtype)
    _values, indices = op(x, kth)
    return indices
def topk_and_argtopk(x, kth, axis=-1, sorted=True, idx_dtype='int64'):
    """
    Compute both topk() and argtopk() with a single Op.

    See `topk` and `argtopk` for parameter details.

    Returns
    -------
    tuple: (values, indices)
    """
    if sorted:
        raise NotImplementedError(
            "We are still working on sorted topk. Use sorted=False for now.")
    if axis is None:
        # operate on the flattened input along its only axis
        axis = 0
        x = theano.tensor.flatten(x)
    op = TopKOp(axis=axis, sorted=sorted, idx_dtype=idx_dtype)
    return op(x, kth)
theano/tensor/tests/test_sort.py
浏览文件 @
5cc9c326
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
from
itertools
import
product
,
chain
from
functools
import
reduce
import
unittest
import
unittest
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
...
@@ -9,9 +11,18 @@ from theano import tensor
...
@@ -9,9 +11,18 @@ from theano import tensor
from
theano.tensor.sort
import
sort
,
SortOp
from
theano.tensor.sort
import
sort
,
SortOp
from
theano.tensor.sort
import
argsort
,
ArgSortOp
from
theano.tensor.sort
import
argsort
,
ArgSortOp
from
theano.tensor.sort
import
topk
,
argtopk
,
topk_and_argtopk
,
TopKOp
_all_dtypes
=
tensor
.
integer_dtypes
+
tensor
.
float_dtypes
class
test_sort
(
unittest
.
TestCase
):
def gen_unique_vector(size, dtype):
    """Build a shuffled length-`size` vector of pairwise-distinct values.

    Values are spaced 3 apart (plus one shared random offset) so they stay
    distinct even after casting, then randomly permuted and shifted to be
    roughly centered around zero.
    """
    offset = np.random.uniform(-1., 1.)
    base = 3. * np.arange(size) + offset
    shuffled = base[np.random.permutation(size)]
    return (shuffled - 1.5 * size).astype(dtype)
class
Test_sort
(
unittest
.
TestCase
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
rng
=
np
.
random
.
RandomState
(
seed
=
utt
.
fetch_seed
())
self
.
rng
=
np
.
random
.
RandomState
(
seed
=
utt
.
fetch_seed
())
...
@@ -22,7 +33,7 @@ class test_sort(unittest.TestCase):
...
@@ -22,7 +33,7 @@ class test_sort(unittest.TestCase):
a
=
tensor
.
dmatrix
()
a
=
tensor
.
dmatrix
()
w
=
sort
(
a
)
w
=
sort
(
a
)
f
=
theano
.
function
([
a
],
w
)
f
=
theano
.
function
([
a
],
w
)
assert
np
.
allclose
(
f
(
self
.
m_val
),
np
.
sort
(
self
.
m_val
))
utt
.
assert_
allclose
(
f
(
self
.
m_val
),
np
.
sort
(
self
.
m_val
))
def
test2
(
self
):
def
test2
(
self
):
a
=
tensor
.
dmatrix
()
a
=
tensor
.
dmatrix
()
...
@@ -32,7 +43,7 @@ class test_sort(unittest.TestCase):
...
@@ -32,7 +43,7 @@ class test_sort(unittest.TestCase):
for
axis_val
in
0
,
1
:
for
axis_val
in
0
,
1
:
gv
=
f
(
self
.
m_val
,
axis_val
)
gv
=
f
(
self
.
m_val
,
axis_val
)
gt
=
np
.
sort
(
self
.
m_val
,
axis_val
)
gt
=
np
.
sort
(
self
.
m_val
,
axis_val
)
assert
np
.
allclose
(
gv
,
gt
)
utt
.
assert_
allclose
(
gv
,
gt
)
def
test3
(
self
):
def
test3
(
self
):
a
=
tensor
.
dvector
()
a
=
tensor
.
dvector
()
...
@@ -40,7 +51,7 @@ class test_sort(unittest.TestCase):
...
@@ -40,7 +51,7 @@ class test_sort(unittest.TestCase):
f
=
theano
.
function
([
a
],
w2
)
f
=
theano
.
function
([
a
],
w2
)
gv
=
f
(
self
.
v_val
)
gv
=
f
(
self
.
v_val
)
gt
=
np
.
sort
(
self
.
v_val
)
gt
=
np
.
sort
(
self
.
v_val
)
assert
np
.
allclose
(
gv
,
gt
)
utt
.
assert_
allclose
(
gv
,
gt
)
def
test4
(
self
):
def
test4
(
self
):
a
=
tensor
.
dmatrix
()
a
=
tensor
.
dmatrix
()
...
@@ -50,7 +61,7 @@ class test_sort(unittest.TestCase):
...
@@ -50,7 +61,7 @@ class test_sort(unittest.TestCase):
for
axis_val
in
0
,
1
:
for
axis_val
in
0
,
1
:
gv
=
f
(
self
.
m_val
,
axis_val
)
gv
=
f
(
self
.
m_val
,
axis_val
)
gt
=
np
.
sort
(
self
.
m_val
,
axis_val
)
gt
=
np
.
sort
(
self
.
m_val
,
axis_val
)
assert
np
.
allclose
(
gv
,
gt
)
utt
.
assert_
allclose
(
gv
,
gt
)
def
test5
(
self
):
def
test5
(
self
):
a1
=
SortOp
(
"mergesort"
,
[])
a1
=
SortOp
(
"mergesort"
,
[])
...
@@ -67,10 +78,9 @@ class test_sort(unittest.TestCase):
...
@@ -67,10 +78,9 @@ class test_sort(unittest.TestCase):
f
=
theano
.
function
([
a
],
l
)
f
=
theano
.
function
([
a
],
l
)
gv
=
f
(
self
.
m_val
)
gv
=
f
(
self
.
m_val
)
gt
=
np
.
sort
(
self
.
m_val
,
None
)
gt
=
np
.
sort
(
self
.
m_val
,
None
)
assert
np
.
allclose
(
gv
,
gt
)
utt
.
assert_
allclose
(
gv
,
gt
)
def
test_grad_vector
(
self
):
def
test_grad_vector
(
self
):
a
=
theano
.
tensor
.
vector
()
data
=
np
.
random
.
rand
(
10
)
.
astype
(
theano
.
config
.
floatX
)
data
=
np
.
random
.
rand
(
10
)
.
astype
(
theano
.
config
.
floatX
)
utt
.
verify_grad
(
sort
,
[
data
])
utt
.
verify_grad
(
sort
,
[
data
])
...
@@ -133,19 +143,19 @@ class test_sort(unittest.TestCase):
...
@@ -133,19 +143,19 @@ class test_sort(unittest.TestCase):
utt
.
verify_grad
(
lambda
x
:
sort
(
x
,
3
),
[
data
])
utt
.
verify_grad
(
lambda
x
:
sort
(
x
,
3
),
[
data
])
class
Tensor
InferShapeTester
(
utt
.
InferShapeTester
):
class
Sort
InferShapeTester
(
utt
.
InferShapeTester
):
def
test_sort
(
self
):
def
test_sort
(
self
):
x
=
tensor
.
matrix
()
x
=
tensor
.
matrix
()
self
.
_compile_and_check
(
self
.
_compile_and_check
(
[
x
],
[
x
],
[
sort
(
x
)],
[
sort
(
x
)],
[
np
.
random
.
randn
(
10
,
40
)
.
astype
(
theano
.
config
.
floatX
)],
[
np
.
random
.
randn
(
10
,
40
)
.
astype
(
theano
.
config
.
floatX
)],
SortOp
)
SortOp
)
self
.
_compile_and_check
(
self
.
_compile_and_check
(
[
x
],
[
x
],
[
sort
(
x
,
axis
=
None
)],
[
sort
(
x
,
axis
=
None
)],
[
np
.
random
.
randn
(
10
,
40
)
.
astype
(
theano
.
config
.
floatX
)],
[
np
.
random
.
randn
(
10
,
40
)
.
astype
(
theano
.
config
.
floatX
)],
SortOp
)
SortOp
)
def
test_argsort
():
def
test_argsort
():
...
@@ -160,7 +170,7 @@ def test_argsort():
...
@@ -160,7 +170,7 @@ def test_argsort():
f
=
theano
.
function
([
a
],
w
)
f
=
theano
.
function
([
a
],
w
)
gv
=
f
(
m_val
)
gv
=
f
(
m_val
)
gt
=
np
.
argsort
(
m_val
)
gt
=
np
.
argsort
(
m_val
)
assert
np
.
allclose
(
gv
,
gt
)
utt
.
assert_
allclose
(
gv
,
gt
)
# Example 2
# Example 2
a
=
tensor
.
dmatrix
()
a
=
tensor
.
dmatrix
()
...
@@ -170,7 +180,7 @@ def test_argsort():
...
@@ -170,7 +180,7 @@ def test_argsort():
for
axis_val
in
0
,
1
:
for
axis_val
in
0
,
1
:
gv
=
f
(
m_val
,
axis_val
)
gv
=
f
(
m_val
,
axis_val
)
gt
=
np
.
argsort
(
m_val
,
axis_val
)
gt
=
np
.
argsort
(
m_val
,
axis_val
)
assert
np
.
allclose
(
gv
,
gt
)
utt
.
assert_
allclose
(
gv
,
gt
)
# Example 3
# Example 3
a
=
tensor
.
dvector
()
a
=
tensor
.
dvector
()
...
@@ -178,7 +188,7 @@ def test_argsort():
...
@@ -178,7 +188,7 @@ def test_argsort():
f
=
theano
.
function
([
a
],
w2
)
f
=
theano
.
function
([
a
],
w2
)
gv
=
f
(
v_val
)
gv
=
f
(
v_val
)
gt
=
np
.
argsort
(
v_val
)
gt
=
np
.
argsort
(
v_val
)
assert
np
.
allclose
(
gv
,
gt
)
utt
.
assert_
allclose
(
gv
,
gt
)
# Example 4
# Example 4
a
=
tensor
.
dmatrix
()
a
=
tensor
.
dmatrix
()
...
@@ -188,7 +198,7 @@ def test_argsort():
...
@@ -188,7 +198,7 @@ def test_argsort():
for
axis_val
in
0
,
1
:
for
axis_val
in
0
,
1
:
gv
=
f
(
m_val
,
axis_val
)
gv
=
f
(
m_val
,
axis_val
)
gt
=
np
.
argsort
(
m_val
,
axis_val
)
gt
=
np
.
argsort
(
m_val
,
axis_val
)
assert
np
.
allclose
(
gv
,
gt
)
utt
.
assert_
allclose
(
gv
,
gt
)
# Example 5
# Example 5
a
=
tensor
.
dmatrix
()
a
=
tensor
.
dmatrix
()
...
@@ -206,7 +216,7 @@ def test_argsort():
...
@@ -206,7 +216,7 @@ def test_argsort():
f
=
theano
.
function
([
a
],
w2
)
f
=
theano
.
function
([
a
],
w2
)
gv
=
f
(
m_val
)
gv
=
f
(
m_val
)
gt
=
np
.
argsort
(
m_val
,
None
)
gt
=
np
.
argsort
(
m_val
,
None
)
assert
np
.
allclose
(
gv
,
gt
)
utt
.
assert_
allclose
(
gv
,
gt
)
def
test_argsort_grad
():
def
test_argsort_grad
():
...
@@ -219,3 +229,222 @@ def test_argsort_grad():
...
@@ -219,3 +229,222 @@ def test_argsort_grad():
data
=
np
.
random
.
rand
(
2
,
3
,
3
)
.
astype
(
theano
.
config
.
floatX
)
data
=
np
.
random
.
rand
(
2
,
3
,
3
)
.
astype
(
theano
.
config
.
floatX
)
utt
.
verify_grad
(
lambda
x
:
argsort
(
x
,
axis
=
2
),
[
data
])
utt
.
verify_grad
(
lambda
x
:
argsort
(
x
,
axis
=
2
),
[
data
])
class
Test_TopK
(
unittest
.
TestCase
):
def
setUp
(
self
):
pass
@utt.parameterized.expand
(
product
(
_all_dtypes
,
tensor
.
integer_dtypes
,
[
-
1
,
0
,
None
],
[
False
]))
def
test_argtopk_sanity
(
self
,
dtype
,
idx_dtype
,
axis
,
sorted
):
x
=
tensor
.
vector
(
name
=
'x'
,
dtype
=
dtype
)
fn
=
theano
.
function
([
x
],
argtopk
(
x
,
1
,
axis
=
axis
,
sorted
=
sorted
,
idx_dtype
=
idx_dtype
))
xval
=
np
.
asarray
([
1
])
.
astype
(
dtype
)
yval
=
fn
(
xval
)
assert
yval
==
np
.
asarray
([
0
],
dtype
=
idx_dtype
)
assert
yval
.
dtype
==
np
.
dtype
(
idx_dtype
)
@utt.parameterized.expand
(
product
(
_all_dtypes
,
[
-
1
,
0
,
None
],
[
False
]))
def
test_topk_sanity
(
self
,
dtype
,
axis
,
sorted
):
x
=
tensor
.
vector
(
name
=
'x'
,
dtype
=
dtype
)
fn
=
theano
.
function
([
x
],
topk
(
x
,
1
,
axis
=
axis
,
sorted
=
sorted
))
xval
=
np
.
asarray
([
1
])
.
astype
(
dtype
)
yval
=
fn
(
xval
)
assert
yval
==
xval
assert
yval
.
dtype
==
xval
.
dtype
@utt.parameterized.expand
(
product
(
_all_dtypes
,
tensor
.
integer_dtypes
,
[
-
1
,
0
,
None
],
[
False
]))
def
test_combined_sanity
(
self
,
dtype
,
idx_dtype
,
axis
,
sorted
):
x
=
tensor
.
vector
(
name
=
'x'
,
dtype
=
dtype
)
yv
,
yi
=
topk_and_argtopk
(
x
,
1
,
axis
=
axis
,
sorted
=
sorted
,
idx_dtype
=
idx_dtype
)
fn
=
theano
.
function
([
x
],
[
yv
,
yi
])
xval
=
np
.
asarray
([
1
])
.
astype
(
dtype
)
yvval
,
yival
=
fn
(
xval
)
assert
yival
==
np
.
asarray
([
0
],
dtype
=
idx_dtype
)
utt
.
assert_allclose
(
xval
,
yvval
)
assert
yvval
.
dtype
==
xval
.
dtype
assert
yival
.
dtype
==
np
.
dtype
(
idx_dtype
)
@utt.parameterized.expand
(
chain
(
product
(
(
16
,
61
,
257
),
(
1
,
-
1
,
-
10
,
'n//2'
,
'n-1'
,
'-n'
,
'1-n'
),
(
'float64'
,
'float16'
,
'int16'
,
'int8'
),
(
False
,)),
((
2049
,
1337
,
'float64'
,
False
),)))
def
test_topk_1d
(
self
,
size
,
k
,
dtype
,
sorted
):
if
isinstance
(
k
,
str
):
k
=
eval
(
k
.
replace
(
'n'
,
str
(
size
)))
x
=
theano
.
tensor
.
vector
(
name
=
'x'
,
dtype
=
dtype
)
y
=
topk
(
x
,
k
,
sorted
=
sorted
)
fn
=
theano
.
function
([
x
],
y
)
# assert local_useless_topk opt is done properly
assert
1
==
len
(
fn
.
maker
.
fgraph
.
outputs
[
0
]
.
owner
.
outputs
)
# generate a all-unique array
xval
=
gen_unique_vector
(
size
,
dtype
)
yval
=
fn
(
xval
)
idx
=
(
slice
(
-
k
,
None
)
if
k
>
0
else
slice
(
-
k
))
goal
=
np
.
sort
(
xval
)[
idx
]
assert
yval
.
dtype
==
goal
.
dtype
utt
.
assert_allclose
(
np
.
sort
(
yval
),
goal
)
@utt.parameterized.expand
(
chain
(
product
(
(
16
,
61
,
257
),
(
1
,
-
1
,
-
10
,
'n//2'
,
'n-1'
,
'-n'
),
(
'float32'
,
'int32'
),
(
False
,),
(
'int32'
,
'int64'
)),
((
2049
,
1337
,
'float32'
,
False
,
'int32'
),)))
def
test_argtopk_1d
(
self
,
size
,
k
,
dtype
,
sorted
,
idx_dtype
):
if
isinstance
(
k
,
str
):
k
=
eval
(
k
.
replace
(
'n'
,
str
(
size
)))
x
=
theano
.
tensor
.
vector
(
name
=
'x'
,
dtype
=
dtype
)
y
=
argtopk
(
x
,
k
,
sorted
=
sorted
,
idx_dtype
=
idx_dtype
)
fn
=
theano
.
function
([
x
],
y
)
# assert local_useless_topk opt is done properly
assert
1
==
len
(
fn
.
maker
.
fgraph
.
outputs
[
0
]
.
owner
.
outputs
)
# generate a all-unique array
xval
=
gen_unique_vector
(
size
,
dtype
)
yval
=
fn
(
xval
)
idx
=
(
slice
(
-
k
,
None
)
if
k
>
0
else
slice
(
-
k
))
goal
=
np
.
argsort
(
xval
)[
idx
]
.
astype
(
idx_dtype
)
# due to uniqueness, we expect indices same
assert
np
.
all
(
xval
[
np
.
sort
(
yval
)]
==
xval
[
np
.
sort
(
goal
)])
@utt.parameterized.expand
(
chain
(
product
(
(
16
,
61
,
257
),
(
1
,
-
1
,
10
,
'n//2'
,
'n-1'
,
'1-n'
),
(
'float32'
,
'int32'
),
(
False
,),
(
'int32'
,
'int64'
)),
((
2049
,
1337
,
'float32'
,
False
,
'int32'
),)))
def
test_combined_1d
(
self
,
size
,
k
,
dtype
,
sorted
,
idx_dtype
):
if
isinstance
(
k
,
str
):
k
=
eval
(
k
.
replace
(
'n'
,
str
(
size
)))
x
=
theano
.
tensor
.
vector
(
name
=
'x'
,
dtype
=
dtype
)
yv
,
yi
=
topk_and_argtopk
(
x
,
k
,
sorted
=
sorted
,
idx_dtype
=
idx_dtype
)
fn
=
theano
.
function
([
x
],
[
yv
,
yi
])
# generate a all-unique array
xval
=
gen_unique_vector
(
size
,
dtype
)
yvval
,
yival
=
fn
(
xval
)
idx
=
(
slice
(
-
k
,
None
)
if
k
>
0
else
slice
(
-
k
))
goali
=
np
.
argsort
(
xval
)[
idx
]
.
astype
(
idx_dtype
)
goalv
=
xval
[
goali
]
# due to uniqueness, we expect indices same
assert
np
.
all
(
xval
[
np
.
sort
(
yival
)]
==
xval
[
np
.
sort
(
goali
)])
utt
.
assert_allclose
(
np
.
sort
(
yvval
),
goalv
)
@utt.parameterized.expand
(
chain
(
product
(
(
18
,
62
,
258
),
(
1
,
-
1
,
'n//2'
),
(
'int32'
,
'float32'
),
(
False
,)),
((
2048
,
1337
,
'float32'
,
False
),)))
def
test_argtopk_1d_collision
(
self
,
size
,
k
,
dtype
,
sorted
):
# with non-unique kth max value
if
isinstance
(
k
,
str
):
k
=
eval
(
k
.
replace
(
'n'
,
str
(
size
)))
x
=
theano
.
tensor
.
vector
(
name
=
'x'
,
dtype
=
dtype
)
y
=
argtopk
(
x
,
k
,
sorted
=
sorted
,
idx_dtype
=
'int32'
)
fn
=
theano
.
function
([
x
],
y
)
xval
=
np
.
repeat
(
np
.
random
.
uniform
(
-
100.
,
100.
,
size
=
size
//
2
)
.
astype
(
dtype
),
2
)
xval
=
xval
[
np
.
random
.
permutation
(
size
)]
yval
=
fn
(
xval
)
idx
=
(
slice
(
-
k
,
None
)
if
k
>
0
else
slice
(
-
k
))
goal
=
np
.
argsort
(
xval
)[
idx
]
.
astype
(
'int32'
)
utt
.
assert_allclose
(
np
.
sort
(
xval
[
yval
]),
np
.
sort
(
xval
[
goal
]))
@utt.parameterized.expand
(
product
(
((
17
,
15
),
(
2
,
3
,
5
,
7
,
11
),
(
2017
,
5
,
3
)),
(
-
1
,
'(1+n)//2'
,
'-n'
,
'1-n'
),
(
'float32'
,
'int32'
),
(
False
,),
(
'int32'
,
'int64'
)))
def
test_argtopk_nd
(
self
,
shp
,
k_
,
dtype
,
sorted
,
idx_dtype
):
ndim
=
len
(
shp
)
for
axis
in
range
(
-
ndim
,
ndim
):
if
isinstance
(
k_
,
str
):
k
=
eval
(
k_
.
replace
(
'n'
,
str
(
shp
[
axis
])))
else
:
k
=
k_
if
k
==
0
:
continue
x
=
theano
.
tensor
.
tensor
(
name
=
'x'
,
broadcastable
=
(
False
,)
*
len
(
shp
),
dtype
=
dtype
)
y
=
argtopk
(
x
,
k
,
axis
=
axis
,
sorted
=
sorted
,
idx_dtype
=
idx_dtype
)
fn
=
theano
.
function
([
x
],
y
)
size
=
reduce
(
int
.
__mul__
,
shp
)
xval
=
gen_unique_vector
(
size
,
dtype
)
.
reshape
(
shp
)
yval
=
fn
(
xval
)
idx
=
slice
(
-
k
,
None
)
if
k
>
0
else
slice
(
-
k
)
l
=
axis
%
ndim
r
=
ndim
-
l
idx
=
(
slice
(
None
),)
*
l
+
(
idx
,)
+
(
slice
(
None
),)
*
(
r
-
1
)
goal
=
np
.
argsort
(
xval
,
axis
=
axis
)[
idx
]
.
astype
(
idx_dtype
)
assert
np
.
all
(
np
.
sort
(
yval
,
axis
=
axis
)
==
np
.
sort
(
goal
,
axis
=
axis
))
@utt.parameterized.expand(
    product(((257,), (17, 15), (5, 3, 5, 3), (2, 3, 5, 7, 11)),
            (1, -1, '(1+n)//2', 'n-1', '-n', '1-n'),
            (False,)))
def test_grad(self, shp, k_, sorted):
    # Verify the gradient of topk numerically, for every axis of `shp`.
    ndim = len(shp)
    for axis in range(-ndim, ndim):
        if isinstance(k_, str):
            k = eval(k_.replace('n', str(shp[axis])))
        else:
            k = k_
        if k == 0:
            continue

        # Use pairwise-distinct input values: the gradient of topk is
        # undefined wherever two entries are equal.
        n_elem = reduce(int.__mul__, shp)
        xval = gen_unique_vector(
            n_elem, dtype=theano.config.floatX).reshape(shp)
        utt.verify_grad(
            lambda x: topk(x, k, axis=axis, sorted=sorted),
            [xval], eps=1e-2)
class TopKInferShapeTester(utt.InferShapeTester):
    """Shape-inference tests for TopKOp producing both outputs."""

    @utt.parameterized.expand(
        product(((2, 3), (15, 17), (11, 7, 5), (2, 3, 5, 7, 11),
                 (2, 4, 3, 1)),
                (1, '(1+n)//2', 'n-1', 'n')))
    def test_combined_infer_shape(self, shp, k_):
        # Run _compile_and_check on topk_and_argtopk along every axis.
        ndim = len(shp)
        for axis in range(-ndim, ndim):
            if isinstance(k_, str):
                k = eval(k_.replace('n', str(shp[axis])))
            else:
                k = k_
            if k == 0:
                continue

            x = theano.tensor.tensor(
                name='x',
                broadcastable=(False,) * len(shp),
                dtype=theano.config.floatX)
            yv, yi = topk_and_argtopk(
                x, k, axis=axis, sorted=False, idx_dtype='int32')

            size = reduce(int.__mul__, shp)
            xval = gen_unique_vector(size, theano.config.floatX).reshape(shp)
            self._compile_and_check([x], [yv, yi], [xval], TopKOp)
theano/tests/unittest_tools.py
浏览文件 @
5cc9c326
...
@@ -83,7 +83,7 @@ def seed_rng(pseed=None):
...
@@ -83,7 +83,7 @@ def seed_rng(pseed=None):
def
verify_grad
(
op
,
pt
,
n_tests
=
2
,
rng
=
None
,
*
args
,
**
kwargs
):
def
verify_grad
(
op
,
pt
,
n_tests
=
2
,
rng
=
None
,
*
args
,
**
kwargs
):
"""
"""
Wrapper for
tensor/basic
.py:verify_grad
Wrapper for
gradient
.py:verify_grad
Takes care of seeding the random number generator if None is given
Takes care of seeding the random number generator if None is given
"""
"""
if
rng
is
None
:
if
rng
is
None
:
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论