Commit 5cc9c326 authored by abergeron, committed by GitHub

Merge pull request #5959 from khaotik/topk

TopKOp implementation
......@@ -1367,21 +1367,27 @@ class LocalOptGroup(LocalOptimizer):
self.process_count[opt] += 1
if not new_repl:
continue
else:
if self.profile:
self.node_created[opt] += len(graph.ops(fgraph.variables, new_repl))
self.applied_true[opt] += 1
break # break from the for loop over optimization.
if isinstance(new_repl, (tuple, list)):
new_vars = new_repl
else: # It must be a dict
new_vars = list(new_repl.values())
if self.profile:
self.node_created[opt] += len(graph.ops(fgraph.variables, new_vars))
self.applied_true[opt] += 1
break # break from the for loop over optimization.
if not new_repl: # No optimization applied in the last iteration
return repl
# only 1 iteration or we are at the start of the graph.
if not self.apply_all_opts or not new_repl[0].owner:
# only 1 iteration
if not self.apply_all_opts:
return new_repl
if not new_vars[0].owner:
# We are at the start of the graph.
return new_repl
if len(new_repl) > 1:
s = set([v.owner for v in new_repl])
assert len(s) == 1
repl = new_repl
node = repl[0].owner
node = new_vars[0].owner
@staticmethod
def print_profile(stream, prof, level=0):
......
......@@ -28,7 +28,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg, ctc
from . import fft, dnn, opt, extra_ops, multinomial, reduction, sort, rng_mrg, ctc
def transfer(x, target):
......
// modified from pytorch
// https://github.com/pytorch/pytorch/master/blob/torch/lib/THC/THCTensorTopK.cuh
// original license below:
/*
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#if __CUDA_ARCH__ < 350
#define __ldg(ptr) (*(ptr))
#endif
typedef ptrdiff_t ssize_t;
// Index of the calling thread within its warp (0..31), read from the
// %laneid special register via inline PTX.
__device__ __forceinline__ int lane_id() {
    int id;
    // FIX: a literal '%' in an inline-PTX template must be escaped as '%%'
    // (single '%' introduces an operand reference); the lane_mask_* helpers
    // below already use the escaped form.
    asm("mov.s32 %0, %%laneid;" : "=r"(id) );
    return id;
}
// Mask of warp lanes whose index is strictly lower than the caller's
// (reads the %lanemask_lt special register).
__device__ __forceinline__ unsigned lane_mask_lt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
    return mask;
}
// Mask of warp lanes with index lower than or equal to the caller's.
__device__ __forceinline__ unsigned lane_mask_le() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
    return mask;
}
// Mask of warp lanes with index strictly greater than the caller's.
__device__ __forceinline__ unsigned lane_mask_gt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
    return mask;
}
// Mask of warp lanes with index greater than or equal to the caller's.
__device__ __forceinline__ unsigned lane_mask_ge() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
    return mask;
}
// Bitfield extract/insert helpers implemented with the PTX bfe/bfi
// instructions.
//   get(val, pos, len): the `len` bits of `val` starting at bit `pos`.
//   set(val, toInsert, pos, len): `val` with those bits replaced by the
//     low `len` bits of `toInsert`.
// Only the unsigned 32-bit and 64-bit specializations below are defined.
template <typename T>
struct Bitfield {};

template <>
struct Bitfield<unsigned int> {
    static __device__ __forceinline__
    unsigned int get(unsigned int val, int pos, int len) {
        unsigned int ret;
        asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
        return ret;
    }
    static __device__ __forceinline__
    unsigned int set(unsigned int val, unsigned int toInsert, int pos, int len) {
        unsigned int ret;
        asm("bfi.b32 %0, %1, %2, %3, %4;" :
            "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
        return ret;
    }
};

// 64-bit variant; note pos/len stay 32-bit register operands ("r").
template <>
struct Bitfield<unsigned long long int> {
    static __device__ __forceinline__
    unsigned long long int get(unsigned long long int val, int pos, int len) {
        unsigned long long int ret;
        asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
        return ret;
    }
    static __device__ __forceinline__
    unsigned long long int set(unsigned long long int val, unsigned long long int toInsert, int pos, int len) {
        unsigned long long int ret;
        asm("bfi.b64 %0, %1, %2, %3, %4;" :
            "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
        return ret;
    }
};
template <typename T>
struct RadixConfig {
    // Converts a type (maybe float) to an integer representation with the same
    // sorting; i.e., for floats f1, f2:
    //   if f1 < f2 then convert(f1) < convert(f2)
    // We use this to enable radix selection of floating-point values.
    // This also gives a relative order for NaNs, but that's ok, as they
    // will all be adjacent
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(T v) {
        return (RadixType)v;
    }
    // FIX: the generic fallback previously declared `float deconvert(...)`
    // regardless of T; it must return T so deconvert is the inverse of
    // convert for every instantiated type.
    static inline __device__ T deconvert(RadixType v) {
        return (T)v;
    }
};
// float: flip the sign bit; for negatives also flip all remaining bits,
// so unsigned integer order of the result matches float order.
template <>
struct RadixConfig<float> {
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(float v) {
        RadixType x = __float_as_int(v);
        RadixType mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
        return (x ^ mask);
    }
    static inline __device__ float deconvert(RadixType v) {
        RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
        return __int_as_float(v ^ mask);
    }
};
// double: same scheme as float on 64 bits; -((x >> 63)) is all-ones for
// negative values, so the XOR flips everything, else only the sign bit.
template <>
struct RadixConfig<double> {
    typedef unsigned long long RadixType;
    static inline __device__ RadixType convert(double v) {
        RadixType x = __double_as_longlong(v);
        RadixType mask = -((x >> 63)) | 0x8000000000000000;
        return (x ^ mask);
    }
    static inline __device__ double deconvert(RadixType v) {
        RadixType mask = ((v >> 63) - 1) | 0x8000000000000000;
        return __longlong_as_double(v ^ mask);
    }
};
// signed integers: bias by 2^(bits-1) so order is preserved as unsigned.
template <>
struct RadixConfig<char> {
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(char v) {
        return 128u + v;
    }
    static inline __device__ char deconvert(RadixType v) {
        return v - 128;
    }
};
template <>
struct RadixConfig<short> {
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(short v) {
        assert(sizeof(short) == 2);
        // NOTE(review): integer promotion sign-extends negative v, so bits
        // 16..31 of the result are set; the radix loop only inspects the low
        // bitsof(short) bits (it starts at bitsof(INPUT_TYPE)-RADIX_BITS),
        // where XOR with 0x8000 equals the usual +32768 bias mod 2^16.
        return 32768u ^ v;
    }
    static inline __device__ short deconvert(RadixType v) {
        return v - 32768;
    }
};
template <>
struct RadixConfig<int> {
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(int v) {
        assert(sizeof(int) == 4);
        return 2147483648u + v;
    }
    static inline __device__ int deconvert(RadixType v) {
        return v - 2147483648u;
    }
};
template <>
struct RadixConfig<long long> {
    typedef unsigned long long RadixType;
    static inline __device__ RadixType convert(long long v) {
        assert(sizeof(long long) == 8);
        return 9223372036854775808ull + v;
    }
    static inline __device__ long long deconvert(RadixType v) {
        return v - 9223372036854775808ull;
    }
};
#define USE_HALF $use_half
#if USE_HALF == 1
// since half is ushort, using macro to protect this part is necessary
// (same sign-flip scheme as float, applied to the low 16 bits).
template <>
struct RadixConfig<unsigned short> {
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(unsigned short v) {
        RadixType mask = -(((RadixType)v >> 15)) | 0x8000;
        return (v ^ mask);
    }
    static inline __device__ unsigned short deconvert(RadixType v) {
        RadixType mask = ((v >> 15) - 1) | 0x8000;
        return (unsigned short)(v ^ mask);
    }
};
#endif // USE_HALF
// $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it
#define NDIM $ndim
#define INPUT_TYPE $inp_t
#define INDEX_TYPE $out_t
#define bitsof(T) (sizeof(T)*8)
#define radix_t RadixConfig<INPUT_TYPE>::RadixType
#define WRITE_VALUE $write_value
#define WRITE_INDEX $write_index
#if RADIX_SIZE > 32
#error "RADIX_SIZE must be smaller than warp size (32)"
#endif
// atomicAdd overload for signed long long: CUDA only provides the unsigned
// long long builtin, so reinterpret the bits (two's-complement addition is
// bit-identical for signed and unsigned). Unlike the builtin, this overload
// returns void.
void __device__ atomicAdd(long long *dst, long long &src) {
    atomicAdd(
        reinterpret_cast<unsigned long long*>(dst),
        reinterpret_cast<unsigned long long&>(src));
}
// Inclusive binary cumsum over a 1D thread block.
// `smem` needs one element per warp; the loop over
// blockDim.x / GA_WARP_SIZE warps assumes blockDim.x is a multiple of
// GA_WARP_SIZE (the host side rounds block_size up to the warp size).
template <typename T>
static inline __device__ T binary_cumsum(
    int idx, int warp_id, T* smem, bool value) {
    // cumsum within 1D thread block, which adds up `value` of all threads
    // whose id is *no greater than* the current thread
    // binary_cumsum(1, 0, 1, 0, 1) -> (1, 1, 2, 2, 3)

    // cumsum within warp: ballot collects one bit per lane, popc over the
    // lanes at or below this one gives the inclusive in-warp prefix sum
    unsigned int warp_bits = __ballot(value);
    T warp_sum = __popc(lane_mask_le() & warp_bits);
    if (lane_id() == 0)
        smem[warp_id] = __popc(warp_bits);
    local_barrier();
    // cumsum across warps in one thread; afterwards smem[i] holds the
    // inclusive sum of warps 0..i
    if (idx == 0) {
        T sum = smem[0];
        for (int i = 1; i < blockDim.x / GA_WARP_SIZE; ++i) {
            sum += smem[i];
            smem[i] = sum;
        }
    }
    local_barrier();
    // load the carry from the preceding warp
    if (warp_id >= 1) {
        warp_sum = warp_sum+smem[warp_id - 1];
    }
    return warp_sum;
}
// Exclusive binary cumsum over a 1D thread block; identical to
// binary_cumsum except the current thread's own `value` is excluded
// (lane_mask_lt instead of lane_mask_le). After it returns, smem[w] holds
// the inclusive per-warp totals, so smem[last_warp] is the block-wide count.
template <typename T>
static inline __device__ T binary_cumsum_exclusive(
    int idx, int warp_id, T* smem, bool value) {
    // cumsum within 1D thread block, which adds up `value` of all threads
    // whose id is *less than* the current thread
    // binary_cumsum_excl(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)

    // cumsum within warp
    unsigned int warp_bits = __ballot(value);
    T warp_sum = __popc(lane_mask_lt() & warp_bits);
    if (lane_id() == 0)
        smem[warp_id] = __popc(warp_bits);
    local_barrier();
    // cumsum across warps in one thread
    if (idx == 0) {
        T sum = smem[0];
        for (int i = 1; i < blockDim.x / GA_WARP_SIZE; ++i) {
            sum += smem[i];
            smem[i] = sum;
        }
    }
    local_barrier();
    // load the carry from the preceding warp
    if (warp_id >= 1)
        warp_sum += smem[warp_id - 1];
    return warp_sum;
}
// apply raw(byte) offset to pointer (strides here are byte strides)
template <typename T>
static __device__ inline T* ptr_add(T *ptr, ssize_t offset) {
    return (T*)((char*)ptr + offset);
}
// get array element using raw(byte) offset; returns a reference so it can
// be used on the left-hand side of an assignment
template <typename T>
static __device__ inline T& ptr_at(T *ptr, ssize_t offset) {
    return *((T*)((char*)ptr + offset));
}
// read array element using raw(byte) offset; __ldg loads through the
// read-only cache (defined above as a plain dereference for
// __CUDA_ARCH__ < 350)
template <typename T>
static __device__ inline T ptr_read_cached(T *ptr, ssize_t offset) {
    return __ldg(((T*)((char*)ptr + offset)));
}
#define RADIX_BITS 4
#define RADIX_SIZE (1<<RADIX_BITS)
#define RADIX_MASK(n) ((RADIX_SIZE-1) << (n*RADIX_BITS))
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
// works when length on axis is within max allowed threads in block (1024):
// one thread block handles one slice along the top-k axis, one thread per
// element; top-k membership is decided by an MSB-first radix pass over the
// order-preserving integer form of the input.
KERNEL void k_topk_dense(
        $dims
        // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
        $dstv
        // INPUT_TYPE *dstv
        $dstv_strides
        // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
        $dsti
        // INDEX_TYPE *dsti
        $dsti_strides
        // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
        ssize_t k,
        INPUT_TYPE* src,
        $src_strides
        // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
        size_t size) {
    // per-warp digit histograms (up to 32 warps x RADIX_SIZE bins); also
    // reused as scratch by binary_cumsum_exclusive at the end
    __shared__ int smem[32 * RADIX_SIZE];
    // count of top-k slots still unresolved, shared across the block
    __shared__ int k2;
    const unsigned int idx = threadIdx.x;
    // is_topk: this thread's element is (so far) in the top k
    // is_topkth: this thread's element may be the k-th (boundary) value
    bool is_topk = (idx < size);
    bool is_topkth = is_topk;
    size_t out_idx;
    const unsigned char warp_id = idx / GA_WARP_SIZE;

    // 0. get the slice for thread block to work on
    size_t gid = blockIdx.x, gidx;
    $set_slice
    // $$set_slice expands into:
    //for(int i=1; i<NDIM; i++) {
    //  gidx = gid % dims_$${i};
    //  gid /= dims_$${i};
    //  dsti = ptr_add(dsti, gidx*dsti_strides_$${i};
    //  dstv = ptr_add(dstv, gidx*dstv_strides_$${i};
    //  src = ptr_add(src, gidx*src_strides_$${i});
    //}

    // get input and its radix friendly form
    const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : (INPUT_TYPE)0;
    radix_t x = RadixConfig<INPUT_TYPE>::convert(xval);
    // resolve negative k: complementing the radix form reverses the order,
    // turning "k smallest" into "k largest"
    if (k<0) { x = ~x; k = -k; }
    if (idx==0)
        k2 = k;

    // 1. filter is_topk and is_topkth using radix select, one RADIX_BITS
    //    digit per iteration from most to least significant
    #pragma unroll
    for (int i=bitsof(INPUT_TYPE)-RADIX_BITS; i>=0; i-=RADIX_BITS) {
        const int digit = Bitfield<radix_t>::get(x, i, RADIX_BITS);
        /*int digit = (x>>i) & (RADIX_SIZE-1);*/
        // count within warp
        #pragma unroll
        for (int bin=0; bin<RADIX_SIZE; ++bin) {
            bool vote = (bin == digit) && is_topkth;
            unsigned int votes = __ballot(vote);
            if (lane_id()==0)
                smem[bin + RADIX_SIZE*warp_id] = __popc(votes);
        }
        local_barrier();
        // sum counts across all warps
        if (idx < RADIX_SIZE) {
            int sum = smem[idx];
            #pragma unroll
            for(int w=RADIX_SIZE; w<blockDim.x*RADIX_SIZE / GA_WARP_SIZE; w+=RADIX_SIZE)
                sum += smem[idx + w];
            smem[idx] = sum;
        }
        local_barrier();
        // find the bucket and update k2
        // smem[:RADIX_SIZE:-1] = k2 - cumsum(smem[:RADIX_SIZE-1:-1])
        if (idx == 0) {
            int sum = k2;
            #pragma unroll
            for (int bin=RADIX_SIZE-1; bin>=0; --bin) {
                sum -= smem[bin];
                smem[bin] = sum;
                k2 = (sum > 0) ? sum : k2;
            }
            // sentinel so that smem[digit+1] below is valid for the top bin
            smem[RADIX_SIZE] = 1;
        }
        local_barrier();
        if (is_topkth) {
            is_topk &= (smem[digit+1] > 0);
            is_topkth &= (smem[digit] <= 0) && (smem[digit+1] > 0);
        }
        local_barrier();
    }

    // set k2 as number of exceeding values
    if (idx==0) {
        #pragma unroll
        for (int bin=RADIX_SIZE-1; bin>=0; --bin) {
            if (smem[bin] <= 0)
                break;
            k2 = smem[bin];
        }
    }
    local_barrier();

    // 2. find the index of output array, if exists
    if (k2 != 0) {
        // top_kth value may not be unique, so we need to
        // perform binary cumsum on is_topkth to drop exceeding top-kth values
        out_idx = binary_cumsum_exclusive(idx, warp_id, smem, is_topkth);
        if ((out_idx >= k2) && is_topkth)
            is_topk = false;
        local_barrier();
    }

    // perform binary cumsum on is_topk to determine the indices to put result
    out_idx = binary_cumsum_exclusive(idx, warp_id, smem, is_topk);
    if (is_topk) {
#if WRITE_VALUE == 1
        ptr_at(dstv, out_idx * dstv_strides_0) = xval;
#endif
#if WRITE_INDEX == 1
        ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx;
#endif
    }
}
#define RADIX_BITS 2
#define RADIX_SIZE (1<<RADIX_BITS)
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
#define COUNT_TYPE $count_t
#define KERNEL_NAME $kname
// if count_t is int, work for array size within [1025, 2^31-1]
// if count_t is long long, work for array size within [2^31, 2^63-1]
// Block-wide search for the unique element v of `data` whose radix form
// satisfies (convert(v) & known_bits_mask) == known_bits. The first two
// smem slots are used as a found-flag and the found value; the result is
// broadcast to (returned by) every thread. Returns 0 if nothing matches.
template <typename DataType, typename RadixType, typename CountType>
__device__ DataType find_pattern(DataType* smem,
                                 DataType* data,
                                 CountType slice_size,
                                 CountType stride,
                                 RadixType known_bits,
                                 RadixType known_bits_mask) {
    if (threadIdx.x < 32)
        smem[threadIdx.x] = 0;
    local_barrier();

    // All threads participate in the loop, in order to sync on the flag;
    // the bound is rounded up so every thread hits every barrier.
    for (CountType i = threadIdx.x; i < (slice_size + (CountType)blockDim.x-1); i += blockDim.x) {
        bool in_range = (i < slice_size);
        DataType v = in_range ? ptr_read_cached(data, i*stride) : 0;
        if (in_range && ((RadixConfig<DataType>::convert(v) & known_bits_mask) == known_bits)) {
            // There should not be conflicts if we are using find_pattern,
            // since the result is unique
            smem[0] = 1;
            smem[1] = v; // can't use val as the flag, since it could be 0
        }
        local_barrier();
        DataType found = smem[0];
        DataType val = smem[1];
        local_barrier();
        // Check to see if a thread found the value
        if (found != 0)
            return val;
    }
    return 0;
}
// This function counts the distribution of all input values in a
// slice we are selecting by radix digit at `radix_digit_pos`, but only
// those that pass the filter `((v & known_bits_mask) == known_bits)`.
// This produces and broadcasts the seen counts for a single block only.
// `smem` must have at least `RADIX_SIZE` elements.
// On return every thread holds the block-wide histogram in `counts`.
template <typename DataType, typename RadixType, typename CountType>
__device__ void count_radix_masked(CountType counts[RADIX_SIZE],
                                   CountType* smem,
                                   RadixType known_bits,
                                   RadixType known_bits_mask,
                                   int radix_digit_pos,
                                   CountType slice_size,
                                   CountType stride,
                                   DataType* data) {
    // Clear out per-thread counts from a previous round
    #pragma unroll
    for (int i = 0; i < RADIX_SIZE; ++i)
        counts[i] = 0;
    if (threadIdx.x < RADIX_SIZE)
        smem[threadIdx.x] = 0;
    local_barrier();

    // Scan over all the data. Upon a read, the warp will accumulate
    // counts per each digit in the radix using warp voting.
    for (CountType i = threadIdx.x; i < slice_size; i += blockDim.x) {
        RadixType val = RadixConfig<DataType>::convert(ptr_read_cached(data, i*stride));
        bool has_val = ((val & known_bits_mask) == known_bits);
        RadixType digit_in_radix = Bitfield<RadixType>::get(val, radix_digit_pos, RADIX_BITS);
        #pragma unroll
        for (int j = 0; j < RADIX_SIZE; ++j) {
            bool vote = has_val && (digit_in_radix == j);
            counts[j] += __popc(__ballot(vote));
        }
    }

    // Now, for each warp, sum values: lane 0 carries the whole warp's
    // counts (every lane computed the same __ballot result)
    if (lane_id() == 0) {
        for (int i=0; i<RADIX_SIZE; ++i)
            atomicAdd(&smem[i], counts[i]);
    }
    /*
    // not sure why, but this just give wrong results
    if (lane_id() < RADIX_SIZE)
        atomicAdd(&smem[lane_id()], counts[lane_id()]);
    */
    local_barrier();

    // For each thread, read in the total counts
    #pragma unroll
    for (unsigned int i = 0; i < RADIX_SIZE; ++i)
        counts[i] = smem[i];
    local_barrier();
}
// Cooperative radix selection of the k-th largest (order=true) or k-th
// smallest (order=false) element of a strided slice. All threads of the
// block participate and all receive the same *top_kth result.
template <typename DataType, typename RadixType, typename CountType>
__device__ void radix_select(DataType* data,
                             CountType k,
                             bool order,
                             CountType slice_size,
                             CountType stride,
                             CountType* smem,
                             DataType* top_kth) {
    // Per-thread buckets into which we accumulate digit counts in our
    // radix
    register CountType counts[RADIX_SIZE];

    // We only consider elements x such that (x & known_bits_mask) == known_bits
    // Initially, we consider all elements of the array, so the above
    // statement is true regardless of input.
    RadixType known_bits = 0, known_bits_mask = 0;

    // We are looking for the top k_to_find-th element when iterating over
    // digits; this count gets reduced by elimination when counting
    // successive digits
    CountType k_to_find = abs(k);

    // We start at the most significant digit in our radix, scanning
    // through to the least significant digit
    #pragma unroll
    for (int digit_pos = bitsof(DataType) - RADIX_BITS;
         digit_pos >= 0; digit_pos -= RADIX_BITS) {
        // Count radix distribution for the current position and reduce
        // across all threads
        count_radix_masked<DataType, RadixType, CountType>(
            counts, smem,
            known_bits, known_bits_mask, digit_pos,
            slice_size, stride, data);

        // All threads participate in the comparisons below to know the
        // final result
        #define CHECK_RADIX(i) \\
        int count = counts[i]; \\
        /* All threads have the same value in counts here, so all */ \\
        /* threads will return from the function. */ \\
        if (count == 1 && k_to_find == 1) { \\
            /* There is a unique answer. */ \\
            known_bits = Bitfield<RadixType>::set( \\
                known_bits, i, digit_pos, RADIX_BITS); \\
            known_bits_mask = Bitfield<RadixType>::set( \\
                known_bits_mask, RADIX_SIZE-1, digit_pos, RADIX_BITS); \\
            /* The answer is now the unique element v such that: */ \\
            /* (v & known_bits_mask) == known_bits */ \\
            /* However, we do not yet know what the actual element is. We */ \\
            /* need to perform a search through the data to find the */ \\
            /* element that matches this pattern. */ \\
            *top_kth = find_pattern<DataType, RadixType, CountType>( \\
                (DataType*) smem, data, slice_size, \\
                stride, known_bits, known_bits_mask); \\
            return; \\
        } \\
        if (count >= k_to_find) { \\
            known_bits = Bitfield<RadixType>::set(known_bits, i, digit_pos, RADIX_BITS); \\
            known_bits_mask = Bitfield<RadixType>::set( \\
                known_bits_mask, RADIX_SIZE-1, digit_pos, RADIX_BITS); \\
            /* The top-Kth element v must now be one such that: */ \\
            /* (v & known_bits_mask == known_bits) */ \\
            /* but we haven't narrowed it down; we must check the next */ \\
            /* least-significant digit */ \\
            break; \\
        } \\
        k_to_find -= count

        // scan bins from the winning end: high digits first for the
        // largest-k case, low digits first for the smallest-k case
        if (order) {
            #pragma unroll
            for (int i=RADIX_SIZE - 1; i >= 0; --i) {
                CHECK_RADIX(i);
            }
        } else {
            #pragma unroll
            for (int i=0; i < RADIX_SIZE; ++i) {
                CHECK_RADIX(i);
            }
        }
        #undef CHECK_RADIX
    } // end digit_pos for

    // There is no unique result, but there is a non-unique result
    // matching `known_bits` exactly
    *top_kth = RadixConfig<DataType>::deconvert(known_bits);
}
// Top-k for slices longer than one thread block's worth of threads:
// radix_select finds the k-th value, then the slice is swept twice —
// first writing strict winners, then as many ties with the k-th value
// as are needed to fill exactly k outputs.
KERNEL void KERNEL_NAME(
        $dims
        // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
        $dstv
        // INPUT_TYPE *dstv
        $dstv_strides
        // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
        $dsti
        // INDEX_TYPE *dsti
        $dsti_strides
        // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
        ssize_t k,
        INPUT_TYPE* src,
        $src_strides
        // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
        size_t size) {
    // scratch shared by radix_select / find_pattern / cumsums;
    // 32 entries = one per warp at the 1024-thread maximum
    __shared__ COUNT_TYPE smem[32];
    INPUT_TYPE topkth_value;
    // positive k selects the largest values, negative the smallest
    const bool order = (k>0);
    k = (order ? k : -k);
    const int idx = threadIdx.x;
    const int warp_id = idx / GA_WARP_SIZE;

    // get the slice for thread block to work on
    // size <- the axis to work on
    // dims_1+ <- batched dimensions
    unsigned int gid = blockIdx.x, gidx;
    $set_slice
    // $$set_slice expands into:
    //for(int i=1; i<NDIM; i++) {
    //  gidx = gid % dims_$${i};
    //  gid /= dims_$${i};
    //  dsti = ptr_add(dsti, gidx*dsti_strides_$${i});
    //  dstv = ptr_add(dstv, gidx*dstv_strides_$${i});
    //  src = ptr_add(src, gidx*src_strides_$${i});
    //}

    radix_select<INPUT_TYPE, radix_t, COUNT_TYPE>(
        src, k, order, size, src_strides_0,
        smem, &topkth_value);

    // Every value that is strictly less/greater than `pattern`
    // (depending on sort dir) in sorted int format is in the top-K.
    // The top-K value itself might not be unique.
    //
    // Since there are a variable number of elements that we see that
    // are within the top-k, we don't know at what index to write out
    // the resulting values.
    // In order to get this, we perform an exclusive cumsum of
    // `has_topk`. This will return the resulting index into which we
    // need to write the result, if a thread has a result.

    // All threads need to participate in the loop and the cumsum
    // but not necessarily in the load; hence loop bounds being rounded
    // up to a multiple of the block dim.
    COUNT_TYPE iter_bound = size + blockDim.x-1;
    INDEX_TYPE write_base = 0;
    for (int i = idx; i < iter_bound; i += blockDim.x) {
        bool in_range = (i < size);
        INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0;
        bool has_topk;
        if (order) {
            has_topk = in_range && (v > topkth_value);
        } else {
            has_topk = in_range && (v < topkth_value);
        }

        int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
        // after the cumsum, the last warp's smem slot is the number of
        // writers in this whole pass over blockDim.x elements
        int carry = smem[blockDim.x / 32 - 1];

        if (has_topk) {
            COUNT_TYPE write_idx = write_base + index;
#if WRITE_VALUE == 1
            ptr_at(dstv, write_idx * dstv_strides_0) = v;
#endif
#if WRITE_INDEX == 1
            ptr_at(dsti, write_idx * dsti_strides_0) = (INDEX_TYPE)i;
#endif
        }
        write_base += carry;
    }

    // second sweep: fill the remaining slots with elements equal to the
    // k-th value, stopping once exactly k outputs have been written
    COUNT_TYPE topk_remaining = (k - write_base);
    for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
        bool in_range = (i < size);
        INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0;
        bool has_topk = in_range && (v == topkth_value);

        int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
        int carry = smem[blockDim.x / 32 - 1];

        if (has_topk && index < topk_remaining) {
            COUNT_TYPE write_idx = write_base + index;
#if WRITE_VALUE == 1
            ptr_at(dstv, write_idx * dstv_strides_0) = v;
#endif
#if WRITE_INDEX == 1
            ptr_at(dsti, write_idx * dsti_strides_0) = (INDEX_TYPE)i;
#endif
        }

        if (carry >= topk_remaining)
            break;
        topk_remaining -= carry;
        write_base += carry;
    }
}
from __future__ import absolute_import, print_function, division
import os
from string import Template
from theano import Apply
from theano.tensor import as_tensor_variable
from theano.tensor.sort import TopKOp
from .basic_ops import (GpuKernelBase, Kernel, infer_context_name,
as_gpuarray_variable, gpuarray_helper_inc_dir)
from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType
try:
import pygpu
import pygpu.gpuarray as ga
except ImportError as e:
# To make sure theano is importable
pass
# TODO GPU sort / argsort
class GpuTopKOp(GpuKernelBase, TopKOp):
    """
    Implements TopKOp on gpu.

    Only CUDA devices are supported; the kernels are rendered from the
    templates in ``c_code/topk_dense*`` via ``string.Template``.
    """
    __props__ = TopKOp.__props__
    _f16_ok = True

    def __init__(
        self, axis=-1,
        sorted=True,
        idx_dtype='int64',
        return_values=True,
        return_indices=True
    ):
        GpuKernelBase.__init__(self)
        TopKOp.__init__(
            self, axis=axis,
            sorted=sorted,
            idx_dtype=idx_dtype,
            return_values=return_values,
            return_indices=return_indices)

    def c_headers(self):
        return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']

    def c_header_dirs(self):
        return [
            os.path.dirname(__file__),
            gpuarray_helper_inc_dir(),
            pygpu.get_include()]

    def c_code_cache_version(self):
        # bumped: generated C code changed (params_di fix, k param type)
        return (2,)

    def gpu_kernels(self, node, nodename):
        """Compile the three dense top-k kernel variants for `node`.

        Returns
        -------
        list of Kernel
            ``k_topk_dense`` (axis length fits in one block),
            ``k_topk_dense_large`` (32-bit counters) and
            ``k_topk_dense_xlarge`` (64-bit counters).
        """
        # load kernel source
        device_type = node.inputs[0].type.context.kind
        kernel_ext = {b'cuda': '.cu', b'opencl': '.cl'}[device_type]
        common_ext = {b'cuda': '.cuh', b'opencl': '.h'}[device_type]

        # prepare "$" macros for string.Template substitution
        if device_type == b'cuda':
            ndim = node.inputs[0].ndim
            dstv_strides_code = ''.join('ssize_t dstv_strides_%d, ' % i for i in range(ndim))
            dsti_strides_code = ''.join('ssize_t dsti_strides_%d, ' % i for i in range(ndim))
            src_strides_code = ''.join('ssize_t src_strides_%d, ' % i for i in range(ndim))
            # per-batched-dimension pointer adjustment, expanded into the
            # kernels' "$set_slice" placeholder
            set_slice_code = '''
        gidx = gid %% dims_%(i)d;
        gid /= dims_%(i)d;
        {dstv};
        {dsti};
        src = ptr_add(src, gidx*src_strides_%(i)d);\n'''.format(
                dstv='dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)' if self.return_values else '',
                dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '')
            set_slice_code = ''.join(
                set_slice_code % dict(i=j) for j in range(1, ndim))
            flags = Kernel.get_flags(node.inputs[0].dtype)
            subs = dict(
                inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
                out_t=ga.dtype_to_ctype(self.idx_dtype),
                dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)),
                dstv='INPUT_TYPE *dstv,' if self.return_values else '',
                dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
                dstv_strides=dstv_strides_code if self.return_values else '',
                dsti_strides=dsti_strides_code if self.return_indices else '',
                src_strides=src_strides_code,
                set_slice=set_slice_code,
                write_value=int(self.return_values),
                write_index=int(self.return_indices),
                ndim=str(ndim),
                use_half=int(node.inputs[0].dtype == 'float16')
            )
        elif device_type == b'opencl':
            raise NotImplementedError()

        # setup parameters, mirroring the kernel signatures:
        # dims..., [dst, dst_strides...] per output, k, src, src_strides..., size
        param_types = [ga.SIZE] * (ndim - 1)  # dims
        for _ in range(self.return_values + self.return_indices):
            param_types.append(ga.GpuArray)  # dst*
            param_types.extend([ga.SSIZE] * ndim)  # dst*_strides
        # FIX: k is declared `ssize_t k` in the kernels and may legitimately
        # be negative (selects the k smallest), so it must be SSIZE not SIZE
        param_types.append(ga.SSIZE)  # k
        param_types.append(ga.GpuArray)  # src
        param_types.extend([ga.SSIZE] * ndim)  # src_strides
        param_types.append(ga.SIZE)  # size

        # load and compile kernels
        with open(os.path.join(
            os.path.dirname(__file__), 'c_code', 'topk_common' + common_ext
        )) as f:
            common_src = f.read()

        kernels = []

        def build_kernel(fname, kname, subs):
            # render one template file against `subs` and wrap it in a Kernel
            with open(os.path.join(
                os.path.dirname(__file__), 'c_code', fname)
            ) as f:
                kernel_src = f.read()
            ker = Kernel(
                code=Template(common_src + kernel_src).substitute(**subs),
                name=kname,
                params=param_types,
                flags=flags,
                objvar=kname + nodename)
            return ker

        subs['count_t'] = 'int'
        kernels.append(
            build_kernel('topk_dense' + kernel_ext, 'k_topk_dense', subs))
        subs['kname'] = 'k_topk_dense_large'
        kernels.append(
            build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_large', subs))
        # xlarge variant: 64-bit counters for axis length >= 2**31
        subs['count_t'] = 'long long'
        subs['kname'] = 'k_topk_dense_xlarge'
        kernels.append(
            build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_xlarge', subs))
        return kernels

    def c_code(self, node, nodename, inps, outs, sub):
        """Generate the host-side C code that validates k, allocates the
        outputs and dispatches one of the three kernels by axis length."""
        context = node.inputs[0].type.context
        if context.kind != b'cuda':
            raise NotImplementedError(
                '%s: We only have CUDA '
                'implementation so far.' % self.__class__.__name__)
        x, k = inps
        inp_dtc = ga.dtype_to_typecode(node.inputs[0].dtype)
        if not self.return_indices:
            yv, = outs
        elif self.return_values:
            yv, yi = outs
        else:
            yi, = outs
        out_dtype_s = self.idx_dtype
        out_dtc = ga.dtype_to_typecode(out_dtype_s)
        fail = sub['fail']
        ctx = sub['params']
        k_dtype = node.inputs[1].type.dtype_specs()[1]
        # max threads per block
        MAX_TPB = context.maxlsize
        # max blocks per grid
        MAX_BPG = context.maxgsize0
        WARP_SIZE = 32

        ndim = node.inputs[0].ndim
        # move the top-k axis to the front; the kernels treat axis 0 as the
        # selection axis and the rest as batched dimensions
        reordered_axes = list(range(ndim))
        axis = self.axis % ndim
        del reordered_axes[axis]
        reordered_axes = [axis] + reordered_axes
        dims = ''.join('(void*)(dims+%d), ' % i for i in reordered_axes[1:])
        prep_output = ''

        if self.return_values:
            def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv
            params_dv = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % (yv, yv)
            params_dv += ''.join('(void*)(dvstrides+%d), ' % i for i in reordered_axes)
            prep_output += '''
    if (0 != theano_prep_output(
        &%(yv)s, %(ndim)d, odims,
        %(inp_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
        else:
            def_dvstrides = params_dv = ''

        if self.return_indices:
            def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi
            # FIX: the format string contains two '%s' placeholders, so it
            # needs the (yi, yi) tuple; '% yi' raised TypeError during
            # code generation (compare params_dv above).
            params_di = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % (yi, yi)
            params_di += ''.join('(void*)(distrides+%d), ' % i for i in reordered_axes)
            prep_output += '''
    if (0 != theano_prep_output(
        &%(yi)s, %(ndim)d, odims,
        %(out_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
        else:
            def_distrides = params_di = ''

        sstrides = ', '.join('(void*)(sstrides+%d)' % i for i in reordered_axes)
        code = '''
{
    const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
    const size_t *dims = PyGpuArray_DIMS(%(x)s);
    size_t odims[%(ndim)d];
    for (int i=0; i<%(ndim)d; i++)
        odims[i] = dims[i];
    odims[%(axis)d] = k_>=0 ? k_ : -k_;

    if (0 == odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth must not be zero");
        %(fail)s;
    } else if (dims[%(axis)d] < odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth cannot be larger than the size of specified axis %(axis)d");
        %(fail)s;
    }
    %(prep_output)s

    size_t grid_size=1, block_size=1;
    for (int i=0; i<%(ndim)d; ++i) {
        if (i!=%(axis)d)
            grid_size *= dims[i];
        else
            block_size = dims[i];
    }
    // round up to multiples of warp size
    block_size = ((block_size + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;

    if (grid_size > %(MAX_BPG)d) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: too many slices to work with, expected <= %(MAX_BPG)d");
        %(fail)s;
    }

    %(def_dvstrides)s;
    %(def_distrides)s;
    const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);

    void* args[] = {
        %(dims)s
        %(params_dv)s
        %(params_di)s
        (void*)(&k_),
        (void*)((char*)(%(x)s->ga.data) + (%(x)s->ga.offset)),
        %(sstrides)s,
        (void*)(dims+%(axis)d),
    };

    int err;
    if (dims[%(axis)d] > (1u << 31)) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_xlarge%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    } else if (block_size > %(MAX_TPB)d) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_large%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    } else {
        err = GpuKernel_call(
            &k_topk_dense%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    }
    if (err != GA_NO_ERROR) {
        PyErr_SetString(
            PyExc_RuntimeError,
            "topk: gpu kernel failed to execute");
        %(fail)s;
    }
}
'''
        return code % locals()

    def make_node(self, inp, kth):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        kth = as_tensor_variable(kth)
        bcast = inp.type.broadcastable
        outs = []
        if self.return_values:
            outs.append(inp.type())
        if self.return_indices:
            outs.append(GpuArrayType(
                dtype=self.idx_dtype,
                broadcastable=bcast,
                context_name=ctx_name)())
        return Apply(self, [inp, kth], outs)

    def get_params(self, node):
        # kernel launch needs the gpu context of the input
        return node.inputs[0].type.context
@register_opt('fast_compile')
@op_lifter([TopKOp], cuda_only=True)
@register_opt2([TopKOp], 'fast_compile')
def local_gpua_topkop(op, ctx_name, inputs, outputs):
    """Lift a host-side TopKOp to a GpuTopKOp with the same attributes."""
    x_host, k = inputs
    x_gpu = as_gpuarray_variable(x_host, ctx_name)
    gpu_op = GpuTopKOp(
        axis=op.axis,
        sorted=op.sorted,
        idx_dtype=op.idx_dtype,
        return_values=op.return_values,
        return_indices=op.return_indices)
    return gpu_op(x_gpu, k)
......@@ -40,7 +40,7 @@ from theano.tensor import nnet # used for softmax, sigmoid, etc.
from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \
jacobian, hessian, consider_constant
from theano.tensor.sort import sort, argsort
from theano.tensor.sort import sort, argsort, topk, argtopk, topk_and_argtopk
from theano.tensor.extra_ops import (DiffOp, bincount, squeeze,
repeat, bartlett, fill_diagonal, fill_diagonal_offset,
cumsum, cumprod, unravel_index, ravel_multi_index)
......
......@@ -35,6 +35,7 @@ from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice,
advanced_subtensor,
advanced_subtensor1,
advanced_inc_subtensor1)
from theano.tensor.sort import TopKOp
from theano import scalar
from theano.scalar import basic
from theano.tensor import basic as T
......@@ -7548,3 +7549,35 @@ def local_merge_alloc(node):
dim_outer, T.eq(dim_outer, dim_inner))
i += 1
return [T.alloc(inputs_inner[0], *dims_outer)]
@register_useless('fast_compile')
@gof.local_optimizer([TopKOp])
def local_useless_topk(node):
    """
    TopKOp generates two outputs by default.

    This opt removes the useless one, replacing the node by an equivalent
    TopKOp that only computes the output that actually has clients.
    """
    op = node.op
    if not isinstance(op, TopKOp):
        return
    if not (op.return_values and op.return_indices):
        # already single-output; nothing to strip
        return False
    x, k = node.inputs
    ret_val = bool(node.outputs[0].clients)
    ret_idx = bool(node.outputs[1].clients)
    if not (ret_val ^ ret_idx):
        # both true -> nothing to remove
        # both false -> let pruner handle
        return False

    # exactly one output is used; `ret_idx` doubles as its index (False -> 0)
    old_output = node.outputs[ret_idx]
    # propagate `sorted`: it is part of __props__, and omitting it would
    # silently reset the attribute to its default (True)
    new_output = TopKOp(
        axis=op.axis,
        sorted=op.sorted,
        idx_dtype=op.idx_dtype,
        return_values=ret_val,
        return_indices=ret_idx)(x, k)
    return {old_output: new_output}
......@@ -2,6 +2,22 @@ from __future__ import absolute_import, print_function, division
import numpy as np
import theano
from theano.tensor.basic import mul, arange
from theano.gradient import grad_undefined
from theano.tensor.subtensor import set_subtensor
def _variable_is_none(var):
    """Tell whether `var` is a theano Constant wrapping the value None."""
    if not isinstance(var, theano.Constant):
        return False
    return var.data is None
def _check_tensor_is_scalar(var):
'''
Checks if a tensor variable is scalar, raise ValueError otherwise
'''
msg = '%(var)s is expected to be 0d tensor, got %(ndim)d'
if var.ndim != 0:
raise ValueError(
msg % (var, var.ndim))
class SortOp(theano.Op):
......@@ -33,8 +49,7 @@ class SortOp(theano.Op):
z[0] = np.sort(a, axis, self.kind, self.order)
def infer_shape(self, node, inputs_shapes):
if (isinstance(node.inputs[1], theano.Constant) and
node.inputs[1].data is None):
if _variable_is_none(node.inputs[1]):
# That means axis = None,
# So the array is flattened before being sorted
return [(mul(*inputs_shapes[0]),)]
......@@ -49,7 +64,7 @@ class SortOp(theano.Op):
a, axis = inputs
indices = self.__get_argsort_indices(a, axis)
inp_grad = output_grads[0][tuple(indices)]
axis_grad = theano.gradient.grad_undefined(
axis_grad = grad_undefined(
self, 1, axis,
"The gradient of sort is not defined "
"with respect to the integer axes itself")
......@@ -162,8 +177,7 @@ class ArgSortOp(theano.Op):
dtype=node.outputs[0].dtype)
def infer_shape(self, node, inputs_shapes):
if (isinstance(node.inputs[1], theano.Constant) and
node.inputs[1].data is None):
if _variable_is_none(node.inputs[1]):
return [(mul(*inputs_shapes[0]),)]
# axis should not be None, so there should be the same number of
# dimensions in the input and output
......@@ -175,7 +189,7 @@ class ArgSortOp(theano.Op):
# No grad defined for intergers.
inp, axis = inputs
inp_grad = inp.zeros_like()
axis_grad = theano.gradient.grad_undefined(
axis_grad = grad_undefined(
self, 1, axis,
"argsort is not defined for non-integer axes so"
" argsort(x, axis+eps) is undefined")
......@@ -206,3 +220,339 @@ def argsort(a, axis=-1, kind='quicksort', order=None):
a = a.flatten()
axis = 0
return ArgSortOp(kind, order)(a, axis)
def _topk_py_impl(op, x, k, axis, idx_dtype):
ndim = x.ndim
assert -ndim <= axis < ndim
axis %= ndim
if k == 0:
raise ValueError('topk: kth cannot be zero')
elif k > x.shape[axis]:
raise ValueError(
'topk: kth cannot be larger than the size of specified axis %d' % axis)
if abs(k) == 1:
# negative k means min instead of max
fn_max = [None, np.max, np.min][k]
fn_argmax = [None, np.argmax, np.argmin][k]
if not op.return_indices:
return np.expand_dims(fn_max(x, axis=axis), axis)
elif op.return_values:
zi = np.expand_dims(
fn_argmax(x, axis=axis), axis)
idx2 = tuple(
np.arange(s).reshape(
(s,) + (1,) * (ndim - i - 1)
) if i != axis else zi for i, s in enumerate(x.shape))
zv = x[idx2]
return zv, zi.astype(idx_dtype)
else:
zi = np.expand_dims(
fn_argmax(x, axis=axis), axis)
return zi.astype(idx_dtype)
if x.shape[axis] == abs(k):
if not op.return_indices:
return x.copy()
else:
l = axis
r = ndim - l
reps = list(x.shape)
reps[axis] = 1
zi = np.arange(abs(k), dtype=idx_dtype)
zi = zi.reshape((1,) * l + (k,) + (1,) * (r - 1))
zi = np.tile(zi, reps)
if op.return_values:
return x.copy(), zi
else:
return zi
idx = [slice(None)] * ndim
idx[axis] = (slice(-k, None) if k > 0 else slice(-k))
if not op.return_indices:
zv = np.partition(x, -k, axis=axis)[idx]
return zv
elif op.return_values:
zi = np.argpartition(x, -k, axis=axis)[idx]
idx2 = tuple(
np.arange(s).reshape(
(s,) + (1,) * (ndim - i - 1)
) if i != axis else zi for i, s in enumerate(x.shape))
zv = x[idx2]
return zv, zi.astype(idx_dtype)
else:
zi = np.argpartition(x, -k, axis=axis)[idx]
return zi.astype(idx_dtype)
class TopKOp(theano.Op):
    """
    Operations related to finding k-largest elements.

    Parameters
    ----------
    axis: integer
        Defaults to ``-1``.
        The axis to perform the operation. Must be in range ``[-ndim, ndim)``, where
        ``ndim`` is the dimensionality of input tensor.

    idx_dtype: string
        Specify output dtype for indices, defaults to ``int64``, must be integer type.

    sorted: bool
        NOTE: NOT IMPLEMENTED YET
        Defaults to ``True``
        If True, the result array would be sorted in descending order.

    Notes
    -----
    - By default, this Op give two outputs: values and indices. However optimizer may
      remove a certain output if not needed.

    - Computing gradient is only possible when both values and indices are computed in
      forward pass.

    - If the top-k-th value is not unique, we cannot guarantee the output indices being
      deterministically chosen.

    See Also
    --------
    topk
    argtopk
    argtopk_and_topk

    """

    # TODO more params
    '''
    only_top_kth: bool
        Defaults to ``False``
        If ``True``, will only find one exact top k-th element on given axis.
    '''

    # TODO c_code
    # TODO add opt, if k==1, use max/min reduce
    #      also if k is axis size, just copy input tensor
    # TODO add opt, to merge argtopk / topk
    __props__ = ('axis', 'sorted', 'return_values', 'return_indices', 'idx_dtype')

    def __init__(
        self,
        axis=-1,
        sorted=True,
        idx_dtype='int64',
        return_values=True,
        return_indices=True
    ):
        """Validate parameters and store them as op properties."""
        # numpy always uses int64 as output dtype for arg*() routines
        # however, we add "idx_dtype" param as memory is more precious on gpu
        if not isinstance(axis, int):
            raise TypeError(
                '"axis" parameter must be integer, got "%s"' % type(axis))
        if idx_dtype not in theano.tensor.integer_dtypes:
            raise TypeError(
                '"idx_dtype" parameter must be an integer dtype, got "%s"' % idx_dtype)
        if not (return_indices or return_values):
            raise ValueError(
                "Neither return_values nor return_indices is True, this isn't allowed")
        self.axis = axis
        self.sorted = sorted
        self.return_values = return_values
        self.return_indices = return_indices
        self.idx_dtype = idx_dtype

    def __str__(self):
        # e.g. "TopKOp{axis=-1, sorted=False}"; uses the runtime class name
        # so subclasses (e.g. a GPU version) print correctly.
        return '%(op)s{axis=%(axis)d, sorted=%(sorted)s}' % dict(
            op=self.__class__.__name__,
            axis=self.axis,
            sorted=self.sorted)

    def make_node(self, inp, kth):
        """Validate inputs and build the Apply node.

        ``kth`` must be a 0-d tensor; the output list depends on
        ``return_values`` / ``return_indices``.
        """
        inp = theano.tensor.as_tensor_variable(inp)
        ndim = inp.ndim
        if ndim == 0:
            raise ValueError('Cannot take scalar as input')
        if not -ndim <= self.axis < ndim:
            raise IndexError(
                '"axis" parameter out of range,'
                ' expected integer within [%d, %d]' % (-ndim, ndim - 1))

        kth = theano.tensor.as_tensor_variable(kth)
        _check_tensor_is_scalar(kth)
        bcast = inp.type.broadcastable
        outs = []
        if self.return_values:
            outs.append(inp.type())
        if self.return_indices:
            outs.append(theano.tensor.TensorType(
                dtype=self.idx_dtype, broadcastable=bcast)())
        return theano.Apply(self, [inp, kth], outs)

    def perform(self, node, inputs, output_storage):
        # Delegates to the numpy reference implementation; which storage
        # cells exist follows return_values/return_indices.
        x, k = inputs
        axis = self.axis
        if not self.return_indices:
            pzv = output_storage[0]
            pzv[0] = _topk_py_impl(self, x, k, axis, None)
        elif self.return_values:
            pzv = output_storage[0]
            pzi = output_storage[1]
            pzv[0], pzi[0] = _topk_py_impl(
                self, x, k, axis, node.outputs[1].dtype)
        else:
            pzi = output_storage[0]
            pzi[0] = _topk_py_impl(self, x, k, axis, node.outputs[0].dtype)

    def infer_shape(self, node, inp_shapes):
        # Output shape equals the input shape except along self.axis,
        # where it becomes |k|. np.abs on the symbolic k dispatches to the
        # variable's __abs__, so the result stays symbolic.
        shp = list(inp_shapes[0])
        shp[self.axis] = np.abs(node.inputs[1])
        shp = tuple(shp)
        return [shp for i in [self.return_values, self.return_indices] if i]

    def L_op(self, inputs, outputs, out_grads):
        """Gradient of the values output: scatter ``out_grads[0]`` back to
        the selected positions (uses the indices output, ``outputs[-1]``).

        The gradient w.r.t. ``k`` is undefined.
        """
        x, k = inputs
        k_grad = grad_undefined(self, 1, k, 'topk: k is not differentiable')

        # NOTE(review): this guard uses `or`, but __init__ already forbids
        # both flags being False, so the branch looks unreachable; the
        # message suggests `and` was intended — confirm.
        if not (self.return_indices or self.return_values):
            x_grad = grad_undefined(
                self, 0, x, 'topk: cannot get gradient'
                ' without both indices and values')
        else:
            x_shp = theano.tensor.shape(x)
            z_grad = out_grads[0]
            ndim = x.ndim
            axis = self.axis % ndim
            # aranges on all axes except `axis`, where the saved indices
            # route each gradient entry to its source position
            grad_indices = [
                arange(x_shp[i]).dimshuffle([0] + ['x'] * (ndim - i - 1))
                if i != axis else outputs[-1] for i in range(ndim)]
            x_grad = x.zeros_like(dtype=z_grad.dtype)
            x_grad = set_subtensor(x_grad[tuple(grad_indices)], z_grad)

        return [x_grad, k_grad]
def topk(x, kth, axis=-1, sorted=True, idx_dtype='int64'):
    """
    Return the k-largest elements along an axis.

    Parameters
    ----------
    x: tensor instance
    kth: integer constant/variable
        Must be nonzero; a negative value selects the k-smallest
        elements instead.
    axis: integer or ``None``
        Axis along which to operate. ``None`` flattens the input first.
    sorted: bool
        NOTE: NOT IMPLEMENTED YET, USE ``False`` FOR NOW.
        Defaults to ``True``. When implemented, ``True`` will return the
        result sorted in descending order.
    idx_dtype: string
        Integer dtype used for the internally-computed indices (they are
        needed for the gradient).

    Returns
    -------
    Tensor variable with same dtype as `x`.

    Notes
    -----
    - ``sorted=True`` is not supported yet.

    """
    if sorted:
        raise NotImplementedError(
            "We are still working on sorted topk. Use sorted=False for now.")
    if axis is None:
        x = theano.tensor.flatten(x)
        axis = 0
    op = TopKOp(axis=axis, sorted=sorted, idx_dtype=idx_dtype)
    return op(x, kth)[0]
def argtopk(x, kth, axis=-1, sorted=True, idx_dtype='int64'):
    """
    Return the indices of the k-largest elements along an axis.

    Parameters
    ----------
    x: tensor instance
    kth: integer constant/variable
        Must be nonzero; a negative value selects the k-smallest
        elements instead.
    sorted: bool
        NOTE: NOT IMPLEMENTED YET, USE ``False`` FOR NOW.
        Defaults to ``True``. When implemented, ``True`` will return the
        indices sorted by descending value.
    axis: integer, tuple/list of integers, or ``None``
        Axis along which to operate. ``None`` flattens the input first.
    idx_dtype: string
        Integer dtype of the returned indices.

    Returns
    -------
    Tensor variable with dtype specified in `idx_dtype`.

    Notes
    -----
    - ``sorted=True`` is not supported yet.

    - If the top-k-th value is not unique, we cannot guarantee the output
      indices are deterministically chosen.

    """
    if sorted:
        raise NotImplementedError(
            "We are still working on sorted topk. Use sorted=False for now.")
    if axis is None:
        x = theano.tensor.flatten(x)
        axis = 0
    op = TopKOp(axis=axis, sorted=sorted, idx_dtype=idx_dtype)
    return op(x, kth)[1]
def topk_and_argtopk(x, kth, axis=-1, sorted=True, idx_dtype='int64'):
    """
    Compute both topk() and argtopk() with a single Op.

    See the respective documentation for parameter details.

    Returns
    -------
    tuple: (values, indices)

    """
    if sorted:
        raise NotImplementedError(
            "We are still working on sorted topk. Use sorted=False for now.")
    if axis is None:
        x = theano.tensor.flatten(x)
        axis = 0
    op = TopKOp(axis=axis, sorted=sorted, idx_dtype=idx_dtype)
    return op(x, kth)
from __future__ import absolute_import, print_function, division
from itertools import product, chain
from functools import reduce
import unittest
from theano.tests import unittest_tools as utt
......@@ -9,9 +11,18 @@ from theano import tensor
from theano.tensor.sort import sort, SortOp
from theano.tensor.sort import argsort, ArgSortOp
from theano.tensor.sort import topk, argtopk, topk_and_argtopk, TopKOp
_all_dtypes = tensor.integer_dtypes + tensor.float_dtypes
class test_sort(unittest.TestCase):
def gen_unique_vector(size, dtype):
    """Return a shuffled 1-D array of `size` distinct values cast to `dtype`.

    Values are spaced 3 apart (plus one shared random offset), so they stay
    distinct even after casting to low-precision dtypes.
    """
    base = np.arange(size) * 3. + np.random.uniform(-1., 1.)
    shuffled = base[np.random.permutation(size)]
    return (shuffled - size * 1.5).astype(dtype)
class Test_sort(unittest.TestCase):
def setUp(self):
self.rng = np.random.RandomState(seed=utt.fetch_seed())
......@@ -22,7 +33,7 @@ class test_sort(unittest.TestCase):
a = tensor.dmatrix()
w = sort(a)
f = theano.function([a], w)
assert np.allclose(f(self.m_val), np.sort(self.m_val))
utt.assert_allclose(f(self.m_val), np.sort(self.m_val))
def test2(self):
a = tensor.dmatrix()
......@@ -32,7 +43,7 @@ class test_sort(unittest.TestCase):
for axis_val in 0, 1:
gv = f(self.m_val, axis_val)
gt = np.sort(self.m_val, axis_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
def test3(self):
a = tensor.dvector()
......@@ -40,7 +51,7 @@ class test_sort(unittest.TestCase):
f = theano.function([a], w2)
gv = f(self.v_val)
gt = np.sort(self.v_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
def test4(self):
a = tensor.dmatrix()
......@@ -50,7 +61,7 @@ class test_sort(unittest.TestCase):
for axis_val in 0, 1:
gv = f(self.m_val, axis_val)
gt = np.sort(self.m_val, axis_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
def test5(self):
a1 = SortOp("mergesort", [])
......@@ -67,10 +78,9 @@ class test_sort(unittest.TestCase):
f = theano.function([a], l)
gv = f(self.m_val)
gt = np.sort(self.m_val, None)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
def test_grad_vector(self):
    # verify_grad builds the symbolic graph itself, so no symbolic
    # variable is needed here (removed the unused `a = tensor.vector()`).
    data = np.random.rand(10).astype(theano.config.floatX)
    utt.verify_grad(sort, [data])
......@@ -133,19 +143,19 @@ class test_sort(unittest.TestCase):
utt.verify_grad(lambda x: sort(x, 3), [data])
class TensorInferShapeTester(utt.InferShapeTester):
class SortInferShapeTester(utt.InferShapeTester):
def test_sort(self):
x = tensor.matrix()
self._compile_and_check(
[x],
[sort(x)],
[np.random.randn(10, 40).astype(theano.config.floatX)],
SortOp)
[x],
[sort(x)],
[np.random.randn(10, 40).astype(theano.config.floatX)],
SortOp)
self._compile_and_check(
[x],
[sort(x, axis=None)],
[np.random.randn(10, 40).astype(theano.config.floatX)],
SortOp)
[x],
[sort(x, axis=None)],
[np.random.randn(10, 40).astype(theano.config.floatX)],
SortOp)
def test_argsort():
......@@ -160,7 +170,7 @@ def test_argsort():
f = theano.function([a], w)
gv = f(m_val)
gt = np.argsort(m_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
# Example 2
a = tensor.dmatrix()
......@@ -170,7 +180,7 @@ def test_argsort():
for axis_val in 0, 1:
gv = f(m_val, axis_val)
gt = np.argsort(m_val, axis_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
# Example 3
a = tensor.dvector()
......@@ -178,7 +188,7 @@ def test_argsort():
f = theano.function([a], w2)
gv = f(v_val)
gt = np.argsort(v_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
# Example 4
a = tensor.dmatrix()
......@@ -188,7 +198,7 @@ def test_argsort():
for axis_val in 0, 1:
gv = f(m_val, axis_val)
gt = np.argsort(m_val, axis_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
# Example 5
a = tensor.dmatrix()
......@@ -206,7 +216,7 @@ def test_argsort():
f = theano.function([a], w2)
gv = f(m_val)
gt = np.argsort(m_val, None)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
def test_argsort_grad():
......@@ -219,3 +229,222 @@ def test_argsort_grad():
data = np.random.rand(2, 3, 3).astype(theano.config.floatX)
utt.verify_grad(lambda x: argsort(x, axis=2), [data])
class Test_TopK(unittest.TestCase):
    """Tests for topk / argtopk / topk_and_argtopk (sorted=False only).

    Most cases use gen_unique_vector() so input values are all distinct,
    making the expected indices unambiguous; the outputs along the worked
    axis are unsorted, so comparisons sort both sides first.
    """

    def setUp(self):
        # no shared fixtures; every parameterized case builds its own data
        pass

    @utt.parameterized.expand(product(
        _all_dtypes, tensor.integer_dtypes, [-1, 0, None], [False]))
    def test_argtopk_sanity(self, dtype, idx_dtype, axis, sorted):
        # one-element vector: the only valid index is 0
        x = tensor.vector(name='x', dtype=dtype)
        fn = theano.function([x], argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype))
        xval = np.asarray([1]).astype(dtype)
        yval = fn(xval)
        assert yval == np.asarray([0], dtype=idx_dtype)
        assert yval.dtype == np.dtype(idx_dtype)

    @utt.parameterized.expand(product(
        _all_dtypes, [-1, 0, None], [False]))
    def test_topk_sanity(self, dtype, axis, sorted):
        # one-element vector: output equals input and preserves dtype
        x = tensor.vector(name='x', dtype=dtype)
        fn = theano.function([x], topk(x, 1, axis=axis, sorted=sorted))
        xval = np.asarray([1]).astype(dtype)
        yval = fn(xval)
        assert yval == xval
        assert yval.dtype == xval.dtype

    @utt.parameterized.expand(product(
        _all_dtypes, tensor.integer_dtypes, [-1, 0, None], [False]))
    def test_combined_sanity(self, dtype, idx_dtype, axis, sorted):
        # both outputs at once on a one-element vector
        x = tensor.vector(name='x', dtype=dtype)
        yv, yi = topk_and_argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype)
        fn = theano.function([x], [yv, yi])
        xval = np.asarray([1]).astype(dtype)
        yvval, yival = fn(xval)
        assert yival == np.asarray([0], dtype=idx_dtype)
        utt.assert_allclose(xval, yvval)
        assert yvval.dtype == xval.dtype
        assert yival.dtype == np.dtype(idx_dtype)

    @utt.parameterized.expand(chain(
        product(
            (16, 61, 257),
            (1, -1, -10, 'n//2', 'n-1', '-n', '1-n'),
            ('float64', 'float16', 'int16', 'int8'),
            (False,)),
        ((2049, 1337, 'float64', False),)))
    def test_topk_1d(self, size, k, dtype, sorted):
        # string k specs like 'n//2' are evaluated against the vector size
        if isinstance(k, str):
            k = eval(k.replace('n', str(size)))

        x = theano.tensor.vector(name='x', dtype=dtype)
        y = topk(x, k, sorted=sorted)
        fn = theano.function([x], y)

        # assert local_useless_topk opt is done properly
        assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs)

        # generate a all-unique array
        xval = gen_unique_vector(size, dtype)
        yval = fn(xval)
        # same slice the Op uses: last k for k>0, first |k| for k<0
        idx = (slice(-k, None) if k > 0 else slice(-k))
        goal = np.sort(xval)[idx]

        assert yval.dtype == goal.dtype
        utt.assert_allclose(np.sort(yval), goal)

    @utt.parameterized.expand(chain(
        product(
            (16, 61, 257),
            (1, -1, -10, 'n//2', 'n-1', '-n'),
            ('float32', 'int32'),
            (False,),
            ('int32', 'int64')),
        ((2049, 1337, 'float32', False, 'int32'),)))
    def test_argtopk_1d(self, size, k, dtype, sorted, idx_dtype):
        if isinstance(k, str):
            k = eval(k.replace('n', str(size)))

        x = theano.tensor.vector(name='x', dtype=dtype)
        y = argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype)
        fn = theano.function([x], y)

        # assert local_useless_topk opt is done properly
        assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs)

        # generate a all-unique array
        xval = gen_unique_vector(size, dtype)
        yval = fn(xval)
        idx = (slice(-k, None) if k > 0 else slice(-k))
        goal = np.argsort(xval)[idx].astype(idx_dtype)

        # due to uniqueness, we expect indices same
        assert np.all(xval[np.sort(yval)] == xval[np.sort(goal)])

    @utt.parameterized.expand(chain(
        product(
            (16, 61, 257),
            (1, -1, 10, 'n//2', 'n-1', '1-n'),
            ('float32', 'int32'),
            (False,),
            ('int32', 'int64')),
        ((2049, 1337, 'float32', False, 'int32'),)))
    def test_combined_1d(self, size, k, dtype, sorted, idx_dtype):
        if isinstance(k, str):
            k = eval(k.replace('n', str(size)))

        x = theano.tensor.vector(name='x', dtype=dtype)
        yv, yi = topk_and_argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype)
        fn = theano.function([x], [yv, yi])

        # generate a all-unique array
        xval = gen_unique_vector(size, dtype)
        yvval, yival = fn(xval)
        idx = (slice(-k, None) if k > 0 else slice(-k))
        goali = np.argsort(xval)[idx].astype(idx_dtype)
        goalv = xval[goali]

        # due to uniqueness, we expect indices same
        assert np.all(xval[np.sort(yival)] == xval[np.sort(goali)])
        utt.assert_allclose(np.sort(yvval), goalv)

    @utt.parameterized.expand(chain(
        product(
            (18, 62, 258),
            (1, -1, 'n//2'),
            ('int32', 'float32'),
            (False,)),
        ((2048, 1337, 'float32', False),)))
    def test_argtopk_1d_collision(self, size, k, dtype, sorted):
        # with non-unique kth max value
        if isinstance(k, str):
            k = eval(k.replace('n', str(size)))

        x = theano.tensor.vector(name='x', dtype=dtype)
        y = argtopk(x, k, sorted=sorted, idx_dtype='int32')
        fn = theano.function([x], y)
        # every value appears exactly twice, so only the selected *values*
        # (not the indices) can be compared
        xval = np.repeat(np.random.uniform(-100., 100., size=size // 2).astype(dtype), 2)
        xval = xval[np.random.permutation(size)]
        yval = fn(xval)
        idx = (slice(-k, None) if k > 0 else slice(-k))
        goal = np.argsort(xval)[idx].astype('int32')
        utt.assert_allclose(np.sort(xval[yval]), np.sort(xval[goal]))

    @utt.parameterized.expand(product(
        ((17, 15), (2, 3, 5, 7, 11), (2017, 5, 3)),
        (-1, '(1+n)//2', '-n', '1-n'),
        ('float32', 'int32'),
        (False,),
        ('int32', 'int64')))
    def test_argtopk_nd(self, shp, k_, dtype, sorted, idx_dtype):
        # exercise every axis (negative and positive) of each shape
        ndim = len(shp)
        for axis in range(-ndim, ndim):
            if isinstance(k_, str):
                k = eval(k_.replace('n', str(shp[axis])))
            else:
                k = k_
            if k == 0:
                continue

            x = theano.tensor.tensor(
                name='x', broadcastable=(False,) * len(shp), dtype=dtype)
            y = argtopk(x, k, axis=axis, sorted=sorted, idx_dtype=idx_dtype)
            fn = theano.function([x], y)
            size = reduce(int.__mul__, shp)
            xval = gen_unique_vector(size, dtype).reshape(shp)
            yval = fn(xval)
            idx = slice(-k, None) if k > 0 else slice(-k)
            l = axis % ndim
            r = ndim - l
            idx = (slice(None),) * l + (idx,) + (slice(None),) * (r - 1)
            goal = np.argsort(xval, axis=axis)[idx].astype(idx_dtype)
            assert np.all(np.sort(yval, axis=axis) == np.sort(goal, axis=axis))

    @utt.parameterized.expand(product(
        ((257,), (17, 15), (5, 3, 5, 3), (2, 3, 5, 7, 11)),
        (1, -1, '(1+n)//2', 'n-1', '-n', '1-n'), (False,)))
    def test_grad(self, shp, k_, sorted):
        ndim = len(shp)
        for axis in range(-ndim, ndim):
            if isinstance(k_, str):
                k = eval(k_.replace('n', str(shp[axis])))
            else:
                k = k_
            if k == 0:
                continue

            # make input away from undefined gradient (where some inputs are equal)
            xval = gen_unique_vector(
                reduce(int.__mul__, shp),
                dtype=theano.config.floatX
            ).reshape(shp)
            utt.verify_grad(lambda x: topk(x, k, axis=axis, sorted=sorted), [xval], eps=1e-2)
class TopKInferShapeTester(utt.InferShapeTester):
    """Checks TopKOp.infer_shape() against the actual output shapes."""

    @utt.parameterized.expand(product(
        ((2, 3), (15, 17), (11, 7, 5), (2, 3, 5, 7, 11), (2, 4, 3, 1)),
        (1, '(1+n)//2', 'n-1', 'n')))
    def test_combined_infer_shape(self, shp, k_):
        # exercise every axis (negative and positive) of each shape;
        # string k specs are evaluated against the size of that axis
        ndim = len(shp)
        for axis in range(-ndim, ndim):
            if isinstance(k_, str):
                k = eval(k_.replace('n', str(shp[axis])))
            else:
                k = k_
            if k == 0:
                continue

            x = theano.tensor.tensor(
                name='x', broadcastable=(False,) * len(shp),
                dtype=theano.config.floatX)
            yv, yi = topk_and_argtopk(x, k, axis=axis, sorted=False, idx_dtype='int32')
            size = reduce(int.__mul__, shp)
            xval = gen_unique_vector(size, theano.config.floatX).reshape(shp)
            self._compile_and_check(
                [x], [yv, yi], [xval], TopKOp)
......@@ -83,7 +83,7 @@ def seed_rng(pseed=None):
def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs):
"""
Wrapper for tensor/basic.py:verify_grad
Wrapper for gradient.py:verify_grad
Takes care of seeding the random number generator if None is given
"""
if rng is None:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论