提交 95f6eda6 authored 作者: Adam Becker's avatar Adam Becker

mixed changes

- add multidim support for top_k - use unified TopKOp, can implement topk, argtopk, and both
上级 ece8b25b
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import os import os
from string import Template
import theano import theano
from theano import Apply from theano import Apply
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable
from theano.tensor.sort import ArgTopKOp from theano.tensor.sort import TopKOp
from .basic_ops import (GpuKernelBase, Kernel, infer_context_name, from .basic_ops import (GpuKernelBase, Kernel, infer_context_name,
as_gpuarray_variable, gpu_contiguous) as_gpuarray_variable, gpu_contiguous)
from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType from .type import GpuArrayType
try: try:
import pygpu import pygpu
import pygpu.gpuarray as ga
except ImportError as e: except ImportError as e:
# To make sure theano is importable # To make sure theano is importable
pass pass
class GpuSortOp(object): # TODO add support is slice size is larger than max allowed block size (1024)
# TODO # TODO add runtime opt, if k==1, use max/min reduce
pass # TODO sort / argsort
class GpuArgSortOp(object):
# TODO
pass
class GpuArgTopKOp(ArgTopKOp, GpuKernelBase): class GpuTopKOp(GpuKernelBase, TopKOp):
''' '''
implement argtopk() on gpu Implements TopKOp() on gpu
''' '''
__props__ = ArgTopKOp.__props__ __props__ = TopKOp.__props__
def __init__(self, axis=-1): def __init__(self, axis=-1, return_indices=False, return_values=True):
ArgTopKOp.__init__(self, axis=axis) GpuKernelBase.__init__(self)
TopKOp.__init__(
self, axis=axis,
return_values=return_values,
return_indices=return_indices)
def c_headers(self): def c_headers(self):
return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h'] return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']
...@@ -40,67 +43,180 @@ class GpuArgTopKOp(ArgTopKOp, GpuKernelBase): ...@@ -40,67 +43,180 @@ class GpuArgTopKOp(ArgTopKOp, GpuKernelBase):
def c_header_dirs(self): def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include()] return [os.path.dirname(__file__), pygpu.get_include()]
'''
def c_code_cache_version(self):
return (1,)
'''
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
device_type = str(node.inputs[0].type.context.kind) # load kernel source
kernel_ext = dict(cuda='.cu', opencl='.cl')[device_type] device_type = node.inputs[0].type.context.kind
flags = Kernel.get_flags(node.inputs[0].dtype) kernel_ext = {b'cuda':'.cu', b'opencl':'.cl'}[device_type]
try: try:
kernel_filename = 'topk_kernel%s' % kernel_ext kernel_filename = 'topk_kernel%s' % kernel_ext
with open(os.path.join( with open(os.path.join(
os.path.dirname(__file__), kernel_filename os.path.dirname(__file__), kernel_filename
)) as f: ), 'r') as f:
kernel_src = f.read() kernel_src = f.read()
except FileNotFoundError: except FileNotFoundError:
raise RuntimeError( raise RuntimeError(
'Cannot find GPU kernel ' 'Cannot find GPU kernel '
'implementation for device "%s"' % device_type) 'implementation for device "%s"' % device_type)
return [Kernel(
kernel_src, # prepare "$" macros
params='TODO_params', ndim = node.inputs[0].ndim
name='topk_kernel', dstv_strides_code = ''.join('ga_ssize dstv_strides_%d, ' % i for i in range(ndim))
flags=flags, dsti_strides_code = ''.join('ga_ssize dsti_strides_%d, ' % i for i in range(ndim))
)] src_strides_code = ''.join('ga_ssize src_strides_%d, ' % i for i in range(ndim))
set_slice_code = '''
gidx = gid %% dims_%(i)d;
gid /= dims_%(i)d;
{dstv};
{dsti};
src = ptr_add(src, gidx*src_strides_%(i)d);\n'''.format(
dstv='dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)' if self.return_values else '',
dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '')
set_slice_code = ''.join(
set_slice_code % dict(i=j) for j in range(1, ndim))
flags = Kernel.get_flags(node.inputs[0].dtype)
dst = ''
if self.return_values:
dst += 'INPUT_TYPE *dstv, '
if self.return_values:
dst += 'INDEX_TYPE *dsti, '
write_value = 'ptr_at(dstv, out_idx * dstv_strides_0) = xval' if self.return_values else ''
write_index = 'ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx' if self.return_indices else ''
subs = dict(
inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
out_t=ga.dtype_to_ctype(node.outputs[0].dtype),
dims=''.join('ga_size dims_%d, ' % i for i in range(1, ndim)),
dstv='INPUT_TYPE *dstv,' if self.return_values else '',
dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
dstv_strides=dstv_strides_code,
dsti_strides=dsti_strides_code,
src_strides=src_strides_code,
set_slice=set_slice_code,
write_value=write_value,
write_index=write_index,
ndim=str(ndim))
# substitute "$" macros in kernel code
kernel_src = Template(kernel_src).substitute(**subs)
# compile kernel
param_types = [ga.SIZE] * (ndim - 1) # dims
for _ in range(int(self.return_values) + int(self.return_indices)):
param_types.append(ga.GpuArray) # dst*
param_types.extend([ga.SSIZE] * ndim) # dst*_strides
param_types.append(ga.SIZE) # k
param_types.append(ga.GpuArray) # src
param_types.extend([ga.SSIZE] * ndim) # src_strides
param_types.append(ga.SIZE) # size
return [Kernel(
code=kernel_src,
name='k_topk_dense',
params=param_types,
flags=flags,
objvar='k_topk_dense_' + nodename
)]
def c_code(self, node, nodename, inps, outs, sub): def c_code(self, node, nodename, inps, outs, sub):
if node.inputs[0].type.context.kind != b'cuda': if node.inputs[0].type.context.kind != b'cuda':
raise NotImplementedError('We only have CUDA implementation so far.') raise NotImplementedError('We only have CUDA implementation so far.')
x, k = inps x, k = inps
y, = outs inp_dtc = pygpu.dtypes.dtype_to_ctype(node.inputs[0].dtype).upper()
if not self.return_indices:
yv, = outs
out_dtype_s = ''
out_dtc = ''
else:
if self.return_values:
yv, yi = outs
else:
yi, = outs
out_dtype_s = node.outputs[0].dtype
out_dtc = pygpu.dtypes.dtype_to_ctype(out_dtype_s).upper()
fail = sub['fail'] fail = sub['fail']
ctx = sub['params'] ctx = sub['params']
out_dtype = pygpu.dtypes.dtype_to_ctype(self.out_dtype).upper() k_dtype = node.inputs[1].type.dtype_specs()[1]
MAX_TPB = 1024 # max thread per block MAX_TPB = 1024 # max thread per block
WARP_SIZE = 32 WARP_SIZE = 32
ndim = node.inputs[0].ndim
reordered_axes = list(range(ndim))
axis = self.axis % ndim
del(reordered_axes[axis])
reordered_axes = [axis] + reordered_axes
dims = ', '.join('(void*)(dims+%d)' % i for i in reordered_axes[1:])
prep_output = ''
if self.return_values:
def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv
params_dv = '(void*)(%s->ga.data),\n' % yv
params_dv += ''.join('(void*)(dvstrides+%d), ' % i for i in reordered_axes)
prep_output += '''
if (0 != theano_prep_output(
&%(yv)s, %(ndim)d, odims,
%(inp_dtc)s, GA_C_ORDER, %(ctx)s)) {
%(fail)s;
}\n''' % locals()
else:
def_dvstrides = params_dv = ''
if self.return_indices:
def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi
params_di = '(void*)(%s->ga.data),\n' % yi
params_di += ''.join('(void*)(distrides+%d), ' % i for i in reordered_axes)
prep_output += '''
if (0 != theano_prep_output(
&%(yi)s, %(ndim)d, odims,
%(out_dtc)s, GA_C_ORDER, %(ctx)s)) {
%(fail)s;
}\n''' % locals()
else:
def_distrides = params_di = ''
sstrides = ', '.join('(void*)(sstrides+%d)' % i for i in reordered_axes)
code = ''' code = '''
{ {
// prepare output
const size_t *dims = PyGpuArray_DIMS(%(x)s); const size_t *dims = PyGpuArray_DIMS(%(x)s);
const size_t *odims[1] = {*((%(out_dtype)s)PyArray_DATA(%(k)s))}; size_t odims[%(ndim)d];
for (int i=0; i<%(ndim)d; i++) {
odims[i] = dims[i];
}
odims[%(axis)d] = *((%(k_dtype)s*)(PyArray_DATA(%(k)s)));
if (odims[0] > %(MAX_TPB)d) { if (odims[0] > %(MAX_TPB)d) {
PyErr_SetString( PyErr_SetString(
PyExc_ValueError, PyExc_ValueError,
"topk: slice size larger than %(MAX_TPB)d is not supported"); "topk: slice size larger than %(MAX_TPB)d is not supported");
%(fail)s; } %(fail)s; }
if (0 != theano_prep_output( %(prep_output)s
&%(y)s, 1, odims,
%(out_dtype)s, GA_C_ORDER, %(ctx)s)) { // TODO better scheduling?
%(fail)s; size_t blk[6];
} size_t *grd = blk+3;
size_t blk[6] = ;
size_t grd = blk+3;
blk[1] = blk[2] = 1; blk[1] = blk[2] = 1;
grd[0] = grd[1] = grd[2] = 1; grd[0] = grd[1] = grd[2] = 1;
// round up to multiples of warp size // round up to multiples of warp size
blk[0] = (dims[0] + (%(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d; blk[0] = ((dims[0] + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;
for(int i=0; i<%(ndim)d; ++i) {
if (i!=%(axis)d)
grd[0] *= dims[i];
}
%(def_dvstrides)s;
%(def_distrides)s;
const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);
void* args[] = { void* args[] = {
((void*)(%(y)s->ga.data)), %(dims)s
((void*)(%(x)s->ga.data)), %(params_dv)s
(void*)dims, (void*)odims %(params_di)s
(void*)(odims+%(axis)d),
(void*)(%(x)s->ga.data),
%(sstrides)s,
(void*)(dims+%(axis)d)
}; };
int err = GpuKernel_call( int err = GpuKernel_call(
&topk_kernel, 3, &k_topk_dense_%(nodename)s, 3,
grd, blk, grd, blk,
blk[0] * gpuarray_get_elsize(%(x)s->ga.typecode), blk[0] * gpuarray_get_elsize(%(x)s->ga.typecode),
args); args);
...@@ -114,21 +230,37 @@ class GpuArgTopKOp(ArgTopKOp, GpuKernelBase): ...@@ -114,21 +230,37 @@ class GpuArgTopKOp(ArgTopKOp, GpuKernelBase):
''' '''
return code % locals() return code % locals()
def make_node(self, inp, k, out_dtype='int64'): def make_node(self, inp, k, idx_dtype='int64'):
ctx_name = infer_context_name(inp) ctx_name = infer_context_name(inp)
inp = as_gpuarray_variable(inp, ctx_name) inp = as_gpuarray_variable(inp, ctx_name)
k = as_tensor_variable(k) k = as_tensor_variable(k)
bcast = inp.type.broadcastable bcast = inp.type.broadcastable
return Apply( outs = []
self, [inp, k], if self.return_indices:
[GpuArrayType( outs.append(GpuArrayType(
dtype=out_dtype, dtype=idx_dtype,
broadcastable=bcast, broadcastable=bcast,
context_name=ctx_name)()]) context_name=ctx_name)())
if self.return_values:
outs.append(inp.type())
return Apply(self, [inp, k], outs)
def get_params(self, node): def get_params(self, node):
return node.inputs[0].type.context return node.inputs[0].type.context
def get_op_params(self): # def get_op_params(self):
return [('AXIS', self.axis)] # return [('AXIS', self.axis)]
@register_opt('fast_compile')
@op_lifter([TopKOp])
@register_opt2([TopKOp], 'fast_compile')
def local_gpua_topkop(op, ctx_name, inputs, outputs):
axis = op.axis
rv = op.return_values
ri = op.return_indices
x, k = inputs
x = as_gpuarray_variable(x, ctx_name)
y = outputs[-1]
return GpuTopKOp(
axis=axis, return_values=rv, return_indices=ri)(x, k, idx_dtype=y.dtype)
...@@ -113,6 +113,7 @@ struct RadixConfig<double> { ...@@ -113,6 +113,7 @@ struct RadixConfig<double> {
} }
}; };
#ifdef USE_HALF
template <> template <>
struct RadixConfig<half> { struct RadixConfig<half> {
typedef unsigned int RadixType; typedef unsigned int RadixType;
...@@ -138,23 +139,29 @@ struct RadixConfig<half> { ...@@ -138,23 +139,29 @@ struct RadixConfig<half> {
#endif #endif
} }
}; };
#endif
// $$inp_t should be replaced in c_code
// we cannot use templated __global__ because gpuarray API does not support it yet
#define NDIM $ndim
#define INPUT_TYPE $inp_t
#define INDEX_TYPE $out_t
#define bitsof(T) (sizeof(T)*8) #define bitsof(T) (sizeof(T)*8)
#define RADIX_BITS 2 #define RADIX_BITS 2
#define RADIX_SIZE (1<<RADIX_BITS) #define RADIX_SIZE (1<<RADIX_BITS)
#define RADIX_MASK(n) ((RADIX_SIZE-1) << (n*RADIX_BITS)) #define RADIX_MASK(n) ((RADIX_SIZE-1) << (n*RADIX_BITS))
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS) #define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
#define radix_t RadixConfig<T>::RadixType #define radix_t RadixConfig<INPUT_TYPE>::RadixType
#if RADIX_SIZE > 32 #if RADIX_SIZE > 32
#error "RADIX_SIZE must be smaller than warp size (32)" #error "RADIX_SIZE must be smaller than warp size (32)"
#endif #endif
template <typename T> template <typename T>
inline __device__ T binary_cumsum(int idx, int warp_id, int lane_id, T* smem, bool value) { static inline __device__ T binary_cumsum(int idx, int warp_id, int lane_id, T* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads whose id is *no greater than* the current thread // cumsum within 1D thread block, which adds up `value` of all threads whose id is *no greater than* the current thread
// cumsum within warp // cumsum within warp
unsigned int warp_bits = __ballot(in); unsigned int warp_bits = __ballot(value);
T warp_sum = __popc(((2<<lane_id)-1) & warp_bits); T warp_sum = __popc(((2<<lane_id)-1) & warp_bits);
if (lane_id == 0) if (lane_id == 0)
...@@ -175,20 +182,21 @@ inline __device__ T binary_cumsum(int idx, int warp_id, int lane_id, T* smem, bo ...@@ -175,20 +182,21 @@ inline __device__ T binary_cumsum(int idx, int warp_id, int lane_id, T* smem, bo
__syncthreads(); __syncthreads();
// load the carry from the preceding warp // load the carry from the preceding warp
if (warp >= 1) { if (warp_id >= 1) {
warp_sum = warp_sum+smem[warp - 1]; warp_sum = warp_sum+smem[warp_id - 1];
} }
return warp_sum; return warp_sum;
} }
template <typename T> template <typename T>
inline __device__ T binary_cumsum_exclusive( static inline __device__ T binary_cumsum_exclusive(
int idx, int warp_id, int lane_id, T* smem, bool value) { int idx, int warp_id, int lane_id, T* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads // cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *less than* the current thread // whose id is *less than* the current thread
// cumsum within warp // cumsum within warp
unsigned int warp_bits = __ballot(in); unsigned int warp_bits = __ballot(value);
T warp_sum = __popc(((1<<lane_id)-1) & warp_bits); T warp_sum = __popc(((1<<lane_id)-1) & warp_bits);
if (lane_id == 0) if (lane_id == 0)
...@@ -209,35 +217,77 @@ inline __device__ T binary_cumsum_exclusive( ...@@ -209,35 +217,77 @@ inline __device__ T binary_cumsum_exclusive(
__syncthreads(); __syncthreads();
// load the carry from the preceding warp // load the carry from the preceding warp
if (warp >= 1) { if (warp_id >= 1)
warp_sum = warp_sum+smem[warp - 1]; warp_sum += smem[warp_id - 1];
}
return warp_sum; return warp_sum;
} }
// apply raw(byte) offset to pointer
template <typename T>
static __device__ inline T* ptr_add(T *ptr, ga_ssize offset) {
return (T*)((char*)ptr + offset);
}
// get array element using raw(byte) offset
template <typename T> template <typename T>
void __global__ topk_1d_contig_kernel(T* dst, T* src, size_t size, size_t k) { static __device__ inline T& ptr_at(T *ptr, ga_ssize offset) {
extern radix_t smem[]; return *((T*)((char*)ptr + offset));
ssize_t bins[RADIX_SIZE]; // TODO: does using 32-bit gives speedup? }
KERNEL void k_topk_dense(
$dims
// ga_size dims_1, ga_ssize dims_2, ... , dims_$${NDIM}
$dstv
// INPUT_TYPE *dstv
$dstv_strides
// ga_ssize dstv_strides_0, ga_ssize dstv_strides_1, ... , dstv_strides_$${NDIM}
$dsti
// INDEX_TYPE *dsti
$dsti_strides
// ga_ssize dsti_strides_0, ga_ssize dsti_strides_1, ... , dsti_strides_$${NDIM}
ga_ssize k,
INPUT_TYPE* src,
$src_strides
// ga_ssize src_strides_0, ga_ssize src_strides_1, ... , src_strides_$${NDIM}
size_t size) {
/*
extern __shared__ radix_t smem[];
ga_ssize __shared__ bins[RADIX_SIZE]; // TODO: does using 32-bit gives speedup?
bool is_topk = true; bool is_topk = true;
bool is_topkth = true; // exactly k-th largest bool is_topkth = true; // exactly k-th largest
radix_t out_idx;
const size_t idx = threadIdx.x;
size_t __shared__ k2, exceed;
const ga_uint warp_id = idx / 32;
const ga_uint lane_id = idx % 32;
radix_t *wmem = (radix_t*)(smem) + warp_id * 32;
const bool in_range = (idx < size);
is_topk &= in_range;
const INPUT_TYPE xval = in_range ? ptr_at(src, idx*src_strides_0) : (INPUT_TYPE)0;
radix_t x = in_range ? RadixConfig<INPUT_TYPE>::convert(xval) : 0;
// resolve negative k
if (k<0) { x = ~x; k = -k; }
if (idx==0) k2 = k;
// 0. get the slice for thread block to work on
size_t gid = blockIdx.x, gidx;
$set_slice
//for(int i=0; i<NDIM; i++) {
//gidx = gid % dims_$${i};
//gid /= dims_$${i};
//dsti = ptr_add(dsti, gidx*dsti_strides_$${i+1};
//dstv = ptr_add(dstv, gidx*dstv_strides_$${i+1};
//src = ptr_add(src, gidx*src_strides_$${i+1});
//}
// 1. filter is_topk and is_topkth using radix select
size_t idx = threadIdx.x;
size_t k2 = k, exceed;
int warp_id = idx / 32;
int lane_id = idx % 32;
radix_t wmem = smem + warp_id * 32;
bool in_range = (idx < size);
RadixConfig<T>::RadixType x = in_range ? RadixConfig<T>::convert(src[idx]) : 0;
// 1. find the kth largest value using radix select
// 1.1 for each radix mask, count
smem[threadIdx.x] = 0;
#pragma unroll #pragma unroll
for (int i=bitsof(T)-RADIX_BITS; i; i-=RADIX_BITS) { for (int i=bitsof(INPUT_TYPE)-RADIX_BITS; i>=0; i-=RADIX_BITS) {
radix_t mask = (RADIX_SIZE-1)<<i; smem[idx] = 0;
int digit = (x>>i) & (RADIX_SIZE-1); int digit = (x>>i) & (RADIX_SIZE-1);
// count within warp // count within warp
#pragma unroll #pragma unroll
...@@ -245,43 +295,34 @@ void __global__ topk_1d_contig_kernel(T* dst, T* src, size_t size, size_t k) { ...@@ -245,43 +295,34 @@ void __global__ topk_1d_contig_kernel(T* dst, T* src, size_t size, size_t k) {
bool incr_bin = (bin == digit) && is_topkth && in_range; bool incr_bin = (bin == digit) && is_topkth && in_range;
unsigned int incr_bin_warp = __ballot(incr_bin); unsigned int incr_bin_warp = __ballot(incr_bin);
if (lane_id==0) if (lane_id==0)
wmem[bin] += __popc(bin_warp); wmem[bin] += __popc(incr_bin_warp);
} }
__syncthreads(); __syncthreads();
// sum counts across all warps // sum counts across all warps
// TODO: test in-block parallel sum? // TODO: test in-block parallel sum?
if (idx<RADIX_SIZE) if (idx < RADIX_SIZE) {
bins[idx] = 0; for(int w=32; w<blockDim.x; w+=32)
if (idx==0) { smem[idx] += smem[idx + w];
for(int w=1; w<blockDim.x/32; ++w) {
#pragma unroll
for(int bin=0; bin<RADIX_SIZE; ++bin) {
smem[bin] += wmem[bin];
}
}
} }
__syncthreads(); __syncthreads();
// broadcast sum result
if (idx < RADIX_SIZE)
smem[idx] = bins[idx];
__syncthreads();
// calculate k minus cumsum(count) // calculate k minus cumsum(count)
exceed = -k; // how many the number of is_topk exceeds k if (idx<RADIX_SIZE)
bins[idx] = 0;
if (idx == 0) { if (idx == 0) {
bins[0] = k2 - smem[0]; exceed = k; // how many the number of is_topk exceeds k
if (bins[0] > 0) bins[RADIX_SIZE-1] = k2 - smem[RADIX_SIZE-1];
k2 = bins[0]; if (bins[RADIX_SIZE-1] > 0)
else if (bins[0] < 0) k2 = bins[RADIX_SIZE-1];
exceed = max(exceed, bins[0]); else
exceed = min(exceed, bins[RADIX_SIZE-1]);
#pragma unroll #pragma unroll
for(int bin=1; bin<RADIX_SIZE; ++bin) { for(int bin=RADIX_SIZE-1; bin; --bin) {
bins[bin] = bins[bin-1] - smem[bin]; bins[bin-1] = bins[bin] - smem[bin-1];
if (bins[bin] > 0) if (bins[bin-1] > 0)
k2 = bins[bin]; k2 = bins[bin-1];
else if (bins[bin] < 0) else
exceed = max(exceed, bins[bin]); exceed = min(exceed, bins[bin-1]);
} }
} }
__syncthreads(); __syncthreads();
...@@ -290,7 +331,7 @@ void __global__ topk_1d_contig_kernel(T* dst, T* src, size_t size, size_t k) { ...@@ -290,7 +331,7 @@ void __global__ topk_1d_contig_kernel(T* dst, T* src, size_t size, size_t k) {
// smem -> count // smem -> count
// bins -> k2 - cumsum(count) // bins -> k2 - cumsum(count)
if (is_topk && is_topkth) { if (is_topk && is_topkth) {
ssize_t icount = bins[digit]; ga_ssize icount = bins[digit];
if (icount > 0) { if (icount > 0) {
is_topkth = false; is_topkth = false;
} else if (icount < 0) { } else if (icount < 0) {
...@@ -305,17 +346,23 @@ void __global__ topk_1d_contig_kernel(T* dst, T* src, size_t size, size_t k) { ...@@ -305,17 +346,23 @@ void __global__ topk_1d_contig_kernel(T* dst, T* src, size_t size, size_t k) {
} }
// 2. find the index of output array, if exists // 2. find the index of output array, if exists
//
// top_kth value may not be unique, so we need to if (exceed != 0) {
// count how many is needed // top_kth value may not be unique, so we need to
// perform binary cumsum on is_topkth to drop exceeding top-kth values
// perform binary cumsum on is_topkth to drop exceeding top-kth values out_idx = binary_cumsum_exclusive<radix_t>(idx, warp_id, lane_id, smem, is_topkth);
radix_t topkth_idx = binary_cumsum_exclusive<radix_t>(idx, warp_id, lane_id, smem, is_topkth); is_topk &= (out_idx < exceed);
if (topkth_idx >= exceed) }
is_topk = false;
// perform binary cumsum on is_topk to determine the indices to put result
// perform binary cumsum on is_topk to determine idx to put result out_idx = binary_cumsum_exclusive<radix_t>(idx, warp_id, lane_id, smem, is_topk);
topkth_idx = binary_cumsum_exclusive<radix_t>(idx, warp_id, lane_id, smem, is_topkth); __syncthreads();
if (is_topk)
dst[topkth_idx] = idx; if (is_topk) {
$write_value;
// ptr_at(dstv, out_idx * dstv_strides_0) = xval;
$write_index;
// ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx;
}
*/
} }
...@@ -40,7 +40,7 @@ from theano.tensor import nnet # used for softmax, sigmoid, etc. ...@@ -40,7 +40,7 @@ from theano.tensor import nnet # used for softmax, sigmoid, etc.
from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \ from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \
jacobian, hessian, consider_constant jacobian, hessian, consider_constant
from theano.tensor.sort import sort, argsort, argtopk from theano.tensor.sort import sort, argsort, topk, argtopk, topk_and_argtopk
from theano.tensor.extra_ops import (DiffOp, bincount, squeeze, from theano.tensor.extra_ops import (DiffOp, bincount, squeeze,
repeat, bartlett, fill_diagonal, fill_diagonal_offset, repeat, bartlett, fill_diagonal, fill_diagonal_offset,
cumsum, cumprod) cumsum, cumprod)
......
...@@ -221,117 +221,217 @@ def argsort(a, axis=-1, kind='quicksort', order=None): ...@@ -221,117 +221,217 @@ def argsort(a, axis=-1, kind='quicksort', order=None):
if hasattr(np, 'argpartition'): if hasattr(np, 'argpartition'):
def _argtopk_py_impl(x, k, axis, out_dtype): # numpy >= 1.8 implementation
# numpy >= 1.8 implementation def _topk_py_impl(op, x, k, axis, idx_dtype):
if k == 1:
return np.expand_dims(
np.argmax(x, axis=axis).astype(out_dtype), axis)
elif k == -1:
return np.expand_dims(
np.argmin(x, axis=axis).astype(out_dtype), axis)
ndim = x.ndim ndim = x.ndim
asize = x.shape[axis] if abs(k) == 1:
if asize == abs(k): i = (k + 1) // 2
z = np.arange(abs(k), dtype=out_dtype) fn_max = [np.min, np.max][i]
l = axis % ndim fn_argmax = [np.argmin, np.argmax][i]
r = ndim - l if not op.return_indices:
z = z.reshape((1,) * l + (k,) + (1,) * (r - 1)) return np.expand_dims(fn_max(x, axis=axis), axis)
reps = list(x.shape) elif op.return_values:
reps[axis] = 1 zi = np.expand_dims(
return np.tile(z, reps) fn_argmax(x, axis=axis).astype(idx_dtype), axis)
idx2 = tuple(np.arange(s).reshape((s,) + (1,) * (ndim - i - 1)) if i != axis else zi for i, s in enumerate(x.shape))
print('used axis %d' % axis) zv = x[idx2]
z = np.argpartition(x, -k, axis=axis) return zv, zi.astype(idx_dtype)
idx = (slice(None),) * (axis % ndim) else:
if k > 0: zi = np.expand_dims(
idx += (slice(-k, None),) fn_argmax(x, axis=axis).astype(idx_dtype), axis)
elif k < 0: return zi.astype(idx_dtype)
idx += (slice(-k),)
else:
raise ValueError('k cannot be zero')
return z[idx].astype(out_dtype)
else:
def _argtopk_py_impl(x, k, axis, out_dtype):
if k == 1:
return np.argmax(x, axis=axis).astype(out_dtype)
elif k == -1:
return np.argmin(x, axis=axis).astype(out_dtype)
ndim = x.ndim
asize = x.shape[axis] asize = x.shape[axis]
if asize == abs(k): if asize == abs(k):
z = np.arange(abs(k), dtype=out_dtype) if not op.return_indices:
l = axis % ndim return x.copy()
r = ndim - l else:
z = z.reshape((1,) * l + (k,) + (1,) * r) l = axis
reps = list(x.shape) r = ndim - l
reps[axis] = 1 reps = list(x.shape)
return np.tile(z, reps) reps[axis] = 1
zi = np.arange(abs(k), dtype=idx_dtype)
# numpy implementation for older version zi = zi.reshape((1,) * l + (k,) + (1,) * (r - 1))
z = np.argsort(x, axis=axis) zi = np.tile(zi, reps)
idx = (slice(None),) * (axis - 1) if op.return_values:
return x.copy(), zi
else:
return zi
idx = [slice(None)] * ndim
if k > 0: if k > 0:
idx += (slice(-k, None),) idx[axis] = slice(-k, None)
elif k < 0: elif k < 0:
idx += (slice(-k),) idx[axis] = slice(-k)
else: else:
raise ValueError('k cannot be zero') raise ValueError('k cannot be zero')
return z[idx].astype(out_dtype) if not op.return_indices:
zv = np.partition(x, -k, axis=axis)[idx]
return zv
elif op.return_values:
zi = np.argpartition(x, -k, axis=axis)[idx]
idx2 = tuple(np.arange(s).reshape((s,)+(1,)*(ndim-i-1)) if i != axis else zi for i, s in enumerate(x.shape))
zv = x[idx2]
return zv, zi.astype(idx_dtype)
else:
zi = np.argpartition(x, -k, axis=axis)[idx]
return zi
else:
def _topk_py_impl(op, x, k, axis, idx_dtype):
# TODO better compatibility?
raise NotImplementedError('TopKOp: need numpy.argpartition() method (numpy >= 1.8)')
class ArgTopKOp(theano.Op): class TopKOp(theano.Op):
""" """
See help(theano.argtopk) Operations related to finding k-largest elements.
The outputs of this Op depends on ``returns_values`` and ``return_indices``,
if both ``True``, will return two outputs, corresponding to k-largest values
and indices. If only one is ``True``, this Op shall have only one output. Can't
be both ``False``.
Parameters
----------
axis: integer
The axis to perform the operation. Must be in range ``[-ndim, ndim)``, where
``ndim`` is the dimensionality of input tensor.
return_values: bool
Defaults to ``True``
If ``True``, one output of the Op will return k-largest array values.
return_indices: bool
Defaults to ``False``
If ``True``, one output of the Op will return the indices on the given axis.
Notes
-----
- ``return_values`` and ``return_indices`` cannot be both ``False``
See Also
--------
topk
argtopk
argtopk_and_topk
""" """
__props__ = ('axis',) # TODO more params
'''
sorted: bool
Defaults to ``False``
If True, the result array would be incremental-sorted. Mutually exclusive with ``sparse``
sparse: bool
Defaults to ``False``
if ``True``, the output array will always have the same shape as input.
The non-top-k values will be replaced by zero.
def __init__(self, axis=-1): only_top_kth: bool
Defaults to ``False``
If ``True``, will only find the exact top k-th element. The Op behaves
like a reduction.
'''
# TODO c_code
__props__ = ('axis', 'return_values', 'return_indices')
def __init__(self, axis=-1, return_indices=False, return_values=True):
assert isinstance(axis, int) assert isinstance(axis, int)
assert return_indices or return_values
self.axis = axis self.axis = axis
self.return_indices = return_indices
self.return_values = return_values
def __str__(self): def __str__(self):
return '%(op)s{axis=%(axis)d}' % dict( return '%(op)s{axis=%(axis)d}' % dict(
op=self.__class__.__name__, axis=self.axis) op=self.__class__.__name__, axis=self.axis)
def make_node(self, inp, k, out_dtype='int64'): def make_node(self, inp, k, idx_dtype='int64'):
# numpy always uses float64 as output dtype for arg*() routines
# however, we add this option as memory is more precious on gpu
inp = theano.tensor.as_tensor_variable(inp) inp = theano.tensor.as_tensor_variable(inp)
k = theano.tensor.as_tensor_variable(k) k = theano.tensor.as_tensor_variable(k)
bcast = inp.type.broadcastable bcast = inp.type.broadcastable
return theano.Apply(self, [inp, k], [ outs = []
theano.tensor.TensorType( if self.return_values:
dtype=out_dtype, outs.append(inp.type())
broadcastable=bcast)()]) if self.return_indices:
outs.append(
theano.tensor.TensorType(dtype=idx_dtype, broadcastable=bcast)())
return theano.Apply(self, [inp, k], outs)
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
x, k = inputs x, k = inputs
pz = output_storage[0] ndim = x.ndim
print("Op's axis: %d" % self.axis) axis = self.axis
pz[0] = _argtopk_py_impl(x, k, self.axis, node.outputs[0].dtype) assert -ndim <= axis < ndim
axis %= ndim
if not self.return_indices:
pzv = output_storage[0]
pzv[0] = _topk_py_impl(self, x, k, axis, None)
elif self.return_values:
pzv = output_storage[0]
pzi = output_storage[1]
pzv[0], pzi[0] = _topk_py_impl(self, x, k, axis, node.outputs[1].dtype)
else:
pzi = output_storage[0]
pzi[0] = _topk_py_impl(self, x, k, axis, node.outputs[0].dtype)
def infer_shape(self, node, inp_shapes): def infer_shape(self, node, inp_shapes):
# numpy always uses float64 as output dtype for arg*() routines
# however, we add this option as memory is more precious on gpu
_check_tensor_is_scalar(node.inputs[1]) _check_tensor_is_scalar(node.inputs[1])
shp = list(inp_shapes[0]) shp = list(inp_shapes[0])
if not isinstance(self.axis, int): if not isinstance(self.axis, int):
raise TypeError( raise TypeError(
'axis parameter must be integer, got "%s"' % type(self.axis)) '"axis" parameter must be integer, got "%s"' % type(self.axis))
ndim = node.inputs[0].ndim ndim = node.inputs[0].ndim
if ndim == 0: if ndim == 0:
raise ValueError('cannot use 0d tensor') raise ValueError('Cannot take 0d tensor as input')
if not -ndim <= self.axis < ndim: if not -ndim <= self.axis < ndim:
raise IndexError( raise IndexError(
'axis parameter out of range,' '"axis" parameter out of range,'
' expected integer within [%d, %d]' % (-ndim, ndim - 1)) ' expected integer within [%d, %d]' % (-ndim, ndim - 1))
shp[self.axis] = np.abs(node.inputs[1]) shp[self.axis] = np.abs(node.inputs[1])
return [tuple(shp)] shp = tuple(shp)
return [shp for i in [self.return_values, self.return_indices] if i]
def topk(x, k, axis=-1):
"""
Returns the k-largest elements along an axis.
Parameters
----------
x: tensor instance
k: integer constant/variable
Must not be 0. If negative, gives k-smallest elements instead.
axis: integer or ``None``
Upon which axis shall the operation be performed on. If ``None``,
works on flattened array.
Notes
-----
- The returned values may not be sorted.
def argtopk(x, k, axis=-1, out_dtype='int64'): """
if axis is None:
x = theano.tensor.flatten(x)
axis = -1
return TopKOp(axis=axis)(x, k)
def argtopk(x, k, axis=-1, idx_dtype='int64'):
""" """
Returns the indices of k-largest elements along an axis. Returns the indices of k-largest elements along an axis.
...@@ -341,21 +441,35 @@ def argtopk(x, k, axis=-1, out_dtype='int64'): ...@@ -341,21 +441,35 @@ def argtopk(x, k, axis=-1, out_dtype='int64'):
x: tensor instance x: tensor instance
k: integer constant/variable k: integer constant/variable
Must not be 0. If negative, gives k-least elements instead. Must not be 0. If negative, gives k-smallest elements instead.
axis: integer or ``None`` axis: integer or ``None``
Upon which axis shall the operation be performed on. If ``None``, Upon which axis shall the operation be performed on. If ``None``,
works on flattened array. works on flattened array.
out_dtype: string idx_dtype: string
Specify output dtype, defaults to ``int64``, must be integer type Specify output dtype, defaults to ``int64``, must be integer type.
Notes Notes
----- -----
- The corresponding value of returned indices may not be sorted themselves - The corresponding values of returned indices may not be sorted.
""" """
if axis is None: if axis is None:
x = theano.tensor.flatten(x) x = theano.tensor.flatten(x)
axis = -1 axis = -1
return ArgTopKOp(axis=axis)(x, k, out_dtype=out_dtype) return TopKOp(axis=axis, return_indices=True, return_values=False)(x, k, idx_dtype=idx_dtype)
def topk_and_argtopk(x, k, axis=-1, idx_dtype='int64'):
'''
Returns the results of both topk() and argtopk() in one Op.
See the respective documentation for details.
'''
if axis is None:
x = theano.tensor.flatten(x)
axis = -1
return TopKOp(axis=axis, return_indices=True)(x, k, idx_dtype=idx_dtype)
...@@ -11,7 +11,7 @@ from theano import tensor ...@@ -11,7 +11,7 @@ from theano import tensor
from theano.tensor.sort import sort, SortOp from theano.tensor.sort import sort, SortOp
from theano.tensor.sort import argsort, ArgSortOp from theano.tensor.sort import argsort, ArgSortOp
from theano.tensor.sort import argtopk, ArgTopKOp from theano.tensor.sort import topk, argtopk, topk_and_argtopk, TopKOp
_dtypes = ( _dtypes = (
'float32', 'float64', 'float32', 'float64',
...@@ -24,10 +24,11 @@ _int_dtypes = ( ...@@ -24,10 +24,11 @@ _int_dtypes = (
def gen_unique_vector(size, dtype): def gen_unique_vector(size, dtype):
# generate a randomized vector with unique elements # generate a randomized vector with unique elements
retval = np.cumsum(np.random.uniform(1.01, 3.01, size)) retval = np.arange(size*3) + np.random.uniform(-1., 1.)
return (retval[np.random.permutation(size)] - size).astype(dtype) return (retval[np.random.permutation(size)] - size*1.5).astype(dtype)
'''
class Test_sort(unittest.TestCase): class Test_sort(unittest.TestCase):
def setUp(self): def setUp(self):
...@@ -235,21 +236,67 @@ def test_argsort_grad(): ...@@ -235,21 +236,67 @@ def test_argsort_grad():
data = np.random.rand(2, 3, 3).astype(theano.config.floatX) data = np.random.rand(2, 3, 3).astype(theano.config.floatX)
utt.verify_grad(lambda x: argsort(x, axis=2), [data]) utt.verify_grad(lambda x: argsort(x, axis=2), [data])
'''
class Test_topk(unittest.TestCase): class Test_TopK(unittest.TestCase):
def setUp(self): def setUp(self):
pass pass
@utt.parameterized.expand(product( @utt.parameterized.expand(product(
_dtypes, _int_dtypes, [-1, 0, None])) _dtypes, _int_dtypes, [-1, 0, None]))
def test_sanity(self, dtype, out_dtype, axis): def test_argtopk_sanity(self, dtype, idx_dtype, axis):
x = tensor.vector(name='x', dtype=dtype) x = tensor.vector(name='x', dtype=dtype)
fn = theano.function([x], argtopk(x, 1, axis=axis, out_dtype=out_dtype)) fn = theano.function([x], argtopk(x, 1, axis=axis, idx_dtype=idx_dtype))
xval = np.asarray([1]).astype(dtype) xval = np.asarray([1]).astype(dtype)
yval = fn(xval) yval = fn(xval)
assert yval == np.asarray([0], dtype=out_dtype) assert yval == np.asarray([0], dtype=idx_dtype)
@utt.parameterized.expand(product(
_dtypes, [-1, 0, None]))
def test_topk_sanity(self, dtype, axis):
x = tensor.vector(name='x', dtype=dtype)
fn = theano.function([x], topk(x, 1, axis=axis))
xval = np.asarray([1]).astype(dtype)
yval = fn(xval)
assert yval == xval
@utt.parameterized.expand(product(
_dtypes, _int_dtypes, [-1, 0, None]))
def test_combined_sanity(self, dtype, idx_dtype, axis):
x = tensor.vector(name='x', dtype=dtype)
yv, yi = topk_and_argtopk(x, 1, axis=axis, idx_dtype=idx_dtype)
fn = theano.function([x], [yv, yi])
xval = np.asarray([1]).astype(dtype)
yvval, yival = fn(xval)
assert yival == np.asarray([0], dtype=idx_dtype)
assert np.allclose(xval, yvval)
@utt.parameterized.expand(chain(
product(
(16, 61, 257),
(1, -1, 10, -10, 'n//2', 'n-1', '-n', '1-n'),
('float64', 'int16', 'int8')),
((2049, 1337, 'float64'),)))
def test_topk_1d(self, size, k, dtype):
if isinstance(k, str):
k = eval(k.replace('n', str(size)))
x = theano.tensor.vector(name='x', dtype=dtype)
y = topk(x, k)
fn = theano.function([x], y)
# generate a all-unique array
xval = gen_unique_vector(size, dtype)
yval = fn(xval)
idx = slice(-k, None) if k > 0 else slice(-k)
goal = np.sort(xval)[idx]
print(np.sort(yval))
print(goal)
assert yval.dtype == goal.dtype
assert np.allclose(np.sort(yval), goal)
@utt.parameterized.expand(chain( @utt.parameterized.expand(chain(
product( product(
...@@ -258,38 +305,60 @@ class Test_topk(unittest.TestCase): ...@@ -258,38 +305,60 @@ class Test_topk(unittest.TestCase):
('float32', 'int32'), ('float32', 'int32'),
('int32', 'int64')), ('int32', 'int64')),
((2049, 1337, 'float32', 'int32'),))) ((2049, 1337, 'float32', 'int32'),)))
def test_1d(self, size, k, dtype, out_dtype): def test_argtopk_1d(self, size, k, dtype, idx_dtype):
if isinstance(k, str): if isinstance(k, str):
k = eval(k.replace('n', str(size))) k = eval(k.replace('n', str(size)))
x = theano.tensor.vector(name='x', dtype=dtype) x = theano.tensor.vector(name='x', dtype=dtype)
y = argtopk(x, k, out_dtype=out_dtype) y = argtopk(x, k, idx_dtype=idx_dtype)
fn = theano.function([x], y) fn = theano.function([x], y)
# generate a all-unique array # generate a all-unique array
xval = gen_unique_vector(size, dtype) xval = gen_unique_vector(size, dtype)
yval = fn(xval) yval = fn(xval)
idx = slice(-k, None) if k > 0 else slice(-k) idx = slice(-k, None) if k > 0 else slice(-k)
goal = np.argsort(xval)[idx].astype(out_dtype) goal = np.argsort(xval)[idx].astype(idx_dtype)
print(yval)
print(goal)
print(np.argsort(xval))
# due to uniqueness, we expect indices same # due to uniqueness, we expect indices same
assert np.all(xval[np.sort(yval)] == xval[np.sort(goal)]) assert np.all(xval[np.sort(yval)] == xval[np.sort(goal)])
@utt.parameterized.expand(chain(
product(
(16, 61, 257),
(1, -1, 10, -10, 'n//2', 'n-1', '-n', '1-n'),
('float32', 'int32'),
('int32', 'int64')),
((2049, 1337, 'float32', 'int32'),)))
def test_combined_1d(self, size, k, dtype, idx_dtype):
if isinstance(k, str):
k = eval(k.replace('n', str(size)))
x = theano.tensor.vector(name='x', dtype=dtype)
yv, yi = topk_and_argtopk(x, k, idx_dtype=idx_dtype)
fn = theano.function([x], [yv, yi])
# generate a all-unique array
xval = gen_unique_vector(size, dtype)
yvval, yival = fn(xval)
idx = slice(-k, None) if k > 0 else slice(-k)
goali = np.argsort(xval)[idx].astype(idx_dtype)
goalv = xval[goali]
# due to uniqueness, we expect indices same
assert np.all(xval[np.sort(yival)] == xval[np.sort(goali)])
assert np.allclose(np.sort(yvval), goalv)
@utt.parameterized.expand(chain( @utt.parameterized.expand(chain(
product( product(
(18, 62, 258), (18, 62, 258),
(1, -1, 'n//2'), (1, -1, 'n//2'),
('int32', 'float32')), ('int32', 'float32')),
((2048, 1337, 'float32'),))) ((2048, 1337, 'float32'),)))
def test_1d_collision(self, size, k, dtype): def test_argtopk_1d_collision(self, size, k, dtype):
# with non-unique kth max value # with non-unique kth max value
if isinstance(k, str): if isinstance(k, str):
k = eval(k.replace('n', str(size))) k = eval(k.replace('n', str(size)))
x = theano.tensor.vector(name='x', dtype=dtype) x = theano.tensor.vector(name='x', dtype=dtype)
y = argtopk(x, k, out_dtype='int32') y = argtopk(x, k, idx_dtype='int32')
fn = theano.function([x], y) fn = theano.function([x], y)
xval = np.repeat(np.random.uniform(-100., 100., size=size // 2).astype(dtype), 2) xval = np.repeat(np.random.uniform(-100., 100., size=size // 2).astype(dtype), 2)
xval = xval[np.random.permutation(size)] xval = xval[np.random.permutation(size)]
...@@ -305,7 +374,7 @@ class Test_topk(unittest.TestCase): ...@@ -305,7 +374,7 @@ class Test_topk(unittest.TestCase):
(1, -1, '(1+n)//2', 'n-1', '-n', '1-n'), (1, -1, '(1+n)//2', 'n-1', '-n', '1-n'),
('float32', 'int32'), ('float32', 'int32'),
('int32', 'int64'))) ('int32', 'int64')))
def test_nd(self, shp, k_, dtype, out_dtype): def test_argtopk_nd(self, shp, k_, dtype, idx_dtype):
ndim = len(shp) ndim = len(shp)
for axis in range(-ndim, ndim): for axis in range(-ndim, ndim):
if isinstance(k_, str): if isinstance(k_, str):
...@@ -318,7 +387,7 @@ class Test_topk(unittest.TestCase): ...@@ -318,7 +387,7 @@ class Test_topk(unittest.TestCase):
x = theano.tensor.tensor( x = theano.tensor.tensor(
name='x', broadcastable=(False,) * len(shp), dtype=dtype) name='x', broadcastable=(False,) * len(shp), dtype=dtype)
y = argtopk(x, k, axis=axis, out_dtype=out_dtype) y = argtopk(x, k, axis=axis, idx_dtype=idx_dtype)
fn = theano.function([x], y) fn = theano.function([x], y)
size = reduce(int.__mul__, shp) size = reduce(int.__mul__, shp)
xval = gen_unique_vector(size, dtype).reshape(shp) xval = gen_unique_vector(size, dtype).reshape(shp)
...@@ -327,20 +396,47 @@ class Test_topk(unittest.TestCase): ...@@ -327,20 +396,47 @@ class Test_topk(unittest.TestCase):
l = axis % ndim l = axis % ndim
r = ndim - l r = ndim - l
idx = (slice(None),) * l + (idx,) + (slice(None),) * (r - 1) idx = (slice(None),) * l + (idx,) + (slice(None),) * (r - 1)
goal = np.argsort(xval, axis=axis)[idx].astype(out_dtype) goal = np.argsort(xval, axis=axis)[idx].astype(idx_dtype)
print(dict(k=k, axis=axis, shp=shp)) print(dict(k=k, axis=axis, shp=shp))
print('x:')
print(xval)
print('y:')
print(np.sort(yval, axis=axis)) print(np.sort(yval, axis=axis))
print('goal:')
print(np.sort(goal, axis=axis)) print(np.sort(goal, axis=axis))
# print(np.argsort(xval)) # print(np.argsort(xval))
assert np.all(np.sort(yval, axis=axis) == np.sort(goal, axis=axis)) assert np.all(np.sort(yval, axis=axis) == np.sort(goal, axis=axis))
class ArgTopKInferShapeTester(utt.InferShapeTester): class TopKInferShapeTester(utt.InferShapeTester):
@utt.parameterized.expand(product(
((2, 3), (15, 17), (11, 7, 5), (2, 3, 5, 7, 11), (2, 4, 3, 1)),
(1, -1, '(1+n)//2', 'n-1', '-n', '1-n')))
def test_topk_infer_shape(self, shp, k_):
ndim = len(shp)
for axis in range(-ndim, ndim):
if isinstance(k_, str):
k = eval(k_.replace('n', str(shp[axis])))
else:
k = k_
if k == 0:
continue
x = theano.tensor.tensor(
name='x', broadcastable=(False,) * len(shp),
dtype=theano.config.floatX)
y = topk(x, k, axis=axis)
size = reduce(int.__mul__, shp)
xval = gen_unique_vector(size, theano.config.floatX).reshape(shp)
self._compile_and_check(
[x], [y], [xval], TopKOp)
@utt.parameterized.expand(product( @utt.parameterized.expand(product(
((2, 3), (15, 17), (11, 7, 5), (2, 3, 5, 7, 11), (2, 4, 3, 1)), ((2, 3), (15, 17), (11, 7, 5), (2, 3, 5, 7, 11), (2, 4, 3, 1)),
(1, -1, '(1+n)//2', 'n-1', '-n', '1-n'))) (1, -1, '(1+n)//2', 'n-1', '-n', '1-n')))
def test_infer_shape(self, shp, k_): def test_argtopk_infer_shape(self, shp, k_):
ndim = len(shp) ndim = len(shp)
for axis in range(-ndim, ndim): for axis in range(-ndim, ndim):
if isinstance(k_, str): if isinstance(k_, str):
...@@ -354,8 +450,32 @@ class ArgTopKInferShapeTester(utt.InferShapeTester): ...@@ -354,8 +450,32 @@ class ArgTopKInferShapeTester(utt.InferShapeTester):
x = theano.tensor.tensor( x = theano.tensor.tensor(
name='x', broadcastable=(False,) * len(shp), name='x', broadcastable=(False,) * len(shp),
dtype=theano.config.floatX) dtype=theano.config.floatX)
y = argtopk(x, k, axis=axis, out_dtype='int32') y = argtopk(x, k, axis=axis, idx_dtype='int32')
size = reduce(int.__mul__, shp) size = reduce(int.__mul__, shp)
xval = gen_unique_vector(size, theano.config.floatX).reshape(shp) xval = gen_unique_vector(size, theano.config.floatX).reshape(shp)
self._compile_and_check( self._compile_and_check(
[x], [y], [xval], ArgTopKOp) [x], [y], [xval], TopKOp)
@utt.parameterized.expand(product(
((2, 3), (15, 17), (11, 7, 5), (2, 3, 5, 7, 11), (2, 4, 3, 1)),
(1, -1, '(1+n)//2', 'n-1', '-n', '1-n')))
def test_combined_infer_shape(self, shp, k_):
ndim = len(shp)
for axis in range(-ndim, ndim):
if isinstance(k_, str):
k = eval(k_.replace('n', str(shp[axis])))
else:
k = k_
if k == 0:
continue
x = theano.tensor.tensor(
name='x', broadcastable=(False,) * len(shp),
dtype=theano.config.floatX)
yv, yi = topk_and_argtopk(x, k, axis=axis, idx_dtype='int32')
size = reduce(int.__mul__, shp)
xval = gen_unique_vector(size, theano.config.floatX).reshape(shp)
self._compile_and_check(
[x], [yv, yi], [xval], TopKOp)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论