Unverified commit 55e94968, authored by Frédéric Bastien, committed by GitHub

Merge pull request #6513 from abergeron/fix_gputopk

Fix compilation of GpuTopKOp
...@@ -187,6 +187,21 @@ struct RadixConfig<char> { ...@@ -187,6 +187,21 @@ struct RadixConfig<char> {
} }
}; };
// g++ treats 'signed char' (ga_byte, int8) and plain 'char' as distinct
// types, so ga_byte needs its own specialization, mirroring RadixConfig<char>.
template <>
struct RadixConfig<ga_byte> {
    typedef unsigned int RadixType;
    // Bias the signed byte by 128 so its ordering is preserved as an
    // unsigned radix key.
    static inline __device__ RadixType convert(ga_byte value) {
        return 128u + value;
    }
    // Undo the bias applied by convert().
    static inline __device__ ga_byte deconvert(RadixType bits) {
        return bits - 128;
    }
};
template <> template <>
struct RadixConfig<short> { struct RadixConfig<short> {
typedef unsigned int RadixType; typedef unsigned int RadixType;
...@@ -229,25 +244,27 @@ struct RadixConfig<long long> { ...@@ -229,25 +244,27 @@ struct RadixConfig<long long> {
} }
}; };
#define USE_HALF $use_half /* NB: This specialization for ga_half does know that ga_half is a struct with only one member of type ga_ushort.
* So, if ga_half implementation changes, this code should change too.
#if USE_HALF == 1 * TODO: Maybe should gpuarray provide abstract functions to manipulate ga_half internal structure? e.g:
// since half is ushort, using macro to protect this part is necessary * unsigned short ga_half2bits(ga_half value);
* ga_half ga_bits2half(unsigned short bits);
*/
template <> template <>
struct RadixConfig<unsigned short> { struct RadixConfig<ga_half> {
typedef unsigned int RadixType; typedef unsigned int RadixType;
static inline __device__ RadixType convert(unsigned short v) { static inline __device__ RadixType convert(ga_half v) {
RadixType mask = -(((RadixType)v >> 15)) | 0x8000; RadixType mask = -(((RadixType)v.data >> 15)) | 0x8000;
return (v ^ mask); return (v.data ^ mask);
} }
static inline __device__ unsigned short deconvert(RadixType v) { static inline __device__ ga_half deconvert(RadixType v) {
RadixType mask = ((v >> 15) - 1) | 0x8000; RadixType mask = ((v >> 15) - 1) | 0x8000;
return (unsigned short)(v ^ mask); ga_half out = {(unsigned short)(v ^ mask)};
return out;
} }
}; };
#endif // USE_HALF
// $$inp_t should be replaced in c_code // $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it // we cannot use templated kernel because gpuarray API does not support it
...@@ -356,3 +373,61 @@ static __device__ inline T ptr_read_cached(T *ptr, ssize_t offset) { ...@@ -356,3 +373,61 @@ static __device__ inline T ptr_read_cached(T *ptr, ssize_t offset) {
return __ldg(((T*)((char*)ptr + offset))); return __ldg(((T*)((char*)ptr + offset)));
} }
/* NB: __ldg has no overload for the ga_half struct, so ptr_read_cached must be
 * specialized.  The workaround reads the bytes through a built-in type assumed
 * to have the same size as ga_half.  Based on the ga_half implementation as of
 * 2017/11/27, that type is ga_ushort.  This code must be updated whenever the
 * size of the ga_half implementation changes, until a better mechanism is
 * provided. */
#define GA_HALF_STD_TYPE ga_ushort
static __device__ inline ga_half ptr_read_cached(ga_half *ptr, ssize_t offset) {
    /* Poor man's static_assert: the array size evaluates to -1 (a compile
     * error) if sizeof(GA_HALF_STD_TYPE) differs from sizeof(ga_half). */
    int check_ga_half_std_type[ ( ( sizeof(GA_HALF_STD_TYPE) - sizeof(ga_half) ) ? -1 : 1 ) ];
    /* offset is in bytes, hence the char* arithmetic before the read-only
     * cached load. */
    GA_HALF_STD_TYPE out = __ldg(((GA_HALF_STD_TYPE*)((char*)ptr + offset)));
    ga_half real_out;
    /* Type-pun the loaded bits back into a ga_half. */
    *(GA_HALF_STD_TYPE*)(&real_out) = out;
    return real_out;
}
#undef GA_HALF_STD_TYPE
/* Comparisons involving ga_half, and conversions from integer literals
 * (e.g. 0, 1) to ga_half, lead to compilation errors.  The following helper
 * functions are provided to bypass these issues. */

/* theano_zero<T>() / theano_one<T>(): typed constants 0 and 1, specialized
 * for ga_half via float-to-half conversion. */
template<typename T>
static __device__ inline T theano_zero() {return 0;}
template<>
__device__ inline ga_half theano_zero() {return ga_float2half(0);}
template<typename T>
static __device__ inline T theano_one() {return 1;}
template<>
__device__ inline ga_half theano_one() {return ga_float2half(1);}

/* Generic comparison wrappers: simply forward to the built-in operators. */
template<typename A, typename B> static __device__ inline bool theano_eq(const A& a, const B& b) {return a == b;}
template<typename A, typename B> static __device__ inline bool theano_ne(const A& a, const B& b) {return a != b;}
template<typename A, typename B> static __device__ inline bool theano_lt(const A& a, const B& b) {return a < b;}
template<typename A, typename B> static __device__ inline bool theano_gt(const A& a, const B& b) {return a > b;}
template<typename A, typename B> static __device__ inline bool theano_le(const A& a, const B& b) {return a <= b;}
template<typename A, typename B> static __device__ inline bool theano_ge(const A& a, const B& b) {return a >= b;}

/* Overloads where the left operand is ga_half: convert it to float first. */
template<typename T> static __device__ inline bool theano_eq(const ga_half& a, const T& b) {return ga_half2float(a) == b;}
template<typename T> static __device__ inline bool theano_ne(const ga_half& a, const T& b) {return ga_half2float(a) != b;}
template<typename T> static __device__ inline bool theano_lt(const ga_half& a, const T& b) {return ga_half2float(a) < b;}
template<typename T> static __device__ inline bool theano_gt(const ga_half& a, const T& b) {return ga_half2float(a) > b;}
template<typename T> static __device__ inline bool theano_le(const ga_half& a, const T& b) {return ga_half2float(a) <= b;}
template<typename T> static __device__ inline bool theano_ge(const ga_half& a, const T& b) {return ga_half2float(a) >= b;}

/* Overloads where the right operand is ga_half. */
template<typename T> static __device__ inline bool theano_eq(const T& a, const ga_half& b) {return a == ga_half2float(b);}
template<typename T> static __device__ inline bool theano_ne(const T& a, const ga_half& b) {return a != ga_half2float(b);}
template<typename T> static __device__ inline bool theano_lt(const T& a, const ga_half& b) {return a < ga_half2float(b);}
template<typename T> static __device__ inline bool theano_gt(const T& a, const ga_half& b) {return a > ga_half2float(b);}
template<typename T> static __device__ inline bool theano_le(const T& a, const ga_half& b) {return a <= ga_half2float(b);}
template<typename T> static __device__ inline bool theano_ge(const T& a, const ga_half& b) {return a >= ga_half2float(b);}

/* Overloads where both operands are ga_half: both sides are converted to
 * float, so the comparison follows float semantics. */
static __device__ inline bool theano_eq(const ga_half& a, const ga_half& b) {return ga_half2float(a) == ga_half2float(b);}
static __device__ inline bool theano_ne(const ga_half& a, const ga_half& b) {return ga_half2float(a) != ga_half2float(b);}
static __device__ inline bool theano_lt(const ga_half& a, const ga_half& b) {return ga_half2float(a) < ga_half2float(b);}
static __device__ inline bool theano_gt(const ga_half& a, const ga_half& b) {return ga_half2float(a) > ga_half2float(b);}
static __device__ inline bool theano_le(const ga_half& a, const ga_half& b) {return ga_half2float(a) <= ga_half2float(b);}
static __device__ inline bool theano_ge(const ga_half& a, const ga_half& b) {return ga_half2float(a) >= ga_half2float(b);}
...@@ -4,19 +4,24 @@ ...@@ -4,19 +4,24 @@
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS) #define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
// works when length on axis is within max allowed threads in block (1024) // works when length on axis is within max allowed threads in block (1024)
KERNEL void k_topk_dense( extern "C" __global__ void k_topk_dense(
$dims $dims
// size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM} // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
$dstv $dstv
// INPUT_TYPE *dstv // INPUT_TYPE *dstv
$dstv_offset
// size_t offset
$dstv_strides $dstv_strides
// ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM} // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
$dsti $dsti
// INDEX_TYPE *dsti // INDEX_TYPE *dsti
$dsti_offset
// size_t offset
$dsti_strides $dsti_strides
// ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM} // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
ssize_t k, ssize_t k,
INPUT_TYPE* src, INPUT_TYPE* src,
size_t src_offset,
$src_strides $src_strides
// ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM} // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
size_t size) { size_t size) {
...@@ -28,7 +33,6 @@ KERNEL void k_topk_dense( ...@@ -28,7 +33,6 @@ KERNEL void k_topk_dense(
size_t out_idx; size_t out_idx;
const unsigned char warp_id = idx / GA_WARP_SIZE; const unsigned char warp_id = idx / GA_WARP_SIZE;
// 0. get the slice for thread block to work on // 0. get the slice for thread block to work on
size_t gid = blockIdx.x, gidx; size_t gid = blockIdx.x, gidx;
...@@ -43,7 +47,7 @@ KERNEL void k_topk_dense( ...@@ -43,7 +47,7 @@ KERNEL void k_topk_dense(
//} //}
// get input and its radix friendly form // get input and its radix friendly form
const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : (INPUT_TYPE)0; const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : theano_zero<INPUT_TYPE>();
radix_t x = RadixConfig<INPUT_TYPE>::convert(xval); radix_t x = RadixConfig<INPUT_TYPE>::convert(xval);
// resolve negative k // resolve negative k
......
...@@ -15,19 +15,19 @@ __device__ DataType find_pattern(DataType* smem, ...@@ -15,19 +15,19 @@ __device__ DataType find_pattern(DataType* smem,
RadixType known_bits, RadixType known_bits,
RadixType known_bits_mask) { RadixType known_bits_mask) {
if (threadIdx.x < 32) if (threadIdx.x < 32)
smem[threadIdx.x] = 0; smem[threadIdx.x] = theano_zero<DataType>();
local_barrier(); local_barrier();
// All threads participate in the loop, in order to sync on the flag // All threads participate in the loop, in order to sync on the flag
for (CountType i = threadIdx.x; i < (slice_size + (CountType)blockDim.x-1); i += blockDim.x) { for (CountType i = threadIdx.x; i < (slice_size + (CountType)blockDim.x-1); i += blockDim.x) {
bool in_range = (i < slice_size); bool in_range = (i < slice_size);
DataType v = in_range ? ptr_read_cached(data, i*stride) : 0; DataType v = in_range ? ptr_read_cached(data, i*stride) : theano_zero<DataType>();
if (in_range && ((RadixConfig<DataType>::convert(v) & known_bits_mask) == known_bits)) { if (in_range && ((RadixConfig<DataType>::convert(v) & known_bits_mask) == known_bits)) {
// There should not be conflicts if we are using find_pattern, // There should not be conflicts if we are using find_pattern,
// since the result is unique // since the result is unique
smem[0] = 1; smem[0] = theano_one<DataType>();
smem[1] = v; // can't use val as the flag, since it could be 0 smem[1] = v; // can't use val as the flag, since it could be 0
} }
...@@ -39,10 +39,10 @@ __device__ DataType find_pattern(DataType* smem, ...@@ -39,10 +39,10 @@ __device__ DataType find_pattern(DataType* smem,
local_barrier(); local_barrier();
// Check to see if a thread found the value // Check to see if a thread found the value
if (found != 0) if (theano_ne(found, 0))
return val; return val;
} }
return 0; return theano_zero<DataType>();
} }
// This function counts the distribution of all input values in a // This function counts the distribution of all input values in a
...@@ -194,19 +194,24 @@ __device__ void radix_select(DataType* data, ...@@ -194,19 +194,24 @@ __device__ void radix_select(DataType* data,
*top_kth = RadixConfig<DataType>::deconvert(known_bits); *top_kth = RadixConfig<DataType>::deconvert(known_bits);
} }
KERNEL void KERNEL_NAME( extern "C" __global__ void KERNEL_NAME(
$dims $dims
// size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM} // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
$dstv $dstv
// INPUT_TYPE *dstv // INPUT_TYPE *dstv
$dstv_offset
// size_t offset
$dstv_strides $dstv_strides
// ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM} // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
$dsti $dsti
// INDEX_TYPE *dsti // INDEX_TYPE *dsti
$dsti_offset
// size_t offset
$dsti_strides $dsti_strides
// ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM} // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
ssize_t k, ssize_t k,
INPUT_TYPE* src, INPUT_TYPE* src,
size_t src_offset,
$src_strides $src_strides
// ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM} // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
size_t size) { size_t size) {
...@@ -255,12 +260,12 @@ KERNEL void KERNEL_NAME( ...@@ -255,12 +260,12 @@ KERNEL void KERNEL_NAME(
for (int i = idx; i < iter_bound; i += blockDim.x) { for (int i = idx; i < iter_bound; i += blockDim.x) {
bool in_range = (i < size); bool in_range = (i < size);
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0; INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : theano_zero<INPUT_TYPE>();
bool has_topk; bool has_topk;
if (order) { if (order) {
has_topk = in_range && (v > topkth_value); has_topk = in_range && (theano_gt(v, topkth_value));
} else { } else {
has_topk = in_range && (v < topkth_value); has_topk = in_range && (theano_lt(v, topkth_value));
} }
int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk); int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
...@@ -283,8 +288,8 @@ KERNEL void KERNEL_NAME( ...@@ -283,8 +288,8 @@ KERNEL void KERNEL_NAME(
for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) { for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
bool in_range = (i < size); bool in_range = (i < size);
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0; INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : theano_zero<INPUT_TYPE>();
bool has_topk = in_range && (v == topkth_value); bool has_topk = in_range && (theano_eq(v, topkth_value));
int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk); int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
int carry = smem[blockDim.x / 32 - 1]; int carry = smem[blockDim.x / 32 - 1];
......
...@@ -53,7 +53,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -53,7 +53,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
pygpu.get_include()] pygpu.get_include()]
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (4,)
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
# load kernel source # load kernel source
...@@ -77,21 +77,33 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -77,21 +77,33 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '') dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '')
set_slice_code = ''.join( set_slice_code = ''.join(
set_slice_code % dict(i=j) for j in range(1, ndim)) set_slice_code % dict(i=j) for j in range(1, ndim))
if self.return_values:
set_slice_code += """
dstv = ptr_add(dstv, dstv_offset);
"""
if self.return_indices:
set_slice_code += """
dsti = ptr_add(dsti, dsti_offset);
"""
set_slice_code += """
src = ptr_add(src, src_offset);
"""
flags = Kernel.get_flags(node.inputs[0].dtype) flags = Kernel.get_flags(node.inputs[0].dtype)
subs = dict( subs = dict(
inp_t=ga.dtype_to_ctype(node.inputs[0].dtype), inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
out_t=ga.dtype_to_ctype(self.idx_dtype), out_t=ga.dtype_to_ctype(self.idx_dtype),
dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)), dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)),
dstv='INPUT_TYPE *dstv,' if self.return_values else '', dstv='INPUT_TYPE *dstv,' if self.return_values else '',
dstv_offset='size_t dstv_offset,' if self.return_values else '',
dsti='INDEX_TYPE *dsti,' if self.return_indices else '', dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
dsti_offset='size_t dsti_offset,' if self.return_indices else '',
dstv_strides=dstv_strides_code if self.return_values else '', dstv_strides=dstv_strides_code if self.return_values else '',
dsti_strides=dsti_strides_code if self.return_indices else '', dsti_strides=dsti_strides_code if self.return_indices else '',
src_strides=src_strides_code, src_strides=src_strides_code,
set_slice=set_slice_code, set_slice=set_slice_code,
write_value=int(self.return_values), write_value=int(self.return_values),
write_index=int(self.return_indices), write_index=int(self.return_indices),
ndim=str(ndim), ndim=str(ndim)
use_half=int(node.inputs[0].dtype == 'float16')
) )
elif device_type == b'opencl': elif device_type == b'opencl':
raise NotImplementedError() raise NotImplementedError()
...@@ -100,9 +112,11 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -100,9 +112,11 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
param_types = [ga.SIZE] * (ndim - 1) # dims param_types = [ga.SIZE] * (ndim - 1) # dims
for _ in range(self.return_values + self.return_indices): for _ in range(self.return_values + self.return_indices):
param_types.append(ga.GpuArray) # dst* param_types.append(ga.GpuArray) # dst*
param_types.append(ga.SIZE) # offset
param_types.extend([ga.SSIZE] * ndim) # dst*_strides param_types.extend([ga.SSIZE] * ndim) # dst*_strides
param_types.append(ga.SIZE) # k param_types.append(ga.SIZE) # k
param_types.append(ga.GpuArray) # src param_types.append(ga.GpuArray) # src
param_types.append(ga.SIZE) # offset
param_types.extend([ga.SSIZE] * ndim) # src_strides param_types.extend([ga.SSIZE] * ndim) # src_strides
param_types.append(ga.SIZE) # size param_types.append(ga.SIZE) # size
...@@ -120,7 +134,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -120,7 +134,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
) as f: ) as f:
kernel_src = f.read() kernel_src = f.read()
ker = Kernel( ker = Kernel(
code=Template(common_src + kernel_src).substitute(**subs), code=("#include <cluda.h>\n" +
Template(common_src + kernel_src).substitute(**subs)),
name=kname, name=kname,
params=param_types, params=param_types,
flags=flags, flags=flags,
...@@ -159,7 +174,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -159,7 +174,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
ctx = sub['params'] ctx = sub['params']
k_dtype = node.inputs[1].type.dtype_specs()[1] k_dtype = node.inputs[1].type.dtype_specs()[1]
# max threads per block # max threads per block
MAX_TPB = context.maxlsize MAX_TPB = context.maxlsize0
# max blocks per grid # max blocks per grid
MAX_BPG = context.maxgsize0 MAX_BPG = context.maxgsize0
WARP_SIZE = 32 WARP_SIZE = 32
...@@ -169,12 +184,12 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -169,12 +184,12 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
axis = self.axis % ndim axis = self.axis % ndim
del(reordered_axes[axis]) del(reordered_axes[axis])
reordered_axes = [axis] + reordered_axes reordered_axes = [axis] + reordered_axes
dims = ''.join('(void*)(dims+%d), ' % i for i in reordered_axes[1:]) dims = ''.join('dims[%d], ' % i for i in reordered_axes[1:])
prep_output = '' prep_output = ''
if self.return_values: if self.return_values:
def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv
params_dv = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % (yv, yv) params_dv = '%s->ga.data, %s->ga.offset,\n' % (yv, yv)
params_dv += ''.join('(void*)(dvstrides+%d), ' % i for i in reordered_axes) params_dv += ''.join('dvstrides[%d], ' % i for i in reordered_axes)
prep_output += ''' prep_output += '''
if (0 != theano_prep_output( if (0 != theano_prep_output(
&%(yv)s, %(ndim)d, odims, &%(yv)s, %(ndim)d, odims,
...@@ -186,8 +201,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -186,8 +201,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
if self.return_indices: if self.return_indices:
def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi
params_di = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % yi params_di = '%s->ga.data, %s->ga.offset,\n' % (yi, yi)
params_di += ''.join('(void*)(distrides+%d), ' % i for i in reordered_axes) params_di += ''.join('distrides[%d], ' % i for i in reordered_axes)
prep_output += ''' prep_output += '''
if (0 != theano_prep_output( if (0 != theano_prep_output(
&%(yi)s, %(ndim)d, odims, &%(yi)s, %(ndim)d, odims,
...@@ -196,7 +211,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -196,7 +211,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
}\n''' % locals() }\n''' % locals()
else: else:
def_distrides = params_di = '' def_distrides = params_di = ''
sstrides = ', '.join('(void*)(sstrides+%d)' % i for i in reordered_axes) sstrides = ', '.join('sstrides[%d]' % i for i in reordered_axes)
code = ''' code = '''
{ {
const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0]; const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
...@@ -240,31 +255,46 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -240,31 +255,46 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
%(def_dvstrides)s; %(def_dvstrides)s;
%(def_distrides)s; %(def_distrides)s;
const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s); const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);
void* args[] = {
%(dims)s
%(params_dv)s
%(params_di)s
(void*)(&k_),
(void*)((char*)(%(x)s->ga.data) + (%(x)s->ga.offset)),
%(sstrides)s,
(void*)(dims+%(axis)d),
};
int err; int err;
if (dims[%(axis)d] > (1u << 31)) { if (dims[%(axis)d] > (1u << 31)) {
block_size = %(MAX_TPB)d; block_size = %(MAX_TPB)d;
err = GpuKernel_call( err = k_topk_dense_xlarge_call(
&k_topk_dense_xlarge%(nodename)s, 1, 1, &grid_size, &block_size, 0,
&grid_size, &block_size, 0, args); %(dims)s
%(params_dv)s
%(params_di)s
k_,
%(x)s->ga.data,
%(x)s->ga.offset,
%(sstrides)s,
dims[%(axis)d]
);
} else if (block_size > %(MAX_TPB)d) { } else if (block_size > %(MAX_TPB)d) {
block_size = %(MAX_TPB)d; block_size = %(MAX_TPB)d;
err = GpuKernel_call( err = k_topk_dense_large_call(
&k_topk_dense_large%(nodename)s, 1, 1, &grid_size, &block_size, 0,
&grid_size, &block_size, 0, args); %(dims)s
%(params_dv)s
%(params_di)s
k_,
%(x)s->ga.data,
%(x)s->ga.offset,
%(sstrides)s,
dims[%(axis)d]
);
} else { } else {
err = GpuKernel_call( err = k_topk_dense_call(
&k_topk_dense%(nodename)s, 1, 1, &grid_size, &block_size, 0,
&grid_size, &block_size, 0, args); %(dims)s
%(params_dv)s
%(params_di)s
k_,
%(x)s->ga.data,
%(x)s->ga.offset,
%(sstrides)s,
dims[%(axis)d]
);
} }
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_SetString( PyErr_SetString(
......
from __future__ import absolute_import, print_function, division
import theano
import theano.tensor.tests.test_sort
from .config import mode_with_gpu
from ..sort import GpuTopKOp
class Test_GpuTopK(theano.tensor.tests.test_sort.Test_TopK):
    """Run the CPU TopK/ArgTopK test suite against the GPU implementation."""
    # Compile test functions with the GPU-enabled mode so GpuTopKOp is selected.
    mode = mode_with_gpu
    # dtype used by the base test suite when generating test data.
    dtype = 'float32'
    # Op class the base tests assert is present in the compiled graph.
    op_class = GpuTopKOp
...@@ -7563,6 +7563,7 @@ def local_useless_topk(node): ...@@ -7563,6 +7563,7 @@ def local_useless_topk(node):
old_output = node.outputs[ret_idx] old_output = node.outputs[ret_idx]
new_output = TopKOp( new_output = TopKOp(
axis=op.axis, axis=op.axis,
sorted=op.sorted,
idx_dtype=op.idx_dtype, idx_dtype=op.idx_dtype,
return_values=ret_val, return_values=ret_val,
return_indices=ret_idx)(x, k) return_indices=ret_idx)(x, k)
......
...@@ -232,6 +232,8 @@ def test_argsort_grad(): ...@@ -232,6 +232,8 @@ def test_argsort_grad():
class Test_TopK(unittest.TestCase): class Test_TopK(unittest.TestCase):
mode = None
op_class = TopKOp
def setUp(self): def setUp(self):
pass pass
...@@ -240,7 +242,10 @@ class Test_TopK(unittest.TestCase): ...@@ -240,7 +242,10 @@ class Test_TopK(unittest.TestCase):
_all_dtypes, tensor.integer_dtypes, [-1, 0, None], [False])) _all_dtypes, tensor.integer_dtypes, [-1, 0, None], [False]))
def test_argtopk_sanity(self, dtype, idx_dtype, axis, sorted): def test_argtopk_sanity(self, dtype, idx_dtype, axis, sorted):
x = tensor.vector(name='x', dtype=dtype) x = tensor.vector(name='x', dtype=dtype)
fn = theano.function([x], argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype)) fn = theano.function([x],
argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype),
mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
xval = np.asarray([1]).astype(dtype) xval = np.asarray([1]).astype(dtype)
yval = fn(xval) yval = fn(xval)
assert yval == np.asarray([0], dtype=idx_dtype) assert yval == np.asarray([0], dtype=idx_dtype)
...@@ -250,7 +255,8 @@ class Test_TopK(unittest.TestCase): ...@@ -250,7 +255,8 @@ class Test_TopK(unittest.TestCase):
_all_dtypes, [-1, 0, None], [False])) _all_dtypes, [-1, 0, None], [False]))
def test_topk_sanity(self, dtype, axis, sorted): def test_topk_sanity(self, dtype, axis, sorted):
x = tensor.vector(name='x', dtype=dtype) x = tensor.vector(name='x', dtype=dtype)
fn = theano.function([x], topk(x, 1, axis=axis, sorted=sorted)) fn = theano.function([x], topk(x, 1, axis=axis, sorted=sorted), mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
xval = np.asarray([1]).astype(dtype) xval = np.asarray([1]).astype(dtype)
yval = fn(xval) yval = fn(xval)
assert yval == xval assert yval == xval
...@@ -261,7 +267,8 @@ class Test_TopK(unittest.TestCase): ...@@ -261,7 +267,8 @@ class Test_TopK(unittest.TestCase):
def test_combined_sanity(self, dtype, idx_dtype, axis, sorted): def test_combined_sanity(self, dtype, idx_dtype, axis, sorted):
x = tensor.vector(name='x', dtype=dtype) x = tensor.vector(name='x', dtype=dtype)
yv, yi = topk_and_argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype) yv, yi = topk_and_argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype)
fn = theano.function([x], [yv, yi]) fn = theano.function([x], [yv, yi], mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
xval = np.asarray([1]).astype(dtype) xval = np.asarray([1]).astype(dtype)
yvval, yival = fn(xval) yvval, yival = fn(xval)
assert yival == np.asarray([0], dtype=idx_dtype) assert yival == np.asarray([0], dtype=idx_dtype)
...@@ -282,7 +289,8 @@ class Test_TopK(unittest.TestCase): ...@@ -282,7 +289,8 @@ class Test_TopK(unittest.TestCase):
x = theano.tensor.vector(name='x', dtype=dtype) x = theano.tensor.vector(name='x', dtype=dtype)
y = topk(x, k, sorted=sorted) y = topk(x, k, sorted=sorted)
fn = theano.function([x], y) fn = theano.function([x], y, mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
# assert local_useless_topk opt is done properly # assert local_useless_topk opt is done properly
assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs) assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs)
...@@ -293,7 +301,7 @@ class Test_TopK(unittest.TestCase): ...@@ -293,7 +301,7 @@ class Test_TopK(unittest.TestCase):
goal = np.sort(xval)[idx] goal = np.sort(xval)[idx]
assert yval.dtype == goal.dtype assert yval.dtype == goal.dtype
utt.assert_allclose(np.sort(yval), goal) utt.assert_allclose(goal, np.sort(yval))
@utt.parameterized.expand(chain( @utt.parameterized.expand(chain(
product( product(
...@@ -309,7 +317,8 @@ class Test_TopK(unittest.TestCase): ...@@ -309,7 +317,8 @@ class Test_TopK(unittest.TestCase):
x = theano.tensor.vector(name='x', dtype=dtype) x = theano.tensor.vector(name='x', dtype=dtype)
y = argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype) y = argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype)
fn = theano.function([x], y) fn = theano.function([x], y, mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
# assert local_useless_topk opt is done properly # assert local_useless_topk opt is done properly
assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs) assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs)
...@@ -337,7 +346,8 @@ class Test_TopK(unittest.TestCase): ...@@ -337,7 +346,8 @@ class Test_TopK(unittest.TestCase):
x = theano.tensor.vector(name='x', dtype=dtype) x = theano.tensor.vector(name='x', dtype=dtype)
yv, yi = topk_and_argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype) yv, yi = topk_and_argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype)
fn = theano.function([x], [yv, yi]) fn = theano.function([x], [yv, yi], mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
# generate a all-unique array # generate a all-unique array
xval = gen_unique_vector(size, dtype) xval = gen_unique_vector(size, dtype)
yvval, yival = fn(xval) yvval, yival = fn(xval)
...@@ -363,7 +373,8 @@ class Test_TopK(unittest.TestCase): ...@@ -363,7 +373,8 @@ class Test_TopK(unittest.TestCase):
x = theano.tensor.vector(name='x', dtype=dtype) x = theano.tensor.vector(name='x', dtype=dtype)
y = argtopk(x, k, sorted=sorted, idx_dtype='int32') y = argtopk(x, k, sorted=sorted, idx_dtype='int32')
fn = theano.function([x], y) fn = theano.function([x], y, mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
xval = np.repeat(np.random.uniform(-100., 100., size=size // 2).astype(dtype), 2) xval = np.repeat(np.random.uniform(-100., 100., size=size // 2).astype(dtype), 2)
xval = xval[np.random.permutation(size)] xval = xval[np.random.permutation(size)]
yval = fn(xval) yval = fn(xval)
...@@ -372,7 +383,7 @@ class Test_TopK(unittest.TestCase): ...@@ -372,7 +383,7 @@ class Test_TopK(unittest.TestCase):
utt.assert_allclose(np.sort(xval[yval]), np.sort(xval[goal])) utt.assert_allclose(np.sort(xval[yval]), np.sort(xval[goal]))
@utt.parameterized.expand(product( @utt.parameterized.expand(product(
((17, 15), (2, 3, 5, 7, 11), (2017, 5, 3)), ((17, 15), (2, 3, 5, 7, 11), (500, 5, 3)), # NB: Test may fail with bigger sizes (e.g. (2017, 5, 3)) due to "too many resources requested" kernel error on some GPUs.
(-1, '(1+n)//2', '-n', '1-n'), (-1, '(1+n)//2', '-n', '1-n'),
('float32', 'int32'), ('float32', 'int32'),
(False,), (False,),
...@@ -391,7 +402,8 @@ class Test_TopK(unittest.TestCase): ...@@ -391,7 +402,8 @@ class Test_TopK(unittest.TestCase):
x = theano.tensor.tensor( x = theano.tensor.tensor(
name='x', broadcastable=(False,) * len(shp), dtype=dtype) name='x', broadcastable=(False,) * len(shp), dtype=dtype)
y = argtopk(x, k, axis=axis, sorted=sorted, idx_dtype=idx_dtype) y = argtopk(x, k, axis=axis, sorted=sorted, idx_dtype=idx_dtype)
fn = theano.function([x], y) fn = theano.function([x], y, mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
size = reduce(int.__mul__, shp) size = reduce(int.__mul__, shp)
xval = gen_unique_vector(size, dtype).reshape(shp) xval = gen_unique_vector(size, dtype).reshape(shp)
yval = fn(xval) yval = fn(xval)
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment