提交 375fec9a authored 作者: notoraptor's avatar notoraptor

Try to fix gputopk: add ga_half handling to template code.

上级 55f013b9
......@@ -187,6 +187,21 @@ struct RadixConfig<char> {
}
};
// g++ makes difference between 'signed char' (ga_byte, int8) and 'char'.
// Same code as for char.
// Radix conversion for ga_byte (int8). g++ distinguishes 'signed char'
// from plain 'char', so this specialization mirrors RadixConfig<char>:
// bias the signed value by 128 so that unsigned radix order matches
// signed numeric order.
template <>
struct RadixConfig<ga_byte> {
  typedef unsigned int RadixType;
  static inline __device__ RadixType convert(ga_byte value) {
    // Shift [-128, 127] onto [0, 255]; order-preserving.
    return 128u + value;
  }
  static inline __device__ ga_byte deconvert(RadixType bits) {
    // Inverse of convert: remove the +128 bias.
    return bits - 128;
  }
};
template <>
struct RadixConfig<short> {
typedef unsigned int RadixType;
......@@ -229,25 +244,27 @@ struct RadixConfig<long long> {
}
};
/* NB: This specialization for ga_half does know that ga_half is a struct
 * with only one member of type ga_ushort.
 * So, if ga_half implementation changes, this code should change too.
 * TODO: Maybe should gpuarray provide abstract functions to manipulate
 * ga_half internal structure? e.g:
 * unsigned short ga_half2bits(ga_half value);
 * ga_half ga_bits2half(unsigned short bits);
 */
template <>
struct RadixConfig<ga_half> {
  typedef unsigned int RadixType;
  // Map the half-float bit pattern to an unsigned integer whose natural
  // ordering matches numeric ordering: flip only the sign bit for
  // non-negative values, flip every bit for negative values.
  static inline __device__ RadixType convert(ga_half v) {
    RadixType mask = -(((RadixType)v.data >> 15)) | 0x8000;
    return (v.data ^ mask);
  }
  // Inverse of convert: recover the original half-float bit pattern.
  static inline __device__ ga_half deconvert(RadixType v) {
    RadixType mask = ((v >> 15) - 1) | 0x8000;
    ga_half out = {(unsigned short)(v ^ mask)};
    return out;
  }
};
// $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it
......@@ -356,3 +373,61 @@ static __device__ inline T ptr_read_cached(T *ptr, ssize_t offset) {
return __ldg(((T*)((char*)ptr + offset)));
}
/* NB: __ldg is not defined for ga_half, so we must specialize ptr_read_cached.
 * To do it, we reinterpret the ga_half storage as a built-in type that should
 * have the same size as ga_half.
 * Based on current ga_half implementation (2017/11/27), it should be ga_ushort.
 * This code must be updated every time ga_half implementation size changes,
 * until a better code be provided. */
#define GA_HALF_STD_TYPE ga_ushort
static __device__ inline ga_half ptr_read_cached(ga_half *ptr, ssize_t offset) {
    /* Compile-time size check: if sizeof(GA_HALF_STD_TYPE) != sizeof(ga_half),
     * the array size below is -1 and compilation fails. Using a typedef
     * (instead of declaring a local variable) keeps the check purely
     * compile-time and avoids an unused-variable in the kernel. */
    typedef int check_ga_half_std_type[ ( ( sizeof(GA_HALF_STD_TYPE) - sizeof(ga_half) ) ? -1 : 1 ) ];
    /* Load through the read-only data cache, then copy the raw bits back
     * into a ga_half. */
    GA_HALF_STD_TYPE out = __ldg(((GA_HALF_STD_TYPE*)((char*)ptr + offset)));
    ga_half real_out;
    *(GA_HALF_STD_TYPE*)(&real_out) = out;
    return real_out;
}
#undef GA_HALF_STD_TYPE
/* ga_half supports neither the built-in comparison operators nor implicit
 * conversion from integer literals (e.g. 0, 1), which breaks generic template
 * code. The helpers below work around this: generic versions forward to the
 * built-in operators, while the ga_half overloads convert through
 * ga_half2float / ga_float2half first. */
template<typename T>
static __device__ inline T theano_zero() { return 0; }
template<>
__device__ inline ga_half theano_zero() { return ga_float2half(0); }
template<typename T>
static __device__ inline T theano_one() { return 1; }
template<>
__device__ inline ga_half theano_one() { return ga_float2half(1); }
// Generic comparisons: defer directly to the underlying operators.
template<typename X, typename Y> static __device__ inline bool theano_eq(const X& x, const Y& y) { return x == y; }
template<typename X, typename Y> static __device__ inline bool theano_ne(const X& x, const Y& y) { return x != y; }
template<typename X, typename Y> static __device__ inline bool theano_lt(const X& x, const Y& y) { return x < y; }
template<typename X, typename Y> static __device__ inline bool theano_gt(const X& x, const Y& y) { return x > y; }
template<typename X, typename Y> static __device__ inline bool theano_le(const X& x, const Y& y) { return x <= y; }
template<typename X, typename Y> static __device__ inline bool theano_ge(const X& x, const Y& y) { return x >= y; }
// ga_half on the left-hand side: compare as float.
template<typename Y> static __device__ inline bool theano_eq(const ga_half& x, const Y& y) { return ga_half2float(x) == y; }
template<typename Y> static __device__ inline bool theano_ne(const ga_half& x, const Y& y) { return ga_half2float(x) != y; }
template<typename Y> static __device__ inline bool theano_lt(const ga_half& x, const Y& y) { return ga_half2float(x) < y; }
template<typename Y> static __device__ inline bool theano_gt(const ga_half& x, const Y& y) { return ga_half2float(x) > y; }
template<typename Y> static __device__ inline bool theano_le(const ga_half& x, const Y& y) { return ga_half2float(x) <= y; }
template<typename Y> static __device__ inline bool theano_ge(const ga_half& x, const Y& y) { return ga_half2float(x) >= y; }
// ga_half on the right-hand side: compare as float.
template<typename X> static __device__ inline bool theano_eq(const X& x, const ga_half& y) { return x == ga_half2float(y); }
template<typename X> static __device__ inline bool theano_ne(const X& x, const ga_half& y) { return x != ga_half2float(y); }
template<typename X> static __device__ inline bool theano_lt(const X& x, const ga_half& y) { return x < ga_half2float(y); }
template<typename X> static __device__ inline bool theano_gt(const X& x, const ga_half& y) { return x > ga_half2float(y); }
template<typename X> static __device__ inline bool theano_le(const X& x, const ga_half& y) { return x <= ga_half2float(y); }
template<typename X> static __device__ inline bool theano_ge(const X& x, const ga_half& y) { return x >= ga_half2float(y); }
// ga_half on both sides: exact (non-template) overloads, preferred by
// overload resolution over the templates above.
static __device__ inline bool theano_eq(const ga_half& x, const ga_half& y) { return ga_half2float(x) == ga_half2float(y); }
static __device__ inline bool theano_ne(const ga_half& x, const ga_half& y) { return ga_half2float(x) != ga_half2float(y); }
static __device__ inline bool theano_lt(const ga_half& x, const ga_half& y) { return ga_half2float(x) < ga_half2float(y); }
static __device__ inline bool theano_gt(const ga_half& x, const ga_half& y) { return ga_half2float(x) > ga_half2float(y); }
static __device__ inline bool theano_le(const ga_half& x, const ga_half& y) { return ga_half2float(x) <= ga_half2float(y); }
static __device__ inline bool theano_ge(const ga_half& x, const ga_half& y) { return ga_half2float(x) >= ga_half2float(y); }
......@@ -47,7 +47,7 @@ extern "C" __global__ void k_topk_dense(
//}
// get input and its radix friendly form
const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : (INPUT_TYPE)0;
const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : theano_zero<INPUT_TYPE>();
radix_t x = RadixConfig<INPUT_TYPE>::convert(xval);
// resolve negative k
......
......@@ -15,19 +15,19 @@ __device__ DataType find_pattern(DataType* smem,
RadixType known_bits,
RadixType known_bits_mask) {
if (threadIdx.x < 32)
smem[threadIdx.x] = 0;
smem[threadIdx.x] = theano_zero<DataType>();
local_barrier();
// All threads participate in the loop, in order to sync on the flag
for (CountType i = threadIdx.x; i < (slice_size + (CountType)blockDim.x-1); i += blockDim.x) {
bool in_range = (i < slice_size);
DataType v = in_range ? ptr_read_cached(data, i*stride) : 0;
DataType v = in_range ? ptr_read_cached(data, i*stride) : theano_zero<DataType>();
if (in_range && ((RadixConfig<DataType>::convert(v) & known_bits_mask) == known_bits)) {
// There should not be conflicts if we are using find_pattern,
// since the result is unique
smem[0] = 1;
smem[0] = theano_one<DataType>();
smem[1] = v; // can't use val as the flag, since it could be 0
}
......@@ -39,10 +39,10 @@ __device__ DataType find_pattern(DataType* smem,
local_barrier();
// Check to see if a thread found the value
if (found != 0)
if (theano_ne(found, 0))
return val;
}
return 0;
return theano_zero<DataType>();
}
// This function counts the distribution of all input values in a
......@@ -260,12 +260,12 @@ extern "C" __global__ void KERNEL_NAME(
for (int i = idx; i < iter_bound; i += blockDim.x) {
bool in_range = (i < size);
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0;
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : theano_zero<INPUT_TYPE>();
bool has_topk;
if (order) {
has_topk = in_range && (v > topkth_value);
has_topk = in_range && (theano_gt(v, topkth_value));
} else {
has_topk = in_range && (v < topkth_value);
has_topk = in_range && (theano_lt(v, topkth_value));
}
int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
......@@ -288,8 +288,8 @@ extern "C" __global__ void KERNEL_NAME(
for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
bool in_range = (i < size);
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0;
bool has_topk = in_range && (v == topkth_value);
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : theano_zero<INPUT_TYPE>();
bool has_topk = in_range && (theano_eq(v, topkth_value));
int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
int carry = smem[blockDim.x / 32 - 1];
......
......@@ -53,7 +53,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
pygpu.get_include()]
def c_code_cache_version(self):
    # Version tag for Theano's compiled-module cache. The diff residue left
    # two return statements here ((3,) then (4,)); only the post-commit
    # value is kept. Bumped to (4,) because the generated CUDA template
    # code changed (ga_half handling), so stale binaries must be rebuilt.
    return (4,)
def gpu_kernels(self, node, nodename):
# load kernel source
......@@ -103,8 +103,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
set_slice=set_slice_code,
write_value=int(self.return_values),
write_index=int(self.return_indices),
ndim=str(ndim),
use_half=int(node.inputs[0].dtype == 'float16')
ndim=str(ndim)
)
elif device_type == b'opencl':
raise NotImplementedError()
......
......@@ -7563,6 +7563,7 @@ def local_useless_topk(node):
old_output = node.outputs[ret_idx]
new_output = TopKOp(
axis=op.axis,
sorted=op.sorted,
idx_dtype=op.idx_dtype,
return_values=ret_val,
return_indices=ret_idx)(x, k)
......
......@@ -301,7 +301,7 @@ class Test_TopK(unittest.TestCase):
goal = np.sort(xval)[idx]
assert yval.dtype == goal.dtype
utt.assert_allclose(np.sort(yval), goal)
utt.assert_allclose(goal, np.sort(yval))
@utt.parameterized.expand(chain(
product(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论