提交 375fec9a authored 作者: notoraptor's avatar notoraptor

Try to fix gputopk: add ga_half handling to template code.

上级 55f013b9
......@@ -187,6 +187,21 @@ struct RadixConfig<char> {
}
};
// g++ makes difference between 'signed char' (ga_byte, int8) and 'char'.
// Same code as for char.
// Radix conversion for ga_byte (int8). g++ distinguishes 'signed char'
// from plain 'char', so this specialization mirrors RadixConfig<char>:
// bias the signed value by 128 so that unsigned radix order matches
// signed numeric order.
template <>
struct RadixConfig<ga_byte> {
  typedef unsigned int RadixType;
  static inline __device__ RadixType convert(ga_byte value) {
    // Shift [-128, 127] onto [0, 255]; order-preserving.
    return 128u + value;
  }
  static inline __device__ ga_byte deconvert(RadixType bits) {
    // Inverse of convert: remove the +128 bias.
    return bits - 128;
  }
};
template <>
struct RadixConfig<short> {
typedef unsigned int RadixType;
......@@ -229,25 +244,27 @@ struct RadixConfig<long long> {
}
};
/* NB: This specialization for ga_half does know that ga_half is a struct
 * with only one member of type ga_ushort.
 * So, if ga_half implementation changes, this code should change too.
 * TODO: Maybe should gpuarray provide abstract functions to manipulate
 * ga_half internal structure? e.g:
 * unsigned short ga_half2bits(ga_half value);
 * ga_half ga_bits2half(unsigned short bits);
 */
template <>
struct RadixConfig<ga_half> {
  typedef unsigned int RadixType;
  // Map the half-float bit pattern to an unsigned integer whose natural
  // ordering matches numeric ordering: flip only the sign bit for
  // non-negative values, flip every bit for negative values.
  static inline __device__ RadixType convert(ga_half v) {
    RadixType mask = -(((RadixType)v.data >> 15)) | 0x8000;
    return (v.data ^ mask);
  }
  // Inverse of convert: recover the original half-float bit pattern.
  static inline __device__ ga_half deconvert(RadixType v) {
    RadixType mask = ((v >> 15) - 1) | 0x8000;
    ga_half out = {(unsigned short)(v ^ mask)};
    return out;
  }
};
// $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it
......@@ -356,3 +373,61 @@ static __device__ inline T ptr_read_cached(T *ptr, ssize_t offset) {
return __ldg(((T*)((char*)ptr + offset)));
}
/* NB: __ldg is not defined for ga_half, so we must specialize ptr_read_cached.
 * To do it, we reinterpret the ga_half storage as a built-in type that should
 * have the same size as ga_half.
 * Based on current ga_half implementation (2017/11/27), it should be ga_ushort.
 * This code must be updated every time ga_half implementation size changes,
 * until a better code be provided. */
#define GA_HALF_STD_TYPE ga_ushort
static __device__ inline ga_half ptr_read_cached(ga_half *ptr, ssize_t offset) {
    /* Compile-time size check: if sizeof(GA_HALF_STD_TYPE) != sizeof(ga_half),
     * the array size below is -1 and compilation fails. Using a typedef
     * (instead of declaring a local variable) keeps the check purely
     * compile-time and avoids an unused-variable in the kernel. */
    typedef int check_ga_half_std_type[ ( ( sizeof(GA_HALF_STD_TYPE) - sizeof(ga_half) ) ? -1 : 1 ) ];
    /* Load through the read-only data cache, then copy the raw bits back
     * into a ga_half. */
    GA_HALF_STD_TYPE out = __ldg(((GA_HALF_STD_TYPE*)((char*)ptr + offset)));
    ga_half real_out;
    *(GA_HALF_STD_TYPE*)(&real_out) = out;
    return real_out;
}
#undef GA_HALF_STD_TYPE
/* ga_half supports neither the built-in comparison operators nor implicit
 * conversion from integer literals (e.g. 0, 1), which breaks generic template
 * code. The helpers below work around this: generic versions forward to the
 * built-in operators, while the ga_half overloads convert through
 * ga_half2float / ga_float2half first. */
template<typename T>
static __device__ inline T theano_zero() { return 0; }
template<>
__device__ inline ga_half theano_zero() { return ga_float2half(0); }
template<typename T>
static __device__ inline T theano_one() { return 1; }
template<>
__device__ inline ga_half theano_one() { return ga_float2half(1); }
// Generic comparisons: defer directly to the underlying operators.
template<typename X, typename Y> static __device__ inline bool theano_eq(const X& x, const Y& y) { return x == y; }
template<typename X, typename Y> static __device__ inline bool theano_ne(const X& x, const Y& y) { return x != y; }
template<typename X, typename Y> static __device__ inline bool theano_lt(const X& x, const Y& y) { return x < y; }
template<typename X, typename Y> static __device__ inline bool theano_gt(const X& x, const Y& y) { return x > y; }
template<typename X, typename Y> static __device__ inline bool theano_le(const X& x, const Y& y) { return x <= y; }
template<typename X, typename Y> static __device__ inline bool theano_ge(const X& x, const Y& y) { return x >= y; }
// ga_half on the left-hand side: compare as float.
template<typename Y> static __device__ inline bool theano_eq(const ga_half& x, const Y& y) { return ga_half2float(x) == y; }
template<typename Y> static __device__ inline bool theano_ne(const ga_half& x, const Y& y) { return ga_half2float(x) != y; }
template<typename Y> static __device__ inline bool theano_lt(const ga_half& x, const Y& y) { return ga_half2float(x) < y; }
template<typename Y> static __device__ inline bool theano_gt(const ga_half& x, const Y& y) { return ga_half2float(x) > y; }
template<typename Y> static __device__ inline bool theano_le(const ga_half& x, const Y& y) { return ga_half2float(x) <= y; }
template<typename Y> static __device__ inline bool theano_ge(const ga_half& x, const Y& y) { return ga_half2float(x) >= y; }
// ga_half on the right-hand side: compare as float.
template<typename X> static __device__ inline bool theano_eq(const X& x, const ga_half& y) { return x == ga_half2float(y); }
template<typename X> static __device__ inline bool theano_ne(const X& x, const ga_half& y) { return x != ga_half2float(y); }
template<typename X> static __device__ inline bool theano_lt(const X& x, const ga_half& y) { return x < ga_half2float(y); }
template<typename X> static __device__ inline bool theano_gt(const X& x, const ga_half& y) { return x > ga_half2float(y); }
template<typename X> static __device__ inline bool theano_le(const X& x, const ga_half& y) { return x <= ga_half2float(y); }
template<typename X> static __device__ inline bool theano_ge(const X& x, const ga_half& y) { return x >= ga_half2float(y); }
// ga_half on both sides: exact (non-template) overloads, preferred by
// overload resolution over the templates above.
static __device__ inline bool theano_eq(const ga_half& x, const ga_half& y) { return ga_half2float(x) == ga_half2float(y); }
static __device__ inline bool theano_ne(const ga_half& x, const ga_half& y) { return ga_half2float(x) != ga_half2float(y); }
static __device__ inline bool theano_lt(const ga_half& x, const ga_half& y) { return ga_half2float(x) < ga_half2float(y); }
static __device__ inline bool theano_gt(const ga_half& x, const ga_half& y) { return ga_half2float(x) > ga_half2float(y); }
static __device__ inline bool theano_le(const ga_half& x, const ga_half& y) { return ga_half2float(x) <= ga_half2float(y); }
static __device__ inline bool theano_ge(const ga_half& x, const ga_half& y) { return ga_half2float(x) >= ga_half2float(y); }
......@@ -47,7 +47,7 @@ extern "C" __global__ void k_topk_dense(
//}
// get input and its radix friendly form
const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : (INPUT_TYPE)0;
const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : theano_zero<INPUT_TYPE>();
radix_t x = RadixConfig<INPUT_TYPE>::convert(xval);
// resolve negative k
......
......@@ -15,19 +15,19 @@ __device__ DataType find_pattern(DataType* smem,
RadixType known_bits,
RadixType known_bits_mask) {
if (threadIdx.x < 32)
smem[threadIdx.x] = 0;
smem[threadIdx.x] = theano_zero<DataType>();
local_barrier();
// All threads participate in the loop, in order to sync on the flag
for (CountType i = threadIdx.x; i < (slice_size + (CountType)blockDim.x-1); i += blockDim.x) {
bool in_range = (i < slice_size);
DataType v = in_range ? ptr_read_cached(data, i*stride) : 0;
DataType v = in_range ? ptr_read_cached(data, i*stride) : theano_zero<DataType>();
if (in_range && ((RadixConfig<DataType>::convert(v) & known_bits_mask) == known_bits)) {
// There should not be conflicts if we are using find_pattern,
// since the result is unique
smem[0] = 1;
smem[0] = theano_one<DataType>();
smem[1] = v; // can't use val as the flag, since it could be 0
}
......@@ -39,10 +39,10 @@ __device__ DataType find_pattern(DataType* smem,
local_barrier();
// Check to see if a thread found the value
if (found != 0)
if (theano_ne(found, 0))
return val;
}
return 0;
return theano_zero<DataType>();
}
// This function counts the distribution of all input values in a
......@@ -260,12 +260,12 @@ extern "C" __global__ void KERNEL_NAME(
for (int i = idx; i < iter_bound; i += blockDim.x) {
bool in_range = (i < size);
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0;
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : theano_zero<INPUT_TYPE>();
bool has_topk;
if (order) {
has_topk = in_range && (v > topkth_value);
has_topk = in_range && (theano_gt(v, topkth_value));
} else {
has_topk = in_range && (v < topkth_value);
has_topk = in_range && (theano_lt(v, topkth_value));
}
int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
......@@ -288,8 +288,8 @@ extern "C" __global__ void KERNEL_NAME(
for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
bool in_range = (i < size);
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0;
bool has_topk = in_range && (v == topkth_value);
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : theano_zero<INPUT_TYPE>();
bool has_topk = in_range && (theano_eq(v, topkth_value));
int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
int carry = smem[blockDim.x / 32 - 1];
......
......@@ -53,7 +53,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
pygpu.get_include()]
def c_code_cache_version(self):
    # Version tag for Theano's compiled-module cache. The diff residue left
    # two return statements here ((3,) then (4,)); only the post-commit
    # value is kept. Bumped to (4,) because the generated CUDA template
    # code changed (ga_half handling), so stale binaries must be rebuilt.
    return (4,)
def gpu_kernels(self, node, nodename):
# load kernel source
......@@ -103,8 +103,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
set_slice=set_slice_code,
write_value=int(self.return_values),
write_index=int(self.return_indices),
ndim=str(ndim),
use_half=int(node.inputs[0].dtype == 'float16')
ndim=str(ndim)
)
elif device_type == b'opencl':
raise NotImplementedError()
......
......@@ -7563,6 +7563,7 @@ def local_useless_topk(node):
old_output = node.outputs[ret_idx]
new_output = TopKOp(
axis=op.axis,
sorted=op.sorted,
idx_dtype=op.idx_dtype,
return_values=ret_val,
return_indices=ret_idx)(x, k)
......
......@@ -301,7 +301,7 @@ class Test_TopK(unittest.TestCase):
goal = np.sort(xval)[idx]
assert yval.dtype == goal.dtype
utt.assert_allclose(np.sort(yval), goal)
utt.assert_allclose(goal, np.sort(yval))
@utt.parameterized.expand(chain(
product(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论