提交 c89cab31 authored 作者: Adam Becker's avatar Adam Becker

mixed changes

- add float16 support - fix 64 bit input bug - fix int8 bug due to integer overflow - more checking on k - add license from pytorch
上级 98c9ead1
// modified from pytorch
// https://github.com/pytorch/pytorch/blob/master/torch/lib/THC/THCTensorTopK.cuh
//
// Converts a type (maybe float) to an integer representation with the same
// sorting; i.e., for floats f1, f2:
// if f1 < f2 then convert(f1) < convert(f2)
// We use this to enable radix selection of floating-point values.
// This also gives a relative order for NaNs, but that's ok, as they
// will all be adjacent
// original license below:
/*
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#if __CUDA_ARCH__ < 350
#define __ldg(ptr) (*(ptr))
......@@ -15,7 +50,13 @@
template <typename T>
struct RadixConfig {
typedef T RadixType;
// Converts a type (maybe float) to an integer representation with the same
// sorting; i.e., for floats f1, f2:
// if f1 < f2 then convert(f1) < convert(f2)
// We use this to enable radix selection of floating-point values.
// This also gives a relative order for NaNs, but that's ok, as they
// will all be adjacent
typedef T RadixType;
static inline __device__ RadixType convert(T v) {
return v;
}
......@@ -45,7 +86,7 @@ struct RadixConfig<ga_float> {
template <>
struct RadixConfig<ga_double> {
typedef unsigned long long int RadixType;
typedef ga_ulong RadixType;
static inline __device__ RadixType convert(ga_double v) {
RadixType x = __double_as_longlong(v);
......@@ -60,22 +101,9 @@ struct RadixConfig<ga_double> {
};
template <>
struct RadixConfig<ga_ubyte> {
typedef ga_uint RadixType;
// Unsigned bytes already compare the same way as their integer bit
// patterns, so the radix representation is just the value itself.
static inline __device__ RadixType convert(ga_ubyte x) {
return (RadixType)x;
}
// Inverse of convert(): truncate the radix word back to a byte.
static inline __device__ ga_ubyte deconvert(RadixType x) {
return (ga_ubyte)x;
}
};
template <>
struct RadixConfig<ga_byte> {
typedef ga_uint RadixType;
typedef ga_ubyte RadixType;
static inline __device__ RadixType convert(ga_byte v) {
return 128u + v;
......@@ -88,7 +116,7 @@ struct RadixConfig<ga_byte> {
template <>
struct RadixConfig<ga_short> {
typedef ga_uint RadixType;
typedef ga_ushort RadixType;
static inline __device__ RadixType convert(ga_short v) {
assert(sizeof(ga_short) == 2);
......@@ -101,61 +129,52 @@ struct RadixConfig<ga_short> {
};
template <>
struct RadixConfig<int> {
struct RadixConfig<ga_int> {
typedef ga_uint RadixType;
static inline __device__ RadixType convert(int v) {
static inline __device__ RadixType convert(ga_int v) {
assert(sizeof(int) == 4);
return (1u << 31) ^ v;
return 2147483648u + v;
}
static inline __device__ int deconvert(RadixType v) {
return (1u << 31) ^ v;
static inline __device__ ga_int deconvert(RadixType v) {
return v - 2147483648u;
}
};
template <>
struct RadixConfig<ga_long> {
typedef unsigned long long int RadixType;
typedef ga_ulong RadixType;
static inline __device__ RadixType convert(ga_long v) {
assert(sizeof(long) == 8);
return (1ull << 63) ^ v;
assert(sizeof(ga_long) == 8);
return 9223372036854775808ull + v;
}
static inline __device__ ga_long deconvert(RadixType v) {
return (1ull << 63) ^ v;
static inline __device__ long long deconvert(RadixType v) {
return v - 9223372036854775808ull;
}
};
#ifdef USE_HALF
// TODO: make this work
#define USE_HALF $use_half
#if USE_HALF == 1
// since ga_half is ushort, use macro to protect this part is necessary
template <>
struct RadixConfig<half> {
typedef ga_uint RadixType;
struct RadixConfig<ga_half> {
typedef ga_ushort RadixType;
static inline __device__ RadixType convert(half v) {
#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
RadixType x = __half_as_ushort(v);
RadixType mask = -((x >> 15)) | 0x8000;
return (x ^ mask);
#else
assert(false);
return 0u;
#endif
static inline __device__ RadixType convert(ga_half v) {
RadixType mask = -(((RadixType)v >> 15)) | 0x8000;
return (v ^ mask);
}
static inline __device__ half deconvert(RadixType v) {
#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
static inline __device__ ga_half deconvert(RadixType v) {
RadixType mask = ((v >> 15) - 1) | 0x8000;
return __ushort_as_half(v ^ mask);
#else
assert(false);
return ScalarConvert<int, half>::to(0);
#endif
return (ga_half)(v ^ mask);
}
};
#endif
#endif // USE_HALF
// $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it
......@@ -175,16 +194,15 @@ struct RadixConfig<half> {
#error "RADIX_SIZE must be smaller than warp size (32)"
#endif
template <typename T>
static inline __device__ T binary_cumsum(
int idx, int warp_id, int lane_id, T* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
static inline __device__ ga_size binary_cumsum(
int idx, int warp_id, int lane_id, ga_size* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *no greater than* the current thread
// binary_cumsum(1, 0, 1, 0, 1) -> (1, 1, 2, 2, 3)
// cumsum within warp
ga_uint warp_bits = __ballot(value);
T warp_sum = __popc(((2<<lane_id)-1) & warp_bits);
ga_size warp_sum = __popc(((2<<lane_id)-1) & warp_bits);
if (lane_id == 0)
smem[warp_id] = __popc(warp_bits);
......@@ -195,7 +213,7 @@ static inline __device__ T binary_cumsum(
if (idx == 0) {
int current = 0;
for (int i = 0; i < LDIM_0 / GA_WARP_SIZE; ++i) {
T v = smem[i];
ga_size v = smem[i];
smem[i] = smem[i]+current;
current = current+v;
}
......@@ -211,16 +229,15 @@ static inline __device__ T binary_cumsum(
return warp_sum;
}
template <typename T>
static inline __device__ T binary_cumsum_exclusive(
int idx, int warp_id, int lane_id, T* smem, bool value) {
static inline __device__ ga_size binary_cumsum_exclusive(
int idx, int warp_id, int lane_id, ga_size* smem, bool value) {
// cumsum within 1D thread block, which adds up `value` of all threads
// whose id is *less than* the current thread
// binary_cumsum(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)
// binary_cumsum_excl(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)
// cumsum within warp
ga_uint warp_bits = __ballot(value);
T warp_sum = __popc(((1<<lane_id)-1) & warp_bits);
ga_size warp_sum = __popc(((1<<lane_id)-1) & warp_bits);
if (lane_id == 0)
smem[warp_id] = __popc(warp_bits);
......@@ -231,7 +248,7 @@ static inline __device__ T binary_cumsum_exclusive(
if (idx == 0) {
int current = 0;
for (int i = 0; i < LDIM_0 / GA_WARP_SIZE; ++i) {
T v = smem[i];
ga_size v = smem[i];
smem[i] = smem[i]+current;
current = current+v;
}
......
......@@ -15,10 +15,10 @@ KERNEL void k_topk_dense(
$src_strides
// ga_ssize src_strides_0, ga_ssize src_strides_1, ... , src_strides_$${NDIM}
ga_size size) {
LOCAL_MEM radix_t smem[32 * RADIX_SIZE];
LOCAL_MEM ga_size smem[32 * RADIX_SIZE];
ga_ssize LOCAL_MEM bins[RADIX_SIZE+1]; // TODO: does using 32-bit gives good speedup?
bool is_topk=true, is_topkth=true;
radix_t out_idx;
ga_size out_idx;
const ga_ushort idx = LID_0;
ga_size LOCAL_MEM k2, exceed;
......@@ -118,12 +118,12 @@ KERNEL void k_topk_dense(
if (exceed != 0) {
// top_kth value may not be unique, so we need to
// perform binary cumsum on is_topkth to drop exceeding top-kth values
out_idx = binary_cumsum_exclusive<radix_t>(idx, warp_id, lane_id, smem, is_topkth);
out_idx = binary_cumsum_exclusive(idx, warp_id, lane_id, smem, is_topkth);
is_topk &= ((!is_topkth) || out_idx>=exceed);
}
// perform binary cumsum on is_topk to determine the indices to put result
out_idx = binary_cumsum_exclusive<radix_t>(idx, warp_id, lane_id, smem, is_topk);
out_idx = binary_cumsum_exclusive(idx, warp_id, lane_id, smem, is_topk);
local_barrier();
if (is_topk) {
......
......@@ -15,9 +15,9 @@ KERNEL void k_topk_dense_large(
$src_strides
// ga_ssize src_strides_0, ga_ssize src_strides_1, ... , src_strides_$${NDIM}
ga_size size, ga_ushort inp_per_thread) {
LOCAL_MEM radix_t smem[32 * RADIX_SIZE];
LOCAL_MEM ga_size smem[32 * RADIX_SIZE];
LOCAL_MEM radix_t known_bits, known_bits_mask;
radix_t out_idx;
ga_size out_idx;
ga_size LOCAL_MEM write_base;
INPUT_TYPE xval;
radix_t x;
......@@ -109,21 +109,13 @@ KERNEL void k_topk_dense_large(
local_barrier();
}
/*
if (idx < RADIX_SIZE) {
long long int xxx = 1337;
ptr_at(dstv, idx*dstv_strides_0) = xxx;
}
return;
*/
// 2. write values smaller than top-kth
for (i=0; i<inp_per_thread; ++i) {
in_range = (idx*inp_per_thread+i) < size;
xval = in_range ? ptr_read(src, i*src_strides_0) : (INPUT_TYPE)0;
x = inv_bits ^ RadixConfig<INPUT_TYPE>::convert(xval);
is_topk = (x > known_bits) && in_range;
out_idx = binary_cumsum<radix_t>(idx, warp_id, lane_id, smem, is_topk);
out_idx = binary_cumsum(idx, warp_id, lane_id, smem, is_topk);
if (is_topk) {
#if WRITE_VALUE == 1
ptr_at(dstv, (out_idx+write_base-1) * dstv_strides_0) = xval;
......@@ -144,7 +136,7 @@ KERNEL void k_topk_dense_large(
xval = in_range ? ptr_read(src, i*src_strides_0) : (INPUT_TYPE)0;
x = inv_bits ^ RadixConfig<INPUT_TYPE>::convert(xval);
is_topk = (x == known_bits) && in_range;
out_idx = binary_cumsum<radix_t>(idx, warp_id, lane_id, smem, is_topk);
out_idx = binary_cumsum(idx, warp_id, lane_id, smem, is_topk);
is_topk = ((out_idx+write_base) <= abs(k)) && is_topk;
if (is_topk) {
#if WRITE_VALUE == 1
......
......@@ -19,12 +19,14 @@ except ImportError as e:
# To make sure theano is importable
pass
# TODO sort / argsort
# TODO add runtime opt, if k==1, use max/min reduce
# also if k is axis size, just copy input tensor
# TODO add opt to merge argtopk / topk, or split topk_and_argtopk when only
# one result is needed
# TODO add grad
# TODO sort / argsort
class GpuTopKOp(GpuKernelBase, TopKOp):
'''
......@@ -32,6 +34,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
'''
__props__ = TopKOp.__props__
_f16_ok = True
def __init__(self, axis=-1, return_values=True, return_indices=False, idx_dtype='int64'):
GpuKernelBase.__init__(self)
......@@ -99,7 +102,9 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
set_slice=set_slice_code,
write_value=int(self.return_values),
write_index=int(self.return_indices),
ndim=str(ndim))
ndim=str(ndim),
use_half=int(node.inputs[0].dtype == 'float16')
)
elif device_type == b'opencl':
raise NotImplementedError()
......@@ -201,6 +206,11 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
PyExc_ValueError,
"topk: k must not be zero");
%(fail)s;
} else if (dims[%(axis)d] < odims[%(axis)d]){
PyErr_SetString(
PyExc_ValueError,
"topk: k cannot larger than size on specified axis %(axis)d");
%(fail)s;
}
%(prep_output)s
......@@ -236,7 +246,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
int err;
if (blk[0] > %(MAX_TPB)d) {
// CUDA_OUT_OF_RESOURCE if a max sized block is used
// LAUNCH_OUT_OF_RESOURCE if a 1024 sized block is used
blk[0] = %(MAX_TPB)d / 2;
err = GpuKernel_call(
&k_topk_dense_large_%(nodename)s, 3,
......@@ -291,5 +301,8 @@ def local_gpua_topkop(op, ctx_name, inputs, outputs):
x = as_gpuarray_variable(x, ctx_name)
rets = GpuTopKOp(
axis=axis, return_values=rv, return_indices=ri, idx_dtype=op.idx_dtype)(x, k)
axis=axis,
return_values=rv,
return_indices=ri,
idx_dtype=op.idx_dtype)(x, k)
return rets
......@@ -426,6 +426,10 @@ def topk(x, k, axis=-1):
Upon which axis shall the operation be performed on. If ``None``,
works on flattened array.
Returns
-------
Tensor variable with same dtype as `x`.
Notes
-----
- The returned values may not be sorted.
......@@ -456,6 +460,10 @@ def argtopk(x, k, axis=-1, idx_dtype='int64'):
idx_dtype: string
Specify output dtype, defaults to ``int64``, must be integer type.
Returns
-------
Tensor variable with dtype specified in `idx_dtype`.
Notes
-----
- The corresponding values of returned indices may not be sorted.
......
......@@ -275,7 +275,7 @@ class Test_TopK(unittest.TestCase):
product(
(16, 61, 257),
(1, -1, 10, -10, 'n//2', 'n-1', '-n', '1-n'),
('float64', 'int16', 'int8')),
('float64', 'float16', 'int16', 'int8')),
((2049, 1337, 'float64'),)))
def test_topk_1d(self, size, k, dtype):
if isinstance(k, str):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论