fix segfault

13095b4b · Adam Becker · 95f6eda6 · 13095b4b · 13095b4b
--- a/theano/gpuarray/sort.py
+++ b/theano/gpuarray/sort.py
@@ -20,13 +20,16 @@ except ImportError as e:
    pass


-# TODO add support is slice size is larger than max allowed block size (1024)
+# TODO add support when slice size is larger than max allowed block size (1024)
 # TODO add runtime opt, if k==1, use max/min reduce
+# TODO add opt to merge argtopk / topk, or split topk_and_argtopk when only
+#      one result is needed
+# TODO add grad
 # TODO sort / argsort

 class GpuTopKOp(GpuKernelBase, TopKOp):
    '''
-    Implements TopKOp() on gpu
+    Implements TopKOp on gpu

    '''
    __props__ = TopKOp.__props__
@@ -79,11 +82,6 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
        set_slice_code = ''.join(
            set_slice_code % dict(i=j) for j in range(1, ndim))
        flags = Kernel.get_flags(node.inputs[0].dtype)
-        dst = ''
-        if self.return_values:
-            dst += 'INPUT_TYPE *dstv, '
-        if self.return_values:
-            dst += 'INDEX_TYPE *dsti, '
        write_value = 'ptr_at(dstv, out_idx * dstv_strides_0) = xval' if self.return_values else ''
        write_index = 'ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx' if self.return_indices else ''
        subs = dict(
@@ -92,8 +90,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
            dims=''.join('ga_size dims_%d, ' % i for i in range(1, ndim)),
            dstv='INPUT_TYPE *dstv,' if self.return_values else '',
            dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
-            dstv_strides=dstv_strides_code,
-            dsti_strides=dsti_strides_code,
+            dstv_strides=dstv_strides_code if self.return_values else '',
+            dsti_strides=dsti_strides_code if self.return_indices else '',
            src_strides=src_strides_code,
            set_slice=set_slice_code,
            write_value=write_value,
@@ -111,7 +109,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
        param_types.append(ga.SIZE)  # k
        param_types.append(ga.GpuArray)  # src
        param_types.extend([ga.SSIZE] * ndim)  # src_strides
-        param_types.append(ga.SIZE) # size
+        param_types.append(ga.SIZE)  # size
+        self.nargs = len(param_types)
        return [Kernel(
            code=kernel_src,
            name='k_topk_dense',
@@ -143,11 +142,12 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
        WARP_SIZE = 32

        ndim = node.inputs[0].ndim
+        nargs = self.nargs
        reordered_axes = list(range(ndim))
        axis = self.axis % ndim
        del(reordered_axes[axis])
        reordered_axes = [axis] + reordered_axes
-        dims = ', '.join('(void*)(dims+%d)' % i for i in reordered_axes[1:])
+        dims = ''.join('(void*)(dims+%d), ' % i for i in reordered_axes[1:])
        prep_output = ''
        if self.return_values:
            def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv
@@ -179,9 +179,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
 {
    const size_t *dims = PyGpuArray_DIMS(%(x)s);
    size_t odims[%(ndim)d];
-    for (int i=0; i<%(ndim)d; i++) {
+    for (int i=0; i<%(ndim)d; i++)
        odims[i] = dims[i];
-    }
    odims[%(axis)d] = *((%(k_dtype)s*)(PyArray_DATA(%(k)s)));
    if (odims[0] > %(MAX_TPB)d) {
        PyErr_SetString(

--- a/theano/gpuarray/topk_kernel.cu
+++ b/theano/gpuarray/topk_kernel.cu
@@ -13,16 +13,16 @@ struct RadixConfig {};

 template <>
 struct RadixConfig<float> {
-  typedef unsigned int RadixType;
+  typedef ga_uint RadixType;

-  static inline __device__ RadixType convert(float v) {
+  static inline WITHIN_KERNEL RadixType convert(float v) {
    RadixType x = __float_as_int(v);
    RadixType mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;

    return (x ^ mask);
  }

-  static inline __device__ float deconvert(RadixType v) {
+  static inline WITHIN_KERNEL float deconvert(RadixType v) {
    RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;

    return __int_as_float(v ^ mask);
@@ -30,55 +30,55 @@ struct RadixConfig<float> {
 };

 template <>
-struct RadixConfig<unsigned char> {
-  typedef unsigned int RadixType;
+struct RadixConfig<ga_uchar> {
+  typedef ga_uint RadixType;

-  static inline __device__ RadixType convert(unsigned char v) {
+  static inline WITHIN_KERNEL RadixType convert(ga_uchar v) {
    return v;
  }

-  static inline __device__ unsigned char deconvert(RadixType v) {
+  static inline WITHIN_KERNEL ga_uchar deconvert(RadixType v) {
    return v;
  }
 };

 template <>
 struct RadixConfig<char> {
-  typedef unsigned int RadixType;
+  typedef ga_uint RadixType;

-  static inline __device__ RadixType convert(char v) {
+  static inline WITHIN_KERNEL RadixType convert(char v) {
    return 128u + v;
  }

-  static inline __device__ char deconvert(RadixType v) {
+  static inline WITHIN_KERNEL char deconvert(RadixType v) {
    return v - 128;
  }
 };

 template <>
-struct RadixConfig<short> {
-  typedef unsigned int RadixType;
+struct RadixConfig<ga_short> {
+  typedef ga_uint RadixType;

-  static inline __device__ RadixType convert(short v) {
-    assert(sizeof(short) == 2);
+  static inline WITHIN_KERNEL RadixType convert(ga_short v) {
+    assert(sizeof(ga_short) == 2);
    return 32768u + v;
  }

-  static inline __device__ short deconvert(RadixType v) {
+  static inline WITHIN_KERNEL ga_short deconvert(RadixType v) {
    return v - 32768;
  }
 };

 template <>
 struct RadixConfig<int> {
-  typedef unsigned int RadixType;
+  typedef ga_uint RadixType;

-  static inline __device__ RadixType convert(int v) {
+  static inline WITHIN_KERNEL RadixType convert(int v) {
    assert(sizeof(int) == 4);
    return 2147483648u + v;
  }

-  static inline __device__ int deconvert(RadixType v) {
+  static inline WITHIN_KERNEL int deconvert(RadixType v) {
    return v - 2147483648u;
  }
 };
@@ -87,12 +87,12 @@ template <>
 struct RadixConfig<long> {
  typedef unsigned long long int RadixType;

-  static inline __device__ RadixType convert(long v) {
+  static inline WITHIN_KERNEL RadixType convert(long v) {
    assert(sizeof(long) == 8);
    return 9223372036854775808ull + v;
  }

-  static inline __device__ long deconvert(RadixType v) {
+  static inline WITHIN_KERNEL long deconvert(RadixType v) {
    return v - 9223372036854775808ull;
  }
 };
@@ -101,13 +101,13 @@ template <>
 struct RadixConfig<double> {
  typedef unsigned long long int RadixType;

-  static inline __device__ RadixType convert(double v) {
+  static inline WITHIN_KERNEL RadixType convert(double v) {
    RadixType x = __double_as_longlong(v);
    RadixType mask = -((x >> 63)) | 0x8000000000000000;
    return (x ^ mask);
  }

-  static inline __device__ double deconvert(RadixType v) {
+  static inline WITHIN_KERNEL double deconvert(RadixType v) {
    RadixType mask = ((v >> 63) - 1) | 0x8000000000000000;
    return __longlong_as_double(v ^ mask);
  }
@@ -116,9 +116,9 @@ struct RadixConfig<double> {
 #ifdef USE_HALF
 template <>
 struct RadixConfig<half> {
-  typedef unsigned int RadixType;
+  typedef ga_uint RadixType;

-  static inline __device__ RadixType convert(half v) {
+  static inline WITHIN_KERNEL RadixType convert(half v) {
 #if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
    RadixType x = __half_as_ushort(v);
    RadixType mask = -((x >> 15)) | 0x8000;
@@ -129,7 +129,7 @@ struct RadixConfig<half> {
 #endif
  }

-  static inline __device__ half deconvert(RadixType v) {
+  static inline WITHIN_KERNEL half deconvert(RadixType v) {
 #if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 80000
    RadixType mask = ((v >> 15) - 1) | 0x8000;
    return __ushort_as_half(v ^ mask);
@@ -142,10 +142,10 @@ struct RadixConfig<half> {
 #endif

 // $$inp_t should be replaced in c_code
-// we cannot use templated __global__ because gpuarray API does not support it yet
+// we cannot use a templated kernel because the gpuarray API does not support it
 #define NDIM            $ndim
 #define INPUT_TYPE      $inp_t
-#define INDEX_TYPE     $out_t
+#define INDEX_TYPE      $out_t
 #define bitsof(T)       (sizeof(T)*8)
 #define RADIX_BITS      2
 #define RADIX_SIZE      (1<<RADIX_BITS)
@@ -153,33 +153,37 @@ struct RadixConfig<half> {
 #define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
 #define radix_t         RadixConfig<INPUT_TYPE>::RadixType

-#if RADIX_SIZE > 32
-#error "RADIX_SIZE must be smaller than warp size (32)"
+#if RADIX_SIZE > GA_WARP_SIZE
+#error "RADIX_SIZE must be smaller than warp size"
 #endif

 template <typename T>
-static inline __device__ T binary_cumsum(int idx, int warp_id, int lane_id, T* smem, bool value) {
-    // cumsum within 1D thread block, which adds up `value` of all threads whose id is *no greater than* the current thread
+static inline WITHIN_KERNEL T binary_cumsum(
+        int idx, int warp_id, int lane_id, T* smem, bool value) {
+    // inclusive cumsum within a 1D thread block: adds up `value` of all
+    // threads whose id is *no greater than* the current thread, e.g.
+    // binary_cumsum(1, 0, 1, 0, 1) -> (1, 1, 2, 2, 3)
+
    // cumsum within warp
-    unsigned int warp_bits = __ballot(value);
+    ga_uint warp_bits = __ballot(value);
    T warp_sum = __popc(((2<<lane_id)-1) & warp_bits);

    if (lane_id == 0)
        smem[warp_id] = __popc(warp_bits);

-    __syncthreads();
+    local_barrier();

    // cumsum across warps in one thread
    if (idx == 0) {
        int current = 0;
-        for (int i = 0; i < blockDim.x / 32; ++i) {
+        for (int i = 0; i < LDIM_0 / GA_WARP_SIZE; ++i) {
            T v = smem[i];
            smem[i] = smem[i]+current;
            current = current+v;
        }
    }

-    __syncthreads();
+    local_barrier();

    // load the carry from the preceding warp
    if (warp_id >= 1) {
@@ -190,31 +194,32 @@ static inline __device__ T binary_cumsum(int idx, int warp_id, int lane_id, T* s
 }

 template <typename T>
-static inline __device__ T binary_cumsum_exclusive(
+static inline WITHIN_KERNEL T binary_cumsum_exclusive(
    int idx, int warp_id, int lane_id, T* smem, bool value) {
    // cumsum within 1D thread block, which adds up `value` of all threads
    // whose id is *less than* the current thread
+    // binary_cumsum_exclusive(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)

    // cumsum within warp
-    unsigned int warp_bits = __ballot(value);
+    ga_uint warp_bits = __ballot(value);
    T warp_sum = __popc(((1<<lane_id)-1) & warp_bits);

    if (lane_id == 0)
        smem[warp_id] = __popc(warp_bits);

-    __syncthreads();
+    local_barrier();

    // cumsum across warps in one thread
    if (idx == 0) {
        int current = 0;
-        for (int i = 0; i < blockDim.x / 32; ++i) {
+        for (int i = 0; i < LDIM_0 / GA_WARP_SIZE; ++i) {
            T v = smem[i];
            smem[i] = smem[i]+current;
            current = current+v;
        }
    }

-    __syncthreads();
+    local_barrier();

    // load the carry from the preceding warp
    if (warp_id >= 1)
@@ -225,13 +230,13 @@ static inline __device__ T binary_cumsum_exclusive(

 // apply raw(byte) offset to pointer
 template <typename T>
-static __device__ inline T* ptr_add(T *ptr, ga_ssize offset) {
+static WITHIN_KERNEL inline T* ptr_add(T *ptr, ga_ssize offset) {
    return (T*)((char*)ptr + offset);
 }

 // get array element using raw(byte) offset
 template <typename T>
-static __device__ inline T& ptr_at(T *ptr, ga_ssize offset) {
+static WITHIN_KERNEL inline T& ptr_at(T *ptr, ga_ssize offset) {
    return *((T*)((char*)ptr + offset));
 }

@@ -250,21 +255,34 @@ KERNEL void k_topk_dense(
        INPUT_TYPE* src,
        $src_strides
        // ga_ssize src_strides_0, ga_ssize src_strides_1, ... , src_strides_$${NDIM}
-        size_t size) {
-    /*
-    extern __shared__ radix_t smem[];
-    ga_ssize __shared__ bins[RADIX_SIZE]; // TODO: does using 32-bit gives speedup?
-    bool is_topk = true;
-    bool is_topkth = true; // exactly k-th largest
+        ga_size size) {
+    extern LOCAL_MEM radix_t smem[];
+    ga_ssize LOCAL_MEM bins[RADIX_SIZE]; // TODO: would using a 32-bit type give a speedup?
+    bool is_topk=true, is_topkth=true;
    radix_t out_idx;

-    const size_t idx = threadIdx.x;
-    size_t __shared__ k2, exceed;
-    const ga_uint warp_id = idx / 32;
-    const ga_uint lane_id = idx % 32;
-    radix_t *wmem = (radix_t*)(smem) + warp_id * 32;
+    const ga_size idx = LID_0;
+    ga_size LOCAL_MEM k2, exceed;
+    const ga_uint warp_id = idx / GA_WARP_SIZE;
+    const ga_uint lane_id = idx % GA_WARP_SIZE;
+    radix_t *wmem = (radix_t*)(smem) + warp_id * GA_WARP_SIZE;
    const bool in_range = (idx < size);
    is_topk &= in_range;
+
+
+    // 0. get the slice for thread block to work on
+    // TODO if ndim <= 3, use native indexing ? (blockIdx.[xyz])
+    ga_size gid = GID_0, gidx;
+    $set_slice
+    //for(int i=1; i<NDIM; i++) {
+        // gidx = gid % dims_$${i};
+        // gid /= dims_$${i};
+        // dsti = ptr_add(dsti, gidx*dsti_strides_$${i});
+        // dstv = ptr_add(dstv, gidx*dstv_strides_$${i});
+        // src = ptr_add(src, gidx*src_strides_$${i});
+    //}
+
+    // get input and its radix friendly form
    const INPUT_TYPE xval = in_range ? ptr_at(src, idx*src_strides_0) : (INPUT_TYPE)0;
    radix_t x = in_range ? RadixConfig<INPUT_TYPE>::convert(xval) : 0;

@@ -272,17 +290,6 @@ KERNEL void k_topk_dense(
    if (k<0) { x = ~x; k = -k; }
    if (idx==0) k2 = k;

-    // 0. get the slice for thread block to work on
-    size_t gid = blockIdx.x, gidx;
-    $set_slice
-    //for(int i=0; i<NDIM; i++) {
-        //gidx = gid % dims_$${i};
-        //gid /= dims_$${i};
-        //dsti = ptr_add(dsti, gidx*dsti_strides_$${i+1};
-        //dstv = ptr_add(dstv, gidx*dstv_strides_$${i+1};
-        //src = ptr_add(src, gidx*src_strides_$${i+1});
-    //}
-
    // 1. filter is_topk and is_topkth using radix select

    #pragma unroll
@@ -293,18 +300,18 @@ KERNEL void k_topk_dense(
        #pragma unroll
        for (int bin=0; bin<RADIX_SIZE; ++bin) {
            bool incr_bin = (bin == digit) && is_topkth && in_range;
-            unsigned int incr_bin_warp = __ballot(incr_bin);
+            ga_uint incr_bin_warp = __ballot(incr_bin);
            if (lane_id==0)
                wmem[bin] += __popc(incr_bin_warp);
        }
-        __syncthreads();
+        local_barrier();
        // sum counts across all warps
        // TODO: test in-block parallel sum?
        if (idx < RADIX_SIZE) {
-            for(int w=32; w<blockDim.x; w+=32)
+            for(int w=GA_WARP_SIZE; w<LDIM_0; w+=GA_WARP_SIZE)
                smem[idx] += smem[idx + w];
        }
-        __syncthreads();
+        local_barrier();

        // calculate k minus cumsum(count)
        if (idx<RADIX_SIZE)
@@ -325,7 +332,7 @@ KERNEL void k_topk_dense(
                    exceed = min(exceed, bins[bin-1]);
            }
        }
-        __syncthreads();
+        local_barrier();


        // smem -> count
@@ -356,7 +363,7 @@ KERNEL void k_topk_dense(

    // perform binary cumsum on is_topk to determine the indices to put result
    out_idx = binary_cumsum_exclusive<radix_t>(idx, warp_id, lane_id, smem, is_topk);
-    __syncthreads();
+    local_barrier();

    if (is_topk) {
        $write_value;
@@ -364,5 +371,4 @@ KERNEL void k_topk_dense(
        $write_index;
        // ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx;
    }
-    */
 }