Unverified commit 55e94968, authored by Frédéric Bastien, committed by GitHub

Merge pull request #6513 from abergeron/fix_gputopk

Fix compilation of GpuTopKOp
...@@ -187,6 +187,21 @@ struct RadixConfig<char> { ...@@ -187,6 +187,21 @@ struct RadixConfig<char> {
} }
}; };
// g++ treats 'signed char' (ga_byte, int8) and plain 'char' as distinct
// types, so ga_byte needs its own specialization, mirroring RadixConfig<char>.
template <>
struct RadixConfig<ga_byte> {
    typedef unsigned int RadixType;
    // Bias the signed byte by 128 so its ordering is preserved as an
    // unsigned radix key.
    static inline __device__ RadixType convert(ga_byte value) {
        return 128u + value;
    }
    // Undo the bias applied by convert().
    static inline __device__ ga_byte deconvert(RadixType bits) {
        return bits - 128;
    }
};
template <> template <>
struct RadixConfig<short> { struct RadixConfig<short> {
typedef unsigned int RadixType; typedef unsigned int RadixType;
...@@ -229,25 +244,27 @@ struct RadixConfig<long long> { ...@@ -229,25 +244,27 @@ struct RadixConfig<long long> {
} }
}; };
#define USE_HALF $use_half /* NB: This specialization for ga_half does know that ga_half is a struct with only one member of type ga_ushort.
* So, if ga_half implementation changes, this code should change too.
#if USE_HALF == 1 * TODO: Maybe should gpuarray provide abstract functions to manipulate ga_half internal structure? e.g:
// since half is ushort, using macro to protect this part is necessary * unsigned short ga_half2bits(ga_half value);
* ga_half ga_bits2half(unsigned short bits);
*/
template <> template <>
struct RadixConfig<unsigned short> { struct RadixConfig<ga_half> {
typedef unsigned int RadixType; typedef unsigned int RadixType;
static inline __device__ RadixType convert(unsigned short v) { static inline __device__ RadixType convert(ga_half v) {
RadixType mask = -(((RadixType)v >> 15)) | 0x8000; RadixType mask = -(((RadixType)v.data >> 15)) | 0x8000;
return (v ^ mask); return (v.data ^ mask);
} }
static inline __device__ unsigned short deconvert(RadixType v) { static inline __device__ ga_half deconvert(RadixType v) {
RadixType mask = ((v >> 15) - 1) | 0x8000; RadixType mask = ((v >> 15) - 1) | 0x8000;
return (unsigned short)(v ^ mask); ga_half out = {(unsigned short)(v ^ mask)};
return out;
} }
}; };
#endif // USE_HALF
// $$inp_t should be replaced in c_code // $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it // we cannot use templated kernel because gpuarray API does not support it
...@@ -356,3 +373,61 @@ static __device__ inline T ptr_read_cached(T *ptr, ssize_t offset) { ...@@ -356,3 +373,61 @@ static __device__ inline T ptr_read_cached(T *ptr, ssize_t offset) {
return __ldg(((T*)((char*)ptr + offset))); return __ldg(((T*)((char*)ptr + offset)));
} }
/* NB: __ldg has no overload for the ga_half struct, so ptr_read_cached must be
 * specialized.  The workaround reads the bytes through a built-in type assumed
 * to have the same size as ga_half.  Based on the ga_half implementation as of
 * 2017/11/27, that type is ga_ushort.  This code must be updated whenever the
 * size of the ga_half implementation changes, until a better mechanism is
 * provided. */
#define GA_HALF_STD_TYPE ga_ushort
static __device__ inline ga_half ptr_read_cached(ga_half *ptr, ssize_t offset) {
    /* Poor man's static_assert: the array size evaluates to -1 (a compile
     * error) if sizeof(GA_HALF_STD_TYPE) differs from sizeof(ga_half). */
    int check_ga_half_std_type[ ( ( sizeof(GA_HALF_STD_TYPE) - sizeof(ga_half) ) ? -1 : 1 ) ];
    /* offset is in bytes, hence the char* arithmetic before the read-only
     * cached load. */
    GA_HALF_STD_TYPE out = __ldg(((GA_HALF_STD_TYPE*)((char*)ptr + offset)));
    ga_half real_out;
    /* Type-pun the loaded bits back into a ga_half. */
    *(GA_HALF_STD_TYPE*)(&real_out) = out;
    return real_out;
}
#undef GA_HALF_STD_TYPE
/* Comparisons involving ga_half, and conversions from integer literals
 * (e.g. 0, 1) to ga_half, lead to compilation errors.  The following helper
 * functions are provided to bypass these issues. */

/* theano_zero<T>() / theano_one<T>(): typed constants 0 and 1, specialized
 * for ga_half via float-to-half conversion. */
template<typename T>
static __device__ inline T theano_zero() {return 0;}
template<>
__device__ inline ga_half theano_zero() {return ga_float2half(0);}
template<typename T>
static __device__ inline T theano_one() {return 1;}
template<>
__device__ inline ga_half theano_one() {return ga_float2half(1);}

/* Generic comparison wrappers: simply forward to the built-in operators. */
template<typename A, typename B> static __device__ inline bool theano_eq(const A& a, const B& b) {return a == b;}
template<typename A, typename B> static __device__ inline bool theano_ne(const A& a, const B& b) {return a != b;}
template<typename A, typename B> static __device__ inline bool theano_lt(const A& a, const B& b) {return a < b;}
template<typename A, typename B> static __device__ inline bool theano_gt(const A& a, const B& b) {return a > b;}
template<typename A, typename B> static __device__ inline bool theano_le(const A& a, const B& b) {return a <= b;}
template<typename A, typename B> static __device__ inline bool theano_ge(const A& a, const B& b) {return a >= b;}

/* Overloads where the left operand is ga_half: convert it to float first. */
template<typename T> static __device__ inline bool theano_eq(const ga_half& a, const T& b) {return ga_half2float(a) == b;}
template<typename T> static __device__ inline bool theano_ne(const ga_half& a, const T& b) {return ga_half2float(a) != b;}
template<typename T> static __device__ inline bool theano_lt(const ga_half& a, const T& b) {return ga_half2float(a) < b;}
template<typename T> static __device__ inline bool theano_gt(const ga_half& a, const T& b) {return ga_half2float(a) > b;}
template<typename T> static __device__ inline bool theano_le(const ga_half& a, const T& b) {return ga_half2float(a) <= b;}
template<typename T> static __device__ inline bool theano_ge(const ga_half& a, const T& b) {return ga_half2float(a) >= b;}

/* Overloads where the right operand is ga_half. */
template<typename T> static __device__ inline bool theano_eq(const T& a, const ga_half& b) {return a == ga_half2float(b);}
template<typename T> static __device__ inline bool theano_ne(const T& a, const ga_half& b) {return a != ga_half2float(b);}
template<typename T> static __device__ inline bool theano_lt(const T& a, const ga_half& b) {return a < ga_half2float(b);}
template<typename T> static __device__ inline bool theano_gt(const T& a, const ga_half& b) {return a > ga_half2float(b);}
template<typename T> static __device__ inline bool theano_le(const T& a, const ga_half& b) {return a <= ga_half2float(b);}
template<typename T> static __device__ inline bool theano_ge(const T& a, const ga_half& b) {return a >= ga_half2float(b);}

/* Overloads where both operands are ga_half: both sides are converted to
 * float, so the comparison follows float semantics. */
static __device__ inline bool theano_eq(const ga_half& a, const ga_half& b) {return ga_half2float(a) == ga_half2float(b);}
static __device__ inline bool theano_ne(const ga_half& a, const ga_half& b) {return ga_half2float(a) != ga_half2float(b);}
static __device__ inline bool theano_lt(const ga_half& a, const ga_half& b) {return ga_half2float(a) < ga_half2float(b);}
static __device__ inline bool theano_gt(const ga_half& a, const ga_half& b) {return ga_half2float(a) > ga_half2float(b);}
static __device__ inline bool theano_le(const ga_half& a, const ga_half& b) {return ga_half2float(a) <= ga_half2float(b);}
static __device__ inline bool theano_ge(const ga_half& a, const ga_half& b) {return ga_half2float(a) >= ga_half2float(b);}
...@@ -4,19 +4,24 @@ ...@@ -4,19 +4,24 @@
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS) #define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
// works when length on axis is within max allowed threads in block (1024) // works when length on axis is within max allowed threads in block (1024)
KERNEL void k_topk_dense( extern "C" __global__ void k_topk_dense(
$dims $dims
// size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM} // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
$dstv $dstv
// INPUT_TYPE *dstv // INPUT_TYPE *dstv
$dstv_offset
// size_t offset
$dstv_strides $dstv_strides
// ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM} // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
$dsti $dsti
// INDEX_TYPE *dsti // INDEX_TYPE *dsti
$dsti_offset
// size_t offset
$dsti_strides $dsti_strides
// ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM} // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
ssize_t k, ssize_t k,
INPUT_TYPE* src, INPUT_TYPE* src,
size_t src_offset,
$src_strides $src_strides
// ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM} // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
size_t size) { size_t size) {
...@@ -28,7 +33,6 @@ KERNEL void k_topk_dense( ...@@ -28,7 +33,6 @@ KERNEL void k_topk_dense(
size_t out_idx; size_t out_idx;
const unsigned char warp_id = idx / GA_WARP_SIZE; const unsigned char warp_id = idx / GA_WARP_SIZE;
// 0. get the slice for thread block to work on // 0. get the slice for thread block to work on
size_t gid = blockIdx.x, gidx; size_t gid = blockIdx.x, gidx;
...@@ -43,7 +47,7 @@ KERNEL void k_topk_dense( ...@@ -43,7 +47,7 @@ KERNEL void k_topk_dense(
//} //}
// get input and its radix friendly form // get input and its radix friendly form
const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : (INPUT_TYPE)0; const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : theano_zero<INPUT_TYPE>();
radix_t x = RadixConfig<INPUT_TYPE>::convert(xval); radix_t x = RadixConfig<INPUT_TYPE>::convert(xval);
// resolve negative k // resolve negative k
......
...@@ -15,19 +15,19 @@ __device__ DataType find_pattern(DataType* smem, ...@@ -15,19 +15,19 @@ __device__ DataType find_pattern(DataType* smem,
RadixType known_bits, RadixType known_bits,
RadixType known_bits_mask) { RadixType known_bits_mask) {
if (threadIdx.x < 32) if (threadIdx.x < 32)
smem[threadIdx.x] = 0; smem[threadIdx.x] = theano_zero<DataType>();
local_barrier(); local_barrier();
// All threads participate in the loop, in order to sync on the flag // All threads participate in the loop, in order to sync on the flag
for (CountType i = threadIdx.x; i < (slice_size + (CountType)blockDim.x-1); i += blockDim.x) { for (CountType i = threadIdx.x; i < (slice_size + (CountType)blockDim.x-1); i += blockDim.x) {
bool in_range = (i < slice_size); bool in_range = (i < slice_size);
DataType v = in_range ? ptr_read_cached(data, i*stride) : 0; DataType v = in_range ? ptr_read_cached(data, i*stride) : theano_zero<DataType>();
if (in_range && ((RadixConfig<DataType>::convert(v) & known_bits_mask) == known_bits)) { if (in_range && ((RadixConfig<DataType>::convert(v) & known_bits_mask) == known_bits)) {
// There should not be conflicts if we are using find_pattern, // There should not be conflicts if we are using find_pattern,
// since the result is unique // since the result is unique
smem[0] = 1; smem[0] = theano_one<DataType>();
smem[1] = v; // can't use val as the flag, since it could be 0 smem[1] = v; // can't use val as the flag, since it could be 0
} }
...@@ -39,10 +39,10 @@ __device__ DataType find_pattern(DataType* smem, ...@@ -39,10 +39,10 @@ __device__ DataType find_pattern(DataType* smem,
local_barrier(); local_barrier();
// Check to see if a thread found the value // Check to see if a thread found the value
if (found != 0) if (theano_ne(found, 0))
return val; return val;
} }
return 0; return theano_zero<DataType>();
} }
// This function counts the distribution of all input values in a // This function counts the distribution of all input values in a
...@@ -194,19 +194,24 @@ __device__ void radix_select(DataType* data, ...@@ -194,19 +194,24 @@ __device__ void radix_select(DataType* data,
*top_kth = RadixConfig<DataType>::deconvert(known_bits); *top_kth = RadixConfig<DataType>::deconvert(known_bits);
} }
KERNEL void KERNEL_NAME( extern "C" __global__ void KERNEL_NAME(
$dims $dims
// size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM} // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
$dstv $dstv
// INPUT_TYPE *dstv // INPUT_TYPE *dstv
$dstv_offset
// size_t offset
$dstv_strides $dstv_strides
// ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM} // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
$dsti $dsti
// INDEX_TYPE *dsti // INDEX_TYPE *dsti
$dsti_offset
// size_t offset
$dsti_strides $dsti_strides
// ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM} // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
ssize_t k, ssize_t k,
INPUT_TYPE* src, INPUT_TYPE* src,
size_t src_offset,
$src_strides $src_strides
// ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM} // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
size_t size) { size_t size) {
...@@ -255,12 +260,12 @@ KERNEL void KERNEL_NAME( ...@@ -255,12 +260,12 @@ KERNEL void KERNEL_NAME(
for (int i = idx; i < iter_bound; i += blockDim.x) { for (int i = idx; i < iter_bound; i += blockDim.x) {
bool in_range = (i < size); bool in_range = (i < size);
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0; INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : theano_zero<INPUT_TYPE>();
bool has_topk; bool has_topk;
if (order) { if (order) {
has_topk = in_range && (v > topkth_value); has_topk = in_range && (theano_gt(v, topkth_value));
} else { } else {
has_topk = in_range && (v < topkth_value); has_topk = in_range && (theano_lt(v, topkth_value));
} }
int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk); int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
...@@ -283,8 +288,8 @@ KERNEL void KERNEL_NAME( ...@@ -283,8 +288,8 @@ KERNEL void KERNEL_NAME(
for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) { for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
bool in_range = (i < size); bool in_range = (i < size);
INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0; INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : theano_zero<INPUT_TYPE>();
bool has_topk = in_range && (v == topkth_value); bool has_topk = in_range && (theano_eq(v, topkth_value));
int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk); int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
int carry = smem[blockDim.x / 32 - 1]; int carry = smem[blockDim.x / 32 - 1];
......
...@@ -53,7 +53,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -53,7 +53,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
pygpu.get_include()] pygpu.get_include()]
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (4,)
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
# load kernel source # load kernel source
...@@ -77,21 +77,33 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -77,21 +77,33 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '') dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '')
set_slice_code = ''.join( set_slice_code = ''.join(
set_slice_code % dict(i=j) for j in range(1, ndim)) set_slice_code % dict(i=j) for j in range(1, ndim))
if self.return_values:
set_slice_code += """
dstv = ptr_add(dstv, dstv_offset);
"""
if self.return_indices:
set_slice_code += """
dsti = ptr_add(dsti, dsti_offset);
"""
set_slice_code += """
src = ptr_add(src, src_offset);
"""
flags = Kernel.get_flags(node.inputs[0].dtype) flags = Kernel.get_flags(node.inputs[0].dtype)
subs = dict( subs = dict(
inp_t=ga.dtype_to_ctype(node.inputs[0].dtype), inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
out_t=ga.dtype_to_ctype(self.idx_dtype), out_t=ga.dtype_to_ctype(self.idx_dtype),
dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)), dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)),
dstv='INPUT_TYPE *dstv,' if self.return_values else '', dstv='INPUT_TYPE *dstv,' if self.return_values else '',
dstv_offset='size_t dstv_offset,' if self.return_values else '',
dsti='INDEX_TYPE *dsti,' if self.return_indices else '', dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
dsti_offset='size_t dsti_offset,' if self.return_indices else '',
dstv_strides=dstv_strides_code if self.return_values else '', dstv_strides=dstv_strides_code if self.return_values else '',
dsti_strides=dsti_strides_code if self.return_indices else '', dsti_strides=dsti_strides_code if self.return_indices else '',
src_strides=src_strides_code, src_strides=src_strides_code,
set_slice=set_slice_code, set_slice=set_slice_code,
write_value=int(self.return_values), write_value=int(self.return_values),
write_index=int(self.return_indices), write_index=int(self.return_indices),
ndim=str(ndim), ndim=str(ndim)
use_half=int(node.inputs[0].dtype == 'float16')
) )
elif device_type == b'opencl': elif device_type == b'opencl':
raise NotImplementedError() raise NotImplementedError()
...@@ -100,9 +112,11 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -100,9 +112,11 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
param_types = [ga.SIZE] * (ndim - 1) # dims param_types = [ga.SIZE] * (ndim - 1) # dims
for _ in range(self.return_values + self.return_indices): for _ in range(self.return_values + self.return_indices):
param_types.append(ga.GpuArray) # dst* param_types.append(ga.GpuArray) # dst*
param_types.append(ga.SIZE) # offset
param_types.extend([ga.SSIZE] * ndim) # dst*_strides param_types.extend([ga.SSIZE] * ndim) # dst*_strides
param_types.append(ga.SIZE) # k param_types.append(ga.SIZE) # k
param_types.append(ga.GpuArray) # src param_types.append(ga.GpuArray) # src
param_types.append(ga.SIZE) # offset
param_types.extend([ga.SSIZE] * ndim) # src_strides param_types.extend([ga.SSIZE] * ndim) # src_strides
param_types.append(ga.SIZE) # size param_types.append(ga.SIZE) # size
...@@ -120,7 +134,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -120,7 +134,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
) as f: ) as f:
kernel_src = f.read() kernel_src = f.read()
ker = Kernel( ker = Kernel(
code=Template(common_src + kernel_src).substitute(**subs), code=("#include <cluda.h>\n" +
Template(common_src + kernel_src).substitute(**subs)),
name=kname, name=kname,
params=param_types, params=param_types,
flags=flags, flags=flags,
...@@ -159,7 +174,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -159,7 +174,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
ctx = sub['params'] ctx = sub['params']
k_dtype = node.inputs[1].type.dtype_specs()[1] k_dtype = node.inputs[1].type.dtype_specs()[1]
# max threads per block # max threads per block
MAX_TPB = context.maxlsize MAX_TPB = context.maxlsize0
# max blocks per grid # max blocks per grid
MAX_BPG = context.maxgsize0 MAX_BPG = context.maxgsize0
WARP_SIZE = 32 WARP_SIZE = 32
...@@ -169,12 +184,12 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -169,12 +184,12 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
axis = self.axis % ndim axis = self.axis % ndim
del(reordered_axes[axis]) del(reordered_axes[axis])
reordered_axes = [axis] + reordered_axes reordered_axes = [axis] + reordered_axes
dims = ''.join('(void*)(dims+%d), ' % i for i in reordered_axes[1:]) dims = ''.join('dims[%d], ' % i for i in reordered_axes[1:])
prep_output = '' prep_output = ''
if self.return_values: if self.return_values:
def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv
params_dv = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % (yv, yv) params_dv = '%s->ga.data, %s->ga.offset,\n' % (yv, yv)
params_dv += ''.join('(void*)(dvstrides+%d), ' % i for i in reordered_axes) params_dv += ''.join('dvstrides[%d], ' % i for i in reordered_axes)
prep_output += ''' prep_output += '''
if (0 != theano_prep_output( if (0 != theano_prep_output(
&%(yv)s, %(ndim)d, odims, &%(yv)s, %(ndim)d, odims,
...@@ -186,8 +201,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -186,8 +201,8 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
if self.return_indices: if self.return_indices:
def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi
params_di = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % yi params_di = '%s->ga.data, %s->ga.offset,\n' % (yi, yi)
params_di += ''.join('(void*)(distrides+%d), ' % i for i in reordered_axes) params_di += ''.join('distrides[%d], ' % i for i in reordered_axes)
prep_output += ''' prep_output += '''
if (0 != theano_prep_output( if (0 != theano_prep_output(
&%(yi)s, %(ndim)d, odims, &%(yi)s, %(ndim)d, odims,
...@@ -196,7 +211,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -196,7 +211,7 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
}\n''' % locals() }\n''' % locals()
else: else:
def_distrides = params_di = '' def_distrides = params_di = ''
sstrides = ', '.join('(void*)(sstrides+%d)' % i for i in reordered_axes) sstrides = ', '.join('sstrides[%d]' % i for i in reordered_axes)
code = ''' code = '''
{ {
const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0]; const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
...@@ -240,31 +255,46 @@ class GpuTopKOp(GpuKernelBase, TopKOp): ...@@ -240,31 +255,46 @@ class GpuTopKOp(GpuKernelBase, TopKOp):
%(def_dvstrides)s; %(def_dvstrides)s;
%(def_distrides)s; %(def_distrides)s;
const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s); const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);
void* args[] = {
%(dims)s
%(params_dv)s
%(params_di)s
(void*)(&k_),
(void*)((char*)(%(x)s->ga.data) + (%(x)s->ga.offset)),
%(sstrides)s,
(void*)(dims+%(axis)d),
};
int err; int err;
if (dims[%(axis)d] > (1u << 31)) { if (dims[%(axis)d] > (1u << 31)) {
block_size = %(MAX_TPB)d; block_size = %(MAX_TPB)d;
err = GpuKernel_call( err = k_topk_dense_xlarge_call(
&k_topk_dense_xlarge%(nodename)s, 1, 1, &grid_size, &block_size, 0,
&grid_size, &block_size, 0, args); %(dims)s
%(params_dv)s
%(params_di)s
k_,
%(x)s->ga.data,
%(x)s->ga.offset,
%(sstrides)s,
dims[%(axis)d]
);
} else if (block_size > %(MAX_TPB)d) { } else if (block_size > %(MAX_TPB)d) {
block_size = %(MAX_TPB)d; block_size = %(MAX_TPB)d;
err = GpuKernel_call( err = k_topk_dense_large_call(
&k_topk_dense_large%(nodename)s, 1, 1, &grid_size, &block_size, 0,
&grid_size, &block_size, 0, args); %(dims)s
%(params_dv)s
%(params_di)s
k_,
%(x)s->ga.data,
%(x)s->ga.offset,
%(sstrides)s,
dims[%(axis)d]
);
} else { } else {
err = GpuKernel_call( err = k_topk_dense_call(
&k_topk_dense%(nodename)s, 1, 1, &grid_size, &block_size, 0,
&grid_size, &block_size, 0, args); %(dims)s
%(params_dv)s
%(params_di)s
k_,
%(x)s->ga.data,
%(x)s->ga.offset,
%(sstrides)s,
dims[%(axis)d]
);
} }
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_SetString( PyErr_SetString(
......
from __future__ import absolute_import, print_function, division
import theano
import theano.tensor.tests.test_sort
from .config import mode_with_gpu
from ..sort import GpuTopKOp
class Test_GpuTopK(theano.tensor.tests.test_sort.Test_TopK):
    """Run the CPU TopK/ArgTopK test suite against the GPU implementation."""
    # Compile test functions with the GPU-enabled mode so GpuTopKOp is selected.
    mode = mode_with_gpu
    # dtype used by the base test suite when generating test data.
    dtype = 'float32'
    # Op class the base tests assert is present in the compiled graph.
    op_class = GpuTopKOp
...@@ -7563,6 +7563,7 @@ def local_useless_topk(node): ...@@ -7563,6 +7563,7 @@ def local_useless_topk(node):
old_output = node.outputs[ret_idx] old_output = node.outputs[ret_idx]
new_output = TopKOp( new_output = TopKOp(
axis=op.axis, axis=op.axis,
sorted=op.sorted,
idx_dtype=op.idx_dtype, idx_dtype=op.idx_dtype,
return_values=ret_val, return_values=ret_val,
return_indices=ret_idx)(x, k) return_indices=ret_idx)(x, k)
......
...@@ -232,6 +232,8 @@ def test_argsort_grad(): ...@@ -232,6 +232,8 @@ def test_argsort_grad():
class Test_TopK(unittest.TestCase): class Test_TopK(unittest.TestCase):
mode = None
op_class = TopKOp
def setUp(self): def setUp(self):
pass pass
...@@ -240,7 +242,10 @@ class Test_TopK(unittest.TestCase): ...@@ -240,7 +242,10 @@ class Test_TopK(unittest.TestCase):
_all_dtypes, tensor.integer_dtypes, [-1, 0, None], [False])) _all_dtypes, tensor.integer_dtypes, [-1, 0, None], [False]))
def test_argtopk_sanity(self, dtype, idx_dtype, axis, sorted): def test_argtopk_sanity(self, dtype, idx_dtype, axis, sorted):
x = tensor.vector(name='x', dtype=dtype) x = tensor.vector(name='x', dtype=dtype)
fn = theano.function([x], argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype)) fn = theano.function([x],
argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype),
mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
xval = np.asarray([1]).astype(dtype) xval = np.asarray([1]).astype(dtype)
yval = fn(xval) yval = fn(xval)
assert yval == np.asarray([0], dtype=idx_dtype) assert yval == np.asarray([0], dtype=idx_dtype)
...@@ -250,7 +255,8 @@ class Test_TopK(unittest.TestCase): ...@@ -250,7 +255,8 @@ class Test_TopK(unittest.TestCase):
_all_dtypes, [-1, 0, None], [False])) _all_dtypes, [-1, 0, None], [False]))
def test_topk_sanity(self, dtype, axis, sorted): def test_topk_sanity(self, dtype, axis, sorted):
x = tensor.vector(name='x', dtype=dtype) x = tensor.vector(name='x', dtype=dtype)
fn = theano.function([x], topk(x, 1, axis=axis, sorted=sorted)) fn = theano.function([x], topk(x, 1, axis=axis, sorted=sorted), mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
xval = np.asarray([1]).astype(dtype) xval = np.asarray([1]).astype(dtype)
yval = fn(xval) yval = fn(xval)
assert yval == xval assert yval == xval
...@@ -261,7 +267,8 @@ class Test_TopK(unittest.TestCase): ...@@ -261,7 +267,8 @@ class Test_TopK(unittest.TestCase):
def test_combined_sanity(self, dtype, idx_dtype, axis, sorted): def test_combined_sanity(self, dtype, idx_dtype, axis, sorted):
x = tensor.vector(name='x', dtype=dtype) x = tensor.vector(name='x', dtype=dtype)
yv, yi = topk_and_argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype) yv, yi = topk_and_argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype)
fn = theano.function([x], [yv, yi]) fn = theano.function([x], [yv, yi], mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
xval = np.asarray([1]).astype(dtype) xval = np.asarray([1]).astype(dtype)
yvval, yival = fn(xval) yvval, yival = fn(xval)
assert yival == np.asarray([0], dtype=idx_dtype) assert yival == np.asarray([0], dtype=idx_dtype)
...@@ -282,7 +289,8 @@ class Test_TopK(unittest.TestCase): ...@@ -282,7 +289,8 @@ class Test_TopK(unittest.TestCase):
x = theano.tensor.vector(name='x', dtype=dtype) x = theano.tensor.vector(name='x', dtype=dtype)
y = topk(x, k, sorted=sorted) y = topk(x, k, sorted=sorted)
fn = theano.function([x], y) fn = theano.function([x], y, mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
# assert local_useless_topk opt is done properly # assert local_useless_topk opt is done properly
assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs) assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs)
...@@ -293,7 +301,7 @@ class Test_TopK(unittest.TestCase): ...@@ -293,7 +301,7 @@ class Test_TopK(unittest.TestCase):
goal = np.sort(xval)[idx] goal = np.sort(xval)[idx]
assert yval.dtype == goal.dtype assert yval.dtype == goal.dtype
utt.assert_allclose(np.sort(yval), goal) utt.assert_allclose(goal, np.sort(yval))
@utt.parameterized.expand(chain( @utt.parameterized.expand(chain(
product( product(
...@@ -309,7 +317,8 @@ class Test_TopK(unittest.TestCase): ...@@ -309,7 +317,8 @@ class Test_TopK(unittest.TestCase):
x = theano.tensor.vector(name='x', dtype=dtype) x = theano.tensor.vector(name='x', dtype=dtype)
y = argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype) y = argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype)
fn = theano.function([x], y) fn = theano.function([x], y, mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
# assert local_useless_topk opt is done properly # assert local_useless_topk opt is done properly
assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs) assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs)
...@@ -337,7 +346,8 @@ class Test_TopK(unittest.TestCase): ...@@ -337,7 +346,8 @@ class Test_TopK(unittest.TestCase):
x = theano.tensor.vector(name='x', dtype=dtype) x = theano.tensor.vector(name='x', dtype=dtype)
yv, yi = topk_and_argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype) yv, yi = topk_and_argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype)
fn = theano.function([x], [yv, yi]) fn = theano.function([x], [yv, yi], mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
# generate a all-unique array # generate a all-unique array
xval = gen_unique_vector(size, dtype) xval = gen_unique_vector(size, dtype)
yvval, yival = fn(xval) yvval, yival = fn(xval)
...@@ -363,7 +373,8 @@ class Test_TopK(unittest.TestCase): ...@@ -363,7 +373,8 @@ class Test_TopK(unittest.TestCase):
x = theano.tensor.vector(name='x', dtype=dtype) x = theano.tensor.vector(name='x', dtype=dtype)
y = argtopk(x, k, sorted=sorted, idx_dtype='int32') y = argtopk(x, k, sorted=sorted, idx_dtype='int32')
fn = theano.function([x], y) fn = theano.function([x], y, mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
xval = np.repeat(np.random.uniform(-100., 100., size=size // 2).astype(dtype), 2) xval = np.repeat(np.random.uniform(-100., 100., size=size // 2).astype(dtype), 2)
xval = xval[np.random.permutation(size)] xval = xval[np.random.permutation(size)]
yval = fn(xval) yval = fn(xval)
...@@ -372,7 +383,7 @@ class Test_TopK(unittest.TestCase): ...@@ -372,7 +383,7 @@ class Test_TopK(unittest.TestCase):
utt.assert_allclose(np.sort(xval[yval]), np.sort(xval[goal])) utt.assert_allclose(np.sort(xval[yval]), np.sort(xval[goal]))
@utt.parameterized.expand(product( @utt.parameterized.expand(product(
((17, 15), (2, 3, 5, 7, 11), (2017, 5, 3)), ((17, 15), (2, 3, 5, 7, 11), (500, 5, 3)), # NB: Test may fail with bigger sizes (e.g. (2017, 5, 3)) due to "too many resources requested" kernel error on some GPUs.
(-1, '(1+n)//2', '-n', '1-n'), (-1, '(1+n)//2', '-n', '1-n'),
('float32', 'int32'), ('float32', 'int32'),
(False,), (False,),
...@@ -391,7 +402,8 @@ class Test_TopK(unittest.TestCase): ...@@ -391,7 +402,8 @@ class Test_TopK(unittest.TestCase):
x = theano.tensor.tensor( x = theano.tensor.tensor(
name='x', broadcastable=(False,) * len(shp), dtype=dtype) name='x', broadcastable=(False,) * len(shp), dtype=dtype)
y = argtopk(x, k, axis=axis, sorted=sorted, idx_dtype=idx_dtype) y = argtopk(x, k, axis=axis, sorted=sorted, idx_dtype=idx_dtype)
fn = theano.function([x], y) fn = theano.function([x], y, mode=self.mode)
assert any([isinstance(n.op, self.op_class) for n in fn.maker.fgraph.apply_nodes])
size = reduce(int.__mul__, shp) size = reduce(int.__mul__, shp)
xval = gen_unique_vector(size, dtype).reshape(shp) xval = gen_unique_vector(size, dtype).reshape(shp)
yval = fn(xval) yval = fn(xval)
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment