Commit 5cc9c326 authored by abergeron, committed by GitHub

Merge pull request #5959 from khaotik/topk

TopKOp implementation
......@@ -1367,21 +1367,27 @@ class LocalOptGroup(LocalOptimizer):
self.process_count[opt] += 1
if not new_repl:
continue
else:
if self.profile:
self.node_created[opt] += len(graph.ops(fgraph.variables, new_repl))
self.applied_true[opt] += 1
break # break from the for loop over optimization.
if isinstance(new_repl, (tuple, list)):
new_vars = new_repl
else: # It must be a dict
new_vars = list(new_repl.values())
if self.profile:
self.node_created[opt] += len(graph.ops(fgraph.variables, new_vars))
self.applied_true[opt] += 1
break # break from the for loop over optimization.
if not new_repl: # No optimization applied in the last iteration
return repl
# only 1 iteration or we are at the start of the graph.
if not self.apply_all_opts or not new_repl[0].owner:
# only 1 iteration
if not self.apply_all_opts:
return new_repl
if not new_vars[0].owner:
# We are at the start of the graph.
return new_repl
if len(new_repl) > 1:
s = set([v.owner for v in new_repl])
assert len(s) == 1
repl = new_repl
node = repl[0].owner
node = new_vars[0].owner
@staticmethod
def print_profile(stream, prof, level=0):
......
......@@ -28,7 +28,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg, ctc
from . import fft, dnn, opt, extra_ops, multinomial, reduction, sort, rng_mrg, ctc
def transfer(x, target):
......
// modified from pytorch
// https://github.com/pytorch/pytorch/master/blob/torch/lib/THC/THCTensorTopK.cuh
// original license below:
/*
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
#if __CUDA_ARCH__ < 350
#define __ldg(ptr) (*(ptr))
#endif
typedef ptrdiff_t ssize_t;
// Index of the calling thread within its warp (0..31), read from the
// %laneid special register via inline PTX.
__device__ __forceinline__ int lane_id() {
    int id;
    // FIX: a literal '%' in an inline-PTX template must be escaped as '%%'
    // (single '%' introduces an operand reference); the lane_mask_* helpers
    // below already use the escaped form.
    asm("mov.s32 %0, %%laneid;" : "=r"(id) );
    return id;
}
// Mask of warp lanes whose index is strictly lower than the caller's
// (reads the %lanemask_lt special register).
__device__ __forceinline__ unsigned lane_mask_lt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
    return mask;
}
// Mask of warp lanes with index lower than or equal to the caller's.
__device__ __forceinline__ unsigned lane_mask_le() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
    return mask;
}
// Mask of warp lanes with index strictly greater than the caller's.
__device__ __forceinline__ unsigned lane_mask_gt() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
    return mask;
}
// Mask of warp lanes with index greater than or equal to the caller's.
__device__ __forceinline__ unsigned lane_mask_ge() {
    unsigned mask;
    asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
    return mask;
}
// Bitfield extract/insert helpers implemented with the PTX bfe/bfi
// instructions.
//   get(val, pos, len): the `len` bits of `val` starting at bit `pos`.
//   set(val, toInsert, pos, len): `val` with those bits replaced by the
//     low `len` bits of `toInsert`.
// Only the unsigned 32-bit and 64-bit specializations below are defined.
template <typename T>
struct Bitfield {};

template <>
struct Bitfield<unsigned int> {
    static __device__ __forceinline__
    unsigned int get(unsigned int val, int pos, int len) {
        unsigned int ret;
        asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
        return ret;
    }
    static __device__ __forceinline__
    unsigned int set(unsigned int val, unsigned int toInsert, int pos, int len) {
        unsigned int ret;
        asm("bfi.b32 %0, %1, %2, %3, %4;" :
            "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
        return ret;
    }
};

// 64-bit variant; note pos/len stay 32-bit register operands ("r").
template <>
struct Bitfield<unsigned long long int> {
    static __device__ __forceinline__
    unsigned long long int get(unsigned long long int val, int pos, int len) {
        unsigned long long int ret;
        asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
        return ret;
    }
    static __device__ __forceinline__
    unsigned long long int set(unsigned long long int val, unsigned long long int toInsert, int pos, int len) {
        unsigned long long int ret;
        asm("bfi.b64 %0, %1, %2, %3, %4;" :
            "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
        return ret;
    }
};
template <typename T>
struct RadixConfig {
    // Converts a type (maybe float) to an integer representation with the same
    // sorting; i.e., for floats f1, f2:
    //   if f1 < f2 then convert(f1) < convert(f2)
    // We use this to enable radix selection of floating-point values.
    // This also gives a relative order for NaNs, but that's ok, as they
    // will all be adjacent
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(T v) {
        return (RadixType)v;
    }
    // FIX: the generic fallback previously declared `float deconvert(...)`
    // regardless of T; it must return T so deconvert is the inverse of
    // convert for every instantiated type.
    static inline __device__ T deconvert(RadixType v) {
        return (T)v;
    }
};
// float: flip the sign bit; for negatives also flip all remaining bits,
// so unsigned integer order of the result matches float order.
template <>
struct RadixConfig<float> {
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(float v) {
        RadixType x = __float_as_int(v);
        RadixType mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
        return (x ^ mask);
    }
    static inline __device__ float deconvert(RadixType v) {
        RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
        return __int_as_float(v ^ mask);
    }
};
// double: same scheme as float on 64 bits; -((x >> 63)) is all-ones for
// negative values, so the XOR flips everything, else only the sign bit.
template <>
struct RadixConfig<double> {
    typedef unsigned long long RadixType;
    static inline __device__ RadixType convert(double v) {
        RadixType x = __double_as_longlong(v);
        RadixType mask = -((x >> 63)) | 0x8000000000000000;
        return (x ^ mask);
    }
    static inline __device__ double deconvert(RadixType v) {
        RadixType mask = ((v >> 63) - 1) | 0x8000000000000000;
        return __longlong_as_double(v ^ mask);
    }
};
// signed integers: bias by 2^(bits-1) so order is preserved as unsigned.
template <>
struct RadixConfig<char> {
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(char v) {
        return 128u + v;
    }
    static inline __device__ char deconvert(RadixType v) {
        return v - 128;
    }
};
template <>
struct RadixConfig<short> {
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(short v) {
        assert(sizeof(short) == 2);
        // NOTE(review): integer promotion sign-extends negative v, so bits
        // 16..31 of the result are set; the radix loop only inspects the low
        // bitsof(short) bits (it starts at bitsof(INPUT_TYPE)-RADIX_BITS),
        // where XOR with 0x8000 equals the usual +32768 bias mod 2^16.
        return 32768u ^ v;
    }
    static inline __device__ short deconvert(RadixType v) {
        return v - 32768;
    }
};
template <>
struct RadixConfig<int> {
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(int v) {
        assert(sizeof(int) == 4);
        return 2147483648u + v;
    }
    static inline __device__ int deconvert(RadixType v) {
        return v - 2147483648u;
    }
};
template <>
struct RadixConfig<long long> {
    typedef unsigned long long RadixType;
    static inline __device__ RadixType convert(long long v) {
        assert(sizeof(long long) == 8);
        return 9223372036854775808ull + v;
    }
    static inline __device__ long long deconvert(RadixType v) {
        return v - 9223372036854775808ull;
    }
};
#define USE_HALF $use_half
#if USE_HALF == 1
// since half is ushort, using macro to protect this part is necessary
// (same sign-flip scheme as float, applied to the low 16 bits).
template <>
struct RadixConfig<unsigned short> {
    typedef unsigned int RadixType;
    static inline __device__ RadixType convert(unsigned short v) {
        RadixType mask = -(((RadixType)v >> 15)) | 0x8000;
        return (v ^ mask);
    }
    static inline __device__ unsigned short deconvert(RadixType v) {
        RadixType mask = ((v >> 15) - 1) | 0x8000;
        return (unsigned short)(v ^ mask);
    }
};
#endif // USE_HALF
// $$inp_t should be replaced in c_code
// we cannot use templated kernel because gpuarray API does not support it
#define NDIM $ndim
#define INPUT_TYPE $inp_t
#define INDEX_TYPE $out_t
#define bitsof(T) (sizeof(T)*8)
#define radix_t RadixConfig<INPUT_TYPE>::RadixType
#define WRITE_VALUE $write_value
#define WRITE_INDEX $write_index
#if RADIX_SIZE > 32
#error "RADIX_SIZE must be smaller than warp size (32)"
#endif
// atomicAdd overload for signed long long: CUDA only provides the unsigned
// long long builtin, so reinterpret the bits (two's-complement addition is
// bit-identical for signed and unsigned). Unlike the builtin, this overload
// returns void.
void __device__ atomicAdd(long long *dst, long long &src) {
    atomicAdd(
        reinterpret_cast<unsigned long long*>(dst),
        reinterpret_cast<unsigned long long&>(src));
}
// Inclusive binary cumsum over a 1D thread block.
// `smem` needs one element per warp; the loop over
// blockDim.x / GA_WARP_SIZE warps assumes blockDim.x is a multiple of
// GA_WARP_SIZE (the host side rounds block_size up to the warp size).
template <typename T>
static inline __device__ T binary_cumsum(
    int idx, int warp_id, T* smem, bool value) {
    // cumsum within 1D thread block, which adds up `value` of all threads
    // whose id is *no greater than* the current thread
    // binary_cumsum(1, 0, 1, 0, 1) -> (1, 1, 2, 2, 3)

    // cumsum within warp: ballot collects one bit per lane, popc over the
    // lanes at or below this one gives the inclusive in-warp prefix sum
    unsigned int warp_bits = __ballot(value);
    T warp_sum = __popc(lane_mask_le() & warp_bits);
    if (lane_id() == 0)
        smem[warp_id] = __popc(warp_bits);
    local_barrier();
    // cumsum across warps in one thread; afterwards smem[i] holds the
    // inclusive sum of warps 0..i
    if (idx == 0) {
        T sum = smem[0];
        for (int i = 1; i < blockDim.x / GA_WARP_SIZE; ++i) {
            sum += smem[i];
            smem[i] = sum;
        }
    }
    local_barrier();
    // load the carry from the preceding warp
    if (warp_id >= 1) {
        warp_sum = warp_sum+smem[warp_id - 1];
    }
    return warp_sum;
}
// Exclusive binary cumsum over a 1D thread block; identical to
// binary_cumsum except the current thread's own `value` is excluded
// (lane_mask_lt instead of lane_mask_le). After it returns, smem[w] holds
// the inclusive per-warp totals, so smem[last_warp] is the block-wide count.
template <typename T>
static inline __device__ T binary_cumsum_exclusive(
    int idx, int warp_id, T* smem, bool value) {
    // cumsum within 1D thread block, which adds up `value` of all threads
    // whose id is *less than* the current thread
    // binary_cumsum_excl(1, 0, 1, 0, 1) -> (0, 1, 1, 2, 2)

    // cumsum within warp
    unsigned int warp_bits = __ballot(value);
    T warp_sum = __popc(lane_mask_lt() & warp_bits);
    if (lane_id() == 0)
        smem[warp_id] = __popc(warp_bits);
    local_barrier();
    // cumsum across warps in one thread
    if (idx == 0) {
        T sum = smem[0];
        for (int i = 1; i < blockDim.x / GA_WARP_SIZE; ++i) {
            sum += smem[i];
            smem[i] = sum;
        }
    }
    local_barrier();
    // load the carry from the preceding warp
    if (warp_id >= 1)
        warp_sum += smem[warp_id - 1];
    return warp_sum;
}
// apply raw(byte) offset to pointer (strides here are byte strides)
template <typename T>
static __device__ inline T* ptr_add(T *ptr, ssize_t offset) {
    return (T*)((char*)ptr + offset);
}
// get array element using raw(byte) offset; returns a reference so it can
// be used on the left-hand side of an assignment
template <typename T>
static __device__ inline T& ptr_at(T *ptr, ssize_t offset) {
    return *((T*)((char*)ptr + offset));
}
// read array element using raw(byte) offset; __ldg loads through the
// read-only cache (defined above as a plain dereference for
// __CUDA_ARCH__ < 350)
template <typename T>
static __device__ inline T ptr_read_cached(T *ptr, ssize_t offset) {
    return __ldg(((T*)((char*)ptr + offset)));
}
#define RADIX_BITS 4
#define RADIX_SIZE (1<<RADIX_BITS)
#define RADIX_MASK(n) ((RADIX_SIZE-1) << (n*RADIX_BITS))
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
// works when length on axis is within max allowed threads in block (1024):
// one thread block handles one slice along the top-k axis, one thread per
// element; top-k membership is decided by an MSB-first radix pass over the
// order-preserving integer form of the input.
KERNEL void k_topk_dense(
        $dims
        // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
        $dstv
        // INPUT_TYPE *dstv
        $dstv_strides
        // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
        $dsti
        // INDEX_TYPE *dsti
        $dsti_strides
        // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
        ssize_t k,
        INPUT_TYPE* src,
        $src_strides
        // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
        size_t size) {
    // per-warp digit histograms (up to 32 warps x RADIX_SIZE bins); also
    // reused as scratch by binary_cumsum_exclusive at the end
    __shared__ int smem[32 * RADIX_SIZE];
    // count of top-k slots still unresolved, shared across the block
    __shared__ int k2;
    const unsigned int idx = threadIdx.x;
    // is_topk: this thread's element is (so far) in the top k
    // is_topkth: this thread's element may be the k-th (boundary) value
    bool is_topk = (idx < size);
    bool is_topkth = is_topk;
    size_t out_idx;
    const unsigned char warp_id = idx / GA_WARP_SIZE;

    // 0. get the slice for thread block to work on
    size_t gid = blockIdx.x, gidx;
    $set_slice
    // $$set_slice expands into:
    //for(int i=1; i<NDIM; i++) {
    //  gidx = gid % dims_$${i};
    //  gid /= dims_$${i};
    //  dsti = ptr_add(dsti, gidx*dsti_strides_$${i};
    //  dstv = ptr_add(dstv, gidx*dstv_strides_$${i};
    //  src = ptr_add(src, gidx*src_strides_$${i});
    //}

    // get input and its radix friendly form
    const INPUT_TYPE xval = is_topk ? ptr_at(src, idx*src_strides_0) : (INPUT_TYPE)0;
    radix_t x = RadixConfig<INPUT_TYPE>::convert(xval);
    // resolve negative k: complementing the radix form reverses the order,
    // turning "k smallest" into "k largest"
    if (k<0) { x = ~x; k = -k; }
    if (idx==0)
        k2 = k;

    // 1. filter is_topk and is_topkth using radix select, one RADIX_BITS
    //    digit per iteration from most to least significant
    #pragma unroll
    for (int i=bitsof(INPUT_TYPE)-RADIX_BITS; i>=0; i-=RADIX_BITS) {
        const int digit = Bitfield<radix_t>::get(x, i, RADIX_BITS);
        /*int digit = (x>>i) & (RADIX_SIZE-1);*/
        // count within warp
        #pragma unroll
        for (int bin=0; bin<RADIX_SIZE; ++bin) {
            bool vote = (bin == digit) && is_topkth;
            unsigned int votes = __ballot(vote);
            if (lane_id()==0)
                smem[bin + RADIX_SIZE*warp_id] = __popc(votes);
        }
        local_barrier();
        // sum counts across all warps
        if (idx < RADIX_SIZE) {
            int sum = smem[idx];
            #pragma unroll
            for(int w=RADIX_SIZE; w<blockDim.x*RADIX_SIZE / GA_WARP_SIZE; w+=RADIX_SIZE)
                sum += smem[idx + w];
            smem[idx] = sum;
        }
        local_barrier();
        // find the bucket and update k2
        // smem[:RADIX_SIZE:-1] = k2 - cumsum(smem[:RADIX_SIZE-1:-1])
        if (idx == 0) {
            int sum = k2;
            #pragma unroll
            for (int bin=RADIX_SIZE-1; bin>=0; --bin) {
                sum -= smem[bin];
                smem[bin] = sum;
                k2 = (sum > 0) ? sum : k2;
            }
            // sentinel so that smem[digit+1] below is valid for the top bin
            smem[RADIX_SIZE] = 1;
        }
        local_barrier();
        if (is_topkth) {
            is_topk &= (smem[digit+1] > 0);
            is_topkth &= (smem[digit] <= 0) && (smem[digit+1] > 0);
        }
        local_barrier();
    }

    // set k2 as number of exceeding values
    if (idx==0) {
        #pragma unroll
        for (int bin=RADIX_SIZE-1; bin>=0; --bin) {
            if (smem[bin] <= 0)
                break;
            k2 = smem[bin];
        }
    }
    local_barrier();

    // 2. find the index of output array, if exists
    if (k2 != 0) {
        // top_kth value may not be unique, so we need to
        // perform binary cumsum on is_topkth to drop exceeding top-kth values
        out_idx = binary_cumsum_exclusive(idx, warp_id, smem, is_topkth);
        if ((out_idx >= k2) && is_topkth)
            is_topk = false;
        local_barrier();
    }

    // perform binary cumsum on is_topk to determine the indices to put result
    out_idx = binary_cumsum_exclusive(idx, warp_id, smem, is_topk);
    if (is_topk) {
#if WRITE_VALUE == 1
        ptr_at(dstv, out_idx * dstv_strides_0) = xval;
#endif
#if WRITE_INDEX == 1
        ptr_at(dsti, out_idx * dsti_strides_0) = (INDEX_TYPE)idx;
#endif
    }
}
#define RADIX_BITS 2
#define RADIX_SIZE (1<<RADIX_BITS)
#define RADIX_DIGITS(T) (bitsof(T)/RADIX_BITS)
#define COUNT_TYPE $count_t
#define KERNEL_NAME $kname
// if count_t is int, work for array size within [1025, 2^31-1]
// if count_t is long long, work for array size within [2^31, 2^63-1]
// Block-wide search for the unique element v of `data` whose radix form
// satisfies (convert(v) & known_bits_mask) == known_bits. The first two
// smem slots are used as a found-flag and the found value; the result is
// broadcast to (returned by) every thread. Returns 0 if nothing matches.
template <typename DataType, typename RadixType, typename CountType>
__device__ DataType find_pattern(DataType* smem,
                                 DataType* data,
                                 CountType slice_size,
                                 CountType stride,
                                 RadixType known_bits,
                                 RadixType known_bits_mask) {
    if (threadIdx.x < 32)
        smem[threadIdx.x] = 0;
    local_barrier();

    // All threads participate in the loop, in order to sync on the flag;
    // the bound is rounded up so every thread hits every barrier.
    for (CountType i = threadIdx.x; i < (slice_size + (CountType)blockDim.x-1); i += blockDim.x) {
        bool in_range = (i < slice_size);
        DataType v = in_range ? ptr_read_cached(data, i*stride) : 0;
        if (in_range && ((RadixConfig<DataType>::convert(v) & known_bits_mask) == known_bits)) {
            // There should not be conflicts if we are using find_pattern,
            // since the result is unique
            smem[0] = 1;
            smem[1] = v; // can't use val as the flag, since it could be 0
        }
        local_barrier();
        DataType found = smem[0];
        DataType val = smem[1];
        local_barrier();
        // Check to see if a thread found the value
        if (found != 0)
            return val;
    }
    return 0;
}
// This function counts the distribution of all input values in a
// slice we are selecting by radix digit at `radix_digit_pos`, but only
// those that pass the filter `((v & known_bits_mask) == known_bits)`.
// This produces and broadcasts the seen counts for a single block only.
// `smem` must have at least `RADIX_SIZE` elements.
// On return every thread holds the block-wide histogram in `counts`.
template <typename DataType, typename RadixType, typename CountType>
__device__ void count_radix_masked(CountType counts[RADIX_SIZE],
                                   CountType* smem,
                                   RadixType known_bits,
                                   RadixType known_bits_mask,
                                   int radix_digit_pos,
                                   CountType slice_size,
                                   CountType stride,
                                   DataType* data) {
    // Clear out per-thread counts from a previous round
    #pragma unroll
    for (int i = 0; i < RADIX_SIZE; ++i)
        counts[i] = 0;
    if (threadIdx.x < RADIX_SIZE)
        smem[threadIdx.x] = 0;
    local_barrier();

    // Scan over all the data. Upon a read, the warp will accumulate
    // counts per each digit in the radix using warp voting.
    for (CountType i = threadIdx.x; i < slice_size; i += blockDim.x) {
        RadixType val = RadixConfig<DataType>::convert(ptr_read_cached(data, i*stride));
        bool has_val = ((val & known_bits_mask) == known_bits);
        RadixType digit_in_radix = Bitfield<RadixType>::get(val, radix_digit_pos, RADIX_BITS);
        #pragma unroll
        for (int j = 0; j < RADIX_SIZE; ++j) {
            bool vote = has_val && (digit_in_radix == j);
            counts[j] += __popc(__ballot(vote));
        }
    }

    // Now, for each warp, sum values: lane 0 carries the whole warp's
    // counts (every lane computed the same __ballot result)
    if (lane_id() == 0) {
        for (int i=0; i<RADIX_SIZE; ++i)
            atomicAdd(&smem[i], counts[i]);
    }
    /*
    // not sure why, but this just give wrong results
    if (lane_id() < RADIX_SIZE)
        atomicAdd(&smem[lane_id()], counts[lane_id()]);
    */
    local_barrier();

    // For each thread, read in the total counts
    #pragma unroll
    for (unsigned int i = 0; i < RADIX_SIZE; ++i)
        counts[i] = smem[i];
    local_barrier();
}
// Cooperative radix selection of the k-th largest (order=true) or k-th
// smallest (order=false) element of a strided slice. All threads of the
// block participate and all receive the same *top_kth result.
template <typename DataType, typename RadixType, typename CountType>
__device__ void radix_select(DataType* data,
                             CountType k,
                             bool order,
                             CountType slice_size,
                             CountType stride,
                             CountType* smem,
                             DataType* top_kth) {
    // Per-thread buckets into which we accumulate digit counts in our
    // radix
    register CountType counts[RADIX_SIZE];

    // We only consider elements x such that (x & known_bits_mask) == known_bits
    // Initially, we consider all elements of the array, so the above
    // statement is true regardless of input.
    RadixType known_bits = 0, known_bits_mask = 0;

    // We are looking for the top k_to_find-th element when iterating over
    // digits; this count gets reduced by elimination when counting
    // successive digits
    CountType k_to_find = abs(k);

    // We start at the most significant digit in our radix, scanning
    // through to the least significant digit
    #pragma unroll
    for (int digit_pos = bitsof(DataType) - RADIX_BITS;
         digit_pos >= 0; digit_pos -= RADIX_BITS) {
        // Count radix distribution for the current position and reduce
        // across all threads
        count_radix_masked<DataType, RadixType, CountType>(
            counts, smem,
            known_bits, known_bits_mask, digit_pos,
            slice_size, stride, data);

        // All threads participate in the comparisons below to know the
        // final result
        #define CHECK_RADIX(i) \\
        int count = counts[i]; \\
        /* All threads have the same value in counts here, so all */ \\
        /* threads will return from the function. */ \\
        if (count == 1 && k_to_find == 1) { \\
            /* There is a unique answer. */ \\
            known_bits = Bitfield<RadixType>::set( \\
                known_bits, i, digit_pos, RADIX_BITS); \\
            known_bits_mask = Bitfield<RadixType>::set( \\
                known_bits_mask, RADIX_SIZE-1, digit_pos, RADIX_BITS); \\
            /* The answer is now the unique element v such that: */ \\
            /* (v & known_bits_mask) == known_bits */ \\
            /* However, we do not yet know what the actual element is. We */ \\
            /* need to perform a search through the data to find the */ \\
            /* element that matches this pattern. */ \\
            *top_kth = find_pattern<DataType, RadixType, CountType>( \\
                (DataType*) smem, data, slice_size, \\
                stride, known_bits, known_bits_mask); \\
            return; \\
        } \\
        if (count >= k_to_find) { \\
            known_bits = Bitfield<RadixType>::set(known_bits, i, digit_pos, RADIX_BITS); \\
            known_bits_mask = Bitfield<RadixType>::set( \\
                known_bits_mask, RADIX_SIZE-1, digit_pos, RADIX_BITS); \\
            /* The top-Kth element v must now be one such that: */ \\
            /* (v & known_bits_mask == known_bits) */ \\
            /* but we haven't narrowed it down; we must check the next */ \\
            /* least-significant digit */ \\
            break; \\
        } \\
        k_to_find -= count

        // scan bins from the winning end: high digits first for the
        // largest-k case, low digits first for the smallest-k case
        if (order) {
            #pragma unroll
            for (int i=RADIX_SIZE - 1; i >= 0; --i) {
                CHECK_RADIX(i);
            }
        } else {
            #pragma unroll
            for (int i=0; i < RADIX_SIZE; ++i) {
                CHECK_RADIX(i);
            }
        }
        #undef CHECK_RADIX
    } // end digit_pos for

    // There is no unique result, but there is a non-unique result
    // matching `known_bits` exactly
    *top_kth = RadixConfig<DataType>::deconvert(known_bits);
}
// Top-k for slices longer than one thread block's worth of threads:
// radix_select finds the k-th value, then the slice is swept twice —
// first writing strict winners, then as many ties with the k-th value
// as are needed to fill exactly k outputs.
KERNEL void KERNEL_NAME(
        $dims
        // size_t dims_1, ssize_t dims_2, ... , dims_$${NDIM}
        $dstv
        // INPUT_TYPE *dstv
        $dstv_strides
        // ssize_t dstv_strides_0, ssize_t dstv_strides_1, ... , dstv_strides_$${NDIM}
        $dsti
        // INDEX_TYPE *dsti
        $dsti_strides
        // ssize_t dsti_strides_0, ssize_t dsti_strides_1, ... , dsti_strides_$${NDIM}
        ssize_t k,
        INPUT_TYPE* src,
        $src_strides
        // ssize_t src_strides_0, ssize_t src_strides_1, ... , src_strides_$${NDIM}
        size_t size) {
    // scratch shared by radix_select / find_pattern / cumsums;
    // 32 entries = one per warp at the 1024-thread maximum
    __shared__ COUNT_TYPE smem[32];
    INPUT_TYPE topkth_value;
    // positive k selects the largest values, negative the smallest
    const bool order = (k>0);
    k = (order ? k : -k);
    const int idx = threadIdx.x;
    const int warp_id = idx / GA_WARP_SIZE;

    // get the slice for thread block to work on
    // size <- the axis to work on
    // dims_1+ <- batched dimensions
    unsigned int gid = blockIdx.x, gidx;
    $set_slice
    // $$set_slice expands into:
    //for(int i=1; i<NDIM; i++) {
    //  gidx = gid % dims_$${i};
    //  gid /= dims_$${i};
    //  dsti = ptr_add(dsti, gidx*dsti_strides_$${i});
    //  dstv = ptr_add(dstv, gidx*dstv_strides_$${i});
    //  src = ptr_add(src, gidx*src_strides_$${i});
    //}

    radix_select<INPUT_TYPE, radix_t, COUNT_TYPE>(
        src, k, order, size, src_strides_0,
        smem, &topkth_value);

    // Every value that is strictly less/greater than `pattern`
    // (depending on sort dir) in sorted int format is in the top-K.
    // The top-K value itself might not be unique.
    //
    // Since there are a variable number of elements that we see that
    // are within the top-k, we don't know at what index to write out
    // the resulting values.
    // In order to get this, we perform an exclusive cumsum of
    // `has_topk`. This will return the resulting index into which we
    // need to write the result, if a thread has a result.

    // All threads need to participate in the loop and the cumsum
    // but not necessarily in the load; hence loop bounds being rounded
    // up to a multiple of the block dim.
    COUNT_TYPE iter_bound = size + blockDim.x-1;
    INDEX_TYPE write_base = 0;
    for (int i = idx; i < iter_bound; i += blockDim.x) {
        bool in_range = (i < size);
        INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0;
        bool has_topk;
        if (order) {
            has_topk = in_range && (v > topkth_value);
        } else {
            has_topk = in_range && (v < topkth_value);
        }

        int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
        // after the cumsum, the last warp's smem slot is the number of
        // writers in this whole pass over blockDim.x elements
        int carry = smem[blockDim.x / 32 - 1];

        if (has_topk) {
            COUNT_TYPE write_idx = write_base + index;
#if WRITE_VALUE == 1
            ptr_at(dstv, write_idx * dstv_strides_0) = v;
#endif
#if WRITE_INDEX == 1
            ptr_at(dsti, write_idx * dsti_strides_0) = (INDEX_TYPE)i;
#endif
        }
        write_base += carry;
    }

    // second sweep: fill the remaining slots with elements equal to the
    // k-th value, stopping once exactly k outputs have been written
    COUNT_TYPE topk_remaining = (k - write_base);
    for (COUNT_TYPE i = idx; i < iter_bound; i += blockDim.x) {
        bool in_range = (i < size);
        INPUT_TYPE v = in_range ? ptr_read_cached(src, i*src_strides_0) : 0;
        bool has_topk = in_range && (v == topkth_value);

        int index = binary_cumsum_exclusive(idx, warp_id, smem, has_topk);
        int carry = smem[blockDim.x / 32 - 1];

        if (has_topk && index < topk_remaining) {
            COUNT_TYPE write_idx = write_base + index;
#if WRITE_VALUE == 1
            ptr_at(dstv, write_idx * dstv_strides_0) = v;
#endif
#if WRITE_INDEX == 1
            ptr_at(dsti, write_idx * dsti_strides_0) = (INDEX_TYPE)i;
#endif
        }

        if (carry >= topk_remaining)
            break;
        topk_remaining -= carry;
        write_base += carry;
    }
}
from __future__ import absolute_import, print_function, division
import os
from string import Template
from theano import Apply
from theano.tensor import as_tensor_variable
from theano.tensor.sort import TopKOp
from .basic_ops import (GpuKernelBase, Kernel, infer_context_name,
as_gpuarray_variable, gpuarray_helper_inc_dir)
from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType
try:
import pygpu
import pygpu.gpuarray as ga
except ImportError as e:
# To make sure theano is importable
pass
# TODO GPU sort / argsort
class GpuTopKOp(GpuKernelBase, TopKOp):
    """
    Implements TopKOp on gpu.

    Only CUDA devices are supported; the kernels are rendered from the
    templates in ``c_code/topk_dense*`` via ``string.Template``.
    """
    __props__ = TopKOp.__props__
    _f16_ok = True

    def __init__(
        self, axis=-1,
        sorted=True,
        idx_dtype='int64',
        return_values=True,
        return_indices=True
    ):
        GpuKernelBase.__init__(self)
        TopKOp.__init__(
            self, axis=axis,
            sorted=sorted,
            idx_dtype=idx_dtype,
            return_values=return_values,
            return_indices=return_indices)

    def c_headers(self):
        return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']

    def c_header_dirs(self):
        return [
            os.path.dirname(__file__),
            gpuarray_helper_inc_dir(),
            pygpu.get_include()]

    def c_code_cache_version(self):
        # bumped: generated C code changed (params_di fix, k param type)
        return (2,)

    def gpu_kernels(self, node, nodename):
        """Compile the three dense top-k kernel variants for `node`.

        Returns
        -------
        list of Kernel
            ``k_topk_dense`` (axis length fits in one block),
            ``k_topk_dense_large`` (32-bit counters) and
            ``k_topk_dense_xlarge`` (64-bit counters).
        """
        # load kernel source
        device_type = node.inputs[0].type.context.kind
        kernel_ext = {b'cuda': '.cu', b'opencl': '.cl'}[device_type]
        common_ext = {b'cuda': '.cuh', b'opencl': '.h'}[device_type]

        # prepare "$" macros for string.Template substitution
        if device_type == b'cuda':
            ndim = node.inputs[0].ndim
            dstv_strides_code = ''.join('ssize_t dstv_strides_%d, ' % i for i in range(ndim))
            dsti_strides_code = ''.join('ssize_t dsti_strides_%d, ' % i for i in range(ndim))
            src_strides_code = ''.join('ssize_t src_strides_%d, ' % i for i in range(ndim))
            # per-batched-dimension pointer adjustment, expanded into the
            # kernels' "$set_slice" placeholder
            set_slice_code = '''
        gidx = gid %% dims_%(i)d;
        gid /= dims_%(i)d;
        {dstv};
        {dsti};
        src = ptr_add(src, gidx*src_strides_%(i)d);\n'''.format(
                dstv='dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)' if self.return_values else '',
                dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '')
            set_slice_code = ''.join(
                set_slice_code % dict(i=j) for j in range(1, ndim))
            flags = Kernel.get_flags(node.inputs[0].dtype)
            subs = dict(
                inp_t=ga.dtype_to_ctype(node.inputs[0].dtype),
                out_t=ga.dtype_to_ctype(self.idx_dtype),
                dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)),
                dstv='INPUT_TYPE *dstv,' if self.return_values else '',
                dsti='INDEX_TYPE *dsti,' if self.return_indices else '',
                dstv_strides=dstv_strides_code if self.return_values else '',
                dsti_strides=dsti_strides_code if self.return_indices else '',
                src_strides=src_strides_code,
                set_slice=set_slice_code,
                write_value=int(self.return_values),
                write_index=int(self.return_indices),
                ndim=str(ndim),
                use_half=int(node.inputs[0].dtype == 'float16')
            )
        elif device_type == b'opencl':
            raise NotImplementedError()

        # setup parameters, mirroring the kernel signatures:
        # dims..., [dst, dst_strides...] per output, k, src, src_strides..., size
        param_types = [ga.SIZE] * (ndim - 1)  # dims
        for _ in range(self.return_values + self.return_indices):
            param_types.append(ga.GpuArray)  # dst*
            param_types.extend([ga.SSIZE] * ndim)  # dst*_strides
        # FIX: k is declared `ssize_t k` in the kernels and may legitimately
        # be negative (selects the k smallest), so it must be SSIZE not SIZE
        param_types.append(ga.SSIZE)  # k
        param_types.append(ga.GpuArray)  # src
        param_types.extend([ga.SSIZE] * ndim)  # src_strides
        param_types.append(ga.SIZE)  # size

        # load and compile kernels
        with open(os.path.join(
            os.path.dirname(__file__), 'c_code', 'topk_common' + common_ext
        )) as f:
            common_src = f.read()

        kernels = []

        def build_kernel(fname, kname, subs):
            # render one template file against `subs` and wrap it in a Kernel
            with open(os.path.join(
                os.path.dirname(__file__), 'c_code', fname)
            ) as f:
                kernel_src = f.read()
            ker = Kernel(
                code=Template(common_src + kernel_src).substitute(**subs),
                name=kname,
                params=param_types,
                flags=flags,
                objvar=kname + nodename)
            return ker

        subs['count_t'] = 'int'
        kernels.append(
            build_kernel('topk_dense' + kernel_ext, 'k_topk_dense', subs))
        subs['kname'] = 'k_topk_dense_large'
        kernels.append(
            build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_large', subs))
        # xlarge variant: 64-bit counters for axis length >= 2**31
        subs['count_t'] = 'long long'
        subs['kname'] = 'k_topk_dense_xlarge'
        kernels.append(
            build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_xlarge', subs))
        return kernels

    def c_code(self, node, nodename, inps, outs, sub):
        """Generate the host-side C code that validates k, allocates the
        outputs and dispatches one of the three kernels by axis length."""
        context = node.inputs[0].type.context
        if context.kind != b'cuda':
            raise NotImplementedError(
                '%s: We only have CUDA '
                'implementation so far.' % self.__class__.__name__)
        x, k = inps
        inp_dtc = ga.dtype_to_typecode(node.inputs[0].dtype)
        if not self.return_indices:
            yv, = outs
        elif self.return_values:
            yv, yi = outs
        else:
            yi, = outs
        out_dtype_s = self.idx_dtype
        out_dtc = ga.dtype_to_typecode(out_dtype_s)
        fail = sub['fail']
        ctx = sub['params']
        k_dtype = node.inputs[1].type.dtype_specs()[1]
        # max threads per block
        MAX_TPB = context.maxlsize
        # max blocks per grid
        MAX_BPG = context.maxgsize0
        WARP_SIZE = 32

        ndim = node.inputs[0].ndim
        # move the top-k axis to the front; the kernels treat axis 0 as the
        # selection axis and the rest as batched dimensions
        reordered_axes = list(range(ndim))
        axis = self.axis % ndim
        del reordered_axes[axis]
        reordered_axes = [axis] + reordered_axes
        dims = ''.join('(void*)(dims+%d), ' % i for i in reordered_axes[1:])
        prep_output = ''

        if self.return_values:
            def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv
            params_dv = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % (yv, yv)
            params_dv += ''.join('(void*)(dvstrides+%d), ' % i for i in reordered_axes)
            prep_output += '''
    if (0 != theano_prep_output(
        &%(yv)s, %(ndim)d, odims,
        %(inp_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
        else:
            def_dvstrides = params_dv = ''

        if self.return_indices:
            def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi
            # FIX: the format string contains two '%s' placeholders, so it
            # needs the (yi, yi) tuple; '% yi' raised TypeError during
            # code generation (compare params_dv above).
            params_di = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % (yi, yi)
            params_di += ''.join('(void*)(distrides+%d), ' % i for i in reordered_axes)
            prep_output += '''
    if (0 != theano_prep_output(
        &%(yi)s, %(ndim)d, odims,
        %(out_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
        else:
            def_distrides = params_di = ''

        sstrides = ', '.join('(void*)(sstrides+%d)' % i for i in reordered_axes)
        code = '''
{
    const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
    const size_t *dims = PyGpuArray_DIMS(%(x)s);
    size_t odims[%(ndim)d];
    for (int i=0; i<%(ndim)d; i++)
        odims[i] = dims[i];
    odims[%(axis)d] = k_>=0 ? k_ : -k_;

    if (0 == odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth must not be zero");
        %(fail)s;
    } else if (dims[%(axis)d] < odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth cannot be larger than the size of specified axis %(axis)d");
        %(fail)s;
    }
    %(prep_output)s

    size_t grid_size=1, block_size=1;
    for (int i=0; i<%(ndim)d; ++i) {
        if (i!=%(axis)d)
            grid_size *= dims[i];
        else
            block_size = dims[i];
    }
    // round up to multiples of warp size
    block_size = ((block_size + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;

    if (grid_size > %(MAX_BPG)d) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: too many slices to work with, expected <= %(MAX_BPG)d");
        %(fail)s;
    }

    %(def_dvstrides)s;
    %(def_distrides)s;
    const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);

    void* args[] = {
        %(dims)s
        %(params_dv)s
        %(params_di)s
        (void*)(&k_),
        (void*)((char*)(%(x)s->ga.data) + (%(x)s->ga.offset)),
        %(sstrides)s,
        (void*)(dims+%(axis)d),
    };

    int err;
    if (dims[%(axis)d] > (1u << 31)) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_xlarge%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    } else if (block_size > %(MAX_TPB)d) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_large%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    } else {
        err = GpuKernel_call(
            &k_topk_dense%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    }
    if (err != GA_NO_ERROR) {
        PyErr_SetString(
            PyExc_RuntimeError,
            "topk: gpu kernel failed to execute");
        %(fail)s;
    }
}
'''
        return code % locals()

    def make_node(self, inp, kth):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        kth = as_tensor_variable(kth)
        bcast = inp.type.broadcastable
        outs = []
        if self.return_values:
            outs.append(inp.type())
        if self.return_indices:
            outs.append(GpuArrayType(
                dtype=self.idx_dtype,
                broadcastable=bcast,
                context_name=ctx_name)())
        return Apply(self, [inp, kth], outs)

    def get_params(self, node):
        # kernel launch needs the gpu context of the input
        return node.inputs[0].type.context
@register_opt('fast_compile')
@op_lifter([TopKOp], cuda_only=True)
@register_opt2([TopKOp], 'fast_compile')
def local_gpua_topkop(op, ctx_name, inputs, outputs):
    """Lift a host-side TopKOp to a GpuTopKOp with the same attributes."""
    x_host, k = inputs
    x_gpu = as_gpuarray_variable(x_host, ctx_name)
    gpu_op = GpuTopKOp(
        axis=op.axis,
        sorted=op.sorted,
        idx_dtype=op.idx_dtype,
        return_values=op.return_values,
        return_indices=op.return_indices)
    return gpu_op(x_gpu, k)
......@@ -40,7 +40,7 @@ from theano.tensor import nnet # used for softmax, sigmoid, etc.
from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \
jacobian, hessian, consider_constant
from theano.tensor.sort import sort, argsort
from theano.tensor.sort import sort, argsort, topk, argtopk, topk_and_argtopk
from theano.tensor.extra_ops import (DiffOp, bincount, squeeze,
repeat, bartlett, fill_diagonal, fill_diagonal_offset,
cumsum, cumprod, unravel_index, ravel_multi_index)
......
......@@ -35,6 +35,7 @@ from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice,
advanced_subtensor,
advanced_subtensor1,
advanced_inc_subtensor1)
from theano.tensor.sort import TopKOp
from theano import scalar
from theano.scalar import basic
from theano.tensor import basic as T
......@@ -7548,3 +7549,35 @@ def local_merge_alloc(node):
dim_outer, T.eq(dim_outer, dim_inner))
i += 1
return [T.alloc(inputs_inner[0], *dims_outer)]
@register_useless('fast_compile')
@gof.local_optimizer([TopKOp])
def local_useless_topk(node):
    """
    TopKOp generates two outputs by default.

    This opt removes the useless one, replacing the node by an equivalent
    TopKOp that only computes the output that actually has clients.
    """
    op = node.op
    if not isinstance(op, TopKOp):
        return
    if not (op.return_values and op.return_indices):
        # already single-output; nothing to strip
        return False
    x, k = node.inputs
    ret_val = bool(node.outputs[0].clients)
    ret_idx = bool(node.outputs[1].clients)
    if not (ret_val ^ ret_idx):
        # both true -> nothing to remove
        # both false -> let pruner handle
        return False

    # exactly one output is used; `ret_idx` doubles as its index (False -> 0)
    old_output = node.outputs[ret_idx]
    # propagate `sorted`: it is part of __props__, and omitting it would
    # silently reset the attribute to its default (True)
    new_output = TopKOp(
        axis=op.axis,
        sorted=op.sorted,
        idx_dtype=op.idx_dtype,
        return_values=ret_val,
        return_indices=ret_idx)(x, k)
    return {old_output: new_output}
......@@ -2,6 +2,22 @@ from __future__ import absolute_import, print_function, division
import numpy as np
import theano
from theano.tensor.basic import mul, arange
from theano.gradient import grad_undefined
from theano.tensor.subtensor import set_subtensor
def _variable_is_none(var):
    """Tell whether `var` is a theano Constant wrapping the value None."""
    if not isinstance(var, theano.Constant):
        return False
    return var.data is None
def _check_tensor_is_scalar(var):
'''
Checks if a tensor variable is scalar, raise ValueError otherwise
'''
msg = '%(var)s is expected to be 0d tensor, got %(ndim)d'
if var.ndim != 0:
raise ValueError(
msg % (var, var.ndim))
class SortOp(theano.Op):
......@@ -33,8 +49,7 @@ class SortOp(theano.Op):
z[0] = np.sort(a, axis, self.kind, self.order)
def infer_shape(self, node, inputs_shapes):
if (isinstance(node.inputs[1], theano.Constant) and
node.inputs[1].data is None):
if _variable_is_none(node.inputs[1]):
# That means axis = None,
# So the array is flattened before being sorted
return [(mul(*inputs_shapes[0]),)]
......@@ -49,7 +64,7 @@ class SortOp(theano.Op):
a, axis = inputs
indices = self.__get_argsort_indices(a, axis)
inp_grad = output_grads[0][tuple(indices)]
axis_grad = theano.gradient.grad_undefined(
axis_grad = grad_undefined(
self, 1, axis,
"The gradient of sort is not defined "
"with respect to the integer axes itself")
......@@ -162,8 +177,7 @@ class ArgSortOp(theano.Op):
dtype=node.outputs[0].dtype)
def infer_shape(self, node, inputs_shapes):
if (isinstance(node.inputs[1], theano.Constant) and
node.inputs[1].data is None):
if _variable_is_none(node.inputs[1]):
return [(mul(*inputs_shapes[0]),)]
# axis should not be None, so there should be the same number of
# dimensions in the input and output
......@@ -175,7 +189,7 @@ class ArgSortOp(theano.Op):
# No grad defined for intergers.
inp, axis = inputs
inp_grad = inp.zeros_like()
axis_grad = theano.gradient.grad_undefined(
axis_grad = grad_undefined(
self, 1, axis,
"argsort is not defined for non-integer axes so"
" argsort(x, axis+eps) is undefined")
......@@ -206,3 +220,339 @@ def argsort(a, axis=-1, kind='quicksort', order=None):
a = a.flatten()
axis = 0
return ArgSortOp(kind, order)(a, axis)
def _topk_py_impl(op, x, k, axis, idx_dtype):
ndim = x.ndim
assert -ndim <= axis < ndim
axis %= ndim
if k == 0:
raise ValueError('topk: kth cannot be zero')
elif k > x.shape[axis]:
raise ValueError(
'topk: kth cannot be larger than the size of specified axis %d' % axis)
if abs(k) == 1:
# negative k means min instead of max
fn_max = [None, np.max, np.min][k]
fn_argmax = [None, np.argmax, np.argmin][k]
if not op.return_indices:
return np.expand_dims(fn_max(x, axis=axis), axis)
elif op.return_values:
zi = np.expand_dims(
fn_argmax(x, axis=axis), axis)
idx2 = tuple(
np.arange(s).reshape(
(s,) + (1,) * (ndim - i - 1)
) if i != axis else zi for i, s in enumerate(x.shape))
zv = x[idx2]
return zv, zi.astype(idx_dtype)
else:
zi = np.expand_dims(
fn_argmax(x, axis=axis), axis)
return zi.astype(idx_dtype)
if x.shape[axis] == abs(k):
if not op.return_indices:
return x.copy()
else:
l = axis
r = ndim - l
reps = list(x.shape)
reps[axis] = 1
zi = np.arange(abs(k), dtype=idx_dtype)
zi = zi.reshape((1,) * l + (k,) + (1,) * (r - 1))
zi = np.tile(zi, reps)
if op.return_values:
return x.copy(), zi
else:
return zi
idx = [slice(None)] * ndim
idx[axis] = (slice(-k, None) if k > 0 else slice(-k))
if not op.return_indices:
zv = np.partition(x, -k, axis=axis)[idx]
return zv
elif op.return_values:
zi = np.argpartition(x, -k, axis=axis)[idx]
idx2 = tuple(
np.arange(s).reshape(
(s,) + (1,) * (ndim - i - 1)
) if i != axis else zi for i, s in enumerate(x.shape))
zv = x[idx2]
return zv, zi.astype(idx_dtype)
else:
zi = np.argpartition(x, -k, axis=axis)[idx]
return zi.astype(idx_dtype)
class TopKOp(theano.Op):
    """
    Operations related to finding k-largest elements.

    Parameters
    ----------
    axis: integer
        Defaults to ``-1``.
        The axis to perform the operation. Must be in range ``[-ndim, ndim)``, where
        ``ndim`` is the dimensionality of input tensor.

    idx_dtype: string
        Specify output dtype for indices, defaults to ``int64``, must be integer type.

    sorted: bool
        NOTE: NOT IMPLEMENTED YET
        Defaults to ``True``
        If True, the result array would be sorted in descending order.

    Notes
    -----
    - By default, this Op give two outputs: values and indices. However optimizer may
      remove a certain output if not needed.

    - Computing gradient is only possible when both values and indices are computed in
      forward pass.

    - If the top-k-th value is not unique, we cannot guarantee the output indices being
      deterministically chosen.

    See Also
    --------
    topk
    argtopk
    argtopk_and_topk

    """

    # TODO more params
    '''
    only_top_kth: bool
        Defaults to ``False``
        If ``True``, will only find one exact top k-th element on given axis.
    '''

    # TODO c_code
    # TODO add opt, if k==1, use max/min reduce
    #      also if k is axis size, just copy input tensor
    # TODO add opt, to merge argtopk / topk
    __props__ = ('axis', 'sorted', 'return_values', 'return_indices', 'idx_dtype')

    def __init__(
        self,
        axis=-1,
        sorted=True,
        idx_dtype='int64',
        return_values=True,
        return_indices=True
    ):
        """Validate parameters and store them as op properties."""
        # numpy always uses int64 as output dtype for arg*() routines
        # however, we add "idx_dtype" param as memory is more precious on gpu
        if not isinstance(axis, int):
            raise TypeError(
                '"axis" parameter must be integer, got "%s"' % type(axis))
        if idx_dtype not in theano.tensor.integer_dtypes:
            raise TypeError(
                '"idx_dtype" parameter must be an integer dtype, got "%s"' % idx_dtype)
        if not (return_indices or return_values):
            raise ValueError(
                "Neither return_values nor return_indices is True, this isn't allowed")
        self.axis = axis
        self.sorted = sorted
        self.return_values = return_values
        self.return_indices = return_indices
        self.idx_dtype = idx_dtype

    def __str__(self):
        # e.g. "TopKOp{axis=-1, sorted=False}"; uses the runtime class name
        # so subclasses (e.g. a GPU version) print correctly.
        return '%(op)s{axis=%(axis)d, sorted=%(sorted)s}' % dict(
            op=self.__class__.__name__,
            axis=self.axis,
            sorted=self.sorted)

    def make_node(self, inp, kth):
        """Validate inputs and build the Apply node.

        ``kth`` must be a 0-d tensor; the output list depends on
        ``return_values`` / ``return_indices``.
        """
        inp = theano.tensor.as_tensor_variable(inp)
        ndim = inp.ndim
        if ndim == 0:
            raise ValueError('Cannot take scalar as input')
        if not -ndim <= self.axis < ndim:
            raise IndexError(
                '"axis" parameter out of range,'
                ' expected integer within [%d, %d]' % (-ndim, ndim - 1))

        kth = theano.tensor.as_tensor_variable(kth)
        _check_tensor_is_scalar(kth)
        bcast = inp.type.broadcastable
        outs = []
        if self.return_values:
            outs.append(inp.type())
        if self.return_indices:
            outs.append(theano.tensor.TensorType(
                dtype=self.idx_dtype, broadcastable=bcast)())
        return theano.Apply(self, [inp, kth], outs)

    def perform(self, node, inputs, output_storage):
        # Delegates to the numpy reference implementation; which storage
        # cells exist follows return_values/return_indices.
        x, k = inputs
        axis = self.axis
        if not self.return_indices:
            pzv = output_storage[0]
            pzv[0] = _topk_py_impl(self, x, k, axis, None)
        elif self.return_values:
            pzv = output_storage[0]
            pzi = output_storage[1]
            pzv[0], pzi[0] = _topk_py_impl(
                self, x, k, axis, node.outputs[1].dtype)
        else:
            pzi = output_storage[0]
            pzi[0] = _topk_py_impl(self, x, k, axis, node.outputs[0].dtype)

    def infer_shape(self, node, inp_shapes):
        # Output shape equals the input shape except along self.axis,
        # where it becomes |k|. np.abs on the symbolic k dispatches to the
        # variable's __abs__, so the result stays symbolic.
        shp = list(inp_shapes[0])
        shp[self.axis] = np.abs(node.inputs[1])
        shp = tuple(shp)
        return [shp for i in [self.return_values, self.return_indices] if i]

    def L_op(self, inputs, outputs, out_grads):
        """Gradient of the values output: scatter ``out_grads[0]`` back to
        the selected positions (uses the indices output, ``outputs[-1]``).

        The gradient w.r.t. ``k`` is undefined.
        """
        x, k = inputs
        k_grad = grad_undefined(self, 1, k, 'topk: k is not differentiable')

        # NOTE(review): this guard uses `or`, but __init__ already forbids
        # both flags being False, so the branch looks unreachable; the
        # message suggests `and` was intended — confirm.
        if not (self.return_indices or self.return_values):
            x_grad = grad_undefined(
                self, 0, x, 'topk: cannot get gradient'
                ' without both indices and values')
        else:
            x_shp = theano.tensor.shape(x)
            z_grad = out_grads[0]
            ndim = x.ndim
            axis = self.axis % ndim
            # aranges on all axes except `axis`, where the saved indices
            # route each gradient entry to its source position
            grad_indices = [
                arange(x_shp[i]).dimshuffle([0] + ['x'] * (ndim - i - 1))
                if i != axis else outputs[-1] for i in range(ndim)]
            x_grad = x.zeros_like(dtype=z_grad.dtype)
            x_grad = set_subtensor(x_grad[tuple(grad_indices)], z_grad)

        return [x_grad, k_grad]
def topk(x, kth, axis=-1, sorted=True, idx_dtype='int64'):
    """
    Return the k-largest elements along an axis.

    Parameters
    ----------
    x: tensor instance
    kth: integer constant/variable
        Must be nonzero; a negative value selects the k-smallest
        elements instead.
    axis: integer or ``None``
        Axis along which to operate. ``None`` flattens the input first.
    sorted: bool
        NOTE: NOT IMPLEMENTED YET, USE ``False`` FOR NOW.
        Defaults to ``True``. When implemented, ``True`` will return the
        result sorted in descending order.
    idx_dtype: string
        Integer dtype used for the internally-computed indices (they are
        needed for the gradient).

    Returns
    -------
    Tensor variable with same dtype as `x`.

    Notes
    -----
    - ``sorted=True`` is not supported yet.

    """
    if sorted:
        raise NotImplementedError(
            "We are still working on sorted topk. Use sorted=False for now.")
    if axis is None:
        x = theano.tensor.flatten(x)
        axis = 0
    op = TopKOp(axis=axis, sorted=sorted, idx_dtype=idx_dtype)
    return op(x, kth)[0]
def argtopk(x, kth, axis=-1, sorted=True, idx_dtype='int64'):
    """
    Return the indices of the k-largest elements along an axis.

    Parameters
    ----------
    x: tensor instance
    kth: integer constant/variable
        Must be nonzero; a negative value selects the k-smallest
        elements instead.
    sorted: bool
        NOTE: NOT IMPLEMENTED YET, USE ``False`` FOR NOW.
        Defaults to ``True``. When implemented, ``True`` will return the
        indices sorted by descending value.
    axis: integer, tuple/list of integers, or ``None``
        Axis along which to operate. ``None`` flattens the input first.
    idx_dtype: string
        Integer dtype of the returned indices.

    Returns
    -------
    Tensor variable with dtype specified in `idx_dtype`.

    Notes
    -----
    - ``sorted=True`` is not supported yet.

    - If the top-k-th value is not unique, we cannot guarantee the output
      indices are deterministically chosen.

    """
    if sorted:
        raise NotImplementedError(
            "We are still working on sorted topk. Use sorted=False for now.")
    if axis is None:
        x = theano.tensor.flatten(x)
        axis = 0
    op = TopKOp(axis=axis, sorted=sorted, idx_dtype=idx_dtype)
    return op(x, kth)[1]
def topk_and_argtopk(x, kth, axis=-1, sorted=True, idx_dtype='int64'):
    """
    Compute both topk() and argtopk() with a single Op.

    See the respective documentation for parameter details.

    Returns
    -------
    tuple: (values, indices)

    """
    if sorted:
        raise NotImplementedError(
            "We are still working on sorted topk. Use sorted=False for now.")
    if axis is None:
        x = theano.tensor.flatten(x)
        axis = 0
    op = TopKOp(axis=axis, sorted=sorted, idx_dtype=idx_dtype)
    return op(x, kth)
from __future__ import absolute_import, print_function, division
from itertools import product, chain
from functools import reduce
import unittest
from theano.tests import unittest_tools as utt
......@@ -9,9 +11,18 @@ from theano import tensor
from theano.tensor.sort import sort, SortOp
from theano.tensor.sort import argsort, ArgSortOp
from theano.tensor.sort import topk, argtopk, topk_and_argtopk, TopKOp
_all_dtypes = tensor.integer_dtypes + tensor.float_dtypes
class test_sort(unittest.TestCase):
def gen_unique_vector(size, dtype):
    """Return a shuffled 1-D array of `size` distinct values cast to `dtype`.

    Values are spaced 3 apart (plus one shared random offset), so they stay
    distinct even after casting to low-precision dtypes.
    """
    base = np.arange(size) * 3. + np.random.uniform(-1., 1.)
    shuffled = base[np.random.permutation(size)]
    return (shuffled - size * 1.5).astype(dtype)
class Test_sort(unittest.TestCase):
def setUp(self):
self.rng = np.random.RandomState(seed=utt.fetch_seed())
......@@ -22,7 +33,7 @@ class test_sort(unittest.TestCase):
a = tensor.dmatrix()
w = sort(a)
f = theano.function([a], w)
assert np.allclose(f(self.m_val), np.sort(self.m_val))
utt.assert_allclose(f(self.m_val), np.sort(self.m_val))
def test2(self):
a = tensor.dmatrix()
......@@ -32,7 +43,7 @@ class test_sort(unittest.TestCase):
for axis_val in 0, 1:
gv = f(self.m_val, axis_val)
gt = np.sort(self.m_val, axis_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
def test3(self):
a = tensor.dvector()
......@@ -40,7 +51,7 @@ class test_sort(unittest.TestCase):
f = theano.function([a], w2)
gv = f(self.v_val)
gt = np.sort(self.v_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
def test4(self):
a = tensor.dmatrix()
......@@ -50,7 +61,7 @@ class test_sort(unittest.TestCase):
for axis_val in 0, 1:
gv = f(self.m_val, axis_val)
gt = np.sort(self.m_val, axis_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
def test5(self):
a1 = SortOp("mergesort", [])
......@@ -67,10 +78,9 @@ class test_sort(unittest.TestCase):
f = theano.function([a], l)
gv = f(self.m_val)
gt = np.sort(self.m_val, None)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
def test_grad_vector(self):
    # verify_grad builds the symbolic graph itself, so no symbolic
    # variable is needed here (removed the unused `a = tensor.vector()`).
    data = np.random.rand(10).astype(theano.config.floatX)
    utt.verify_grad(sort, [data])
......@@ -133,19 +143,19 @@ class test_sort(unittest.TestCase):
utt.verify_grad(lambda x: sort(x, 3), [data])
class TensorInferShapeTester(utt.InferShapeTester):
class SortInferShapeTester(utt.InferShapeTester):
def test_sort(self):
x = tensor.matrix()
self._compile_and_check(
[x],
[sort(x)],
[np.random.randn(10, 40).astype(theano.config.floatX)],
SortOp)
[x],
[sort(x)],
[np.random.randn(10, 40).astype(theano.config.floatX)],
SortOp)
self._compile_and_check(
[x],
[sort(x, axis=None)],
[np.random.randn(10, 40).astype(theano.config.floatX)],
SortOp)
[x],
[sort(x, axis=None)],
[np.random.randn(10, 40).astype(theano.config.floatX)],
SortOp)
def test_argsort():
......@@ -160,7 +170,7 @@ def test_argsort():
f = theano.function([a], w)
gv = f(m_val)
gt = np.argsort(m_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
# Example 2
a = tensor.dmatrix()
......@@ -170,7 +180,7 @@ def test_argsort():
for axis_val in 0, 1:
gv = f(m_val, axis_val)
gt = np.argsort(m_val, axis_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
# Example 3
a = tensor.dvector()
......@@ -178,7 +188,7 @@ def test_argsort():
f = theano.function([a], w2)
gv = f(v_val)
gt = np.argsort(v_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
# Example 4
a = tensor.dmatrix()
......@@ -188,7 +198,7 @@ def test_argsort():
for axis_val in 0, 1:
gv = f(m_val, axis_val)
gt = np.argsort(m_val, axis_val)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
# Example 5
a = tensor.dmatrix()
......@@ -206,7 +216,7 @@ def test_argsort():
f = theano.function([a], w2)
gv = f(m_val)
gt = np.argsort(m_val, None)
assert np.allclose(gv, gt)
utt.assert_allclose(gv, gt)
def test_argsort_grad():
......@@ -219,3 +229,222 @@ def test_argsort_grad():
data = np.random.rand(2, 3, 3).astype(theano.config.floatX)
utt.verify_grad(lambda x: argsort(x, axis=2), [data])
class Test_TopK(unittest.TestCase):
    """Tests for topk / argtopk / topk_and_argtopk (sorted=False only).

    Most cases use gen_unique_vector() so input values are all distinct,
    making the expected indices unambiguous; the outputs along the worked
    axis are unsorted, so comparisons sort both sides first.
    """

    def setUp(self):
        # no shared fixtures; every parameterized case builds its own data
        pass

    @utt.parameterized.expand(product(
        _all_dtypes, tensor.integer_dtypes, [-1, 0, None], [False]))
    def test_argtopk_sanity(self, dtype, idx_dtype, axis, sorted):
        # one-element vector: the only valid index is 0
        x = tensor.vector(name='x', dtype=dtype)
        fn = theano.function([x], argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype))
        xval = np.asarray([1]).astype(dtype)
        yval = fn(xval)
        assert yval == np.asarray([0], dtype=idx_dtype)
        assert yval.dtype == np.dtype(idx_dtype)

    @utt.parameterized.expand(product(
        _all_dtypes, [-1, 0, None], [False]))
    def test_topk_sanity(self, dtype, axis, sorted):
        # one-element vector: output equals input and preserves dtype
        x = tensor.vector(name='x', dtype=dtype)
        fn = theano.function([x], topk(x, 1, axis=axis, sorted=sorted))
        xval = np.asarray([1]).astype(dtype)
        yval = fn(xval)
        assert yval == xval
        assert yval.dtype == xval.dtype

    @utt.parameterized.expand(product(
        _all_dtypes, tensor.integer_dtypes, [-1, 0, None], [False]))
    def test_combined_sanity(self, dtype, idx_dtype, axis, sorted):
        # both outputs at once on a one-element vector
        x = tensor.vector(name='x', dtype=dtype)
        yv, yi = topk_and_argtopk(x, 1, axis=axis, sorted=sorted, idx_dtype=idx_dtype)
        fn = theano.function([x], [yv, yi])
        xval = np.asarray([1]).astype(dtype)
        yvval, yival = fn(xval)
        assert yival == np.asarray([0], dtype=idx_dtype)
        utt.assert_allclose(xval, yvval)
        assert yvval.dtype == xval.dtype
        assert yival.dtype == np.dtype(idx_dtype)

    @utt.parameterized.expand(chain(
        product(
            (16, 61, 257),
            (1, -1, -10, 'n//2', 'n-1', '-n', '1-n'),
            ('float64', 'float16', 'int16', 'int8'),
            (False,)),
        ((2049, 1337, 'float64', False),)))
    def test_topk_1d(self, size, k, dtype, sorted):
        # string k specs like 'n//2' are evaluated against the vector size
        if isinstance(k, str):
            k = eval(k.replace('n', str(size)))

        x = theano.tensor.vector(name='x', dtype=dtype)
        y = topk(x, k, sorted=sorted)
        fn = theano.function([x], y)

        # assert local_useless_topk opt is done properly
        assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs)

        # generate a all-unique array
        xval = gen_unique_vector(size, dtype)
        yval = fn(xval)
        # same slice the Op uses: last k for k>0, first |k| for k<0
        idx = (slice(-k, None) if k > 0 else slice(-k))
        goal = np.sort(xval)[idx]

        assert yval.dtype == goal.dtype
        utt.assert_allclose(np.sort(yval), goal)

    @utt.parameterized.expand(chain(
        product(
            (16, 61, 257),
            (1, -1, -10, 'n//2', 'n-1', '-n'),
            ('float32', 'int32'),
            (False,),
            ('int32', 'int64')),
        ((2049, 1337, 'float32', False, 'int32'),)))
    def test_argtopk_1d(self, size, k, dtype, sorted, idx_dtype):
        if isinstance(k, str):
            k = eval(k.replace('n', str(size)))

        x = theano.tensor.vector(name='x', dtype=dtype)
        y = argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype)
        fn = theano.function([x], y)

        # assert local_useless_topk opt is done properly
        assert 1 == len(fn.maker.fgraph.outputs[0].owner.outputs)

        # generate a all-unique array
        xval = gen_unique_vector(size, dtype)
        yval = fn(xval)
        idx = (slice(-k, None) if k > 0 else slice(-k))
        goal = np.argsort(xval)[idx].astype(idx_dtype)

        # due to uniqueness, we expect indices same
        assert np.all(xval[np.sort(yval)] == xval[np.sort(goal)])

    @utt.parameterized.expand(chain(
        product(
            (16, 61, 257),
            (1, -1, 10, 'n//2', 'n-1', '1-n'),
            ('float32', 'int32'),
            (False,),
            ('int32', 'int64')),
        ((2049, 1337, 'float32', False, 'int32'),)))
    def test_combined_1d(self, size, k, dtype, sorted, idx_dtype):
        if isinstance(k, str):
            k = eval(k.replace('n', str(size)))

        x = theano.tensor.vector(name='x', dtype=dtype)
        yv, yi = topk_and_argtopk(x, k, sorted=sorted, idx_dtype=idx_dtype)
        fn = theano.function([x], [yv, yi])

        # generate a all-unique array
        xval = gen_unique_vector(size, dtype)
        yvval, yival = fn(xval)
        idx = (slice(-k, None) if k > 0 else slice(-k))
        goali = np.argsort(xval)[idx].astype(idx_dtype)
        goalv = xval[goali]

        # due to uniqueness, we expect indices same
        assert np.all(xval[np.sort(yival)] == xval[np.sort(goali)])
        utt.assert_allclose(np.sort(yvval), goalv)

    @utt.parameterized.expand(chain(
        product(
            (18, 62, 258),
            (1, -1, 'n//2'),
            ('int32', 'float32'),
            (False,)),
        ((2048, 1337, 'float32', False),)))
    def test_argtopk_1d_collision(self, size, k, dtype, sorted):
        # with non-unique kth max value
        if isinstance(k, str):
            k = eval(k.replace('n', str(size)))

        x = theano.tensor.vector(name='x', dtype=dtype)
        y = argtopk(x, k, sorted=sorted, idx_dtype='int32')
        fn = theano.function([x], y)
        # every value appears exactly twice, so only the selected *values*
        # (not the indices) can be compared
        xval = np.repeat(np.random.uniform(-100., 100., size=size // 2).astype(dtype), 2)
        xval = xval[np.random.permutation(size)]
        yval = fn(xval)
        idx = (slice(-k, None) if k > 0 else slice(-k))
        goal = np.argsort(xval)[idx].astype('int32')
        utt.assert_allclose(np.sort(xval[yval]), np.sort(xval[goal]))

    @utt.parameterized.expand(product(
        ((17, 15), (2, 3, 5, 7, 11), (2017, 5, 3)),
        (-1, '(1+n)//2', '-n', '1-n'),
        ('float32', 'int32'),
        (False,),
        ('int32', 'int64')))
    def test_argtopk_nd(self, shp, k_, dtype, sorted, idx_dtype):
        # exercise every axis (negative and positive) of each shape
        ndim = len(shp)
        for axis in range(-ndim, ndim):
            if isinstance(k_, str):
                k = eval(k_.replace('n', str(shp[axis])))
            else:
                k = k_
            if k == 0:
                continue

            x = theano.tensor.tensor(
                name='x', broadcastable=(False,) * len(shp), dtype=dtype)
            y = argtopk(x, k, axis=axis, sorted=sorted, idx_dtype=idx_dtype)
            fn = theano.function([x], y)
            size = reduce(int.__mul__, shp)
            xval = gen_unique_vector(size, dtype).reshape(shp)
            yval = fn(xval)
            idx = slice(-k, None) if k > 0 else slice(-k)
            l = axis % ndim
            r = ndim - l
            idx = (slice(None),) * l + (idx,) + (slice(None),) * (r - 1)
            goal = np.argsort(xval, axis=axis)[idx].astype(idx_dtype)
            assert np.all(np.sort(yval, axis=axis) == np.sort(goal, axis=axis))

    @utt.parameterized.expand(product(
        ((257,), (17, 15), (5, 3, 5, 3), (2, 3, 5, 7, 11)),
        (1, -1, '(1+n)//2', 'n-1', '-n', '1-n'), (False,)))
    def test_grad(self, shp, k_, sorted):
        ndim = len(shp)
        for axis in range(-ndim, ndim):
            if isinstance(k_, str):
                k = eval(k_.replace('n', str(shp[axis])))
            else:
                k = k_
            if k == 0:
                continue

            # make input away from undefined gradient (where some inputs are equal)
            xval = gen_unique_vector(
                reduce(int.__mul__, shp),
                dtype=theano.config.floatX
            ).reshape(shp)
            utt.verify_grad(lambda x: topk(x, k, axis=axis, sorted=sorted), [xval], eps=1e-2)
class TopKInferShapeTester(utt.InferShapeTester):
    """Checks TopKOp.infer_shape() against the actual output shapes."""

    @utt.parameterized.expand(product(
        ((2, 3), (15, 17), (11, 7, 5), (2, 3, 5, 7, 11), (2, 4, 3, 1)),
        (1, '(1+n)//2', 'n-1', 'n')))
    def test_combined_infer_shape(self, shp, k_):
        # exercise every axis (negative and positive) of each shape;
        # string k specs are evaluated against the size of that axis
        ndim = len(shp)
        for axis in range(-ndim, ndim):
            if isinstance(k_, str):
                k = eval(k_.replace('n', str(shp[axis])))
            else:
                k = k_
            if k == 0:
                continue

            x = theano.tensor.tensor(
                name='x', broadcastable=(False,) * len(shp),
                dtype=theano.config.floatX)
            yv, yi = topk_and_argtopk(x, k, axis=axis, sorted=False, idx_dtype='int32')
            size = reduce(int.__mul__, shp)
            xval = gen_unique_vector(size, theano.config.floatX).reshape(shp)
            self._compile_and_check(
                [x], [yv, yi], [xval], TopKOp)
......@@ -83,7 +83,7 @@ def seed_rng(pseed=None):
def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs):
"""
Wrapper for tensor/basic.py:verify_grad
Wrapper for gradient.py:verify_grad
Takes care of seeding the random number generator if None is given
"""
if rng is None:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论