Merge pull request #6317 from abergeron/gpuarray_07

Work to integrate libgpuarray 0.7 changes.

Merge pull request #6317 from abergeron/gpuarray_07
b980a8ee · Frédéric Bastien · GitHub · 814dc05a · 8b9dc5f1 · b980a8ee
--- a/.jenkins/gpuarray-branch
+++ b/.jenkins/gpuarray-branch
-v0.6.9
+v0.7.1
\ No newline at end of file
--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -46,23 +46,36 @@ def init_dev(dev, name=None, preallocate=None):
    global pygpu_activated
    if not config.cxx:
        raise RuntimeError("The new gpu-backend need a c++ compiler.")
-    if (pygpu.version.major, pygpu.version.minor, pygpu.version.patch) < (0, 6, 1):
+    if (pygpu.version.major != 0 or pygpu.version.minor != 7 or
+            pygpu.version.patch < 0):
        raise ValueError(
-            "Your installed version of pygpu is too old, please upgrade to 0.6.1 or later")
+            "Your installed version of pygpu(%s) is too old, please upgrade to 0.7.0 or later" %
+            pygpu.version.fullversion)
    # This is for the C headers API, we need to match the exact version.
-    if pygpu.gpuarray.api_version()[0] != 1:
+    gpuarray_version_major_supported = 2
+    gpuarray_version_major_detected = pygpu.gpuarray.api_version()[0]
+    if gpuarray_version_major_detected != gpuarray_version_major_supported:
        raise ValueError(
-            "Your installed libgpuarray is not in sync, please make sure to have the appropriate version")
+            "Your installed version oflibgpuarray is not in sync with the current Theano"
+            " version. The installed libgpuarray version support API version %d,"
+            " while current Theano support API version %d. Change the version of"
+            " libgpuarray or Theano to fix this problem.",
+            gpuarray_version_major_detected,
+            gpuarray_version_major_supported)
    if dev not in init_dev.devmap:
+        args = dict()
        if config.gpuarray.cache_path != '':
-            os.environ['GPUARRAY_CACHE_PATH'] = config.gpuarray.cache_path
+            args['kernel_cache_path'] = config.gpuarray.cache_path
        if preallocate is None:
            preallocate = config.gpuarray.preallocate
+        if preallocate < 0:
+            args['max_cache_size'] = 0
+        else:
+            args['initial_cache_size'] = preallocate
        context = pygpu.init(
            dev,
-            disable_alloc_cache=preallocate < 0,
+            sched=config.gpuarray.sched,
-            single_stream=config.gpuarray.single_stream,
+            **args)
-            sched=config.gpuarray.sched)
        context.dev = dev
        init_dev.devmap[dev] = context
        reg_context(name, context)
@@ -115,12 +128,12 @@ def init_dev(dev, name=None, preallocate=None):
    # This will map the context name to the real context object.
    if config.print_active_device:
        try:
-            pcibusid = '(' + context.pcibusid + ')'
+            unique_id = '(' + context.unique_id + ')'
        except pygpu.gpuarray.UnsupportedException:
-            pcibusid = ''
+            unique_id = ''
        print("Mapped name %s to device %s: %s %s" %
-              (name, dev, context.devname, pcibusid),
+              (name, dev, context.devname, unique_id),
              file=sys.stderr)
    pygpu_activated = True
@@ -207,5 +220,5 @@ else:
            config.device.startswith('opencl') or
            config.device.startswith('cuda') or
            config.contexts != ''):
-        error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)",
+        error("pygpu was configured but could not be imported or is too old (version 0.7 or higher required)",
              exc_info=True)
--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -158,7 +158,7 @@ class Kernel(object):
            the `params` list consists of C typecodes
    It can also have the key `cflags` which is a string of C flag
-    values like this `"GA_USE_DOUBLE|GA_USE_CLUDA"`.
+    values like this `"GA_USE_DOUBLE|GA_USE_SMALL"`.
    Parameters
    ----------
@@ -216,7 +216,7 @@ class Kernel(object):
            else:
                raise TypeError("can't get a dtype from %s" % (type(t),))
        dtypes = [get_dtype(t) for t in types]
-        flags = dict(cluda=True)
+        flags = dict()
        if any(d == np.float64 for d in dtypes):
            flags['have_double'] = True
        if any(d.itemsize < 4 for d in dtypes):
@@ -231,8 +231,6 @@ class Kernel(object):
        res = []
        if self.flags.get('cflags', '') != '':
            res.append(self.flags['cflags'])
-        if self.flags.get('cluda', False):
-            res.append('GA_USE_CLUDA')
        if self.flags.get('have_double', False):
            res.append('GA_USE_DOUBLE')
        if self.flags.get('have_small', False):
@@ -241,15 +239,16 @@ class Kernel(object):
            res.append('GA_USE_COMPLEX')
        if self.flags.get('have_half', False):
            res.append('GA_USE_HALF')
-        return '|'.join(res)
+        res = '|'.join(res)
+        if not res:
+            return '0'
+        return res
    def _get_py_flags(self):
        res = dict(self.flags)
        cflags = res.pop('cflags', '')
        for fl in cflags.split('|'):
            fl = fl.strip()
-            if fl == 'GA_USE_CLUDA':
-                res['cluda'] = True
            if fl == 'GA_USE_DOUBLE':
                res['have_double'] = True
            if fl == 'GA_USE_SMALL':
@@ -555,7 +554,7 @@ class CGpuKernelBase(COp, GpuKernelBase):
                kflags = splt2[2].strip()
                kcode = def_macros + '\n' + kcode + '\n' + undef_macros
                res.append(Kernel(kcode, ktypes, kname,
-                                  flags=dict(cluda=True, cflags=kflags)))
+                                  flags=dict(cflags=kflags)))
                n += 2
            self._cached_kernels = res
            return res
@@ -703,39 +702,35 @@ class GpuFromHost(Op):
        if (%(name)s_tmp == NULL)
          %(fail)s
-        if (%(out)s != NULL && GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) &&
+        if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
-            theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
+            !theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
-                              (size_t *)PyArray_DIMS(%(name)s_tmp),
+                               (size_t *)PyArray_DIMS(%(name)s_tmp),
-                              get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
+                               get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
-          Py_BEGIN_ALLOW_THREADS
-          err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
-                               PyArray_NBYTES(%(name)s_tmp));
-          Py_END_ALLOW_THREADS
-          Py_DECREF(%(name)s_tmp);
-          if (err != GA_NO_ERROR) {
-            PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
-            %(fail)s;
-          }
-        } else {
          Py_XDECREF(%(out)s);
-          // This method will release the GIL when needed.
+          %(out)s = pygpu_empty(PyArray_NDIM(%(name)s_tmp),
-          %(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp),
+                                (size_t *)PyArray_DIMS(%(name)s_tmp),
-                                       get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
+                                get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
-                                       PyArray_NDIM(%(name)s_tmp),
+                                GA_C_ORDER, %(ctx)s, Py_None);
-                                       (size_t *)PyArray_DIMS(%(name)s_tmp),
-                                       (ssize_t *)PyArray_STRIDES(%(name)s_tmp),
-                                       %(ctx)s,
-                                       Py_None);
-          Py_DECREF(%(name)s_tmp);
          if (%(out)s == NULL) {
-              %(fail)s
+            Py_DECREF(%(name)s_tmp);
+            %(fail)s;
          }
        }
+        Py_BEGIN_ALLOW_THREADS
+        err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
+                             PyArray_NBYTES(%(name)s_tmp));
+        Py_END_ALLOW_THREADS
+        Py_DECREF(%(name)s_tmp);
+        if (err != GA_NO_ERROR) {
+          PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
+          %(fail)s;
+        }
        """ % {'name': name, 'inp': inputs[0], 'ctx': sub['params'],
               'out': outputs[0], 'fail': sub['fail']}
    def c_code_cache_version(self):
-        return (9,)
+        return (10,)
 class GpuToGpu(Op):
@@ -1619,7 +1614,8 @@ class GpuEye(GpuKernelBase, Op):
                for i in xrange(3)]
    def gpu_kernels(self, node, name):
-        code = """
+        code = """#include "cluda.h"
 KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);

--- a/theano/gpuarray/c_code/corr3d_gemm.c
+++ b/theano/gpuarray/c_code/corr3d_gemm.c
 #section kernels
 #kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
-// TODO check kernel flags
+#include "cluda.h"
 // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
 // sources are clearly marked. Below we reproduce the original license of
 // the Caffe software.
@@ -87,6 +88,8 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
 }
 #kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 KERNEL void im3d2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
    const ga_size offset_im,
@@ -139,6 +142,8 @@ KERNEL void im3d2col_kernel(const ga_size n,
 // GPU kernel for the case of dilation
 #kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"
 KERNEL void dilated_col2im3d_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
    const ga_size offset_col,
@@ -207,6 +212,7 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
 }
 #kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"
 KERNEL void col2im3d_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,

--- a/theano/gpuarray/c_code/corr_gemm.c
+++ b/theano/gpuarray/c_code/corr_gemm.c
 #section kernels
 #kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 // TODO check kernel flags
 // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
 // sources are clearly marked. Below we reproduce the original license of
@@ -77,6 +78,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
 }
 #kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 KERNEL void im2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
@@ -122,6 +124,8 @@ KERNEL void im2col_kernel(const ga_size n,
 // GPU kernel for the case of dilation
 #kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"
 KERNEL void dilated_col2im_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size channels,
@@ -172,6 +176,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
 }
 #kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"
 KERNEL void col2im_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,

--- a/theano/gpuarray/c_code/dnn_fwd.c
+++ b/theano/gpuarray/c_code/dnn_fwd.c
@@ -199,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    if (!reuse_algo) {
      char pci_id[16];
-      gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
+      gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
      // check out cache
      hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
      if (hashkey.empty()) {

--- a/theano/gpuarray/c_code/dnn_gi.c
+++ b/theano/gpuarray/c_code/dnn_gi.c
@@ -168,7 +168,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    if (!reuse_algo) {
      char pci_id[16];
-      gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
+      gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
      // check out cache
      hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
      if (hashkey.empty()) {

--- a/theano/gpuarray/c_code/dnn_gw.c
+++ b/theano/gpuarray/c_code/dnn_gw.c
@@ -155,7 +155,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    if (!reuse_algo) {
      char pci_id[16];
-      gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
+      gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
      // check out cache
      hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups);
      if (hashkey.empty()) {

--- a/theano/gpuarray/c_code/magma_cholesky.c
+++ b/theano/gpuarray/c_code/magma_cholesky.c
 #section kernels
 #kernel tril_kernel : size, size, size, *:
+#include "cluda.h"
 KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
@@ -17,6 +18,7 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
 }
 #kernel triu_kernel : size, size, size, *:
+#include "cluda.h"
 KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {

--- a/theano/gpuarray/c_code/magma_qr.c
+++ b/theano/gpuarray/c_code/magma_qr.c
 #section kernels
 #kernel triu_kernel : size, size, size, *:
+#include "cluda.h"
 KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {

--- a/theano/gpuarray/c_code/pool.c
+++ b/theano/gpuarray/c_code/pool.c
 #section kernels
 #kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_kernel(const ga_size nthreads,
@@ -44,6 +45,7 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
 }
 #kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_kernel(const ga_size nthreads,
@@ -95,6 +97,7 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
 }
 #kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size:
+#include "cluda.h"
 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool2d_kernel(const ga_size nthreads,
@@ -150,6 +153,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
 }
 #kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
+#include "cluda.h"
 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool3d_kernel(const ga_size nthreads,

--- a/theano/gpuarray/c_code/pool_ave_grad.c
+++ b/theano/gpuarray/c_code/pool_ave_grad.c
 #section kernels
 #kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size :
+#include "cluda.h"
 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
@@ -50,6 +51,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
 }
 #kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
+#include "cluda.h"
 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,

--- a/theano/gpuarray/c_code/pool_grad_grad.c
+++ b/theano/gpuarray/c_code/pool_grad_grad.c
 #section kernels
 #kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
@@ -47,6 +48,7 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
 }
 #kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_depth,

--- a/theano/gpuarray/c_code/pool_max_grad.c
+++ b/theano/gpuarray/c_code/pool_max_grad.c
 #section kernels
 #kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
@@ -43,6 +44,7 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
 }
 #kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,

--- a/theano/gpuarray/c_code/pool_max_rop.c
+++ b/theano/gpuarray/c_code/pool_max_rop.c
 #section kernels
 #kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
@@ -50,6 +51,7 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
 }
 #kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -1743,7 +1743,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_ccontig"
            k_var = "kernel_reduce_ccontig_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            KERNEL void %(kname)s(
                    const ga_size d0,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1781,7 +1782,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_1"
            k_var = "kernel_reduce_1_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            KERNEL void %(kname)s(
                    const ga_size d0,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1821,7 +1823,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_11"
            k_var = "kernel_reduce_11_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1909,7 +1912,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
                {}, True)
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
                %(decl)s{
                    %(init)s
                    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
@@ -1943,7 +1947,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_010"
            k_var = "kernel_reduce_010_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1989,7 +1994,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_010_AD"
            k_var = "kernel_reduce_010_AD_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            KERNEL void %(kname)s(
                    const ga_size A, const ga_size B, const ga_size C, const ga_size D,
                    const %(in_type)s *X, const ga_size offset_X,
@@ -2053,7 +2059,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            %(decl)s
            {
              %(init)s
@@ -2088,7 +2095,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_110"
            k_var = "kernel_reduce_110_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -2133,7 +2141,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            %(decl)s
            {
                %(init)s
@@ -2163,7 +2172,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[0])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            %(decl)s
            {
                %(init)s
@@ -2195,7 +2205,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_001"
            k_var = "kernel_reduce_001_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -2244,7 +2254,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            %(decl)s
            {
                %(init)s
@@ -2280,7 +2291,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            %(decl)s
            {
                %(init)s
@@ -2314,7 +2326,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[0])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            %(decl)s
            {
                %(init)s
@@ -2345,7 +2358,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_1011"
            k_var = "kernel_reduce_1011_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -2502,15 +2516,15 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
    def gpu_kernels(self, node, name):
        if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
-            # Some OpenCL compilers do not accept no-arguments kernels
+            # Some OpenCL compilers do not accept no-arguments empty kernels
-            src = "KERNEL void reduk(GLOBAL_MEM float *a) {}"
+            src = "#include \"cluda.h\"\nKERNEL void reduk(GLOBAL_MEM float *a) { a[0] = 0; }"
            params = ['float32']
        else:
            k = self.get_kernel_cache(node)
            _, src, _, _ = k._get_basic_kernel(k.init_local_size,
                                               node.inputs[0].ndim)
            nd = node.inputs[0].ndim
-            params = ['uint32', gpuarray.GpuArray]
+            params = ['uint32', gpuarray.GpuArray, 'uint32']
            params.extend('uint32' for _ in range(nd))
            params.append(gpuarray.GpuArray)
            params.append('uint32')
@@ -2617,9 +2631,10 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
        code += """
        args[0] = &n;
        args[1] = tmp->ga.data;
+        args[2] = &tmp->ga.offset;
        """ % dict(output=output)
-        p = 2
+        p = 3
        for i in range(node.inputs[0].ndim):
            code += """
        proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
@@ -2677,7 +2692,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
        return code
    def c_code_cache_version_apply(self, node):
-        return (3, self.kernel_version(node))
+        return (4, self.kernel_version(node))
    def generate_kernel(self, node, odtype, redux):
        if isinstance(self.scalar_op, scalar.basic.Add):

--- a/theano/gpuarray/extra_ops.py
+++ b/theano/gpuarray/extra_ops.py
@@ -74,7 +74,8 @@ class GpuCumOp(GpuKernelBase, Op):
        k_var = "k_cumadd_" + nodename
        dtype_x = node.inputs[0].dtype
        flags = Kernel.get_flags(dtype_x)
-        code = """
+        code = """#include "cluda.h"
        KERNEL void %(kname)s(float* input, ga_size input_offset,
                              float* output, ga_size output_offset,
                              ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
@@ -112,7 +113,8 @@ class GpuCumOp(GpuKernelBase, Op):
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
                  'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
-        code = """
+        code = """#include "cluda.h"
        // helper functions
        WITHIN_KERNEL
        void k_reductionPhase(float* partialCumOp) {
@@ -213,7 +215,8 @@ class GpuCumOp(GpuKernelBase, Op):
        # k_finalCumOp
        kname = "k_finalCumOp"
        k_var = "k_finalCumOp_" + nodename
-        code = """
+        code = """#include "cluda.h"
        KERNEL void k_finalCumOp(float* output, ga_size output_offset,
                                 float* blockSum, ga_size blockSum_offset,
                                 size_t nbElementsPerCumOp,

--- a/theano/gpuarray/fp16_help.py
+++ b/theano/gpuarray/fp16_help.py
@@ -22,7 +22,7 @@ def load_w(dtype):
    """
    if dtype == 'float16':
-        return '__half2float'
+        return 'ga_half2float'
    else:
        return ''
@@ -37,6 +37,6 @@ def write_w(dtype):
    """
    if dtype == 'float16':
-        return '__float2half_rn'
+        return 'ga_float2half'
    else:
        return ''
--- a/theano/gpuarray/kernel_codegen.py
+++ b/theano/gpuarray/kernel_codegen.py
@@ -34,7 +34,9 @@ def nvcc_kernel(name, params, body):
            else:
                yield b
    bodystr = ';\n'.join(flatbody())
-    return """KERNEL void %(name)s (%(paramstr)s)
+    return """#include "cluda.h"
+    KERNEL void %(name)s (%(paramstr)s)
    {
        %(bodystr)s;
    }

--- a/theano/gpuarray/multinomial.py
+++ b/theano/gpuarray/multinomial.py
@@ -66,7 +66,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
        work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
        write_out_ctype = write_w(node.outputs[0].dtype)
        load_in_ctype = load_w(node.inputs[0].dtype)
-        code = """
+        code = """#include "cluda.h"
 KERNEL void k_multi_warp_multinomial(
    const ga_size nb_multi,
    const ga_size nb_outcomes,
@@ -276,7 +277,8 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op):
    def gpu_kernels(self, node, name):
        replace = int(self.replace)
-        code = """
+        code = """#include "cluda.h"
 KERNEL void k_multi_warp_multinomial_wor(
    const ga_size nb_multi,
    const ga_size nb_outcomes,

--- a/theano/gpuarray/neighbours.py
+++ b/theano/gpuarray/neighbours.py
@@ -61,7 +61,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
-        code = """
+        code = """#include "cluda.h"
        // a version that uses less registers but doesn't work in all cases.
        %(mode_constants)s
        KERNEL void %(kname)s(
@@ -163,7 +164,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
-        code = """
+        code = """#include "cluda.h"
        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
@@ -500,7 +502,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
            size_t threads_per_block[3] = {d, c, 1};
            //get the max threads per blocks
            size_t max_threads_dim;
-            int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
+            int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
            if (err != GA_NO_ERROR){
                PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
                %(fail)s;

--- a/theano/gpuarray/nnet.py
+++ b/theano/gpuarray/nnet.py
@@ -75,7 +75,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE
        ]
        sio = StringIO()
-        print("""
+        print("""#include "cluda.h"
        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
            GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
@@ -393,7 +394,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
        ]
        sio = StringIO()
-        print("""
+        print("""#include "cluda.h"
        KERNEL void %(kname)s(
           const ga_size N, const ga_size K,
           GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
@@ -495,7 +497,7 @@ class GpuSoftmax(GpuKernelBase, Op):
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32 * 1024)), 1, 1};
 //TODO, detect the maximum number of thread per block.
-            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE
+            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE0
            size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
                                     2 * sizeof(npy_%(work_x)s);
            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
@@ -557,7 +559,8 @@ class GpuSoftmax(GpuKernelBase, Op):
        kernels = []
        kname = "kSoftmax"
        k_var = "kSoftmax_" + nodename
-        code = """
+        code = """#include "cluda.h"
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                               GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                               GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
@@ -630,7 +633,8 @@ class GpuSoftmax(GpuKernelBase, Op):
                              flags=flags, objvar=k_var))
        kname = "kSoftmax_fixed_shared"
        k_var = "kSoftmax_fixed_shared" + nodename
-        code = """
+        code = """#include "cluda.h"
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                               GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                               GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
@@ -788,7 +792,7 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1};
 //TODO, detect the maximum number of thread per block.
-            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE
+            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE0
            size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
                                     2 * sizeof(npy_%(work_x)s);
            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
@@ -854,7 +858,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
        kernels = []
        kname = "kSoftmaxWithBias"
        k_var = "kSoftmaxWithBias_" + nodename
-        code = """
+        code = """#include "cluda.h"
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
@@ -930,7 +935,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
                              flags=flags, objvar=k_var))
        kname = "kSoftmaxWithBias_fixed_shared"
        k_var = "kSoftmaxWithBias_fixed_shared" + nodename
-        code = """
+        code = """#include "cluda.h"
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -1110,17 +1110,11 @@ def local_gpua_advanced_boolean_subtensor(op, context_name, inputs, outputs):
 @op_lifter([tensor.AdvancedIncSubtensor1])
 @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
 def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs):
-    context = get_context(context_name)
-    # This is disabled on non-cuda contexts
-    if context.kind != b'cuda':
-        return None
    x, y, ilist = inputs
    set_instead_of_inc = op.set_instead_of_inc
-    compute_capability = int(context.bin_id[-2])
+    if (x.ndim == 1 and y.ndim == 0 and
-    if (compute_capability >= 2 and x.ndim == 1 and y.ndim == 0 and
            config.deterministic == 'default'):
        x = x.dimshuffle(0, 'x')
        y = y.dimshuffle('x', 'x')
@@ -1128,7 +1122,7 @@ def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs):
            set_instead_of_inc=set_instead_of_inc)(x, y, ilist)
        ret = GpuDimShuffle(ret.type.broadcastable, [0])(ret)
        return ret
-    elif (compute_capability < 2 or x.ndim != 2 or y.ndim != 2 or
+    elif (x.ndim != 2 or y.ndim != 2 or
            config.deterministic == 'more'):
        return GpuAdvancedIncSubtensor1(
            set_instead_of_inc=set_instead_of_inc)

--- a/theano/gpuarray/rng_mrg.py
+++ b/theano/gpuarray/rng_mrg.py
@@ -80,7 +80,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
        else:
            raise ValueError('Unsupported data type for output',
                             self.output_type.dtype)
-        code = """
+        code = """#include "cluda.h"
        KERNEL void mrg_uniform(
                GLOBAL_MEM %(otype)s *sample_data,
                ga_size sample_offset,

--- a/theano/gpuarray/subtensor.py
+++ b/theano/gpuarray/subtensor.py
@@ -353,7 +353,7 @@ int sub_setarray(GpuArray *dst, GpuArray *src) {
  int err;
  err = GpuArray_setarray(dst, src);
  if (err != GA_NO_ERROR)
-    PyErr_SetString(PyExc_RuntimeError, "setarray failed");
+    PyErr_SetString(PyExc_RuntimeError, GpuArray_error(src, err));
  return err;
 }
 """
@@ -1037,8 +1037,7 @@ class GpuAdvancedIncSubtensor1(Op):
 class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
                                     GpuAdvancedIncSubtensor1):
    """
-    Implement AdvancedIncSubtensor1 on the gpu, but use function
+    Implement AdvancedIncSubtensor1 on the gpu with atomics
-    only avail on compute capability 2.0 and more recent.
    """
    _f16_ok = True
@@ -1089,12 +1088,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
        return [gpuarray_helper_inc_dir()]
    def c_code(self, node, name, inputs, outputs, sub):
-        ctx = self.get_params(node).context
-        if ctx.kind != b'cuda':
-            raise NotImplementedError("cuda only")
        if (node.inputs[0].ndim != node.inputs[1].ndim or
-                node.inputs[0].ndim != 2 or
+                node.inputs[0].ndim != 2):
-                int(ctx.bin_id[-2]) < 2):
            raise NotImplementedError("This case does not have C code yet.")
        return """
@@ -1125,110 +1120,33 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
-        code = """
+        code = """#include "cluda.h"
-/*
- * This is an atomicAdd that works for doubles since that is not provided
- * natively by cuda before arch 6.0.
- */
-#if __CUDA_ARCH__ < 600
-__device__ ga_double atomicAdd(ga_double* address, ga_double val) {
-    ga_ulong *address_as_ull = (ga_ulong *)address;
-    ga_ulong old = *address_as_ull, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_ull, assumed,
-                        __double_as_longlong(val +
-                        __longlong_as_double(assumed)));
-    } while (assumed != old);
-    return __longlong_as_double(old);
-}
-#endif
-__device__ ga_double atomicExch(ga_double *address, ga_double val) {
-    return atomicExch((ga_ulong *)address,
-                      __double_as_longlong(val));
-}
-/* GA_LONG */
-__device__ ga_long atomicAdd(ga_long* address, ga_long val) {
-    ga_ulong *address_as_ull = (ga_ulong *)address;
-    ga_ulong old = *address_as_ull, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_ull, assumed,
-                        (ga_ulong)(val + (ga_long)assumed));
-    } while (assumed != old);
-    return (ga_long)old;
-}
-__device__ ga_long atomicExch(ga_long *address, ga_long val) {
-    return (ga_long)atomicExch((ga_ulong *)address, (ga_ulong)val);
-}
-/* GA_HALF */
-/*
- * This may read and write 2 bytes more than the size of the array
- * if the array has an uneven number of elements.  The actual value
- * at that spot will not be modified.
- */
-__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
-  ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
-  ga_uint old, assumed, sum, new_;
-  old = *base;
-  do {
-    assumed = old;
-    sum = __float2half_rn(
-      __half2float(val) +
-      __half2float((ga_half)__byte_perm(old, 0,
-                     ((ga_size)addr & 2) ? 0x4432 : 0x4410)));
-    new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
-    old = atomicCAS(base, assumed, new_);
-  } while (assumed != old);
-  return (ga_half)__byte_perm(old, 0,
-                                  ((ga_size)addr & 2) ? 0x4432 : 0x4410);
-}
-__device__ ga_half atomicExch(ga_half *addr, ga_half val) {
-  ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
-  ga_uint old, assumed, new_;
-  old = *base;
-  do {
-    assumed = old;
-    new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
-    old = atomicCAS(base, assumed, new_);
-  } while (assumed != old);
-  return (ga_half)__byte_perm(old, 0,
-                                  ((ga_size)addr & 2) ? 0x4432 : 0x4410);
-}
        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
                                      const ga_ssize stridesX1,
-                                      %(type_x)s *X,
+                                      GLOBAL_MEM %(type_x)s *X,
                                      const ga_size offset_X,
                                      const ga_size numRowsY,
                                      const ga_size numColsY,
                                      const ga_ssize stridesY0,
                                      const ga_ssize stridesY1,
-                                      %(type_y)s *Y,
+                                      GLOBAL_MEM %(type_y)s *Y,
                                      const ga_size offset_Y,
                                      const ga_size numIndices,
                                      const ga_ssize stridesIndices,
-                                      %(type_ind)s *indices_arr,
+                                      GLOBAL_MEM %(type_ind)s *indices_arr,
                                      const ga_size offset_indices_arr,
-                                      const int set_instead_of_inc,
+                                      const ga_int set_instead_of_inc,
-                                      ga_int *err)
+                                      GLOBAL_MEM ga_int *err)
        {
-             X = (%(type_x)s *)(((char *)X)+offset_X);
+             X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X);
-             Y = (%(type_y)s *)(((char *)Y)+offset_Y);
+             Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y);
-             indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr);
+             indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);
-             for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
+             for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
             {
-                  for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
+                  for (ga_int j = LID_0; j < numColsX; j += LDIM_0)
                  {
                      ga_ssize x_row = indices_arr[i * stridesIndices];
                      if (x_row < 0)
@@ -1236,10 +1154,10 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
                      ga_ssize y_row = i;
                      if (x_row < numRowsX && x_row >= 0) {
                        if (set_instead_of_inc) {
-                          atomicExch(&X[(x_row * stridesX0) + (j * stridesX1)],
+                          atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                   Y[(y_row * stridesY0) + (j * stridesY1)]);
                        } else {
-                          atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
+                          atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                    Y[(y_row * stridesY0) + (j * stridesY1)]);
                        }
                      } else {
@@ -1249,11 +1167,13 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
             }
             return;
        }
-        """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind)
+        """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind,
+                   tc=np.dtype(dtype_x).char)
+        from pygpu.gpuarray import SIZE, SSIZE
        params = [
-            'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
+            SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
-            'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
+            SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
-            'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'int',
+            SIZE, SSIZE, gpuarray.GpuArray, SIZE, 'int32',
            gpuarray.GpuArray]
        return [Kernel(code=code, name=kname, params=params,
                       flags=flags, objvar=k_var)]
@@ -1265,15 +1185,15 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
                                     PyGpuArrayObject* indices_arr,
                                     const int set_instead_of_inc)
        {
-            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
+            size_t threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256);
-            size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
+            size_t n_blocks = std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096);
            gpudata *errbuf;
            int err, kerr = 0;
            size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
            size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
            size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);
-            if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
+            if (threads_per_block > 0 && n_blocks > 0) {
              err = gpudata_property(py_self->ga.data,
                                     GA_CTX_PROP_ERRBUF, &errbuf);
              if (err != GA_NO_ERROR) {
@@ -1281,30 +1201,27 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
                return 1;
              }
-              ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / itemsize_x;
+              err = k_vector_add_fast_call(
-              ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / itemsize_x;
+        1, &n_blocks, &threads_per_block, 0,
-              ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y;
+        PyGpuArray_DIMS(py_self)[0],
-              ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y;
+        PyGpuArray_DIMS(py_self)[1],
-              ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind;
+        PyGpuArray_STRIDES(py_self)[0] / itemsize_x,
-              void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
+        PyGpuArray_STRIDES(py_self)[1] / itemsize_x,
-                                       (void *)&PyGpuArray_DIMS(py_self)[1],
+        py_self->ga.data,
-                                       (void *)&stride_X0,
+        py_self->ga.offset,
-                                       (void *)&stride_X1,
+        PyGpuArray_DIMS(py_other)[0],
-                                       (void *)py_self->ga.data,
+        PyGpuArray_DIMS(py_other)[1],
-                                       (void *)&py_self->ga.offset,
+        PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y,
-                                       (void *)&PyGpuArray_DIMS(py_other)[0],
+        PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y,
-                                       (void *)&PyGpuArray_DIMS(py_other)[1],
+        py_other->ga.data,
-                                       (void *)&stride_Y0,
+        py_other->ga.offset,
-                                       (void *)&stride_Y1,
+        PyGpuArray_DIMS(indices_arr)[0],
-                                       (void *)py_other->ga.data,
+        PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind,
-                                       (void *)&py_other->ga.offset,
+        indices_arr->ga.data,
-                                       (void *)&PyGpuArray_DIMS(indices_arr)[0],
+        indices_arr->ga.offset,
-                                       (void *)&stride_ind,
+        set_instead_of_inc,
-                                       (void *)indices_arr->ga.data,
+        errbuf);
-                                       (void *)&indices_arr->ga.offset,
-                                       (void *)&set_instead_of_inc,
-                                       (void *)errbuf};
-              err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
              if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: %(k_var)s: %%s.",

--- a/theano/gpuarray/tests/c_code/tstgpueye.c
+++ b/theano/gpuarray/tests/c_code/tstgpueye.c
 #section kernels
 #kernel eye : *, size, size, size :
+#include <cluda.h>
 /* The eye name will be used to generate supporting objects.  The only
   you probably need to care about is the kernel object which will be
   named 'k_' + <the name above> (k_eye in this case).  This name also