Merge pull request #6317 from abergeron/gpuarray_07

Work to integrate libgpuarray 0.7 changes.

Merge pull request #6317 from abergeron/gpuarray_07
b980a8ee · Frédéric Bastien · GitHub · 814dc05a · 8b9dc5f1 · b980a8ee
--- a/.jenkins/gpuarray-branch
+++ b/.jenkins/gpuarray-branch
-v0.6.9
\ No newline at end of file
+v0.7.1
\ No newline at end of file
--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -46,23 +46,36 @@ def init_dev(dev, name=None, preallocate=None):
    global pygpu_activated
    if not config.cxx:
        raise RuntimeError("The new gpu-backend need a c++ compiler.")
-    if (pygpu.version.major, pygpu.version.minor, pygpu.version.patch) < (0, 6, 1):
+    if (pygpu.version.major != 0 or pygpu.version.minor != 7 or
+            pygpu.version.patch < 0):
        raise ValueError(
-            "Your installed version of pygpu is too old, please upgrade to 0.6.1 or later")
+            "Your installed version of pygpu(%s) is too old, please upgrade to 0.7.0 or later" %
+            pygpu.version.fullversion)
    # This is for the C headers API, we need to match the exact version.
-    if pygpu.gpuarray.api_version()[0] != 1:
+    gpuarray_version_major_supported = 2
+    gpuarray_version_major_detected = pygpu.gpuarray.api_version()[0]
+    if gpuarray_version_major_detected != gpuarray_version_major_supported:
        raise ValueError(
-            "Your installed libgpuarray is not in sync, please make sure to have the appropriate version")
+            "Your installed version oflibgpuarray is not in sync with the current Theano"
+            " version. The installed libgpuarray version support API version %d,"
+            " while current Theano support API version %d. Change the version of"
+            " libgpuarray or Theano to fix this problem.",
+            gpuarray_version_major_detected,
+            gpuarray_version_major_supported)
    if dev not in init_dev.devmap:
+        args = dict()
        if config.gpuarray.cache_path != '':
-            os.environ['GPUARRAY_CACHE_PATH'] = config.gpuarray.cache_path
+            args['kernel_cache_path'] = config.gpuarray.cache_path
        if preallocate is None:
            preallocate = config.gpuarray.preallocate
+        if preallocate < 0:
+            args['max_cache_size'] = 0
+        else:
+            args['initial_cache_size'] = preallocate
        context = pygpu.init(
            dev,
-            disable_alloc_cache=preallocate < 0,
-            single_stream=config.gpuarray.single_stream,
-            sched=config.gpuarray.sched)
+            sched=config.gpuarray.sched,
+            **args)
        context.dev = dev
        init_dev.devmap[dev] = context
        reg_context(name, context)
@@ -115,12 +128,12 @@ def init_dev(dev, name=None, preallocate=None):
    # This will map the context name to the real context object.
    if config.print_active_device:
        try:
-            pcibusid = '(' + context.pcibusid + ')'
+            unique_id = '(' + context.unique_id + ')'
        except pygpu.gpuarray.UnsupportedException:
-            pcibusid = ''
+            unique_id = ''

        print("Mapped name %s to device %s: %s %s" %
-              (name, dev, context.devname, pcibusid),
+              (name, dev, context.devname, unique_id),
              file=sys.stderr)
    pygpu_activated = True

@@ -207,5 +220,5 @@ else:
            config.device.startswith('opencl') or
            config.device.startswith('cuda') or
            config.contexts != ''):
-        error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)",
+        error("pygpu was configured but could not be imported or is too old (version 0.7 or higher required)",
              exc_info=True)
--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -158,7 +158,7 @@ class Kernel(object):
            the `params` list consists of C typecodes

    It can also have the key `cflags` which is a string of C flag
-    values like this `"GA_USE_DOUBLE|GA_USE_CLUDA"`.
+    values like this `"GA_USE_DOUBLE|GA_USE_SMALL"`.

    Parameters
    ----------
@@ -216,7 +216,7 @@ class Kernel(object):
            else:
                raise TypeError("can't get a dtype from %s" % (type(t),))
        dtypes = [get_dtype(t) for t in types]
-        flags = dict(cluda=True)
+        flags = dict()
        if any(d == np.float64 for d in dtypes):
            flags['have_double'] = True
        if any(d.itemsize < 4 for d in dtypes):
@@ -231,8 +231,6 @@ class Kernel(object):
        res = []
        if self.flags.get('cflags', '') != '':
            res.append(self.flags['cflags'])
-        if self.flags.get('cluda', False):
-            res.append('GA_USE_CLUDA')
        if self.flags.get('have_double', False):
            res.append('GA_USE_DOUBLE')
        if self.flags.get('have_small', False):
@@ -241,15 +239,16 @@ class Kernel(object):
            res.append('GA_USE_COMPLEX')
        if self.flags.get('have_half', False):
            res.append('GA_USE_HALF')
-        return '|'.join(res)
+        res = '|'.join(res)
+        if not res:
+            return '0'
+        return res

    def _get_py_flags(self):
        res = dict(self.flags)
        cflags = res.pop('cflags', '')
        for fl in cflags.split('|'):
            fl = fl.strip()
-            if fl == 'GA_USE_CLUDA':
-                res['cluda'] = True
            if fl == 'GA_USE_DOUBLE':
                res['have_double'] = True
            if fl == 'GA_USE_SMALL':
@@ -555,7 +554,7 @@ class CGpuKernelBase(COp, GpuKernelBase):
                kflags = splt2[2].strip()
                kcode = def_macros + '\n' + kcode + '\n' + undef_macros
                res.append(Kernel(kcode, ktypes, kname,
-                                  flags=dict(cluda=True, cflags=kflags)))
+                                  flags=dict(cflags=kflags)))
                n += 2
            self._cached_kernels = res
            return res
@@ -703,39 +702,35 @@ class GpuFromHost(Op):
        if (%(name)s_tmp == NULL)
          %(fail)s

-        if (%(out)s != NULL && GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) &&
-            theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
-                              (size_t *)PyArray_DIMS(%(name)s_tmp),
-                              get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
-          Py_BEGIN_ALLOW_THREADS
-          err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
-                               PyArray_NBYTES(%(name)s_tmp));
-          Py_END_ALLOW_THREADS
-          Py_DECREF(%(name)s_tmp);
-          if (err != GA_NO_ERROR) {
-            PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
-            %(fail)s;
-          }
-        } else {
+        if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
+            !theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
+                               (size_t *)PyArray_DIMS(%(name)s_tmp),
+                               get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
          Py_XDECREF(%(out)s);
-          // This method will release the GIL when needed.
-          %(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp),
-                                       get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
-                                       PyArray_NDIM(%(name)s_tmp),
-                                       (size_t *)PyArray_DIMS(%(name)s_tmp),
-                                       (ssize_t *)PyArray_STRIDES(%(name)s_tmp),
-                                       %(ctx)s,
-                                       Py_None);
-          Py_DECREF(%(name)s_tmp);
+          %(out)s = pygpu_empty(PyArray_NDIM(%(name)s_tmp),
+                                (size_t *)PyArray_DIMS(%(name)s_tmp),
+                                get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
+                                GA_C_ORDER, %(ctx)s, Py_None);
          if (%(out)s == NULL) {
-              %(fail)s
+            Py_DECREF(%(name)s_tmp);
+            %(fail)s;
          }
        }
+
+        Py_BEGIN_ALLOW_THREADS
+        err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
+                             PyArray_NBYTES(%(name)s_tmp));
+        Py_END_ALLOW_THREADS
+        Py_DECREF(%(name)s_tmp);
+        if (err != GA_NO_ERROR) {
+          PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
+          %(fail)s;
+        }
        """ % {'name': name, 'inp': inputs[0], 'ctx': sub['params'],
               'out': outputs[0], 'fail': sub['fail']}

    def c_code_cache_version(self):
-        return (9,)
+        return (10,)


 class GpuToGpu(Op):
@@ -1619,7 +1614,8 @@ class GpuEye(GpuKernelBase, Op):
                for i in xrange(3)]

    def gpu_kernels(self, node, name):
-        code = """
+        code = """#include "cluda.h"
+
 KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);

--- a/theano/gpuarray/c_code/corr3d_gemm.c
+++ b/theano/gpuarray/c_code/corr3d_gemm.c
 #section kernels

 #kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
-// TODO check kernel flags
+#include "cluda.h"
+
 // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
 // sources are clearly marked. Below we reproduce the original license of
 // the Caffe software.
@@ -87,6 +88,8 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
 }

 #kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
+
 KERNEL void im3d2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
    const ga_size offset_im,
@@ -139,6 +142,8 @@ KERNEL void im3d2col_kernel(const ga_size n,

 // GPU kernel for the case of dilation
 #kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"
+
 KERNEL void dilated_col2im3d_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
    const ga_size offset_col,
@@ -207,6 +212,7 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
 }

 #kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"

 KERNEL void col2im3d_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,

--- a/theano/gpuarray/c_code/corr_gemm.c
+++ b/theano/gpuarray/c_code/corr_gemm.c
 #section kernels

 #kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 // TODO check kernel flags
 // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
 // sources are clearly marked. Below we reproduce the original license of
@@ -77,6 +78,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
 }

 #kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 KERNEL void im2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
@@ -122,6 +124,8 @@ KERNEL void im2col_kernel(const ga_size n,

 // GPU kernel for the case of dilation
 #kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"
+
 KERNEL void dilated_col2im_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size channels,
@@ -172,6 +176,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
 }

 #kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"

 KERNEL void col2im_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,

--- a/theano/gpuarray/c_code/dnn_fwd.c
+++ b/theano/gpuarray/c_code/dnn_fwd.c
@@ -199,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,

    if (!reuse_algo) {
      char pci_id[16];
-      gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
+      gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
      // check out cache
      hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
      if (hashkey.empty()) {

--- a/theano/gpuarray/c_code/dnn_gi.c
+++ b/theano/gpuarray/c_code/dnn_gi.c
@@ -168,7 +168,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,

    if (!reuse_algo) {
      char pci_id[16];
-      gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
+      gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
      // check out cache
      hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
      if (hashkey.empty()) {

--- a/theano/gpuarray/c_code/dnn_gw.c
+++ b/theano/gpuarray/c_code/dnn_gw.c
@@ -155,7 +155,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,

    if (!reuse_algo) {
      char pci_id[16];
-      gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
+      gpucontext_property(c->ctx, GA_CTX_PROP_UNIQUE_ID, pci_id);
      // check out cache
      hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups);
      if (hashkey.empty()) {

--- a/theano/gpuarray/c_code/magma_cholesky.c
+++ b/theano/gpuarray/c_code/magma_cholesky.c
 #section kernels

 #kernel tril_kernel : size, size, size, *:
+#include "cluda.h"

 KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
@@ -17,6 +18,7 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
 }

 #kernel triu_kernel : size, size, size, *:
+#include "cluda.h"

 KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {

--- a/theano/gpuarray/c_code/magma_qr.c
+++ b/theano/gpuarray/c_code/magma_qr.c
 #section kernels

 #kernel triu_kernel : size, size, size, *:
+#include "cluda.h"

 KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {

--- a/theano/gpuarray/c_code/pool.c
+++ b/theano/gpuarray/c_code/pool.c
 #section kernels

 #kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_kernel(const ga_size nthreads,
@@ -44,6 +45,7 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
 }

 #kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_kernel(const ga_size nthreads,
@@ -95,6 +97,7 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
 }

 #kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size:
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool2d_kernel(const ga_size nthreads,
@@ -150,6 +153,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
 }

 #kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool3d_kernel(const ga_size nthreads,

--- a/theano/gpuarray/c_code/pool_ave_grad.c
+++ b/theano/gpuarray/c_code/pool_ave_grad.c
 #section kernels

 #kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
@@ -50,6 +51,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
 }

 #kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,

--- a/theano/gpuarray/c_code/pool_grad_grad.c
+++ b/theano/gpuarray/c_code/pool_grad_grad.c
 #section kernels

 #kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
@@ -47,6 +48,7 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
 }

 #kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_depth,

--- a/theano/gpuarray/c_code/pool_max_grad.c
+++ b/theano/gpuarray/c_code/pool_max_grad.c
 #section kernels

 #kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
@@ -43,6 +44,7 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
 }

 #kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,

--- a/theano/gpuarray/c_code/pool_max_rop.c
+++ b/theano/gpuarray/c_code/pool_max_rop.c
 #section kernels

 #kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
@@ -50,6 +51,7 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
 }

 #kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -1743,7 +1743,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_ccontig"
            k_var = "kernel_reduce_ccontig_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1781,7 +1782,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_1"
            k_var = "kernel_reduce_1_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1821,7 +1823,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_11"
            k_var = "kernel_reduce_11_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1909,7 +1912,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
                {}, True)
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
                %(decl)s{
                    %(init)s
                    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
@@ -1943,7 +1947,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_010"
            k_var = "kernel_reduce_010_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1989,7 +1994,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_010_AD"
            k_var = "kernel_reduce_010_AD_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size A, const ga_size B, const ga_size C, const ga_size D,
                    const %(in_type)s *X, const ga_size offset_X,
@@ -2053,7 +2059,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
              %(init)s
@@ -2088,7 +2095,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_110"
            k_var = "kernel_reduce_110_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -2133,7 +2141,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
                %(init)s
@@ -2163,7 +2172,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[0])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
                %(init)s
@@ -2195,7 +2205,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_001"
            k_var = "kernel_reduce_001_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -2244,7 +2254,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
                %(init)s
@@ -2280,7 +2291,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
                %(init)s
@@ -2314,7 +2326,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[0])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
                %(init)s
@@ -2345,7 +2358,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_1011"
            k_var = "kernel_reduce_1011_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -2502,15 +2516,15 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):

    def gpu_kernels(self, node, name):
        if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
-            # Some OpenCL compilers do not accept no-arguments kernels
-            src = "KERNEL void reduk(GLOBAL_MEM float *a) {}"
+            # Some OpenCL compilers reject kernels that take no arguments or have an empty body
+            src = "#include \"cluda.h\"\nKERNEL void reduk(GLOBAL_MEM float *a) { a[0] = 0; }"
            params = ['float32']
        else:
            k = self.get_kernel_cache(node)
            _, src, _, _ = k._get_basic_kernel(k.init_local_size,
                                               node.inputs[0].ndim)
            nd = node.inputs[0].ndim
-            params = ['uint32', gpuarray.GpuArray]
+            params = ['uint32', gpuarray.GpuArray, 'uint32']
            params.extend('uint32' for _ in range(nd))
            params.append(gpuarray.GpuArray)
            params.append('uint32')
@@ -2617,9 +2631,10 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
        code += """
        args[0] = &n;
        args[1] = tmp->ga.data;
+        args[2] = &tmp->ga.offset;
        """ % dict(output=output)

-        p = 2
+        p = 3
        for i in range(node.inputs[0].ndim):
            code += """
        proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
@@ -2677,7 +2692,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
        return code

    def c_code_cache_version_apply(self, node):
-        return (3, self.kernel_version(node))
+        return (4, self.kernel_version(node))

    def generate_kernel(self, node, odtype, redux):
        if isinstance(self.scalar_op, scalar.basic.Add):

--- a/theano/gpuarray/extra_ops.py
+++ b/theano/gpuarray/extra_ops.py
@@ -74,7 +74,8 @@ class GpuCumOp(GpuKernelBase, Op):
        k_var = "k_cumadd_" + nodename
        dtype_x = node.inputs[0].dtype
        flags = Kernel.get_flags(dtype_x)
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void %(kname)s(float* input, ga_size input_offset,
                              float* output, ga_size output_offset,
                              ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
@@ -112,7 +113,8 @@ class GpuCumOp(GpuKernelBase, Op):
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
                  'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
-        code = """
+        code = """#include "cluda.h"
+
        // helper functions
        WITHIN_KERNEL
        void k_reductionPhase(float* partialCumOp) {
@@ -213,7 +215,8 @@ class GpuCumOp(GpuKernelBase, Op):
        # k_finalCumOp
        kname = "k_finalCumOp"
        k_var = "k_finalCumOp_" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void k_finalCumOp(float* output, ga_size output_offset,
                                 float* blockSum, ga_size blockSum_offset,
                                 size_t nbElementsPerCumOp,

--- a/theano/gpuarray/fp16_help.py
+++ b/theano/gpuarray/fp16_help.py
@@ -22,7 +22,7 @@ def load_w(dtype):

    """
    if dtype == 'float16':
-        return '__half2float'
+        return 'ga_half2float'
    else:
        return ''

@@ -37,6 +37,6 @@ def write_w(dtype):

    """
    if dtype == 'float16':
-        return '__float2half_rn'
+        return 'ga_float2half'
    else:
        return ''
--- a/theano/gpuarray/kernel_codegen.py
+++ b/theano/gpuarray/kernel_codegen.py
@@ -34,7 +34,9 @@ def nvcc_kernel(name, params, body):
            else:
                yield b
    bodystr = ';\n'.join(flatbody())
-    return """KERNEL void %(name)s (%(paramstr)s)
+    return """#include "cluda.h"
+
+    KERNEL void %(name)s (%(paramstr)s)
    {
        %(bodystr)s;
    }

--- a/theano/gpuarray/multinomial.py
+++ b/theano/gpuarray/multinomial.py
@@ -66,7 +66,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
        work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
        write_out_ctype = write_w(node.outputs[0].dtype)
        load_in_ctype = load_w(node.inputs[0].dtype)
-        code = """
+        code = """#include "cluda.h"
+
 KERNEL void k_multi_warp_multinomial(
    const ga_size nb_multi,
    const ga_size nb_outcomes,
@@ -276,7 +277,8 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op):

    def gpu_kernels(self, node, name):
        replace = int(self.replace)
-        code = """
+        code = """#include "cluda.h"
+
 KERNEL void k_multi_warp_multinomial_wor(
    const ga_size nb_multi,
    const ga_size nb_outcomes,

--- a/theano/gpuarray/neighbours.py
+++ b/theano/gpuarray/neighbours.py
@@ -61,7 +61,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        // a version that uses less registers but doesn't work in all cases.
        %(mode_constants)s
        KERNEL void %(kname)s(
@@ -163,7 +164,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):

        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
@@ -500,7 +502,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
            size_t threads_per_block[3] = {d, c, 1};
            //get the max threads per blocks
            size_t max_threads_dim;
-            int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
+            int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim);
            if (err != GA_NO_ERROR){
                PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
                %(fail)s;

--- a/theano/gpuarray/nnet.py
+++ b/theano/gpuarray/nnet.py
@@ -75,7 +75,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE
        ]
        sio = StringIO()
-        print("""
+        print("""#include "cluda.h"
+
        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
            GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
@@ -393,7 +394,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
        ]
        sio = StringIO()
-        print("""
+        print("""#include "cluda.h"
+
        KERNEL void %(kname)s(
           const ga_size N, const ga_size K,
           GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
@@ -495,7 +497,7 @@ class GpuSoftmax(GpuKernelBase, Op):
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32 * 1024)), 1, 1};
 //TODO, detect the maximum number of thread per block.
-            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE
+            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE0
            size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
                                     2 * sizeof(npy_%(work_x)s);
            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
@@ -557,7 +559,8 @@ class GpuSoftmax(GpuKernelBase, Op):
        kernels = []
        kname = "kSoftmax"
        k_var = "kSoftmax_" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                               GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                               GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
@@ -630,7 +633,8 @@ class GpuSoftmax(GpuKernelBase, Op):
                              flags=flags, objvar=k_var))
        kname = "kSoftmax_fixed_shared"
        k_var = "kSoftmax_fixed_shared" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                               GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                               GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
@@ -788,7 +792,7 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
        {
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1};
 //TODO, detect the maximum number of thread per block.
-            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE
+            size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)256), 1, 1}; // TODO: Read GA_CTX_PROP_MAXLSIZE0
            size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
                                     2 * sizeof(npy_%(work_x)s);
            ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
@@ -854,7 +858,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
        kernels = []
        kname = "kSoftmaxWithBias"
        k_var = "kSoftmaxWithBias_" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
@@ -930,7 +935,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
                              flags=flags, objvar=k_var))
        kname = "kSoftmaxWithBias_fixed_shared"
        k_var = "kSoftmaxWithBias_fixed_shared" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -1110,17 +1110,11 @@ def local_gpua_advanced_boolean_subtensor(op, context_name, inputs, outputs):
 @op_lifter([tensor.AdvancedIncSubtensor1])
 @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
 def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs):
-    context = get_context(context_name)
-    # This is disabled on non-cuda contexts
-    if context.kind != b'cuda':
-        return None
-
    x, y, ilist = inputs

    set_instead_of_inc = op.set_instead_of_inc

-    compute_capability = int(context.bin_id[-2])
-    if (compute_capability >= 2 and x.ndim == 1 and y.ndim == 0 and
+    if (x.ndim == 1 and y.ndim == 0 and
            config.deterministic == 'default'):
        x = x.dimshuffle(0, 'x')
        y = y.dimshuffle('x', 'x')
@@ -1128,7 +1122,7 @@ def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs):
            set_instead_of_inc=set_instead_of_inc)(x, y, ilist)
        ret = GpuDimShuffle(ret.type.broadcastable, [0])(ret)
        return ret
-    elif (compute_capability < 2 or x.ndim != 2 or y.ndim != 2 or
+    elif (x.ndim != 2 or y.ndim != 2 or
            config.deterministic == 'more'):
        return GpuAdvancedIncSubtensor1(
            set_instead_of_inc=set_instead_of_inc)

--- a/theano/gpuarray/rng_mrg.py
+++ b/theano/gpuarray/rng_mrg.py
@@ -80,7 +80,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
        else:
            raise ValueError('Unsupported data type for output',
                             self.output_type.dtype)
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void mrg_uniform(
                GLOBAL_MEM %(otype)s *sample_data,
                ga_size sample_offset,

--- a/theano/gpuarray/subtensor.py
+++ b/theano/gpuarray/subtensor.py
--- a/theano/gpuarray/tests/c_code/tstgpueye.c
+++ b/theano/gpuarray/tests/c_code/tstgpueye.c
 #section kernels

 #kernel eye : *, size, size, size :
+#include <cluda.h>
 /* The eye name will be used to generate supporting objects.  The only
   you probably need to care about is the kernel object which will be
   named 'k_' + <the name above> (k_eye in this case).  This name also