Add cluda include to all kernels.

b80a7d12 · Arnaud Bergeron · Frederic Bastien · 28ffda75 · b80a7d12 · b80a7d12
--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -158,7 +158,7 @@ class Kernel(object):
            the `params` list consists of C typecodes

    It can also have the key `cflags` which is a string of C flag
-    values like this `"GA_USE_DOUBLE|GA_USE_CLUDA"`.
+    values like this `"GA_USE_DOUBLE|GA_USE_SMALL"`.

    Parameters
    ----------
@@ -216,7 +216,7 @@ class Kernel(object):
            else:
                raise TypeError("can't get a dtype from %s" % (type(t),))
        dtypes = [get_dtype(t) for t in types]
-        flags = dict(cluda=True)
+        flags = dict()
        if any(d == np.float64 for d in dtypes):
            flags['have_double'] = True
        if any(d.itemsize < 4 for d in dtypes):
@@ -231,8 +231,6 @@ class Kernel(object):
        res = []
        if self.flags.get('cflags', '') != '':
            res.append(self.flags['cflags'])
-        if self.flags.get('cluda', False):
-            res.append('GA_USE_CLUDA')
        if self.flags.get('have_double', False):
            res.append('GA_USE_DOUBLE')
        if self.flags.get('have_small', False):
@@ -241,15 +239,16 @@ class Kernel(object):
            res.append('GA_USE_COMPLEX')
        if self.flags.get('have_half', False):
            res.append('GA_USE_HALF')
-        return '|'.join(res)
+        res = '|'.join(res)
+        if not res:
+            return '0'
+        return res

    def _get_py_flags(self):
        res = dict(self.flags)
        cflags = res.pop('cflags', '')
        for fl in cflags.split('|'):
            fl = fl.strip()
-            if fl == 'GA_USE_CLUDA':
-                res['cluda'] = True
            if fl == 'GA_USE_DOUBLE':
                res['have_double'] = True
            if fl == 'GA_USE_SMALL':
@@ -555,7 +554,7 @@ class CGpuKernelBase(COp, GpuKernelBase):
                kflags = splt2[2].strip()
                kcode = def_macros + '\n' + kcode + '\n' + undef_macros
                res.append(Kernel(kcode, ktypes, kname,
-                                  flags=dict(cluda=True, cflags=kflags)))
+                                  flags=dict(cflags=kflags)))
                n += 2
            self._cached_kernels = res
            return res
@@ -1619,7 +1618,8 @@ class GpuEye(GpuKernelBase, Op):
                for i in xrange(3)]

    def gpu_kernels(self, node, name):
-        code = """
+        code = """#include "cluda.h"
+
 KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);

--- a/theano/gpuarray/c_code/corr3d_gemm.c
+++ b/theano/gpuarray/c_code/corr3d_gemm.c
 #section kernels

 #kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
-// TODO check kernel flags
+#include "cluda.h"
+
 // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
 // sources are clearly marked. Below we reproduce the original license of
 // the Caffe software.
@@ -87,6 +88,8 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
 }

 #kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
+
 KERNEL void im3d2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
    const ga_size offset_im,
@@ -139,6 +142,8 @@ KERNEL void im3d2col_kernel(const ga_size n,

 // GPU kernel for the case of dilation
 #kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"
+
 KERNEL void dilated_col2im3d_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
    const ga_size offset_col,
@@ -207,6 +212,7 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
 }

 #kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"

 KERNEL void col2im3d_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,

--- a/theano/gpuarray/c_code/corr_gemm.c
+++ b/theano/gpuarray/c_code/corr_gemm.c
 #section kernels

 #kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"
 // TODO check kernel flags
 // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
 // sources are clearly marked. Below we reproduce the original license of
@@ -77,6 +78,7 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
 }

 #kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 KERNEL void im2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
@@ -122,6 +124,8 @@ KERNEL void im2col_kernel(const ga_size n,

 // GPU kernel for the case of dilation
 #kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"
+
 KERNEL void dilated_col2im_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size channels,
@@ -172,6 +176,7 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
 }

 #kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+#include "cluda.h"

 KERNEL void col2im_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,

--- a/theano/gpuarray/c_code/magma_cholesky.c
+++ b/theano/gpuarray/c_code/magma_cholesky.c
 #section kernels

 #kernel tril_kernel : size, size, size, *:
+#include "cluda.h"

 KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
@@ -17,6 +18,7 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
 }

 #kernel triu_kernel : size, size, size, *:
+#include "cluda.h"

 KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {

--- a/theano/gpuarray/c_code/magma_qr.c
+++ b/theano/gpuarray/c_code/magma_qr.c
 #section kernels

 #kernel triu_kernel : size, size, size, *:
+#include "cluda.h"

 KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {

--- a/theano/gpuarray/c_code/pool.c
+++ b/theano/gpuarray/c_code/pool.c
 #section kernels

 #kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_kernel(const ga_size nthreads,
@@ -44,6 +45,7 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
 }

 #kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_kernel(const ga_size nthreads,
@@ -95,6 +97,7 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
 }

 #kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, bool, bool, *, size:
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool2d_kernel(const ga_size nthreads,
@@ -150,6 +153,7 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
 }

 #kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool3d_kernel(const ga_size nthreads,

--- a/theano/gpuarray/c_code/pool_ave_grad.c
+++ b/theano/gpuarray/c_code/pool_ave_grad.c
 #section kernels

 #kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, bool, bool, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
@@ -50,6 +51,7 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
 }

 #kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, bool, bool, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,

--- a/theano/gpuarray/c_code/pool_grad_grad.c
+++ b/theano/gpuarray/c_code/pool_grad_grad.c
 #section kernels

 #kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
@@ -47,6 +48,7 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
 }

 #kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_depth,

--- a/theano/gpuarray/c_code/pool_max_grad.c
+++ b/theano/gpuarray/c_code/pool_max_grad.c
 #section kernels

 #kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
@@ -43,6 +44,7 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
 }

 #kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,

--- a/theano/gpuarray/c_code/pool_max_rop.c
+++ b/theano/gpuarray/c_code/pool_max_rop.c
 #section kernels

 #kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
@@ -50,6 +51,7 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
 }

 #kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :
+#include "cluda.h"

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -1743,7 +1743,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_ccontig"
            k_var = "kernel_reduce_ccontig_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1781,7 +1782,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_1"
            k_var = "kernel_reduce_1_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1821,7 +1823,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_11"
            k_var = "kernel_reduce_11_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1909,7 +1912,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
                {}, True)
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
                %(decl)s{
                    %(init)s
                    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
@@ -1943,7 +1947,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_010"
            k_var = "kernel_reduce_010_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -1989,7 +1994,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_010_AD"
            k_var = "kernel_reduce_010_AD_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size A, const ga_size B, const ga_size C, const ga_size D,
                    const %(in_type)s *X, const ga_size offset_X,
@@ -2053,7 +2059,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
              %(init)s
@@ -2088,7 +2095,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_110"
            k_var = "kernel_reduce_110_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -2133,7 +2141,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
                %(init)s
@@ -2163,7 +2172,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[0])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
                %(init)s
@@ -2195,7 +2205,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_001"
            k_var = "kernel_reduce_001_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -2244,7 +2254,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
                %(init)s
@@ -2280,7 +2291,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
                %(init)s
@@ -2314,7 +2326,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                             {}, True)
            reduce_init = self._assign_init(load_in + "(A[0])")
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            %(decl)s
            {
                %(init)s
@@ -2345,7 +2358,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            kname = "kernel_reduce_1011"
            k_var = "kernel_reduce_1011_" + nodename
            sio = StringIO()
-            print("""
+            print("""#include "cluda.h"
+
            KERNEL void %(kname)s(
                    const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
                    const %(in_type)s *A, const ga_size offset_A,
@@ -2502,8 +2516,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):

    def gpu_kernels(self, node, name):
        if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
-            # Some OpenCL compilers do not accept no-arguments kernels
-            src = "KERNEL void reduk(GLOBAL_MEM float *a) {}"
+            # Some OpenCL compilers do not accept no-arguments empty kernels
+            src = "#include \"cluda.h\"\nKERNEL void reduk(GLOBAL_MEM float *a) { a[0] = 0; }"
            params = ['float32']
        else:
            k = self.get_kernel_cache(node)

--- a/theano/gpuarray/extra_ops.py
+++ b/theano/gpuarray/extra_ops.py
@@ -74,7 +74,8 @@ class GpuCumOp(GpuKernelBase, Op):
        k_var = "k_cumadd_" + nodename
        dtype_x = node.inputs[0].dtype
        flags = Kernel.get_flags(dtype_x)
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void %(kname)s(float* input, ga_size input_offset,
                              float* output, ga_size output_offset,
                              ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
@@ -112,7 +113,8 @@ class GpuCumOp(GpuKernelBase, Op):
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
                  'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
-        code = """
+        code = """#include "cluda.h"
+
        // helper functions
        WITHIN_KERNEL
        void k_reductionPhase(float* partialCumOp) {
@@ -213,7 +215,8 @@ class GpuCumOp(GpuKernelBase, Op):
        # k_finalCumOp
        kname = "k_finalCumOp"
        k_var = "k_finalCumOp_" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void k_finalCumOp(float* output, ga_size output_offset,
                                 float* blockSum, ga_size blockSum_offset,
                                 size_t nbElementsPerCumOp,

--- a/theano/gpuarray/kernel_codegen.py
+++ b/theano/gpuarray/kernel_codegen.py
@@ -34,7 +34,9 @@ def nvcc_kernel(name, params, body):
            else:
                yield b
    bodystr = ';\n'.join(flatbody())
-    return """KERNEL void %(name)s (%(paramstr)s)
+    return """#include "cluda.h"
+
+    KERNEL void %(name)s (%(paramstr)s)
    {
        %(bodystr)s;
    }

--- a/theano/gpuarray/multinomial.py
+++ b/theano/gpuarray/multinomial.py
@@ -66,7 +66,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
        work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
        write_out_ctype = write_w(node.outputs[0].dtype)
        load_in_ctype = load_w(node.inputs[0].dtype)
-        code = """
+        code = """#include "cluda.h"
+
 KERNEL void k_multi_warp_multinomial(
    const ga_size nb_multi,
    const ga_size nb_outcomes,
@@ -276,7 +277,8 @@ class GPUAChoiceFromUniform(GpuKernelBase, Op):

    def gpu_kernels(self, node, name):
        replace = int(self.replace)
-        code = """
+        code = """#include "cluda.h"
+
 KERNEL void k_multi_warp_multinomial_wor(
    const ga_size nb_multi,
    const ga_size nb_outcomes,

--- a/theano/gpuarray/neighbours.py
+++ b/theano/gpuarray/neighbours.py
@@ -61,7 +61,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        // a version that uses less registers but doesn't work in all cases.
        %(mode_constants)s
        KERNEL void %(kname)s(
@@ -163,7 +164,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):

        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,

--- a/theano/gpuarray/nnet.py
+++ b/theano/gpuarray/nnet.py
@@ -75,7 +75,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE
        ]
        sio = StringIO()
-        print("""
+        print("""#include "cluda.h"
+
        KERNEL void %(kname)s(const ga_size M, const ga_size N,
            GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1,
            GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0,
@@ -393,7 +394,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
            gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE,
        ]
        sio = StringIO()
-        print("""
+        print("""#include "cluda.h"
+
        KERNEL void %(kname)s(
           const ga_size N, const ga_size K,
           GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0,
@@ -557,7 +559,8 @@ class GpuSoftmax(GpuKernelBase, Op):
        kernels = []
        kname = "kSoftmax"
        k_var = "kSoftmax_" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                               GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                               GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
@@ -630,7 +633,8 @@ class GpuSoftmax(GpuKernelBase, Op):
                              flags=flags, objvar=k_var))
        kname = "kSoftmax_fixed_shared"
        k_var = "kSoftmax_fixed_shared" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                               GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                               GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf))
@@ -854,7 +858,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
        kernels = []
        kname = "kSoftmaxWithBias"
        k_var = "kSoftmaxWithBias_" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,
@@ -930,7 +935,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
                              flags=flags, objvar=k_var))
        kname = "kSoftmaxWithBias_fixed_shared"
        k_var = "kSoftmaxWithBias_fixed_shared" + nodename
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void %(kname)s (const ga_size M, const ga_size N,
                       GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1,
                       GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0,

--- a/theano/gpuarray/rng_mrg.py
+++ b/theano/gpuarray/rng_mrg.py
@@ -80,7 +80,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
        else:
            raise ValueError('Unsupported data type for output',
                             self.output_type.dtype)
-        code = """
+        code = """#include "cluda.h"
+
        KERNEL void mrg_uniform(
                GLOBAL_MEM %(otype)s *sample_data,
                ga_size sample_offset,

--- a/theano/gpuarray/subtensor.py
+++ b/theano/gpuarray/subtensor.py
@@ -1121,7 +1121,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
        flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
        kname = "k_vector_add_fast"
        k_var = "k_vector_add_fast_" + nodename
-        code = """#include <cluda.h>
+        code = """#include "cluda.h"
        KERNEL void k_vector_add_fast(const ga_size numRowsX,
                                      const ga_size numColsX,
                                      const ga_ssize stridesX0,
@@ -1211,7 +1211,7 @@ if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of
        PyGpuArray_DIMS(py_other)[0],
        PyGpuArray_DIMS(py_other)[1],
        PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y,
-        PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y
+        PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y,
        py_other->ga.data,
        py_other->ga.offset,
        PyGpuArray_DIMS(indices_arr)[0],

--- a/theano/gpuarray/tests/c_code/tstgpueye.c
+++ b/theano/gpuarray/tests/c_code/tstgpueye.c
 #section kernels

 #kernel eye : *, size, size, size :
+#include <cluda.h>
 /* The eye name will be used to generate supporting objects.  The only
   you probably need to care about is the kernel object which will be
   named 'k_' + <the name above> (k_eye in this case).  This name also