Commit 89f584bc authored by Sean Lee

Use the CUDA driver API for CUDA gpuarray operations.

Instead of mixing the CUDA driver API and the runtime API in the generated code, use only the CUDA driver API. GPU programs for CUDA gpuarray operations (except conv operations) are now generated as a string that is passed to the python interface of libgpuarray. libgpuarray then generates a cubin bytearray, which is embedded in the generated code. The generated code then uses the CUDA driver API via the C++ interface of libgpuarray to load and launch the GPU program. This has at least two benefits: (1) This approach does not use the nvcc offline compiler to compile the generated code into the shared library. It uses the host compiler directly, which is likely to be faster. Note that, for cubin generation, libgpuarray still uses the nvcc offline compiler, but an improvement is being made to use NVRTC and ptxas instead of nvcc, which should be, again, faster. (2) Mixing the CUDA driver API and the runtime API is typically discouraged.
Parent 7852531c
......@@ -145,11 +145,18 @@ class GpuKernelBase(object):
return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
def c_support_code_apply(self, node, name):
    """Return the per-apply C support code for this node's GPU kernels.

    The result concatenates a ``ceil_intdiv`` helper (integer ceiling
    division, used by generated launch code to size grids), followed by
    the embedded kernel binaries, the kernel source strings, and the
    kernel variable declarations for every kernel of this node.
    """
    # NOTE: the span previously contained both the old return (without
    # ceil_intdiv) and the new one — diff residue; only the version that
    # includes ceil_intdiv is kept.
    ceil_intdiv = """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
    kernels = self.gpu_kernels(node, name)
    bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
    codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
    # Renamed from ``vars`` to avoid shadowing the builtin.
    kernel_vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
    return '\n'.join([ceil_intdiv, bins, codes, kernel_vars])
def _generate_kernel_init(self, k, err):
if PY3:
......
......@@ -8,6 +8,7 @@ from theano import Apply, scalar, config
from theano import scalar as scal
from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined
from theano.gof.cmodule import GCC_compiler
from theano.scalar import Scalar
from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
......@@ -23,7 +24,6 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase, Kernel)
from .comp import NVCC_compiler
from .type import GpuArrayType
from .fp16_help import load_w, write_w
......@@ -57,7 +57,7 @@ def as_C_string_const(s):
for l in s.split('\n'))
class GpuElemwise(HideC, Elemwise):
class GpuElemwise(GpuKernelBase, HideC, Elemwise):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
_f16_ok = True
......@@ -150,39 +150,7 @@ class GpuElemwise(HideC, Elemwise):
code.append('}')
kop = '\n'.join(code)
# Translate types for scalar composite ops (except complex).
# NB: OpenCL implicitly has 'stdint' defs at the kernel
# compilation stage
support_code = "" if pygpu.get_default_context().kind == 'opencl' else """
#ifdef _MSC_VER
#define signed __int8 int8_t
#define unsigned __int8 uint8_t
#define signed __int16 int16_t
#define unsigned __int16 uint16_t
#define signed __int32 int32_t
#define unsigned __int32 uint32_t
#define signed __int64 int64_t
#define unsigned __int64 uint64_t
#else
#include <stdint.h>
#endif
"""
# Translate ga_ pseudo-types into their specific realizations
support_code += """
#define ga_bool uint8_t
#define ga_byte int8_t
#define ga_ubyte uint8_t
#define ga_short int16_t
#define ga_ushort uint16_t
#define ga_int int32_t
#define ga_uint uint32_t
#define ga_long int64_t
#define ga_ulong uint64_t
#define ga_float float
#define ga_double double
#define ga_half uint16_t
"""
support_code = ""
try:
# We accept only some c_support_code().
# This filter is done in the make_node()
......@@ -204,60 +172,65 @@ class GpuElemwise(HideC, Elemwise):
kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_headers(self):
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_compiler(self):
return GCC_compiler
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return NVCC_compiler
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_support_code(self):
    # Delegate entirely to the wrapped scalar op's C support code.
    scalar_op = self.scalar_op
    return scalar_op.c_support_code()
def c_support_code_apply(self, node, nodename):
def _gpu_kernel_code(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
# This is useless by itself, but will serve an eventual c_code
# implementation
k = self.generate_kernel(node, nodename)
nd = node.inputs[0].type.ndim
CLUDA_PREAMBLE = """
#define local_barrier() __syncthreads();
#define WITHIN_KERNEL __device__
#define KERNEL extern "C" __global__
#define GLOBAL_MEM /* empty */
#define LOCAL_MEM __shared__
#define LOCAL_MEM_ARG /* empty */
#define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y*Z, 1)
#define LID_0 threadIdx.x
#define LID_1 threadIdx.y
#define LID_2 threadIdx.z
#define GID_0 blockIdx.x
#define GID_1 blockIdx.y
#define GID_2 blockIdx.z
#define LDIM_0 blockDim.x
#define LDIM_1 blockDim.y
#define LDIM_2 blockDim.z
#define GDIM_0 gridDim.x
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
"""
res = [CLUDA_PREAMBLE]
res = []
for i in range(0, nd + 1):
res.append(k.render_basic(i, name="elem_" + str(i)) + ';')
res.append(k.contig_src + ';')
return '\n'.join(res)
def gpu_kernels(self, node, nodename):
    """Build the list of Kernel descriptors for this elemwise node.

    CUDA only. The kernel parameter signature is: the total element
    count, one size per output dimension, then — for every input and
    every non-inplace output — a (GPU buffer, offset) pair followed by
    one stride per dimension.
    """
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    code = self._gpu_kernel_code(node, nodename)
    ndim = node.outputs[0].ndim
    # numEls, then one dimension size per axis.
    arg_types = ['uintp'] + ['uintp'] * ndim
    n_in = len(node.inputs)
    for idx in range(n_in + len(node.outputs)):
        # An inplace output aliases an input; it gets no separate args.
        if (idx - n_in) in self.inplace_pattern:
            continue
        arg_types += [gpuarray.GpuArray, 'uintp'] + ['intp'] * ndim
    acc = getattr(self, 'acc_dtype', None)
    if acc is None:
        acc = node.outputs[0].type.dtype
    flags = Kernel.get_flags(node.inputs[0].type.dtype, acc,
                             node.outputs[0].type.dtype)
    return [Kernel(code=code, name="elem_%d" % ndim, params=arg_types,
                   flags=flags,
                   objvar='elem_%d_%s' % (ndim, nodename))]
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
......@@ -273,11 +246,15 @@ class GpuElemwise(HideC, Elemwise):
# check that all inputs have valid dimensions
emitted_inames = {}
num_kernel_params = 1 + nd + len(inputs + outputs) * (2 + nd)
code = """
int n_blocks = 0;
int threads_per_block = 0;
size_t n_blocks = 0;
size_t threads_per_block = 0;
size_t numEls = 0;
"""
const ssize_t zero = 0;
void *kernel_params[%(num_kernel_params)d] = {0};
int err;
""" % locals()
if nd > 0:
code += """
size_t dims[%(nd)s] = {%(initial_dims)s};
......@@ -416,23 +393,41 @@ class GpuElemwise(HideC, Elemwise):
//std::cerr << "calling callkernel returned\\n";
""" % locals()
code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals()
param = []
kname = 'elem_%d_%s' % (nd, name)
param = ["(void *)&numEls"]
for i in range(nd):
param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern:
continue
dtype = dtype_to_ctype(var.dtype)
param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
param.append("%(name)s->ga.offset" % locals())
param.append("(void *)%(name)s->ga.data" % locals())
param.append("(void *)&%(name)s->ga.offset" % locals())
for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
code += ',\n'.join(param) + ");\n"
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? (void *)&zero: (void *)&PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
for n, p in enumerate(param):
code += "kernel_params[%(n)d] = %(p)s;\n" % locals()
code += """
err = GpuKernel_call(&%(kname)s, 1, &threads_per_block, &n_blocks, 0, kernel_params);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % dict(kname=kname,fail=fail)
if config.gpuarray.sync:
code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z)
code += """
err = GpuArray_sync(&%(z)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % locals()
return str(code)
def perform(self, node, inputs, output_storage):
......@@ -573,7 +568,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return (4,)
class GpuCAReduceCuda(HideC, CAReduceDtype):
class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"""
GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
......@@ -737,12 +732,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
return False
return True
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
    # One-time module-init code: register libgpuarray's CUDA extension
    # functions so the generated C code can call the driver API helpers.
    init_lines = ['setup_ext_cuda();']
    return init_lines
......@@ -840,7 +838,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
# \begin bracket the reduction in a check that there is
# actually work to do
if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals()
zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
# TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else:
scalar_op = self.scalar_op
......@@ -891,28 +889,24 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
.. code-block:: c
ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
if (verbose)
printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s)
);
[
if config.gpuarray.sync:
code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z)
]
if (cudaSuccess != cudaGetLastError())
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: ... );
%(fail)s;
}
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0,
(void *)&stride_A1,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
"""
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
......@@ -923,64 +917,66 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
ndim = len(self.reduce_mask)
nd_out = ndim - sum(self.reduce_mask)
shapes_format = "shape=(%s)" % ",".join(["%llu"] * node.inputs[0].ndim)
shapes_data = ",".join(["(unsigned long long) PyGpuArray_DIMS(%s)[%d]" % (x, i)
shapes_data = ",".join(["(size_t) PyGpuArray_DIMS(%s)[%d]" % (x, i)
for i in range(node.inputs[0].ndim)])
k_var = "kernel_reduce_%(pattern)s_%(name)s" % locals()
params = []
print("""
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
" nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d,"
" nb_block=%%d, n_shared=%%d, %(shapes_format)s\\n",
n_threads.x,n_threads.y,n_threads.z,
n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared, %(shapes_data)s);
kernel_reduce_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" % locals(), file=sio)
for i in xrange(ndim):
print("""
PyGpuArray_DIMS(%(x)s)[%(i)s],
""" % locals(), file=sio)
print("""
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset)
""" % locals(), file=sio)
params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals())
params.append("(void *)%(x)s->ga.data" % locals())
params.append("(void *)&%(x)s->ga.offset" % locals())
for i in xrange(ndim):
print("""
,PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s)
""" % locals(), file=sio)
print("""
,(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset)
ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s);
""" % locals(), file=sio)
params.append("(void *)&stride_A%(i)d" % locals())
params.append("(void *)%(z)s->ga.data" % locals())
params.append("(void *)&%(z)s->ga.offset" % locals())
for i in xrange(nd_out):
print("""
,PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s)
ssize_t stride_Z%(i)d = PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s);
""" % locals(), file=sio)
params.append("(void *)&stride_Z%(i)d" % locals())
kernel_params = ', '.join(params)
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
print("""
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
void *kernel_params[] = { %(kernel_params)s };
if (verbose>1)
printf("n_threads[0]=%%lu, n_threads[1]=%%lu, "
"n_threads[2]=%%lu, n_threads=%%lu, "
"n_blocks[0]=%%lu, n_blocks[1]=%%lu, n_blocks[2]=%%lu, "
"n_blocks=%%lu, n_shared=%%d, %(shapes_format)s\\n",
n_threads[0],n_threads[1],
n_threads[2],
n_threads[0]*n_threads[1]*
n_threads[2],
n_blocks[0],n_blocks[1],n_blocks[2],
n_blocks[0]*n_blocks[1]*n_blocks[2],
n_shared, %(shapes_data)s);
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
""" % locals(), file=sio)
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
);
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)"
" %(shapes_format)s \\n",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
%(shapes_data)s);
%(fail)s;
}
""" % locals(), file=sio)
return sio.getvalue()
......@@ -993,66 +989,86 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
.. code-block:: c
static __global__ void kernel_reduce_110_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A,
const int sA0,
const int sA1,
const int sA2,
%(out_dtype)s * Z,
const int sZ0)
KERNEL void kernel_reduce_110_%(nodename)s(
const ga_size d0,
const ga_size d1,
const ga_size d2,
const %(in_type)s *A,
const ga_size offset_A,
const ga_ssize sA0,
const ga_ssize sA1,
const ga_ssize sA2,
%(out_type)s * Z,
const ga_size offset_Z,
const ga_ssize sZ0)
Since the nodename is unique, we don't need to put the name
of the scalar_op in here.
"""
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
if reduce_mask is None:
reduce_mask = self.reduce_mask
if ndim is None:
ndim = len(reduce_mask)
if pattern is None:
pattern = ''.join(str(i) for i in reduce_mask)
kname = "kernel_reduce_%(pattern)s" % locals()
k_var = "kernel_reduce_%(pattern)s_%(nodename)s" % locals()
params = []
sio = StringIO()
print("""
static __global__ void kernel_reduce_%(pattern)s_%(nodename)s(
KERNEL void %(kname)s(
""" % locals(), file=sio)
for i in xrange(ndim):
params.append('uintp')
print("""
const int d%(i)s,
const ga_size d%(i)s,
""" % locals(), file=sio)
params.append(gpuarray.GpuArray)
params.append('uintp')
print("""
const %(in_dtype)s *A,
const %(in_type)s *A, const ga_size offset_A,
""" % locals(), file=sio)
for i in xrange(ndim):
params.append('intp')
print("""
const int sA%(i)s,
const ga_ssize sA%(i)s,
""" % locals(), file=sio)
params.append(gpuarray.GpuArray)
params.append('uintp')
print("""
%(out_dtype)s * Z
%(out_type)s * Z, const ga_size offset_Z
""" % locals(), file=sio)
for i in xrange(ndim - sum(reduce_mask)):
params.append('intp')
print("""
, const int sZ%(i)s
, const ga_ssize sZ%(i)s
""" % locals(), file=sio)
print(")", file=sio)
return sio.getvalue()
return sio.getvalue(), kname, params, k_var
def _k_init(self, node, nodename):
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype)
# We need to use theano_complex* and not npy_complex*
acc_dtype = theano.scalar.basic.Scalar(acc_dtype).dtype_specs()[1]
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
return """
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = 0;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = 0;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
......@@ -1315,7 +1331,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals()
zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
# TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else:
zero_shp = """
......@@ -1325,44 +1341,43 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
""" % locals()
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
k_var = "kernel_reduce_ccontig_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
if(PyGpuArray_SIZE(%(x)s)==0){
%(zero_shp)s;
}else{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_SIZE(%(x)s),
(size_t) 256));
dim3 n_blocks(1);
size_t numEls = PyGpuArray_SIZE(%(x)s);
size_t n_threads = std::min(numEls, (size_t) 256);
size_t n_blocks = 1;
void *kernel_params[] = {(void *)&numEls,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset};
if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
" n_threads.x=%%d, size=%%d, ndim=%%d\\n",
n_threads.x,PyGpuArray_SIZE(%(x)s),
" n_threads=%%lu, size=%%lu, ndim=%%d\\n",
n_threads,numEls,
PyGpuArray_NDIM(%(x)s));
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x;
kernel_reduce_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
PyGpuArray_SIZE(%(x)s),
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset));
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
%(err_check)s
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_ccontig_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
}
""" % locals(), file=sio)
......@@ -1372,10 +1387,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 256));
dim3 n_blocks(1);
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1385,15 +1398,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 256));
while (n_threads.y * n_threads.x <= 256) ++n_threads.y;
n_threads.y -= 1;
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0])
n_threads.y = PyGpuArray_DIMS(%(x)s)[0];
dim3 n_blocks(1);
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
while (n_threads[1] * n_threads[0] <= 256) ++n_threads[1];
n_threads[1] -= 1;
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
n_threads[1] = PyGpuArray_DIMS(%(x)s)[0];
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1421,25 +1433,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
threads_y = """
//get as many y threads as we can fit
while (n_threads.x * (n_threads.y+1) <= 256)
while (n_threads[0] * (n_threads[1]+1) <= 256)
{
if (n_threads.y < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
n_threads.y += 1;
if (n_threads[1] < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
n_threads[1] += 1;
else
break;
}""" % locals()
threads_z = """
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256)
while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
{
if (n_threads.z < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
n_threads.z += 1;
if (n_threads[2] < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
n_threads[2] += 1;
else
break;
}
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
n_threads[2] = std::min(n_threads[2], (size_t)64);
""" % locals()
if len(self.reduce_mask) == 2:
......@@ -1452,13 +1464,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[%(N)s],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[%(N)s], (size_t) 256), 1, 1};
%(threads_y)s
%(threads_z)s
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1476,9 +1485,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
k_var = "kernel_reduce_10_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
int verbose = 0;
......@@ -1491,95 +1512,71 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
// we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1);
size_t n_threads[3] = {32, 1, 1};
// We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32
// where E==32
int A = 1;
int B = PyGpuArray_DIMS(%(x)s)[0];
int C = PyGpuArray_DIMS(%(x)s)[1];
int D = C/32;
GpuKernel *%(k_var)s = &kernel_reduce_010_AD_%(name)s;
size_t A = 1;
size_t B = PyGpuArray_DIMS(%(x)s)[0];
size_t C = PyGpuArray_DIMS(%(x)s)[1];
size_t D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
if (n_blocks.x > 4096) n_blocks.x = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads>>>(
A,B,C,D,
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
1,
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
1,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s)
);
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_10_AD%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
size_t n_blocks[3] = {A, D, 1};
if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
ssize_t stride_A0 = 1;
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = 1;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&A, (void *)&B, (void *)&C, (void *)&D,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}else{
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 256));
dim3 n_blocks(1,
std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 4096));
GpuKernel *%(k_var)s = &kernel_reduce_010_%(name)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 4096), 1};
if (verbose) {
fprintf(stderr,
"running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x,
n_blocks.y);
n_blocks[0],
n_blocks[1]);
}
assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x;
kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1,
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
1,
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
1,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s)
);
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0];
size_t dim_0 = 1;
ssize_t stride_A0 = 1;
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = 1;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&dim_0,
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
}
""" % locals(), file=sio)
......@@ -1591,9 +1588,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
pattern = ''.join(str(i) for i in self.reduce_mask)
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
k_var = "kernel_reduce_010_AD_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
//int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
......@@ -1608,108 +1617,82 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
// we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1);
size_t n_threads[3] = {32, 1, 1};
// We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32
// where E==32
int A = PyGpuArray_DIMS(%(x)s)[0];
int B = PyGpuArray_DIMS(%(x)s)[1];
int C = PyGpuArray_DIMS(%(x)s)[2];
int D = C/32;
size_t A = PyGpuArray_DIMS(%(x)s)[0];
size_t B = PyGpuArray_DIMS(%(x)s)[1];
size_t C = PyGpuArray_DIMS(%(x)s)[2];
size_t D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
if (n_blocks.x > 4096) n_blocks.x = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x;
int n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s),
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s)
);
size_t n_blocks[3] = {A, D, 1};
if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&A, (void *)&B, (void *)&C, (void *)&D,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
else
{
int verbose = 2;
dim3 n_threads(std::min((size_t) 32,
PyGpuArray_DIMS(%(x)s)[2]));
while( (n_threads.x*(n_threads.y+1)<=256)
&& (n_threads.y<PyGpuArray_DIMS(%(x)s)[1])){
n_threads.y++;
size_t n_threads[3] = {std::min((size_t) 32, PyGpuArray_DIMS(%(x)s)[2]), 1, 1};
while( (n_threads[0]*(n_threads[1]+1)<=256)
&& (n_threads[1]<PyGpuArray_DIMS(%(x)s)[1])){
n_threads[1]++;
}
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t)4096));
n_blocks.y = std::min(
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096), 1, 1};
n_blocks[1] = std::min(
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x),
(size_t)(4096 / n_blocks.x)
(size_t)n_threads[0]),
(size_t)(4096 / n_blocks[0])
);
if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s)),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s))
==PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s)
&& n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x)){
&& n_blocks[1]==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads[0])){
if(verbose>1)
printf("n_block.x.1=%%d, n_block.x.2=%%d, n_block.y.1=%%d, n_block.y.2=%%d,\\n",
PyGpuArray_DIMS(%(x)s)[0],4096,
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads.x),
(size_t)(4096 / n_blocks.x));
assert(n_threads.x<=32);
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads[0]),
(size_t)(4096 / n_blocks[0]));
assert(n_threads[0]<=32);
%(makecall_inner)s
}else{
n_threads.x = std::min(PyGpuArray_DIMS(%(x)s)[1],
n_threads[0] = std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 256);
n_blocks.x = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
n_blocks.y = std::min(
n_blocks[0] = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
n_blocks[1] = std::min(
PyGpuArray_DIMS(%(x)s)[2],
(size_t)(4096 / n_blocks.x)
(size_t)(4096 / n_blocks[0])
);
%(makecall)s
}
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
}
""" % locals(), file=sio)
......@@ -1719,16 +1702,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[3],
(size_t) 256));
while (n_threads.x * n_threads.y <= 256)
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads[0] * n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1]) break;
n_threads.y += 1;
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1]) break;
n_threads[1] += 1;
}
n_threads.y -= 1;
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2]);
n_threads[1] -= 1;
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2], 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1738,7 +1719,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
sync = bool(config.gpuarray.sync)
k_var = "kernel_reduce_010_AD_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
# use threadIdx.x for i0
# use blockIdx.x for i1
# use blockIdx.y for i2
......@@ -1747,15 +1742,12 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
int verbose = 0;
if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){
printf("slow\\n");
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 256));
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t)4096));
while (n_blocks.x * (n_blocks.y+1) <= 4096 &&
n_blocks.y <= PyGpuArray_DIMS(%(x)s)[2])
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096), 1, 1};
while (n_blocks[0] * (n_blocks[1]+1) <= 4096 &&
n_blocks[1] <= PyGpuArray_DIMS(%(x)s)[2])
{
n_blocks.y += 1;
n_blocks[1] += 1;
}
%(makecall)s
}
......@@ -1763,50 +1755,38 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ // reuse 010_AD kernel, we transpose the 2 first dim
// See the reduction for the real 010_AD kernel for
// explanation. We do this to get coalesced read.
dim3 n_threads(32,1,1);
size_t n_threads[3] = {32, 1, 1};
int A = PyGpuArray_DIMS(%(x)s)[1];
int B = PyGpuArray_DIMS(%(x)s)[0];
int C = PyGpuArray_DIMS(%(x)s)[2];
int D = C/32;
size_t A = PyGpuArray_DIMS(%(x)s)[1];
size_t B = PyGpuArray_DIMS(%(x)s)[0];
size_t C = PyGpuArray_DIMS(%(x)s)[2];
size_t D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
if (n_blocks.x > 4096) n_blocks.x = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x;
int n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s),
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s)
);
if (%(sync)d)
GpuArray_sync(&%(z)s->ga);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
size_t n_blocks[3] = {A, D, 1};
if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
size_t n_shared = 0;
ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&A, (void *)&B, (void *)&C, (void *)&D,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}
}
""" % locals(), file=sio)
......@@ -1815,18 +1795,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 256));
while (n_threads.x*n_threads.y <= 256)
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
while (n_threads[0]*n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0])
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
break;
n_threads.y += 1;
n_threads[1] += 1;
}
n_threads.y -= 1;
n_threads[1] -= 1;
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[2]);
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[2], 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1836,19 +1814,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
dim3 n_blocks(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
while (n_blocks.x * n_blocks.y <= 4096)
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
while (n_blocks[0] * n_blocks[1] <= 4096)
{
if (n_blocks.y > PyGpuArray_DIMS(%(x)s)[1])
if (n_blocks[1] > PyGpuArray_DIMS(%(x)s)[1])
break;
n_blocks.y += 1;
n_blocks[1] += 1;
}
n_blocks.y -= 1;
n_blocks[1] -= 1;
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1858,31 +1832,29 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
//get as many y threads as we can fit
while (n_threads.x * n_threads.y <= 256)
while (n_threads[0] * n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1])
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
break;
n_threads.y += 1;
n_threads[1] += 1;
}
n_threads.y -= 1;
n_threads[1] -= 1;
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= 256)
while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
{
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
break;
n_threads.z += 1;
n_threads[2] += 1;
}
n_threads.z -= 1;
n_threads[2] -= 1;
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
n_threads[2] = std::min(n_threads[2], (size_t)64);
dim3 n_blocks(1,1,1);
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1896,24 +1868,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
int verbose = 0;
dim3 n_blocks(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
while (n_blocks.x * n_blocks.y <= 4096 &&
n_blocks.y < PyGpuArray_DIMS(%(x)s)[1])
while (n_blocks[0] * n_blocks[1] <= 4096 &&
n_blocks[1] < PyGpuArray_DIMS(%(x)s)[1])
{
n_blocks.y += 1;
n_blocks[1] += 1;
}
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[3],
(size_t) 256));
while (n_threads.x * n_threads.y <= 256
&& n_threads.y < PyGpuArray_DIMS(%(x)s)[2]
&& n_threads.x * n_threads.y * sizeof(%(acc_dtype)s) <=(15*1024-200))
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads[0] * n_threads[1] <= 256
&& n_threads[1] < PyGpuArray_DIMS(%(x)s)[2]
&& n_threads[0] * n_threads[1] * sizeof(%(acc_dtype)s) <=(15*1024-200))
{
n_threads.y += 1;
n_threads[1] += 1;
}
%(makecall)s
......@@ -1925,32 +1893,30 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
//get as many y threads as we can fit
while (n_threads.x * n_threads.y <= 256)
while (n_threads[0] * n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1])
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
break;
n_threads.y += 1;
n_threads[1] += 1;
}
n_threads.y -= 1;
n_threads[1] -= 1;
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= 256)
while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
{
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
break;
n_threads.z += 1;
n_threads[2] += 1;
}
n_threads.z -= 1;
n_threads[2] -= 1;
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
n_threads[2] = std::min(n_threads[2], (size_t)64);
dim3 n_blocks(1,1,1);
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1960,27 +1926,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[3],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads.x * (n_threads.y+1) <= 256) ++n_threads.y;
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[2])
n_threads.y = PyGpuArray_DIMS(%(x)s)[2];
while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];
while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256) ++n_threads.z;
if (n_threads.z > 64)
n_threads.z = 64;
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
n_threads.z = PyGpuArray_DIMS(%(x)s)[0];
while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256) ++n_threads[2];
if (n_threads[2] > 64)
n_threads[2] = 64;
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[1]);
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [15] # the version corresponding to the c code in this Op
version = [16] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
......@@ -1994,14 +1958,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
else:
return ()
def c_support_code_apply(self, node, nodename):
sio = StringIO()
def gpu_kernels(self, node, nodename):
nd_in = len(self.reduce_mask)
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
load_in = load_w(node.inputs[0].dtype)
write_out = write_w(node.outputs[0].dtype)
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype)
flags=Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
load_in = load_w(in_dtype)
write_out = write_w(out_dtype)
kernels = []
if all(i == 1 for i in self.reduce_mask):
# this kernel is ok for up to a few thousand elements, but
......@@ -2011,16 +1979,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_ccontig"
k_var = "kernel_reduce_ccontig_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_ccontig_%(nodename)s(
const unsigned int d0,
const %(in_dtype)s *A,
%(out_dtype)s * Z)
KERNEL void %(kname)s(
const ga_size d0,
const %(in_type)s *A, const ga_size offset_A,
%(out_type)s *Z, const ga_size offset_Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2034,6 +2007,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp',
gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1,):
# this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
......@@ -2042,16 +2022,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_1"
k_var = "kernel_reduce_1_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_1_%(nodename)s(
const unsigned int d0,
const %(in_dtype)s *A, const int sA0,
%(out_dtype)s * Z)
KERNEL void %(kname)s(
const ga_size d0,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0,
%(out_type)s * Z, const ga_size offset_Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2065,6 +2051,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp',
gpuarray.GpuArray, 'uintp',
'intp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1):
# this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
......@@ -2073,17 +2067,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_11"
k_var = "kernel_reduce_11_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_11_%(nodename)s(
const int d0,
const int d1,
const %(in_dtype)s *A, const int sA0, const int sA1,
%(out_dtype)s * Z)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1,
%(out_type)s * Z, const ga_size offset_Z)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2100,6 +2099,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
#01, 011, 0111
if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and
......@@ -2144,17 +2151,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]', node,
nodename, sub={})
param_dim = ",".join(["const int d%d" % i
param_dim = ",".join(["const ga_size d%d" % i
for i in xrange(nd_in)])
param_strides = ",".join(["const int sA%d" % i
param_strides = ",".join(["const ga_ssize sA%d" % i
for i in xrange(nd_in)])
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals())
reduce_fct = self._assign_reduce(
node, nodename, "myresult",
load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
{}, True)
sio = StringIO()
print("""
%(decl)s{
%(init)s
......@@ -2171,6 +2179,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
# this kernel uses one block for each column,
# threads per block for each element per column.
......@@ -2184,18 +2194,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])")
kname = "kernel_reduce_010"
k_var = "kernel_reduce_010_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_010_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A, const int sA0,
const int sA1, const int sA2,
%(out_dtype)s * Z, const int sZ0, const int sZ1)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0, const ga_ssize sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
extern __shared__ %(acc_type)s buf[];
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2207,7 +2221,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
%(reduce_fct)s;
......@@ -2218,25 +2232,36 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask in [(0, 1, 0), (1, 0), (1, 0, 0)]:
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(X[a * sX0 + b * sX1 + c * sX2])",
{}, True)
reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])")
kname = "kernel_reduce_010_AD"
k_var = "kernel_reduce_010_AD_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_010_AD_%(nodename)s(
const int A,
const int B,
const int C,
const int D,
//const int E, // THIS is 32
const %(in_dtype)s *X, const int sX0,
const int sX1, const int sX2,
%(out_dtype)s * Z, const int sZ0, const int sZ1)
KERNEL void %(kname)s(
const ga_size A, const ga_size B, const ga_size C, const ga_size D,
const %(in_type)s *X, const ga_size offset_X,
const ga_ssize sX0, const ga_ssize sX1, const ga_ssize sX2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0, const ga_ssize sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
%(acc_dtype)s myresult = 0;
%(acc_type)s myresult = 0;
X = (const %(in_type)s *)(((char *)X)+offset_X);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2262,6 +2287,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0):
#
# This kernel is optimized when the inner most dimensions
......@@ -2275,7 +2309,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
# block.x = dim 0
# block.y = dim 1 rest
init = self._k_init(node, nodename)
decl = self._k_decl(node, nodename, pattern="010_inner")
decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
node, nodename,
'blockDim.x')
......@@ -2283,6 +2317,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2307,6 +2342,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 0):
# this kernel uses one block for each column,
# threads per block for each element per column.
......@@ -2319,19 +2356,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])")
kname = "kernel_reduce_110"
k_var = "kernel_reduce_110_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_110_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A, const int sA0,
const int sA1, const int sA2,
%(out_dtype)s * Z, const int sZ0)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2351,15 +2392,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 0):
reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]',
node, nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2378,15 +2429,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]', node,
nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2405,6 +2459,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1):
# this kernel uses one block for each row,
# threads per block for each element per row.
......@@ -2414,18 +2470,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
kname = "kernel_reduce_001"
k_var = "kernel_reduce_001_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_001_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A, const int sA0,
const int sA1, const int sA2,
%(out_dtype)s * Z, const int sZ0, const int sZ1)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0, const ga_ssize sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
extern __shared__ %(acc_type)s buf[];
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2436,7 +2496,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
%(reduce_fct)s;
......@@ -2446,17 +2506,27 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1, 1):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2466,7 +2536,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
......@@ -2479,17 +2549,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0, 1):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]',
node, nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2499,7 +2572,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
......@@ -2512,15 +2585,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]', node, nodename,
sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2540,6 +2616,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 1, 1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
node, nodename, sub={})
......@@ -2547,20 +2625,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
kname = "kernel_reduce_1011"
k_var= "kernel_reduce_1011_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_1011_%(nodename)s(
const unsigned int d0,
const unsigned int d1,
const unsigned int d2,
const unsigned int d3,
const %(in_dtype)s *A, const int sA0, const int sA1,
const int sA2, const int sA3,
%(out_dtype)s * Z, const int sZ0)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2, const ga_ssize sA3,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0)
{
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2580,14 +2661,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
print("""
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
""", file=sio)
return sio.getvalue()
params = [
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
return kernels
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
......@@ -2820,8 +2903,15 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(output)s = tmp;
}
if (%(sync)d)
GpuArray_sync(&%(output)s->ga);
if (%(sync)d) {
err = GpuArray_sync(&%(output)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: GpuCAReduceCPY: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s
}
}
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
......
......@@ -3,6 +3,12 @@ Helper routines for generating gpu kernels for nvcc.
"""
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
......@@ -26,7 +32,7 @@ def nvcc_kernel(name, params, body):
else:
yield b
bodystr = ';\n'.join(flatbody())
return """__global__ void %(name)s (%(paramstr)s)
return """KERNEL void %(name)s (%(paramstr)s)
{
%(bodystr)s;
}
......@@ -167,11 +173,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
We use __i as an int variable in a loop.
"""
ctype = gpuarray.dtype_to_ctype(dtype)
return [
# get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype,
('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
......@@ -181,7 +188,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype,
('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()',
# divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N +
......@@ -259,11 +266,12 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
ctype = gpuarray.dtype_to_ctype(dtype)
return """
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
npy_%(dtype)s red = %(init)s;
%(ctype)s red = %(init)s;
#pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
red = %(loop_line)s;
......@@ -356,6 +364,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
We use tx as an int variable in a loop.
"""
ctype = gpuarray.dtype_to_ctype(dtype)
ret = [
# get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
......@@ -363,7 +372,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
b, stride_b, load_b,
dtype),
'__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype,
('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
threadPos, threadCount,
......@@ -371,7 +380,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
lambda a: "exp(%s - row_max)" % a,
b, stride_b, load_b, dtype),
'__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype,
('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()',
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
]
......
import os
import numpy
from theano import Op, Apply, config
......@@ -12,13 +13,14 @@ except ImportError:
pass
from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host)
host_from_gpu, gpu_from_host,
GpuKernelBase, Kernel)
from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType
from .comp import NVCC_compiler
class GpuImages2Neibs(Images2Neibs, Op):
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
def __init__(self, mode='valid'):
if mode not in ['valid', 'ignore_borders', 'wrap_centered']:
raise NotImplementedError("Only the mode valid, ignore_borders"
......@@ -43,25 +45,42 @@ class GpuImages2Neibs(Images2Neibs, Op):
dtype=ten4.type.dtype)()])
def c_code_cache_version(self):
return (9, 1)
return (10,1)
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype
dtype_z = node.outputs[0].dtype
flags = Kernel.get_flags(dtype_ten4, dtype_z)
type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
type_z = gpuarray.dtype_to_ctype(dtype_z)
mode = self.mode
return """
kernels = []
kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename
code = """
//a version that use less register but don't work in all case.
static __global__ void k_multi_warp_less_%(nodename)s(
KERNEL void %(kname)s(
const int nb_batch,
const int nb_stack,
const int height,
......@@ -72,15 +91,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
const int step_y,
const int grid_c,
const int grid_d,
const int stride0, const int stride1,
const int stride2, const int stride3,
npy_%(dtype_ten4)s * global_ten4,
const int out_s0, const int out_s1,
npy_%(dtype_z)s * global_out
const size_t stride0, const size_t stride1,
const size_t stride2, const size_t stride3,
const %(type_ten4)s * global_ten4, const size_t offset_ten4,
const size_t out_s0, const size_t out_s1,
%(type_z)s * global_out, const size_t offset_out
)
{
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d;
......@@ -131,9 +152,22 @@ class GpuImages2Neibs(Images2Neibs, Op):
}
}
}
}
static __global__ void k_multi_warp_%(nodename)s(
}""" % locals()
params = [
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename
code = """
KERNEL void %(kname)s(
const int nb_batch,
const int nb_stack,
const int height,
......@@ -144,15 +178,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
const int step_y,
const int grid_c,
const int grid_d,
const int stride0, const int stride1,
const int stride2, const int stride3,
npy_%(dtype_ten4)s * global_ten4,
const int out_s0, const int out_s1,
npy_%(dtype_z)s * global_out
const size_t stride0, const size_t stride1,
const size_t stride2, const size_t stride3,
const %(type_ten4)s * global_ten4, const size_t offset_ten4,
const size_t out_s0, const size_t out_s1,
%(type_z)s * global_out, const size_t offset_out
)
{
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d;
......@@ -207,6 +243,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
}
}
""" % locals()
params = [
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
def c_code(self, node, name, inp, out, sub):
dtype_ten4 = node.inputs[0].dtype
......@@ -220,15 +267,21 @@ class GpuImages2Neibs(Images2Neibs, Op):
z, = out
fail = sub['fail']
mode = self.mode
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: *fptr: %%s.",
GpuKernel_error(fptr, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(z)s->ga);" % dict(z=z)
else:
cnda_thread_sync = ""
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return """
#ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
#endif
int grid_c = -1;
int grid_d = -1;
......@@ -281,10 +334,10 @@ class GpuImages2Neibs(Images2Neibs, Op):
PyGpuArray_DIMS(%(ten4)s)[3]);
%(fail)s;
}
grid_c = CEIL_INTDIV(((PyGpuArray_DIMS(%(ten4)s))[2]),
step_x);
grid_d = CEIL_INTDIV(((PyGpuArray_DIMS(%(ten4)s))[3]),
step_y);
grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]),
(size_t)step_x);
grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]),
(size_t)step_y);
}else if ( "%(mode)s" == "valid") {
......@@ -367,75 +420,57 @@ class GpuImages2Neibs(Images2Neibs, Op):
const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(d,c,1);
size_t threads_per_block[3] = {d, c, 1};
//Their is a max of 512 threads per blocks
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;
while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[1]>1)threads_per_block[1]--;
while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[0]>1)threads_per_block[0]--;
//Make bigger block to have better memory access pattern and
//a higher core utilisation. for smaller patch size
while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 &&
n_threads.z<PyGpuArray_DIMS(%(z)s)[0]){
n_threads.z++;
while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 &&
threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){
threads_per_block[2]++;
}
int nb_block;
if (PyGpuArray_DIMS(%(z)s)[0] %% n_threads.z == 0)
nb_block = PyGpuArray_DIMS(%(z)s)[0] / n_threads.z;
if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0)
nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2];
else
nb_block = (PyGpuArray_DIMS(%(z)s)[0] / n_threads.z) + 1;
dim3 n_blocks(std::min(32*1024,nb_block));
int n_shared = 0;
void (*f)(int, int, int ,int,
int, int, int ,int,
int, int,
int, int, int, int,
npy_%(dtype_ten4)s*,
int, int,
npy_%(dtype_z)s*);
if(n_threads.x==d && n_threads.y==c){
f = k_multi_warp_less_%(name)s;
}else{
f = k_multi_warp_%(name)s;
}
nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1;
size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1};
f<<<n_blocks, n_threads, n_shared>>>(
nb_batch,
nb_stack,
height, width,
c, d, step_x, step_y,
grid_c, grid_d,
PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s,
(npy_%(dtype_ten4)s*)(
((char *)cuda_get_ptr(%(ten4)s->ga.data)) +
%(ten4)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s,
(npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset)
);
%(cnda_thread_sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "GpuImages2Neibs:"
" Cuda error: %%s: %%s. (grid: %%i x %%i;"
" block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
n_shared);
%(fail)s;
GpuKernel *fptr;
if(threads_per_block[0]==d && threads_per_block[1]==c){
fptr = &k_multi_warp_less_%(name)s;
}else{
fptr = &k_multi_warp_%(name)s;
}
size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s;
size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s;
size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s;
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s;
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
void *kernel_params[] = {(void *)&nb_batch,
(void *)&nb_stack,
(void *)&height, (void *)&width,
(void *)&c, (void *)&d,
(void *)&step_x, (void *)&step_y,
(void *)&grid_c, (void *)&grid_d,
(void *)&stride_A0,
(void *)&stride_A1,
(void *)&stride_A2,
(void *)&stride_A3,
(void *)%(ten4)s->ga.data,
(void *)&%(ten4)s->ga.offset,
(void *)&stride_Z0,
(void *)&stride_Z1,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset};
int err = GpuKernel_call(fptr, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
} // END NESTED SCOPE
""" % locals()
......
......@@ -10,16 +10,15 @@ try:
except ImportError:
pass
from .basic_ops import as_gpuarray_variable
from .comp import NVCC_compiler
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
inline_softmax,
inline_softmax_fixed_shared)
from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
"""
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
......@@ -41,10 +40,19 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am])
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>']
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
......@@ -54,28 +62,48 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
load_b = load_w(dtype_b)
write_x = write_w(dtype_x)
write_b = write_w(dtype_b)
return """
__global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N,
const npy_%(dtype_x)s* x_data, int xs0, int xs1,
const npy_%(dtype_b)s* b, int bs0,
const npy_%(dtype_y_idx)s* y_idx_data, int y_idxs0,
npy_%(dtype_x)s* nll_data, int nlls0,
npy_%(dtype_x)s* sm_data, int sms0, int sms1,
npy_%(dtype_y_idx)s* am_data, int ams0)
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
type_x = gpuarray.dtype_to_ctype(work_x)
type_b = gpuarray.dtype_to_ctype(work_b)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
kname = "k_xent_sm_1hot_bias"
k_var = "k_xent_sm_1hot_bias_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(const ga_size M, const ga_size N,
const %(type_x)s* x_data, const ga_size offset_x,
const ga_ssize xs0, const ga_ssize xs1,
const %(type_b)s* b, const ga_size offset_b,
const ga_ssize bs0,
const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
const ga_ssize y_idxs0,
%(type_x)s* nll_data, const ga_size offset_nll,
const ga_ssize nlls0,
%(type_x)s* sm_data, const ga_size offset_sm,
const ga_ssize sms0, const ga_ssize sms1,
%(type_y_idx)s* am_data, const ga_size offset_am,
const ga_ssize ams0)
{
x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
b = (const %(type_b)s *)(((char *)b)+offset_b);
y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);
for (int row = blockIdx.x; row < M; row += gridDim.x){
const npy_%(dtype_x)s* x = x_data + xs0 * row;
const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0];
npy_%(dtype_x)s* sm = sm_data + sms0 * row;
const %(type_x)s* x = x_data + xs0 * row;
const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0];
%(type_x)s* sm = sm_data + sms0 * row;
npy_%(work_x)s sum = 0.0;
%(type_x)s sum = 0.0;
int row_max_j = 0;
npy_%(work_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
%(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
for (int j = 1; j < N; ++j)
{
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
%(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
//todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
......@@ -83,16 +111,16 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
//compute the exp
for (int j = 0; j < N; ++j)
{
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
npy_%(work_x)s sm_ij = exp(row_ij - row_max);
%(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
%(type_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij;
sm[j * sms1] = %(write_x)s(sm_ij);
}
npy_%(work_x)s sum_inv = 1.0 / sum;
%(type_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j)
{
npy_%(work_x)s __tmp = %(load_x)s(sm[j * sms1]);
%(type_x)s __tmp = %(load_x)s(sm[j * sms1]);
__tmp *= sum_inv;
sm[j * sms1] = %(write_x)s(__tmp);
}
......@@ -111,12 +139,18 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
am_data[row*ams0] = row_max_j;
}
}
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
""" % locals()
def c_init_code(self):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
def c_code(self, node, nodename, inp, out, sub):
typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
......@@ -138,6 +172,21 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
dtype_am = node.outputs[2].dtype
classname = self.__class__.__name__
fail = sub['fail']
k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
sio = StringIO()
print("""
if (PyGpuArray_NDIM(%(y_idx)s) != 1)
......@@ -219,62 +268,47 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
}
}
{
int n_blocks = PyGpuArray_DIMS(%(x)s)[0] < 256 ? PyGpuArray_DIMS(%(x)s)[0] : 256;
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)256), 1, 1};
size_t threads_per_block[3] = {1, 1, 1};
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_NLL0 = PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_AM0 = PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s;
//TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1;
int n_shared_bytes = 0; //n_threads * sizeof(dtype);
k_xent_sm_1hot_bias_%(nodename)s<<<n_blocks, n_threads, n_shared_bytes>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(npy_%(dtype_nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) +
%(nll)s->ga.offset),
PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(npy_%(dtype_am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) +
%(am)s->ga.offset),
PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s);
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %(classname)s %(nodename)s: %%s.\\n"
"The kernel was launched with %%d threads,"
" %%d blocks and %%d shared memory\\n",
cudaGetErrorString(err),
n_threads, n_blocks, n_shared_bytes);
// no need to decref output vars the cleanup code will do it
%(fail)s;
}
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
(void *)&stride_B0,
(void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
(void *)&stride_YIDX0,
(void *)%(nll)s->ga.data, (void *)&%(nll)s->ga.offset,
(void *)&stride_NLL0,
(void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
(void *)&stride_SM0, (void *)&stride_SM1,
(void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset,
(void *)&stride_AM0};
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}
""" % locals(), file=sio)
return sio.getvalue()
def c_code_cache_version(self):
return (6,)
def c_compiler(self):
return NVCC_compiler
return (7,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(Op):
class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
"""
Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
......@@ -294,13 +328,19 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self):
return (9,)
return (10,)
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
def c_compiler(self):
return NVCC_compiler
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
......@@ -312,20 +352,36 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype
type_intp = gpuarray.dtype_to_ctype(numpy.intp)
dnll, sm, y_idx = inp
dx, = out
fail = sub['fail']
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return """
// Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
const npy_intp %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
PyGpuArray_DIMS(%(dnll)s)[0] :
(npy_intp) 0);
const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
PyGpuArray_DIMS(%(dnll)s)[0] :
(ssize_t) 0);
// Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
// or a vector with just one element.
const npy_intp %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
PyGpuArray_STRIDES(%(dnll)s)[0] :
(npy_intp) 0);
const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
PyGpuArray_STRIDES(%(dnll)s)[0] :
(ssize_t) 0);
if ((PyGpuArray_NDIM(%(dnll)s) > 1)
|| (PyGpuArray_NDIM(%(sm)s) != 2)
......@@ -373,48 +429,33 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
}
}
{
int n_blocks = PyGpuArray_DIMS(%(dx)s)[0] < 256 ? PyGpuArray_DIMS(%(dx)s)[0] : 256;
int n_threads = PyGpuArray_DIMS(%(dx)s)[1] < 256 ? PyGpuArray_DIMS(%(dx)s)[1] : 256;
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<n_blocks, n_threads>>>(
PyGpuArray_DIMS(%(dx)s)[0],
PyGpuArray_DIMS(%(dx)s)[1],
(npy_%(dtype_dnll)s*)(((char *)cuda_get_ptr(%(dnll)s->ga.data)) +
%(dnll)s->ga.offset),
%(dnll)s_strides0 / %(itemsize_dnll)s,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(npy_%(dtype_dx)s*)(((char *)cuda_get_ptr(%(dx)s->ga.data)) +
%(dx)s->ga.offset),
PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s,
PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s
);
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n"
"The kernel was launched with %%d threads and"
" %%d blocks\\n",
"kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
cudaGetErrorString(err), n_threads, n_blocks);
%(fail)s;
}
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1};
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1};
ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s;
ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(dx)s)[0],
(void *)&PyGpuArray_DIMS(%(dx)s)[1],
(void *)%(dnll)s->ga.data, (void *)&%(dnll)s->ga.offset,
(void *)&stride_DNLL0,
(void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
(void *)&stride_SM0, (void *)&stride_SM1,
(void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
(void *)&stride_YIDX0,
(void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset,
(void *)&stride_DX0, (void *)&stride_DX1};
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}
assert(%(dx)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_dnll = node.inputs[0].dtype
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
......@@ -423,18 +464,35 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
load_dnll = load_w(dtype_dnll)
load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx)
return """
__global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
int N, int K,
const npy_%(dtype_dnll)s* dnll, const int dnll_s0,
const npy_%(dtype_sm)s* sm, const int sm_s0, const int sm_s1,
const npy_%(dtype_y_idx)s* y_idx, const int y_idx_s0,
npy_%(dtype_dx)s* dx, const int dx_s0, const int dx_s1)
flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
type_dnll = gpuarray.dtype_to_ctype(work_dnll)
type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
type_dx = gpuarray.dtype_to_ctype(dtype_dx)
kname = "kCrossEntropySoftmax1HotWithBiasDx"
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(
const ga_size N, const ga_size K,
const %(type_dnll)s* dnll, const ga_size offset_dnll,
const ga_ssize dnll_s0,
const %(type_sm)s* sm, const ga_size offset_sm,
const ga_ssize sm_s0, const ga_ssize sm_s1,
const %(type_y_idx)s* y_idx, const ga_size offset_y_idx,
const ga_ssize y_idx_s0,
%(type_dx)s* dx, const ga_size offset_dx,
const ga_ssize dx_s0, const ga_ssize dx_s1)
{
dnll = (const %(type_dnll)s *)(((char *)dnll)+offset_dnll);
sm = (const %(type_sm)s *)(((char *)sm)+offset_sm);
y_idx = (const %(type_y_idx)s *)(((char *)y_idx)+offset_y_idx);
dx = (%(type_dx)s *)(((char *)dx)+offset_dx);
for (int i = blockIdx.x; i < N; i += gridDim.x)
{
npy_%(work_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0];
%(type_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
%(type_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x)
{
......@@ -453,17 +511,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
}
}
}
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
""" % locals()
def c_init_code(self):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax (Op):
class GpuSoftmax (GpuKernelBase, Op):
"""
Implement Softmax on the gpu.
......@@ -482,12 +544,17 @@ class GpuSoftmax (Op):
def c_code_cache_version(self):
return (13,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
......@@ -502,10 +569,21 @@ class GpuSoftmax (Op):
x, = inp
z, = out
fail = sub['fail']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
else:
cnda_thread_sync = ""
sync = ""
return """
if (PyGpuArray_NDIM(%(x)s) != 2)
{
......@@ -528,97 +606,82 @@ class GpuSoftmax (Op):
}
}
{
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t)(32 * 1024));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32 * 1024)), 1, 1};
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512), 1, 1};
size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
const char *fmt_str, *msg;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GA_NO_ERROR;
if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{
//Those numbers are based on not too recent GPU
//to make them compatible with more GPU.
//TODO: read the information from the card.
if(n_shared_bytes < (32 * 1024 - 500)){
kSoftmax_%(nodename)s
<<<
n_blocks,
n_threads,
n_shared_bytes
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
if(shmem_sz < (32 * 1024 - 500)){
err = GpuKernel_call(&kSoftmax_%(nodename)s, 3,
threads_per_block, n_blocks, shmem_sz,
kernel_params);
fmt_str = "gpuarray error: kSoftmax_%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmax_%(nodename)s, err);
}else{
kSoftmax_fixed_shared%(nodename)s
<<<
n_blocks,
n_threads,
n_threads * sizeof(npy_%(work_x)s)
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
}
%(cnda_thread_sync)s
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n Used %%d blocks,"
" %%d threads %%d bytes of shared memory",
"kSoftmax[_fixed_shared]%(nodename)s",
cudaGetErrorString(err),
n_blocks, n_threads, n_shared_bytes);
%(fail)s;
err = GpuKernel_call(&kSoftmax_fixed_shared%(nodename)s, 3,
threads_per_block, n_blocks,
threads_per_block[0] * sizeof(npy_%(work_x)s),
kernel_params);
fmt_str = "gpuarray error: kSoftmax_fixed_shared%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmax_fixed_shared%(nodename)s, err);
}
%(err_check)s
%(sync)s
}
}
assert(%(z)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_sm = node.outputs[0].dtype
load_x = load_w(node.inputs[0].dtype)
load_x = load_w(dtype_x)
write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype)
ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
work_sm = work_dtype(dtype_sm)
flags = Kernel.get_flags(dtype_x, dtype_sm)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_sm = gpuarray.dtype_to_ctype(work_sm)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
kernels = []
kname = "kSoftmax"
k_var= "kSoftmax_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"npy_%(work_sm)s * buf2 = buf + N",
"extern __shared__ %s buf[]" % type_sm,
"%s * buf2 = buf + N" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
......@@ -626,21 +689,29 @@ class GpuSoftmax (Op):
'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmax_fixed_shared"
k_var= "kSoftmax_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"extern __shared__ %s buf[]" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
......@@ -649,12 +720,14 @@ class GpuSoftmax (Op):
"__syncthreads()",
"}",
])
return (ret1 + "\n" + ret2) % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (Op):
class GpuSoftmaxWithBias (GpuKernelBase, Op):
"""
Implement SoftmaxWithBias on the gpu.
......@@ -676,12 +749,19 @@ class GpuSoftmaxWithBias (Op):
def c_code_cache_version(self):
return (12,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
......@@ -698,10 +778,19 @@ class GpuSoftmaxWithBias (Op):
x, b = inp
z, = out
fail = sub['fail']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
else:
cnda_thread_sync = ""
sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
return """
if (PyGpuArray_NDIM(%(x)s) != 2)
{
......@@ -739,82 +828,51 @@ class GpuSoftmaxWithBias (Op):
}
}
{
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1};
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512), 1, 1};
size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
const char *fmt_str, *msg;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
(void *)&stride_B0,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GA_NO_ERROR;
if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{
if(n_shared_bytes < (32 * 1024 - 500)){
kSoftmaxWithBias_%(nodename)s
<<<
n_blocks,
n_threads,
n_shared_bytes
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
if(shmem_sz < (32 * 1024 - 500)){
err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3,
threads_per_block, n_blocks, shmem_sz,
kernel_params);
fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err);
}else{
kSoftmaxWithBias_fixed_shared%(nodename)s
<<<
n_blocks,
n_threads,
n_threads * sizeof(npy_%(work_x)s)
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(
((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s,
3, threads_per_block, n_blocks,
threads_per_block[0] * sizeof(npy_%(work_x)s),
kernel_params);
fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err);
}
%(cnda_thread_sync)s
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"kSoftmaxWithBias_%(nodename)s",
cudaGetErrorString(err));
%(fail)s;
}
%(err_check)s
%(sync)s
}
}
assert(%(z)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_sm = node.outputs[0].dtype
......@@ -822,55 +880,80 @@ class GpuSoftmaxWithBias (Op):
load_b = load_w(node.inputs[1].dtype)
write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype)
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'const npy_%(dtype_b)s * b', 'const int sb0',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_b = gpuarray.dtype_to_ctype(dtype_b)
type_sm = gpuarray.dtype_to_ctype(work_sm)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
kernels = []
kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"npy_%(work_sm)s * buf2 = buf + N",
"extern __shared__ %s buf[]" % type_sm,
"%s * buf2 = buf + N" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])",
"buf[tx] += %(load_b)s(b[tx * sb0])",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf[tx] += %s(b[tx * sb0])" % load_b,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x',
'const int sx0', 'const int sx1',
'const npy_%(dtype_b)s * b', 'const int sb0',
'npy_%(dtype_sm)s * sm',
'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf',
'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1',
write_sm,
'threadIdx.x',
'blockDim.x',
'b', 'sb0', load_b,
work_sm),
"__syncthreads()",
"}",
])
return (ret1 + "\n" + ret2) % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmaxWithBias_fixed_shared"
k_var = "kSoftmaxWithBias_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ %s buf[]" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
'b', 'sb0', load_b, work_sm),
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
gpu_softmax_with_bias = GpuSoftmaxWithBias()
......@@ -3,7 +3,7 @@ import copy
import numpy
import theano
from theano import tensor, gof, Op
from theano import tensor, gof, Op, config
from six.moves import StringIO
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace
......@@ -15,7 +15,7 @@ except ImportError:
pass
from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable, HideC
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel)
from .elemwise import GpuElemwise
from .comp import NVCC_compiler
......@@ -159,7 +159,7 @@ class GpuSubtensor(HideC, Subtensor):
return (6,)
class GpuIncSubtensor(IncSubtensor):
class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
"""
Implement IncSubtensor on the gpu.
......@@ -177,6 +177,14 @@ class GpuIncSubtensor(IncSubtensor):
def _f16_ok(self):
return self.iadd_node.op._f16_ok
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self):
return self.iadd_node.op.c_headers()
......@@ -186,6 +194,10 @@ class GpuIncSubtensor(IncSubtensor):
def c_init_code(self):
return self.iadd_node.op.c_init_code()
def gpu_kernels(self, node, nodename):
subname = nodename + "_add_to_zview"
return self.iadd_node.op.gpu_kernels(self.iadd_node, subname)
def make_node(self, x, y, *inputs):
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
......@@ -486,7 +498,7 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
k(x[i], reshaped_y, broadcast=True)
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
"""
Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent.
......@@ -525,16 +537,25 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self):
return (4,)
return (5,)
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self):
return NVCC_compiler
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
import os
return [os.path.join(cuda_root, 'include')]
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_code(self, node, name, inputs, outputs, sub):
......@@ -569,7 +590,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
}
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
......@@ -578,7 +599,14 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize
return """
flags=Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind)
type_out = gpuarray.dtype_to_ctype(dtype_out)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """
/*
* This is a version of atomicAdd that works for half-floats. It may
......@@ -587,37 +615,43 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
* will not be modified.
*/
__device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
npy_uint32 *base = (npy_uint32 *)((size_t)addr & ~2);
npy_uint32 old, assumed, sum, new_;
__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((size_t)addr & 2) ? 0x5410 : 0x3254);
__half2float((ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410);
return (ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
}
__global__ void k_vector_add_fast(int numRowsX,
int numColsX,
int stridesX0,
int stridesX1,
npy_%(dtype_x)s *X,
int numRowsY,
int numColsY,
int stridesY0,
int stridesY1,
npy_%(dtype_y)s *Y,
int numIndices,
int stridesIndices,
npy_%(dtype_ind)s *indices_arr)
KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX,
const ga_ssize stridesX0,
const ga_ssize stridesX1,
%(type_x)s *X,
const ga_size offset_X,
const ga_size numRowsY,
const ga_size numColsY,
const ga_ssize stridesY0,
const ga_ssize stridesY1,
%(type_y)s *Y,
const ga_size offset_Y,
const ga_size numIndices,
const ga_ssize stridesIndices,
%(type_ind)s *indices_arr,
const ga_size offset_indices_arr)
{
X = (%(type_x)s *)(((char *)X)+offset_X);
Y = (%(type_y)s *)(((char *)Y)+offset_Y);
indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr);
for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
{
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
......@@ -631,41 +665,71 @@ __device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
}
return;
}
""" % locals()
params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'intp', gpuarray.GpuArray, 'uintp'
]
return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)]
def c_support_code_apply(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = numpy.dtype(dtype_x).itemsize
itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
void GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr)
{
int num_threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1],
(size_t)256);
int num_blocks = std::min(PyGpuArray_SIZE(indices_arr),
(size_t)4096);
dim3 n_blocks(num_blocks);
dim3 n_threads(num_threads_per_block);
k_vector_add_fast<<<n_blocks, n_threads>>>(
PyGpuArray_DIM(py_self, 0),
PyGpuArray_DIM(py_self, 1),
PyGpuArray_STRIDE(py_self, 0) / %(itemsize_x)s,
PyGpuArray_STRIDE(py_self, 1) / %(itemsize_x)s,
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(py_self->ga.data)) +
py_self->ga.offset),
PyGpuArray_DIM(py_other, 0),
PyGpuArray_DIM(py_other, 1),
PyGpuArray_DIM(py_other, 0) == 1 ? 0 : PyGpuArray_STRIDE(py_other, 0) / %(itemsize_y)s,
PyGpuArray_DIM(py_other, 1) == 1 ? 0 : PyGpuArray_STRIDE(py_other, 1) / %(itemsize_y)s,
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(py_other->ga.data)) +
py_other->ga.offset),
PyGpuArray_DIMS(indices_arr)[0],
PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s,
(npy_%(dtype_ind)s*)(
((char *)cuda_get_ptr(indices_arr->ga.data)) +
indices_arr->ga.offset)
);
return;
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s;
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s;
void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
(void *)&PyGpuArray_DIMS(py_self)[1],
(void *)&stride_X0,
(void *)&stride_X1,
(void *)py_self->ga.data,
(void *)&py_self->ga.offset,
(void *)&PyGpuArray_DIMS(py_other)[0],
(void *)&PyGpuArray_DIMS(py_other)[1],
(void *)&stride_Y0,
(void *)&stride_Y1,
(void *)py_other->ga.data,
(void *)&py_other->ga.offset,
(void *)&PyGpuArray_DIMS(indices_arr)[0],
(void *)&stride_ind,
(void *)indices_arr->ga.data,
(void *)&indices_arr->ga.offset};
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}
}
""" % locals()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论