Commit 5ecbbde2, authored by abergeron

Merge pull request #3364 from seanprime7/drvapi

Use the libgpuarray APIs to manage GPU code compilation, execution, etc.
......@@ -144,6 +144,15 @@ class GpuKernelBase(object):
def _generate_kernel_vars(self, k):
return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
def c_support_code(self):
    # Returns C support code shared by all generated GPU kernels: a
    # templated ceil_intdiv helper (ceiling integer division, a/b rounded
    # up) used when computing launch/block dimensions.
    # NOTE(review): the string contents are emitted verbatim into the
    # generated C file, so their exact bytes (including leading
    # whitespace as shown) must be preserved.
    return """
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
"""
def c_support_code_apply(self, node, name):
kernels = self.gpu_kernels(node, name)
bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
......
......@@ -10,12 +10,6 @@ PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
const size_t subsample_cols,
const int version, const int verbose);
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
/*
* version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation
......@@ -108,6 +102,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
const int stack_len = PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0];
......@@ -126,6 +121,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
......@@ -156,16 +155,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
const float * kern_data_unflipped = cuda_get_ptr(kern);
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
//if we remove the restriction
......@@ -195,46 +188,47 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int);
#define CONV_PATCH_SPECIAL(kern_wid) \
if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\
else f=conv_patch<true,kern_wid,true>;
CONV_PATCH_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(threads_per_block[1]==out_len) k=&conv_patch_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_patch_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n",
threads.y==out_len ? "no split": "split", nb_split);
threads_per_block[1]==out_len ? "no split": "split", nb_split);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, nb_split);
"threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i, nb_split=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1], nb_split);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -253,77 +247,77 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)ceil_intdiv(out_len,nb_split),
(size_t)1};
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false;
dim3 grid(nbatch,nkern);
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid,
out_len, out_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
GpuKernel *k = NULL;
if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_64_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_65_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_66_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_67_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_68_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_69_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_70_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_71_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_72_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_73_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_74_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_75_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_76_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_77_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_78_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_79_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_80_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_81_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_82_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_83_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_84_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_85_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_86_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_87_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_88_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_89_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_90_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_91_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_92_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_93_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_94_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_95_node_<<<<HASH_PLACEHOLDER>>>>_0;}
void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
......@@ -342,15 +336,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
......@@ -359,7 +353,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -371,30 +365,28 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
!work_complete) //conv_rows
{
dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\
CONV_ROWS_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)1, (size_t)1};
size_t n_blocks[3] = {(size_t)out_len, (size_t)nbatch*nkern, (size_t)1};
size_t shmem_sz = (kern_len*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_rows_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
......@@ -404,15 +396,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
if (!subsample && out_contiguous &&
......@@ -430,52 +422,50 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_row=i;
}
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
if (0)
fprintf(stderr,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
img_contiguous_2d, kern_contiguous_2d,
threads.x, threads.y, threads.z,
grid.x, grid.y, grid.z);
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], n_blocks[2]);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n");
f = conv_rows_stack<THEANO_KERN_WID, false>;
k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
} else {
//fprintf(stderr, "using true version\n");
f = conv_rows_stack<THEANO_KERN_WID, true>;
k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
}
f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
}
......@@ -483,15 +473,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -524,45 +514,41 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--;
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
int shared_size=(threads.y*img_wid + k_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
else f = conv_rows_stack2<kern_wid, true, false>;
CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) k=&conv_rows_stack2_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==9) k=&conv_rows_stack2_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_stack2_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_rows_stack2_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with"
......@@ -574,15 +560,15 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,(version==9?2:3));
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i version=%d\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],(version==9?2:3));
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -629,18 +615,18 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
nb_split++;
// tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
size_t thread_z=ceil_intdiv(kern_len,nb_split);
size_t shmem_sz = sizeof(float)*(full_kern
? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max((size_t)img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
while ((shmem_sz >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more.
nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split);
shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
shmem_sz = sizeof(float)*(full_kern
? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
}
if (nb_split <= kern_len)
......@@ -648,57 +634,59 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert(thread_z>0);//should not happen, but in case...
if(!full_kern) assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)out_len,
(size_t)thread_z};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
GpuKernel *k = NULL;
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
/* if(!kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
/*else*/ if(!kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(!kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_4_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(!kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_7_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_8_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_11_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_12_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_15_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col,
(void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i, "
"grid.x=%i, grid.y=%i, shared_size=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i, "
"n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version"
......@@ -711,17 +699,17 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z);
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
} // else no good nb_splits was found
}
......@@ -730,8 +718,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
kern_len<=320 &&
!work_complete) //conv_valid_row_reduce
{
int outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096);
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
int block_nstack=nstack;
//Max of 512 threads per blocks.
......@@ -739,9 +728,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--;
dim3 n_threads(block_nstack, kern_len, 1);
size_t threads_per_block[3] = {(size_t)block_nstack, (size_t)kern_len, (size_t)1};
int n_reduce_buf = block_nstack * kern_len * sizeof(float);
size_t n_reduce_buf = block_nstack * kern_len * sizeof(float);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
*
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
......@@ -758,39 +747,34 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
assert (initial_reduce_boundary < n_reduce_buf/sizeof(float));
}
void (*f)(int, int, int, int,
int, int, int, int, int,
const float*, int, int, int, int,
const float*, int, int, int, int,
float*, int, int, int, int,
int, int, int);
GpuKernel *k = NULL;
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack)
f=conv_valid_row_reduce<false>;
k=&conv_valid_row_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else
f=conv_valid_row_reduce<true>;
f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, PyGpuArray_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
cuda_get_ptr(img),
PyGpuArray_STRIDES(img)[0]/4, PyGpuArray_STRIDES(img)[1]/4,
img_stride_row, img_stride_col,
cuda_get_ptr(kern),
PyGpuArray_STRIDES(kern)[0]/4, PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[2]/4, PyGpuArray_STRIDES(kern)[3]/4,
cuda_get_ptr(out),
PyGpuArray_STRIDES(out)[0]/4, PyGpuArray_STRIDES(out)[1]/4,
PyGpuArray_STRIDES(out)[2]/4, PyGpuArray_STRIDES(out)[3]/4,
subsample_rows, subsample_cols, initial_reduce_boundary);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
k=&conv_valid_row_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols,
(void *)&initial_reduce_boundary};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, n_reduce_buf, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
......@@ -800,24 +784,27 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads.x, n_threads.y, n_blocks,
n_reduce_buf, n_threads.x * n_threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0],
n_reduce_buf, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
if (1 && !work_complete) //conv_reference_valid
{
int outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
256);
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (1)
{
if (verbose)
......@@ -825,61 +812,56 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (verbose>1)
fprintf(stderr, " img : %i %llu %i %i %p "
"%lld %lld %lld %lld\n",
nbatch, (unsigned long long)PyGpuArray_DIMS(img)[1],
img_len, img_wid,
cuda_get_ptr(img),
(long long)PyGpuArray_STRIDES(img)[0]/4,
(long long)PyGpuArray_STRIDES(img)[1]/4,
(long long)PyGpuArray_STRIDES(img)[2]/4,
(long long)PyGpuArray_STRIDES(img)[3]/4);
nbatch, (unsigned long long)stack_len, img_len, img_wid,
(void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p "
"%lld %lld %lld %lld\n",
nkern, nstack, kern_len, kern_wid,
cuda_get_ptr(kern),
(long long)PyGpuArray_STRIDES(kern)[0]/4,
(long long)PyGpuArray_STRIDES(kern)[1]/4,
(long long)PyGpuArray_STRIDES(kern)[2]/4,
(long long)PyGpuArray_STRIDES(kern)[3]/4);
(void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
if (verbose>1)
fprintf(stderr, " out : %llu %llu %i %i %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
out_len, out_wid,
cuda_get_ptr(out),
(long long)PyGpuArray_STRIDES(out)[0]/4,
(long long)PyGpuArray_STRIDES(out)[1]/4,
(long long)PyGpuArray_STRIDES(out)[2]/4,
(long long)PyGpuArray_STRIDES(out)[3]/4);
(void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
outsize, n_blocks[0], threads_per_block[0]);
}
conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
PyGpuArray_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
cuda_get_ptr(img),
PyGpuArray_STRIDES(img)[0]/4,
PyGpuArray_STRIDES(img)[1]/4,
PyGpuArray_STRIDES(img)[2]/4,
PyGpuArray_STRIDES(img)[3]/4,
cuda_get_ptr(kern),
PyGpuArray_STRIDES(kern)[0]/4,
PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[2]/4,
PyGpuArray_STRIDES(kern)[3]/4,
cuda_get_ptr(out),
PyGpuArray_STRIDES(out)[0]/4,
PyGpuArray_STRIDES(out)[1]/4,
PyGpuArray_STRIDES(out)[2]/4,
PyGpuArray_STRIDES(out)[3]/4,
subsample_rows, subsample_cols);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, 0, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
......@@ -892,7 +874,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" PyGpuArray_conv_valid! (%s)",
cudaGetErrorString(sts));
GpuKernel_error(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1;
}
}
......@@ -941,6 +923,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]);
assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]);
const int stack_len=PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0];
......@@ -959,6 +942,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
......@@ -1001,16 +988,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
const float * kern_data_unflipped = cuda_get_ptr(kern);
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(cuda_get_ptr(kern)[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
if (verbose>1)
......@@ -1019,34 +1000,34 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
printf("INFO: img dim: %llu %llu %llu %llu "
"img stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0],
(unsigned long long)PyGpuArray_DIMS(img)[1],
(unsigned long long)PyGpuArray_DIMS(img)[2],
(unsigned long long)PyGpuArray_DIMS(img)[3],
(long long)PyGpuArray_STRIDES(img)[0]/4,
(long long)PyGpuArray_STRIDES(img)[1]/4,
(long long)PyGpuArray_STRIDES(img)[2]/4,
(long long)PyGpuArray_STRIDES(img)[3]/4);
(unsigned long long)nbatch,
(unsigned long long)stack_len,
(unsigned long long)img_len,
(unsigned long long)img_wid,
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
printf("INFO: kern dim: %llu %llu %llu %llu "
"kern stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0],
(unsigned long long)PyGpuArray_DIMS(kern)[1],
(unsigned long long)PyGpuArray_DIMS(kern)[2],
(unsigned long long)PyGpuArray_DIMS(kern)[3],
(long long)PyGpuArray_STRIDES(kern)[0]/4,
(long long)PyGpuArray_STRIDES(kern)[1]/4,
(long long)PyGpuArray_STRIDES(kern)[2]/4,
(long long)PyGpuArray_STRIDES(kern)[3]/4);
(unsigned long long)nkern,
(unsigned long long)nstack,
(unsigned long long)kern_len,
(unsigned long long)kern_wid,
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
printf("INFO: out dim: %llu %llu %llu %llu "
"out stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2],
(unsigned long long)PyGpuArray_DIMS(out)[3],
(long long)PyGpuArray_STRIDES(out)[0]/4,
(long long)PyGpuArray_STRIDES(out)[1]/4,
(long long)PyGpuArray_STRIDES(out)[2]/4,
(long long)PyGpuArray_STRIDES(out)[3]/4);
(unsigned long long)out_len,
(unsigned long long)out_wid,
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
}
if (!subsample &&
......@@ -1093,53 +1074,53 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
assert(version!=5 || kern_len>1);
assert(version!=-1);
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch,nkern);
size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
int shared_size=img_size_padded_byte + kern_size_byte;
size_t shmem_sz=img_size_padded_byte + kern_size_byte;
if(version==5)
shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \
if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
shmem_sz=((kern_len+threads_per_block[1]-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
GpuKernel *k = NULL;
if(version==3) k=&conv_full_patch_stack_padded_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5) k=&conv_full_patch_stack_padded_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4) k=&conv_full_patch_stack_padded_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) k=&conv_full_patch_stack_padded_4_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) k=&conv_full_patch_stack_padded_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) k=&conv_full_patch_stack_padded_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==3 && kern_flipped) k=&conv_full_patch_stack_padded_8_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5 && kern_flipped)k=&conv_full_patch_stack_padded_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4 && kern_flipped)k=&conv_full_patch_stack_padded_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) k=&conv_full_patch_stack_padded_12_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) k=&conv_full_patch_stack_padded_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) k=&conv_full_patch_stack_padded_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else assert(false);
CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
......@@ -1152,12 +1133,12 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
......@@ -1165,7 +1146,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
" failed (%s), trying next implementation\n",
version==3?"no split": "split",
(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
......@@ -1176,21 +1157,22 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
conv_full_patch<<< grid, threads, shared_size>>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n");
work_complete = true;
......@@ -1199,15 +1181,15 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
}
}
if (false && !subsample && //disabled as test fail for this kernel
......@@ -1217,37 +1199,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch);
int shared_size=(img_size + kern_size)*nstack*sizeof(float);
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)1, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//typeof(conv_full_load_everything<0>) f = ;
void (*f)(const float*, const float*, float*,
int, int, int, int, int, int,
int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
f = conv_full_load_everything<THEANO_KERN_WID>;
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,
PyGpuArray_STRIDES(img)[3]/4,
PyGpuArray_STRIDES(img)[2]/4,
PyGpuArray_STRIDES(img)[1]/4,
PyGpuArray_STRIDES(img)[0]/4,
PyGpuArray_STRIDES(kern)[3]/4,
PyGpuArray_STRIDES(kern)[2]/4,
PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[0]/4
);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n");
work_complete = true;
......@@ -1256,14 +1227,14 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
}
}
......@@ -1275,32 +1246,29 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(const float*, const float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int);
if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\
else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<false,true>;\
else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>;
f<<< grid, threads, shared_size>>>(
cuda_get_ptr(img),
cuda_get_ptr(kern),
cuda_get_ptr(out),
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,img_stride_col, img_stride_row,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(!img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
......@@ -1310,23 +1278,26 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(k, err));
}
}
if (1 && !work_complete) //conv_reference_full
{
if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");
int outsize = PyGpuArray_SIZE(out);
int n_blocks = std::min(outsize, 4096);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
256);
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (0)
{
if (verbose)
......@@ -1334,70 +1305,67 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
if (verbose)
fprintf(stderr, " img : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0],
(unsigned long long)PyGpuArray_DIMS(img)[1],
(unsigned long long)PyGpuArray_DIMS(img)[2],
(unsigned long long)PyGpuArray_DIMS(img)[3],
cuda_get_ptr(img),
(long long)PyGpuArray_STRIDES(img)[0]/4,
(long long)PyGpuArray_STRIDES(img)[1]/4,
(long long)PyGpuArray_STRIDES(img)[2]/4,
(long long)PyGpuArray_STRIDES(img)[3]/4);
(unsigned long long)nbatch,
(unsigned long long)stack_len,
(unsigned long long)img_len,
(unsigned long long)img_wid,
(void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
if (verbose)
fprintf(stderr, " kern: %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0],
(unsigned long long)PyGpuArray_DIMS(kern)[1],
(unsigned long long)PyGpuArray_DIMS(kern)[2],
(unsigned long long)PyGpuArray_DIMS(kern)[3],
cuda_get_ptr(kern),
(long long)PyGpuArray_STRIDES(kern)[0]/4,
(long long)PyGpuArray_STRIDES(kern)[1]/4,
(long long)PyGpuArray_STRIDES(kern)[2]/4,
(long long)PyGpuArray_STRIDES(kern)[3]/4
);
(unsigned long long)nkern,
(unsigned long long)nstack,
(unsigned long long)kern_len,
(unsigned long long)kern_wid,
(void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
if (verbose)
fprintf(stderr, " out : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2],
(unsigned long long)PyGpuArray_DIMS(out)[3],
cuda_get_ptr(out),
(long long)PyGpuArray_STRIDES(out)[0]/4,
(long long)PyGpuArray_STRIDES(out)[1]/4,
(long long)PyGpuArray_STRIDES(out)[2]/4,
(long long)PyGpuArray_STRIDES(out)[3]/4);
(unsigned long long)out_len,
(unsigned long long)out_wid,
(void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
if (verbose)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
outsize, n_blocks[0], threads_per_block[0]);
if (verbose)
fprintf(stderr, " subsample params: %llu %llu\n",
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
}
conv_reference_full<<<n_blocks, n_threads>>>(
PyGpuArray_DIMS(img)[0], PyGpuArray_DIMS(kern)[0],
PyGpuArray_DIMS(img)[1],
PyGpuArray_DIMS(img)[2], PyGpuArray_DIMS(img)[3],
PyGpuArray_DIMS(kern)[2], PyGpuArray_DIMS(kern)[3],
PyGpuArray_DIMS(out)[2], PyGpuArray_DIMS(out)[3],
cuda_get_ptr(img), PyGpuArray_STRIDES(img)[0]/4,
PyGpuArray_STRIDES(img)[1]/4,
PyGpuArray_STRIDES(img)[2]/4,
PyGpuArray_STRIDES(img)[3]/4,
cuda_get_ptr(kern), PyGpuArray_STRIDES(kern)[0]/4,
PyGpuArray_STRIDES(kern)[1]/4,
PyGpuArray_STRIDES(kern)[2]/4,
PyGpuArray_STRIDES(kern)[3]/4,
cuda_get_ptr(out), PyGpuArray_STRIDES(out)[0]/4,
PyGpuArray_STRIDES(out)[1]/4,
PyGpuArray_STRIDES(out)[2]/4,
PyGpuArray_STRIDES(out)[3]/4,
subsample_rows, subsample_cols);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, 0, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_full' version"
......@@ -1410,17 +1378,18 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
else
{
if (verbose)
fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads, 1, n_blocks, 1, 0, n_threads);
fprintf(stderr, "threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], 1, n_blocks[0], 1, 0, threads_per_block[0]);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts));
GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1;
}
}
......
......@@ -3,13 +3,20 @@ import os
import theano
from theano import config, gof
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from six.moves import reduce
from .comp import NVCC_compiler
from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from theano.gof import utils
class GpuConv(gof.Op):
class GpuConv(GpuKernelBase, gof.Op):
"""
Implement the batched and stacked 2d convolution on the gpu.
......@@ -223,29 +230,29 @@ class GpuConv(gof.Op):
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
def c_headers(self):
return ['<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>']
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['<stdint.h>', '<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 21)
def c_init_code(self):
return ['cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
# these files
files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']
codes = ["CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);",
"float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}",
"const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) (cuda_get_ptr_raw(o->ga.data) + o->ga.offset);}"]
codes += [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files]
return reduce(str.__add__, codes)
def c_compiler(self):
return NVCC_compiler
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp
......@@ -270,8 +277,8 @@ class GpuConv(gof.Op):
//Optional args
int version = %(version)s;
int verbose = %(verbose)s;
int dx = %(dx)s;
int dy = %(dy)s;
size_t dx = %(dx)s;
size_t dy = %(dy)s;
int mode;
if (strcmp(mode_str, "full") == 0)
......@@ -286,7 +293,7 @@ class GpuConv(gof.Op):
{
PyErr_SetString(PyExc_ValueError,
"mode must be one of 'full' or 'valid'");
return NULL;
return 0;
}
// TODO, make out be decref before we alloc out2!
......@@ -303,3 +310,261 @@ class GpuConv(gof.Op):
%(fail)s
}
""" % sub
def c_support_code_apply(self, node, name):
nb = 0
if self.kshp is not None:
nb = self.kshp[1]
kernels = self.gpu_kernels(node, name)
k = kernels[0]
code = """
#define THEANO_KERN_WID %(nb)d
""" % locals()
code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
kname = "conv_full_load_everything"
gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin)
code = code.replace('\\', '\\\\')
code = code.replace('"', '\\"')
code = code.replace('\n', '\\n')
mod = """
static const char conv_bcode[] = {%(bcode)s};
static const char *conv_code = "%(code)s";
""" % locals()
for k in kernels:
mod += "static GpuKernel " + k.name + '_' + name + ";\n"
mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read()
return mod
@utils.memoize
def gpu_kernels(self, node, name):
dtypes = [i.dtype for i in node.inputs]
dtypes.extend([o.dtype for o in node.outputs])
flags = Kernel.get_flags(*dtypes)
kernels = self.conv_patch_kernels(name, flags)
kernels.extend(self.conv_patch_stack_kernels(name, flags))
kernels.extend(self.conv_patch_stack_reduce_kernels(name, flags))
kernels.extend(self.conv_rows_kernels(name, flags))
kernels.extend(self.conv_rows_stack_kernels(name, flags))
kernels.extend(self.conv_rows_stack2_kernels(name, flags))
kernels.extend(self.conv_valid_row_reduce_kernels(name, flags))
kernels.extend(self.conv_reference_valid_kernels(name, flags))
kernels.extend(self.conv_reference_full_kernels(name, flags))
kernels.extend(self.conv_full_patch_kernels(name, flags))
kernels.extend(self.conv_full_patch_stack_kernels(name, flags))
kernels.extend(self.conv_full_patch_stack_padded_kernels(name, flags))
kernels.extend(self.conv_full_load_everything_kernels(name, flags))
return kernels
def conv_patch_kernels(self, name, flags):
kname = "conv_patch_%d"
k_var = "conv_patch_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [2, 3]
]
def conv_patch_stack_kernels(self, name, flags):
kname = "conv_patch_stack_%d"
k_var = "conv_patch_stack_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in range(64, 96)
]
def conv_patch_stack_reduce_kernels(self, name, flags):
kname = "conv_patch_stack_reduce_%d"
k_var = "conv_patch_stack_reduce_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15]
]
def conv_rows_kernels(self, name, flags):
kname = "conv_rows_%d"
k_var = "conv_rows_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1]
]
def conv_rows_stack_kernels(self, name, flags):
kname = "conv_rows_stack_%d"
k_var = "conv_rows_stack_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1]
]
def conv_rows_stack2_kernels(self, name, flags):
kname = "conv_rows_stack2_%d"
k_var = "conv_rows_stack2_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1, 2, 3]
]
def conv_valid_row_reduce_kernels(self, name, flags):
kname = "conv_valid_row_reduce_%d"
k_var = "conv_valid_row_reduce_%d_" + name
params = [
'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1]
]
def conv_reference_valid_kernels(self, name, flags):
kname = "conv_reference_valid"
k_var = "conv_reference_valid_" + name
params = [
'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
def conv_reference_full_kernels(self, name, flags):
kname = "conv_reference_full"
k_var = "conv_reference_full_" + name
params = [
'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
def conv_full_patch_kernels(self, name, flags):
kname = "conv_full_patch"
k_var = "conv_full_patch_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
def conv_full_patch_stack_kernels(self, name, flags):
kname = "conv_full_patch_stack_%d"
k_var = "conv_full_patch_stack_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1, 2, 3]
]
def conv_full_patch_stack_padded_kernels(self, name, flags):
kname = "conv_full_patch_stack_padded_%d"
k_var = "conv_full_patch_stack_padded_%d_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname % i, flags,
'conv_code', 'conv_bcode', k_var % i)
for i in [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14]
]
def conv_full_load_everything_kernels(self, name, flags):
kname = "conv_full_load_everything"
k_var = "conv_full_load_everything_" + name
params = [
gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp',
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc'
]
return [
Kernel(None, params, kname, flags,
'conv_code', 'conv_bcode', k_var)
]
extern __shared__ float s_data[];
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len/nb_split
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch_split(const float* img, const float* kern, float* out,
extern "C" __global__ void
conv_full_patch_split(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
// Thread index
......@@ -60,18 +67,23 @@ conv_full_patch_split(const float* img, const float* kern, float* out,
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch( const float* img, const float* kern, float* out,
extern "C" __global__ void
conv_full_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
// Thread index
......@@ -114,6 +126,8 @@ conv_full_patch( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len
......@@ -122,8 +136,10 @@ conv_full_patch( const float* img, const float* kern, float* out,
//template c_contiguous: if true, the img and kern are column and row contiguous; else we use the stride values from the params. The image needs to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void
conv_full_patch_stack( const float* img, const float* kern, float* out,
__device__ inline void
conv_full_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -131,12 +147,15 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
const float __shared__ *kern_, *img_;
extern __shared__ float s_data[];
const int batch_id = blockIdx.x;
const int nkern_id = blockIdx.y;
......@@ -182,6 +201,36 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
/* C-linkage entry points for the conv_full_patch_stack template.  The
 * host code looks kernels up by name, so each template instantiation is
 * wrapped in an extern "C" __global__ function with a stable, unmangled
 * name.  The numeric suffix encodes the template arguments
 * <img_c_contiguous_2d, kern_c_contiguous_2d> as
 * suffix = img_c_contiguous_2d*2 + kern_c_contiguous_2d.
 * NOTE: no comments inside the macro body — a '//' before a line
 * continuation would swallow the backslash. */
#define __INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \
conv_full_patch_stack_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, \
int kern_len, int kern_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_full_patch_stack<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_FULL_PATCH_STACK(0, false, false) // img strided,    kern strided
__INSTANTIATE_CONV_FULL_PATCH_STACK(1, false, true) // img strided,    kern contiguous
__INSTANTIATE_CONV_FULL_PATCH_STACK(2, true, false) // img contiguous, kern strided
__INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true) // img contiguous, kern contiguous
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
 * I keep it separated from conv_patch as we take 19-20 registers, which is more than the 10/16 max for each thread, and thus this could lower the occupancy.
......@@ -200,24 +249,36 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
* template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
* if you set split and low_mem to true, we will use the low_mem version!
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void
conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__device__ inline void
conv_full_patch_stack_padded( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
int kern_stride_col, int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
__shared__ int batch_id, kern_id, img_wid_valid, nb_rows;
batch_id = blockIdx.x;
kern_id = blockIdx.y;
......@@ -257,7 +318,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -292,7 +353,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -340,7 +401,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
if(out_row<out_len)
......@@ -351,6 +412,46 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
}
}
extern "C" {
/* C-linkage entry points for the conv_full_patch_stack_padded template,
 * so each instantiation has a stable, unmangled name the host code can
 * reference.  The numeric suffix is a bit mask of the template
 * arguments <flipped_kern, c_contiguous, split, low_mem>:
 * suffix = flipped_kern*8 + c_contiguous*4 + split*2 + low_mem.
 * Suffixes 3, 7, 11 and 15 (split && low_mem) are intentionally not
 * instantiated: as documented on the template, the low_mem variant is
 * used when both are requested. */
#define __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \
conv_full_patch_stack_padded_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
const int img_stride_col, const int img_stride_row, \
const int img_stride_stack, const int img_stride_batch, \
const int kern_stride_col, const int kern_stride_row, \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_full_patch_stack_padded<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(0, false, false, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(1, false, false, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(2, false, false, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(4, false, true, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(5, false, true, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(6, false, true, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(8, true, false, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(9, true, false, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(10, true, false, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(12, true, true, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(13, true, true, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ;
......@@ -364,9 +465,10 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
{
return x[0] * y[0];
}
template<int NSTACK>
__global__ void
conv_full_load_everything( const float* img, const float* kern, float* out,
extern "C" __global__ void
conv_full_load_everything( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -375,12 +477,15 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
const int out_col = threadIdx.x;//output col
......@@ -423,9 +528,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
{
int icol = out_col - kern_wid+1+col;
if (icol < 0 || icol > img_wid) continue;
if (NSTACK > 0)
if (THEANO_KERN_WID > 0)
{
sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid,
sum += everything_dot<THEANO_KERN_WID>(d_img + irow*img_wid + icol, img_len*img_wid,
d_kern + row*kern_wid+col, kern_len*kern_wid);
}
else
......@@ -443,6 +548,8 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
__syncthreads(); //don't start loading another kernel until we're done here
}
}
/*
Local Variables:
mode:c++
......
......@@ -29,7 +29,6 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
*/
#ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU
#include <stdint.h>
/*
#define CHECK_BANK_CONFLICTS 0
......@@ -182,12 +181,11 @@ template<> __device__ float convolutionRowNoFlip<0>(const float *data,
return 0;
}
template<int KERN_WIDTH>
__device__ void convolutionRowNoFlip(float& sum,
const float *data,
const float *kern, const int kern_wid){
if(KERN_WIDTH>0)
sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern);
if(THEANO_KERN_WID>0)
sum+=convolutionRowNoFlip<THEANO_KERN_WID>(data,kern);
else
#pragma unroll 8
for (int col=0; col < kern_wid; col++) {//loop over col
......@@ -219,13 +217,20 @@ __device__ void store_or_accumulate(float& dst,const float value ){
* When true, allow for output image bigger then 512 pixel.
* Use more registers.
*/
template<bool flipped_kern, int KERN_WIDTH, bool split>
__global__ void
conv_patch( const float* img, const float* kern, float* out,
template<bool flipped_kern, bool split>
__device__ inline void
conv_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack)
{
int __shared__ out_len, out_wid, nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -260,7 +265,7 @@ conv_patch( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
blockIdx.y*out_wid*out_len+//the output image
......@@ -271,7 +276,7 @@ conv_patch( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
......@@ -280,6 +285,28 @@ conv_patch( const float* img, const float* kern, float* out,
}
}
extern "C" {
/* C-linkage entry points for the conv_patch template so each
 * instantiation has a stable, unmangled name for host-side lookup.
 * The numeric suffix encodes the <flipped_kern, split> template
 * arguments as suffix = flipped_kern*2 + split; only the
 * flipped-kernel variants (2 and 3) are instantiated here. */
#define __INSTANTIATE_CONV_PATCH(suffix, ...) \
__global__ void \
conv_patch_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack) \
{ \
conv_patch<__VA_ARGS__>(img, img_offset, kern, kern_offset, \
out, out_offset, img_len, img_wid, kern_len, \
kern_wid, nkern, nstack); \
}
__INSTANTIATE_CONV_PATCH(2, true, false) // flipped kernel, no split
__INSTANTIATE_CONV_PATCH(3, true, true) // flipped kernel, split
#undef __INSTANTIATE_CONV_PATCH
}
/**
* As conv_patch, but implement the stack in the kernel.
 * I keep it separated from conv_patch as we take more registers and this could lower the occupancy.
......@@ -295,16 +322,17 @@ conv_patch( const float* img, const float* kern, float* out,
* dy: patch stride cols(1 for normal convolution)
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
 * template img_c_contiguous_2d: if true, the img's columns and rows are contiguous
 * template kern_c_contiguous_2d: if true, the kernel's columns and rows are contiguous
* template split: if true, each thread generate more than 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
* template subsample: if false, remove some computation needed when dx or dy!=1.
*/
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void
conv_patch_stack( const float* img, const float* kern, float* out,
template<bool flipped_kern, bool accumulate, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__device__ inline void
conv_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
......@@ -313,6 +341,11 @@ conv_patch_stack( const float* img, const float* kern, float* out,
int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{
int __shared__ nb_thread_id;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
......@@ -365,7 +398,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
......@@ -425,7 +458,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
//as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len)
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
......@@ -440,6 +473,67 @@ conv_patch_stack( const float* img, const float* kern, float* out,
}
extern "C" {
/* Instantiate the conv_patch_stack device template as extern "C" __global__
 * entry points so the host side can look the kernels up by name.  All 32
 * instantiations fix the first two template arguments (flipped_kern=true,
 * accumulate=false); the suffix appears to be 64 plus the binary encoding of
 * the remaining five booleans (img_c_contiguous_2d, kern_c_contiguous_2d,
 * split, preload_full_kern, subsample) -- generated symbol names must not
 * change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_PATCH_STACK: identifiers
 * starting with a double underscore are reserved for the implementation. */
#define INSTANTIATE_CONV_PATCH_STACK(suffix, ...)                       \
__global__ void                                                         \
conv_patch_stack_##suffix(const float *img, const size_t img_offset,    \
                          const float *kern, const size_t kern_offset,  \
                          float *out, const size_t out_offset,          \
                          int img_len, int img_wid, int kern_len, int kern_wid, \
                          int out_len, int out_wid, int nkern, int nstack, \
                          int img_stride_col, int img_stride_row,       \
                          int img_stride_stack, int img_stride_batch,   \
                          int kern_stride_col, int kern_stride_row,     \
                          int kern_stride_stack, int kern_stride_nkern, \
                          int dx, int dy)                               \
{                                                                       \
  conv_patch_stack<__VA_ARGS__>(                                        \
      img, img_offset, kern, kern_offset, out, out_offset,              \
      img_len, img_wid, kern_len, kern_wid, out_len,                    \
      out_wid, nkern, nstack, img_stride_col, img_stride_row,           \
      img_stride_stack, img_stride_batch,                               \
      kern_stride_col, kern_stride_row,                                 \
      kern_stride_stack, kern_stride_nkern, dx, dy);                    \
}
INSTANTIATE_CONV_PATCH_STACK(64, true, false, false, false, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(65, true, false, false, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(66, true, false, false, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(67, true, false, false, false, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(68, true, false, false, false, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(69, true, false, false, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(70, true, false, false, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(71, true, false, false, false, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(72, true, false, false, true, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(73, true, false, false, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(74, true, false, false, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(75, true, false, false, true, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(76, true, false, false, true, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(77, true, false, false, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(78, true, false, false, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(79, true, false, false, true, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(80, true, false, true, false, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(81, true, false, true, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(82, true, false, true, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(83, true, false, true, false, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(84, true, false, true, false, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(85, true, false, true, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(86, true, false, true, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(87, true, false, true, false, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(88, true, false, true, true, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(89, true, false, true, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(90, true, false, true, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(91, true, false, true, true, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(92, true, false, true, true, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(93, true, false, true, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(94, true, false, true, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)
#undef INSTANTIATE_CONV_PATCH_STACK
}
/**
* As conv_patch_stack, but kern_len thread for each output pixel
 * I keep it separated as it uses more registers.
......@@ -454,9 +548,11 @@ conv_patch_stack( const float* img, const float* kern, float* out,
 * template img_contiguous: if true, the img's columns and rows are contiguous
* template preload_full_kern: work only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel row at a time
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void
conv_patch_stack_reduce( const float* img, const float* kern, float* out,
template<bool flipped_kern, bool c_contiguous, bool split, bool preload_full_kern>
__device__ inline void
conv_patch_stack_reduce( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch,
......@@ -470,6 +566,17 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
......@@ -521,7 +628,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0;
if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2;
}
}else if(split){
......@@ -531,7 +638,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
}else{
int row = tz;//The row of the kernel.
......@@ -540,7 +647,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
......@@ -559,6 +666,49 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
}
}
extern "C" {
/* Instantiate the conv_patch_stack_reduce device template as extern "C"
 * __global__ entry points so the host side can look the kernels up by name.
 * The suffix is the 4-bit encoding of the template booleans
 * <flipped_kern, c_contiguous, split, preload_full_kern>; the combinations
 * with split=false and preload_full_kern=false are intentionally left
 * disabled (commented out).  Generated symbol names must not change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_PATCH_STACK_REDUCE: identifiers
 * starting with a double underscore are reserved for the implementation.
 * The stray '#' that appeared in two of the commented-out lines was also
 * removed for consistency. */
#define INSTANTIATE_CONV_PATCH_STACK_REDUCE(suffix, ...)                \
__global__ void                                                         \
conv_patch_stack_reduce_##suffix(                                       \
    const float *img, const size_t img_offset,                          \
    const float *kern, const size_t kern_offset,                        \
    float *out, const size_t out_offset,                                \
    int img_len, int img_wid, int kern_len, int kern_wid,               \
    int nkern, int nstack, int img_stride_col, int img_stride_row,      \
    int img_stride_stack, int img_stride_batch,                         \
    int kern_stride_col, int kern_stride_row,                           \
    int kern_stride_stack, int kern_stride_nkern)                       \
{                                                                       \
  conv_patch_stack_reduce<__VA_ARGS__>(                                 \
      img, img_offset, kern, kern_offset, out, out_offset,              \
      img_len, img_wid, kern_len, kern_wid, nkern, nstack,              \
      img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
      kern_stride_col, kern_stride_row,                                 \
      kern_stride_stack, kern_stride_nkern);                            \
}
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(0, false, false, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(1, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(2, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(3, false, false, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(4, false, true, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(5, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(6, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(7, false, true, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(9, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(10, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(11, true, false, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(13, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(14, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true)
#undef INSTANTIATE_CONV_PATCH_STACK_REDUCE
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* we store kern_len row of the image and the full kernel in the shared memory
......@@ -570,9 +720,11 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows( const float* img, const float* kern, float* out,
template<bool c_contiguous>
__device__ inline void
conv_rows( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack,
int img_stride_col, int img_stride_row,
......@@ -582,6 +734,11 @@ conv_rows( const float* img, const float* kern, float* out,
{
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -612,7 +769,7 @@ conv_rows( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -620,6 +777,36 @@ conv_rows( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
/* Instantiate the conv_rows device template as extern "C" __global__ entry
 * points so the host side can look the kernels up by name.  The suffix is
 * the value of the single template boolean <c_contiguous> (0=false, 1=true);
 * generated symbol names must not change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_ROWS: identifiers starting with
 * a double underscore are reserved for the implementation. */
#define INSTANTIATE_CONV_ROWS(suffix, ...)                              \
__global__ void                                                         \
conv_rows_##suffix(const float *img, const size_t img_offset,           \
                   const float *kern, const size_t kern_offset,         \
                   float *out, const size_t out_offset,                 \
                   int img_len, int img_wid, int kern_len, int kern_wid, \
                   int nkern, int nstack,                               \
                   int img_stride_col, int img_stride_row,              \
                   int img_stride_stack, int img_stride_batch,          \
                   int kern_stride_col, int kern_stride_row,            \
                   int kern_stride_stack, int kern_stride_nkern)        \
{                                                                       \
  conv_rows<__VA_ARGS__>(                                               \
      img, img_offset, kern, kern_offset, out, out_offset,              \
      img_len, img_wid, kern_len, kern_wid,                             \
      nkern, nstack, img_stride_col, img_stride_row,                    \
      img_stride_stack, img_stride_batch,                               \
      kern_stride_col, kern_stride_row,                                 \
      kern_stride_stack, kern_stride_nkern);                            \
}
INSTANTIATE_CONV_ROWS(0, false)
INSTANTIATE_CONV_ROWS(1, true)
#undef INSTANTIATE_CONV_ROWS
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows, but implement the stack. Separate as this use more register.
......@@ -631,9 +818,11 @@ conv_rows( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows_stack( const float* img, const float* kern, float* out,
template<bool c_contiguous>
__device__ inline void
conv_rows_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
......@@ -643,6 +832,11 @@ conv_rows_stack( const float* img, const float* kern, float* out,
{
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -708,7 +902,7 @@ conv_rows_stack( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads();//to be sure all thread have finished before we modif the shared memory.
}
......@@ -718,6 +912,38 @@ conv_rows_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
/* Instantiate the conv_rows_stack device template as extern "C" __global__
 * entry points so the host side can look the kernels up by name.  The suffix
 * is the value of the single template boolean <c_contiguous> (0=false,
 * 1=true); generated symbol names must not change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_ROWS_STACK: identifiers starting
 * with a double underscore are reserved for the implementation. */
#define INSTANTIATE_CONV_ROWS_STACK(suffix, ...)                        \
__global__ void                                                         \
conv_rows_stack_##suffix(                                               \
    const float *img, const size_t img_offset,                          \
    const float *kern, const size_t kern_offset,                        \
    float *out, const size_t out_offset,                                \
    const int img_len, const int img_wid,                               \
    const int kern_len, const int kern_wid,                             \
    const int nkern, const int nstack,                                  \
    const int img_stride_col, const int img_stride_row,                 \
    const int img_stride_stack, const int img_stride_batch,             \
    const int kern_stride_col, const int kern_stride_row,               \
    const int kern_stride_stack, const int kern_stride_nkern)           \
{                                                                       \
  conv_rows_stack<__VA_ARGS__>(                                         \
      img, img_offset, kern, kern_offset, out, out_offset,              \
      img_len, img_wid, kern_len, kern_wid,                             \
      nkern, nstack, img_stride_col, img_stride_row,                    \
      img_stride_stack, img_stride_batch,                               \
      kern_stride_col, kern_stride_row,                                 \
      kern_stride_stack, kern_stride_nkern);                            \
}
INSTANTIATE_CONV_ROWS_STACK(0, false)
INSTANTIATE_CONV_ROWS_STACK(1, true)
#undef INSTANTIATE_CONV_ROWS_STACK
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row.
......@@ -729,9 +955,11 @@ conv_rows_stack( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image and kernel in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
__global__ void
conv_rows_stack2(const float* img, const float* kern, float* out,
template<bool c_contiguous, bool preload_full_kern>
__device__ inline void
conv_rows_stack2(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
......@@ -741,6 +969,11 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
{
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
......@@ -804,7 +1037,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass by an intermediate variable to have more precission.
}
}
......@@ -816,6 +1049,39 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
/* Instantiate the conv_rows_stack2 device template as extern "C" __global__
 * entry points so the host side can look the kernels up by name.  The suffix
 * is the 2-bit encoding of the template booleans
 * <c_contiguous, preload_full_kern>; generated symbol names must not change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_ROWS_STACK2: identifiers
 * starting with a double underscore are reserved for the implementation. */
#define INSTANTIATE_CONV_ROWS_STACK2(suffix, ...)                       \
__global__ void                                                         \
conv_rows_stack2_##suffix(                                              \
    const float *img, const size_t img_offset,                          \
    const float *kern, const size_t kern_offset,                        \
    float *out, const size_t out_offset,                                \
    const int img_len, const int img_wid,                               \
    const int kern_len, const int kern_wid,                             \
    const int nkern, const int nstack,                                  \
    const int img_stride_col, const int img_stride_row,                 \
    const int img_stride_stack, const int img_stride_batch,             \
    const int kern_stride_col, const int kern_stride_row,               \
    const int kern_stride_stack, const int kern_stride_nkern)           \
{                                                                       \
  conv_rows_stack2<__VA_ARGS__>(                                        \
      img, img_offset, kern, kern_offset, out, out_offset,              \
      img_len, img_wid, kern_len, kern_wid, nkern, nstack,              \
      img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
      kern_stride_col, kern_stride_row,                                 \
      kern_stride_stack, kern_stride_nkern);                            \
}
INSTANTIATE_CONV_ROWS_STACK2(0, false, false)
INSTANTIATE_CONV_ROWS_STACK2(1, false, true)
INSTANTIATE_CONV_ROWS_STACK2(2, true, false)
INSTANTIATE_CONV_ROWS_STACK2(3, true, true)
#undef INSTANTIATE_CONV_ROWS_STACK2
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
......@@ -826,18 +1092,23 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
* TODO: explain parameters, preconditions
*/
template<bool stack_loop>
__global__ void
__device__ inline void
conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
const float *img, const size_t img_offset, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols,
const int initial_reduce_boundary)
{
const int outsize = nB * nK * out_len * out_wid;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
extern __shared__ float reducebuf[];
for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
{
......@@ -911,6 +1182,36 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
}
}
extern "C" {
/* Instantiate the conv_valid_row_reduce device template as extern "C"
 * __global__ entry points so the host side can look the kernels up by name.
 * The suffix is the value of the single template boolean <stack_loop>
 * (0=false, 1=true); generated symbol names must not change.
 * NOTE: the helper macro is purely local (it is #undef'd right after use),
 * so it was renamed from __INSTANTIATE_CONV_VALID_ROW_REDUCE: identifiers
 * starting with a double underscore are reserved for the implementation. */
#define INSTANTIATE_CONV_VALID_ROW_REDUCE(suffix, ...)                  \
__global__ void                                                         \
conv_valid_row_reduce_##suffix(                                         \
    int nB, int nK, int stacklen, int img_len, int img_wid,             \
    int kern_len, int kern_wid, int out_len, int out_wid,               \
    const float *img, const size_t img_offset,                          \
    int img_str_B, int img_str_S, int img_str_R, int img_str_C,         \
    const float *kern, const size_t kern_offset,                        \
    int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,     \
    float *out, const size_t out_offset,                                \
    int out_str_B, int out_str_K, int out_str_R, int out_str_C,         \
    int subsample_rows, int subsample_cols,                             \
    const int initial_reduce_boundary)                                  \
{                                                                       \
  conv_valid_row_reduce<__VA_ARGS__>(                                   \
      nB, nK, stacklen, img_len, img_wid,                               \
      kern_len, kern_wid, out_len, out_wid,                             \
      img, img_offset, img_str_B, img_str_S, img_str_R, img_str_C,      \
      kern, kern_offset, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
      out, out_offset, out_str_B, out_str_K, out_str_R, out_str_C,      \
      subsample_rows, subsample_cols, initial_reduce_boundary);         \
}
INSTANTIATE_CONV_VALID_ROW_REDUCE(0, false)
INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
#undef INSTANTIATE_CONV_VALID_ROW_REDUCE
}
/**
......@@ -920,18 +1221,26 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
*
* TODO: explain parameters, preconditions
*/
__global__ void
extern "C" __global__ void
conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
const float *img, const size_t img_offset,
int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ int numThreads, outsize;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x;
outsize = nB * nK * out_len * out_wid;
......@@ -972,6 +1281,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
}
}
/**
* Reference implementation of 'full' mode convolution (with stack)
*
......@@ -979,18 +1290,26 @@ conv_reference_valid(int nB, int nK, int stacklen,
*
* TODO: explain parameters, preconditions
*/
__global__ void
extern "C" __global__ void
conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions
const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C,
const float *img, const size_t img_offset,
int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ int numThreads, physical_outsize;
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x;
physical_outsize = nB * nK * out_len * out_wid;
......
from __future__ import print_function
import copy
import os
from theano.compat import izip
import numpy
......@@ -8,6 +9,7 @@ from theano import Apply, scalar, config
from theano import scalar as scal
from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined
from theano.gof.cmodule import GCC_compiler
from theano.scalar import Scalar
from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
......@@ -23,7 +25,6 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase, Kernel)
from .comp import NVCC_compiler
from .type import GpuArrayType
from .fp16_help import load_w, write_w
......@@ -57,7 +58,7 @@ def as_C_string_const(s):
for l in s.split('\n'))
class GpuElemwise(HideC, Elemwise):
class GpuElemwise(GpuKernelBase, HideC, Elemwise):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
_f16_ok = True
......@@ -150,39 +151,7 @@ class GpuElemwise(HideC, Elemwise):
code.append('}')
kop = '\n'.join(code)
# Translate types for scalar composite ops (except complex).
# NB: OpenCL implicitly has 'stdint' defs at the kernel
# compilation stage
support_code = "" if pygpu.get_default_context().kind == 'opencl' else """
#ifdef _MSC_VER
#define signed __int8 int8_t
#define unsigned __int8 uint8_t
#define signed __int16 int16_t
#define unsigned __int16 uint16_t
#define signed __int32 int32_t
#define unsigned __int32 uint32_t
#define signed __int64 int64_t
#define unsigned __int64 uint64_t
#else
#include <stdint.h>
#endif
"""
# Translate ga_ pseudo-types into their specific realizations
support_code += """
#define ga_bool uint8_t
#define ga_byte int8_t
#define ga_ubyte uint8_t
#define ga_short int16_t
#define ga_ushort uint16_t
#define ga_int int32_t
#define ga_uint uint32_t
#define ga_long int64_t
#define ga_ulong uint64_t
#define ga_float float
#define ga_double double
#define ga_half uint16_t
"""
support_code = ""
try:
# We accept only some c_support_code().
# This filter is done in the make_node()
......@@ -204,60 +173,64 @@ class GpuElemwise(HideC, Elemwise):
kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_headers(self):
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_compiler(self):
return GCC_compiler
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return NVCC_compiler
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_support_code(self):
    # GpuElemwise applies a scalar op elementwise, so any C support code the
    # scalar op requires (helper functions, constants, ...) must be emitted
    # for this op as well; delegate directly to the scalar op.
    return self.scalar_op.c_support_code()
def c_support_code_apply(self, node, nodename):
def _gpu_kernel_code(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
# This is useless by itself, but will serve an eventual c_code
# implementation
k = self.generate_kernel(node, nodename)
nd = node.inputs[0].type.ndim
CLUDA_PREAMBLE = """
#define local_barrier() __syncthreads();
#define WITHIN_KERNEL __device__
#define KERNEL extern "C" __global__
#define GLOBAL_MEM /* empty */
#define LOCAL_MEM __shared__
#define LOCAL_MEM_ARG /* empty */
#define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y*Z, 1)
#define LID_0 threadIdx.x
#define LID_1 threadIdx.y
#define LID_2 threadIdx.z
#define GID_0 blockIdx.x
#define GID_1 blockIdx.y
#define GID_2 blockIdx.z
#define LDIM_0 blockDim.x
#define LDIM_1 blockDim.y
#define LDIM_2 blockDim.z
#define GDIM_0 gridDim.x
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
"""
res = [CLUDA_PREAMBLE]
res = []
for i in range(0, nd + 1):
res.append(k.render_basic(i, name="elem_" + str(i)) + ';')
res.append(k.contig_src + ';')
return '\n'.join(res)
def gpu_kernels(self, node, nodename):
    # Build the Kernel object describing the generated elementwise CUDA
    # kernel for this node, so GpuKernelBase can compile/register it via
    # libgpuarray.  Returns a one-element list.
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    # Full kernel source (all arities 0..nd plus the contiguous variant).
    src = self._gpu_kernel_code(node, nodename)
    nd = node.outputs[0].ndim
    # Parameter signature must mirror the argument order used at call time
    # in c_code: numEls, then one dimension per axis...
    params = ['uintp']
    params.extend('uintp' for _ in range(nd))
    num_inputs = len(node.inputs)
    num_outputs = len(node.outputs)
    # ...then, for each non-inplace input/output: the GpuArray data pointer,
    # its offset, and one stride per axis.  Outputs that alias an input via
    # inplace_pattern are skipped (they reuse the input's buffer).
    for n in range(num_inputs + num_outputs):
        if (n - len(node.inputs)) in self.inplace_pattern:
            continue
        params.extend([gpuarray.GpuArray, 'uintp'])
        params.extend('intp' for _ in range(nd))
    # acc_dtype is optional on the op; fall back to the output dtype when
    # it is absent or unset.
    acc_dtype = getattr(self, 'acc_dtype', None)
    if acc_dtype is None:
        acc_dtype = node.outputs[0].type.dtype
    # objvar must match the kernel variable name referenced from c_code
    # ('elem_<nd>_<nodename>').
    return [Kernel(code=src, name="elem_%d" % nd, params=params,
                   flags=Kernel.get_flags(node.inputs[0].type.dtype,
                                          acc_dtype,
                                          node.outputs[0].type.dtype),
                   objvar='elem_%d_%s' % (nd, nodename))]
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
......@@ -273,11 +246,15 @@ class GpuElemwise(HideC, Elemwise):
# check that all inputs have valid dimensions
emitted_inames = {}
num_kernel_params = 1 + nd + len(inputs + outputs) * (2 + nd)
code = """
int n_blocks = 0;
int threads_per_block = 0;
size_t n_blocks = 0;
size_t threads_per_block = 0;
size_t numEls = 0;
"""
const ssize_t zero = 0;
void *kernel_params[%(num_kernel_params)d] = {0};
int err;
""" % locals()
if nd > 0:
code += """
size_t dims[%(nd)s] = {%(initial_dims)s};
......@@ -416,23 +393,41 @@ class GpuElemwise(HideC, Elemwise):
//std::cerr << "calling callkernel returned\\n";
""" % locals()
code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals()
param = []
kname = 'elem_%d_%s' % (nd, name)
param = ["(void *)&numEls"]
for i in range(nd):
param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern:
continue
dtype = dtype_to_ctype(var.dtype)
param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
param.append("%(name)s->ga.offset" % locals())
param.append("(void *)%(name)s->ga.data" % locals())
param.append("(void *)&%(name)s->ga.offset" % locals())
for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
code += ',\n'.join(param) + ");\n"
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? (void *)&zero: (void *)&PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
for n, p in enumerate(param):
code += "kernel_params[%(n)d] = %(p)s;\n" % locals()
code += """
err = GpuKernel_call(&%(kname)s, 1, &threads_per_block, &n_blocks, 0, kernel_params);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % dict(kname=kname,fail=fail)
if config.gpuarray.sync:
code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z)
code += """
err = GpuArray_sync(&%(z)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % locals()
return str(code)
def perform(self, node, inputs, output_storage):
......@@ -573,7 +568,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return (4,)
class GpuCAReduceCuda(HideC, CAReduceDtype):
class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"""
GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
......@@ -737,12 +732,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
return False
return True
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
......@@ -840,7 +837,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
# \begin bracket the reduction in a check that there is
# actually work to do
if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals()
zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
# TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else:
scalar_op = self.scalar_op
......@@ -891,28 +888,24 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
.. code-block:: c
ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
if (verbose)
printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s)
);
[
if config.gpuarray.sync:
code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z)
]
if (cudaSuccess != cudaGetLastError())
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: ... );
%(fail)s;
}
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0,
(void *)&stride_A1,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
"""
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
......@@ -923,64 +916,66 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
ndim = len(self.reduce_mask)
nd_out = ndim - sum(self.reduce_mask)
shapes_format = "shape=(%s)" % ",".join(["%llu"] * node.inputs[0].ndim)
shapes_data = ",".join(["(unsigned long long) PyGpuArray_DIMS(%s)[%d]" % (x, i)
shapes_data = ",".join(["(size_t) PyGpuArray_DIMS(%s)[%d]" % (x, i)
for i in range(node.inputs[0].ndim)])
k_var = "kernel_reduce_%(pattern)s_%(name)s" % locals()
params = []
print("""
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
" nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d,"
" nb_block=%%d, n_shared=%%d, %(shapes_format)s\\n",
n_threads.x,n_threads.y,n_threads.z,
n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared, %(shapes_data)s);
kernel_reduce_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" % locals(), file=sio)
for i in xrange(ndim):
print("""
PyGpuArray_DIMS(%(x)s)[%(i)s],
""" % locals(), file=sio)
print("""
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset)
""" % locals(), file=sio)
params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals())
params.append("(void *)%(x)s->ga.data" % locals())
params.append("(void *)&%(x)s->ga.offset" % locals())
for i in xrange(ndim):
print("""
,PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s)
""" % locals(), file=sio)
print("""
,(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset)
ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s);
""" % locals(), file=sio)
params.append("(void *)&stride_A%(i)d" % locals())
params.append("(void *)%(z)s->ga.data" % locals())
params.append("(void *)&%(z)s->ga.offset" % locals())
for i in xrange(nd_out):
print("""
,PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s)
ssize_t stride_Z%(i)d = PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s);
""" % locals(), file=sio)
params.append("(void *)&stride_Z%(i)d" % locals())
kernel_params = ', '.join(params)
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
print("""
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0] * n_threads[1] * n_threads[2];
void *kernel_params[] = { %(kernel_params)s };
if (verbose>1)
printf("n_threads[0]=%%lu, n_threads[1]=%%lu, "
"n_threads[2]=%%lu, n_threads=%%lu, "
"n_blocks[0]=%%lu, n_blocks[1]=%%lu, n_blocks[2]=%%lu, "
"n_blocks=%%lu, n_shared=%%d, %(shapes_format)s\\n",
n_threads[0],n_threads[1],
n_threads[2],
n_threads[0]*n_threads[1]*
n_threads[2],
n_blocks[0],n_blocks[1],n_blocks[2],
n_blocks[0]*n_blocks[1]*n_blocks[2],
n_shared, %(shapes_data)s);
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
""" % locals(), file=sio)
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
);
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)"
" %(shapes_format)s \\n",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
%(shapes_data)s);
%(fail)s;
}
""" % locals(), file=sio)
return sio.getvalue()
......@@ -993,66 +988,86 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
.. code-block:: c
static __global__ void kernel_reduce_110_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A,
const int sA0,
const int sA1,
const int sA2,
%(out_dtype)s * Z,
const int sZ0)
KERNEL void kernel_reduce_110_%(nodename)s(
const ga_size d0,
const ga_size d1,
const ga_size d2,
const %(in_type)s *A,
const ga_size offset_A,
const ga_ssize sA0,
const ga_ssize sA1,
const ga_ssize sA2,
%(out_type)s * Z,
const ga_size offset_Z,
const ga_ssize sZ0)
Since the nodename is unique, we don't need to put the name
of the scalar_op in here.
"""
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
if reduce_mask is None:
reduce_mask = self.reduce_mask
if ndim is None:
ndim = len(reduce_mask)
if pattern is None:
pattern = ''.join(str(i) for i in reduce_mask)
kname = "kernel_reduce_%(pattern)s" % locals()
k_var = "kernel_reduce_%(pattern)s_%(nodename)s" % locals()
params = []
sio = StringIO()
print("""
static __global__ void kernel_reduce_%(pattern)s_%(nodename)s(
KERNEL void %(kname)s(
""" % locals(), file=sio)
for i in xrange(ndim):
params.append('uintp')
print("""
const int d%(i)s,
const ga_size d%(i)s,
""" % locals(), file=sio)
params.append(gpuarray.GpuArray)
params.append('uintp')
print("""
const %(in_dtype)s *A,
const %(in_type)s *A, const ga_size offset_A,
""" % locals(), file=sio)
for i in xrange(ndim):
params.append('intp')
print("""
const int sA%(i)s,
const ga_ssize sA%(i)s,
""" % locals(), file=sio)
params.append(gpuarray.GpuArray)
params.append('uintp')
print("""
%(out_dtype)s * Z
%(out_type)s * Z, const ga_size offset_Z
""" % locals(), file=sio)
for i in xrange(ndim - sum(reduce_mask)):
params.append('intp')
print("""
, const int sZ%(i)s
, const ga_ssize sZ%(i)s
""" % locals(), file=sio)
print(")", file=sio)
return sio.getvalue()
return sio.getvalue(), kname, params, k_var
def _k_init(self, node, nodename):
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype)
# We need to use theano_complex* and not npy_complex*
acc_dtype = theano.scalar.basic.Scalar(acc_dtype).dtype_specs()[1]
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
return """
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = 0;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = 0;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
......@@ -1315,7 +1330,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals()
zero_shp = "GpuArray_memset(&%(z)s->ga, 0)" % locals()
# TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else:
zero_shp = """
......@@ -1325,44 +1340,43 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
""" % locals()
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
k_var = "kernel_reduce_ccontig_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
if(PyGpuArray_SIZE(%(x)s)==0){
%(zero_shp)s;
}else{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_SIZE(%(x)s),
(size_t) 256));
dim3 n_blocks(1);
size_t numEls = PyGpuArray_SIZE(%(x)s);
size_t n_threads = std::min(numEls, (size_t) 256);
size_t n_blocks = 1;
void *kernel_params[] = {(void *)&numEls,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset};
if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
" n_threads.x=%%d, size=%%d, ndim=%%d\\n",
n_threads.x,PyGpuArray_SIZE(%(x)s),
" n_threads=%%lu, size=%%lu, ndim=%%d\\n",
n_threads,numEls,
PyGpuArray_NDIM(%(x)s));
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x;
kernel_reduce_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
PyGpuArray_SIZE(%(x)s),
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset));
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
%(err_check)s
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_ccontig_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
}
""" % locals(), file=sio)
......@@ -1372,10 +1386,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 256));
dim3 n_blocks(1);
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1385,15 +1397,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 256));
while (n_threads.y * n_threads.x <= 256) ++n_threads.y;
n_threads.y -= 1;
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0])
n_threads.y = PyGpuArray_DIMS(%(x)s)[0];
dim3 n_blocks(1);
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
while (n_threads[1] * n_threads[0] <= 256) ++n_threads[1];
n_threads[1] -= 1;
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
n_threads[1] = PyGpuArray_DIMS(%(x)s)[0];
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1421,25 +1432,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
threads_y = """
//get as many y threads as we can fit
while (n_threads.x * (n_threads.y+1) <= 256)
while (n_threads[0] * (n_threads[1]+1) <= 256)
{
if (n_threads.y < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
n_threads.y += 1;
if (n_threads[1] < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
n_threads[1] += 1;
else
break;
}""" % locals()
threads_z = """
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256)
while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
{
if (n_threads.z < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
n_threads.z += 1;
if (n_threads[2] < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
n_threads[2] += 1;
else
break;
}
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
n_threads[2] = std::min(n_threads[2], (size_t)64);
""" % locals()
if len(self.reduce_mask) == 2:
......@@ -1452,13 +1463,10 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[%(N)s],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[%(N)s], (size_t) 256), 1, 1};
%(threads_y)s
%(threads_z)s
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1476,9 +1484,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
k_var = "kernel_reduce_10_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
int verbose = 0;
......@@ -1491,95 +1511,71 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
// we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1);
size_t n_threads[3] = {32, 1, 1};
// We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32
// where E==32
int A = 1;
int B = PyGpuArray_DIMS(%(x)s)[0];
int C = PyGpuArray_DIMS(%(x)s)[1];
int D = C/32;
GpuKernel *%(k_var)s = &kernel_reduce_010_AD_%(name)s;
size_t A = 1;
size_t B = PyGpuArray_DIMS(%(x)s)[0];
size_t C = PyGpuArray_DIMS(%(x)s)[1];
size_t D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
if (n_blocks.x > 4096) n_blocks.x = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads>>>(
A,B,C,D,
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
1,
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
1,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s)
);
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_10_AD%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
size_t n_blocks[3] = {A, D, 1};
if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
ssize_t stride_A0 = 1;
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = 1;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&A, (void *)&B, (void *)&C, (void *)&D,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}else{
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 256));
dim3 n_blocks(1,
std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 4096));
GpuKernel *%(k_var)s = &kernel_reduce_010_%(name)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 4096), 1};
if (verbose) {
fprintf(stderr,
"running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x,
n_blocks.y);
n_blocks[0],
n_blocks[1]);
}
assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
int n_shared = sizeof(%(acc_dtype)s) * n_threads.x;
kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1,
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
1,
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
1,
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s)
);
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0];
size_t dim_0 = 1;
ssize_t stride_A0 = 1;
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = 1;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&dim_0,
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_threads, n_blocks, n_shared, kernel_params);
%(err_check)s
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
}
""" % locals(), file=sio)
......@@ -1591,9 +1587,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
pattern = ''.join(str(i) for i in self.reduce_mask)
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
k_var = "kernel_reduce_010_AD_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
//int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
......@@ -1608,108 +1616,82 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
// we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1);
size_t n_threads[3] = {32, 1, 1};
// We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32
// where E==32
int A = PyGpuArray_DIMS(%(x)s)[0];
int B = PyGpuArray_DIMS(%(x)s)[1];
int C = PyGpuArray_DIMS(%(x)s)[2];
int D = C/32;
size_t A = PyGpuArray_DIMS(%(x)s)[0];
size_t B = PyGpuArray_DIMS(%(x)s)[1];
size_t C = PyGpuArray_DIMS(%(x)s)[2];
size_t D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
if (n_blocks.x > 4096) n_blocks.x = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x;
int n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s),
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s)
);
size_t n_blocks[3] = {A, D, 1};
if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&A, (void *)&B, (void *)&C, (void *)&D,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
else
{
int verbose = 2;
dim3 n_threads(std::min((size_t) 32,
PyGpuArray_DIMS(%(x)s)[2]));
while( (n_threads.x*(n_threads.y+1)<=256)
&& (n_threads.y<PyGpuArray_DIMS(%(x)s)[1])){
n_threads.y++;
size_t n_threads[3] = {std::min((size_t) 32, PyGpuArray_DIMS(%(x)s)[2]), 1, 1};
while( (n_threads[0]*(n_threads[1]+1)<=256)
&& (n_threads[1]<PyGpuArray_DIMS(%(x)s)[1])){
n_threads[1]++;
}
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t)4096));
n_blocks.y = std::min(
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096), 1, 1};
n_blocks[1] = std::min(
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x),
(size_t)(4096 / n_blocks.x)
(size_t)n_threads[0]),
(size_t)(4096 / n_blocks[0])
);
if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s)),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s))
==PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s)
&& n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x)){
&& n_blocks[1]==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads[0])){
if(verbose>1)
printf("n_block.x.1=%%d, n_block.x.2=%%d, n_block.y.1=%%d, n_block.y.2=%%d,\\n",
PyGpuArray_DIMS(%(x)s)[0],4096,
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads.x),
(size_t)(4096 / n_blocks.x));
assert(n_threads.x<=32);
ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads[0]),
(size_t)(4096 / n_blocks[0]));
assert(n_threads[0]<=32);
%(makecall_inner)s
}else{
n_threads.x = std::min(PyGpuArray_DIMS(%(x)s)[1],
n_threads[0] = std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 256);
n_blocks.x = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
n_blocks.y = std::min(
n_blocks[0] = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
n_blocks[1] = std::min(
PyGpuArray_DIMS(%(x)s)[2],
(size_t)(4096 / n_blocks.x)
(size_t)(4096 / n_blocks[0])
);
%(makecall)s
}
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
}
""" % locals(), file=sio)
......@@ -1719,16 +1701,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[3],
(size_t) 256));
while (n_threads.x * n_threads.y <= 256)
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads[0] * n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1]) break;
n_threads.y += 1;
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1]) break;
n_threads[1] += 1;
}
n_threads.y -= 1;
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2]);
n_threads[1] -= 1;
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2], 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1738,7 +1718,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
sync = bool(config.gpuarray.sync)
k_var = "kernel_reduce_010_AD_%(name)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
# use threadIdx.x for i0
# use blockIdx.x for i1
# use blockIdx.y for i2
......@@ -1747,15 +1741,12 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
int verbose = 0;
if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){
printf("slow\\n");
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 256));
dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t)4096));
while (n_blocks.x * (n_blocks.y+1) <= 4096 &&
n_blocks.y <= PyGpuArray_DIMS(%(x)s)[2])
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096), 1, 1};
while (n_blocks[0] * (n_blocks[1]+1) <= 4096 &&
n_blocks[1] <= PyGpuArray_DIMS(%(x)s)[2])
{
n_blocks.y += 1;
n_blocks[1] += 1;
}
%(makecall)s
}
......@@ -1763,50 +1754,38 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{ // reuse 010_AD kernel, we transpose the 2 first dim
// See the reduction for the real 010_AD kernel for
// explanation. We do this to get coalesced read.
dim3 n_threads(32,1,1);
size_t n_threads[3] = {32, 1, 1};
int A = PyGpuArray_DIMS(%(x)s)[1];
int B = PyGpuArray_DIMS(%(x)s)[0];
int C = PyGpuArray_DIMS(%(x)s)[2];
int D = C/32;
size_t A = PyGpuArray_DIMS(%(x)s)[1];
size_t B = PyGpuArray_DIMS(%(x)s)[0];
size_t C = PyGpuArray_DIMS(%(x)s)[2];
size_t D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
if (n_blocks.x > 4096) n_blocks.x = 4096;
if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x;
int n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
(%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s),
(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s),
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s)
);
if (%(sync)d)
GpuArray_sync(&%(z)s->ga);
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
size_t n_blocks[3] = {A, D, 1};
if (n_blocks[0] > 4096) n_blocks[0] = 4096;
if (n_blocks[0]*n_blocks[1] > 4096) n_blocks[1] = 4096/n_blocks[0];
size_t n_shared = 0;
ssize_t stride_A0 = PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s);
ssize_t stride_A1 = PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s);
ssize_t stride_A2 = PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s);
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s);
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s);
void *kernel_params[] = {
(void *)&A, (void *)&B, (void *)&C, (void *)&D,
(void *)%(x)s->ga.data,
(void *)&%(x)s->ga.offset,
(void *)&stride_A0, (void *)&stride_A1, (void *)&stride_A2,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_threads, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}
}
""" % locals(), file=sio)
......@@ -1815,18 +1794,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[1],
(size_t) 256));
while (n_threads.x*n_threads.y <= 256)
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
while (n_threads[0]*n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0])
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[0])
break;
n_threads.y += 1;
n_threads[1] += 1;
}
n_threads.y -= 1;
n_threads[1] -= 1;
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[2]);
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[2], 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1836,19 +1813,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
dim3 n_blocks(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
while (n_blocks.x * n_blocks.y <= 4096)
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
while (n_blocks[0] * n_blocks[1] <= 4096)
{
if (n_blocks.y > PyGpuArray_DIMS(%(x)s)[1])
if (n_blocks[1] > PyGpuArray_DIMS(%(x)s)[1])
break;
n_blocks.y += 1;
n_blocks[1] += 1;
}
n_blocks.y -= 1;
n_blocks[1] -= 1;
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1858,31 +1831,29 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
//get as many y threads as we can fit
while (n_threads.x * n_threads.y <= 256)
while (n_threads[0] * n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1])
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
break;
n_threads.y += 1;
n_threads[1] += 1;
}
n_threads.y -= 1;
n_threads[1] -= 1;
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= 256)
while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
{
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
break;
n_threads.z += 1;
n_threads[2] += 1;
}
n_threads.z -= 1;
n_threads[2] -= 1;
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
n_threads[2] = std::min(n_threads[2], (size_t)64);
dim3 n_blocks(1,1,1);
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1896,24 +1867,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
int verbose = 0;
dim3 n_blocks(
std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t) 4096));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
while (n_blocks.x * n_blocks.y <= 4096 &&
n_blocks.y < PyGpuArray_DIMS(%(x)s)[1])
while (n_blocks[0] * n_blocks[1] <= 4096 &&
n_blocks[1] < PyGpuArray_DIMS(%(x)s)[1])
{
n_blocks.y += 1;
n_blocks[1] += 1;
}
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[3],
(size_t) 256));
while (n_threads.x * n_threads.y <= 256
&& n_threads.y < PyGpuArray_DIMS(%(x)s)[2]
&& n_threads.x * n_threads.y * sizeof(%(acc_dtype)s) <=(15*1024-200))
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads[0] * n_threads[1] <= 256
&& n_threads[1] < PyGpuArray_DIMS(%(x)s)[2]
&& n_threads[0] * n_threads[1] * sizeof(%(acc_dtype)s) <=(15*1024-200))
{
n_threads.y += 1;
n_threads[1] += 1;
}
%(makecall)s
......@@ -1925,32 +1892,30 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[2],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
//get as many y threads as we can fit
while (n_threads.x * n_threads.y <= 256)
while (n_threads[0] * n_threads[1] <= 256)
{
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1])
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[1])
break;
n_threads.y += 1;
n_threads[1] += 1;
}
n_threads.y -= 1;
n_threads[1] -= 1;
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * n_threads.z <= 256)
while (n_threads[0] * n_threads[1] * n_threads[2] <= 256)
{
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
break;
n_threads.z += 1;
n_threads[2] += 1;
}
n_threads.z -= 1;
n_threads[2] -= 1;
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
n_threads[2] = std::min(n_threads[2], (size_t)64);
dim3 n_blocks(1,1,1);
size_t n_blocks[3] = {1, 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
......@@ -1960,27 +1925,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(%(x)s)[3],
(size_t) 256));
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
while (n_threads.x * (n_threads.y+1) <= 256) ++n_threads.y;
if (n_threads.y > PyGpuArray_DIMS(%(x)s)[2])
n_threads.y = PyGpuArray_DIMS(%(x)s)[2];
while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];
while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256) ++n_threads.z;
if (n_threads.z > 64)
n_threads.z = 64;
if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
n_threads.z = PyGpuArray_DIMS(%(x)s)[0];
while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256) ++n_threads[2];
if (n_threads[2] > 64)
n_threads[2] = 64;
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];
dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[1]);
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [15] # the version corresponding to the c code in this Op
version = [16] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
......@@ -1994,14 +1957,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
else:
return ()
def c_support_code_apply(self, node, nodename):
sio = StringIO()
def gpu_kernels(self, node, nodename):
nd_in = len(self.reduce_mask)
in_dtype = "npy_" + node.inputs[0].dtype
out_dtype = "npy_" + node.outputs[0].dtype
acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
load_in = load_w(node.inputs[0].dtype)
write_out = write_w(node.outputs[0].dtype)
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype)
flags=Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
load_in = load_w(in_dtype)
write_out = write_w(out_dtype)
kernels = []
if all(i == 1 for i in self.reduce_mask):
# this kernel is ok for up to a few thousand elements, but
......@@ -2011,16 +1978,21 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_ccontig"
k_var = "kernel_reduce_ccontig_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_ccontig_%(nodename)s(
const unsigned int d0,
const %(in_dtype)s *A,
%(out_dtype)s * Z)
KERNEL void %(kname)s(
const ga_size d0,
const %(in_type)s *A, const ga_size offset_A,
%(out_type)s *Z, const ga_size offset_Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2034,6 +2006,13 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp',
gpuarray.GpuArray, 'uintp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1,):
# this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
......@@ -2042,16 +2021,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_1"
k_var = "kernel_reduce_1_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_1_%(nodename)s(
const unsigned int d0,
const %(in_dtype)s *A, const int sA0,
%(out_dtype)s * Z)
KERNEL void %(kname)s(
const ga_size d0,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0,
%(out_type)s * Z, const ga_size offset_Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2065,6 +2050,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp',
gpuarray.GpuArray, 'uintp',
'intp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1):
# this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
......@@ -2073,17 +2066,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
kname = "kernel_reduce_11"
k_var = "kernel_reduce_11_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_11_%(nodename)s(
const int d0,
const int d1,
const %(in_dtype)s *A, const int sA0, const int sA1,
%(out_dtype)s * Z)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1,
%(out_type)s * Z, const ga_size offset_Z)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2100,6 +2098,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp',
gpuarray.GpuArray, 'uintp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
#01, 011, 0111
if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and
......@@ -2144,17 +2150,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]', node,
nodename, sub={})
param_dim = ",".join(["const int d%d" % i
param_dim = ",".join(["const ga_size d%d" % i
for i in xrange(nd_in)])
param_strides = ",".join(["const int sA%d" % i
param_strides = ",".join(["const ga_ssize sA%d" % i
for i in xrange(nd_in)])
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals())
reduce_fct = self._assign_reduce(
node, nodename, "myresult",
load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
{}, True)
sio = StringIO()
print("""
%(decl)s{
%(init)s
......@@ -2171,6 +2178,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
# this kernel uses one block for each column,
# threads per block for each element per column.
......@@ -2184,18 +2193,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])")
kname = "kernel_reduce_010"
k_var = "kernel_reduce_010_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_010_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A, const int sA0,
const int sA1, const int sA2,
%(out_dtype)s * Z, const int sZ0, const int sZ1)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0, const ga_ssize sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
extern __shared__ %(acc_type)s buf[];
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2207,7 +2220,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
%(reduce_fct)s;
......@@ -2218,25 +2231,36 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask in [(0, 1, 0), (1, 0), (1, 0, 0)]:
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(X[a * sX0 + b * sX1 + c * sX2])",
{}, True)
reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])")
kname = "kernel_reduce_010_AD"
k_var = "kernel_reduce_010_AD_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_010_AD_%(nodename)s(
const int A,
const int B,
const int C,
const int D,
//const int E, // THIS is 32
const %(in_dtype)s *X, const int sX0,
const int sX1, const int sX2,
%(out_dtype)s * Z, const int sZ0, const int sZ1)
KERNEL void %(kname)s(
const ga_size A, const ga_size B, const ga_size C, const ga_size D,
const %(in_type)s *X, const ga_size offset_X,
const ga_ssize sX0, const ga_ssize sX1, const ga_ssize sX2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0, const ga_ssize sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
%(acc_dtype)s myresult = 0;
%(acc_type)s myresult = 0;
X = (const %(in_type)s *)(((char *)X)+offset_X);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2262,6 +2286,15 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0):
#
# This kernel is optimized when the inner most dimensions
......@@ -2275,7 +2308,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
# block.x = dim 0
# block.y = dim 1 rest
init = self._k_init(node, nodename)
decl = self._k_decl(node, nodename, pattern="010_inner")
decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
node, nodename,
'blockDim.x')
......@@ -2283,6 +2316,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2307,6 +2341,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 0):
# this kernel uses one block for each column,
# threads per block for each element per column.
......@@ -2319,19 +2355,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])")
kname = "kernel_reduce_110"
k_var = "kernel_reduce_110_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_110_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A, const int sA0,
const int sA1, const int sA2,
%(out_dtype)s * Z, const int sZ0)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2351,15 +2391,25 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 0):
reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]',
node, nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2378,15 +2428,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]', node,
nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2405,6 +2458,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1):
# this kernel uses one block for each row,
# threads per block for each element per row.
......@@ -2414,18 +2469,22 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
kname = "kernel_reduce_001"
k_var = "kernel_reduce_001_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_001_%(nodename)s(
const int d0,
const int d1,
const int d2,
const %(in_dtype)s *A, const int sA0,
const int sA1, const int sA2,
%(out_dtype)s * Z, const int sZ0, const int sZ1)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0, const ga_ssize sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
extern __shared__ %(acc_type)s buf[];
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2436,7 +2495,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
%(reduce_fct)s;
......@@ -2446,17 +2505,27 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
params = [
'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1, 1):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2466,7 +2535,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
......@@ -2479,17 +2548,20 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 1, 0, 1):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]',
node, nodename, sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2499,7 +2571,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
%(acc_dtype)s myresult = %(reduce_init)s;
%(acc_type)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
......@@ -2512,15 +2584,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
}
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]', node, nodename,
sub={})
decl = self._k_decl(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[0])")
sio = StringIO()
print("""
%(decl)s
{
......@@ -2540,6 +2615,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 1, 1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
node, nodename, sub={})
......@@ -2547,20 +2624,23 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])",
{}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
kname = "kernel_reduce_1011"
k_var= "kernel_reduce_1011_" + nodename
sio = StringIO()
print("""
static __global__ void kernel_reduce_1011_%(nodename)s(
const unsigned int d0,
const unsigned int d1,
const unsigned int d2,
const unsigned int d3,
const %(in_dtype)s *A, const int sA0, const int sA1,
const int sA2, const int sA3,
%(out_dtype)s * Z, const int sZ0)
KERNEL void %(kname)s(
const ga_size d0, const ga_size d1, const ga_size d2, const ga_size d3,
const %(in_type)s *A, const ga_size offset_A,
const ga_ssize sA0, const ga_ssize sA1, const ga_ssize sA2, const ga_ssize sA3,
%(out_type)s * Z, const ga_size offset_Z,
const ga_ssize sZ0)
{
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ %(acc_dtype)s buf[];
%(acc_dtype)s myresult = %(reduce_init)s;
extern __shared__ %(acc_type)s buf[];
%(acc_type)s myresult = %(reduce_init)s;
A = (const %(in_type)s *)(((char *)A)+offset_A);
Z = (%(out_type)s *)(((char *)Z)+offset_Z);
if (warpSize != 32)
{
......@@ -2580,14 +2660,16 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
%(reducebuf)s
}
""" % locals(), file=sio)
print("""
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a % b) ? 1: 0);
}
""", file=sio)
return sio.getvalue()
params = [
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'intp', 'intp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp',
'intp'
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
return kernels
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
......@@ -2820,8 +2902,15 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(output)s = tmp;
}
if (%(sync)d)
GpuArray_sync(&%(output)s->ga);
if (%(sync)d) {
err = GpuArray_sync(&%(output)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: GpuCAReduceCPY: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s
}
}
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
......
......@@ -3,6 +3,12 @@ Helper routines for generating gpu kernels for nvcc.
"""
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
......@@ -26,7 +32,7 @@ def nvcc_kernel(name, params, body):
else:
yield b
bodystr = ';\n'.join(flatbody())
return """__global__ void %(name)s (%(paramstr)s)
return """KERNEL void %(name)s (%(paramstr)s)
{
%(bodystr)s;
}
......@@ -167,11 +173,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
We use __i as an int variable in a loop.
"""
ctype = gpuarray.dtype_to_ctype(dtype)
return [
# get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype,
('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
......@@ -181,7 +188,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype,
('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()',
# divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N +
......@@ -259,11 +266,12 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
ctype = gpuarray.dtype_to_ctype(dtype)
return """
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
npy_%(dtype)s red = %(init)s;
%(ctype)s red = %(init)s;
#pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
red = %(loop_line)s;
......@@ -356,6 +364,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
We use tx as an int variable in a loop.
"""
ctype = gpuarray.dtype_to_ctype(dtype)
ret = [
# get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
......@@ -363,7 +372,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
b, stride_b, load_b,
dtype),
'__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype,
('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
threadPos, threadCount,
......@@ -371,7 +380,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
lambda a: "exp(%s - row_max)" % a,
b, stride_b, load_b, dtype),
'__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype,
('%s row_sum = ' + buf + '[0]') % ctype,
'__syncthreads()',
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
]
......
import os
import numpy
from theano import Op, Apply, config
......@@ -12,13 +13,14 @@ except ImportError:
pass
from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host)
host_from_gpu, gpu_from_host,
GpuKernelBase, Kernel)
from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType
from .comp import NVCC_compiler
class GpuImages2Neibs(Images2Neibs, Op):
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
def __init__(self, mode='valid'):
if mode not in ['valid', 'ignore_borders', 'wrap_centered']:
raise NotImplementedError("Only the mode valid, ignore_borders"
......@@ -43,25 +45,41 @@ class GpuImages2Neibs(Images2Neibs, Op):
dtype=ten4.type.dtype)()])
def c_code_cache_version(self):
return (9, 1)
return (10,1)
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype
dtype_z = node.outputs[0].dtype
flags = Kernel.get_flags(dtype_ten4, dtype_z)
type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
type_z = gpuarray.dtype_to_ctype(dtype_z)
mode = self.mode
return """
kernels = []
kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename
code = """
//a version that use less register but don't work in all case.
static __global__ void k_multi_warp_less_%(nodename)s(
KERNEL void %(kname)s(
const int nb_batch,
const int nb_stack,
const int height,
......@@ -72,15 +90,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
const int step_y,
const int grid_c,
const int grid_d,
const int stride0, const int stride1,
const int stride2, const int stride3,
npy_%(dtype_ten4)s * global_ten4,
const int out_s0, const int out_s1,
npy_%(dtype_z)s * global_out
const size_t stride0, const size_t stride1,
const size_t stride2, const size_t stride3,
const %(type_ten4)s * global_ten4, const size_t offset_ten4,
const size_t out_s0, const size_t out_s1,
%(type_z)s * global_out, const size_t offset_out
)
{
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d;
......@@ -131,9 +151,22 @@ class GpuImages2Neibs(Images2Neibs, Op):
}
}
}
}
static __global__ void k_multi_warp_%(nodename)s(
}""" % locals()
params = [
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "k_multi_warp"
k_var = "k_multi_warp_" + nodename
code = """
KERNEL void %(kname)s(
const int nb_batch,
const int nb_stack,
const int height,
......@@ -144,15 +177,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
const int step_y,
const int grid_c,
const int grid_d,
const int stride0, const int stride1,
const int stride2, const int stride3,
npy_%(dtype_ten4)s * global_ten4,
const int out_s0, const int out_s1,
npy_%(dtype_z)s * global_out
const size_t stride0, const size_t stride1,
const size_t stride2, const size_t stride3,
const %(type_ten4)s * global_ten4, const size_t offset_ten4,
const size_t out_s0, const size_t out_s1,
%(type_z)s * global_out, const size_t offset_out
)
{
const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2;
global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4);
global_out = (%(type_z)s *)(((char *)global_out)+offset_out);
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
tblock<nb_batch*nb_stack*grid_c*grid_d;
......@@ -207,6 +242,17 @@ class GpuImages2Neibs(Images2Neibs, Op):
}
}
""" % locals()
params = [
'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
def c_code(self, node, name, inp, out, sub):
dtype_ten4 = node.inputs[0].dtype
......@@ -220,15 +266,21 @@ class GpuImages2Neibs(Images2Neibs, Op):
z, = out
fail = sub['fail']
mode = self.mode
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: *fptr: %%s.",
GpuKernel_error(fptr, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(z)s->ga);" % dict(z=z)
else:
cnda_thread_sync = ""
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return """
#ifndef CEIL_INTDIV
#define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
#endif
int grid_c = -1;
int grid_d = -1;
......@@ -281,10 +333,10 @@ class GpuImages2Neibs(Images2Neibs, Op):
PyGpuArray_DIMS(%(ten4)s)[3]);
%(fail)s;
}
grid_c = CEIL_INTDIV(((PyGpuArray_DIMS(%(ten4)s))[2]),
step_x);
grid_d = CEIL_INTDIV(((PyGpuArray_DIMS(%(ten4)s))[3]),
step_y);
grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]),
(size_t)step_x);
grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]),
(size_t)step_y);
}else if ( "%(mode)s" == "valid") {
......@@ -367,75 +419,57 @@ class GpuImages2Neibs(Images2Neibs, Op):
const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(d,c,1);
size_t threads_per_block[3] = {d, c, 1};
//Their is a max of 512 threads per blocks
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;
while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[1]>1)threads_per_block[1]--;
while(threads_per_block[0]*threads_per_block[1]>512 && threads_per_block[0]>1)threads_per_block[0]--;
//Make bigger block to have better memory access pattern and
//a higher core utilisation. for smaller patch size
while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 &&
n_threads.z<PyGpuArray_DIMS(%(z)s)[0]){
n_threads.z++;
while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 &&
threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){
threads_per_block[2]++;
}
int nb_block;
if (PyGpuArray_DIMS(%(z)s)[0] %% n_threads.z == 0)
nb_block = PyGpuArray_DIMS(%(z)s)[0] / n_threads.z;
if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0)
nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2];
else
nb_block = (PyGpuArray_DIMS(%(z)s)[0] / n_threads.z) + 1;
dim3 n_blocks(std::min(32*1024,nb_block));
int n_shared = 0;
void (*f)(int, int, int ,int,
int, int, int ,int,
int, int,
int, int, int, int,
npy_%(dtype_ten4)s*,
int, int,
npy_%(dtype_z)s*);
if(n_threads.x==d && n_threads.y==c){
f = k_multi_warp_less_%(name)s;
}else{
f = k_multi_warp_%(name)s;
}
nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1;
size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1};
f<<<n_blocks, n_threads, n_shared>>>(
nb_batch,
nb_stack,
height, width,
c, d, step_x, step_y,
grid_c, grid_d,
PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s,
PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s,
(npy_%(dtype_ten4)s*)(
((char *)cuda_get_ptr(%(ten4)s->ga.data)) +
%(ten4)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s,
(npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset)
);
%(cnda_thread_sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "GpuImages2Neibs:"
" Cuda error: %%s: %%s. (grid: %%i x %%i;"
" block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
n_shared);
%(fail)s;
GpuKernel *fptr;
if(threads_per_block[0]==d && threads_per_block[1]==c){
fptr = &k_multi_warp_less_%(name)s;
}else{
fptr = &k_multi_warp_%(name)s;
}
size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / %(itemsize_ten4)s;
size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / %(itemsize_ten4)s;
size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / %(itemsize_ten4)s;
size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / %(itemsize_ten4)s;
size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
void *kernel_params[] = {(void *)&nb_batch,
(void *)&nb_stack,
(void *)&height, (void *)&width,
(void *)&c, (void *)&d,
(void *)&step_x, (void *)&step_y,
(void *)&grid_c, (void *)&grid_d,
(void *)&stride_A0,
(void *)&stride_A1,
(void *)&stride_A2,
(void *)&stride_A3,
(void *)%(ten4)s->ga.data,
(void *)&%(ten4)s->ga.offset,
(void *)&stride_Z0,
(void *)&stride_Z1,
(void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset};
int err = GpuKernel_call(fptr, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
} // END NESTED SCOPE
""" % locals()
......
from __future__ import print_function
import numpy
import os
from theano import Op, Apply, config
from six import StringIO
......@@ -10,16 +11,15 @@ try:
except ImportError:
pass
from .basic_ops import as_gpuarray_variable
from .comp import NVCC_compiler
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
inline_softmax,
inline_softmax_fixed_shared)
from .fp16_help import work_dtype, load_w, write_w
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
"""
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
......@@ -41,10 +41,18 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am])
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>']
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
......@@ -54,28 +62,48 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
load_b = load_w(dtype_b)
write_x = write_w(dtype_x)
write_b = write_w(dtype_b)
return """
__global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N,
const npy_%(dtype_x)s* x_data, int xs0, int xs1,
const npy_%(dtype_b)s* b, int bs0,
const npy_%(dtype_y_idx)s* y_idx_data, int y_idxs0,
npy_%(dtype_x)s* nll_data, int nlls0,
npy_%(dtype_x)s* sm_data, int sms0, int sms1,
npy_%(dtype_y_idx)s* am_data, int ams0)
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx)
type_x = gpuarray.dtype_to_ctype(work_x)
type_b = gpuarray.dtype_to_ctype(work_b)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
kname = "k_xent_sm_1hot_bias"
k_var = "k_xent_sm_1hot_bias_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(const ga_size M, const ga_size N,
const %(type_x)s* x_data, const ga_size offset_x,
const ga_ssize xs0, const ga_ssize xs1,
const %(type_b)s* b, const ga_size offset_b,
const ga_ssize bs0,
const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx,
const ga_ssize y_idxs0,
%(type_x)s* nll_data, const ga_size offset_nll,
const ga_ssize nlls0,
%(type_x)s* sm_data, const ga_size offset_sm,
const ga_ssize sms0, const ga_ssize sms1,
%(type_y_idx)s* am_data, const ga_size offset_am,
const ga_ssize ams0)
{
x_data = (const %(type_x)s *)(((char *)x_data)+offset_x);
b = (const %(type_b)s *)(((char *)b)+offset_b);
y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx);
nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll);
sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm);
am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am);
for (int row = blockIdx.x; row < M; row += gridDim.x){
const npy_%(dtype_x)s* x = x_data + xs0 * row;
const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0];
npy_%(dtype_x)s* sm = sm_data + sms0 * row;
const %(type_x)s* x = x_data + xs0 * row;
const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0];
%(type_x)s* sm = sm_data + sms0 * row;
npy_%(work_x)s sum = 0.0;
%(type_x)s sum = 0.0;
int row_max_j = 0;
npy_%(work_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
%(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]);
for (int j = 1; j < N; ++j)
{
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
%(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
//todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
......@@ -83,16 +111,16 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
//compute the exp
for (int j = 0; j < N; ++j)
{
npy_%(work_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
npy_%(work_x)s sm_ij = exp(row_ij - row_max);
%(type_x)s row_ij = %(load_x)s(x[j*xs1]) +
%(load_b)s(b[j*bs0]);
%(type_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij;
sm[j * sms1] = %(write_x)s(sm_ij);
}
npy_%(work_x)s sum_inv = 1.0 / sum;
%(type_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j)
{
npy_%(work_x)s __tmp = %(load_x)s(sm[j * sms1]);
%(type_x)s __tmp = %(load_x)s(sm[j * sms1]);
__tmp *= sum_inv;
sm[j * sms1] = %(write_x)s(__tmp);
}
......@@ -111,12 +139,18 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
am_data[row*ams0] = row_max_j;
}
}
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
""" % locals()
def c_init_code(self):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
def c_code(self, node, nodename, inp, out, sub):
typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
......@@ -138,6 +172,21 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
dtype_am = node.outputs[2].dtype
classname = self.__class__.__name__
fail = sub['fail']
k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
sio = StringIO()
print("""
if (PyGpuArray_NDIM(%(y_idx)s) != 1)
......@@ -219,62 +268,47 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
}
}
{
int n_blocks = PyGpuArray_DIMS(%(x)s)[0] < 256 ? PyGpuArray_DIMS(%(x)s)[0] : 256;
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)256), 1, 1};
size_t threads_per_block[3] = {1, 1, 1};
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_NLL0 = PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_AM0 = PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s;
//TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1;
int n_shared_bytes = 0; //n_threads * sizeof(dtype);
k_xent_sm_1hot_bias_%(nodename)s<<<n_blocks, n_threads, n_shared_bytes>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(npy_%(dtype_nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) +
%(nll)s->ga.offset),
PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(npy_%(dtype_am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) +
%(am)s->ga.offset),
PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s);
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %(classname)s %(nodename)s: %%s.\\n"
"The kernel was launched with %%d threads,"
" %%d blocks and %%d shared memory\\n",
cudaGetErrorString(err),
n_threads, n_blocks, n_shared_bytes);
// no need to decref output vars the cleanup code will do it
%(fail)s;
}
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
(void *)&stride_B0,
(void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
(void *)&stride_YIDX0,
(void *)%(nll)s->ga.data, (void *)&%(nll)s->ga.offset,
(void *)&stride_NLL0,
(void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
(void *)&stride_SM0, (void *)&stride_SM1,
(void *)%(am)s->ga.data, (void *)&%(am)s->ga.offset,
(void *)&stride_AM0};
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}
""" % locals(), file=sio)
return sio.getvalue()
def c_code_cache_version(self):
return (6,)
def c_compiler(self):
return NVCC_compiler
return (7,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(Op):
class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
"""
Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
......@@ -294,13 +328,18 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self):
return (9,)
return (10,)
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_compiler(self):
return NVCC_compiler
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
......@@ -312,20 +351,36 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype
type_intp = gpuarray.dtype_to_ctype(numpy.intp)
dnll, sm, y_idx = inp
dx, = out
fail = sub['fail']
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(k_var)s: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return """
// Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
const npy_intp %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
PyGpuArray_DIMS(%(dnll)s)[0] :
(npy_intp) 0);
const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
PyGpuArray_DIMS(%(dnll)s)[0] :
(ssize_t) 0);
// Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
// or a vector with just one element.
const npy_intp %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
PyGpuArray_STRIDES(%(dnll)s)[0] :
(npy_intp) 0);
const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
PyGpuArray_STRIDES(%(dnll)s)[0] :
(ssize_t) 0);
if ((PyGpuArray_NDIM(%(dnll)s) > 1)
|| (PyGpuArray_NDIM(%(sm)s) != 2)
......@@ -373,48 +428,33 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
}
}
{
int n_blocks = PyGpuArray_DIMS(%(dx)s)[0] < 256 ? PyGpuArray_DIMS(%(dx)s)[0] : 256;
int n_threads = PyGpuArray_DIMS(%(dx)s)[1] < 256 ? PyGpuArray_DIMS(%(dx)s)[1] : 256;
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<n_blocks, n_threads>>>(
PyGpuArray_DIMS(%(dx)s)[0],
PyGpuArray_DIMS(%(dx)s)[1],
(npy_%(dtype_dnll)s*)(((char *)cuda_get_ptr(%(dnll)s->ga.data)) +
%(dnll)s->ga.offset),
%(dnll)s_strides0 / %(itemsize_dnll)s,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(npy_%(dtype_dx)s*)(((char *)cuda_get_ptr(%(dx)s->ga.data)) +
%(dx)s->ga.offset),
PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s,
PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s
);
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n"
"The kernel was launched with %%d threads and"
" %%d blocks\\n",
"kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
cudaGetErrorString(err), n_threads, n_blocks);
%(fail)s;
}
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1};
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1};
ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s;
ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s;
ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(dx)s)[0],
(void *)&PyGpuArray_DIMS(%(dx)s)[1],
(void *)%(dnll)s->ga.data, (void *)&%(dnll)s->ga.offset,
(void *)&stride_DNLL0,
(void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset,
(void *)&stride_SM0, (void *)&stride_SM1,
(void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset,
(void *)&stride_YIDX0,
(void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset,
(void *)&stride_DX0, (void *)&stride_DX1};
int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
%(err_check)s
%(sync)s
}
assert(%(dx)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_dnll = node.inputs[0].dtype
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
......@@ -423,18 +463,35 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
load_dnll = load_w(dtype_dnll)
load_sm = load_w(dtype_sm)
write_dx = write_w(dtype_dx)
return """
__global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
int N, int K,
const npy_%(dtype_dnll)s* dnll, const int dnll_s0,
const npy_%(dtype_sm)s* sm, const int sm_s0, const int sm_s1,
const npy_%(dtype_y_idx)s* y_idx, const int y_idx_s0,
npy_%(dtype_dx)s* dx, const int dx_s0, const int dx_s1)
flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx)
type_dnll = gpuarray.dtype_to_ctype(work_dnll)
type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx)
type_dx = gpuarray.dtype_to_ctype(dtype_dx)
kname = "kCrossEntropySoftmax1HotWithBiasDx"
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(
const ga_size N, const ga_size K,
const %(type_dnll)s* dnll, const ga_size offset_dnll,
const ga_ssize dnll_s0,
const %(type_sm)s* sm, const ga_size offset_sm,
const ga_ssize sm_s0, const ga_ssize sm_s1,
const %(type_y_idx)s* y_idx, const ga_size offset_y_idx,
const ga_ssize y_idx_s0,
%(type_dx)s* dx, const ga_size offset_dx,
const ga_ssize dx_s0, const ga_ssize dx_s1)
{
dnll = (const %(type_dnll)s *)(((char *)dnll)+offset_dnll);
sm = (const %(type_sm)s *)(((char *)sm)+offset_sm);
y_idx = (const %(type_y_idx)s *)(((char *)y_idx)+offset_y_idx);
dx = (%(type_dx)s *)(((char *)dx)+offset_dx);
for (int i = blockIdx.x; i < N; i += gridDim.x)
{
npy_%(work_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0];
%(type_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]);
%(type_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x)
{
......@@ -453,17 +510,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
}
}
}
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
""" % locals()
def c_init_code(self):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
""" % locals(), file=sio)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
return [Kernel(code=sio.getvalue(), name=kname, params=params,
flags=flags, objvar=k_var)]
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax (Op):
class GpuSoftmax (GpuKernelBase, Op):
"""
Implement Softmax on the gpu.
......@@ -482,12 +543,16 @@ class GpuSoftmax (Op):
def c_code_cache_version(self):
return (13,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
......@@ -502,10 +567,21 @@ class GpuSoftmax (Op):
x, = inp
z, = out
fail = sub['fail']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
else:
cnda_thread_sync = ""
sync = ""
return """
if (PyGpuArray_NDIM(%(x)s) != 2)
{
......@@ -528,97 +604,82 @@ class GpuSoftmax (Op):
}
}
{
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0],
(size_t)(32 * 1024));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32 * 1024)), 1, 1};
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512), 1, 1};
size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
const char *fmt_str, *msg;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GA_NO_ERROR;
if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{
//Those numbers are based on not too recent GPU
//to make them compatible with more GPU.
//TODO: read the information from the card.
if(n_shared_bytes < (32 * 1024 - 500)){
kSoftmax_%(nodename)s
<<<
n_blocks,
n_threads,
n_shared_bytes
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
if(shmem_sz < (32 * 1024 - 500)){
err = GpuKernel_call(&kSoftmax_%(nodename)s, 3,
threads_per_block, n_blocks, shmem_sz,
kernel_params);
fmt_str = "gpuarray error: kSoftmax_%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmax_%(nodename)s, err);
}else{
kSoftmax_fixed_shared%(nodename)s
<<<
n_blocks,
n_threads,
n_threads * sizeof(npy_%(work_x)s)
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
}
%(cnda_thread_sync)s
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n Used %%d blocks,"
" %%d threads %%d bytes of shared memory",
"kSoftmax[_fixed_shared]%(nodename)s",
cudaGetErrorString(err),
n_blocks, n_threads, n_shared_bytes);
%(fail)s;
err = GpuKernel_call(&kSoftmax_fixed_shared%(nodename)s, 3,
threads_per_block, n_blocks,
threads_per_block[0] * sizeof(npy_%(work_x)s),
kernel_params);
fmt_str = "gpuarray error: kSoftmax_fixed_shared%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmax_fixed_shared%(nodename)s, err);
}
%(err_check)s
%(sync)s
}
}
assert(%(z)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_sm = node.outputs[0].dtype
load_x = load_w(node.inputs[0].dtype)
load_x = load_w(dtype_x)
write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype)
ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
work_sm = work_dtype(dtype_sm)
flags = Kernel.get_flags(dtype_x, dtype_sm)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_sm = gpuarray.dtype_to_ctype(work_sm)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
kernels = []
kname = "kSoftmax"
k_var= "kSoftmax_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"npy_%(work_sm)s * buf2 = buf + N",
"extern __shared__ %s buf[]" % type_sm,
"%s * buf2 = buf + N" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
......@@ -626,21 +687,29 @@ class GpuSoftmax (Op):
'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmax_fixed_shared"
k_var= "kSoftmax_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"extern __shared__ %s buf[]" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
......@@ -649,12 +718,14 @@ class GpuSoftmax (Op):
"__syncthreads()",
"}",
])
return (ret1 + "\n" + ret2) % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (Op):
class GpuSoftmaxWithBias (GpuKernelBase, Op):
"""
Implement SoftmaxWithBias on the gpu.
......@@ -676,12 +747,18 @@ class GpuSoftmaxWithBias (Op):
def c_code_cache_version(self):
return (12,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
......@@ -698,10 +775,19 @@ class GpuSoftmaxWithBias (Op):
x, b = inp
z, = out
fail = sub['fail']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
else:
cnda_thread_sync = ""
sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
return """
if (PyGpuArray_NDIM(%(x)s) != 2)
{
......@@ -739,82 +825,51 @@ class GpuSoftmaxWithBias (Op):
}
}
{
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024));
size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024)), 1, 1};
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512), 1, 1};
size_t shmem_sz = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(work_x)s);
ssize_t stride_X0 = PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s;
ssize_t stride_B0 = PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s;
ssize_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s;
ssize_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s;
const char *fmt_str, *msg;
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(%(x)s)[0],
(void *)&PyGpuArray_DIMS(%(x)s)[1],
(void *)%(x)s->ga.data, (void *)&%(x)s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)%(b)s->ga.data, (void *)&%(b)s->ga.offset,
(void *)&stride_B0,
(void *)%(z)s->ga.data, (void *)&%(z)s->ga.offset,
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GA_NO_ERROR;
if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{
if(n_shared_bytes < (32 * 1024 - 500)){
kSoftmaxWithBias_%(nodename)s
<<<
n_blocks,
n_threads,
n_shared_bytes
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
if(shmem_sz < (32 * 1024 - 500)){
err = GpuKernel_call(&kSoftmaxWithBias_%(nodename)s, 3,
threads_per_block, n_blocks, shmem_sz,
kernel_params);
fmt_str = "gpuarray error: kSoftmaxWithBias_%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmaxWithBias_%(nodename)s, err);
}else{
kSoftmaxWithBias_fixed_shared%(nodename)s
<<<
n_blocks,
n_threads,
n_threads * sizeof(npy_%(work_x)s)
>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(
((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
);
err = GpuKernel_call(&kSoftmaxWithBias_fixed_shared%(nodename)s,
3, threads_per_block, n_blocks,
threads_per_block[0] * sizeof(npy_%(work_x)s),
kernel_params);
fmt_str = "gpuarray error: kSoftmaxWithBias_fixed_shared%(nodename)s: %%s";
msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err);
}
%(cnda_thread_sync)s
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"kSoftmaxWithBias_%(nodename)s",
cudaGetErrorString(err));
%(fail)s;
}
%(err_check)s
%(sync)s
}
}
assert(%(z)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_sm = node.outputs[0].dtype
......@@ -822,55 +877,80 @@ class GpuSoftmaxWithBias (Op):
load_b = load_w(node.inputs[1].dtype)
write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype)
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'const npy_%(dtype_b)s * b', 'const int sb0',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_b = gpuarray.dtype_to_ctype(dtype_b)
type_sm = gpuarray.dtype_to_ctype(work_sm)
params = [
'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp'
]
kernels = []
kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"npy_%(work_sm)s * buf2 = buf + N",
"extern __shared__ %s buf[]" % type_sm,
"%s * buf2 = buf + N" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])",
"buf[tx] += %(load_b)s(b[tx * sb0])",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf[tx] += %s(b[tx * sb0])" % load_b,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const npy_%(dtype_x)s * x',
'const int sx0', 'const int sx1',
'const npy_%(dtype_b)s * b', 'const int sb0',
'npy_%(dtype_sm)s * sm',
'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ npy_%(work_sm)s buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf',
'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1',
write_sm,
'threadIdx.x',
'blockDim.x',
'b', 'sb0', load_b,
work_sm),
"__syncthreads()",
"}",
])
return (ret1 + "\n" + ret2) % locals()
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmaxWithBias_fixed_shared"
k_var = "kSoftmaxWithBias_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ %s buf[]" % type_sm,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
'b', 'sb0', load_b, work_sm),
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
# Pre-instantiated op; presumably reused by graph rewrites elsewhere in the
# codebase rather than constructing a new GpuSoftmaxWithBias each time —
# TODO confirm against callers.
gpu_softmax_with_bias = GpuSoftmaxWithBias()
from __future__ import print_function
import copy
import numpy
import os
import theano
from theano import tensor, gof, Op
from theano import tensor, gof, Op, config
from six.moves import StringIO
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace
......@@ -15,7 +16,7 @@ except ImportError:
pass
from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable, HideC
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel)
from .elemwise import GpuElemwise
from .comp import NVCC_compiler
......@@ -159,7 +160,7 @@ class GpuSubtensor(HideC, Subtensor):
return (6,)
class GpuIncSubtensor(IncSubtensor):
class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
"""
Implement IncSubtensor on the gpu.
......@@ -177,6 +178,13 @@ class GpuIncSubtensor(IncSubtensor):
def _f16_ok(self):
return self.iadd_node.op._f16_ok
def c_header_dirs(self):
    """Extra include directories: the CUDA toolkit's include/ dir when
    ``config.cuda.root`` is configured, otherwise nothing."""
    root = config.cuda.root
    if not root:
        return []
    return [os.path.join(root, 'include')]
def c_headers(self):
    # Delegate: the headers needed are those of the wrapped add node's op.
    inner_op = self.iadd_node.op
    return inner_op.c_headers()
......@@ -186,6 +194,10 @@ class GpuIncSubtensor(IncSubtensor):
def c_init_code(self):
    # Delegate: module-init C code comes from the wrapped add node's op.
    inner_op = self.iadd_node.op
    return inner_op.c_init_code()
def gpu_kernels(self, node, nodename):
    """Reuse the kernels of the wrapped add node's op, compiled under a
    name derived from this node's name so they do not collide."""
    inner_name = "%s_add_to_zview" % nodename
    return self.iadd_node.op.gpu_kernels(self.iadd_node, inner_name)
def make_node(self, x, y, *inputs):
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
......@@ -486,7 +498,7 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
k(x[i], reshaped_y, broadcast=True)
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
"""
Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent.
......@@ -525,16 +537,24 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self):
    """Version tag for the C code cache.

    Bumped to (5,) for the libgpuarray-API rewrite so previously cached
    modules are recompiled.  The original block contained both
    ``return (4,)`` and ``return (5,)`` (diff residue); the second return
    was unreachable dead code — only the merged value is kept.
    """
    return (5,)
def c_headers(self):
    """Headers required by the generated C code.

    This op is CUDA-only: building on an OpenCL context is refused.
    The original block ended the returned list twice (pre- and post-merge
    lines left by diff residue), which is a SyntaxError; only the merged
    list — including <gpuarray/types.h> — is kept.
    """
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
            '<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_compiler(self):
    # Compile the generated code with nvcc (via NVCC_compiler) rather than
    # the default host compiler: the support code for this op is
    # CUDA-specific (see the 'cuda only' checks and setup_ext_cuda() in the
    # sibling methods).
    return NVCC_compiler
def c_header_dirs(self):
    """CUDA toolkit include directory for the generated code.

    CUDA-only op: raises MethodNotDefined on OpenCL contexts.  Returns
    ``[]`` (not the original implicit ``None``) when no CUDA root is
    configured, so callers that iterate over the directories do not crash;
    this also matches GpuIncSubtensor.c_header_dirs elsewhere in this file.
    """
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    cuda_root = config.cuda.root
    if cuda_root:
        return [os.path.join(cuda_root, 'include')]
    return []
def c_init_code(self):
    """Module-init C code; CUDA-only (raises MethodNotDefined on OpenCL)."""
    ctx_kind = pygpu.get_default_context().kind
    if ctx_kind == 'opencl':
        raise MethodNotDefined('cuda only')
    return ['setup_ext_cuda();']
def c_code(self, node, name, inputs, outputs, sub):
......@@ -569,7 +589,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
}
""" % locals()
def c_support_code_apply(self, node, nodename):
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
......@@ -578,7 +598,14 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize
return """
flags=Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind)
type_out = gpuarray.dtype_to_ctype(dtype_out)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """
/*
* This is a version of atomicAdd that works for half-floats. It may
......@@ -587,37 +614,43 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
* will not be modified.
*/
__device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
npy_uint32 *base = (npy_uint32 *)((size_t)addr & ~2);
npy_uint32 old, assumed, sum, new_;
__device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
ga_uint *base = (ga_uint *)((ga_size)addr & ~2);
ga_uint old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((size_t)addr & 2) ? 0x5410 : 0x3254);
__half2float((ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410);
return (ga_half)__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
}
__global__ void k_vector_add_fast(int numRowsX,
int numColsX,
int stridesX0,
int stridesX1,
npy_%(dtype_x)s *X,
int numRowsY,
int numColsY,
int stridesY0,
int stridesY1,
npy_%(dtype_y)s *Y,
int numIndices,
int stridesIndices,
npy_%(dtype_ind)s *indices_arr)
KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX,
const ga_ssize stridesX0,
const ga_ssize stridesX1,
%(type_x)s *X,
const ga_size offset_X,
const ga_size numRowsY,
const ga_size numColsY,
const ga_ssize stridesY0,
const ga_ssize stridesY1,
%(type_y)s *Y,
const ga_size offset_Y,
const ga_size numIndices,
const ga_ssize stridesIndices,
%(type_ind)s *indices_arr,
const ga_size offset_indices_arr)
{
X = (%(type_x)s *)(((char *)X)+offset_X);
Y = (%(type_y)s *)(((char *)Y)+offset_Y);
indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr);
for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
{
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
......@@ -631,41 +664,71 @@ __device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
}
return;
}
""" % locals()
params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'intp', gpuarray.GpuArray, 'uintp'
]
return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)]
def c_support_code_apply(self, node, nodename):
    """Emit a host-side C helper, GpuArray_vector_add_fast(), appended to the
    parent class's support code.

    The helper launches the k_vector_add_fast kernel (built by gpu_kernels,
    compiled under the name %(k_var)s) via GpuKernel_call, presumably to
    accumulate rows of py_other into rows of py_self selected by
    indices_arr — the kernel body lives in gpu_kernels; confirm there.

    Fixes applied to the original block:
    - Diff residue had left the pre-merge ``k_vector_add_fast<<<...>>>``
      launch (including an early ``return;``) inside the C string alongside
      the new GpuKernel_call path, producing duplicate/dead and invalid C;
      only the merged GpuKernel_call path is kept.
    - The ``sync`` template referenced ``%(z)s``, which is not defined in
      this method's ``locals()`` and would raise KeyError when
      ``config.gpuarray.sync`` is enabled; the array the kernel writes to
      is the C parameter ``py_self``, so it syncs on that instead.
    """
    dtype_x = node.inputs[0].dtype
    dtype_y = node.inputs[1].dtype
    dtype_ind = node.inputs[2].dtype
    dtype_out = node.outputs[0].dtype
    # libgpuarray strides are in bytes; the kernel expects element strides,
    # so the template divides by each dtype's itemsize.
    itemsize_x = numpy.dtype(dtype_x).itemsize
    itemsize_y = numpy.dtype(dtype_y).itemsize
    itemsize_ind = numpy.dtype(dtype_ind).itemsize
    # Computed for parity with gpu_kernels; not referenced in the template.
    itemsize_out = numpy.dtype(dtype_out).itemsize
    k_var = "k_vector_add_fast_" + nodename
    err_check = """
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "gpuarray error: %(k_var)s: %%s.",
                         GpuKernel_error(&%(k_var)s, err));
        }
    """ % locals()
    sync = ""
    if config.gpuarray.sync:
        sync = """
        err = GpuArray_sync(&py_self->ga);
        %(err_check)s
        """ % locals()
    return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
    void GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
                                  PyGpuArrayObject* py_other,
                                  PyGpuArrayObject *indices_arr)
    {
        size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
        size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
        if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
            ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s;
            ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s;
            ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s;
            ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s;
            ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s;
            void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
                                     (void *)&PyGpuArray_DIMS(py_self)[1],
                                     (void *)&stride_X0,
                                     (void *)&stride_X1,
                                     (void *)py_self->ga.data,
                                     (void *)&py_self->ga.offset,
                                     (void *)&PyGpuArray_DIMS(py_other)[0],
                                     (void *)&PyGpuArray_DIMS(py_other)[1],
                                     (void *)&stride_Y0,
                                     (void *)&stride_Y1,
                                     (void *)py_other->ga.data,
                                     (void *)&py_other->ga.offset,
                                     (void *)&PyGpuArray_DIMS(indices_arr)[0],
                                     (void *)&stride_ind,
                                     (void *)indices_arr->ga.data,
                                     (void *)&indices_arr->ga.offset};
            int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params);
            %(err_check)s
            %(sync)s
        }
    }
    """ % locals()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论