Merge pull request #6012 from abergeron/fix_offset

Fix offset problems in the new backend.

Merge pull request #6012 from abergeron/fix_offset
e5ba1b08 · Frédéric Bastien · GitHub · 8dcc5fc6 · a762b617 · e5ba1b08
--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -1630,7 +1630,9 @@ class GpuEye(GpuKernelBase, Op):

    def gpu_kernels(self, node, name):
        code = """
-KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m, ga_ssize k) {
+KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
+                ga_size n, ga_size m, ga_ssize k) {
+    a = (GLOBAL_MEM %(ctype)s *)(((char *)a) + a_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    ga_size nb = (ga_size) min(n - roff, m - coff);
@@ -1641,7 +1643,8 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m, ga_ssize k) {
            name=name, write_a=write_w(self.dtype))
        return [Kernel(
                code=code, name="eye",
-                params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE, gpuarray.SSIZE],
+                params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
+                        gpuarray.SIZE, gpuarray.SSIZE],
                flags=Kernel.get_flags(self.dtype),
                objvar='k_eye_' + name)]

@@ -1685,7 +1688,8 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m, ga_ssize k) {
        col_off = (size_t) (k > 0?k:0);
        row_off = (size_t) (k < 0?-k:0);
        if (row_off < dims[0] && col_off < dims[1]) {
-            err = eye_call(1, &gs, &ls, 0, %(z)s->ga.data, dims[0], dims[1], k);
+            err = eye_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
+                           dims[0], dims[1], k);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: kEye: %%s. n%%lu, m=%%lu.",
@@ -1702,4 +1706,4 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m, ga_ssize k) {
        return s

    def c_code_cache_version(self):
-        return (7,)
+        return (8,)
--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
@@ -552,8 +552,8 @@ class BaseGpuCorrMM(CGpuKernelBase):
        return [os.path.dirname(__file__)]

    def c_code_cache_version(self):
-        # Raise this whenever modifying the code below.
-        return (7,)
+        # Raise this whenever modifying the C code (including the file).
+        return (8,)

    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
        """

--- a/theano/gpuarray/corr3d_gemm.c
+++ b/theano/gpuarray/corr3d_gemm.c
 #section kernels

-#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, * : 
+#kernel dilated_im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
 // TODO check kernel flags
 // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
 // sources are clearly marked. Below we reproduce the original license of
@@ -35,14 +35,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // GPU kernel for the case of dilation
 KERNEL void dilated_im3d2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset,
+    // offset_im is the pointer offset for data_im.
+    // data_im_offset is an offset of elements in the array
    const ga_size height, const ga_size width, const ga_size depth,
    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
    const ga_size dilation_h, const ga_size dilation_w, const ga_size dilation_d,
    const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
-    GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
+    GLOBAL_MEM DTYPE_INPUT_0 * data_col,
+    const ga_size offset_col) {
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -80,16 +86,22 @@ KERNEL void dilated_im3d2col_kernel(const ga_size n,
  }
 }

-#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, * : 
+#kernel im3d2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
 KERNEL void im3d2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset,
+    // offset_im is the pointer offset for data_im.
+    // data_im_offset is an offset of elements in the array
    const ga_size height, const ga_size width, const ga_size depth,
    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
    const ga_size pad_h, const ga_size pad_w, const ga_size pad_d,
    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
-    GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
+    GLOBAL_MEM DTYPE_INPUT_0 * data_col,
+    const ga_size offset_col) {
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -126,9 +138,10 @@ KERNEL void im3d2col_kernel(const ga_size n,
 }

 // GPU kernel for the case of dilation
-#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : 
+#kernel dilated_col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
 KERNEL void dilated_col2im3d_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
+    const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size depth,
    const ga_size channels,
    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
@@ -137,7 +150,12 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset) {
+    // offset_im is the pointer offset for data_im.
+    // data_im_offset is an offset of elements in the array
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -188,9 +206,11 @@ KERNEL void dilated_col2im3d_kernel(const ga_size n,
  }
 }

-#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : 
+#kernel col2im3d_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+
 KERNEL void col2im3d_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
+    const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size depth,
    const ga_size channels,
    const ga_size kernel_h, const ga_size kernel_w, const ga_size kernel_d,
@@ -198,7 +218,12 @@ KERNEL void col2im3d_kernel(const ga_size n,
    const ga_size stride_h, const ga_size stride_w, const ga_size stride_d,
    const ga_size height_col, const ga_size width_col, const ga_size depth_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset) {
+    // offset_im is the pointer offset for data_im.
+    // data_im_offset is an offset of elements in the array
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -236,16 +261,50 @@ KERNEL void col2im3d_kernel(const ga_size n,
  }
 }

+#section support_code
+
+int rgemm(cb_order o, cb_transpose tA, cb_transpose tB,
+          size_t M, size_t N, size_t K, double alpha,
+          GpuArray *A, size_t offA, size_t lda,
+          GpuArray *B, size_t offB, size_t ldb,
+          double beta, GpuArray *C, size_t offC, size_t ldc) {
+  switch (A->typecode) {
+  case GA_FLOAT:
+    return gpublas_sgemm(o, tA, tB,
+                         M, N, K, alpha,
+                         A->data, (A->offset / 4) + offA, lda,
+                         B->data, (B->offset / 4) + offB, ldb,
+                         beta,
+                         C->data, (C->offset / 4) + offC, ldc);
+  case GA_DOUBLE:
+    return gpublas_dgemm(o, tA, tB,
+                         M, N, K, alpha,
+                         A->data, (A->offset / 8) + offA, lda,
+                         B->data, (B->offset / 8) + offB, ldb,
+                         beta,
+                         C->data, (C->offset / 8) + offC, ldc);
+  case GA_HALF:
+    return gpublas_hgemm(o, tA, tB,
+                         M, N, K, alpha,
+                         A->data, (A->offset / 2) + offA, lda,
+                         B->data, (B->offset / 2) + offB, ldb,
+                         beta,
+                         C->data, (C->offset / 2) + offC, ldc);
+  default:
+    return GA_UNSUPPORTED_ERROR;
+  }
+}
+
 #section support_code_struct

 int im3d2col(
-    gpudata * data_im, const size_t data_im_offset, const size_t channels,
+    GpuArray *data_im, const size_t data_im_offset, const size_t channels,
    const size_t height, const size_t width, const size_t depth,
    const size_t kernel_h, const size_t kernel_w, const size_t kernel_d,
    const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
    const size_t pad_h, const size_t pad_w, const size_t pad_d,
    const size_t stride_h, const size_t stride_w, const size_t stride_d,
-    gpudata * data_col) {
+    GpuArray *data_col) {
  // We are going to launch channels * height_col * width_col * depth_col
  // kernels, each kernel responsible for copying a single-channel grid.
  size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
@@ -259,10 +318,11 @@ int im3d2col(
  if (dilation_h != 1 || dilation_w != 1 || dilation_d != 1) {
    err = dilated_im3d2col_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_im, data_im_offset, height, width, depth,
+      num_kernels, data_im->data, data_im->offset,
+      data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d,
      pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col,
-      width_col, depth_col, data_col);
+      width_col, depth_col, data_col->data, data_col->offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: dilated_im3d2col_kernel: %s.",
@@ -271,10 +331,11 @@ int im3d2col(
  } else {
    err = im3d2col_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_im, data_im_offset, height, width, depth,
+      num_kernels, data_im->data, data_im->offset,
+      data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, height_col, width_col, depth_col,
-      data_col);
+      data_col->data, data_col->offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: im3d2col_kernel: %s.",
@@ -284,13 +345,13 @@ int im3d2col(
  return err;
 }

-int col2im3d(gpudata * data_col, const size_t channels,
+int col2im3d(GpuArray *data_col, const size_t channels,
    const size_t height, const size_t width, const size_t depth,
    const size_t patch_h, const size_t patch_w, const size_t patch_d,
    const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
    const size_t pad_h, const size_t pad_w, const size_t pad_d,
    const size_t stride_h, const size_t stride_w, const size_t stride_d,
-    gpudata * data_im, const size_t data_im_offset) {
+    GpuArray *data_im, const size_t data_im_offset) {
  size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
  size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
  size_t dil_patch_d = (patch_d - 1) * dilation_d + 1;
@@ -304,10 +365,11 @@ int col2im3d(gpudata * data_col, const size_t channels,
  if (dilation_h != 1 || dilation_w != 1 || dilation_d != 1) {
    err = dilated_col2im3d_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
+      num_kernels, data_col->data, data_col->offset,
+      height, width, depth, channels, patch_h, patch_w,
      patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, height_col, width_col, depth_col,
-      data_im, data_im_offset);
+      data_im->data, data_im->offset, data_im_offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: dilated_col2im3d_kernel: %s.",
@@ -317,9 +379,11 @@ int col2im3d(gpudata * data_col, const size_t channels,
  else{
    err = col2im3d_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
+      num_kernels, data_col->data, data_col->offset,
+      height, width, depth, channels, patch_h, patch_w,
      patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d,
-      height_col, width_col, depth_col, data_im, data_im_offset);
+      height_col, width_col, depth_col,
+      data_im->data, data_im->offset, data_im_offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: col2im3d_kernel: %s.",
@@ -503,42 +567,20 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
            err = im3d2col(
-              bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
+              &bottom->ga, n * bottom_stride, nChannels, bottomHeight,
              bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
-              padH, padW, padD, dH, dW, dD, col->ga.data);
+              padH, padW, padD, dH, dW, dD, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
-            switch (col->ga.typecode) {
-            case GA_FLOAT:
-              err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                                  N_, M_, K_, 1,
-                                  col->ga.data, 0, N_,
-                                  weight->ga.data, 0, K_,
-                                  0,
-                                  top->ga.data, n * top_stride, N_);
-              break;
-            case GA_DOUBLE:
-              err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                                  N_, M_, K_, 1,
-                                  col->ga.data, 0, N_,
-                                  weight->ga.data, 0, K_,
-                                  0,
-                                  top->ga.data, n * top_stride, N_);
-              break;
-            case GA_HALF:
-              err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                                  N_, M_, K_, 1,
-                                  col->ga.data, 0, N_,
-                                  weight->ga.data, 0, K_,
-                                  0,
-                                  top->ga.data, n * top_stride, N_);
-              break;
-            default:
-              err = GA_UNSUPPORTED_ERROR;
-            }
+            err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
+                        N_, M_, K_, 1,
+                        &col->ga, 0, N_,
+                        &weight->ga, 0, K_,
+                        0,
+                        &top->ga, n * top_stride, N_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM forward encountered an error running gemm.");
@@ -565,9 +607,9 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
            err = im3d2col(
-              bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
+              &bottom->ga, n * bottom_stride, nChannels, bottomHeight,
              bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
-              padH, padW, padD, dH, dW, dD, col->ga.data);
+              padH, padW, padD, dH, dW, dD, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
@@ -576,34 +618,12 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
-            switch (col->ga.typecode) {
-            case GA_FLOAT:
-              err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
-                                  K_, M_, N_, 1,
-                                  col->ga.data, 0, N_,
-                                  top->ga.data, n * top_stride, N_,
-                                  (n == 0) ? 0 : 1,
-                                  weight->ga.data, 0, K_);
-              break;
-            case GA_DOUBLE:
-              err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
-                                  K_, M_, N_, 1,
-                                  col->ga.data, 0, N_,
-                                  top->ga.data, n * top_stride, N_,
-                                  (n == 0) ? 0 : 1,
-                                  weight->ga.data, 0, K_);
-              break;
-            case GA_HALF:
-              err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
-                                  K_, M_, N_, 1,
-                                  col->ga.data, 0, N_,
-                                  top->ga.data, n * top_stride, N_,
-                                  (n == 0) ? 0 : 1,
-                                  weight->ga.data, 0, K_);
-              break;
-            default:
-              err = GA_UNSUPPORTED_ERROR;
-            }
+            err = rgemm(cb_fortran, cb_trans, cb_no_trans,
+                        K_, M_, N_, 1,
+                        &col->ga, 0, N_,
+                        &top->ga, n * top_stride, N_,
+                        (n == 0) ? 0 : 1,
+                        &weight->ga, 0, K_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad weights encountered an error running gemm.");
@@ -638,34 +658,12 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
          // gemm into columns
-          switch (top->ga.typecode) {
-          case GA_FLOAT:
-            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
-                                N_, K_, M_, 1,
-                                top->ga.data, n * top_stride, N_,
-                                weight->ga.data, 0, K_,
-                                0,
-                                col->ga.data, 0, N_);
-            break;
-          case GA_DOUBLE:
-            err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
-                                N_, K_, M_, 1,
-                                top->ga.data, n * top_stride, N_,
-                                weight->ga.data, 0, K_,
-                                0,
-                                col->ga.data, 0, N_);
-            break;
-          case GA_HALF:
-            err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
-                                N_, K_, M_, 1,
-                                top->ga.data, n * top_stride, N_,
-                                weight->ga.data, 0, K_,
-                                0,
-                                col->ga.data, 0, N_);
-            break;
-          default:
-            err = GA_UNSUPPORTED_ERROR;
-          }
+          err = rgemm(cb_fortran, cb_no_trans, cb_trans,
+                      N_, K_, M_, 1,
+                      &top->ga, n * top_stride, N_,
+                      &weight->ga, 0, K_,
+                      0,
+                      &col->ga, 0, N_);
          if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuCorr3dMM grad inputs encountered an error running gemm.");
@@ -673,10 +671,10 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
            return NULL;
          }
          // col2im3d back to the data
-          err = col2im3d(col->ga.data, nChannels,
+          err = col2im3d(&col->ga, nChannels,
                         bottomHeight, bottomWidth, bottomDepth,
                         kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
-                         dH, dW, dD, bottom->ga.data, n * bottom_stride);
+                         dH, dW, dD, &bottom->ga, n * bottom_stride);
          if (err != GA_NO_ERROR) {
            Py_DECREF(col);
            return NULL;

--- a/theano/gpuarray/corr_gemm.c
+++ b/theano/gpuarray/corr_gemm.c
 #section kernels

-#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, * : 
+#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
 // TODO check kernel flags
 // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
 // sources are clearly marked. Below we reproduce the original license of
@@ -35,14 +35,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // GPU kernel for the case of dilation
 KERNEL void dilated_im2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset,
+    // offset_im is the pointer offset for data_im.
+    // data_im_offset is an offset of elements in the array
    const ga_size height, const ga_size width,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size dilation_h, const ga_size dilation_w,
    const ga_size pad_h, const ga_size pad_w,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
-    GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
+    GLOBAL_MEM DTYPE_INPUT_0 * data_col,
+    const ga_size offset_col) {
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -70,16 +76,23 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
  }
 }

-#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, * : 
+#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
+
 KERNEL void im2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset,
+    // offset_im is the pointer offset for data_im.
+    // data_im_offset is an offset of elements in the array
    const ga_size height, const ga_size width,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size pad_h, const ga_size pad_w,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
-    GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
+    GLOBAL_MEM DTYPE_INPUT_0 * data_col,
+    const ga_size offset_col) {
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -108,9 +121,9 @@ KERNEL void im2col_kernel(const ga_size n,
 }

 // GPU kernel for the case of dilation
-#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : 
+#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
 KERNEL void dilated_col2im_kernel(const ga_size n,
-    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
+    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size channels,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size dilation_h, const ga_size dilation_w,
@@ -118,7 +131,12 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset) {
+    // offset_im is the pointer offset for data_im.
+    // data_im_offset is an offset of elements in the array
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -153,16 +171,22 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
  }
 }

-#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, *, size : 
+#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
+
 KERNEL void col2im_kernel(const ga_size n,
-    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
+    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size channels,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size pad_h, const ga_size pad_w,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset) {
+    // offset_im is the pointer offset for data_im.
+    // data_im_offset is an offset of elements in the array
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -191,16 +215,48 @@ KERNEL void col2im_kernel(const ga_size n,
  }
 }

+#section support_code

+int rgemm(cb_order o, cb_transpose tA, cb_transpose tB,
+          size_t M, size_t N, size_t K, double alpha,
+          GpuArray *A, size_t offA, size_t lda,
+          GpuArray *B, size_t offB, size_t ldb,
+          double beta, GpuArray *C, size_t offC, size_t ldc) {
+  switch (A->typecode) {
+  case GA_FLOAT:
+    return gpublas_sgemm(o, tA, tB,
+                         M, N, K, alpha,
+                         A->data, (A->offset / 4) + offA, lda,
+                         B->data, (B->offset / 4) + offB, ldb,
+                         beta,
+                         C->data, (C->offset / 4) + offC, ldc);
+  case GA_DOUBLE:
+    return gpublas_dgemm(o, tA, tB,
+                         M, N, K, alpha,
+                         A->data, (A->offset / 8) + offA, lda,
+                         B->data, (B->offset / 8) + offB, ldb,
+                         beta,
+                         C->data, (C->offset / 8) + offC, ldc);
+  case GA_HALF:
+    return gpublas_hgemm(o, tA, tB,
+                         M, N, K, alpha,
+                         A->data, (A->offset / 2) + offA, lda,
+                         B->data, (B->offset / 2) + offB, ldb,
+                         beta,
+                         C->data, (C->offset / 2) + offC, ldc);
+  default:
+    return GA_UNSUPPORTED_ERROR;
+  }
+}

 #section support_code_struct

-int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
+int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels,
    const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
    const size_t dilation_h, const size_t dilation_w,
    const size_t pad_h, const size_t pad_w,
    const size_t stride_h, const size_t stride_w,
-    gpudata * data_col) {
+    GpuArray *data_col) {
  // We are going to launch channels * height_col * width_col kernels, each
  // kernel responsible for copying a single-channel grid.
  size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
@@ -212,9 +268,10 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
  if (dilation_h != 1 || dilation_w != 1) {
    err = dilated_im2col_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
+      num_kernels, data_im->data, data_im->offset, data_im_offset,
+      height, width, kernel_h, kernel_w,
      dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
-      width_col, data_col);
+      width_col, data_col->data, data_col->offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: dilated_im2col_kernel: %s.",
@@ -223,9 +280,10 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
  } else {
    err = im2col_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
+      num_kernels, data_im->data, data_im->offset, data_im_offset,
+      height, width, kernel_h, kernel_w,
      pad_h, pad_w, stride_h, stride_w, height_col,
-      width_col, data_col);
+      width_col, data_col->data, data_col->offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: im2col_kernel: %s.",
@@ -235,11 +293,11 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
  return err;
 }

-int col2im(gpudata * data_col, const size_t channels,
+int col2im(GpuArray *data_col, const size_t channels,
    const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
    const size_t dilation_h, const size_t dilation_w,
    const size_t pad_h, const size_t pad_w, const size_t stride_h,
-    const size_t stride_w, gpudata * data_im, const size_t data_im_offset) {
+    const size_t stride_w, GpuArray *data_im, const size_t data_im_offset) {
  size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
  size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
  size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
@@ -251,9 +309,10 @@ int col2im(gpudata * data_col, const size_t channels,
  if (dilation_h != 1 || dilation_w != 1) {
    err = dilated_col2im_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_col, height, width, channels, patch_h, patch_w,
+      num_kernels, data_col->data, data_col->offset,
+      height, width, channels, patch_h, patch_w,
      dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
-      height_col, width_col, data_im, data_im_offset);
+      height_col, width_col, data_im->data, data_im->offset, data_im_offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: dilated_col2im_kernel: %s.",
@@ -262,9 +321,10 @@ int col2im(gpudata * data_col, const size_t channels,
  } else {
    err = col2im_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_col, height, width, channels, patch_h, patch_w,
+      num_kernels, data_col->data, data_col->offset,
+      height, width, channels, patch_h, patch_w,
      pad_h, pad_w, stride_h, stride_w,
-      height_col, width_col, data_im, data_im_offset);
+      height_col, width_col, data_im->data, data_im->offset, data_im_offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: col2im_kernel: %s.",
@@ -433,43 +493,21 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
-            err = im2col(bottom->ga.data, n * bottom_stride,
-                         nChannels, bottomHeight,
-                         bottomWidth, kH, kW, dilH, dilW,
-                         padH, padW, dH, dW, col->ga.data);
+	  err = im2col(&bottom->ga, n * bottom_stride,
+		       nChannels, bottomHeight,
+		       bottomWidth, kH, kW, dilH, dilW,
+		       padH, padW, dH, dW, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
-            switch (col->ga.typecode) {
-            case GA_FLOAT:
-              err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                                  N_, M_, K_, 1,
-                                  col->ga.data, 0, N_,
-                                  weight->ga.data, 0, K_,
-                                  0,
-                                  top->ga.data, n * top_stride, N_);
-              break;
-            case GA_DOUBLE:
-              err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                                  N_, M_, K_, 1,
-                                  col->ga.data, 0, N_,
-                                  weight->ga.data, 0, K_,
-                                  0,
-                                  top->ga.data, n * top_stride, N_);
-              break;
-            case GA_HALF:
-              err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                                  N_, M_, K_, 1,
-                                  col->ga.data, 0, N_,
-                                  weight->ga.data, 0, K_,
-                                  0,
-                                  top->ga.data, n * top_stride, N_);
-              break;
-            default:
-              err = GA_UNSUPPORTED_ERROR;
-            }
+            err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
+                        N_, M_, K_, 1,
+                        &col->ga, 0, N_,
+                        &weight->ga, 0, K_,
+                        0,
+                        &top->ga, n * top_stride, N_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM forward encountered an error running gemm: %d", err);
@@ -495,10 +533,10 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
-            err = im2col(bottom->ga.data, n * bottom_stride,
+            err = im2col(&bottom->ga, n * bottom_stride,
                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
-                         padH, padW, dH, dW, col->ga.data);
+                         padH, padW, dH, dW, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
@@ -507,34 +545,12 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
-            switch (col->ga.typecode) {
-            case GA_FLOAT:
-              err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
-                                  K_, M_, N_, 1,
-                                  col->ga.data, 0, N_,
-                                  top->ga.data, n * top_stride, N_,
-                                  (n == 0) ? 0 : 1,
-                                  weight->ga.data, 0, K_);
-              break;
-            case GA_DOUBLE:
-              err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
-                                  K_, M_, N_, 1,
-                                  col->ga.data, 0, N_,
-                                  top->ga.data, n * top_stride, N_,
-                                  (n == 0) ? 0 : 1,
-                                  weight->ga.data, 0, K_);
-              break;
-            case GA_HALF:
-              err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
-                                  K_, M_, N_, 1,
-                                  col->ga.data, 0, N_,
-                                  top->ga.data, n * top_stride, N_,
-                                  (n == 0) ? 0 : 1,
-                                  weight->ga.data, 0, K_);
-              break;
-            default:
-                err = GA_UNSUPPORTED_ERROR;
-            }
+            err = rgemm(cb_fortran, cb_trans, cb_no_trans,
+                        K_, M_, N_, 1,
+                        &col->ga, 0, N_,
+                        &top->ga, n * top_stride, N_,
+                        (n == 0) ? 0 : 1,
+                        &weight->ga, 0, K_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad weights encountered an error running gemm: %d", err);
@@ -559,35 +575,13 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // full convolution: gemm, then col2im
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
-            // gemm into columns
-          switch (top->ga.typecode) {
-          case GA_FLOAT:
-            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
-                                N_, K_, M_, 1,
-                                top->ga.data, n * top_stride, N_,
-                                weight->ga.data, 0, K_,
-                                0,
-                                col->ga.data, 0, N_);
-            break;
-          case GA_DOUBLE:
-            err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
-                                N_, K_, M_, 1,
-                                top->ga.data, n * top_stride, N_,
-                                weight->ga.data, 0, K_,
-                                0,
-                                col->ga.data, 0, N_);
-            break;
-          case GA_HALF:
-            err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
-                                N_, K_, M_, 1,
-                                top->ga.data, n * top_stride, N_,
-                                weight->ga.data, 0, K_,
-                                0,
-                                col->ga.data, 0, N_);
-            break;
-          default:
-            err = GA_UNSUPPORTED_ERROR;
-          }
+          // gemm into columns
+          err = rgemm(cb_fortran, cb_no_trans, cb_trans,
+                      N_, K_, M_, 1,
+                      &top->ga, n * top_stride, N_,
+                      &weight->ga, 0, K_,
+                      0,
+                      &col->ga, 0, N_);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
@@ -595,9 +589,9 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                return NULL;
            }
            // col2im back to the data
-            err = col2im(col->ga.data, nChannels, bottomHeight, bottomWidth,
+            err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
                         kH, kW, dilH, dilW, padH, padW,
-                         dH, dW, bottom->ga.data, n * bottom_stride);
+                         dH, dW, &bottom->ga, n * bottom_stride);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;

--- a/theano/gpuarray/extra_ops.py
+++ b/theano/gpuarray/extra_ops.py
@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op):
        return hash(self.axis) ^ hash(self.mode)

    def c_code_cache_version(self):
-        return (5,)
+        return (6,)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
@@ -69,11 +69,9 @@ class GpuCumOp(GpuKernelBase, Op):
        code = """
        KERNEL void %(kname)s(float* input, ga_size input_offset,
                              float* output, ga_size output_offset,
-                              ga_ssize inputStrides_x,
-                              ga_ssize inputStrides_y,
-                              ga_ssize inputStrides_z,
-                              ga_ssize outputStrides_x, ga_ssize outputStrides_y,
-                              ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
+                              ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
+                              ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z,
+                              const int offsetY, const int offsetZ,
                              const int beforeLastElementIdx, const int lastElementIdx){
            input = (float *)(((char *)input) + input_offset);
            output = (float *)(((char *)output) + output_offset);
@@ -216,6 +214,7 @@ class GpuCumOp(GpuKernelBase, Op):

            output = (float *)(((char *)output) + output_offset);
            blockSum = (float *)(((char *)blockSum) + blockSum_offset);
+
            int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;

            // Check if current has data to process.
@@ -397,23 +396,8 @@ class GpuCumOp(GpuKernelBase, Op):
                    size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
                    size_t dimBlock[3] = {dimBlockX, 1, 1};  // One cum op per block.
                    size_t sharedBytes = (2*dimBlockX) * sizeof(float);
-                    void* kernel_params[] = {(void*) input->ga.data,
-                                             (void*) &(input->ga.offset),
-                                             (void*) output->ga.data,
-                                             (void*) &(output->ga.offset),
-                                             (void*) &nbElementsPerCumOp,
-                                             (void*) &inputStrides_x,
-                                             (void*) &inputStrides_y,
-                                             (void*) &inputStrides_z,
-                                             (void*) &outputStrides_x,
-                                             (void*) &outputStrides_y,
-                                             (void*) &outputStrides_z,
-                                             (void*) &offsetY,
-                                             (void*) &offsetZ,
-                                             (void*) deviceBlockSum->ga.data,
-                                             (void*) &(deviceBlockSum->ga.offset)
-                        };
-                    int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
+
+                    int err = k_blockCumOp_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, nbElementsPerCumOp, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, deviceBlockSum->ga.data, deviceBlockSum->ga.offset);
                    if (err != GA_NO_ERROR){
                        PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
                        return -1;
@@ -429,18 +413,8 @@ class GpuCumOp(GpuKernelBase, Op):
                        //  report partial cum ops of previous blocks to subsequents ones.
                        size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
                        size_t dimBlock[3] = {dimBlockX, 1, 1};
-                        void* kernel_params[] = {(void*) output->ga.data,
-                                                 (void*) &(output->ga.offset),
-                                                 (void*) deviceBlockSum->ga.data,
-                                                 (void*) &(deviceBlockSum->ga.offset),
-                                                 (void*) &nbElementsPerCumOp,
-                                                 (void*) &outputStrides_x,
-                                                 (void*) &outputStrides_y,
-                                                 (void*) &outputStrides_z,
-                                                 (void*) &offsetY,
-                                                 (void*) &offsetZ
-                            };
-                        int err = GpuKernel_call(&k_finalCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
+
+                        int err = k_finalCumOp_call(3, dimGrid, dimBlock, sharedBytes, output->ga.data, output->ga.offset, deviceBlockSum->ga.data, deviceBlockSum->ga.offset, nbElementsPerCumOp, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
                        if (err != GA_NO_ERROR){
                            PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
                            return -1;
@@ -450,24 +424,8 @@ class GpuCumOp(GpuKernelBase, Op):
                    if (shape[axis] != nbElementsPerCumOp){
                        size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
                        size_t dimBlock[3] = {1, 1, 1};
-                        size_t tmp0 = shape[axis]-2;
-                        size_t tmp1 = shape[axis]-1;
-                        void* kernel_params[] = {(void*) input->ga.data,
-                                                 (void*) &(input->ga.offset),
-                                                 (void*) output->ga.data,
-                                                 (void*) &(output->ga.offset),
-                                                 (void*) &inputStrides_x,
-                                                 (void*) &inputStrides_y,
-                                                 (void*) &inputStrides_z,
-                                                 (void*) &outputStrides_x,
-                                                 (void*) &outputStrides_y,
-                                                 (void*) &outputStrides_z,
-                                                 (void*) &offsetY,
-                                                 (void*) &offsetZ,
-                                                 (void*) &(tmp0),
-                                                 (void*) &(tmp1)
-                        };
-                        int err = GpuKernel_call(&k_cumadd_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
+
+                        int err = k_cumadd_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, shape[axis] - 2, shape[axis] - 1);
                        if (err != GA_NO_ERROR){
                            PyErr_SetString(PyExc_RuntimeError, "cumadd call failed");
                            return -1;

--- a/theano/gpuarray/multinomial.py
+++ b/theano/gpuarray/multinomial.py
@@ -71,16 +71,22 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
 KERNEL void k_multi_warp_multinomial(
    const ga_size nb_multi,
    const ga_size nb_outcomes,
-    GLOBAL_MEM %(in_ctype)s * global_pvals,
+    GLOBAL_MEM %(in_ctype)s *global_pvals,
+    const ga_size global_pvals_offset,
    const ga_ssize pvals_row_stride,
    const ga_ssize pvals_col_stride,
-    GLOBAL_MEM %(in_ctype)s * global_unis,
+    GLOBAL_MEM %(in_ctype)s *global_unis,
+    const ga_size global_unis_offset,
    const ga_ssize unis_stride,
-    GLOBAL_MEM %(out_ctype)s * global_outs,
+    GLOBAL_MEM %(out_ctype)s *global_outs,
+    const ga_size global_outs_offset,
    const ga_ssize outs_row_stride,
    const ga_ssize outs_col_stride
 )
 {
+    global_pvals = (GLOBAL_MEM %(in_ctype)s *)(((char *)global_pvals) + global_pvals_offset);
+    global_unis = (GLOBAL_MEM %(in_ctype)s *)(((char *)global_unis) + global_unis_offset);
+    global_outs = (GLOBAL_MEM %(out_ctype)s *)(((char *)global_outs) + global_outs_offset);
    // each thread takes care of one multinomial draw
    int n = LDIM_0*GID_0 + LID_0;
    if (n < nb_multi)
@@ -113,11 +119,14 @@ KERNEL void k_multi_warp_multinomial(
            params=[pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.GpuArray,
+                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
+                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
+                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE],
            flags=Kernel.get_flags(node.outputs[0].dtype),
@@ -193,27 +202,8 @@ KERNEL void k_multi_warp_multinomial(

        assert(nb_blocks*nb_threads >= nb_multi);

-        void *args[10];
-        ssize_t strides[5] = {
-            PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(in_typecode)s),
-            PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(in_typecode)s),
-            PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(in_typecode)s),
-            PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s),
-            PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s)
-        };
-        int err;
-        args[0] = (void*)&PyGpuArray_DIMS(out)[1];
-        args[1] = (void*)&PyGpuArray_DIMS(out)[0];
-        args[2] = pvals->ga.data; //PyGpuArray_DEV_DATA(pvals);
-        args[3] = (void*)&strides[0];
-        args[4] = (void*)&strides[1];
-        args[5] = unis->ga.data; //PyGpuArray_DEV_DATA(unis);
-        args[6] = (void*)&strides[2];
-        args[7] = out->ga.data; //PyGpuArray_DEV_DATA(out);
-        args[8] = (void*)&strides[3];
-        args[9] = (void*)&strides[4];
-
-        err = GpuKernel_call(&%(kname)s, 1, &nb_blocks, &nb_threads, 0, args);
+        int err = k_multi_warp_multinomial_call(1, &nb_blocks, &nb_threads, 0,  PyGpuArray_DIMS(out)[1], PyGpuArray_DIMS(out)[0], pvals->ga.data, pvals->ga.offset, PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(in_typecode)s), PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(in_typecode)s), unis->ga.data, unis->ga.offset, PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(in_typecode)s), out->ga.data, out->ga.offset, PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s), PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s));
+
        if (err != GA_NO_ERROR) {
           PyErr_Format(
                PyExc_RuntimeError,
@@ -230,7 +220,7 @@ KERNEL void k_multi_warp_multinomial(
        return s

    def c_code_cache_version(self):
-        return (3,)
+        return (4,)


 class GPUAChoiceFromUniform(GpuKernelBase, Op):
@@ -295,15 +285,21 @@ KERNEL void k_multi_warp_multinomial_wor(
    const ga_size nb_outcomes,
    const ga_size n_samples,
    GLOBAL_MEM float * global_pvals_copy,
+    const ga_size global_pvals_offset,
    const ga_ssize pvals_row_stride,
    const ga_ssize pvals_col_stride,
    GLOBAL_MEM float * global_unis,
+    const ga_size global_unis_offset,
    const ga_ssize unis_stride,
    GLOBAL_MEM ga_long * global_outs,
+    const ga_size global_outs_offset,
    const ga_ssize outs_row_stride,
    const ga_ssize outs_col_stride
 )
 {
+    global_pvals_copy = (GLOBAL_MEM float *)(((char *)global_pvals_copy) + global_pvals_offset);
+    global_unis = (GLOBAL_MEM float *)(((char *)global_unis) + global_unis_offset);
+    global_outs = (GLOBAL_MEM ga_long *)(((char *)global_outs) + global_outs_offset);
    // each thread takes care of one multinomial-wor n_samples-draw
    int n = LDIM_0*GID_0 + LID_0;

@@ -344,11 +340,14 @@ KERNEL void k_multi_warp_multinomial_wor(
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.GpuArray,
+                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
+                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
+                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE
                    ],
@@ -438,28 +437,7 @@ KERNEL void k_multi_warp_multinomial_wor(

        assert(nb_blocks*nb_threads >= nb_multi);

-        void *args[11];
-        ssize_t strides[5] = {
-            PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
-            PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
-            PyGpuArray_STRIDES(unis)[0]/sizeof(float),
-            PyGpuArray_STRIDES(out)[0]/8,
-            PyGpuArray_STRIDES(out)[1]/8
-        };
-        int err;
-        args[0] = (void*)&PyGpuArray_DIMS(pvals)[0];
-        args[1] = (void*)&PyGpuArray_DIMS(pvals)[1];
-        args[2] = (void*)&n_samples;
-        args[3] = pvals_copy->ga.data; //PyGpuArray_DEV_DATA(pvals);
-        args[4] = (void*)&strides[0];
-        args[5] = (void*)&strides[1];
-        args[6] = unis->ga.data; //PyGpuArray_DEV_DATA(unis);
-        args[7] = (void*)&strides[2];
-        args[8] = out->ga.data; //PyGpuArray_DEV_DATA(out);
-        args[9] = (void*)&strides[3];
-        args[10] = (void*)&strides[4];
-
-        err = GpuKernel_call(&%(kname)s, 1, &nb_blocks, &nb_threads, 0, args);
+        int err = k_multi_warp_multinomial_wor_call(1, &nb_blocks, &nb_threads, 0, PyGpuArray_DIMS(pvals)[0], PyGpuArray_DIMS(pvals)[1], n_samples, pvals_copy->ga.data, pvals_copy->ga.offset, PyGpuArray_STRIDES(pvals)[0]/sizeof(float), PyGpuArray_STRIDES(pvals)[1]/sizeof(float), unis->ga.data, unis->ga.offset, PyGpuArray_STRIDES(unis)[0]/sizeof(float), out->ga.data, out->ga.offset, PyGpuArray_STRIDES(out)[0]/8, PyGpuArray_STRIDES(out)[1]/8);
        if (err != GA_NO_ERROR) {
           PyErr_Format(
                PyExc_RuntimeError,
@@ -477,7 +455,7 @@ KERNEL void k_multi_warp_multinomial_wor(
        return s

    def c_code_cache_version(self):
-        return (7,)
+        return (8,)


 @register_opt('fast_compile')

--- a/theano/gpuarray/pool.c
+++ b/theano/gpuarray/pool.c
 #section kernels

-#kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, * :
+#kernel max_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, *, size :

 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
   const ga_size pooled_width, const ga_size height, const ga_size width,
-   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_h, const ga_size kernel_w,
+   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
-   GLOBAL_MEM DTYPE_OUTPUT_0 *z)
+   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
 {
+  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
+  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
@@ -41,18 +43,20 @@ KERNEL void max_pool2d_kernel(const ga_size nthreads,
  }
 }

-#kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, * :
+#kernel max_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, *, size :

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
   const ga_size depth, const ga_size height, const ga_size width,
-   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_d, const ga_size kernel_h,
+   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_d, const ga_size kernel_h,
   const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
   const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
-   GLOBAL_MEM DTYPE_OUTPUT_0 *z)
+   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
 {
+  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
+  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
@@ -90,17 +94,19 @@ KERNEL void max_pool3d_kernel(const ga_size nthreads,
  }
 }

-#kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, * :
+#kernel ave_pool2d_kernel : size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, *, size:

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool2d_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
   const ga_size pooled_width, const ga_size height, const ga_size width,
-   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_h, const ga_size kernel_w,
+   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_h, const ga_size stride_w, const ga_size pad_h, const ga_size pad_w,
   const ga_bool inc_pad, const ga_bool sum_mode,
-   GLOBAL_MEM DTYPE_OUTPUT_0 *z)
+   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
 {
+  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
+  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
@@ -143,20 +149,22 @@ KERNEL void ave_pool2d_kernel(const ga_size nthreads,
  }
 }

-#kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, size, * :
+#kernel ave_pool3d_kernel : size, size, size, size, size, size, size, size, size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool3d_kernel(const ga_size nthreads,
                              const ga_size num, const ga_size channels, const ga_size pooled_depth,
                              const ga_size pooled_height, const ga_size pooled_width,
                              const ga_size depth, const ga_size height, const ga_size width,
-                              GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size kernel_d, const ga_size kernel_h,
+                              GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, const ga_size kernel_d, const ga_size kernel_h,
                              const ga_size kernel_w, const ga_size stride_d, const ga_size stride_h,
                              const ga_size stride_w, const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
                              const ga_bool inc_pad, const ga_bool sum_mode,
-                              GLOBAL_MEM DTYPE_OUTPUT_0 *z)
+                              GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
 {
  // grid stride looping
+  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
+  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)z) + z_off);
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
       index += LDIM_0 * GDIM_0) {
@@ -273,8 +281,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
        err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                      x_dims[2], x_dims[3],
-                                      x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1],
-                                      (*z)->ga.data);
+                                      x->ga.data, x->ga.offset, w[0], w[1], s[0], s[1], p[0], p[1],
+                                      (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: max_pool2d_kernel %s.",
@@ -285,8 +293,10 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
        err = ave_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                      x_dims[2], x_dims[3],
-                                      x->ga.data, w[0], w[1], s[0], s[1], p[0], p[1],
-                                      INC_PAD, SUM_MODE, (*z)->ga.data);
+                                      x->ga.data, x->ga.offset,
+                                      w[0], w[1], s[0], s[1], p[0], p[1],
+                                      INC_PAD, SUM_MODE,
+                                      (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: ave_pool2d_kernel %s.",
@@ -301,8 +311,8 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
        err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                      x_dims[2], x_dims[3], x_dims[4],
-                                      x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2],
-                                      p[0], p[1], p[2], (*z)->ga.data);
+                                      x->ga.data, x->ga.offset, w[0], w[1], w[2], s[0], s[1], s[2],
+                                      p[0], p[1], p[2], (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: max_pool3d_kernel %s.",
@@ -313,9 +323,11 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
        err = ave_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
                                      z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                      x_dims[2], x_dims[3], x_dims[4],
-                                      x->ga.data, w[0], w[1], w[2], s[0], s[1], s[2],
+                                      x->ga.data, x->ga.offset,
+                                      w[0], w[1], w[2], s[0], s[1], s[2],
                                      p[0], p[1], p[2],
-                                      INC_PAD, SUM_MODE, (*z)->ga.data);
+                                      INC_PAD, SUM_MODE,
+                                      (*z)->ga.data, (*z)->ga.offset);
        if (err != GA_NO_ERROR) {
          PyErr_Format(PyExc_RuntimeError,
                       "GpuPool: ave_pool3d_kernel %s.",

--- a/theano/gpuarray/pool_ave_grad.c
+++ b/theano/gpuarray/pool_ave_grad.c
 #section kernels

-#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, * :
+#kernel ave_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, *, size :

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size height,
   const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
-   GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *gz,
+   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *gz, const ga_size gz_off,
   const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_h, const ga_size pad_w, const ga_bool inc_pad, const ga_bool sum_mode,
-   GLOBAL_MEM DTYPE_OUTPUT_0 *gx)
+   GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
 {
+  x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
+  gz = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)gz) + gz_off);
+  gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
@@ -46,19 +49,22 @@ KERNEL void ave_pool2d_grad_kernel(const ga_size nthreads,
  }
 }

-#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, size, size, * :
+#kernel ave_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void ave_pool3d_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size depth,
   const ga_size height, const ga_size width, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
-   GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *gz,
+   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *gz, const ga_size gz_off,
   const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
-   const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_OUTPUT_0 *gx)
+   const ga_bool inc_pad, const ga_bool sum_mode, GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
 {
+  x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
+  gz = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)gz) + gz_off);
+  gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
@@ -152,9 +158,11 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
      err = ave_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3],
                                         z_dims[2], z_dims[3],
-                                         x->ga.data, gz->ga.data,
+                                         x->ga.data, x->ga.offset,
+                                         gz->ga.data, gz->ga.offset,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
-                                         INC_PAD, SUM_MODE, (*gx)->ga.data);
+                                         INC_PAD, SUM_MODE,
+                                         (*gx)->ga.data, (*gx)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool2d_grad_kernel %s.",
@@ -166,10 +174,11 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
      err = ave_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
                                         z_dims[2], z_dims[3], z_dims[4],
-                                         x->ga.data, gz->ga.data,
+                                         x->ga.data, x->ga.offset,
+                                         gz->ga.data, gz->ga.offset,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
                                         p[0], p[1], p[2], INC_PAD, SUM_MODE,
-                                         (*gx)->ga.data);
+                                         (*gx)->ga.data, (*gx)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuAveragePoolGrad: ave_pool3d_grad_kernel %s.",

--- a/theano/gpuarray/pool_grad_grad.c
+++ b/theano/gpuarray/pool_grad_grad.c
 #section kernels

-#kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, * :
+#kernel max_pool2d_grad_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :

 KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
   const ga_size pooled_width, const ga_size height, const ga_size width,
-   GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gx,
+   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gx, const ga_size gx_off,
   const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_h, const ga_size pad_w,
-   GLOBAL_MEM DTYPE_OUTPUT_0 *gz)
+   GLOBAL_MEM DTYPE_OUTPUT_0 *gz, const ga_size gz_off)
 {
+  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
+  z = (GLOBAL_MEM DTYPE_INPUT_1 *)(((char *)z) + z_off);
+  gx = (GLOBAL_MEM DTYPE_INPUT_2 *)(((char *)gx) + gx_off);
+  gz = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gz) + gz_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
@@ -42,18 +46,22 @@ KERNEL void max_pool2d_grad_grad_kernel(const ga_size nthreads,
  }
 }

-#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, size, size, size, * :
+#kernel max_pool3d_grad_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :

 KERNEL void max_pool3d_grad_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
   const ga_size depth, const ga_size height, const ga_size width,
-   GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gx,
+   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gx, const ga_size gx_off,
   const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
-   GLOBAL_MEM DTYPE_OUTPUT_0 *gz)
+   GLOBAL_MEM DTYPE_OUTPUT_0 *gz, const ga_size gz_off)
 {
+  x = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)x) + x_off);
+  z = (GLOBAL_MEM DTYPE_INPUT_1 *)(((char *)z) + z_off);
+  gx = (GLOBAL_MEM DTYPE_INPUT_2 *)(((char *)gx) + gx_off);
+  gz = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gz) + gz_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
@@ -146,9 +154,11 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
      err = max_pool2d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                              z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                              x_dims[2], x_dims[3],
-                                              x->ga.data, z->ga.data, gx->ga.data,
+                                              x->ga.data, x->ga.offset,
+                                              z->ga.data, z->ga.offset,
+                                              gx->ga.data, gx->ga.offset,
                                              w[0], w[1], s[0], s[1], p[0], p[1],
-                                              (*gz)->ga.data);
+                                              (*gz)->ga.data, (*gz)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuPoolingGradGrad: max_pool2d_grad_grad_kernel %s.",
@@ -161,9 +171,11 @@ int APPLY_SPECIFIC(pool_grad_grad)(PyGpuArrayObject *x,
      err = max_pool3d_grad_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                              z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                              x_dims[2], x_dims[3], x_dims[4],
-                                              x->ga.data, z->ga.data, gx->ga.data,
+                                              x->ga.data, x->ga.offset,
+                                              z->ga.data, z->ga.offset,
+                                              gx->ga.data, gx->ga.offset,
                                              w[0], w[1], w[2], s[0], s[1], s[2], p[0], p[1], p[2],
-                                              (*gz)->ga.data);
+                                              (*gz)->ga.data, (*gz)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuPoolingGradGrad: max_pool3d_grad_grad_kernel %s.",

--- a/theano/gpuarray/pool_max_grad.c
+++ b/theano/gpuarray/pool_max_grad.c
 #section kernels

-#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, * :
+#kernel max_pool2d_grad_kernel : size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, *, size :

 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size height,
   const ga_size width, const ga_size pooled_height, const ga_size pooled_width,
-   GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gz,
+   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gz, const ga_size gz_off,
   const ga_size kernel_h, const ga_size kernel_w, const ga_size stride_h, const ga_size stride_w,
-   const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_OUTPUT_0 *gx)
+   const ga_size pad_h, const ga_size pad_w, GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
 {
+  x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
+  z = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)z) + z_off);
+  gz = (GLOBAL_MEM const DTYPE_INPUT_2 *)(((char *)gz) + gz_off);
+  gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
@@ -38,19 +42,23 @@ KERNEL void max_pool2d_grad_kernel(const ga_size nthreads,
  }
 }

-#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, *, *, size, size, size, size, size, size, size, size, size, * :
+#kernel max_pool3d_grad_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_grad_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size depth,
   const ga_size height, const ga_size width, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
-   GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *z, GLOBAL_MEM const DTYPE_INPUT_2 *gz,
+   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *z, const ga_size z_off, GLOBAL_MEM const DTYPE_INPUT_2 *gz, const ga_size gz_off,
   const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
-   GLOBAL_MEM DTYPE_OUTPUT_0 *gx)
+   GLOBAL_MEM DTYPE_OUTPUT_0 *gx, const ga_size gx_off)
 {
+  x = (GLOBAL_MEM const DTYPE_INPUT_0 *)(((char *)x) + x_off);
+  z = (GLOBAL_MEM const DTYPE_INPUT_1 *)(((char *)z) + z_off);
+  gz = (GLOBAL_MEM const DTYPE_INPUT_2 *)(((char *)gz) + gz_off);
+  gx = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)gx) + gx_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads; index += LDIM_0 * GDIM_0) {
@@ -138,9 +146,11 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
      err = max_pool2d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3],
                                         z_dims[2], z_dims[3],
-                                         x->ga.data, z->ga.data, gz->ga.data,
+                                         x->ga.data, x->ga.offset,
+                                         z->ga.data, z->ga.offset,
+                                         gz->ga.data, gz->ga.offset,
                                         w[0], w[1], s[0], s[1], p[0], p[1],
-                                         (*gx)->ga.data);
+                                         (*gx)->ga.data, (*gx)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolGrad: max_pool2d_grad_kernel %s.",
@@ -152,9 +162,11 @@ int APPLY_SPECIFIC(max_pool_grad)(PyGpuArrayObject *x,
      err = max_pool3d_grad_kernel_scall(1, &num_kernels, 0, num_kernels,
                                         x_dims[0], x_dims[1], x_dims[2], x_dims[3], x_dims[4],
                                         z_dims[2], z_dims[3], z_dims[4],
-                                         x->ga.data, z->ga.data, gz->ga.data,
+                                         x->ga.data, x->ga.offset,
+                                         z->ga.data, z->ga.offset,
+                                         gz->ga.data, gz->ga.offset,
                                         w[0], w[1], w[2], s[0], s[1], s[2],
-                                         p[0], p[1], p[2], (*gx)->ga.data);
+                                         p[0], p[1], p[2], (*gx)->ga.data, (*gx)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolGrad: max_pool3d_grad_kernel %s.",

--- a/theano/gpuarray/pool_max_rop.c
+++ b/theano/gpuarray/pool_max_rop.c
 #section kernels

-#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, * :
+#kernel max_pool2d_rop_kernel : size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, *, size :

 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_height,
   const ga_size pooled_width, const ga_size height, const ga_size width,
-   GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *ex,
+   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *ex, const ga_size ex_off,
   const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_h, const ga_size pad_w,
-   GLOBAL_MEM DTYPE_OUTPUT_0 *z)
+   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size z_off)
 {
+  x = (GLOBAL_MEM DTYPE_INPUT_0 *x)(((char *)x) + x_off);
+  ex = (GLOBAL_MEM DTYPE_INPUT_1 *x)(((char *)ex) + ex_off);
+  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *x)(((char *)z) + z_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
@@ -46,19 +49,22 @@ KERNEL void max_pool2d_rop_kernel(const ga_size nthreads,
  }
 }

-#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, *, size, size, size, size, size, size, size, size, size, * :
+#kernel max_pool3d_rop_kernel : size, size, size, size, size, size, size, size, size, *, size, *, size, size, size, size, size, size, size, size, size, size, *, size :

 // (adopted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
 KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
   const ga_size num, const ga_size channels, const ga_size pooled_depth,
   const ga_size pooled_height, const ga_size pooled_width,
   const ga_size depth, const ga_size height, const ga_size width,
-   GLOBAL_MEM const DTYPE_INPUT_0 *x, GLOBAL_MEM const DTYPE_INPUT_1 *ex,
+   GLOBAL_MEM const DTYPE_INPUT_0 *x, const ga_size x_off, GLOBAL_MEM const DTYPE_INPUT_1 *ex, const ga_size ex_off,
   const ga_size kernel_d, const ga_size kernel_h, const ga_size kernel_w,
   const ga_size stride_d, const ga_size stride_h, const ga_size stride_w,
   const ga_size pad_d, const ga_size pad_h, const ga_size pad_w,
-   GLOBAL_MEM DTYPE_OUTPUT_0 *z)
+   GLOBAL_MEM DTYPE_OUTPUT_0 *z, const ga_size x_off)
 {
+  x = (GLOBAL_MEM DTYPE_INPUT_0 *x)(((char *)x) + x_off);
+  ex = (GLOBAL_MEM DTYPE_INPUT_1 *x)(((char *)ex) + ex_off);
+  z = (GLOBAL_MEM DTYPE_OUTPUT_0 *x)(((char *)z) + z_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < nthreads;
@@ -167,9 +173,10 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
      err = max_pool2d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3],
                                        x_dims[2], x_dims[3],
-                                        x->ga.data, ex->ga.data,
+                                        x->ga.data, x->ga.offset,
+                                        ex->ga.data, ex->ga.offset,
                                        w[0], w[1], s[0], s[1], p[0], p[1],
-                                        (*z)->ga.data);
+                                        (*z)->ga.data, (*z)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool2d_rop_kernel %s.",
@@ -182,9 +189,11 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
      err = max_pool3d_rop_kernel_scall(1, &num_kernels, 0, num_kernels,
                                        z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
                                        x_dims[2], x_dims[3], x_dims[4],
-                                        x->ga.data, ex->ga.data,
+                                        x->ga.data, x->ga.offset,
+                                        ex->ga.data, ex->ga.offset,
                                        w[0], w[1], w[2], s[0], s[1], s[2],
-                                        p[0], p[1], p[2], (*z)->ga.data);
+                                        p[0], p[1], p[2],
+                                        (*z)->ga.data, (*z)->ga.offset);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuMaxPoolRop: max_pool3d_rop_kernel %s.",

--- a/theano/gpuarray/rng_mrg.py
+++ b/theano/gpuarray/rng_mrg.py
@@ -75,10 +75,14 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
        code = """
        KERNEL void mrg_uniform(
                GLOBAL_MEM %(otype)s *sample_data,
+                ga_size sample_offset,
                GLOBAL_MEM ga_int *state_data,
+                ga_size state_offset,
                const ga_uint Nsamples,
                const ga_uint Nstreams_used)
        {
+            sample_data = (GLOBAL_MEM %(otype)s *)(((char *)sample_data) + sample_offset);
+            state_data = (GLOBAL_MEM ga_int *)(((char *)state_data) + state_offset);
            /*
             * The cluda backend makes sure that ga_int corresponds to
             * a 32 bit signed type on the target device.  It is not a
@@ -157,7 +161,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
        from pygpu import gpuarray

        return [Kernel(code=code, name="mrg_uniform",
-                       params=[gpuarray.GpuArray, gpuarray.GpuArray,
+                       params=[gpuarray.GpuArray, gpuarray.SIZE,
+                               gpuarray.GpuArray, gpuarray.SIZE,
                               'uint32', 'uint32'],
                       flags=Kernel.get_flags(self.output_type.dtype, 'int32'))
                ]
@@ -273,7 +278,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
          }
          // Make sure we run as many blocks as we need to cover the whole n_streams
          gs = (n_streams + ls - 1)/ls;
-          err = mrg_uniform_call(1, &ls, &gs, 0, %(o_sample)s->ga.data, %(o_rstate)s->ga.data, n_elements, n_streams);
+          err = mrg_uniform_call(1, &ls, &gs, 0, %(o_sample)s->ga.data, %(o_sample)s->ga.offset, %(o_rstate)s->ga.data, %(o_rstate)s->ga.offset, n_elements, n_streams);
          if (err != GA_NO_ERROR) {
              PyErr_Format(PyExc_RuntimeError, "mrg_uniform_call: %%s\\n",
                           GpuKernel_error(&%(kname)s, err));
@@ -283,7 +288,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
        """ % locals()

    def c_code_cache_version(self):
-        return (12,)
+        return (13,)


 @register_opt2([mrg_uniform], 'fast_compile')

--- a/theano/gpuarray/tests/tstgpueye.c
+++ b/theano/gpuarray/tests/tstgpueye.c
 #section kernels

-#kernel eye : *, size, size :
+#kernel eye : *, size, size, size :
 /* The eye name will be used to generate supporting objects.  The only
   you probably need to care about is the kernel object which will be
   named 'k_' + <the name above> (k_eye in this case).  This name also
   has to match the kernel function name below.
 */

-KERNEL void eye(GLOBAL_MEM DTYPE_OUTPUT_0 *a, ga_size n, ga_size m) {
+KERNEL void eye(GLOBAL_MEM DTYPE_OUTPUT_0 *a, ga_size a_off, ga_size n, ga_size m) {
+  a = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((char *)a) + a_off);
  ga_size nb = n < m ? n : m;
  for (ga_size i = LID_0; i < nb; i += LDIM_0) {
    a[i*m + i] = 1;
@@ -37,7 +38,7 @@ int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,
  ls = 1;
  gs = 256;
  /* The eye_call name comes from the kernel declaration above. */
-  err = eye_call(1, &gs, &ls, 0, (*z)->ga.data, dims[0], dims[1]);
+  err = eye_call(1, &gs, &ls, 0, (*z)->ga.data, (*z)->ga.offset, dims[0], dims[1]);
  if (err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error: kEye: %s. n%lu, m=%lu.",