Handle array offsets in GpuCorrMM.

0641c6e3 · Arnaud Bergeron · c35ef4d8 · 0641c6e3
--- a/theano/gpuarray/corr_gemm.c
+++ b/theano/gpuarray/corr_gemm.c
 #section kernels

-#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, * : 
+#kernel dilated_im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
 // TODO check kernel flags
 // This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
 // sources are clearly marked. Below we reproduce the original license of
@@ -35,6 +35,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // GPU kernel for the case of dilation
 KERNEL void dilated_im2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset,
    const ga_size height, const ga_size width,
    const ga_size kernel_h, const ga_size kernel_w,
@@ -42,7 +43,10 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
    const ga_size pad_h, const ga_size pad_w,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
-    GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
+    GLOBAL_MEM DTYPE_INPUT_0 * data_col,
+    const ga_size offset_col) {
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -70,16 +74,20 @@ KERNEL void dilated_im2col_kernel(const ga_size n,
  }
 }

-#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, * : 
+#kernel im2col_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size :
 KERNEL void im2col_kernel(const ga_size n,
    GLOBAL_MEM const DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset,
    const ga_size height, const ga_size width,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size pad_h, const ga_size pad_w,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
-    GLOBAL_MEM DTYPE_INPUT_0 * data_col) {
+    GLOBAL_MEM DTYPE_INPUT_0 * data_col,
+    const ga_size offset_col) {
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -108,9 +116,9 @@ KERNEL void im2col_kernel(const ga_size n,
 }

 // GPU kernel for the case of dilation
-#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size : 
+#kernel dilated_col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
 KERNEL void dilated_col2im_kernel(const ga_size n,
-    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
+    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size channels,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size dilation_h, const ga_size dilation_w,
@@ -118,7 +126,10 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset) {
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -153,16 +164,19 @@ KERNEL void dilated_col2im_kernel(const ga_size n,
  }
 }

-#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, *, size : 
+#kernel col2im_kernel : size, *, size, size, size, size, size, size, size, size, size, size, size, size, *, size, size :
 KERNEL void col2im_kernel(const ga_size n,
-    GLOBAL_MEM const DTYPE_INPUT_0 * data_col,
+    GLOBAL_MEM const DTYPE_INPUT_0 * data_col, const ga_size offset_col,
    const ga_size height, const ga_size width, const ga_size channels,
    const ga_size kernel_h, const ga_size kernel_w,
    const ga_size pad_h, const ga_size pad_w,
    const ga_size stride_h, const ga_size stride_w,
    const ga_size height_col, const ga_size width_col,
    GLOBAL_MEM DTYPE_INPUT_0 * data_im,
+    const ga_size offset_im,
    const ga_size data_im_offset) {
+  data_col = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_col) + offset_col);
+  data_im = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)data_im) + offset_im);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0;
       index < (n); index += LDIM_0 * GDIM_0) {
@@ -195,12 +209,12 @@ KERNEL void col2im_kernel(const ga_size n,

 #section support_code_struct

-int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
+int im2col(GpuArray *data_im, const size_t data_im_offset, const size_t channels,
    const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
    const size_t dilation_h, const size_t dilation_w,
    const size_t pad_h, const size_t pad_w,
    const size_t stride_h, const size_t stride_w,
-    gpudata * data_col) {
+    GpuArray *data_col) {
  // We are going to launch channels * height_col * width_col kernels, each
  // kernel responsible for copying a single-channel grid.
  size_t dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
@@ -212,9 +226,10 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
  if (dilation_h != 1 || dilation_w != 1) {
    err = dilated_im2col_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
+      num_kernels, data_im->data, data_im->offset, data_im_offset,
+      height, width, kernel_h, kernel_w,
      dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
-      width_col, data_col);
+      width_col, data_col->data, data_col->offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: dilated_im2col_kernel: %s.",
@@ -223,9 +238,10 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
  } else {
    err = im2col_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
+      num_kernels, data_im->data, data_im->offset, data_im_offset,
+      height, width, kernel_h, kernel_w,
      pad_h, pad_w, stride_h, stride_w, height_col,
-      width_col, data_col);
+      width_col, data_col->data, data_col->offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: im2col_kernel: %s.",
@@ -235,11 +251,11 @@ int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
  return err;
 }

-int col2im(gpudata * data_col, const size_t channels,
+int col2im(GpuArray *data_col, const size_t channels,
    const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
    const size_t dilation_h, const size_t dilation_w,
    const size_t pad_h, const size_t pad_w, const size_t stride_h,
-    const size_t stride_w, gpudata * data_im, const size_t data_im_offset) {
+    const size_t stride_w, GpuArray *data_im, const size_t data_im_offset) {
  size_t dil_patch_h = (patch_h - 1) * dilation_h + 1;
  size_t dil_patch_w = (patch_w - 1) * dilation_w + 1;
  size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
@@ -251,9 +267,10 @@ int col2im(gpudata * data_col, const size_t channels,
  if (dilation_h != 1 || dilation_w != 1) {
    err = dilated_col2im_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_col, height, width, channels, patch_h, patch_w,
+      num_kernels, data_col->data, data_col->offset,
+      height, width, channels, patch_h, patch_w,
      dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
-      height_col, width_col, data_im, data_im_offset);
+      height_col, width_col, data_im->data, data_im->offset, data_im_offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: dilated_col2im_kernel: %s.",
@@ -262,9 +279,10 @@ int col2im(gpudata * data_col, const size_t channels,
  } else {
    err = col2im_kernel_scall(
      1, &num_kernels, 0,
-      num_kernels, data_col, height, width, channels, patch_h, patch_w,
+      num_kernels, data_col->data, data_col->offset,
+      height, width, channels, patch_h, patch_w,
      pad_h, pad_w, stride_h, stride_w,
-      height_col, width_col, data_im, data_im_offset);
+      height_col, width_col, data_im->data, data_im->offset, data_im_offset);
    if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError,
                     "gpuarray error: col2im_kernel: %s.",
@@ -433,10 +451,10 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
-            err = im2col(bottom->ga.data, n * bottom_stride,
-                         nChannels, bottomHeight,
-                         bottomWidth, kH, kW, dilH, dilW,
-                         padH, padW, dH, dW, col->ga.data);
+	  err = im2col(&bottom->ga, n * bottom_stride,
+		       nChannels, bottomHeight,
+		       bottomWidth, kH, kW, dilH, dilW,
+		       padH, padW, dH, dW, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
@@ -495,10 +513,10 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
-            err = im2col(bottom->ga.data, n * bottom_stride,
+            err = im2col(&bottom->ga, n * bottom_stride,
                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
-                         padH, padW, dH, dW, col->ga.data);
+                         padH, padW, dH, dW, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
@@ -595,9 +613,9 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                return NULL;
            }
            // col2im back to the data
-            err = col2im(col->ga.data, nChannels, bottomHeight, bottomWidth,
+            err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
                         kH, kW, dilH, dilW, padH, padW,
-                         dH, dW, bottom->ga.data, n * bottom_stride);
+                         dH, dW, &bottom->ga, n * bottom_stride);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;