提交 b4661fbd，作者：abergeron，提交者：GitHub

Merge pull request #5988 from lamblin/fix_gpucumop_offset

Correctly handle offset in GpuCumOp
......@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self):
    """Return the version tuple used to key Theano's C-code cache.

    Returns
    -------
    tuple of int
        The cache version. Bumped from ``(4,)`` to ``(5,)`` because the
        kernel signatures changed (buffer offsets are now passed as
        separate ``ga_size`` parameters — see the MR title "Correctly
        handle offset in GpuCumOp"), so previously cached compiled code
        must be invalidated.
    """
    # The diff scrape left both the removed ``return (4,)`` and the
    # added ``return (5,)`` in place; the first return made the new
    # version unreachable. The merged code returns only the new version.
    return (5,)
def c_headers(self):
    """List the C header files that the generated C code depends on."""
    required_headers = [
        '<numpy_compat.h>',
        '<gpuarray/types.h>',
        '<gpuarray_helper.h>',
    ]
    return required_headers
......@@ -67,13 +67,16 @@ class GpuCumOp(GpuKernelBase, Op):
dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x)
code = """
KERNEL void %(kname)s(float* input, float* output,
KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset,
ga_ssize inputStrides_x,
ga_ssize inputStrides_y,
ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
......@@ -85,8 +88,10 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
}
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
'intc', 'intc',
'intc', 'intc',
......@@ -96,10 +101,11 @@ class GpuCumOp(GpuKernelBase, Op):
# blockCumOp
kname = "k_blockCumOp"
k_var = "k_blockCumOp_" + nodename
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray, ]
'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
code = """
// helper functions
WITHIN_KERNEL
......@@ -154,12 +160,17 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] = partialCumOp[threadIdx.x*2 + 1];
}
KERNEL void k_blockCumOp(float* input, float* output,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum) {
KERNEL void k_blockCumOp(float* input, ga_size input_offset,
float* output, ga_size output_offset,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum, ga_size blockSum_offset) {
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
// Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.
......@@ -197,9 +208,14 @@ class GpuCumOp(GpuKernelBase, Op):
kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename
code = """
KERNEL void k_finalCumOp(float* output, float* blockSum, size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
KERNEL void k_finalCumOp(float* output, ga_size output_offset,
float* blockSum, ga_size blockSum_offset,
size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process.
......@@ -218,7 +234,8 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] %(op)s= currentBlockSum;
}
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', ]
kernels.append(Kernel(code=code, name=kname, params=params,
......@@ -380,8 +397,10 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) ((char *)(input->ga.data) + input->ga.offset),
(void*) ((char *)(output->ga.data) + output->ga.offset),
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &inputStrides_x,
(void*) &inputStrides_y,
......@@ -391,7 +410,8 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ,
(void*) ((char*)(deviceBlockSum->ga.data) + deviceBlockSum->ga.offset)
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset)
};
int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){
......@@ -409,8 +429,10 @@ class GpuCumOp(GpuKernelBase, Op):
// report partial cum ops of previous blocks to subsequents ones.
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) ((char *)(output->ga.data) + output->ga.offset),
(void*) ((char *)(deviceBlockSum->ga.data) + deviceBlockSum->ga.offset),
void* kernel_params[] = {(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
......@@ -430,8 +452,10 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimBlock[3] = {1, 1, 1};
size_t tmp0 = shape[axis]-2;
size_t tmp1 = shape[axis]-1;
void* kernel_params[] = {(void*) ((char *)(input->ga.data) + input->ga.offset),
(void*) ((char *)(output->ga.data) + output->ga.offset),
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &inputStrides_x,
(void*) &inputStrides_y,
(void*) &inputStrides_z,
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论