Bugs and adding tests

ba81f75f · kelvinxu · Kelvin Xu · 5c373d90 · ba81f75f · ba81f75f
--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
@@ -28,7 +28,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                   GpuArraySharedVariable, gpuarray_shared_constructor,
                   reg_context, get_context, ContextNotDefined)
 from .basic_ops import as_gpuarray_variable
-from . import dnn, opt, nerv
+from . import dnn, opt, nerv, extra_ops

 def transfer(x, target):
    try:

--- a/theano/sandbox/gpuarray/extra_ops.py
+++ b/theano/sandbox/gpuarray/extra_ops.py
@@ -16,49 +16,40 @@ from .opt import register_opt as register_gpu_opt, op_lifter
 from .type import GpuArrayType


-class GpuCumsum(CumsumOp, Op):
+class GpuCumsum(CumsumOp, GpuKernelBase):
    """
    Parameters
    ----------
    axis
-        Can not be None. If you want the array flatten, do it before.
+        Can not be None. If you want the array flattened, do it before.
    """
    SUPPORTED_NDIMS = 3
+    __props__ = ('axis',)

    def __init__(self, axis):
        self.axis = axis
-        # not sure if this should be here
-        self.max_threads_dim0 = None
-        self.max_grid_size1 = None
-        self.max_gride_size2 = None
-

    def __str__(self):
        return "%s{%s}" % (self.__class__.__name__, self.axis)

-
    def c_code_cache_version(self):
        return (1,)

-
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']


    def make_node(self, x): 
+        assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
+        x = as_gpuarray_variable(x, infer_context_name(x))
+
        if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
            raise NotImplementedError('Only cumsum on 1D, 2D and\
                                       3D arrays are supported right now!')
+
        if self.axis >= x.ndim or self.axis < -x.ndim:
            raise ValueError('axis(={1}) out of bounds'.format(self.axis))
-
-        x_ = as_gpuarray_variable(x, infer_context_name(x_))
-        return Apply(self, [x_], [x_.type()])
-
-
-    # ask Arnaud about this
-    def make_thunk(self, node, storage_map, comput_map, no_recycling):
-        pass
+        return Apply(self, [x], [x.type()])


    # copied from neighbour.py
@@ -70,34 +61,102 @@ class GpuCumsum(CumsumOp, Op):
    def gpu_kernels(self, node, nodename):
        kernels = []
        # cumadd
-        k_name = "k_cumadd"   
+        kname = "k_cumadd"   
        k_var = "k_cumadd_" + nodename
-        params =  
        dtype_x = node.inputs[0].dtype
        flags = Kernel.get_flags(dtype_x)
        code = """
-        KERNEL void %(kname)s(float* input, float* output, size_t inputStrides[3],
-                              size_t[3] outputStrides, int offsetY, int offsetZ, 
-                              int beforeLastElementIdx, int lastElementIdx){
+        KERNEL void %(kname)s(float* input, float* output, ssize_t inputStrides_x,
+                              ssize_t inputStrides_y, ssize_t inputStrides_z,
+                              ssize_t outputStrides_x, ssize_t outputStrides_y,
+                              ssize_t outputStrides_z, const int offsetY, const int offsetZ, 
+                              const int beforeLastElementIdx, const int lastElementIdx){
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;

-            int dataOffsetY_input = idY * inputStrides[1] + idZ * inputStrides.[3];
-            int dataOffsetY_output = idY * outputStrides[1] + idZ * outputStrides[2];
-            int idx_last_input = lastElementIdx*inputStrides[0] + dataOffsetY_input;
-            int idx_last_output = lastElementIdx*outputStrides[0] + dataOffsetY_output;
-            int idx_beforelast = beforeLastElementIdx*outputStrides[0] + dataOffsetY_output;
+            int dataOffsetY_input = idY * inputStrides_y + idZ * inputStrides_z;
+            int dataOffsetY_output = idY * outputStrides_y + idZ * outputStrides_z;
+            int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input;
+            int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output;
+            int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output;
            output[idx_last_output] = input[idx_last_input] + output[idx_beforelast];
            }
        """ % locals()
-        kernels.append(Kernel(code=code, name="k_cumadd", params=params,
+        params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE, 
+                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, 
+                  gpuarray.SSIZE, gpuarray.SSIZE, 
+                  'intc', 'intc',
+                  'intc', 'intc',
+                  ]
+        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
-        # finalCumSum
-        k_name = "k_finalCumSum" 
-        k_var = "k_finalCumSum_" + nodename
-        # params = 
-        code = """
-        void k_blockCumSum_%(nodename)s(float* input, float* output, int nbElementsPerCumsum, size_t inputStrides[3], size_t outputStrides[3], int offsetY, int offsetZ, float* blockSum) {
+        # blockCumSum
+        kname = "k_blockCumSum" 
+        k_var = "k_blockCumSum_" + nodename
+        params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
+                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
+                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
+                  'int32', 'int32', gpuarray.GpuArray,]
+        code="""
+        // helper functions
+        WITHIN_KERNEL
+        void k_reductionPhase_%(nodename)s(float* partialCumSum) {
+            // Traverse down from leaves to root building partial sums at internal nodes in the tree.
+            for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
+                local_barrier();
+                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
+                if(index < blockDim.x*2) {
+                    partialCumSum[index] += partialCumSum[index - stride];
+                }
+            }
+        }
+
+        WITHIN_KERNEL
+        void k_fetchData_%(nodename)s(float* partialCumSum, float* input, int globalThreadID,
+                                      ssize_t dataStrides_x, ssize_t dataStrides_y, ssize_t dataStrides_z,
+                                      int offsetY, int offsetZ) {
+            // blockIdx.y and blockIdx.z represents the current independent cumsum
+            int idY = blockIdx.y + offsetY;
+            int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z;
+            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
+            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
+            partialCumSum[threadIdx.x*2]     = input[idx_even];
+            partialCumSum[threadIdx.x*2 + 1] = input[idx_odd];
+        }
+
+        WITHIN_KERNEL
+        void k_reversePhase_%(nodename)s(float* partialCumSum) {
+            // Traverse back up the tree building the scan from the partial sums
+            for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
+                local_barrier();
+                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
+                if(index + stride < blockDim.x*2) {
+                    partialCumSum[index + stride] += partialCumSum[index];
+                }
+            }
+        }
+
+        WITHIN_KERNEL
+        void k_pushData_%(nodename)s(float* partialCumSum, float* output, int globalThreadID,
+                                     ssize_t dataStrides_x, ssize_t dataStrides_y, ssize_t dataStrides_z,
+                                     int offsetY, int offsetZ) {
+            local_barrier();
+            // blockIdx.y and blockIdx.z represents the current independent cumsum
+            int idY = blockIdx.y + offsetY;
+            int idZ = blockIdx.z + offsetZ;
+            int offset = idY * dataStrides_y + idZ * dataStrides_z;
+            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
+            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
+            output[idx_even] = partialCumSum[threadIdx.x*2];
+            output[idx_odd]  = partialCumSum[threadIdx.x*2 + 1];
+        }
+
+        KERNEL void k_blockCumSum(float* input, float* output,
+                                        size_t nbElementsPerCumsum, ssize_t inputStrides_x,
+                                        ssize_t inputStrides_y,  ssize_t inputStrides_z,
+                                        ssize_t outputStrides_x, ssize_t outputStrides_y,
+                                        ssize_t outputStrides_z, int offsetY,
+                                        int offsetZ, float* blockSum) {
            // Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis.
            // The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case.

@@ -111,7 +170,7 @@ class GpuCumsum(CumsumOp, Op):
            extern __shared__ float partialCumSum[];

            // Load data in shared memory
-            k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, inputStrides, offsetY, offsetZ);
+            k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);

            // Use a dichotomy approach to compute the cumsum (i.e. balanced binary tree).
            // The tree is sweeped from the leaves to the root and from the root to the leaves.
@@ -120,7 +179,7 @@ class GpuCumsum(CumsumOp, Op):
            k_reversePhase_%(nodename)s(partialCumSum);

            // Write the final output to global memory
-            k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, outputStrides, offsetY, offsetZ);
+            k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_x,, offsetY, offsetZ);

            if (blockSum != NULL){
                if (threadIdx.x == blockDim.x - 1) {
@@ -128,165 +187,38 @@ class GpuCumsum(CumsumOp, Op):
                }
            }
        }
+        """
+        kernels.append(Kernel(code=code, name=kname, params=params,
+                              flags=flags, objvar=k_var))
+        # k_finalCumSum
+        kname = "k_finalCumSum" 
+        k_var = "k_finalCumSum_" + nodename
+        code = """
+        KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum, 
+                                               ssize_t dataStrides_x,  ssize_t dataStrides_y,  ssize_t dataStrides_z,
+                                               int offsetY, int offsetZ) {
+            int globalThreadID = (blockIdx_x + 1) * blockDim_x + threadIdx_x;

-        int cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int axis, int maxThreads, int maxGridY, int maxGridZ) {
-            int shape[3] = { 1, 1, 1 };
-            size_t inputStrides[3] = {0, 0, 0};
-            size_t outputStrides[3] = {0, 0, 0};
-
-            switch (PyGpuArray_NDIM(input))
-            {
-            case 1:
-                shape[0] = PyGpuArray_DIMS(input)[0];
-                inputStrides[0] = PyGpuArray_STRIDES(input)[0];
-                outputStrides[0] = PyGpuArray_STRIDES(output)[0];
-                break;
-            case 2:
-                shape[0] = PyGpuArray_DIMS(input)[0];
-                shape[1] = PyGpuArray_DIMS(input)[1];
-                inputStrides[0] = PyGpuArray_STRIDES(input)[0];
-                inputStrides[1] = PyGpuArray_STRIDES(input)[1];
-                outputStrides[0] = PyGpuArray_STRIDES(output)[0];
-                outputStrides[1] = PyGpuArray_STRIDES(output)[1];
-                break;
-            case 3:
-                shape[0] = PyGpuArray_DIMS(input)[0];
-                shape[1] = PyGpuArray_DIMS(input)[1];
-                shape[2] = PyGpuArray_DIMS(input)[2];
-                inputStrides[0] = PyGpuArray_STRIDES(input)[0];
-                inputStrides[1] = PyGpuArray_STRIDES(input)[1];
-                inputStrides[2] = PyGpuArray_STRIDES(input)[2];
-                outputStrides[0] = PyGpuArray_STRIDES(output)[0];
-                outputStrides[1] = PyGpuArray_STRIDES(output)[1];
-                outputStrides[2] = PyGpuArray_STRIDES(output)[2];
-                break;
-            default:
-                return -1;
-            }
-
-            if (shape[axis] <= 1) {
-                CudaNdarray_CopyFromCudaNdarray(output, input);
-                return 0;
-            }
-
-            // Perform cumsum on array of even size.
-            int nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);
-
-            // Determine how many elements can be processed in one block.
-            int dimBlockX = ceil( min(nbElementsPerCumsum, 2*maxThreads) / 2.0);
-
-            // Determine how many blocks are needed in total.
-            int dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX));  // Nb. of blocks needed per cumsum.
-            int dimGridY;  // Nb. of independent cumsums (width).
-            int dimGridZ;  // Nb. of independent cumsums (height).
-
-            int tmp;
-            switch (axis)
-            {
-            case 0:
-                dimGridY = shape[1];
-                dimGridZ = shape[2];
-                break;
-            case 1:
-                dimGridY = shape[0];
-                dimGridZ = shape[2];
-
-                tmp = inputStrides[0];
-                inputStrides[0] = inputStrides[1];
-                inputStrides[1] = tmp;
-
-                tmp = outputStrides[0];
-                outputStrides[0] = outputStrides[1];
-                outputStrides[1] = tmp;
-                break;
-            case 2:
-                dimGridY = shape[1];
-                dimGridZ = shape[0];
-
-                tmp = inputStrides[0];
-                inputStrides[0] = inputStrides[2];
-                inputStrides[2] = tmp;
-
-                tmp = outputStrides[0];
-                outputStrides[0] = outputStrides[2];
-                outputStrides[2] = tmp;
-                break;
-            default:
-                return -1;
+            // Check if current has data to process.
+            if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
+                return;
            }

-            const int shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
-            CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum);
-
-            // Perform `maxGridY`*`maxGridZ` cumsums in parallel.
-            for (int offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
-                int localDimGridY = min(dimGridY - offsetY, maxGridY);
-
-                for (int offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
-                    int localDimGridZ = min(dimGridZ - offsetZ, maxGridZ);
-
-                    size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
-                    size_t dimBlock[3] = {dimBlockX, 1, 1};  // One cumsum per block.
-                    int sharedBytes = (2*dimBlockX) * sizeof(float);
-
-                    k_blockCumSum_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
-                    (
-                        CudaNdarray_DEV_DATA(input),
-                        CudaNdarray_DEV_DATA(output),
-                        nbElementsPerCumsum,
-                        inputStrides,
-                        outputStrides,
-                        offsetY,
-                        offsetZ,
-                        CudaNdarray_DEV_DATA(deviceBlockSum)
-                    );
-
-                    if (dimGridX > 1) {
-                        // Do a cumsum over the blockSum (recursive).
-                        if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
-                            Py_DECREF(deviceBlockSum);
-                            return -1;
-                        }
+            int idY = blockIdx_y + offsetY;
+            int idZ = blockIdx_z + offsetZ;

-                        // Since there are more than one block (i.e. `dimGridX > 1`)
-                        //  report partial cumsums of previous blocks to subsequents ones.
-                        size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
-                        size_t dimBlock[3] = {dimBlockX, 1, 1};
-                        k_finalCumSum_%(nodename)s<<<dimGrid, dimBlock>>>
-                        (
-                            CudaNdarray_DEV_DATA(output),
-                            CudaNdarray_DEV_DATA(deviceBlockSum),
-                            nbElementsPerCumsum,
-                            outputStrides,
-                            offsetY,
-                            offsetZ
-                        );
-                    }
+            const float currentBlockSum = blockSum[blockIdx_x*(gridDim_y*gridDim_z) + idY*gridDim.z + idZ];

-                    // If shape[axis] is odd, the last element is compute manually
-                    if (shape[axis] != nbElementsPerCumsum){
-                        size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
-                        size_t dimBlock[3] = {1, 1, 1};
-                        k_cumadd_%(nodename)s<<<dimGrid, dimBlock>>>
-                        (
-                            CudaNdarray_DEV_DATA(input),
-                            CudaNdarray_DEV_DATA(output),
-                            inputStrides,
-                            outputStrides,
-                            offsetY,
-                            offsetZ,
-                            shape[axis]-2,
-                            shape[axis]-1
-                        );
-                    }
-                }
-            }
-            Py_DECREF(deviceBlockSum);
-            CNDA_THREAD_SYNC;
-            return 0;
+            int offset = idY * dataStrides_y + idZ * dataStrides_z;
+            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
+            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
+            output[idx_even] += currentBlockSum;
+            output[idx_odd] += currentBlockSum;
        }
-
-        """       
+        """
+        params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
+                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
+                  'int32', 'int32',]
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        return kernels
@@ -300,17 +232,8 @@ class GpuCumsum(CumsumOp, Op):
        axis = self.axis if self.axis is not None else 0
        fail = sub['fail']

-        max_threads_dim0 = self.max_threads_dim0
-        max_grid_size1 = self.max_grid_size1
-        max_grid_size2 = self.max_grid_size2
-        if max_threads_dim0 is None or max_grid_size1 is None or max_grid_size2 is None:
-            raise NotImplementedError("GpuCumsum.c_code should not be called "
-                                      "directly. It should be called by "
-                                      "make_thunk() that add some information "
-                                      "related to the selected GPU.")
-
        code = """
-            const int* shape = PyGpuArray_DIMS(%(x)s);
+            const size_t* shape = PyGpuArray_DIMS(%(x)s);
            bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);

            int axis = %(axis)s;
@@ -319,36 +242,31 @@ class GpuCumsum(CumsumOp, Op):
                axis += PyGpuArray_NDIM(%(x)s);
            }

-            // If output is already allocated, check if its shape matches the input's one.
-            if (!needAllocation) {
-                for (int i= 0; i < PyGpuArray_NDIM(%(x)s); ++i) {
-                    if (PyGpuArray_DIMS(%(x)s)[i] != PyGpuArray_DIMS(%(z)s)[i]) {
-                        needAllocation = true;
-                    }
-                }
-            }
-
-            if (needAllocation){
-                Py_XDECREF(%(z)s);
-                %(z)s = (CudaNdarray*) CudaNdarray_NewDims(PyGpuArray_NDIM(%(x)s), shape);
-            }
-
-            if (!%(z)s) {
+            if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(type)s, GA_C_ORDER, %(ctx)s) == 0){
                %(fail)s;
            }

            { // Namespace for kernel calls //
-                if (cumSum_%(nodename)s(%(x)s, %(z)s, axis, %(max_threads_dim0)s, %(max_grid_size1)s, %(max_grid_size2)s) == -1){
+                size_t max_threads_dim0; 
+                size_t max_grid_size1; 
+                size_t max_grid_size2; 
+                int err;
+                err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
+                if (err != GA_NO_ERROR){
+                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
                    %(fail)s;
                }
-
-                cudaError_t sts = cudaGetLastError();
-                if (cudaSuccess != sts)
-                {
-                    PyErr_Format(PyExc_RuntimeError,
-                                 "Cuda error: %%s: %%s.\\n",
-                        "cumSum_%(nodename)s",
-                        cudaGetErrorString(sts));
+                err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
+                if (err != GA_NO_ERROR){
+                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
+                    %(fail)s;
+                }
+                err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
+                if (err != GA_NO_ERROR){
+                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
+                    %(fail)s;
+                }
+                if (cumSum_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){
                    %(fail)s;
                }
            }
@@ -356,8 +274,166 @@ class GpuCumsum(CumsumOp, Op):

        return code

+        def c_support_code_apply(self, node, nodename):
+            code = """int cumSum_%(nodename)s(float* input, float* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
+                size_t shape[3] = { 1, 1, 1 };
+                ssize_t inputStrides_x;
+                ssize_t inputStrides_y;
+                ssize_t inputStrides_z;
+                ssize_t outputStrides_x;
+                ssize_t outputStrides_y;
+                ssize_t outputStrides_z;
+                switch (PYArray_NDIM(input))
+                {
+                case 1:
+                    shape[0] = PyArray_DIMS(input)[0];
+                    inputStrides_x = PyGpuArray_STRIDES(input)[0];
+                    outputStrides_x = PyGpuArray_STRIDES(output)[0];
+                    break;
+                case 2:
+                    shape[0] = PyArray_DIMS(input)[0];
+                    shape[1] = PyArray_DIMS(input)[1];
+                    inputStrides_x = PyGpuArray_STRIDES(input)[0];
+                    inputStrides_y = PyGpuArray_STRIDES(input)[1];
+                    outputStrides_x = PyGpuArray_STRIDES(output)[0];
+                    outputStrides_y = PyGpuArray_STRIDES(output)[1];
+                    break;
+                case 3:
+                    shape[0] = PyArray_DIMS(input)[0];
+                    shape[1] = PyArray_DIMS(input)[1];
+                    shape[2] = PyArray_DIMS(input)[2];
+                    inputStrides_x = PyGpuArray_STRIDES(input)[0];
+                    inputStrides_y = PyGpuArray_STRIDES(input)[1];
+                    inputStrides_z = PyGpuArray_STRIDES(input)[2];
+                    outputStrides_x = PyGpuArray_STRIDES(output)[0];
+                    outputStrides_y = PyGpuArray_STRIDES(output)[1];
+                    outputStrides_z = PyGpuArray_STRIDES(output)[2];
+                    break;
+                default:
+                    return -1;
+                }
+                if (shape[axis] <= 1) {
+                    output = pygpu_copy(input, GA_ANY_ORDER);
+                    return 0;
+                }
+                // Perform cumsum on array of even size.
+                size_t nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);
+                // Determine how many elements can be processed in one block.
+                size_t dimBlockX = ceil( min(nbElementsPerCumsum, 2*maxThreads) / 2.0);
+                // Determine how many blocks are needed in total.
+                size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX));  // Nb. of blocks needed per cumsum.
+                size_t dimGridY;  // Nb. of independent cumsums (width).
+                size_t dimGridZ;  // Nb. of independent cumsums (height).
+                ssize_t tmp;
+                switch (axis)
+                {
+                case 0:
+                    dimGridY = shape[1];
+                    dimGridZ = shape[2];
+                    break;
+                case 1:
+                    dimGridY = shape[0];
+                    dimGridZ = shape[2];
+                    tmp = inputStrides_x;
+                    inputStrides_x = inputStrides_y;
+                    inputStrides_y = tmp;
+                    tmp = outputStrides_x;
+                    outputStrides_x = outputStrides_y;
+                    outputStrides_y = tmp;
+                    break;
+                case 2:
+                    dimGridY = shape[1];
+                    dimGridZ = shape[0];
+                    tmp = inputStrides_x;
+                    inputStrides_x = inputStrides_z;
+                    inputStrides_z = tmp;
+                    tmp = outputStrides_x;
+                    outputStrides_x = outputStrides_z;
+                    outputStrides_z = tmp;
+                    break;
+                default:
+                    return -1;
+                }
+                const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
+                PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->typecode,
+                                                               GA_C_ORDER, input->context->ctx, Py_None);
+                if (deviceBlockSum == NULL){
+                    return -1;
+                }
+                // Perform `maxGridY`*`maxGridZ` cumsums in parallel.
+                for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
+                    size_t localDimGridY = min(dimGridY - offsetY, maxGridY);
+                    for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
+                        size_t localDimGridZ = min(dimGridZ - offsetZ, maxGridZ);
+                        size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
+                        size_t dimBlock[3] = {dimBlockX, 1, 1};  // One cumsum per block.
+                        size_t sharedBytes = (2*dimBlockX) * sizeof(float);
+                        void* kernel_params[] = {(void*) input->ga.data,
+                                                 (void*) output->ga.data,
+                                                 (void*) &nbElementsPerCumsum, 
+                                                 (void*) &inputStrides_x,
+                                                 (void*) &inputStrides_y,
+                                                 (void*) &inputStrides_z,
+                                                 (void*) &outputStrides_x,
+                                                 (void*) &outputStrides_y,
+                                                 (void*) &outputStrides_z,
+                                                 (void*) &offsetY,
+                                                 (void*) &offsetZ,
+                                                 (void*) deviceBlockSum->ga.data;
+                            };
+                        int err = GpuKernel_call(k_blockCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+
+                        if (dimGridX > 1) {
+                            // Do a cumsum over the blockSum (recursive).
+                            if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
+                                Py_DECREF(deviceBlockSum);
+                                return -1;
+                            }
+                            // Since there are more than one block (i.e. `dimGridX > 1`)
+                            //  report partial cumsums of previous blocks to subsequents ones.
+                            size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
+                            size_t dimBlock[3] = {dimBlockX, 1, 1};
+                            void* kernel_params[] = {(void*) output->ga.data,
+                                                     (void*) deviceBlockSum->ga.data, 
+                                                     (void*) &nbElementsPerCumsum,
+                                                     (void*) &outputStrides_x,
+                                                     (void*) &outputStrides_y,
+                                                     (void*) &outputStrides_z,
+                                                     (void*) &offsetY,
+                                                     (void*) &offsetZ
+                                };
+                            int err = GpuKernel_call(k_finalCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                        }
+                        // If shape[axis] is odd, the last element is compute manually
+                        if (shape[axis] != nbElementsPerCumsum){
+                            size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
+                            size_t dimBlock[3] = {1, 1, 1};
+                            void* kernel_params[] = {(void*) input->ga.data,
+                                                     (void*) output->ga.data,
+                                                     (void*) &inputStrides_x,
+                                                     (void*) &inputStrides_y,
+                                                     (void*) &inputStrides_z,
+                                                     (void*) &outputStrides_x,
+                                                     (void*) &outputStrides_y,
+                                                     (void*) &outputStrides_z,
+                                                     (void*) &offsetY,
+                                                     (void*) &offsetZ,
+                                                     (void*) &(shape[axis]-2),
+                                                     (void*) &(shape[axis]-1)
+                            };
+                            int err = GpuKernel_call(k_cumadd_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                        }
+                    }
+                }
+                Py_XDECREF(deviceBlockSum);
+                return 0;
+            }
+            """
+            return "\n".join(super(GpuKernelBase, self).c_support_code_apply(node, name), code)
+
+
 @op_lifter([CumsumOp])
-def use_gpu_cumsumop(node, axis):
-    return GpuCumsum(axis)
+def use_gpu_cumsumop(node, ctx_name):
+    return GpuCumsum(node.op.axis)

 register_gpu_opt()(use_gpu_cumsumop)
--- a/theano/sandbox/gpuarray/tests/test_extra_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_extra_ops.py
+# Skip test if cuda_ndarray is not available.
+from __future__ import absolute_import, print_function, division
+import itertools
+
+import numpy as np
+from six.moves import xrange
+
+from theano import tensor as T
+import theano
+import theano.tensor.tests.test_extra_ops
+
+from theano.tensor.extra_ops import cumsum, CumsumOp
+from theano.tests import unittest_tools as utt
+
+from .config import mode_with_gpu, test_ctx_name, test_ctx
+from ..extra_ops import GpuCumsum
+from ..type import get_context
+
+
+class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
+    mode = mode_with_gpu
+
+    def setUp(self):
+        super(TestGpuCumsum, self).setUp()
+        if get_context(test_ctx_name).kind != 'cuda':
+            raise SkipTest("Cuda specific tests")
+        self.max_threads_dim0 = test_ctx.maxlsize0
+        self.max_grid_size1 = test_ctx.maxgsize1
+
+
+
+    def test_Strides1D(self):
+        x = T.fvector('x')
+
+        for axis in [0, None, -1]:
+            a = np.random.random((42,)).astype("float32")
+            cumsum_function = theano.function([x], cumsum(x, axis=axis),
+                                              mode=self.mode)
+
+            slicings = [slice(None, None, None),    # Normal strides
+                        slice(None, None, 2),       # Stepped strides
+                        slice(None, None, -1),      # Negative strides
+                        ]
+
+            # Cartesian product of all slicings to test.
+            for slicing in itertools.product(slicings, repeat=x.ndim):
+                f = theano.function([x], cumsum(x[slicing], axis=axis),
+                                    mode=self.mode)
+                assert [n for n in f.maker.fgraph.toposort()
+                        if isinstance(n.op, GpuCumsum)]
+                utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
+                utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
+                                    cumsum_function(a[slicing]))
+
+    def test_Strides2D(self):
+        x = T.fmatrix('x')
+
+        for axis in [0, 1, None, -1, -2]:
+            a = np.random.random((42, 30)).astype("float32")
+            cumsum_function = theano.function([x], cumsum(x, axis=axis),
+                                              mode=self.mode)
+
+            slicings = [slice(None, None, None),    # Normal strides
+                        slice(None, None, 2),       # Stepped strides
+                        slice(None, None, -1),      # Negative strides
+                        ]
+
+            # Cartesian product of all slicings to test.
+            for slicing in itertools.product(slicings, repeat=x.ndim):
+                f = theano.function([x], cumsum(x[slicing], axis=axis),
+                                    mode=self.mode)
+                assert [n for n in f.maker.fgraph.toposort()
+                        if isinstance(n.op, GpuCumsum)]
+                utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
+                utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
+                                    cumsum_function(a[slicing]))
+
+    def test_Strides3D(self):
+        x = T.ftensor3('x')
+
+        for axis in [0, 1, 2, None, -1, -2, -3]:
+            a = np.random.random((42, 30, 25)).astype("float32")
+            cumsum_function = theano.function([x], cumsum(x, axis=axis),
+                                              mode=self.mode)
+
+            slicings = [slice(None, None, None),    # Normal strides
+                        slice(None, None, 2),       # Stepped strides
+                        slice(None, None, -1),      # Negative strides
+                        ]
+
+            # Cartesian product of all slicings to test.
+            for slicing in itertools.product(slicings, repeat=x.ndim):
+                f = theano.function([x], cumsum(x[slicing], axis=axis),
+                                    mode=self.mode)
+                assert [n for n in f.maker.fgraph.toposort()
+                        if isinstance(n.op, GpuCumsum)]
+                utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
+                utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
+                                    cumsum_function(a[slicing]))
+
+    def test_GpuCumsum1D(self):
+        block_max_size = self.max_threads_dim0 * 2
+
+        x = T.fvector('x')
+        f = theano.function([x], cumsum(x), mode=self.mode)
+        assert [n for n in f.maker.fgraph.toposort()
+                if isinstance(n.op, GpuCumsum)]
+
+        # Extensive testing for the first 1025 sizes
+        a = np.random.random(1025).astype("float32")
+        for i in xrange(a.shape[0]):
+            utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
+
+        # Use multiple GPU threadblocks
+        a = np.random.random((block_max_size+2,)).astype("float32")
+        utt.assert_allclose(np.cumsum(a), f(a))
+
+        # Use recursive cumsum
+        a = np.ones((block_max_size*(block_max_size+1)+2,),
+                    dtype="float32")
+        utt.assert_allclose(np.cumsum(a), f(a))
+
+    def test_GpuCumsum2D(self):
+        block_max_size = self.max_threads_dim0 * 2 
+
+        x = T.fmatrix('x')
+        for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]):
+            f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
+            assert [n for n in f.maker.fgraph.toposort()
+                    if isinstance(n.op, GpuCumsum)]
+
+            # Extensive testing for the first 1025 sizes
+            a_shape = [5, 5]
+            a_shape[shape_axis] = 1025
+            a = np.random.random(a_shape).astype("float32")
+            slices = [slice(None), slice(None)]
+            for i in xrange(a.shape[shape_axis]):
+                slices[shape_axis] = slice(i)
+                fa = f(a[slices])
+                npa = np.cumsum(a[slices], axis=axis)
+                utt.assert_allclose(npa, fa)
+
+            # Use multiple GPU threadblocks
+            a_shape = [5, 5]
+            a_shape[shape_axis] = block_max_size+2
+            a = np.random.random(a_shape).astype("float32")
+            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+
+            # Use multiple GPU gridblocks
+            a_shape = [4, 4]
+            a_shape[1-shape_axis] = self.max_grid_size1+1
+            a = np.random.random(a_shape).astype("float32")
+            utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
+
+            # Use recursive cumsum
+            a_shape = [3, 3]
+            a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
+            a = np.random.random(a_shape).astype("float32")
+            a = np.sign(a-0.5).astype("float32")  # Avoid floating point error
+            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+
+    def test_GpuCumsum3D(self):
+        block_max_size = self.max_threads_dim0 * 2
+
+        x = T.ftensor3('x')
+        for shape_axis, axis in zip([0, 1, 2, 0, 2, 1, 0], [0, 1, 2, None, -1, -2, -3]):
+            f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
+            assert [n for n in f.maker.fgraph.toposort()
+                    if isinstance(n.op, GpuCumsum)]
+
+            # Extensive testing for the first 1025 sizes
+            a_shape = [5, 5, 5]
+            a_shape[shape_axis] = 1025
+            a = np.random.rand(*a_shape).astype("float32")
+            slices = [slice(None), slice(None), slice(None)]
+            for i in xrange(a.shape[shape_axis]):
+                slices[shape_axis] = slice(i)
+                fa = f(a[slices])
+                npa = np.cumsum(a[slices], axis=axis)
+                utt.assert_allclose(npa, fa)
+
+            # Use multiple GPU threadblocks (along accumulation axis)
+            a_shape = [2, 2, 2]
+            a_shape[shape_axis] = block_max_size+2
+            a = np.random.random(a_shape).astype("float32")
+            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+
+            # Use multiple GPU gridblocks (not along accumulation axis)
+            a_shape = [5, 5, 5]
+            a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1
+            a = np.random.random(a_shape).astype("float32")
+            if axis is None:
+                # Avoid floating point error
+                a = np.sign(a-0.5).astype("float32")
+            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+
+            a_shape = [5, 5, 5]
+            a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1
+            a = np.random.random(a_shape).astype("float32")
+            if axis is None:
+                # Avoid floating point error
+                a = np.sign(a-0.5).astype("float32")
+            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+
+            # Use recursive cumsum (along accumulation axis)
+            a_shape = [3, 3, 3]
+            a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
+            a = np.random.random(a_shape).astype("float32")
+            a = np.sign(a-0.5).astype("float32")  # Avoid floating point error
+            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+
+    def test_GpuCumsum4D(self):
+        # Should not use the GPU version.
+        x = T.ftensor4('x')
+        f = theano.function([x], cumsum(x, axis=1), mode=self.mode)
+        assert [n for n in f.maker.fgraph.toposort()
+                if isinstance(n.op, CumsumOp)]