Add a 2D version cumsum

65f5d0c7 · Marc-Alexandre Cote · 6dbb2457 · 65f5d0c7 · 65f5d0c7
--- a/theano/sandbox/cuda/extra_ops.py
+++ b/theano/sandbox/cuda/extra_ops.py
@@ -46,6 +46,7 @@ class GpuCumsum(CumsumOp, GpuOp):
            cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
            prop = cuda_ndarray.device_properties(device_id)
            node_.op.max_threads_dim0 = prop['maxThreadsDim0']
+            node_.op.max_grid_size1 = prop['maxGridSize1']
        return super(GpuCumsum, node_.op).make_thunk(node_, storage_map,
                                                     compute_map, no_recycling)
@@ -56,149 +57,74 @@ class GpuCumsum(CumsumOp, GpuOp):
    def c_support_code_apply(self, node, nodename):
        axis = self.axis
        return """
-        __global__
+        __device__
-        void k_finalCumSum_1D_%(nodename)s(float* output, float* blockSum, int numElements) {
+        void k_reductionPhase_%(nodename)s(float* partialCumSum) {
-            int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
+            for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
-            if (globalThreadID < ceil(numElements/2.0)) {
-                const float currentBlockSum = blockSum[blockIdx.x];
-                output[globalThreadID*2]     += currentBlockSum;
-                output[globalThreadID*2 + 1] += currentBlockSum;
-            }
-        }
-        __global__
-        void k_cumadd_%(nodename)s(float* input, float* output, int beforeLastElementIdx, int lastElementIdx) {
-            output[lastElementIdx] = input[lastElementIdx] + output[beforeLastElementIdx];
-        }
-        __global__
-        void k_blockCumSum_1D_%(nodename)s(float* input, float* output, int numElements, float* blockSum) {
-            int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
-            if (globalThreadID < ceil(numElements/2.0)) {
-                extern __shared__ float partialCumSum[];
-                // Load data in shared memory
-                partialCumSum[threadIdx.x*2]     = input[globalThreadID*2];
-                partialCumSum[threadIdx.x*2 + 1] = input[globalThreadID*2 + 1];
-                // Reduction Phase
-                int stride;
-                for (stride = 1; stride <= blockDim.x; stride *= 2) {
                __syncthreads();
-                    int index = (threadIdx.x + 1) * (stride * 2) - 1;
+                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if(index < blockDim.x*2) {
                    partialCumSum[index] += partialCumSum[index - stride];
                }
            }
+        }
-                // Reverse Phase
+        __device__
-                for (; stride > 0; stride /= 2) {
+        void k_reversePhase_%(nodename)s(float* partialCumSum) {
+            for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
                __syncthreads();
-                    int index = (threadIdx.x + 1) * (stride * 2) - 1;
+                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if(index + stride < blockDim.x*2) {
                    partialCumSum[index + stride] += partialCumSum[index];
                }
            }
-                // Write the final output to global memory
-                __syncthreads();
-                output[globalThreadID*2]     = partialCumSum[threadIdx.x*2];
-                output[globalThreadID*2 + 1] = partialCumSum[threadIdx.x*2 + 1];
-                if (blockSum != NULL){
-                    if (threadIdx.x == blockDim.x - 1) {
-                        blockSum[blockIdx.x] = partialCumSum[threadIdx.x*2 + 1];
-                    }
-                }
-            }
        }
-        void cumSum_1D_%(nodename)s(CudaNdarray* input, CudaNdarray* output, npy_intp* shape, int maxThreads) {
+        __device__
+        void k_fetchData_%(nodename)s(float* partialCumSum, float* input, int globalThreadID, dim3 dataStrides, int dataOffset) {
-            if (shape[0] <= 1) {
+            // blockIdx.y represents the # of the current independent cumsum
-                CudaNdarray_CopyFromCudaNdarray(output, input);
+            partialCumSum[threadIdx.x*2]     = input[(globalThreadID*2    ) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y];
-                return;
+            partialCumSum[threadIdx.x*2 + 1] = input[(globalThreadID*2 + 1) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y];
        }
-            int numElements = shape[0] - (shape[0] %% 2);
+        __device__
-            int blockSize = ceil( min(numElements, 2*maxThreads) / 2.0);
+        void k_pushData_%(nodename)s(float* partialCumSum, float* output, int globalThreadID, dim3 dataStrides, int dataOffset) {
-            int dimGridX = ceil(numElements / (2.0*blockSize));
+            __syncthreads();
-            npy_intp shapeBlockSum[1] = { dimGridX };
+            // blockIdx.y represents the # of the current independent cumsum
-            CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(1, shapeBlockSum);
+            output[(globalThreadID*2    ) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y] = partialCumSum[threadIdx.x*2];
+            output[(globalThreadID*2 + 1) * dataStrides.x + (blockIdx.y + dataOffset) * dataStrides.y] = partialCumSum[threadIdx.x*2 + 1];
-            dim3 dimBlock(blockSize, 1, 1);
-            dim3 dimGrid(dimGridX, 1, 1);
-            int sharedBytes = (2*blockSize) * sizeof(float);
-            cudaThreadSynchronize();
-            k_blockCumSum_1D_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
-            (
-                CudaNdarray_DEV_DATA(input),
-                CudaNdarray_DEV_DATA(output),
-                numElements,
-                CudaNdarray_DEV_DATA(deviceBlockSum)
-            );
-            if (dimGridX > 1) {
-                // Do a cumsum over the blockSum (recursive).
-                cumSum_1D_%(nodename)s(deviceBlockSum, deviceBlockSum, shapeBlockSum, maxThreads);
-                dim3 dimGrid(dimGridX-1, 1, 1);
-                dim3 dimBlock(blockSize, 1, 1);
-                k_finalCumSum_1D_%(nodename)s<<<dimGrid, dimBlock>>>
-                (
-                    CudaNdarray_DEV_DATA(output),
-                    CudaNdarray_DEV_DATA(deviceBlockSum),
-                    numElements
-                );
-            }
-            // If shape[0] is odd, the last element is compute manually
-            if (shape[0] != numElements){
-                cudaThreadSynchronize();
-                k_cumadd_%(nodename)s<<<dim3(1,1,1), dim3(1,1,1)>>>
-                (
-                    CudaNdarray_DEV_DATA(input),
-                    CudaNdarray_DEV_DATA(output),
-                    shape[0]-2,
-                    shape[0]-1
-                );
        }
-            cudaFree(CudaNdarray_DEV_DATA(deviceBlockSum));
+        __global__
-            cudaThreadSynchronize();
+        void k_cumadd_%(nodename)s(float* input, float* output, dim3 dataStrides, int dataOffset, int beforeLastElementIdx, int lastElementIdx) {
+            int dataOffsetY = (blockIdx.y + dataOffset) * dataStrides.y;
+            output[lastElementIdx*dataStrides.x + dataOffsetY] = input[lastElementIdx*dataStrides.x + dataOffsetY]
+                                                               + output[beforeLastElementIdx*dataStrides.x + dataOffsetY];
        }
        __global__
-        void k_finalCumSum_2D_axis1_%(nodename)s(float* output, float* blockSum, int numElements, dim3 dataStrides) {
+        void k_finalCumSum_%(nodename)s(float* output, float* blockSum, int numElements, dim3 dataStrides, int dataOffset) {
-            int globalThreadID = (blockIdx.y + 1) * blockDim.y + threadIdx.y;
+            int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
            // Check if current has data to process.
            if (globalThreadID >= ceil(numElements/2.0)) {
                return;
            }
-            const float currentBlockSum = blockSum[blockIdx.x*gridDim.y + blockIdx.y];
+            const float currentBlockSum = blockSum[blockIdx.x*gridDim.y + blockIdx.y + dataOffset];
-            output[globalThreadID*2     + blockIdx.x*dataStrides.x] += currentBlockSum;
+            int dataOffsetY = (blockIdx.y + dataOffset) * dataStrides.y;
-            output[globalThreadID*2 + 1 + blockIdx.x*dataStrides.x] += currentBlockSum;
+            output[(globalThreadID*2    ) * dataStrides.x + dataOffsetY] += currentBlockSum;
+            output[(globalThreadID*2 + 1) * dataStrides.x + dataOffsetY] += currentBlockSum;
        }
        __global__
-        void k_cumadd_2D_axis1_%(nodename)s(float* input, float* output, int beforeLastElementIdx, int lastElementIdx) {
+        void k_blockCumSum_%(nodename)s(float* input, float* output, int numElements, dim3 dataStrides, int dataOffset, float* blockSum) {
-            output[blockIdx.x*(lastElementIdx+1) + lastElementIdx] = input[blockIdx.x*(lastElementIdx+1) + lastElementIdx]
+            // Regarding blockIdx and threadIdx, 'Cumsum' is always perform along the X axis.
-                                                                     + output[blockIdx.x*(lastElementIdx+1) + beforeLastElementIdx];
+            // The Y axis will contain all the independent cumsums of the 2D case.
-        }
-        __global__
+            int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
-        void k_blockCumSum_2D_axis1_%(nodename)s(float* input, float* output, int numElements, dim3 dataStrides, float* blockSum) {
-            int globalThreadID = blockIdx.y * blockDim.y + threadIdx.y;
-            // Check if current has data to process.
+            // Check if current thread has data to process.
            if (globalThreadID >= ceil(numElements/2.0)) {
                return;
            }
@@ -206,102 +132,109 @@ class GpuCumsum(CumsumOp, GpuOp):
            extern __shared__ float partialCumSum[];
            // Load data in shared memory
-            partialCumSum[threadIdx.y*2]     = input[globalThreadID*2     + blockIdx.x*dataStrides.x];
+            k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, dataStrides, dataOffset);
-            partialCumSum[threadIdx.y*2 + 1] = input[globalThreadID*2 + 1 + blockIdx.x*dataStrides.x];
-            // Reduction Phase
+            k_reductionPhase_%(nodename)s(partialCumSum);
-            int stride;
+            k_reversePhase_%(nodename)s(partialCumSum);
-            for (stride = 1; stride <= blockDim.y; stride *= 2) {
-                __syncthreads();
-                int index = (threadIdx.y + 1) * (stride * 2) - 1;
-                if(index < blockDim.y*2) {
-                    partialCumSum[index] += partialCumSum[index - stride];
-                }
-            }
-            // Reverse Phase
-            for (; stride > 0; stride /= 2) {
-                __syncthreads();
-                int index = (threadIdx.y + 1) * (stride * 2) - 1;
-                if(index + stride < blockDim.y*2) {
-                    partialCumSum[index + stride] += partialCumSum[index];
-                }
-            }
            // Write the final output to global memory
-            __syncthreads();
+            k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, dataStrides, dataOffset);
-            output[globalThreadID*2     + blockIdx.x*dataStrides.x] = partialCumSum[threadIdx.y*2];
-            output[globalThreadID*2 + 1 + blockIdx.x*dataStrides.x] = partialCumSum[threadIdx.y*2 + 1];
            if (blockSum != NULL){
-                if (threadIdx.y == blockDim.y - 1) {
+                if (threadIdx.x == blockDim.x - 1) {
-                    blockSum[blockIdx.x*gridDim.y + blockIdx.y] = partialCumSum[threadIdx.y*2 + 1];
+                    blockSum[blockIdx.x*gridDim.y + blockIdx.y + dataOffset] = partialCumSum[threadIdx.x*2 + 1];
                }
            }
        }
-        void cumSum_2D_axis1_%(nodename)s(CudaNdarray* input, CudaNdarray* output, const int* shape, int maxThreads) {
+        void cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int maxThreads, int axis, int maxGridY) {
-            int axis = 1; // Convert into a parameter
+            int shape[2] = { 1, 1 };
+            dim3 dataStrides(0,0,0);
+            switch (CudaNdarray_NDIM(input))
+            {
+            case 1:
+                shape[0] = CudaNdarray_HOST_DIMS(input)[0];
+                dataStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
+                break;
+            case 2:
+                shape[0] = CudaNdarray_HOST_DIMS(input)[0];
+                shape[1] = CudaNdarray_HOST_DIMS(input)[1];
+                dataStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
+                dataStrides.y = CudaNdarray_HOST_STRIDES(input)[1];
+                break;
+            default: printf("Only 1D and 2D cumsum is implemented yet.\\n");
+            }
            if (shape[axis] <= 1) {
                CudaNdarray_CopyFromCudaNdarray(output, input);
                return;
            }
+            if (axis == 1) {
+                int tmp = dataStrides.x;
+                dataStrides.x = dataStrides.y;
+                dataStrides.y = tmp;
+            }
            int numElements = shape[axis] - (shape[axis] %% 2);
            int blockSize = ceil( min(numElements, 2*maxThreads) / 2.0);
-            int dimGridX = shape[0];
+            int dimGridX = ceil(numElements / (2.0*blockSize));  // Nb. of elements to perform cumsum on.
-            int dimGridY = ceil(numElements / (2.0*blockSize));
+            int dimGridY = shape[1-axis];                        // Nb. of independent cumsums.
            const int shapeBlockSum[2] = { dimGridX, dimGridY };
            //CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum);
            CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_ZEROS(2, (int*)shapeBlockSum);
+            for (int dataOffset = 0; dataOffset < dimGridY; dataOffset += maxGridY){
-            dim3 dimBlock(1, blockSize, 1);
+                int localDimGridY = min(dimGridY - dataOffset, maxGridY);
-            dim3 dimGrid(dimGridX, dimGridY, 1);
+                dim3 dimBlock(blockSize, 1, 1);
+                dim3 dimGrid(dimGridX, localDimGridY, 1);
                int sharedBytes = (2*blockSize) * sizeof(float);
-            dim3 dataStrides(CudaNdarray_HOST_STRIDES(input)[0], CudaNdarray_HOST_STRIDES(input)[1], 0);
                cudaThreadSynchronize();
-            k_blockCumSum_2D_axis1_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
+                k_blockCumSum_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
                (
                    CudaNdarray_DEV_DATA(input),
                    CudaNdarray_DEV_DATA(output),
                    numElements,
                    dataStrides,
+                    dataOffset,
                    CudaNdarray_DEV_DATA(deviceBlockSum)
                );
-            if (dimGridY > 1) {
+                if (dimGridX > 1) {
                    // Do a cumsum over the blockSum (recursive).
-                cumSum_2D_axis1_%(nodename)s(deviceBlockSum, deviceBlockSum, shapeBlockSum, maxThreads);
+                    cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, maxThreads, 0, maxGridY);
                    dim3 dimGrid(dimGridX, dimGridY, 1);
-                dim3 dimBlock(1, blockSize, 1);
+                    dim3 dimBlock(blockSize, 1, 1);
-                k_finalCumSum_2D_axis1_%(nodename)s<<<dimGrid, dimBlock>>>
+                    k_finalCumSum_%(nodename)s<<<dimGrid, dimBlock>>>
                    (
                        CudaNdarray_DEV_DATA(output),
                        CudaNdarray_DEV_DATA(deviceBlockSum),
                        numElements,
-                    dataStrides
+                        dataStrides,
+                        dataOffset
                    );
                }
                // If shape[axis] is odd, the last element is compute manually
                if (shape[axis] != numElements){
                    cudaThreadSynchronize();
-                dim3 dimGrid(dimGridX, 1, 1);
+                    dim3 dimGrid(1, localDimGridY, 1);
                    dim3 dimBlock(1, 1, 1);
-                k_cumadd_2D_axis1_%(nodename)s<<<dimGrid, dimBlock>>>
+                    k_cumadd_%(nodename)s<<<dimGrid, dimBlock>>>
                    (
                        CudaNdarray_DEV_DATA(input),
                        CudaNdarray_DEV_DATA(output),
+                        dataStrides,
+                        dataOffset,
                        shape[axis]-2,
                        shape[axis]-1
                    );
                }
+            }
            cudaFree(CudaNdarray_DEV_DATA(deviceBlockSum));
            cudaThreadSynchronize();
@@ -316,18 +249,17 @@ class GpuCumsum(CumsumOp, GpuOp):
        sub = sub.copy()
        max_threads_dim0 = self.max_threads_dim0
-        if max_threads_dim0 is None:
+        max_grid_size1 = self.max_grid_size1
+        if max_threads_dim0 is None or max_grid_size1 is None:
            raise NotImplementedError("GpuConv.c_code should not be called "
                                      "directly. It should be called by "
                                      "make_thunk() that add some information "
                                      "related to the selected GPU.")
        sub.update(locals())
-        #Right now, only the 1D case works.
        if self.axis is None or (self.axis == 0 and node.inputs[0].ndim == 1):
            code = """
-                npy_intp shape[1] = { CudaNdarray_SIZE(%(x)s) };
+                const int* shape = CudaNdarray_HOST_DIMS(%(x)s);
                if(! (%(z)s && CudaNdarray_HOST_DIMS(%(z)s)[0] == shape[0]) ) {
                    Py_XDECREF(%(z)s);
                    %(z)s = (CudaNdarray*) CudaNdarray_NewDims(1, shape);
@@ -338,7 +270,7 @@ class GpuCumsum(CumsumOp, GpuOp):
                }
                { // Namespace for kernel calls //
-                    cumSum_1D_%(nodename)s(%(x)s, %(z)s, shape, %(max_threads_dim0)s);
+                    cumSum_%(nodename)s(%(x)s, %(z)s, %(max_threads_dim0)s, 0, %(max_grid_size1)s);
                    cudaError_t sts = cudaGetLastError();
                    if (cudaSuccess != sts)
@@ -351,7 +283,7 @@ class GpuCumsum(CumsumOp, GpuOp):
                    }
                }
            """ % locals()
-        elif node.inputs[0].ndim == 2 and self.axis == 1: 
+        elif node.inputs[0].ndim == 2: 
            code = """
                const int* shape = CudaNdarray_HOST_DIMS(%(x)s);
                bool needAllocation = !%(z)s || CudaNdarray_NDIM(%(x)s) != CudaNdarray_NDIM(%(z)s);
@@ -375,7 +307,7 @@ class GpuCumsum(CumsumOp, GpuOp):
                }
                { // Namespace for kernel calls //
-                    cumSum_2D_axis1_%(nodename)s(%(x)s, %(z)s, shape, %(max_threads_dim0)s);
+                    cumSum_%(nodename)s(%(x)s, %(z)s, %(max_threads_dim0)s, %(axis)s, %(max_grid_size1)s);
                    cudaError_t sts = cudaGetLastError();
                    if (cudaSuccess != sts)

--- a/theano/sandbox/cuda/tests/test_extra_ops.py
+++ b/theano/sandbox/cuda/tests/test_extra_ops.py
@@ -17,7 +17,7 @@ from theano import tensor as T
 import numpy as np
 import theano
 from theano import config
-from theano.tensor.extra_ops import cumsum
+from theano.tensor.extra_ops import cumsum, diff
 from mlpython.misc.utils import Timer
@@ -26,123 +26,148 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
    op = GpuCumsum
    dtypes = ['float32']
-    def test_GpuCumsum(self):
+    def test_benchmark_1D_vs_2D(self):
-        ### Test 1D case ###
+        print "\nBenchmark:"
-        x = T.vector('x')
-        f = theano.function([x], cumsum(x))
-        # # Even number of elements
+        from theano import sandbox, Out
-        # a = np.random.random((18,)).astype(config.floatX)
+        import time
-        # assert np.allclose(np.cumsum(a), f(a))
-        # # Odd number of elements
+        vlen = 40 * 1024 * 2048  # 10 x # cores x # threads per core
-        # a = np.random.random((7,)).astype(config.floatX)
+        iters = 25
-        # assert np.allclose(np.cumsum(a), f(a))
-        # # Use multiple GPU threadblocks
+        x = theano.shared(np.ones((vlen,), dtype=config.floatX), borrow=False)
-        # a = np.random.random((2048+2,)).astype(config.floatX)
+        res = Out(sandbox.cuda.basic_ops.gpu_from_host(cumsum(x)), borrow=True)
-        # assert np.allclose(np.cumsum(a), f(a))
+        f = theano.function([], res)
-        # # Use multiple GPU threadblocks
+        print f.maker.fgraph.toposort()
-        # a = np.random.random((2048*75+2,)).astype(config.floatX)
+        t0 = time.time()
-        # assert np.allclose(np.cumsum(a), f(a))
+        for i in xrange(iters):
+            r = f()
+        t1 = time.time()
-        # # Use multiple GPU gridblocks
+        print 'Looping %d times took' % iters, t1 - t0, 'seconds'
-        # a = np.ones((2048*2048+2,)).astype(config.floatX)
+        print 'Result is', r
-        # assert np.allclose(np.cumsum(a), f(a))
+        print 'Numpy result is', np.asarray(r)
-        print "\nBenchmark:"
-        import timeit as t
+        # x = theano.shared(np.ones((1,vlen), dtype=config.floatX), borrow=True)
-        #theano_time = t.timeit("np.ones((100,))", "import numpy as np", number=1000)
+        # f = theano.function([], Out(sandbox.cuda.basic_ops.gpu_from_host(cumsum(x,axis=1)), borrow=True))
+        # print f.maker.fgraph.toposort()
+        # t0 = time.time()
+        # for i in xrange(iters):
+        #     r = f()
+        # t1 = time.time()
-        stmt = "f(a)"
+        # print 'Looping %d times took' % iters, t1 - t0, 'seconds'
-        setup = """
+        # print 'Result is', r
-        import numpy as np
+        # print 'Numpy result is', np.asarray(r)
-        import theano
-        import theano.tensor as T
+        # print 'Used the', config.device
-        from theano.tensor.extra_ops import cumsum
-        from theano import config
+    def test_GpuCumsum(self):
+        ### Test 1D case ###
        x = T.vector('x')
        f = theano.function([x], cumsum(x))
-        a = np.ones((100000,), dtype=config.floatX)
-        """.replace("        ", "")
+        # Even number of elements
-        theano_time = t.timeit(stmt, setup, number=1000)
+        a = np.random.random((18,)).astype(config.floatX)
-        print "Theano:\t", theano_time
+        print f(a)
+        print np.cumsum(a)
-        stmt = "np.cumsum(a)"
+        assert np.allclose(np.cumsum(a), f(a))
-        setup = """
-        import numpy as np
+        # Odd number of elements
-        from theano import config
+        a = np.random.random((7,)).astype(config.floatX)
-        a = np.ones((100000,), dtype=config.floatX)
+        assert np.allclose(np.cumsum(a), f(a))
-        """.replace("        ", "")
-        numpy_time = t.timeit(stmt, setup, number=1000)
+        # Use multiple GPU threadblocks
-        print "Numpy:\t", numpy_time
+        a = np.random.random((2048+2,)).astype(config.floatX)
-        print "Speedup: {0}x".format(numpy_time/theano_time)
+        assert np.allclose(np.cumsum(a), f(a))
+        # Use multiple GPU threadblocks
-        # # Extensive testing
+        a = np.random.random((2048*75+2,)).astype(config.floatX)
-        # i = 0;
+        assert np.allclose(np.cumsum(a), f(a))
-        # while True:
-        #     a = np.ones((i,), dtype=config.floatX)
+        # Use multiple GPU gridblocks
+        a = np.ones((2048*2048+2,)).astype(config.floatX)
-        #     fa = f(a)
+        assert np.allclose(np.cumsum(a), f(a))
-        #     npa = np.cumsum(a)
-        #     if not np.allclose(npa, fa):
+        # Extensive testing
-        #         print i, np.allclose(npa, fa)  # Test axis=None
+        for i in xrange(int(1e3)*5):
-        #         print fa
+            a = np.ones((i,), dtype=config.floatX)
-        #         print npa
-        #         assert False
+            fa = f(a)
+            npa = np.cumsum(a)
-        #     if i % 1000 == 0:
-        #         print i
+            if not np.allclose(npa, fa):
+                print i, np.allclose(npa, fa)  # Test axis=None
-        #     i += 1
+                print fa
+                print npa
+                assert False
-        # ### Test 2D case - axis=1 ###
-        # x = T.matrix('x')
+            if i % 1000 == 0:
-        # f = theano.function([x], cumsum(x, axis=1))
+                print i
-        # # # Even number of elements
-        # # print "\n# Even number of elements"
+        #for axis in xrange(2):
-        # # a = np.random.random((18,18)).astype(config.floatX)
+        for axis in xrange(2):
-        # # assert np.allclose(np.cumsum(a, axis=1), f(a))
+            ### Test 2D case - axis=1 ###
+            x = T.matrix('x')
-        # # # Odd number of elements
+            f = theano.function([x], cumsum(x, axis=axis))
-        # # print "\n# Odd number of elements"
-        # # assert np.allclose(np.cumsum(a, axis=1), f(a))
+            # Even number of elements
+            print "\n# Even number of elements (axis={0})".format(axis)
-        # # # Use multiple GPU threadblocks
+            a = np.random.random((18,18)).astype(config.floatX)
-        # # print "\n# Use multiple GPU threadblocks"
+            assert np.allclose(np.cumsum(a, axis=axis), f(a))
-        # # a = np.random.random((2048+2,2048+2)).astype(config.floatX)
-        # # assert np.allclose(np.cumsum(a, axis=1), f(a))
+            # Odd number of elements
+            print "\n# Odd number of elements (axis={0})".format(axis)
-        # # # Use multiple GPU threadblocks
+            a = np.random.random((21,21)).astype(config.floatX)
-        # # print "\n# Use multiple GPU threadblocks"
+            assert np.allclose(np.cumsum(a, axis=axis), f(a))
-        # # a = np.ones((10,2048*75+3)).astype(config.floatX)
-        # # assert np.allclose(np.cumsum(a, axis=1), f(a))
+            # Use two GPU threadblocks
+            print "\n# Use two GPU threadblocks (axis={0})".format(axis)
-        # # # Use multiple GPU gridblocks
+            a = np.random.random((2048+2,2048+2)).astype(config.floatX)
-        # # print "\n# Use multiple GPU gridblocks"
+            assert np.allclose(np.cumsum(a, axis=axis), f(a))
-        # # a = np.ones((11,2048*2048+3)).astype(config.floatX)
-        # # assert np.allclose(np.cumsum(a, axis=1), f(a))
+            # Use multiple GPU threadblocks
+            print "\n# Use multiple GPU threadblocks (axis={0})".format(axis)
-        # # Extensive testing
+            a = np.ones((10,2048*75+3)).astype(config.floatX)
-        # i = 19000;
+            assert np.allclose(np.cumsum(a, axis=axis), f(a))
-        # while True:
-        #     a = np.ones((11,i), dtype=config.floatX)
+            a = np.ones((2048*75+3,10)).astype(config.floatX)
-        #     fa = f(a)
+            assert np.allclose(np.cumsum(a, axis=axis), f(a))
-        #     npa = np.cumsum(a, axis=1)
+            # Use multiple GPU gridblocks
-        #     if not np.allclose(npa, fa):
+            print "\n# Use multiple GPU gridblocks (axis={0})".format(axis)
-        #         print i, np.allclose(npa, fa)  # Test axis=None
+            a = np.ones((11,2048*2048+3)).astype(config.floatX)
-        #         print fa
+            assert np.allclose(np.cumsum(a, axis=axis), f(a))
-        #         print npa
+            a = np.ones((2048*2048+3,11)).astype(config.floatX)
-        #         assert False
+            assert np.allclose(np.cumsum(a, axis=axis), f(a))
-        #     if i % 1000 == 0:
+            # Extensive testing for the first 10k sizes
-        #         print i
+            for i in xrange(int(1e3)*5):
+                a = np.ones((11,i), dtype=config.floatX)
-        #     i += 1
+                fa = f(a)
+                npa = np.cumsum(a, axis=axis)
+                if not np.allclose(npa, fa):
+                    print i, np.allclose(npa, fa)  # Test axis=None
+                    print fa
+                    print npa
+                    assert False
+                a = np.ones((i,11), dtype=config.floatX)
+                fa = f(a)
+                npa = np.cumsum(a, axis=axis)
+                if not np.allclose(npa, fa):
+                    print i, np.allclose(npa, fa)  # Test axis=None
+                    print fa
+                    print npa
+                    assert False
+                if i % 1000 == 0:
+                    print i
\ No newline at end of file