Merge pull request #5317 from khaotik/cumop

Merge CumsumOp/CumprodOp into CumOp

Merge pull request #5317 from khaotik/cumop
90dd93d0 · Frédéric Bastien · GitHub · 170aff07 · 1455b49c · 90dd93d0
--- a/theano/gpuarray/extra_ops.py
+++ b/theano/gpuarray/extra_ops.py
 from __future__ import absolute_import, print_function, division
 import os
 from theano import Apply, Op
-from theano.tensor.extra_ops import CumsumOp
+from theano.tensor.extra_ops import CumOp
 from .basic_ops import infer_context_name
 try:
    from pygpu import gpuarray
@@ -12,7 +12,7 @@ from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape)
 from .opt import register_opt, op_lifter, register_opt2


-class GpuCumsum(GpuKernelBase, Op):
+class GpuCumOp(GpuKernelBase, Op):
    """
    Parameters
    ----------
@@ -20,10 +20,19 @@ class GpuCumsum(GpuKernelBase, Op):
        Can not be None. If you want the array flattened, do it before.
    """
    SUPPORTED_NDIMS = 3
-    __props__ = ('axis',)
+    __props__ = ('axis', 'mode')

-    def __init__(self, axis):
-        self.axis = axis
+    def __init__(self, axis, mode='add'):
+        self.axis = axis if axis else 0
+        self.mode = mode
+
+    def __eq__(self, other):
+        if type(other) != type(self):
+            return False
+        return self.axis == other.axis and self.mode == other.mode
+
+    def __hash__(self):
+        return hash(self.axis) ^ hash(self.mode)

    def c_code_cache_version(self):
        return (3,)
@@ -38,14 +47,14 @@ class GpuCumsum(GpuKernelBase, Op):
        return node.inputs[0].type.context

    def make_node(self, x):
-        assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
+        assert x.type.dtype == 'float32', "Only float32 supported for GpuCumOp"

        context_name = infer_context_name(x)

        x = as_gpuarray_variable(x, context_name)

-        if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
-            raise NotImplementedError('Only cumsum on 1D, 2D and\
+        if x.ndim > GpuCumOp.SUPPORTED_NDIMS:
+            raise NotImplementedError('Only cum op on 1D, 2D and\
                                       3D arrays are supported right now!')

        if self.axis >= x.ndim or self.axis < -x.ndim:
@@ -56,6 +65,7 @@ class GpuCumsum(GpuKernelBase, Op):
        kernels = []
        # cumadd
        kname = "k_cumadd"
+        op = {'mul': '*', 'add': '+'}[self.mode]
        k_var = "k_cumadd_" + nodename
        dtype_x = node.inputs[0].dtype
        flags = Kernel.get_flags(dtype_x)
@@ -75,7 +85,7 @@ class GpuCumsum(GpuKernelBase, Op):
            int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input;
            int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output;
            int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output;
-            output[idx_last_output] = input[idx_last_input] + output[idx_beforelast];
+            output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
            }
        """ % locals()
        params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE,
@@ -86,9 +96,9 @@ class GpuCumsum(GpuKernelBase, Op):
                  ]
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
-        # blockCumSum
-        kname = "k_blockCumSum"
-        k_var = "k_blockCumSum_" + nodename
+        # blockCumOp
+        kname = "k_blockCumOp"
+        k_var = "k_blockCumOp_" + nodename
        params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
@@ -96,109 +106,108 @@ class GpuCumsum(GpuKernelBase, Op):
        code = """
        // helper functions
        WITHIN_KERNEL
-        void k_reductionPhase(float* partialCumSum) {
+        void k_reductionPhase(float* partialCumOp) {
            // Traverse down from leaves to root building partial sums at internal nodes in the tree.
            for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
                local_barrier();
                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if(index < blockDim.x*2) {
-                    partialCumSum[index] += partialCumSum[index - stride];
+                    partialCumOp[index] %(op)s= partialCumOp[index - stride];
                }
            }
        }

        WITHIN_KERNEL
-        void k_fetchData(float* partialCumSum, float* input, int globalThreadID,
+        void k_fetchData(float* partialCumOp, float* input, int globalThreadID,
                         ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                         int offsetY, int offsetZ) {
-            // blockIdx.y and blockIdx.z represents the current independent cumsum
+            // blockIdx.y and blockIdx.z represents the current independent cum op
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z;
            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
-            partialCumSum[threadIdx.x*2]     = input[idx_even];
-            partialCumSum[threadIdx.x*2 + 1] = input[idx_odd];
+            partialCumOp[threadIdx.x*2]     = input[idx_even];
+            partialCumOp[threadIdx.x*2 + 1] = input[idx_odd];
        }

        WITHIN_KERNEL
-        void k_reversePhase(float* partialCumSum) {
+        void k_reversePhase(float* partialCumOp) {
            // Traverse back up the tree building the scan from the partial sums
            for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
                local_barrier();
                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if(index + stride < blockDim.x*2) {
-                    partialCumSum[index + stride] += partialCumSum[index];
+                    partialCumOp[index + stride] %(op)s= partialCumOp[index];
                }
            }
        }

        WITHIN_KERNEL
-        void k_pushData(float* partialCumSum, float* output, int globalThreadID,
+        void k_pushData(float* partialCumOp, float* output, int globalThreadID,
                        ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                        int offsetY, int offsetZ) {
            local_barrier();
-            // blockIdx.y and blockIdx.z represents the current independent cumsum
+            // blockIdx.y and blockIdx.z represents the current independent cum op
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            int offset = idY * dataStrides_y + idZ * dataStrides_z;
            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
-            output[idx_even] = partialCumSum[threadIdx.x*2];
-            output[idx_odd]  = partialCumSum[threadIdx.x*2 + 1];
+            output[idx_even] = partialCumOp[threadIdx.x*2];
+            output[idx_odd]  = partialCumOp[threadIdx.x*2 + 1];
        }

-        KERNEL void k_blockCumSum(float* input, float* output,
-                                        size_t nbElementsPerCumsum, ga_ssize inputStrides_x,
+        KERNEL void k_blockCumOp(float* input, float* output,
+                                        size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
                                        ga_ssize inputStrides_y,  ga_ssize inputStrides_z,
                                        ga_ssize outputStrides_x, ga_ssize outputStrides_y,
                                        ga_ssize outputStrides_z, int offsetY,
                                        int offsetZ, float* blockSum) {
-            // Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis.
-            // The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case.
+            // Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
+            // The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.

            int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;

            // Check if current thread has data to process.
-            if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
+            if (globalThreadID >= (nbElementsPerCumOp+1)/2) {
                return;
            }

-            extern __shared__ float partialCumSum[];
+            extern __shared__ float partialCumOp[];

            // Load data in shared memory
-            k_fetchData(partialCumSum, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);
+            k_fetchData(partialCumOp, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);

-            // Use a dichotomy approach to compute the cumsum (i.e. balanced binary tree).
+            // Use a dichotomy approach to compute the cum op (i.e. balanced binary tree).
            // The tree is sweeped from the leaves to the root and from the root to the leaves.
            // Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
-            k_reductionPhase(partialCumSum);
-            k_reversePhase(partialCumSum);
+            k_reductionPhase(partialCumOp);
+            k_reversePhase(partialCumOp);

            // Write the final output to global memory
-            k_pushData(partialCumSum, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
+            k_pushData(partialCumOp, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);

            if (blockSum != NULL){
                if (threadIdx.x == blockDim.x - 1) {
-                    blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumSum[threadIdx.x*2 + 1];
+                    blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumOp[threadIdx.x*2 + 1];
                }
            }
        }
-        """
+        """ % locals()
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
-        # k_finalCumSum
-        kname = "k_finalCumSum"
-        k_var = "k_finalCumSum_" + nodename
+        # k_finalCumOp
+        kname = "k_finalCumOp"
+        k_var = "k_finalCumOp_" + nodename
        code = """
-        KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum,
+        KERNEL void k_finalCumOp(float* output, float* blockSum, size_t nbElementsPerCumOp,
                                               ga_ssize dataStrides_x,  ga_ssize dataStrides_y,  ga_ssize dataStrides_z,
                                               int offsetY, int offsetZ) {
            int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;

            // Check if current has data to process.
-            if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
+            if (globalThreadID >= (nbElementsPerCumOp+1)/2)
                return;
-            }

            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
@@ -208,10 +217,10 @@ class GpuCumsum(GpuKernelBase, Op):
            int offset = idY * dataStrides_y + idZ * dataStrides_z;
            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
-            output[idx_even] += currentBlockSum;
-            output[idx_odd] += currentBlockSum;
+            output[idx_even] %(op)s= currentBlockSum;
+            output[idx_odd] %(op)s= currentBlockSum;
        }
-        """
+        """ % locals()
        params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
                  gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
                  'int32', 'int32', ]
@@ -263,7 +272,7 @@ class GpuCumsum(GpuKernelBase, Op):
                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
                    %(fail)s;
                }
-                if (cumSum_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){
+                if (cumOp_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){
                    %(fail)s;
                }
            }
@@ -274,7 +283,7 @@ class GpuCumsum(GpuKernelBase, Op):
    def c_support_code_struct(self, node, nodename):
        code = """

-        int cumSum_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
+        int cumOp_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
            size_t shape[3] = { 1, 1, 1 };
            ssize_t inputStrides_x;
            ssize_t inputStrides_y;
@@ -316,14 +325,14 @@ class GpuCumsum(GpuKernelBase, Op):
                int err = pygpu_move(output, input);
                return err;
            }
-            // Perform cumsum on array of even size.
-            size_t nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);
+            // Perform cum op on array of even size.
+            size_t nbElementsPerCumOp = shape[axis] - (shape[axis] %% 2);
            // Determine how many elements can be processed in one block.
-            size_t dimBlockX = ceil((nbElementsPerCumsum > 2*maxThreads ? 2*maxThreads : nbElementsPerCumsum) / 2.0);
+            size_t dimBlockX = ((nbElementsPerCumOp > 2*maxThreads ? 2*maxThreads : nbElementsPerCumOp)+1)/2;
            // Determine how many blocks are needed in total.
-            size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX));  // Nb. of blocks needed per cumsum.
-            size_t dimGridY;  // Nb. of independent cumsums (width).
-            size_t dimGridZ;  // Nb. of independent cumsums (height).
+            size_t dimGridX = (nbElementsPerCumOp+2*dimBlockX-1) / (2*dimBlockX);  // Nb. of blocks needed per cum op.
+            size_t dimGridY;  // Nb. of independent cum ops (width).
+            size_t dimGridZ;  // Nb. of independent cum ops (height).
            ssize_t tmp;
            switch (axis)
            {
@@ -365,18 +374,18 @@ class GpuCumsum(GpuKernelBase, Op):
            if (deviceBlockSum == NULL){
                return -1;
            }
-            // Perform `maxGridY`*`maxGridZ` cumsums in parallel.
+            // Perform `maxGridY`*`maxGridZ` cum ops in parallel.
            for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
                size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY);

                for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
                    size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ);
                    size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
-                    size_t dimBlock[3] = {dimBlockX, 1, 1};  // One cumsum per block.
+                    size_t dimBlock[3] = {dimBlockX, 1, 1};  // One cum op per block.
                    size_t sharedBytes = (2*dimBlockX) * sizeof(float);
                    void* kernel_params[] = {(void*) input->ga.data,
                                             (void*) output->ga.data,
-                                             (void*) &nbElementsPerCumsum,
+                                             (void*) &nbElementsPerCumOp,
                                             (void*) &inputStrides_x,
                                             (void*) &inputStrides_y,
                                             (void*) &inputStrides_z,
@@ -387,39 +396,39 @@ class GpuCumsum(GpuKernelBase, Op):
                                             (void*) &offsetZ,
                                             (void*) deviceBlockSum->ga.data
                        };
-                    int err = GpuKernel_call(&k_blockCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                    int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
                    if (err != GA_NO_ERROR){
-                        PyErr_SetString(PyExc_RuntimeError, "blockCumSum call failed");
+                        PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
                        return -1;
                    }

                    if (dimGridX > 1) {
-                        // Do a cumsum over the blockSum (recursive).
-                        if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
+                        // Do a cum op over the blockSum (recursive).
+                        if (cumOp_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
                            Py_DECREF(deviceBlockSum);
                            return -1;
                        }
                        // Since there are more than one block (i.e. `dimGridX > 1`)
-                        //  report partial cumsums of previous blocks to subsequents ones.
+                        //  report partial cum ops of previous blocks to subsequents ones.
                        size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
                        size_t dimBlock[3] = {dimBlockX, 1, 1};
                        void* kernel_params[] = {(void*) output->ga.data,
                                                 (void*) deviceBlockSum->ga.data,
-                                                 (void*) &nbElementsPerCumsum,
+                                                 (void*) &nbElementsPerCumOp,
                                                 (void*) &outputStrides_x,
                                                 (void*) &outputStrides_y,
                                                 (void*) &outputStrides_z,
                                                 (void*) &offsetY,
                                                 (void*) &offsetZ
                            };
-                        int err = GpuKernel_call(&k_finalCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                        int err = GpuKernel_call(&k_finalCumOp_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
                        if (err != GA_NO_ERROR){
-                            PyErr_SetString(PyExc_RuntimeError, "finalCumSum call failed");
+                            PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
                            return -1;
                        }
                    }
                    // If shape[axis] is odd, the last element is compute manually
-                    if (shape[axis] != nbElementsPerCumsum){
+                    if (shape[axis] != nbElementsPerCumOp){
                        size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
                        size_t dimBlock[3] = {1, 1, 1};
                        size_t tmp0 = shape[axis]-2;
@@ -450,26 +459,39 @@ class GpuCumsum(GpuKernelBase, Op):
            return 0;
        }
        """ % locals()
-        return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code
+        return super(GpuCumOp, self).c_support_code_struct(node, nodename) + code


-@register_opt('fast_compile')
-@op_lifter([CumsumOp])
-@register_opt2([CumsumOp], 'fast_compile')
-def local_gpua_cumsumop(op, ctx_name, inputs, outputs):
-    if inputs[0].dtype == 'float32':
-        axis = op.axis
-        x = inputs[0]
-        if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
-            return None
-
-        x = as_gpuarray_variable(x, ctx_name)
+# GpuCumsumOp exists only for backward compatibility.
+# Instantiating it returns a GpuCumOp object with mode='add' instead.
+class GpuCumsumOp(GpuKernelBase, Op):
+    SUPPORTED_NDIMS = 3
+    __props__ = ("axis",)

-        if axis is None and x.ndim > 1:
-            x = GpuReshape(1)(x, (-1,))
+    def __new__(typ, *args, **kwargs):
+        obj = object.__new__(GpuCumOp, *args, **kwargs)
+        obj.mode = 'add'
+        return obj

-        # ``gpu_cumsum`` assume array has been flattened if needed.
-        if axis is None:
-            axis = 0

-        return GpuCumsum(axis)(x)
+@register_opt('fast_compile')
+@op_lifter([CumOp])
+@register_opt2([CumOp], 'fast_compile')
+def local_gpua_cumop(op, ctx_name, inputs, outputs):
+    if inputs[0].dtype != 'float32':
+        return False
+    axis = op.axis
+    x = inputs[0]
+    if axis is not None and x.ndim > GpuCumOp.SUPPORTED_NDIMS:
+        return False
+
+    x = as_gpuarray_variable(x, ctx_name)
+
+    if axis is None and x.ndim > 1:
+        x = GpuReshape(1)(x, (-1,))
+
+    # ``GpuCumOp`` assumes the array has already been flattened if needed.
+    if axis is None:
+        axis = 0
+
+    return GpuCumOp(axis, op.mode)(x)
--- a/theano/gpuarray/tests/test_extra_ops.py
+++ b/theano/gpuarray/tests/test_extra_ops.py
-# Skip test if cuda_ndarray is not available.
 from __future__ import absolute_import, print_function, division
-import itertools
+from functools import partial
+from itertools import product

 import numpy as np
 from six.moves import xrange
@@ -9,54 +9,62 @@ from theano import tensor as T
 import theano
 import theano.tensor.tests.test_extra_ops

-from theano.tensor.extra_ops import cumsum, CumsumOp
+from theano.tensor.extra_ops import CumOp
 from theano.tests.unittest_tools import SkipTest
 from theano.tests import unittest_tools as utt

 from .config import mode_with_gpu, test_ctx_name
-from ..extra_ops import GpuCumsum
+from ..extra_ops import GpuCumOp
 from ..type import get_context

+cum_modes = utt.parameterized.expand([('mul',), ('add',)])

-class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
+
+class TestGpuCumOp(theano.tensor.tests.test_extra_ops.TestCumOp):
    mode = mode_with_gpu

    def setUp(self):
-        super(TestGpuCumsum, self).setUp()
+        super(TestGpuCumOp, self).setUp()
        test_ctx = get_context(test_ctx_name)
        if test_ctx.kind != b'cuda':
            raise SkipTest("Cuda specific tests")
        self.max_threads_dim0 = test_ctx.maxlsize0
        self.max_grid_size1 = test_ctx.maxgsize2
-        self.op_class = GpuCumsum
+        self.op_class = CumOp

-    def test_infer_shape(self):
-        # GpuCumSum is only defined for float32 for now, so we skip it
+    @cum_modes
+    def test_infer_shape(self, mode):
+        # GpuCumOp is only defined for float32 for now, so we skip it
        # in the unsupported cases
-        gpucumsum_supported_dtypes = ('float32',)
-        if theano.config.floatX not in gpucumsum_supported_dtypes:
-            raise SkipTest('GpuCumSum not implemented for dtype %s'
+        op_class = partial(self.op_class, mode=mode)
+        gpucumop_supported_dtypes = ('float32',)
+        if theano.config.floatX not in gpucumop_supported_dtypes:
+            raise SkipTest('Gpucumop not implemented for dtype %s'
                           % theano.config.floatX)
        x = T.tensor3('x')
        a = np.random.random((3, 5, 2)).astype(theano.config.floatX)

        for axis in range(-len(a.shape), len(a.shape)):
            self._compile_and_check([x],
-                                    [cumsum(x, axis=axis)],
+                                    [op_class(axis=axis)(x)],
                                    [a],
-                                    self.op_class)
+                                    GpuCumOp)

-    def test_grad(self):
-        # no grad for GpuCumsum
+    @cum_modes
+    def test_grad(self, mode):
+        # no grad for GpuCumOp
        pass

-    def test_Strides1D(self):
+    @cum_modes
+    def test_Strides1D(self, mode):
+        op_class = partial(self.op_class, mode=mode)
+        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        x = T.fvector('x')

        for axis in [0, None, -1]:
            a = np.random.random((42,)).astype("float32")
-            cumsum_function = theano.function([x], cumsum(x, axis=axis),
-                                              mode=self.mode)
+            cumop_function = theano.function(
+                [x], op_class(axis=axis)(x), mode=self.mode)

            slicings = [slice(None, None, None),    # Normal strides
                        slice(None, None, 2),       # Stepped strides
@@ -64,22 +72,25 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
                        ]

            # Cartesian product of all slicings to test.
-            for slicing in itertools.product(slicings, repeat=x.ndim):
-                f = theano.function([x], cumsum(x[slicing], axis=axis),
+            for slicing in product(slicings, repeat=x.ndim):
+                f = theano.function([x], op_class(axis=axis)(x[slicing]),
                                    mode=self.mode)
                assert [n for n in f.maker.fgraph.toposort()
-                        if isinstance(n.op, GpuCumsum)]
-                utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
-                utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
-                                    cumsum_function(a[slicing]))
-
-    def test_Strides2D(self):
+                        if isinstance(n.op, GpuCumOp)]
+                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
+                utt.assert_allclose(np_func(a[slicing], axis=axis),
+                                    cumop_function(a[slicing]))
+
+    @cum_modes
+    def test_Strides2D(self, mode):
+        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
+        op_class = partial(self.op_class, mode=mode)
        x = T.fmatrix('x')

        for axis in [0, 1, None, -1, -2]:
            a = np.random.random((42, 30)).astype("float32")
-            cumsum_function = theano.function([x], cumsum(x, axis=axis),
-                                              mode=self.mode)
+            cumop_function = theano.function(
+                [x], op_class(axis=axis)(x), mode=self.mode)

            slicings = [slice(None, None, None),    # Normal strides
                        slice(None, None, 2),       # Stepped strides
@@ -87,22 +98,25 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
                        ]

            # Cartesian product of all slicings to test.
-            for slicing in itertools.product(slicings, repeat=x.ndim):
-                f = theano.function([x], cumsum(x[slicing], axis=axis),
+            for slicing in product(slicings, repeat=x.ndim):
+                f = theano.function([x], op_class(axis=axis)(x[slicing]),
                                    mode=self.mode)
                assert [n for n in f.maker.fgraph.toposort()
-                        if isinstance(n.op, GpuCumsum)]
-                utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
-                utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
-                                    cumsum_function(a[slicing]))
-
-    def test_Strides3D(self):
+                        if isinstance(n.op, GpuCumOp)]
+                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
+                utt.assert_allclose(np_func(a[slicing], axis=axis),
+                                    cumop_function(a[slicing]))
+
+    @cum_modes
+    def test_Strides3D(self, mode):
+        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
+        op_class = partial(self.op_class, mode=mode)
        x = T.ftensor3('x')

        for axis in [0, 1, 2, None, -1, -2, -3]:
            a = np.random.random((42, 30, 25)).astype("float32")
-            cumsum_function = theano.function([x], cumsum(x, axis=axis),
-                                              mode=self.mode)
+            cumop_function = theano.function(
+                [x], op_class(axis=axis)(x), mode=self.mode)

            slicings = [slice(None, None, None),    # Normal strides
                        slice(None, None, 2),       # Stepped strides
@@ -110,45 +124,51 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
                        ]

            # Cartesian product of all slicings to test.
-            for slicing in itertools.product(slicings, repeat=x.ndim):
-                f = theano.function([x], cumsum(x[slicing], axis=axis),
-                                    mode=self.mode)
+            for slicing in product(slicings, repeat=x.ndim):
+                f = theano.function(
+                    [x], op_class(axis=axis)(x[slicing]), mode=self.mode)
                assert [n for n in f.maker.fgraph.toposort()
-                        if isinstance(n.op, GpuCumsum)]
-                utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
-                utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
-                                    cumsum_function(a[slicing]))
-
-    def test_GpuCumsum1D(self):
+                        if isinstance(n.op, GpuCumOp)]
+                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
+                utt.assert_allclose(np_func(a[slicing], axis=axis),
+                                    cumop_function(a[slicing]))
+
+    @cum_modes
+    def test_GpuCumOp1D(self, mode):
+        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
+        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2

        x = T.fvector('x')
-        f = theano.function([x], cumsum(x), mode=self.mode)
+        f = theano.function([x], op_class(axis=0)(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
-                if isinstance(n.op, GpuCumsum)]
+                if isinstance(n.op, GpuCumOp)]

        # Extensive testing for the first 1025 sizes
        a = np.random.random(1025).astype("float32")
        for i in xrange(a.shape[0]):
-            utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
+            utt.assert_allclose(np_func(a[:i]), f(a[:i]))

        # Use multiple GPU threadblocks
        a = np.random.random((block_max_size + 2, )).astype("float32")
-        utt.assert_allclose(np.cumsum(a), f(a))
+        utt.assert_allclose(np_func(a), f(a))

-        # Use recursive cumsum
+        # Use recursive cumop
        a = np.ones((block_max_size * (block_max_size + 1) + 2,),
                    dtype="float32")
-        utt.assert_allclose(np.cumsum(a), f(a))
+        utt.assert_allclose(np_func(a), f(a))

-    def test_GpuCumsum2D(self):
+    @cum_modes
+    def test_GpuCumOp2D(self, mode):
+        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
+        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2

        x = T.fmatrix('x')
        for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]):
-            f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
+            f = theano.function([x], op_class(axis=axis)(x), mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort()
-                    if isinstance(n.op, GpuCumsum)]
+                    if isinstance(n.op, GpuCumOp)]

            # Extensive testing for the first 1025 sizes
            a_shape = [5, 5]
@@ -158,36 +178,39 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            for i in xrange(a.shape[shape_axis]):
                slices[shape_axis] = slice(i)
                fa = f(a[slices])
-                npa = np.cumsum(a[slices], axis=axis)
+                npa = np_func(a[slices], axis=axis)
                utt.assert_allclose(npa, fa)

            # Use multiple GPU threadblocks
            a_shape = [5, 5]
            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
-            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+            utt.assert_allclose(np_func(a, axis=axis), f(a))

            # Use multiple GPU gridblocks
            a_shape = [4, 4]
            a_shape[1 - shape_axis] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
-            utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
+            utt.assert_allclose(np_func(a, axis=axis), f(a), rtol=5e-5)

-            # Use recursive cumsum
+            # Use recursive cumop
            a_shape = [3, 3]
            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
-            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+            utt.assert_allclose(np_func(a, axis=axis), f(a))

-    def test_GpuCumsum3D(self):
+    @cum_modes
+    def test_GpuCumOp3D(self, mode):
+        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
+        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2

        x = T.ftensor3('x')
        for shape_axis, axis in zip([0, 1, 2, 0, 2, 1, 0], [0, 1, 2, None, -1, -2, -3]):
-            f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
+            f = theano.function([x], op_class(axis=axis)(x), mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort()
-                    if isinstance(n.op, GpuCumsum)]
+                    if isinstance(n.op, GpuCumOp)]

            # Extensive testing for the first 1025 sizes
            a_shape = [5, 5, 5]
@@ -197,14 +220,14 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            for i in xrange(a.shape[shape_axis]):
                slices[shape_axis] = slice(i)
                fa = f(a[slices])
-                npa = np.cumsum(a[slices], axis=axis)
+                npa = np_func(a[slices], axis=axis)
                utt.assert_allclose(npa, fa)

            # Use multiple GPU threadblocks (along accumulation axis)
            a_shape = [2, 2, 2]
            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
-            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+            utt.assert_allclose(np_func(a, axis=axis), f(a))

            # Use multiple GPU gridblocks (not along accumulation axis)
            a_shape = [5, 5, 5]
@@ -213,7 +236,7 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
-            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+            utt.assert_allclose(np_func(a, axis=axis), f(a))

            a_shape = [5, 5, 5]
            a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
@@ -221,18 +244,20 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
-            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+            utt.assert_allclose(np_func(a, axis=axis), f(a))

-            # Use recursive cumsum (along accumulation axis)
+            # Use recursive cumop (along accumulation axis)
            a_shape = [3, 3, 3]
            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
-            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
+            utt.assert_allclose(np_func(a, axis=axis), f(a))

-    def test_GpuCumsum4D(self):
+    @cum_modes
+    def test_GpuCumOp4D(self, mode):
+        op_class = partial(self.op_class, mode=mode)
        # Should not use the GPU version.
        x = T.ftensor4('x')
-        f = theano.function([x], cumsum(x, axis=1), mode=self.mode)
+        f = theano.function([x], op_class(axis=1)(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
-                if isinstance(n.op, CumsumOp)]
+                if isinstance(n.op, CumOp)]  # host op expected for 4D input
--- a/theano/sandbox/cuda/extra_ops.py
+++ b/theano/sandbox/cuda/extra_ops.py
@@ -5,7 +5,7 @@ from theano import Op
 from theano.gof import local_optimizer
 from theano.sandbox.cuda import cuda_available, GpuOp
 from theano.sandbox.cuda.basic_ops import gpu_flatten
-from theano.tensor.extra_ops import CumsumOp
+from theano.tensor.extra_ops import CumOp

 if cuda_available:
    from theano.sandbox.cuda import CudaNdarrayType
@@ -13,7 +13,7 @@ if cuda_available:
    from theano.sandbox.cuda import register_opt as register_gpu_opt


-class GpuCumsum(CumsumOp, GpuOp):
+class GpuCumsum(CumOp, GpuOp):
    """

    Parameters
@@ -438,13 +438,16 @@ def values_eq_approx_high_tol(a, b):


 @register_gpu_opt()
-@local_optimizer([CumsumOp])
+@local_optimizer([CumOp])
 def use_gpu_cumsum(node):
-    if type(node.op) is CumsumOp \
+    if type(node.op) is CumOp \
       and node.inputs[0].dtype == 'float32' \
       and node.inputs[0].owner \
       and isinstance(node.inputs[0].owner.op, HostFromGpu):

+        if node.op.mode != 'add':
+            return None
+
        axis = node.op.axis
        x = node.inputs[0]


--- a/theano/sandbox/cuda/tests/test_extra_ops.py
+++ b/theano/sandbox/cuda/tests/test_extra_ops.py
@@ -7,7 +7,7 @@ import numpy as np
 from six.moves import xrange
 from theano import tensor as T
 import theano
-from theano.tensor.extra_ops import cumsum, CumsumOp
+from theano.tensor.extra_ops import cumsum, CumOp
 from theano.tests import unittest_tools as utt
 import theano.sandbox.cuda as cuda_ndarray
 if cuda_ndarray.cuda_available:
@@ -22,7 +22,7 @@ else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')


-class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
+class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumOp):
    mode = mode_with_gpu

    def setUp(self):
@@ -232,4 +232,4 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
        x = T.ftensor4('x')
        f = theano.function([x], cumsum(x, axis=1), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
-                if isinstance(n.op, CumsumOp)]
+                if isinstance(n.op, CumOp)]
--- a/theano/tensor/extra_ops.py
+++ b/theano/tensor/extra_ops.py
@@ -242,13 +242,16 @@ def searchsorted(x, v, side='left', sorter=None):
    return SearchsortedOp(side=side)(x, v, sorter)


-class CumsumOp(theano.Op):
-    # See function cumsum for docstring
+class CumOp(theano.Op):
+    # See function cumsum/cumprod for docstring

-    __props__ = ("axis",)
+    __props__ = ("axis", "mode")

-    def __init__(self, axis=None):
+    def __init__(self, axis=None, mode='add'):
+        if mode not in ('add', 'mul'):
+            raise ValueError('%s: Unknown mode "%s"' % (type(self).__name__, mode))
        self.axis = axis
+        self.mode = mode

    def make_node(self, x):
        x = basic.as_tensor_variable(x)
@@ -264,20 +267,39 @@ class CumsumOp(theano.Op):
    def perform(self, node, inputs, output_storage):
        x = inputs[0]
        z = output_storage[0]
-        z[0] = np.cumsum(x, axis=self.axis)
+        z[0] = {'add': np.cumsum, 'mul': np.cumprod}[self.mode](x, axis=self.axis)

    def grad(self, inputs, output_gradients):
-        [gi] = output_gradients
+        x, = inputs
+        gi, = output_gradients

        if self.axis is None:
-            return [cumsum(gi[::-1])[::-1].reshape(inputs[0].shape)]
+            if self.mode == 'add':
+                return [cumsum(gi[::-1])[::-1].reshape(x.shape)]
+            elif self.mode == 'mul':
+                fx = cumprod(x, axis=self.axis)
+                return [cumsum(
+                    (fx * gi)[::-1])[::-1].reshape(x.shape) / x]
+            else:
+                raise NotImplementedError(
+                    '%s: unknown gradient for mode "%s"' %
+                    (type(self).__name__, self.mode))

-        # We need to reverse the gradients along ``self.axis``,
-        #  compute cumsum, then reverse again
        reverse_slicing = [slice(None, None, None)] * gi.ndim
        reverse_slicing[self.axis] = slice(None, None, -1)
        reverse_slicing = tuple(reverse_slicing)
-        return [cumsum(gi[reverse_slicing], self.axis)[reverse_slicing]]
+        # We need to reverse the gradients along ``self.axis``,
+        #  compute cumsum, then reverse again
+        if self.mode == 'add':
+            return [cumsum(gi[reverse_slicing], self.axis)[reverse_slicing]]
+        elif self.mode == 'mul':
+            fx = cumprod(x, axis=self.axis)
+            return [cumsum(
+                (fx * gi)[reverse_slicing], self.axis)[reverse_slicing] / x]
+        else:
+            raise NotImplementedError(
+                '%s: unknown gradient for mode "%s"' %
+                (type(self).__name__, self.mode))

    def infer_shape(self, node, shapes):
        if self.axis is None:
@@ -290,6 +312,7 @@ class CumsumOp(theano.Op):
        z, = onames
        axis = self.axis
        fail = sub['fail']
+        func = dict(mul='CumProd', add='CumSum')[self.mode]

        if self.axis is None or (self.axis == 0 and node.inputs[0].ndim == 1):
            code = """
@@ -303,13 +326,13 @@ class CumsumOp(theano.Op):
                if (!%(z)s)
                    %(fail)s;
                {
-                    PyObject * t = PyArray_CumSum(
+                    PyObject * t = PyArray_%(func)s(
                        %(x)s, NPY_MAXDIMS,
                        PyArray_TYPE((PyArrayObject*) py_%(x)s), %(z)s);
                    if (!t){
                       %(fail)s;
                    }
-                    // Because PyArray_CumSum returns a newly created reference on t.
+                    // Because PyArray_%(func)s returns a newly created reference on t.
                    Py_XDECREF(t);
                }
            """ % locals()
@@ -325,13 +348,13 @@ class CumsumOp(theano.Op):
                    %(fail)s;
                {

-                    PyObject * t = PyArray_CumSum(
+                    PyObject * t = PyArray_%(func)s(
                        %(x)s, %(axis)s,
                        PyArray_TYPE((PyArrayObject*) py_%(x)s), %(z)s);
                    if (!t){
                       %(fail)s;
                    }
-                    // Because PyArray_CumSum returns a newly created reference on t.
+                    // Because PyArray_%(func)s returns a newly created reference on t.
                    Py_XDECREF(t);
                }
            """ % locals()
@@ -339,10 +362,10 @@ class CumsumOp(theano.Op):
        return code

    def c_code_cache_version(self):
-        return (6,)
+        return (7,)

    def __str__(self):
-        return "%s{%s}" % (self.__class__.__name__, self.axis)
+        return "%s{%s, %s}" % (self.__class__.__name__, self.axis, self.mode)


 def cumsum(x, axis=None):
@@ -362,112 +385,7 @@ def cumsum(x, axis=None):
    .. versionadded:: 0.7

    """
-    return CumsumOp(axis=axis)(x)
-
-
-class CumprodOp(theano.Op):
-    # See function cumprod for docstring
-
-    __props__ = ("axis",)
-
-    def __init__(self, axis=None):
-        self.axis = axis
-
-    def make_node(self, x):
-        x = basic.as_tensor_variable(x)
-        out_type = x.type()
-
-        if self.axis is None:
-            out_type = theano.tensor.vector(dtype=x.dtype)  # Flatten
-        elif self.axis >= x.ndim or self.axis < -x.ndim:
-            raise ValueError('axis(={0}) out of bounds'.format(self.axis))
-
-        return theano.Apply(self, [x], [out_type])
-
-    def perform(self, node, inputs, output_storage):
-        x = inputs[0]
-        z = output_storage[0]
-        z[0] = np.cumprod(x, axis=self.axis)
-
-    def grad(self, inputs, output_gradients):
-        x, = inputs
-        gi, = output_gradients
-        fx = cumprod(x, axis=self.axis)
-
-        if self.axis is None:
-            return [cumsum((fx * gi)[::-1])[::-1].reshape(inputs[0].shape) / x]
-
-        # We need to reverse the gradients along ``self.axis``,
-        #  compute cumsum, then reverse again
-        reverse_slicing = [slice(None, None, None)] * gi.ndim
-        reverse_slicing[self.axis] = slice(None, None, -1)
-        reverse_slicing = tuple(reverse_slicing)
-        return [cumsum((fx * gi)[reverse_slicing],
-                       self.axis)[reverse_slicing] / x]
-
-    def infer_shape(self, node, shapes):
-        if self.axis is None:
-            return [(tensor.prod(shapes[0]),)]  # Flatten
-
-        return shapes
-
-    def c_code(self, node, name, inames, onames, sub):
-        x, = inames
-        z, = onames
-        axis = self.axis
-        fail = sub['fail']
-
-        if self.axis is None or (self.axis == 0 and node.inputs[0].ndim == 1):
-            code = """
-                npy_intp shape[1] = { PyArray_SIZE(%(x)s) };
-                if(!(%(z)s && PyArray_DIMS(%(z)s)[0] == shape[0]))
-                {
-                    Py_XDECREF(%(z)s);
-                    %(z)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, PyArray_TYPE((PyArrayObject*) py_%(x)s));
-                }
-
-                if (!%(z)s)
-                    %(fail)s;
-                {
-                    PyObject * t = PyArray_CumProd(
-                        %(x)s, NPY_MAXDIMS,
-                        PyArray_TYPE((PyArrayObject*) py_%(x)s), %(z)s);
-                    if (!t){
-                       %(fail)s;
-                    }
-                    // Because PyArray_CumSum returns a newly created reference on t.
-                    Py_XDECREF(t);
-                }
-            """ % locals()
-        else:
-            code = """
-                if(!(%(z)s && PyArray_CompareLists(PyArray_DIMS(%(z)s), PyArray_DIMS(%(x)s), PyArray_NDIM(%(x)s)) ))
-                {
-                    Py_XDECREF(%(z)s);
-                    %(z)s = (PyArrayObject*) PyArray_SimpleNew(PyArray_NDIM(%(x)s), PyArray_DIMS(%(x)s), PyArray_TYPE((PyArrayObject*) py_%(x)s));
-                }
-
-                if (!%(z)s)
-                    %(fail)s;
-                {
-                    PyObject * t = PyArray_CumProd(
-                        %(x)s, %(axis)s,
-                        PyArray_TYPE((PyArrayObject*) py_%(x)s), %(z)s);
-                    if (!t){
-                       %(fail)s;
-                    }
-                    // Because PyArray_CumSum returns a newly created reference on t.
-                    Py_XDECREF(t);
-                }
-            """ % locals()
-
-        return code
-
-    def c_code_cache_version(self):
-        return (4,)
-
-    def __str__(self):
-        return "%s{%s}" % (self.__class__.__name__, self.axis)
+    return CumOp(axis=axis, mode='add')(x)


 def cumprod(x, axis=None):
@@ -488,7 +406,27 @@ def cumprod(x, axis=None):
    .. versionadded:: 0.7

    """
-    return CumprodOp(axis=axis)(x)
+    return CumOp(axis=axis, mode='mul')(x)
+
+
+# CumsumOp and CumprodOp are kept only for backward compatibility,
+# so that Theano functions pickled with the old Ops can still be unpickled.
+class CumsumOp(theano.Op):
+    __props__ = ("axis",)
+
+    def __new__(typ, *args, **kwargs):
+        obj = CumOp(*args, **kwargs)  # runs CumOp.__init__ so `axis` is set
+        obj.mode = 'add'  # the legacy CumsumOp is always the additive mode
+        return obj
+
+
+class CumprodOp(theano.Op):
+    __props__ = ("axis",)
+
+    def __new__(typ, *args, **kwargs):
+        obj = CumOp(*args, **kwargs)  # runs CumOp.__init__ so `axis` is set
+        obj.mode = 'mul'  # the legacy CumprodOp is always the product mode
+        return obj


 class DiffOp(theano.Op):

--- a/theano/tensor/tests/test_extra_ops.py
+++ b/theano/tensor/tests/test_extra_ops.py
 from __future__ import absolute_import, print_function, division
+from functools import partial

 import numpy as np
 import numpy
@@ -7,7 +8,7 @@ import theano
 from theano.tests import unittest_tools as utt

 from theano.tensor.extra_ops import (SearchsortedOp, searchsorted,
-                                     CumsumOp, cumsum, CumprodOp, cumprod,
+                                     CumOp, cumsum, cumprod,
                                     CpuContiguous, cpu_contiguous,
                                     bincount, DiffOp, diff, squeeze, compress,
                                     RepeatOp, repeat, Bartlett, bartlett,
@@ -121,74 +122,33 @@ class TestSearchsortedOp(utt.InferShapeTester):
        utt.verify_grad(self.op, [self.a[self.idx_sorted], self.b])


-class TestCumsumOp(utt.InferShapeTester):
+class TestCumOp(utt.InferShapeTester):

    def setUp(self):
-        super(TestCumsumOp, self).setUp()
-        self.op_class = CumsumOp
-        self.op = CumsumOp()
+        super(TestCumOp, self).setUp()
+        self.op_class = CumOp
+        self.op = CumOp()

-    def test_cumsumOp(self):
+    def test_cum_op(self):
        x = T.tensor3('x')
        a = np.random.random((3, 5, 2)).astype(config.floatX)

        # Test axis out of bounds
        self.assertRaises(ValueError, cumsum, x, axis=3)
        self.assertRaises(ValueError, cumsum, x, axis=-4)
-
-        f = theano.function([x], cumsum(x))
-        assert np.allclose(np.cumsum(a), f(a))  # Test axis=None
-
-        for axis in range(-len(a.shape), len(a.shape)):
-            f = theano.function([x], cumsum(x, axis=axis))
-            assert np.allclose(np.cumsum(a, axis=axis), f(a))
-
-    def test_infer_shape(self):
-        x = T.tensor3('x')
-        a = np.random.random((3, 5, 2)).astype(config.floatX)
-
-        # Test axis=None
-        self._compile_and_check([x],
-                                [self.op(x)],
-                                [a],
-                                self.op_class)
-
-        for axis in range(-len(a.shape), len(a.shape)):
-            self._compile_and_check([x],
-                                    [cumsum(x, axis=axis)],
-                                    [a],
-                                    self.op_class)
-
-    def test_grad(self):
-        a = np.random.random((3, 5, 2)).astype(config.floatX)
-
-        utt.verify_grad(self.op, [a])  # Test axis=None
-
-        for axis in range(-len(a.shape), len(a.shape)):
-            utt.verify_grad(self.op_class(axis=axis), [a], eps=4e-4)
-
-
-class TestCumprodOp(utt.InferShapeTester):
-
-    def setUp(self):
-        super(TestCumprodOp, self).setUp()
-        self.op_class = CumprodOp
-        self.op = CumprodOp()
-
-    def test_CumprodOp(self):
-        x = T.tensor3('x')
-        a = np.random.random((3, 5, 2)).astype(config.floatX)
-
-        # Test axis out of bounds
        self.assertRaises(ValueError, cumprod, x, axis=3)
        self.assertRaises(ValueError, cumprod, x, axis=-4)

-        f = theano.function([x], cumprod(x))
-        assert np.allclose(np.cumprod(a), f(a))  # Test axis=None
+        f = theano.function([x], [cumsum(x), cumprod(x)])
+        s, p = f(a)
+        assert np.allclose(np.cumsum(a), s)  # Test axis=None
+        assert np.allclose(np.cumprod(a), p)  # Test axis=None

        for axis in range(-len(a.shape), len(a.shape)):
-            f = theano.function([x], cumprod(x, axis=axis))
-            assert np.allclose(np.cumprod(a, axis=axis), f(a))
+            f = theano.function([x], [cumsum(x, axis=axis), cumprod(x, axis=axis)])
+            s, p = f(a)
+            assert np.allclose(np.cumsum(a, axis=axis), s)
+            assert np.allclose(np.cumprod(a, axis=axis), p)

    def test_infer_shape(self):
        x = T.tensor3('x')
@@ -202,17 +162,19 @@ class TestCumprodOp(utt.InferShapeTester):

        for axis in range(-len(a.shape), len(a.shape)):
            self._compile_and_check([x],
-                                    [cumprod(x, axis=axis)],
+                                    [cumsum(x, axis=axis)],
                                    [a],
                                    self.op_class)

    def test_grad(self):
        a = np.random.random((3, 5, 2)).astype(config.floatX)

-        utt.verify_grad(self.op, [a])  # Test axis=None
+        utt.verify_grad(self.op_class(mode='add'), [a])  # Test axis=None
+        utt.verify_grad(self.op_class(mode='mul'), [a])  # Test axis=None

        for axis in range(-len(a.shape), len(a.shape)):
-            utt.verify_grad(self.op_class(axis=axis), [a])
+            utt.verify_grad(self.op_class(axis=axis, mode='add'), [a], eps=4e-4)
+            utt.verify_grad(self.op_class(axis=axis, mode='mul'), [a], eps=4e-4)


 class TestBinCount(utt.InferShapeTester):