Commit 100d336e authored by Marc-Alexandre Cote

Reuse allocated memory when possible.

Parent 2dddf4ac
...@@ -62,4 +62,4 @@ from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \ ...@@ -62,4 +62,4 @@ from theano.gradient import Rop, Lop, grad, numeric_grad, verify_grad, \
from theano.tensor.sort import sort, argsort from theano.tensor.sort import sort, argsort
from theano.tensor.extra_ops import (DiffOp, bincount, squeeze, from theano.tensor.extra_ops import (DiffOp, bincount, squeeze,
repeat, bartlett, fill_diagonal) repeat, bartlett, fill_diagonal, cumsum)
...@@ -49,7 +49,7 @@ class CumsumOp(theano.Op): ...@@ -49,7 +49,7 @@ class CumsumOp(theano.Op):
def infer_shape(self, node, shapes): def infer_shape(self, node, shapes):
if self.axis is None: if self.axis is None:
return [(np.prod(shapes[0]),)] # Flatten return [(tensor.prod(shapes[0]),)] # Flatten
return shapes return shapes
...@@ -59,11 +59,14 @@ class CumsumOp(theano.Op): ...@@ -59,11 +59,14 @@ class CumsumOp(theano.Op):
axis = self.axis axis = self.axis
fail = sub['fail'] fail = sub['fail']
if self.axis is None: if self.axis is None or (self.axis == 0 and node.inputs[0].ndim == 1):
code = """ code = """
npy_intp shape[1] = { PyArray_SIZE(%(x)s) }; npy_intp shape[1] = { PyArray_SIZE(%(x)s) };
if(!(%(z)s && PyArray_DIMS(%(z)s)[0] == shape[0]))
{
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, type_num_%(x)s); %(z)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, type_num_%(x)s);
}
if (!%(z)s) if (!%(z)s)
%(fail)s; %(fail)s;
...@@ -73,8 +76,11 @@ class CumsumOp(theano.Op): ...@@ -73,8 +76,11 @@ class CumsumOp(theano.Op):
""" % locals() """ % locals()
else: else:
code = """ code = """
if(!(%(z)s && PyArray_CompareLists(PyArray_DIMS(%(z)s), PyArray_DIMS(%(x)s), PyArray_NDIM(%(x)s)) ))
{
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = (PyArrayObject*) PyArray_SimpleNew(PyArray_NDIM(%(x)s), PyArray_DIMS(%(x)s), type_num_%(x)s); %(z)s = (PyArrayObject*) PyArray_SimpleNew(PyArray_NDIM(%(x)s), PyArray_DIMS(%(x)s), type_num_%(x)s);
}
if (!%(z)s) if (!%(z)s)
%(fail)s; %(fail)s;
...@@ -86,10 +92,10 @@ class CumsumOp(theano.Op): ...@@ -86,10 +92,10 @@ class CumsumOp(theano.Op):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
def __str__(self): def __str__(self):
return self.__class__.__name__ return "%s{%s}" % (self.__class__.__name__, self.axis)
def cumsum(x, axis=None): def cumsum(x, axis=None):
......
...@@ -22,15 +22,26 @@ class TestCumsumOp(utt.InferShapeTester): ...@@ -22,15 +22,26 @@ class TestCumsumOp(utt.InferShapeTester):
def test_cumsumOp(self): def test_cumsumOp(self):
x = T.tensor3('x') x = T.tensor3('x')
a = np.random.random((30, 50, 20)).astype(config.floatX) a = np.random.random((3, 5, 2)).astype(config.floatX)
b = np.random.random((30, 5, 2)).astype(config.floatX)
f = theano.function([x], cumsum(x), mode="DebugMode") f = theano.function([x], cumsum(x), mode="DebugMode")
assert np.allclose(np.cumsum(a), f(a)) # Test axis=None assert np.allclose(np.cumsum(a), f(a)) # Test axis=None
# Test without garbage collector
f = theano.function([x], cumsum(x).sum(), mode=theano.compile.Mode(linker="cvm_nogc", optimizer="fast_run") )
assert np.allclose(np.cumsum(a).sum(), f(a)) # Test axis=None
assert np.allclose(np.cumsum(b).sum(), f(b)) # Would fail without re-allocation
for axis in range(len(a.shape)): for axis in range(len(a.shape)):
f = theano.function([x], cumsum(x, axis=axis), mode="DebugMode") f = theano.function([x], cumsum(x, axis=axis), mode="DebugMode")
assert np.allclose(np.cumsum(a, axis=axis), f(a)) assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Test without garbage collector
f = theano.function([x], cumsum(x, axis=axis).sum(), mode=theano.compile.Mode(linker="cvm_nogc", optimizer="fast_run"))
assert np.allclose(np.cumsum(a, axis=axis).sum(), f(a))
assert np.allclose(np.cumsum(b, axis=axis).sum(), f(b)) # Would fail without re-allocation
def test_infer_shape(self): def test_infer_shape(self):
x = T.tensor3('x') x = T.tensor3('x')
a = np.random.random((30, 50, 20)).astype(config.floatX) a = np.random.random((30, 50, 20)).astype(config.floatX)
...@@ -53,7 +64,7 @@ class TestCumsumOp(utt.InferShapeTester): ...@@ -53,7 +64,7 @@ class TestCumsumOp(utt.InferShapeTester):
utt.verify_grad(self.op, [a]) # Test axis=None utt.verify_grad(self.op, [a]) # Test axis=None
for axis in range(len(a.shape)): for axis in range(len(a.shape)):
utt.verify_grad(CumsumOp(axis=axis), [a]) utt.verify_grad(self.op_class(axis=axis), [a])
class TestBinCountOp(utt.InferShapeTester): class TestBinCountOp(utt.InferShapeTester):
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment