Commit 65f5d0c7, authored by Marc-Alexandre Cote

Add a 2D version of cumsum

Parent commit: 6dbb2457
import numpy as np

import theano
from theano import config
from theano import tensor as T
from theano.tensor.extra_ops import cumsum, diff

from mlpython.misc.utils import Timer
...@@ -26,123 +26,148 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -26,123 +26,148 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
op = GpuCumsum op = GpuCumsum
dtypes = ['float32'] dtypes = ['float32']
def test_GpuCumsum(self): def test_benchmark_1D_vs_2D(self):
### Test 1D case ###
x = T.vector('x')
f = theano.function([x], cumsum(x))
# # Even number of elements
# a = np.random.random((18,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Odd number of elements
# a = np.random.random((7,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU threadblocks
# a = np.random.random((2048+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU threadblocks
# a = np.random.random((2048*75+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU gridblocks
# a = np.ones((2048*2048+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
print "\nBenchmark:" print "\nBenchmark:"
import timeit as t from theano import sandbox, Out
#theano_time = t.timeit("np.ones((100,))", "import numpy as np", number=1000) import time
stmt = "f(a)"
setup = """
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.extra_ops import cumsum
from theano import config
x = T.vector('x')
f = theano.function([x], cumsum(x))
a = np.ones((100000,), dtype=config.floatX)
""".replace(" ", "")
theano_time = t.timeit(stmt, setup, number=1000)
print "Theano:\t", theano_time
stmt = "np.cumsum(a)"
setup = """
import numpy as np
from theano import config
a = np.ones((100000,), dtype=config.floatX)
""".replace(" ", "")
numpy_time = t.timeit(stmt, setup, number=1000)
print "Numpy:\t", numpy_time
print "Speedup: {0}x".format(numpy_time/theano_time)
vlen = 40 * 1024 * 2048 # 10 x # cores x # threads per core
iters = 25
# # Extensive testing x = theano.shared(np.ones((vlen,), dtype=config.floatX), borrow=False)
# i = 0; res = Out(sandbox.cuda.basic_ops.gpu_from_host(cumsum(x)), borrow=True)
# while True: f = theano.function([], res)
# a = np.ones((i,), dtype=config.floatX)
# fa = f(a) print f.maker.fgraph.toposort()
# npa = np.cumsum(a) t0 = time.time()
for i in xrange(iters):
r = f()
t1 = time.time()
print 'Looping %d times took' % iters, t1 - t0, 'seconds'
print 'Result is', r
print 'Numpy result is', np.asarray(r)
# if not np.allclose(npa, fa): # x = theano.shared(np.ones((1,vlen), dtype=config.floatX), borrow=True)
# print i, np.allclose(npa, fa) # Test axis=None # f = theano.function([], Out(sandbox.cuda.basic_ops.gpu_from_host(cumsum(x,axis=1)), borrow=True))
# print fa
# print npa
# assert False
# if i % 1000 == 0: # print f.maker.fgraph.toposort()
# print i # t0 = time.time()
# for i in xrange(iters):
# r = f()
# t1 = time.time()
# print 'Looping %d times took' % iters, t1 - t0, 'seconds'
# print 'Result is', r
# print 'Numpy result is', np.asarray(r)
# i += 1 # print 'Used the', config.device
# ### Test 2D case - axis=1 ### def test_GpuCumsum(self):
# x = T.matrix('x') ### Test 1D case ###
# f = theano.function([x], cumsum(x, axis=1)) x = T.vector('x')
f = theano.function([x], cumsum(x))
# # # Even number of elements # Even number of elements
# # print "\n# Even number of elements" a = np.random.random((18,)).astype(config.floatX)
# # a = np.random.random((18,18)).astype(config.floatX) print f(a)
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) print np.cumsum(a)
assert np.allclose(np.cumsum(a), f(a))
# # # Odd number of elements
# # print "\n# Odd number of elements" # Odd number of elements
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) a = np.random.random((7,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
# # # Use multiple GPU threadblocks
# # print "\n# Use multiple GPU threadblocks" # Use multiple GPU threadblocks
# # a = np.random.random((2048+2,2048+2)).astype(config.floatX) a = np.random.random((2048+2,)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) assert np.allclose(np.cumsum(a), f(a))
# # # Use multiple GPU threadblocks # Use multiple GPU threadblocks
# # print "\n# Use multiple GPU threadblocks" a = np.random.random((2048*75+2,)).astype(config.floatX)
# # a = np.ones((10,2048*75+3)).astype(config.floatX) assert np.allclose(np.cumsum(a), f(a))
# # assert np.allclose(np.cumsum(a, axis=1), f(a))
# Use multiple GPU gridblocks
# # # Use multiple GPU gridblocks a = np.ones((2048*2048+2,)).astype(config.floatX)
# # print "\n# Use multiple GPU gridblocks" assert np.allclose(np.cumsum(a), f(a))
# # a = np.ones((11,2048*2048+3)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a))
# Extensive testing
# # Extensive testing for i in xrange(int(1e3)*5):
# i = 19000; a = np.ones((i,), dtype=config.floatX)
# while True:
# a = np.ones((11,i), dtype=config.floatX) fa = f(a)
# fa = f(a) npa = np.cumsum(a)
# npa = np.cumsum(a, axis=1)
if not np.allclose(npa, fa):
# if not np.allclose(npa, fa): print i, np.allclose(npa, fa) # Test axis=None
# print i, np.allclose(npa, fa) # Test axis=None print fa
# print fa print npa
# print npa assert False
# assert False
if i % 1000 == 0:
# if i % 1000 == 0: print i
# print i
# i += 1 #for axis in xrange(2):
for axis in xrange(2):
### Test 2D case - axis=1 ###
x = T.matrix('x')
f = theano.function([x], cumsum(x, axis=axis))
# Even number of elements
print "\n# Even number of elements (axis={0})".format(axis)
a = np.random.random((18,18)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Odd number of elements
print "\n# Odd number of elements (axis={0})".format(axis)
a = np.random.random((21,21)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Use two GPU threadblocks
print "\n# Use two GPU threadblocks (axis={0})".format(axis)
a = np.random.random((2048+2,2048+2)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU threadblocks
print "\n# Use multiple GPU threadblocks (axis={0})".format(axis)
a = np.ones((10,2048*75+3)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
a = np.ones((2048*75+3,10)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks
print "\n# Use multiple GPU gridblocks (axis={0})".format(axis)
a = np.ones((11,2048*2048+3)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
a = np.ones((2048*2048+3,11)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Extensive testing for the first 10k sizes
for i in xrange(int(1e3)*5):
a = np.ones((11,i), dtype=config.floatX)
fa = f(a)
npa = np.cumsum(a, axis=axis)
if not np.allclose(npa, fa):
print i, np.allclose(npa, fa) # Test axis=None
print fa
print npa
assert False
a = np.ones((i,11), dtype=config.floatX)
fa = f(a)
npa = np.cumsum(a, axis=axis)
if not np.allclose(npa, fa):
print i, np.allclose(npa, fa) # Test axis=None
print fa
print npa
assert False
if i % 1000 == 0:
print i
\ No newline at end of file
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment