提交 6dbb2457，作者：Marc-Alexandre Cote

Cumsum 2D in cuda is working when axis=1.

上级 34239976
......@@ -31,41 +31,118 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
x = T.vector('x')
f = theano.function([x], cumsum(x))
# Even number of elements
a = np.random.random((18,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
# # Even number of elements
# a = np.random.random((18,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Odd number of elements
# a = np.random.random((7,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU threadblocks
# a = np.random.random((2048+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU threadblocks
# a = np.random.random((2048*75+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU gridblocks
# a = np.ones((2048*2048+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
print "\nBenchmark:"
import timeit as t
#theano_time = t.timeit("np.ones((100,))", "import numpy as np", number=1000)
stmt = "f(a)"
setup = """
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.extra_ops import cumsum
from theano import config
x = T.vector('x')
f = theano.function([x], cumsum(x))
a = np.ones((100000,), dtype=config.floatX)
""".replace(" ", "")
theano_time = t.timeit(stmt, setup, number=1000)
print "Theano:\t", theano_time
# Odd number of elements
a = np.random.random((7,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
stmt = "np.cumsum(a)"
setup = """
import numpy as np
from theano import config
a = np.ones((100000,), dtype=config.floatX)
""".replace(" ", "")
numpy_time = t.timeit(stmt, setup, number=1000)
print "Numpy:\t", numpy_time
print "Speedup: {0}x".format(numpy_time/theano_time)
# Use multiple GPU threadblocks
a = np.random.random((2048+1,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
# Use multiple GPU threadblocks
a = np.random.random((2048*75+1,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
# # Extensive testing
# i = 0;
# while True:
# a = np.ones((i,), dtype=config.floatX)
# Use multiple GPU gridblocks
a = np.ones((2048*2048+1,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
# fa = f(a)
# npa = np.cumsum(a)
# if not np.allclose(npa, fa):
# print i, np.allclose(npa, fa) # Test axis=None
# print fa
# print npa
# assert False
# Extensive testing
i = 0;
while True:
a = np.ones((i,), dtype=config.floatX)
fa = f(a)
npa = np.cumsum(a)
# if i % 1000 == 0:
# print i
if not np.allclose(npa, fa):
print i, np.allclose(npa, fa) # Test axis=None
print fa
print npa
assert False
# i += 1
if i % 1000 == 0:
print i
i += 1
# ### Test 2D case - axis=1 ###
# x = T.matrix('x')
# f = theano.function([x], cumsum(x, axis=1))
# # # Even number of elements
# # print "\n# Even number of elements"
# # a = np.random.random((18,18)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a))
# # # Odd number of elements
# # print "\n# Odd number of elements"
# # assert np.allclose(np.cumsum(a, axis=1), f(a))
# # # Use multiple GPU threadblocks
# # print "\n# Use multiple GPU threadblocks"
# # a = np.random.random((2048+2,2048+2)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a))
# # # Use multiple GPU threadblocks
# # print "\n# Use multiple GPU threadblocks"
# # a = np.ones((10,2048*75+3)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a))
# # # Use multiple GPU gridblocks
# # print "\n# Use multiple GPU gridblocks"
# # a = np.ones((11,2048*2048+3)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a))
# # Extensive testing
# i = 19000;
# while True:
# a = np.ones((11,i), dtype=config.floatX)
# fa = f(a)
# npa = np.cumsum(a, axis=1)
# if not np.allclose(npa, fa):
# print i, np.allclose(npa, fa) # Test axis=None
# print fa
# print npa
# assert False
# if i % 1000 == 0:
# print i
# i += 1
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论