Commit 65f5d0c7, authored by Marc-Alexandre Cote

Add a 2D version of cumsum

Parent commit: 6dbb2457
import numpy as np

import theano
from theano import config
from theano import tensor as T
from theano.tensor.extra_ops import cumsum, diff

from mlpython.misc.utils import Timer
...@@ -26,123 +26,148 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -26,123 +26,148 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
op = GpuCumsum op = GpuCumsum
dtypes = ['float32'] dtypes = ['float32']
def test_GpuCumsum(self): def test_benchmark_1D_vs_2D(self):
### Test 1D case ###
x = T.vector('x')
f = theano.function([x], cumsum(x))
# # Even number of elements
# a = np.random.random((18,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Odd number of elements
# a = np.random.random((7,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU threadblocks
# a = np.random.random((2048+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU threadblocks
# a = np.random.random((2048*75+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
# # Use multiple GPU gridblocks
# a = np.ones((2048*2048+2,)).astype(config.floatX)
# assert np.allclose(np.cumsum(a), f(a))
print "\nBenchmark:" print "\nBenchmark:"
import timeit as t from theano import sandbox, Out
#theano_time = t.timeit("np.ones((100,))", "import numpy as np", number=1000) import time
stmt = "f(a)"
setup = """
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.extra_ops import cumsum
from theano import config
x = T.vector('x')
f = theano.function([x], cumsum(x))
a = np.ones((100000,), dtype=config.floatX)
""".replace(" ", "")
theano_time = t.timeit(stmt, setup, number=1000)
print "Theano:\t", theano_time
stmt = "np.cumsum(a)"
setup = """
import numpy as np
from theano import config
a = np.ones((100000,), dtype=config.floatX)
""".replace(" ", "")
numpy_time = t.timeit(stmt, setup, number=1000)
print "Numpy:\t", numpy_time
print "Speedup: {0}x".format(numpy_time/theano_time)
vlen = 40 * 1024 * 2048 # 10 x # cores x # threads per core
iters = 25
# # Extensive testing x = theano.shared(np.ones((vlen,), dtype=config.floatX), borrow=False)
# i = 0; res = Out(sandbox.cuda.basic_ops.gpu_from_host(cumsum(x)), borrow=True)
# while True: f = theano.function([], res)
# a = np.ones((i,), dtype=config.floatX)
# fa = f(a) print f.maker.fgraph.toposort()
# npa = np.cumsum(a) t0 = time.time()
for i in xrange(iters):
r = f()
t1 = time.time()
print 'Looping %d times took' % iters, t1 - t0, 'seconds'
print 'Result is', r
print 'Numpy result is', np.asarray(r)
# if not np.allclose(npa, fa): # x = theano.shared(np.ones((1,vlen), dtype=config.floatX), borrow=True)
# print i, np.allclose(npa, fa) # Test axis=None # f = theano.function([], Out(sandbox.cuda.basic_ops.gpu_from_host(cumsum(x,axis=1)), borrow=True))
# print fa
# print npa
# assert False
# if i % 1000 == 0: # print f.maker.fgraph.toposort()
# print i # t0 = time.time()
# for i in xrange(iters):
# r = f()
# t1 = time.time()
# print 'Looping %d times took' % iters, t1 - t0, 'seconds'
# print 'Result is', r
# print 'Numpy result is', np.asarray(r)
# i += 1 # print 'Used the', config.device
# ### Test 2D case - axis=1 ### def test_GpuCumsum(self):
# x = T.matrix('x') ### Test 1D case ###
# f = theano.function([x], cumsum(x, axis=1)) x = T.vector('x')
f = theano.function([x], cumsum(x))
# # # Even number of elements # Even number of elements
# # print "\n# Even number of elements" a = np.random.random((18,)).astype(config.floatX)
# # a = np.random.random((18,18)).astype(config.floatX) print f(a)
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) print np.cumsum(a)
assert np.allclose(np.cumsum(a), f(a))
# # # Odd number of elements
# # print "\n# Odd number of elements" # Odd number of elements
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) a = np.random.random((7,)).astype(config.floatX)
assert np.allclose(np.cumsum(a), f(a))
# # # Use multiple GPU threadblocks
# # print "\n# Use multiple GPU threadblocks" # Use multiple GPU threadblocks
# # a = np.random.random((2048+2,2048+2)).astype(config.floatX) a = np.random.random((2048+2,)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a)) assert np.allclose(np.cumsum(a), f(a))
# # # Use multiple GPU threadblocks # Use multiple GPU threadblocks
# # print "\n# Use multiple GPU threadblocks" a = np.random.random((2048*75+2,)).astype(config.floatX)
# # a = np.ones((10,2048*75+3)).astype(config.floatX) assert np.allclose(np.cumsum(a), f(a))
# # assert np.allclose(np.cumsum(a, axis=1), f(a))
# Use multiple GPU gridblocks
# # # Use multiple GPU gridblocks a = np.ones((2048*2048+2,)).astype(config.floatX)
# # print "\n# Use multiple GPU gridblocks" assert np.allclose(np.cumsum(a), f(a))
# # a = np.ones((11,2048*2048+3)).astype(config.floatX)
# # assert np.allclose(np.cumsum(a, axis=1), f(a))
# Extensive testing
# # Extensive testing for i in xrange(int(1e3)*5):
# i = 19000; a = np.ones((i,), dtype=config.floatX)
# while True:
# a = np.ones((11,i), dtype=config.floatX) fa = f(a)
# fa = f(a) npa = np.cumsum(a)
# npa = np.cumsum(a, axis=1)
if not np.allclose(npa, fa):
# if not np.allclose(npa, fa): print i, np.allclose(npa, fa) # Test axis=None
# print i, np.allclose(npa, fa) # Test axis=None print fa
# print fa print npa
# print npa assert False
# assert False
if i % 1000 == 0:
# if i % 1000 == 0: print i
# print i
# i += 1 #for axis in xrange(2):
for axis in xrange(2):
### Test 2D case - axis=1 ###
x = T.matrix('x')
f = theano.function([x], cumsum(x, axis=axis))
# Even number of elements
print "\n# Even number of elements (axis={0})".format(axis)
a = np.random.random((18,18)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Odd number of elements
print "\n# Odd number of elements (axis={0})".format(axis)
a = np.random.random((21,21)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Use two GPU threadblocks
print "\n# Use two GPU threadblocks (axis={0})".format(axis)
a = np.random.random((2048+2,2048+2)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU threadblocks
print "\n# Use multiple GPU threadblocks (axis={0})".format(axis)
a = np.ones((10,2048*75+3)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
a = np.ones((2048*75+3,10)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks
print "\n# Use multiple GPU gridblocks (axis={0})".format(axis)
a = np.ones((11,2048*2048+3)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
a = np.ones((2048*2048+3,11)).astype(config.floatX)
assert np.allclose(np.cumsum(a, axis=axis), f(a))
# Extensive testing for the first 10k sizes
for i in xrange(int(1e3)*5):
a = np.ones((11,i), dtype=config.floatX)
fa = f(a)
npa = np.cumsum(a, axis=axis)
if not np.allclose(npa, fa):
print i, np.allclose(npa, fa) # Test axis=None
print fa
print npa
assert False
a = np.ones((i,11), dtype=config.floatX)
fa = f(a)
npa = np.cumsum(a, axis=axis)
if not np.allclose(npa, fa):
print i, np.allclose(npa, fa) # Test axis=None
print fa
print npa
assert False
if i % 1000 == 0:
print i
\ No newline at end of file
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment