Commit 4c0ca769 authored by abergeron

Merge pull request #1760 from MarcCote/cumsum_gpu

CumsumOp on GPU
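For context, a minimal sketch of what this PR enables: compiling a cumsum graph with a GPU-including mode should replace the CPU CumsumOp with the new GpuCumsum. This is an illustration only, assuming a working CUDA setup and float32 inputs, mirroring the tests added below:

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.extra_ops import cumsum

# Assumes a CUDA-capable device is available and configured.
mode = theano.compile.mode.get_default_mode().including('gpu')

x = T.fvector('x')
f = theano.function([x], cumsum(x), mode=mode)

a = np.random.random(1000).astype('float32')
assert np.allclose(np.cumsum(a), f(a))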
@@ -1958,3 +1958,5 @@ optdb.register('gpu_scanOp_make_inplace',
                'fast_run',
                'inplace',
                'scan')
+
+import theano.sandbox.cuda.extra_ops
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest

import numpy as np

import theano
from theano import config
from theano import tensor as T
from theano.tensor.extra_ops import cumsum, CumsumOp

import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False:
    raise SkipTest('Optional package cuda disabled')

import theano.tensor.tests.test_extra_ops
from theano.sandbox.cuda.extra_ops import GpuCumsum

if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
    mode = mode_with_gpu

    def setUp(self):
        super(TestGpuCumsum, self).setUp()

        # Fetch some useful properties on the device
        cuda = theano.sandbox.cuda
        device_id = cuda.use.device_number
        if device_id is None:
            cuda.use("gpu",
                     force=False,
                     default_to_move_computation_to_gpu=False,
                     move_shared_float32_to_gpu=False,
                     enable_cuda=False,
                     test_driver=True)
            device_id = cuda.use.device_number
        cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
        prop = cuda_ndarray.device_properties(device_id)
        self.max_threads_dim0 = prop['maxThreadsDim0']
        self.max_grid_size1 = prop['maxGridSize1']
    def test_Strides1D(self):
        x = T.fvector('x')

        # Stepped strides
        f = theano.function([x], cumsum(x[::2]), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
                if isinstance(n.op, GpuCumsum)]
        a = np.random.randint(10, size=(42,)).astype("float32")
        assert np.allclose(np.cumsum(a[::2]), f(a))

        # Alternative stepped strides: pass an already-strided array
        # to a graph compiled on the full input.
        f = theano.function([x], cumsum(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
                if isinstance(n.op, GpuCumsum)]
        a = np.random.randint(10, size=(42,)).astype("float32")
        assert np.allclose(np.cumsum(a[::2]), f(a[::2]))

        # Negative strides
        f = theano.function([x], cumsum(x[::-1]), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
                if isinstance(n.op, GpuCumsum)]
        a = np.random.randint(10, size=(42,)).astype("float32")
        assert np.allclose(np.cumsum(a[::-1]), f(a))
    def test_Strides2D(self):
        x = T.fmatrix('x')

        for shape_axis, axis in zip([0, 1, 0], [0, 1, None]):
            a = np.random.random((42, 30)).astype("float32")

            # Stepped strides along axis=0
            f = theano.function([x], cumsum(x[::2], axis=axis),
                                mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort()
                    if isinstance(n.op, GpuCumsum)]
            assert np.allclose(np.cumsum(a[::2], axis=axis), f(a))

            # Stepped strides along axis=1
            f = theano.function([x], cumsum(x[:, ::2], axis=axis),
                                mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort()
                    if isinstance(n.op, GpuCumsum)]
            assert np.allclose(np.cumsum(a[:, ::2], axis=axis), f(a))

            # Alternative stepped strides along axis=0: pass an
            # already-strided array to a graph compiled on the full input.
            f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort()
                    if isinstance(n.op, GpuCumsum)]
            assert np.allclose(np.cumsum(a[::2], axis=axis), f(a[::2]))

            # Alternative stepped strides along axis=1
            f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort()
                    if isinstance(n.op, GpuCumsum)]
            assert np.allclose(np.cumsum(a[:, ::2], axis=axis), f(a[:, ::2]))

            # Negative strides along axis=0
            f = theano.function([x], cumsum(x[::-1], axis=axis),
                                mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort()
                    if isinstance(n.op, GpuCumsum)]
            assert np.allclose(np.cumsum(a[::-1], axis=axis), f(a))

            # Negative strides along axis=1
            f = theano.function([x], cumsum(x[:, ::-1], axis=axis),
                                mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort()
                    if isinstance(n.op, GpuCumsum)]
            assert np.allclose(np.cumsum(a[:, ::-1], axis=axis), f(a))
    def test_GpuCumsum1D(self):
        block_max_size = self.max_threads_dim0 * 2

        x = T.fvector('x')
        f = theano.function([x], cumsum(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
                if isinstance(n.op, GpuCumsum)]

        # Extensive testing for the first 1025 sizes
        a = np.random.random(1025).astype("float32")
        for i in xrange(a.shape[0]):
            assert np.allclose(np.cumsum(a[:i]), f(a[:i]))

        # Use multiple GPU threadblocks
        a = np.random.random((block_max_size + 2,)).astype("float32")
        assert np.allclose(np.cumsum(a), f(a))

        # Use recursive cumsum
        a = np.ones((block_max_size * (block_max_size + 1) + 2,),
                    dtype="float32")
        assert np.allclose(np.cumsum(a), f(a))
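The block_max_size = self.max_threads_dim0 * 2 bound and the "recursive cumsum" case suggest a scan kernel where each thread block handles two elements per thread, with the per-block totals scanned and added back as offsets (recursively, when the block totals themselves overflow one block). A NumPy sketch of that decomposition, as an illustration only and an assumption about the CUDA implementation:

import numpy as np

def blocked_cumsum(a, block=4):
    # Scan each block independently, then offset every block by the
    # cumulative sum of the preceding block totals.
    out = np.empty_like(a)
    sums = []
    for start in range(0, len(a), block):
        chunk = np.cumsum(a[start:start + block])
        out[start:start + len(chunk)] = chunk
        sums.append(chunk[-1])
    # Exclusive scan of the block totals gives each block's offset; a
    # real multi-block kernel would recurse here if the number of block
    # totals itself exceeds what a single block can scan.
    offsets = np.concatenate(([0], np.cumsum(sums)[:-1]))
    for i, start in enumerate(range(0, len(a), block)):
        out[start:start + block] += offsets[i]
    return out

a = np.random.random(37).astype('float32')
assert np.allclose(blocked_cumsum(a), np.cumsum(a))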
    def test_GpuCumsum2D(self):
        block_max_size = self.max_threads_dim0 * 2

        x = T.fmatrix('x')
        for shape_axis, axis in zip([0, 1, 0], [0, 1, None]):
            f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
            assert [n for n in f.maker.fgraph.toposort()
                    if isinstance(n.op, GpuCumsum)]

            # Extensive testing for the first 1025 sizes
            a_shape = [5, 5]
            a_shape[shape_axis] = 1025
            a = np.random.random(a_shape).astype("float32")
            slices = [slice(None), slice(None)]
            for i in xrange(a.shape[shape_axis]):
                slices[shape_axis] = slice(i)
                fa = f(a[slices])
                npa = np.cumsum(a[slices], axis=axis)
                assert np.allclose(npa, fa)

            # Use multiple GPU threadblocks
            a_shape = [5, 5]
            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            assert np.allclose(np.cumsum(a, axis=axis), f(a))

            # Use multiple GPU gridblocks
            a_shape = [5, 5]
            a_shape[1 - shape_axis] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            assert np.allclose(np.cumsum(a, axis=axis), f(a))

            # Use recursive cumsum
            a_shape = [5, 3]
            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.ones(a_shape, dtype="float32")
            assert np.allclose(np.cumsum(a, axis=axis), f(a))
    def test_GpuCumsum3D(self):
        # Should not use the GPU version: GpuCumsum handles only 1D and
        # 2D inputs, so a 3D input falls back to the CPU CumsumOp.
        x = T.ftensor3('x')
        f = theano.function([x], cumsum(x, axis=1), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
                if isinstance(n.op, CumsumOp)]
@@ -28,6 +28,8 @@ class CumsumOp(theano.Op):
         if self.axis is None:
             out_type = theano.tensor.vector(dtype=x.dtype)  # Flatten
+        elif self.axis >= x.ndim:
+            raise ValueError('axis(={0}) out of bounds'.format(self.axis))
 
         return theano.Apply(self, [x], [out_type])
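A quick sketch of the behavior this check adds, mirroring the test added to TestCumsumOp below; the error is raised at graph-construction time, inside make_node:

import theano.tensor as T
from theano.tensor.extra_ops import cumsum

x = T.tensor3('x')
cumsum(x, axis=1)       # fine: axis < x.ndim
try:
    cumsum(x, axis=4)   # x.ndim == 3, so axis 4 is out of bounds
except ValueError as e:
    print(e)            # "axis(=4) out of bounds"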
@@ -148,6 +150,8 @@ class CumprodOp(theano.Op):
         if self.axis is None:
             out_type = theano.tensor.vector(dtype=x.dtype)  # Flatten
+        elif self.axis >= x.ndim:
+            raise ValueError('axis(={0}) out of bounds'.format(self.axis))
 
         return theano.Apply(self, [x], [out_type])
@@ -28,6 +28,9 @@ class TestCumsumOp(utt.InferShapeTester):
         x = T.tensor3('x')
         a = np.random.random((3, 5, 2)).astype(config.floatX)
 
+        # Test axis out of bounds
+        self.assertRaises(ValueError, cumsum, x, axis=4)
+
         f = theano.function([x], cumsum(x))
         assert np.allclose(np.cumsum(a), f(a))  # Test axis=None
 
@@ -35,7 +38,6 @@ class TestCumsumOp(utt.InferShapeTester):
             f = theano.function([x], cumsum(x, axis=axis))
             assert np.allclose(np.cumsum(a, axis=axis), f(a))
 
-
     def test_infer_shape(self):
         x = T.tensor3('x')
         a = np.random.random((3, 5, 2)).astype(config.floatX)