Commit 6657d35b authored by goodfeli

Merge pull request #947 from nouiz/gpu_reduce

Gpu reduce small change.
@@ -373,7 +373,12 @@ def use(device,
if test_driver:
import theano.sandbox.cuda.tests.test_driver
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if device_properties(use.device_number)["warpSize"] != 32:
raise ValueError("Your GPU have a warpSize of 32. Currently"
" we have code that depend on this. Email"
" Theano mailing list to tell us about"
" this new GPU as we don't know any with"
" this properties")
if move_shared_float32_to_gpu:
handle_shared_float32(True)
......
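For context, the new guard relies on device_properties exposing the CUDA warpSize attribute of the selected device. A minimal standalone sketch of the same check using PyCUDA (PyCUDA is an assumption here; the diff itself goes through Theano's own device_properties):

import pycuda.driver as drv

drv.init()
dev = drv.Device(0)  # the same device number that use() would select
warp_size = dev.get_attribute(drv.device_attribute.WARP_SIZE)
if warp_size != 32:
    # Mirrors the ValueError raised above in cuda/__init__.py.
    raise ValueError("warpSize is %d, but the GPU code assumes 32" % warp_size)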
@@ -624,8 +624,8 @@ class GpuCAReduce(GpuOp):
# but tensor.elemwise.CAReduce has this exact same check so I guess
# this is OK to do
if self.scalar_op in [scal.minimum, scal.maximum]:
conds = []
for i in xrange(nd_in):
conds = []
if self.reduce_mask[i]:
conds.append("(CudaNdarray_HOST_DIMS(%(x)s)[%(i)s] == 0)" % locals())
assert len(conds) > 0
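The visible change in this hunk appears to hoist conds = [] out of the xrange(nd_in) loop; re-initializing the list on every iteration would keep only the zero-size check for the last reduced dimension, defeating the assert above. A reduced sketch of the before/after behaviour (function and variable names are illustrative, not from the diff):

def conds_buggy(nd_in, reduce_mask):
    # Reset inside the loop: only the last reduced dim's check survives.
    for i in range(nd_in):
        conds = []
        if reduce_mask[i]:
            conds.append("(dim[%d] == 0)" % i)
    return conds

def conds_fixed(nd_in, reduce_mask):
    # Initialized once: one check accumulates per reduced dim.
    conds = []
    for i in range(nd_in):
        if reduce_mask[i]:
            conds.append("(dim[%d] == 0)" % i)
    return conds

assert conds_buggy(3, (1, 0, 1)) == ["(dim[2] == 0)"]
assert conds_fixed(3, (1, 0, 1)) == ["(dim[0] == 0)", "(dim[2] == 0)"]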
@@ -723,7 +723,7 @@ class GpuCAReduce(GpuOp):
if (verbose)
printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x;
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
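This hunk enlarges the dynamic shared-memory allocation for kernel_reduce_10: every thread in the block writes one float into the extern shared buffer, so the byte count must cover all three block dimensions, not just x. A small illustration with a hypothetical 16x16x1 thread block (sizes are illustrative):

sizeof_float = 4
n_threads_x, n_threads_y, n_threads_z = 16, 16, 1

n_shared_old = sizeof_float * n_threads_x                               # 64 bytes
n_shared_new = sizeof_float * n_threads_x * n_threads_y * n_threads_z  # 1024 bytes

# With the old formula, 256 threads would share a 64-byte buffer and the
# reduction would read and write past the end of buf[].
assert n_shared_new == 256 * sizeof_float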
@@ -862,11 +862,10 @@ class GpuCAReduce(GpuOp):
extern __shared__ float buf[];
float myresult = 0.0f;
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that relies on this.
if (warpSize != 32)
{
// TODO: set error code
// 2012-09-20 IG: as of today, Fred says he will check
// this elsewhere, in a different PR
Z[0] = -666;
return;
}
......
@@ -42,7 +42,15 @@ def tes_use():
tcn.use()
def test_sum():
def tensor_pattern_to_gpu_pattern(shape, pattern):
gpu_pattern = [0 for elem in shape]
for idx in pattern:
gpu_pattern[idx] = 1
gpu_pattern = tuple(gpu_pattern)
return gpu_pattern
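The new module-level helper turns an axis list into the 0/1 reduce mask that the GPU op's patterns refer to (1 = reduced dimension, 0 = kept). For example, worked out from the function body:

assert tensor_pattern_to_gpu_pattern((5, 4, 3), [1, 2]) == (0, 1, 1)   # mask 011
assert tensor_pattern_to_gpu_pattern((4100, 3), [0]) == (1, 0)         # mask 10
assert tensor_pattern_to_gpu_pattern((33, 31), [0, 1]) == (1, 1)       # mask 11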
def test_careduce():
"""
test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111,
0011, 0101, 0111, 1011, 1111
@@ -56,363 +64,198 @@ def test_sum():
TODO: test with broadcast
"""
for shape, pattern in [((1,1),(1,)),
((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3)),
((2,1),(1,)),
((1,2),(1,)),
((100,3,1300),[1]),
((0,),[0]),((5,),[0]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger than 32 for some opt tests.
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
((5,4,3,10,11),[1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10
((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((1024,33),[1]),((33,1024),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((1024,33),[0,1]),((33,1024),[0,1]),#01
((1025,33),[0,1]),((33,1025),[0,1]),#01
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
#test pattern implemented by reshape
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]:
a = tensor.TensorType('float32', (False,) * len(shape))()
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
if val.size == 0:
assert _allclose(f2(val), f(val)), ('shape', shape, 'pattern', pattern)
else:
try:
#We raise the error threshold as we sum big matrices,
#which causes small rounding differences with some seeds,
#for example in debug mode with unittests.rseed=9275
orig_rtol = theano.tensor.basic.float32_rtol
theano.tensor.basic.float32_rtol = 2e-5
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]),
f2(val), f(val), val)
finally:
theano.tensor.basic.float32_rtol = orig_rtol
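The try/finally around float32_rtol is the test's way of widening the comparison tolerance for this one assert only. The same idea as a hypothetical context manager (not part of the diff; relaxed_float32_rtol is an invented name):

from contextlib import contextmanager

@contextmanager
def relaxed_float32_rtol(rtol=2e-5):
    # Widen the tolerance for the duration of a block, then always
    # restore the original value, even if the assert fails.
    orig_rtol = theano.tensor.basic.float32_rtol
    theano.tensor.basic.float32_rtol = rtol
    try:
        yield
    finally:
        theano.tensor.basic.float32_rtol = orig_rtol

# usage sketch:
# with relaxed_float32_rtol():
#     assert _allclose(f2(val), f(val))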
#test with dimshuffle
#we shuffle the 2 outer dims.
for shape, pattern in [#((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = range(len(shape))
dim_pattern[0] = 1
dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern)
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
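The dim_pattern built above is just range(len(shape)) with its first two entries swapped, so the dimshuffle presents the input with the two outer dimensions exchanged. Concretely, for a rank-3 shape:

shape = (5, 4, 3)
dim_pattern = list(range(len(shape)))   # [0, 1, 2]
dim_pattern[0] = 1
dim_pattern[1] = 0
assert dim_pattern == [1, 0, 2]
# a.dimshuffle([1, 0, 2]) then has shape (4, 5, 3).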
#test with broadcast
for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = T.Sum(pattern)(a)
b2 = T.Sum(pattern)(a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
val2 = cuda.CudaNdarray(val)
if len(shape) == 1:
val = val[::2]
val2 = val2[::2]
elif len(shape) == 2:
val = val[::2, ::2]
val2 = val2[::2, ::2]
elif len(shape) == 3:
val = val[::2, ::2, ::2]
val2 = val2[::2, ::2, ::2]
elif len(shape) == 4:
val = val[::2, ::2, ::2, ::2]
val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
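The [::2] slicing in this loop is what makes the inputs non-contiguous, so the GPU reduction is exercised on strided views rather than C-contiguous buffers (the shapes were doubled beforehand so the strided view gets back the original size). A numpy-only sketch of the effect:

v = numpy.arange(100, dtype='float32').reshape(10, 10)
w = v[::2, ::2]   # shape (5, 5), but the strides skip every other element
assert w.shape == (5, 5)
assert not w.flags['C_CONTIGUOUS']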
def test_max():
"""
test GpuMax pattern 01, 011, 0111 (tensor.max pattern (1,), (1,2), (1,2,3) )
for scalar_op in [theano.scalar.add, theano.scalar.maximum]:
for shape, pattern in [((1,1),(1,)),
((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3)),
((2,1),(1,)),
((1,2),(1,)),
((100,3,1300),[1]),
((0,),[0]),((5,),[0]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger than 32 for some opt tests.
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
((5,4,3,10,11),[1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10
((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((1024,33),[1]),((33,1024),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((1024,33),[0,1]),((33,1024),[0,1]),#01
((1025,33),[0,1]),((33,1025),[0,1]),#01
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
#test pattern implemented by reshape
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]:
op = tensor.CAReduce(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern)
#GpuCAReduce{maximum} supports only these patterns
if scalar_op is theano.scalar.maximum and pat not in [
(0, 1), (0, 1, 1), (0, 1, 1, 1)]:
continue
TODO: are others currently implemented by reshape?
"""
a = tensor.TensorType('float32', (False,) * len(shape))()
b = op(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()]
f_caused_value_error = False
try:
f_out = f(val)
except ValueError, e:
exc = e
f_caused_value_error = True
def tensor_pattern_to_gpu_pattern(shape, pattern):
gpu_pattern = [ 0 for elem in shape ]
for idx in pattern:
gpu_pattern[idx] = 1
gpu_pattern = tuple(gpu_pattern)
return gpu_pattern
known_fail = False
for shape, pattern in [((1,1),(1,)),
((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3)),
((2,1),(1,)),
((1,2),(1,)),
((100,3,1300),[1]),
((0,),[0]),((5,),[0]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger than 32 for some opt tests.
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
((5,4,3,10,11),[1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10
((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((1024,33),[1]),((33,1024),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((1024,33),[0,1]),((33,1024),[0,1]),#01
((1025,33),[0,1]),((33,1025),[0,1]),#01
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
#test pattern implemented by reshape
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1,1) ]:
continue
a = tensor.TensorType('float32', (False,) * len(shape))()
b = T.max(a, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f_caused_value_error = False
try:
f_out = f(val)
except ValueError, e:
exc = e
f_caused_value_error = True
f2 = theano.function([a], b, mode=mode_without_gpu)
try:
f2_out = f2(val)
f2_caused_value_error = False
except ValueError, e:
exc2 = e
f2_caused_value_error = True
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
try:
f2_out = f2(val)
except ValueError, e:
exc2 = e
f2_caused_value_error = True
if f_caused_value_error != f2_caused_value_error:
if f_caused_value_error:
print 'f caused this value error:'
print exc
else:
print 'f did not raise a value error, but should have'
if f2_caused_value_error:
print 'f2 caused this value error:'
print exc2
else:
print 'f should not have raised a value error'
print 'shape was: ', shape
print 'pattern was: ', pattern
assert False
# Check that 0 shape matrices are invalid in the same cases
if f_caused_value_error != f2_caused_value_error:
if f_caused_value_error:
print 'f caused this value error:'
print exc
else:
print 'f did not raise a value error, but should have'
if f2_caused_value_error:
print 'f2 caused this value error:'
print exc2
else:
print 'f should not have raised a value error'
print 'shape was: ',shape
print 'pattern was: ',pattern
assert False
if f_caused_value_error:
continue
if val.size == 0:
assert f2(val).size == f(val).size
assert f2(val).shape == f(val).shape
else:
try:
#We raise the error threshold as we sum big matrices,
#which causes small rounding differences with some seeds,
#for example in debug mode with unittests.rseed=9275
orig_rtol = theano.tensor.basic.float32_rtol
theano.tensor.basic.float32_rtol = 2e-5
f2_val = f2(val)
f_val = f(val)
if not _allclose(f2_val, f_val):
print 'failed for the following arguments: '
print 'shape:',shape
print 'pattern: ',pattern
print 'input:'
print val
print 'correct output: '
print f2_val
print 'actual output: '
print f_val
assert False
assert _allclose(f_out, f2_out), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]),
f2(val), f(val), val)
finally:
theano.tensor.basic.float32_rtol = orig_rtol
#test with dimshuffle
#we shuffle the 2 outer dims.
for shape, pattern in [#((5,),[0]),
((5,4),(0,1)),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1,1) ]:
continue
a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = range(len(shape))
dim_pattern[0] = 1
dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern)
b = T.max(a, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
#test with broadcast
for shape, pattern in [((5,),(0,)),
((5,4),(0,1)),
((5,4),(0,)),
((5,4,3),(0,)),
((5,4,3),(0,1)),
((5,4,3),(2,)),
((5,4,3),(0,1,2)),
((5,4,3,2),(0,1,2,3)),
((5,4,3,2),(0,2,3))]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1,1) ]:
continue
shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = T.max(a, pattern)
b2 = T.max(a2, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
val2 = cuda.CudaNdarray(val)
if len(shape) == 1:
val = val[::2]
val2 = val2[::2]
elif len(shape) == 2:
val = val[::2, ::2]
val2 = val2[::2, ::2]
elif len(shape) == 3:
val = val[::2, ::2, ::2]
val2 = val2[::2, ::2, ::2]
elif len(shape) == 4:
val = val[::2, ::2, ::2, ::2]
val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
#test with dimshuffle
#we shuffle the 2 outer dims.
for shape, pattern in [#((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
op = tensor.CAReduce(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern)
#GpuCAReduce{maximum} supports only these patterns
if scalar_op is theano.scalar.maximum and pat not in [
(0, 1), (0, 1, 1), (0, 1, 1, 1)]:
continue
a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = range(len(shape))
dim_pattern[0] = 1
dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern)
b = op(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()]
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
#test with broadcast
for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),
((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
op = tensor.CAReduce(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern)
#GpuCAReduce{maximum} supports only these patterns
if scalar_op is theano.scalar.maximum and pat not in [
(0, 1), (0, 1, 1), (0, 1, 1, 1)]:
continue
shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = op(a)
b2 = op(a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
val2 = cuda.CudaNdarray(val)
if len(shape) == 1:
val = val[::2]
val2 = val2[::2]
elif len(shape) == 2:
val = val[::2, ::2]
val2 = val2[::2, ::2]
elif len(shape) == 3:
val = val[::2, ::2, ::2]
val2 = val2[::2, ::2, ::2]
elif len(shape) == 4:
val = val[::2, ::2, ::2, ::2]
val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuCAReduce in [x.op.__class__
for x in f2.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
def test_flatten():
......