提交 6657d35b authored 作者: goodfeli's avatar goodfeli

Merge pull request #947 from nouiz/gpu_reduce

Gpu reduce small change.
...@@ -373,7 +373,12 @@ def use(device, ...@@ -373,7 +373,12 @@ def use(device,
if test_driver: if test_driver:
import theano.sandbox.cuda.tests.test_driver import theano.sandbox.cuda.tests.test_driver
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1() theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
# Kernel code in this backend (e.g. the GpuCAReduce reduction kernels)
# is written for a warp size of exactly 32, so reject any other device
# up front instead of letting a kernel produce a wrong result later.
warp_size = device_properties(use.device_number)["warpSize"]
if warp_size != 32:
    raise ValueError("Your GPU has a warpSize of %s, not 32. Currently"
                     " we have code that depends on this. Email the"
                     " Theano mailing list to tell us about"
                     " this new GPU as we don't know of any with"
                     " this property." % warp_size)
if move_shared_float32_to_gpu: if move_shared_float32_to_gpu:
handle_shared_float32(True) handle_shared_float32(True)
......
...@@ -624,8 +624,8 @@ class GpuCAReduce(GpuOp): ...@@ -624,8 +624,8 @@ class GpuCAReduce(GpuOp):
# but tensor.elemwise.CAReduce has this exact same check so I guess # but tensor.elemwise.CAReduce has this exact same check so I guess
# this is OK to do # this is OK to do
if self.scalar_op in [scal.minimum, scal.maximum]: if self.scalar_op in [scal.minimum, scal.maximum]:
conds = []
for i in xrange(nd_in): for i in xrange(nd_in):
conds = []
if self.reduce_mask[i]: if self.reduce_mask[i]:
conds.append("(CudaNdarray_HOST_DIMS(%(x)s)[%(i)s] == 0)" % locals()) conds.append("(CudaNdarray_HOST_DIMS(%(x)s)[%(i)s] == 0)" % locals())
assert len(conds) > 0 assert len(conds) > 0
...@@ -723,7 +723,7 @@ class GpuCAReduce(GpuOp): ...@@ -723,7 +723,7 @@ class GpuCAReduce(GpuOp):
if (verbose) if (verbose)
printf("running kernel_reduce_10_%(name)s\\n"); printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_%(name)s<<<n_blocks, n_threads, kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>( n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
...@@ -862,11 +862,10 @@ class GpuCAReduce(GpuOp): ...@@ -862,11 +862,10 @@ class GpuCAReduce(GpuOp):
extern __shared__ float buf[]; extern __shared__ float buf[];
float myresult = 0.0f; float myresult = 0.0f;
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
if (warpSize != 32) if (warpSize != 32)
{ {
// TODO: set error code
// 2012-09-20 IG: as of today, Fred says he will check
// this elsewhere, in a different PR
Z[0] = -666; Z[0] = -666;
return; return;
} }
......
...@@ -42,7 +42,15 @@ def tes_use(): ...@@ -42,7 +42,15 @@ def tes_use():
tcn.use() tcn.use()
def test_sum(): def tensor_pattern_to_gpu_pattern(shape, pattern):
gpu_pattern = [0 for elem in shape]
for idx in pattern:
gpu_pattern[idx] = 1
gpu_pattern = tuple(gpu_pattern)
return gpu_pattern
def test_careduce():
""" """
test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111, test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111,
0011, 0101, 0111, 1011, 1111 0011, 0101, 0111, 1011, 1111
...@@ -56,363 +64,198 @@ def test_sum(): ...@@ -56,363 +64,198 @@ def test_sum():
TODO: test with broadcast TODO: test with broadcast
""" """
for shape, pattern in [((1,1),(1,)), for scalar_op in [theano.scalar.add, theano.scalar.maximum]:
((1,0),(1,)), for shape, pattern in [((1,1),(1,)),
((0,1),(1,)), ((1,0),(1,)),
((0,0),(1,)), ((0,1),(1,)),
((0,0,0),(1,2)), ((0,0),(1,)),
((0,0,0,0),(1,2,3)), ((0,0,0),(1,2)),
((2,1),(1,)), ((0,0,0,0),(1,2,3)),
((1,2),(1,)), ((2,1),(1,)),
((100,3,1300),[1]), ((1,2),(1,)),
((0,),[0]),((5,),[0]), ((100,3,1300),[1]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test. ((0,),[0]),((5,),[0]),
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]), ((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
((0,0,0,0),[0,1,2,3]), ((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]), ((0,0,0,0),[0,1,2,3]),
((5,4,3,10,11),[1,2]), ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]), ((5,4,3,10,11),[1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
((4100,3),[0]),((3,4101),[0]),#10 #test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
((1024,33),[0]),((33,1024),[0]),#10 ((4100,3),[0]),((3,4101),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10 ((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((1024,33),[1]),((33,1024),[1]),#01 ((4100,3),[1]),((3,4101),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01 ((1024,33),[1]),((33,1024),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((1024,33),[0,1]),((33,1024),[0,1]),#01 ((4100,3),[0,1]),((3,4101),[0,1]),#11
((1025,33),[0,1]),((33,1025),[0,1]),#01 ((1024,33),[0,1]),((33,1024),[0,1]),#01
((1025,33),[0,1]),((33,1025),[0,1]),#01
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010 ((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001 ((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110 ((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011 ((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111 #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101 ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011 ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111 ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111 ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
#test pattern implemented by reshape
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000 #test pattern implemented by reshape
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100 ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010 ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001 ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111 ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]:
a = tensor.TensorType('float32', (False,) * len(shape))() ]:
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) op = tensor.CAReduce(scalar_op, axis=pattern)
# val = numpy.ones(shape) pat = tensor_pattern_to_gpu_pattern(shape, pattern)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) #GpuCAReduce{maximum} support only those patterns
val = theano._asarray(val, dtype='float32') if scalar_op is theano.scalar.maximum and pat not in [
f = theano.function([a], b, mode=mode_with_gpu) (0, 1), (0, 1, 1), (0, 1, 1)]:
f2 = theano.function([a], b, mode=mode_without_gpu) continue
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
if val.size == 0:
assert _allclose(f2(val), f(val)), ('shape', shape, 'pattern', pattern)
else:
try:
#We raise the error threashold as we sum big matrix
#and this cause small rounding difference with some seed
#example in debug mode with unittests.rseed=9275
orig_rtol = theano.tensor.basic.float32_rtol
theano.tensor.basic.float32_rtol = 2e-5
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]),
f2(val), f(val), val)
finally:
theano.tensor.basic.float32_rtol = orig_rtol
#test with dimshuffle
#we shuffle the 2 outer dims.
for shape, pattern in [#((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = range(len(shape))
dim_pattern[0] = 1
dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern)
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
#test with broadcast
for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = T.Sum(pattern)(a)
b2 = T.Sum(pattern)(a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
val2 = cuda.CudaNdarray(val)
if len(shape) == 1:
val = val[::2]
val2 = val2[::2]
elif len(shape) == 2:
val = val[::2, ::2]
val2 = val2[::2, ::2]
elif len(shape) == 3:
val = val[::2, ::2, ::2]
val2 = val2[::2, ::2, ::2]
elif len(shape) == 4:
val = val[::2, ::2, ::2, ::2]
val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
def test_max():
"""
test GpuMax pattern 01, 011, 0111 (tensor.max pattern (1,), (1,2), (1,2,3) )
TODO: are others currently implemented by reshape? a = tensor.TensorType('float32', (False,) * len(shape))()
""" b = op(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()]
f_caused_value_error = False
try:
f_out = f(val)
except ValueError, e:
exc = e
f_caused_value_error = True
def tensor_pattern_to_gpu_pattern(shape, pattern):
    """Turn a list of reduced axes into a 0/1 mask with one entry per dim.

    :param shape: shape of the input; only its number of dimensions
        is used.
    :param pattern: iterable of axis indices that are reduced.

    :return: tuple with as many entries as `shape`, holding 1 at every
        index listed in `pattern` and 0 everywhere else, e.g.
        shape=(5, 4, 3), pattern=[1, 2] gives (0, 1, 1).

    :raises IndexError: if an index in `pattern` is out of range for
        `shape`.
    """
    gpu_pattern = [0 for _ in shape]
    for idx in pattern:
        gpu_pattern[idx] = 1
    return tuple(gpu_pattern)
known_fail = False
for shape, pattern in [((1,1),(1,)),
((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3)),
((2,1),(1,)),
((1,2),(1,)),
((100,3,1300),[1]),
((0,),[0]),((5,),[0]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
((5,4,3,10,11),[1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
((4100,3),[0]),((3,4101),[0]),#10
((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((1024,33),[1]),((33,1024),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((1024,33),[0,1]),((33,1024),[0,1]),#01
((1025,33),[0,1]),((33,1025),[0,1]),#01
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
#test pattern implemented by reshape
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1) ]:
continue
a = tensor.TensorType('float32', (False,) * len(shape))()
b = T.max(a, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f_caused_value_error = False
try:
f_out = f(val)
except ValueError, e:
exc = e
f_caused_value_error = True
f2 = theano.function([a], b, mode=mode_without_gpu)
try:
f2_out = f2(val)
f2_caused_value_error = False f2_caused_value_error = False
except ValueError, e: try:
exc2 = e f2_out = f2(val)
f2_caused_value_error = True except ValueError, e:
exc2 = e
f2_caused_value_error = True
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()] if f_caused_value_error != f2_caused_value_error:
if f_caused_value_error:
print 'f caused this value error:'
print exc
else:
print 'f did not raise a value error, but should have'
if f2_caused_value_error:
print 'f2 caused this value error:'
print exc2
else:
print 'f should not have raised a value error'
print 'shape was: ', shape
print 'pattern was: ', pattern
assert False
# Check that 0 shape matrices are invalid in the same cases
if f_caused_value_error != f2_caused_value_error:
if f_caused_value_error:
print 'f caused this value error:'
print exc
else:
print 'f did not raise a value error, but should have'
if f2_caused_value_error:
print 'f2 caused this value error:'
print exc2
else:
print 'f should not have raised a value error'
print 'shape was: ',shape
print 'pattern was: ',pattern
assert False
if f_caused_value_error:
continue
if val.size == 0:
assert f2(val).size == f(val).size
assert f2(val).shape == f(val).shape
else:
try: try:
#We raise the error threashold as we sum big matrix #We raise the error threashold as we sum big matrix
#and this cause small rounding difference with some seed #and this cause small rounding difference with some seed
#example in debug mode with unittests.rseed=9275 #example in debug mode with unittests.rseed=9275
orig_rtol = theano.tensor.basic.float32_rtol orig_rtol = theano.tensor.basic.float32_rtol
theano.tensor.basic.float32_rtol = 2e-5 theano.tensor.basic.float32_rtol = 2e-5
f2_val = f2(val) assert _allclose(f_out, f2_out), ('shape', shape,
f_val = f(val) 'pattern', pattern,
if not _allclose(f2_val, f_val): sum([shape[i] for i in pattern]),
print 'failed for the following arguments: ' f2(val), f(val), val)
print 'shape:',shape
print 'pattern: ',pattern
print 'input:'
print val
print 'correct output: '
print f2_val
print 'actual output: '
print f_val
assert False
finally: finally:
theano.tensor.basic.float32_rtol = orig_rtol theano.tensor.basic.float32_rtol = orig_rtol
#test with dimshuffle #test with dimshuffle
#we shuffle the 2 outer dims. #we shuffle the 2 outer dims.
for shape, pattern in [#((5,),[0]), for shape, pattern in [#((5,),[0]),
((5,4),(0,1)),((5,4),[0]), ((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]), ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]: ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
# Don't test patterns that aren't implemented for max yet op = tensor.CAReduce(scalar_op, axis=pattern)
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \ pat = tensor_pattern_to_gpu_pattern(shape, pattern)
[ (0,1), (0,1,1), (0,1,1) ]: #GpuCAReduce{maximum} support only those patterns
continue if scalar_op is theano.scalar.maximum and pat not in [
a = tensor.TensorType('float32', (False,) * len(shape))() (0, 1), (0, 1, 1), (0, 1, 1)]:
dim_pattern = range(len(shape)) continue
dim_pattern[0] = 1 a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern[1] = 0 dim_pattern = range(len(shape))
a = a.dimshuffle(dim_pattern) dim_pattern[0] = 1
b = T.max(a, pattern) dim_pattern[1] = 0
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) a = a.dimshuffle(dim_pattern)
# val = numpy.ones(shape) b = op(a)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32') # val = numpy.ones(shape)
f = theano.function([a], b, mode=mode_with_gpu) # val = numpy.arange(numpy.prod(shape)).reshape(shape)
f2 = theano.function([a], b, mode=mode_without_gpu) val = theano._asarray(val, dtype='float32')
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()] f = theano.function([a], b, mode=mode_with_gpu)
assert T.CAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()] f2 = theano.function([a], b, mode=mode_without_gpu)
assert _allclose(f2(val), f(val)), ('shape', shape, assert tcn.GpuCAReduce in [x.op.__class__
'pattern', pattern, for x in f.maker.fgraph.toposort()]
sum([shape[i] for i in pattern])) assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()]
#test with broadcast assert _allclose(f2(val), f(val)), ('shape', shape,
for shape, pattern in [((5,),(0,)), 'pattern', pattern,
((5,4),(0,1)), sum([shape[i] for i in pattern]))
((5,4),(0,)),
((5,4,3),(0,)), #test with broadcast
((5,4,3),(0,1)), for shape, pattern in [((5,),[0]),
((5,4,3),(2,)), ((5,4),[0,1]),((5,4),[0]),
((5,4,3),(0,1,2)), ((5,4,3),[0]),((5,4,3),[0,1]),
((5,4,3,2),(0,1,2,3)), ((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),(0,2,3))]: ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
# Don't test patterns that aren't implemented for max yet op = tensor.CAReduce(scalar_op, axis=pattern)
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \ pat = tensor_pattern_to_gpu_pattern(shape, pattern)
[ (0,1), (0,1,1), (0,1,1) ]: #GpuCAReduce{maximum} support only those patterns
continue if scalar_op is theano.scalar.maximum and pat not in [
shape = numpy.asarray(shape) * 2 (0, 1), (0, 1, 1), (0, 1, 1)]:
a = tensor.TensorType('float32', (False,) * len(shape))() continue
a2 = tcn.CudaNdarrayType((False,) * len(shape))() shape = numpy.asarray(shape) * 2
b = T.max(a, pattern) a = tensor.TensorType('float32', (False,) * len(shape))()
b2 = T.max(a2, pattern) a2 = tcn.CudaNdarrayType((False,) * len(shape))()
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) b = op(a)
# val = numpy.ones(shape) b2 = op(a2)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32') # val = numpy.ones(shape)
val2 = cuda.CudaNdarray(val) # val = numpy.arange(numpy.prod(shape)).reshape(shape)
if len(shape) == 1: val = theano._asarray(val, dtype='float32')
val = val[::2] val2 = cuda.CudaNdarray(val)
val2 = val2[::2] if len(shape) == 1:
elif len(shape) == 2: val = val[::2]
val = val[::2, ::2] val2 = val2[::2]
val2 = val2[::2, ::2] elif len(shape) == 2:
elif len(shape) == 3: val = val[::2, ::2]
val = val[::2, ::2, ::2] val2 = val2[::2, ::2]
val2 = val2[::2, ::2, ::2] elif len(shape) == 3:
elif len(shape) == 4: val = val[::2, ::2, ::2]
val = val[::2, ::2, ::2, ::2] val2 = val2[::2, ::2, ::2]
val2 = val2[::2, ::2, ::2, ::2] elif len(shape) == 4:
f = theano.function([a], b, mode=mode_without_gpu) val = val[::2, ::2, ::2, ::2]
f2 = theano.function([a2], b2, mode=mode_with_gpu) val2 = val2[::2, ::2, ::2, ::2]
assert tcn.GpuCAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()] f = theano.function([a], b, mode=mode_without_gpu)
assert T.CAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()] f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert _allclose(f2(val2), f(val)), ('shape', shape, assert tcn.GpuCAReduce in [x.op.__class__
'pattern', pattern, for x in f2.maker.fgraph.toposort()]
sum([shape[i] for i in pattern])) assert op.__class__ in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
def test_flatten(): def test_flatten():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论