Commit 4927f127 authored by abergeron

Merge pull request #1761 from nouiz/mixed

Mixed
@@ -165,8 +165,12 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
     my_pid = os.getpid()
     no_display = (verbosity == 0)
-    # Acquire lock.
     nb_error = 0
+    # The number of times we have slept while there was no error.
+    # Used to skip the message on the first wait and to display it less
+    # frequently afterwards, so we do not get as many emails about it!
+    nb_wait = 0
+    # Acquire lock.
     while True:
         try:
             last_owner = 'no_owner'
@@ -214,7 +218,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
                     last_owner = read_owner
                     time_start = time.time()
                     no_display = (verbosity == 0)
-                if not no_display:
+                if not no_display and nb_wait > 0:
                     if read_owner == 'failure':
                         msg = 'unknown process'
                     else:
@@ -225,6 +229,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
                         tmp_dir)
                     if verbosity <= 1:
                         no_display = True
+                nb_wait += 1
                 time.sleep(random.uniform(min_wait, max_wait))
             try:
...
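The lock change above throttles the "waiting for existing lock" message: nothing is printed on the very first wait (`nb_wait > 0`), and at the default verbosity the message is shown at most once. A minimal standalone sketch of the same throttling pattern, assuming a caller-supplied `is_locked()` predicate (the function name and the `print` call are illustrative, not Theano's actual logging):

```python
import os
import random
import time

def wait_for_lock(is_locked, min_wait=5, max_wait=10, verbosity=1):
    """Sleep until is_locked() returns False, printing a status message
    at most once, and never on the very first wait."""
    no_display = (verbosity == 0)
    nb_wait = 0  # Number of times we have slept so far.
    while is_locked():
        # Skip the message on the first wait; at verbosity <= 1, also
        # suppress it after it has been shown once.
        if not no_display and nb_wait > 0:
            print('Waiting for existing lock (I am process %d)' % os.getpid())
            if verbosity <= 1:
                no_display = True
        nb_wait += 1
        time.sleep(random.uniform(min_wait, max_wait))
```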
@@ -1198,7 +1198,11 @@ class GpuCAReduce(GpuOp):
                        n_threads.z += 1;
                    else
                        break;
-                }""" % locals()
+                }
+                // Maximum for Fermi GPUs in that dimension.
+                n_threads.z = std::min(n_threads.z, (unsigned)64);
+                """ % locals()
         if len(self.reduce_mask) == 2:
             threads_y = ''
@@ -1509,6 +1513,8 @@ class GpuCAReduce(GpuOp):
                 n_threads.z += 1;
             }
             n_threads.z -= 1;
+            // Maximum for Fermi GPUs in that dimension.
+            n_threads.z = std::min(n_threads.z, (unsigned)64);
             dim3 n_blocks(1,1,1);
             %(makecall)s
...
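Both GpuCAReduce hunks (and the matching GpuCAReduceCuda hunks further down) cap the computed z thread count, because CUDA allows at most 64 threads in the z dimension of a block, while x and y may go up to 1024 on Fermi-class GPUs. A simplified Python rendering of the sizing logic, with an illustrative per-block budget (the real computation is the generated CUDA code above):

```python
def pick_n_threads_z(dim_z, n_threads_xy=1, max_threads_per_block=1024):
    """Grow the z thread count to cover dim_z without exceeding the
    per-block thread budget, then apply CUDA's hard limit of 64 in z."""
    n_threads_z = 1
    while n_threads_z < dim_z:
        if (n_threads_z + 1) * n_threads_xy <= max_threads_per_block:
            n_threads_z += 1
        else:
            break
    # blockDim.z may not exceed 64 on any CUDA device.
    return min(n_threads_z, 64)

# The new test shapes below use 65, the smallest extent that would have
# produced an invalid launch (blockDim.z == 65) before the clamp.
assert pick_n_threads_z(65) == 64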
@@ -109,11 +109,13 @@ def test_careduce():
             ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
             #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
             ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
+            ((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
             ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
             ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
             ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
             ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
+            ((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
             ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
...
@@ -1281,7 +1281,10 @@ class GpuCAReduceCuda(HideC, CAReduce):
                        n_threads.z += 1;
                    else
                        break;
-                }""" % locals()
+                }
+                // Maximum for Fermi GPUs in that dimension.
+                n_threads.z = std::min(n_threads.z, (unsigned)64);
+                """ % locals()
         if len(self.reduce_mask) == 2:
             threads_y = ''
@@ -1601,6 +1604,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
                 n_threads.z += 1;
             }
             n_threads.z -= 1;
+            // Maximum for Fermi GPUs in that dimension.
+            n_threads.z = std::min(n_threads.z, (unsigned)64);
             dim3 n_blocks(1,1,1);
             %(makecall)s
...
-import unittest
 from theano import scalar, gof
-from theano.gof import FunctionGraph
 from theano.gof.python25 import all, any
-from theano.tests.unittest_tools import SkipTest
 from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
                                                test_CAReduce)
@@ -126,11 +122,13 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
             ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
             #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
             ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
+            ((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111
             ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
             ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
             ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
             ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
+            ((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
             ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
             #test pattern implemented by reshape
...
@@ -26,7 +26,9 @@ if cuda_available:
     from theano.sandbox.cuda import (CudaNdarrayType,
                                      float32_shared_constructor)
 def matVecModM(A, s, m):
+    assert A.dtype == 'int64'
     return numpy.int32(numpy.sum((A*s) % m, 1) % m)
@@ -53,24 +55,30 @@ MASK2 = numpy.int32(65535) #2^16 - 1
 MULT2 = numpy.int32(21069)
 NORM = 4.656612873077392578125e-10; #1./2^31
-A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]])
-A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]])
+#A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]],
+#                     dtype='int64')
+#A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]],
+#                     dtype='int64')
 A1p72 = numpy.asarray([[1516919229, 758510237, 499121365],
                        [1884998244, 1516919229, 335398200],
-                       [601897748, 1884998244, 358115744]])
+                       [601897748, 1884998244, 358115744]],
+                      dtype='int64')
 A2p72 = numpy.asarray([[1228857673, 1496414766, 954677935],
                        [1133297478, 1407477216, 1496414766],
-                       [2002613992, 1639496704, 1407477216]])
+                       [2002613992, 1639496704, 1407477216]],
+                      dtype='int64')
 A1p134 = numpy.asarray(
     [[1702500920, 1849582496, 1656874625],
      [828554832, 1702500920, 1512419905],
-     [1143731069, 828554832, 102237247]])
+     [1143731069, 828554832, 102237247]],
+    dtype='int64')
 A2p134 = numpy.asarray(
     [[796789021, 1464208080, 607337906],
      [1241679051, 1431130166, 1464208080],
-     [1401213391, 1178684362, 1431130166]])
+     [1401213391, 1178684362, 1431130166]],
+    dtype='int64')
 np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
...
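The `dtype='int64'` additions and the new assert in `matVecModM` guard against silent integer overflow: the matrix entries are close to 2^31, so the element-wise products in `(A*s) % m` need 64-bit intermediates. A small numpy demonstration of the failure mode (the state vector and modulus here are illustrative):

```python
import numpy

m = numpy.int64(2147483647)  # 2^31 - 1, an MRG-style modulus
A = numpy.asarray([[1516919229, 758510237, 499121365]], dtype='int64')
s = numpy.asarray([1234567890, 987654321, 192837465], dtype='int64')

# Correct: products of ~2^31-sized values stay exact in int64.
exact = numpy.sum((A * s) % m, 1) % m

# Broken: the same computation in int32 wraps around on A * s,
# which is exactly what the new assert in matVecModM rules out.
wrapped = numpy.sum((A.astype('int32') * s.astype('int32')) % m, 1) % m

print(exact, wrapped)  # the int32 result is corrupted by overflow
```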
@@ -164,7 +164,8 @@ class TensorType(Type):
                             " Theano C code does not support that.",
                             msg,
                             "object shape", data.shape,
-                            "object strides", data.strides)
+                            "object strides", data.strides,
+                            "object dtype", data.dtype)
         i = 0
         for b in self.broadcastable:
...
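The last hunk only enriches the diagnostic raised by `TensorType.filter`: the offending ndarray's dtype is now reported alongside its shape and strides. A minimal sketch of the pattern under an assumed alignment check (`filter_aligned` is hypothetical, not Theano's actual method):

```python
import numpy

def filter_aligned(data, msg=''):
    # Hypothetical stand-in for the check in TensorType.filter: include
    # shape, strides and (as of this commit) dtype in the TypeError, so
    # the bad array can be identified from the message alone.
    if not data.flags.aligned:
        raise TypeError("The numpy.ndarray object is not aligned."
                        " Theano C code does not support that.",
                        msg,
                        "object shape", data.shape,
                        "object strides", data.strides,
                        "object dtype", data.dtype)
    return data
```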