提交 b3ebee2a 作者: Arnaud Bergeron

Merge.

...@@ -77,3 +77,7 @@ AddConfigVar('traceback.limit', ...@@ -77,3 +77,7 @@ AddConfigVar('traceback.limit',
AddConfigVar('warn.argmax_pushdown_bug', AddConfigVar('warn.argmax_pushdown_bug',
"Warn if in past version of Theano we generated a bug with the optimisation theano.tensor.nnet.nnet.local_argmax_pushdown optimization. Was fixed 27 may 2010", "Warn if in past version of Theano we generated a bug with the optimisation theano.tensor.nnet.nnet.local_argmax_pushdown optimization. Was fixed 27 may 2010",
BoolParam(True)) BoolParam(True))
AddConfigVar('warn.gpusum_01_011_0111_bug',
"Warn if we are in a case where old version of Theano had a silent bug with GpuSum pattern 01,011 and 0111 when the first dimensions was bigger then 4096. Was fixed 31 may 2010",
BoolParam(True))
...@@ -824,6 +824,16 @@ class GpuSum(Op): ...@@ -824,6 +824,16 @@ class GpuSum(Op):
threads_z = '' threads_z = ''
if len(self.reduce_mask)==3: if len(self.reduce_mask)==3:
threads_z = '' threads_z = ''
if config.warn.gpusum_01_011_0111_bug:
pattern = '0'+N_pattern
warn = '''
static bool warn_gpusum_01_011_0111_bug = true;
if(warn_gpusum_01_011_0111_bug && CudaNdarray_HOST_DIMS(%(x)s)[%(N)s]>4096){
printf("WARNING: old version of Theano had a silent bug with GpuSum pattern %(pattern)s when the first dimensions was bigger then 4096. Was fixed 31 may 2010. To disable this warning set the Theano flags warn.gpusum_01_011_0111_bug to False. Won't repeat the warning before we exit.\\n");
warn_gpusum_01_011_0111_bug = false;
}
'''%locals()
else: warn = ""
print >> sio, """ print >> sio, """
{ {
int verbose = 0; int verbose = 0;
...@@ -833,6 +843,7 @@ class GpuSum(Op): ...@@ -833,6 +843,7 @@ class GpuSum(Op):
%(threads_y)s %(threads_y)s
%(threads_z)s %(threads_z)s
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],NUM_VECTOR_OP_BLOCKS)); dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],NUM_VECTOR_OP_BLOCKS));
%(warn)s
%(makecall)s %(makecall)s
} }
""" %locals() """ %locals()
...@@ -1037,6 +1048,7 @@ class GpuSum(Op): ...@@ -1037,6 +1048,7 @@ class GpuSum(Op):
""" % locals() """ % locals()
def c_code_reduce_1011(self, sio, node, name, x, z, fail): def c_code_reduce_1011(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """ print >> sio, """
{ {
int verbose = 0; int verbose = 0;
...@@ -1044,13 +1056,11 @@ class GpuSum(Op): ...@@ -1044,13 +1056,11 @@ class GpuSum(Op):
std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], std::min(CudaNdarray_HOST_DIMS(%(x)s)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y; while (n_threads.x * (n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
n_threads.y -= 1;
if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[2]) if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[2])
n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[2]; n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[2];
while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z; while (n_threads.x * n_threads.y * (n_threads.z+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
n_threads.z -= 1;
if (n_threads.z > 64) if (n_threads.z > 64)
n_threads.z = 64; n_threads.z = 64;
if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
...@@ -1058,41 +1068,12 @@ class GpuSum(Op): ...@@ -1058,41 +1068,12 @@ class GpuSum(Op):
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
if (verbose) printf("running kernel_reduce_sum_1011_%(name)s\\n"); %(makecall)s
if (verbose) fprint_CudaNdarray(stdout, %(x)s);
if (verbose) fprint_CudaNdarray(stdout, %(z)s);
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_sum_1011_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[2],
CudaNdarray_HOST_DIMS(%(x)s)[3],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_HOST_STRIDES(%(x)s)[3],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_1011_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
} }
""" %locals() """ %locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (14,) return (17,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
...@@ -1204,7 +1185,7 @@ class GpuSum(Op): ...@@ -1204,7 +1185,7 @@ class GpuSum(Op):
for_i2 = "for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)" for_i2 = "for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)" for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]') reducebuf = self._k_reduce_buf('Z[i0 * sZ0]')
param_dim = ",".join(["const int d%(i)s"%locals() for i in range(nd_in)]) param_dim = ",".join(["const int d%(i)s"%locals() for i in range(nd_in)])
param_strides = ",".join(["const int sA%(i)s"%locals() for i in range(nd_in)]) param_strides = ",".join(["const int sA%(i)s"%locals() for i in range(nd_in)])
decl = self._k_decl(node,nodename) decl = self._k_decl(node,nodename)
...@@ -1212,16 +1193,19 @@ class GpuSum(Op): ...@@ -1212,16 +1193,19 @@ class GpuSum(Op):
print >> sio, """ print >> sio, """
%(decl)s{ %(decl)s{
%(init)s %(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
mysum = 0;
%(for_i1)s{ %(for_i1)s{
%(for_i2)s{ %(for_i2)s{
%(for_i3)s{ %(for_i3)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + blockIdx.x * sA0]; float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
mysum += Ai; mysum += Ai;
} }
} }
} }
%(reducebuf)s %(reducebuf)s
} }
}
""" %locals() """ %locals()
if self.reduce_mask == (1,0): if self.reduce_mask == (1,0):
# this kernel uses one block for each column, # this kernel uses one block for each column,
......
...@@ -311,6 +311,7 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape) ...@@ -311,6 +311,7 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape)
} }
int shp_el = PyInt_AsLong(shp_el_obj); int shp_el = PyInt_AsLong(shp_el_obj);
Py_DECREF(shp_el_obj);
if (shp_el <= 0) if (shp_el <= 0)
{ {
...@@ -320,7 +321,6 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape) ...@@ -320,7 +321,6 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape)
} }
newdims[i] = shp_el; newdims[i] = shp_el;
total_elements *= newdims[i]; total_elements *= newdims[i];
} }
...@@ -1395,7 +1395,10 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *v) ...@@ -1395,7 +1395,10 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *v)
assert (CudaNdarray_EqualAndIgnore(viewCopyForComparison, rval, 1, 1)); assert (CudaNdarray_EqualAndIgnore(viewCopyForComparison, rval, 1, 1));
assert (rval->base == baseSavedForComparison); assert (rval->base == baseSavedForComparison);
assert (rval->dev_structure_fresh); assert (rval->dev_structure_fresh);
// Clean up locally-created references
Py_DECREF((PyObject*)viewCopyForComparison); Py_DECREF((PyObject*)viewCopyForComparison);
Py_DECREF(rval);
return 0; return 0;
} }
......
...@@ -38,7 +38,27 @@ def test_sum(): ...@@ -38,7 +38,27 @@ def test_sum():
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test. ((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]), ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]), ((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3])]: ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
#((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010 ##not implemented
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
]:
a = tensor.TensorType('float32',(False,)*len(shape))() a = tensor.TensorType('float32',(False,)*len(shape))()
b = T.Sum(pattern)(a) b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
......
...@@ -149,6 +149,10 @@ class mrg_uniform_base(Op): ...@@ -149,6 +149,10 @@ class mrg_uniform_base(Op):
return Apply(self, return Apply(self,
[rstate, size], [rstate, size],
[rstate.type(), self.output_type()]) [rstate.type(), self.output_type()])
def grad(self,inputs,ograd):
return [None for i in inputs]
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (1,)
...@@ -622,7 +626,12 @@ class MRG_RandomStreams(object): ...@@ -622,7 +626,12 @@ class MRG_RandomStreams(object):
If the size argument is ambiguous on the number of dimensions, If the size argument is ambiguous on the number of dimensions,
ndim may be a plain integer to supplement the missing ndim may be a plain integer to supplement the missing
information. information.
Currently size can't be None, otherwise it fails later; hence the asserts below.
""" """
assert isinstance(size, tuple), "size must be a tuple"
assert all([isinstance(i,int) for i in size])
if nstreams is None: if nstreams is None:
nstreams = self.n_streams(size) nstreams = self.n_streams(size)
if self.use_cuda and dtype=='float32': if self.use_cuda and dtype=='float32':
...@@ -664,6 +673,8 @@ class MRG_RandomStreams(object): ...@@ -664,6 +673,8 @@ class MRG_RandomStreams(object):
# second half our U2's. See Wikipedia page: # second half our U2's. See Wikipedia page:
# http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform # http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
assert isinstance(size, tuple), "size must be a tuple"
assert all([isinstance(i,int) for i in size])
n_samples = numpy.prod(size) n_samples = numpy.prod(size)
evened = False evened = False
...@@ -710,5 +721,3 @@ def mrg_random_make_inplace(node): ...@@ -710,5 +721,3 @@ def mrg_random_make_inplace(node):
return new_op.make_node(*node.inputs).outputs return new_op.make_node(*node.inputs).outputs
return False return False
optdb.register('random_make_inplace_mrg', opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, 'fast_run', 'inplace') optdb.register('random_make_inplace_mrg', opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, 'fast_run', 'inplace')
...@@ -264,7 +264,7 @@ def test_consistency_GPU_parallel(): ...@@ -264,7 +264,7 @@ def test_consistency_GPU_parallel():
# We need the sample back in the main memory # We need the sample back in the main memory
cpu_sample = tensor.as_tensor_variable(sample) cpu_sample = tensor.as_tensor_variable(sample)
f = theano.function([], cpu_sample) f = theano.function([], cpu_sample, mode=mode)
for k in range(n_samples): for k in range(n_samples):
s = f() s = f()
...@@ -351,6 +351,10 @@ def test_rng0(): ...@@ -351,6 +351,10 @@ def test_rng0():
def test_normal0(): def test_normal0():
if config.mode == 'FAST_COMPILE':
mode = 'FAST_RUN'
else:
mode = config.mode
def basictest(f, steps, target_avg, target_std, prefix=""): def basictest(f, steps, target_avg, target_std, prefix=""):
dt = 0.0 dt = 0.0
avg_std = 0.0 avg_std = 0.0
...@@ -414,7 +418,7 @@ def test_normal0(): ...@@ -414,7 +418,7 @@ def test_normal0():
RR = theano.tensor.shared_randomstreams.RandomStreams(234) RR = theano.tensor.shared_randomstreams.RandomStreams(234)
nn = RR.normal(size=sample_size, avg=-5.0, std=2.0) nn = RR.normal(size=sample_size, avg=-5.0, std=2.0)
ff = theano.function([], nn, mode=mode) ff = theano.function([], nn)
basictest(ff, 50, -5.0, 2.0, prefix='numpy ') basictest(ff, 50, -5.0, 2.0, prefix='numpy ')
......
...@@ -87,7 +87,7 @@ class T_sigmoid_opts(unittest.TestCase): ...@@ -87,7 +87,7 @@ class T_sigmoid_opts(unittest.TestCase):
class T_softplus_opts(unittest.TestCase): class T_softplus_opts(unittest.TestCase):
def setUp(self): def setUp(self):
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
m = theano.compile.mode.get_mode('FAST_RUN') m = theano.compile.mode.get_mode('FAST_RUN').excluding('local_elemwise_fusion')
else: else:
m = theano.compile.mode.get_default_mode().excluding('local_elemwise_fusion') m = theano.compile.mode.get_default_mode().excluding('local_elemwise_fusion')
self.m = m self.m = m
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论