Commit b3ebee2a authored by Arnaud Bergeron

Merge.

......@@ -77,3 +77,7 @@ AddConfigVar('traceback.limit',
AddConfigVar('warn.argmax_pushdown_bug',
"Warn if in past version of Theano we generated a bug with the optimisation theano.tensor.nnet.nnet.local_argmax_pushdown optimization. Was fixed 27 may 2010",
BoolParam(True))
AddConfigVar('warn.gpusum_01_011_0111_bug',
"Warn if we are in a case where old version of Theano had a silent bug with GpuSum pattern 01,011 and 0111 when the first dimensions was bigger then 4096. Was fixed 31 may 2010",
BoolParam(True))
......@@ -824,6 +824,16 @@ class GpuSum(Op):
threads_z = ''
if len(self.reduce_mask)==3:
threads_z = ''
if config.warn.gpusum_01_011_0111_bug:
pattern = '0'+N_pattern
warn = '''
static bool warn_gpusum_01_011_0111_bug = true;
if(warn_gpusum_01_011_0111_bug && CudaNdarray_HOST_DIMS(%(x)s)[%(N)s]>4096){
printf("WARNING: old version of Theano had a silent bug with GpuSum pattern %(pattern)s when the first dimensions was bigger then 4096. Was fixed 31 may 2010. To disable this warning set the Theano flags warn.gpusum_01_011_0111_bug to False. Won't repeat the warning before we exit.\\n");
warn_gpusum_01_011_0111_bug = false;
}
'''%locals()
else: warn = ""
print >> sio, """
{
int verbose = 0;
......@@ -833,6 +843,7 @@ class GpuSum(Op):
%(threads_y)s
%(threads_z)s
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],NUM_VECTOR_OP_BLOCKS));
%(warn)s
%(makecall)s
}
""" %locals()
......@@ -1037,6 +1048,7 @@ class GpuSum(Op):
""" % locals()
def c_code_reduce_1011(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
int verbose = 0;
......@@ -1044,13 +1056,11 @@ class GpuSum(Op):
std::min(CudaNdarray_HOST_DIMS(%(x)s)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
n_threads.y -= 1;
while (n_threads.x * (n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[2])
n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[2];
while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
n_threads.z -= 1;
while (n_threads.x * n_threads.y * (n_threads.z+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
if (n_threads.z > 64)
n_threads.z = 64;
if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
......@@ -1058,41 +1068,12 @@ class GpuSum(Op):
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
if (verbose) printf("running kernel_reduce_sum_1011_%(name)s\\n");
if (verbose) fprint_CudaNdarray(stdout, %(x)s);
if (verbose) fprint_CudaNdarray(stdout, %(z)s);
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_sum_1011_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[2],
CudaNdarray_HOST_DIMS(%(x)s)[3],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_HOST_STRIDES(%(x)s)[3],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_1011_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
%(makecall)s
}
""" %locals()
def c_code_cache_version(self):
return (14,)
return (17,)
def c_support_code_apply(self, node, nodename):
......@@ -1204,7 +1185,7 @@ class GpuSum(Op):
for_i2 = "for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]')
param_dim = ",".join(["const int d%(i)s"%locals() for i in range(nd_in)])
param_strides = ",".join(["const int sA%(i)s"%locals() for i in range(nd_in)])
decl = self._k_decl(node,nodename)
......@@ -1212,16 +1193,19 @@ class GpuSum(Op):
print >> sio, """
%(decl)s{
%(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
mysum = 0;
%(for_i1)s{
%(for_i2)s{
%(for_i3)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + blockIdx.x * sA0];
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
mysum += Ai;
}
}
}
%(reducebuf)s
}
}
""" %locals()
if self.reduce_mask == (1,0):
# this kernel uses one block for each column,
......
......@@ -311,6 +311,7 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape)
}
int shp_el = PyInt_AsLong(shp_el_obj);
Py_DECREF(shp_el_obj);
if (shp_el <= 0)
{
......@@ -320,7 +321,6 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape)
}
newdims[i] = shp_el;
total_elements *= newdims[i];
}
......@@ -1395,7 +1395,10 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *v)
assert (CudaNdarray_EqualAndIgnore(viewCopyForComparison, rval, 1, 1));
assert (rval->base == baseSavedForComparison);
assert (rval->dev_structure_fresh);
// Clean up locally-created references
Py_DECREF((PyObject*)viewCopyForComparison);
Py_DECREF(rval);
return 0;
}
......
......@@ -38,7 +38,27 @@ def test_sum():
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3])]:
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
#((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010 ##not implemented
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
]:
a = tensor.TensorType('float32',(False,)*len(shape))()
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
......
......@@ -149,6 +149,10 @@ class mrg_uniform_base(Op):
return Apply(self,
[rstate, size],
[rstate.type(), self.output_type()])
def grad(self, inputs, ograd):
    """This Op is not differentiable: return None for every input."""
    return [None] * len(inputs)
def c_code_cache_version(self):
    # Version tag for Theano's C-code compilation cache; bump it whenever
    # the generated C code changes so stale compiled modules are rebuilt.
    return (1,)
......@@ -622,7 +626,12 @@ class MRG_RandomStreams(object):
If the size argument is ambiguous on the number of dimensions,
ndim may be a plain integer to supplement the missing
information.
Currently size cannot be None; otherwise it fails later, hence the assert below.
"""
assert isinstance(size, tuple), "size must be a tuple"
assert all([isinstance(i,int) for i in size])
if nstreams is None:
nstreams = self.n_streams(size)
if self.use_cuda and dtype=='float32':
......@@ -664,6 +673,8 @@ class MRG_RandomStreams(object):
# second half our U2's. See Wikipedia page:
# http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
assert isinstance(size, tuple), "size must be a tuple"
assert all([isinstance(i,int) for i in size])
n_samples = numpy.prod(size)
evened = False
......@@ -710,5 +721,3 @@ def mrg_random_make_inplace(node):
return new_op.make_node(*node.inputs).outputs
return False
optdb.register('random_make_inplace_mrg', opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, 'fast_run', 'inplace')
......@@ -264,7 +264,7 @@ def test_consistency_GPU_parallel():
# We need the sample back in the main memory
cpu_sample = tensor.as_tensor_variable(sample)
f = theano.function([], cpu_sample)
f = theano.function([], cpu_sample, mode=mode)
for k in range(n_samples):
s = f()
......@@ -351,6 +351,10 @@ def test_rng0():
def test_normal0():
if config.mode == 'FAST_COMPILE':
mode = 'FAST_RUN'
else:
mode = config.mode
def basictest(f, steps, target_avg, target_std, prefix=""):
dt = 0.0
avg_std = 0.0
......@@ -414,7 +418,7 @@ def test_normal0():
RR = theano.tensor.shared_randomstreams.RandomStreams(234)
nn = RR.normal(size=sample_size, avg=-5.0, std=2.0)
ff = theano.function([], nn, mode=mode)
ff = theano.function([], nn)
basictest(ff, 50, -5.0, 2.0, prefix='numpy ')
......
......@@ -87,7 +87,7 @@ class T_sigmoid_opts(unittest.TestCase):
class T_softplus_opts(unittest.TestCase):
def setUp(self):
if theano.config.mode == 'FAST_COMPILE':
m = theano.compile.mode.get_mode('FAST_RUN')
m = theano.compile.mode.get_mode('FAST_RUN').excluding('local_elemwise_fusion')
else:
m = theano.compile.mode.get_default_mode().excluding('local_elemwise_fusion')
self.m = m
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment