Commit 394e8cf0, authored by abergeron

Merge pull request #1794 from nouiz/recursion_limit

Recursion limit
......@@ -20,7 +20,9 @@ since 2007. But it is also approachable enough to be used in the classroom
News
====
* Theano 0.6rc3 was released. Everybody is encouraged to update.
* Ian Goodfellow did a `12h class with exercises on Theano <https://github.com/goodfeli/theano_exercises>`_.
* Theano 0.6 was released. Everybody is encouraged to update.
* New technical report on Theano: `Theano: new features and speed improvements <http://arxiv.org/abs/1211.5590>`_.
However, please keep citing the other paper below in scientific work involving Theano.
......
......@@ -2974,7 +2974,40 @@ class Composite(ScalarOp):
# We need to clone the graph as sometimes its nodes already
# contain a reference to an fgraph. As we want the Composite
# to be pickable, we can't have reference to fgraph.
inputs, outputs = gof.graph.clone(inputs, outputs)
# Also, if there is Composite in the inner graph, we want to
# remove them. In that case, we do a more complicated clone
# that will flatten Composite. We don't need to do this
# recusively, as the way the fusion optimizer work, we have
# only 1 new Composite each time at the output.
if len(outputs) > 1 or not any([isinstance(var.owner.op, Composite)
for var in outputs]):
# No inner Composite
inputs, outputs = gof.graph.clone(inputs, outputs)
else:
# Inner Composite that we need to flatten
assert len(outputs) == 1
# 1. Create a new graph from inputs up to the
# Composite
res = theano.compile.rebuild_collect_shared(
inputs=inputs,
outputs=outputs[0].owner.inputs,
copy_inputs_over=False) # Clone also the inputs
# 2. We continue this partial clone with the graph in
# the inner Composite
res2 = theano.compile.rebuild_collect_shared(
inputs=outputs[0].owner.op.inputs,
outputs=outputs[0].owner.op.outputs,
replace=dict(zip(outputs[0].owner.op.inputs, res[1]))
)
assert len(res2[1]) == len(outputs)
assert len(res[0]) == len(inputs)
assert res[0] != inputs
inputs, outputs = res[0], res2[1]
# Next assert comment just for speed
#assert not any([isinstance(node.op, Composite) for node in
# theano.gof.graph.ops(inputs, outputs)])
self.inputs = copy(inputs)
self.outputs = copy(outputs)
self.inputs_type = tuple([input.type for input in inputs])
......
......@@ -68,19 +68,17 @@ class test_composite(unittest.TestCase):
fn = gof.DualLinker().accept(g).make_function()
assert fn(1.0, 2.0) == 1.5
# def test_sin(self):
# x = inputs()
# e = sin(x)
# C = Composite([x], [e])
# c = C.make_node(x)
# # print c.c_code(['x'], ['z'], dict(id = 0))
# g = FunctionGraph([x], [c.out])
# fn = gof.DualLinker().accept(g).make_function()
# assert fn(0) == 0
# assert fn(3.14159265358/2) == 1
# assert fn(3.14159265358) == 0
# WRITEME: Test for sin, pow, and other scalar ops.
def test_flatten(self):
    """Check that building a Composite whose single output is produced
    by another Composite flattens the inner Composite into the outer
    one, and that the multi-output case is deliberately left alone."""
    x, y, z = inputs()
    C = Composite([x, y], [x + y])
    # Single-output case: the inner Composite C must be flattened away,
    # so the output's owner op is no longer a Composite.
    CC = Composite([x, y], [C(x * y, y)])
    assert not isinstance(CC.outputs[0].owner.op, Composite)

    # Multi-output case: flattening is not implemented for more than
    # one output, so the inner Composite is kept as-is.
    CC = Composite([x, y, z], [C(x * y, y), C(x * z, y)])
    assert isinstance(CC.outputs[0].owner.op, Composite)
def test_with_constants(self):
x, y, z = inputs()
......
......@@ -508,6 +508,12 @@ class EmptyConstantError(NotScalarConstantError):
"""
get_scalar_constant_value_elemwises = (
scal.Cast, scal.Switch,
scal.NEQ, scal.EQ,
scal.LT, scal.GT, scal.LE, scal.GE,
scal.Sub, scal.Add, scal.Mod, scal.Mul,
scal.IntDiv, scal.TrueDiv)
def get_scalar_constant_value(v):
"""return the constant scalar(0-D) value underlying variable `v`
......@@ -562,7 +568,7 @@ def get_scalar_constant_value(v):
compile.ops.OutputGuard,
compile.DeepCopyOp)):
return get_scalar_constant_value(v.owner.inputs[0])
if (isinstance(v.owner.op, theano.compile.ops.Shape_i) and
elif (isinstance(v.owner.op, theano.compile.ops.Shape_i) and
isinstance(v.owner.inputs[0], Constant)):
return v.owner.inputs[0].data.shape[v.owner.op.i]
# Don't act as the constant_folding optimization here as this
......@@ -570,26 +576,29 @@ def get_scalar_constant_value(v):
# mess with the stabilization optimization and be too slow.
# We put all the scalar Ops used by get_canonical_form_slice()
# to allow it to determine the broadcast pattern correctly.
if ((isinstance(v.owner.op, Elemwise) and
isinstance(v.owner.op.scalar_op, scal.Second)) or
isinstance(v.owner.op, scal.Second)):
# We don't need both input to be constant for second
shape, val = v.owner.inputs
return get_scalar_constant_value(val)
elemwises = (scal.Cast, scal.Switch,
scal.NEQ, scal.EQ,
scal.LT, scal.GT, scal.LE, scal.GE,
scal.Sub, scal.Add, scal.Mod, scal.Mul,
scal.IntDiv, scal.TrueDiv)
if (isinstance(v.owner.op, Elemwise) and
len(v.owner.outputs) == 1 and
(isinstance(v.owner.op.scalar_op, elemwises) or
isinstance(v.owner.op, elemwises))):
const = [get_scalar_constant_value(i) for i in v.owner.inputs]
ret = [[None]]
v.owner.op.perform(v.owner, const, ret)
return ret[0][0]
if isinstance(v.owner.op, theano.tensor.subtensor.Subtensor) and v.ndim == 0:
elif isinstance(v.owner.op, scal.ScalarOp):
if isinstance(v.owner.op, scal.Second):
# We don't need both input to be constant for second
shape, val = v.owner.inputs
return get_scalar_constant_value(val)
if isinstance(v.owner.op, get_scalar_constant_value_elemwises):
const = [get_scalar_constant_value(i)
for i in v.owner.inputs]
ret = [[None]]
v.owner.op.perform(v.owner, const, ret)
return ret[0][0]
elif isinstance(v.owner.op, Elemwise):
if isinstance(v.owner.op.scalar_op, scal.Second):
# We don't need both input to be constant for second
shape, val = v.owner.inputs
return get_scalar_constant_value(val)
elif isinstance(v.owner.op.scalar_op,
get_scalar_constant_value_elemwises):
const = [get_scalar_constant_value(i) for i in v.owner.inputs]
ret = [[None]]
v.owner.op.perform(v.owner, const, ret)
return ret[0][0]
elif isinstance(v.owner.op, theano.tensor.subtensor.Subtensor) and v.ndim == 0:
if isinstance(v.owner.inputs[0], TensorConstant):
cdata = tuple(v.owner.op.get_constant_idx(v.owner.inputs))
try:
......@@ -626,7 +635,7 @@ def get_scalar_constant_value(v):
# join can cast implicitly its input in some case.
return theano._asarray(ret, dtype=v.type.dtype)
if (v.owner.inputs[0].owner and
elif (v.owner.inputs[0].owner and
isinstance(v.owner.inputs[0].owner.op,
theano.tensor.opt.MakeVector) and
# MakeVector normally accept only scalar as input.
......
......@@ -774,8 +774,7 @@ class Elemwise(OpenMPOp):
super(Elemwise, self).perform(node, inputs, output_storage)
maxsize = max(len(input.shape) for input in inputs)
for dims in izip(*[([(1, True)] * (maxsize - len(input.shape))
+ zip(input.shape, sinput.type.broadcastable))
for dims in izip(*[zip(input.shape, sinput.type.broadcastable)
for input, sinput in zip(inputs, node.inputs)]):
if max(d for d, b in dims) != 1 and (1, False) in dims:
# yes there may be more compact ways to write this code,
......@@ -808,34 +807,36 @@ class Elemwise(OpenMPOp):
out_shape.append(max(values))
out_shape = tuple(out_shape)
if not self.inplace_pattern:
for output, storage in izip(node.outputs, output_storage):
odat = storage[0]
if odat is not None:
if odat.shape != out_shape:
# It is unsafe to try to resize odat,
# we have to allocate output storage.
odat = None
if odat is None:
odat = numpy.ndarray(out_shape, dtype=output.type.dtype)
storage[0] = odat
else:
for i, (output, storage) in enumerate(
izip(node.outputs, output_storage)):
#i is an output idx
if i in self.inplace_pattern:
odat = inputs[self.inplace_pattern[i]]
else:
odat = storage[0]
if odat is not None:
if odat.shape != out_shape:
# It is unsafe to try to resize odat,
# we have to allocate output storage.
odat = None
if odat is None:
odat = numpy.ndarray(out_shape,
dtype=output.type.dtype)
storage[0] = odat
# Commented as we don't reuse outputs now.
#
# if not self.inplace_pattern:
# for output, storage in izip(node.outputs, output_storage):
# odat = storage[0]
# if odat is not None:
# if odat.shape != out_shape:
# # It is unsafe to try to resize odat,
# # we have to allocate output storage.
# odat = None
# if odat is None:
# odat = numpy.ndarray(out_shape, dtype=output.type.dtype)
# storage[0] = odat
# else:
# for i, (output, storage) in enumerate(
# izip(node.outputs, output_storage)):
# #i is an output idx
# if i in self.inplace_pattern:
# odat = inputs[self.inplace_pattern[i]]
# else:
# odat = storage[0]
# if odat is not None:
# if odat.shape != out_shape:
# # It is unsafe to try to resize odat,
# # we have to allocate output storage.
# odat = None
# if odat is None:
# odat = numpy.ndarray(out_shape,
# dtype=output.type.dtype)
# storage[0] = odat
ufunc_args = inputs # + output_storage
if self.nfunc and len(inputs) == self.nfunc_spec[1]:
......@@ -860,26 +861,25 @@ class Elemwise(OpenMPOp):
if nout == 1:
variables = [variables]
i = 0
for variable, storage, nout in izip(variables, output_storage,
node.outputs):
if str(getattr(variable, "dtype", "")) == 'object':
if getattr(variable, "dtype", "") == 'object':
# Since numpy 1.6, function created with numpy.frompyfunc
# always return an ndarray with dtype object
variable = numpy.asarray(variable, dtype=nout.dtype)
# The storage has been resized earlier.
if hasattr(variable, 'shape'):
assert storage[0].shape == variable.shape
if i in self.inplace_pattern:
odat = inputs[self.inplace_pattern[i]]
odat[...] = variable
storage[0] = odat
# Sometimes NumPy return a Python type.
elif not isinstance(variable, numpy.ndarray):
variable = numpy.asarray(variable, nout.dtype)
storage[0] = variable
else:
# If variable has not shape, then it is a scalar.
assert numpy.prod(storage[0].shape) == 1
storage[0][...] = variable
assert str(storage[0].dtype) != 'object'
# the following should be used instead of the previous loop,
# unfortunately it tends to segfault
# self.ufunc(*(ufunc_args+[s[0] for s in output_storage]))
storage[0] = variable
i += 1
def infer_shape(self, node, i_shapes):
rval = []
......
......@@ -4888,11 +4888,40 @@ class FusionOptimizer(Optimizer):
print >> stream, blanc, " time_toposort", prof[7]
def local_add_mul_fusion(node):
    """Fuse consecutive add or mul in one such node with more inputs.

    It is better to fuse add/mul that way than in a Composite node as
    this makes the inner graph of the Composite smaller. This allows
    putting more computation in a Composite before hitting the max
    recursion limit when pickling Composite.

    :param node: an Apply node to (maybe) rewrite.
    :returns: False when the node is not an elemwise Add/Mul; a
        one-element list with the replacement variable when a fusable
        input was found; implicitly None when no input matched.
    """
    # Only elemwise Add or Mul nodes are candidates for this fusion.
    if (not isinstance(node.op, Elemwise) or
        not isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul))):
        return False

    s_op = node.op.scalar_op.__class__
    for inp in node.inputs:
        # Look for an input produced by the same scalar op (add into
        # add, mul into mul) and splice its inputs directly into this
        # node. Only the first match is merged per call; the
        # FusionOptimizer re-applies this rewrite until fixed point.
        if (inp.owner and
            isinstance(inp.owner.op, Elemwise) and
            isinstance(inp.owner.op.scalar_op, s_op)):
            l = list(node.inputs)
            l.remove(inp)
            return [node.op(*(l + inp.owner.inputs))]
if config.tensor.local_elemwise_fusion:
_logger.debug("enabling optimization fusion elemwise in fast_run")
#Must be after gpu(48.5) and before AddDestroyHandler(49.5)
fuse_seqopt = gof.SequenceDB()
fuse_seqopt.register('local_add_mul_fusion',
FusionOptimizer(local_add_mul_fusion),
0, 'fast_run', 'fusion')
fuse_seqopt.register('composite_elemwise_fusion',
FusionOptimizer(local_elemwise_fusion),
1, 'fast_run', 'fusion')
compile.optdb.register('elemwise_fusion',
FusionOptimizer(local_elemwise_fusion), 49,
fuse_seqopt, 49,
'fast_run', 'fusion', 'local_elemwise_fusion',
'FusionOptimizer')
else:
......
......@@ -1207,6 +1207,36 @@ class test_fusion(unittest.TestCase):
# Test it on some dummy values
f(*[range(i, 4 + i) for i in range(35)])
def test_pickle_big_fusion(self):
    """In the past, pickling the Composite generated in this case
    crashed with the max recursion limit, so we were not able to
    generate C code for it.

    Builds a large sum of per-element Gaussian log-density terms so
    that elemwise fusion produces one very big Composite, then checks
    that the gradient function can be compiled and run.
    """
    factors = []
    sd = tensor.dscalar()
    means = tensor.dvector()

    cst_05 = theano.tensor.constant(.5)
    cst_m05 = theano.tensor.constant(-.5)
    cst_2 = theano.tensor.constant(2)
    cst_m2 = theano.tensor.constant(-2)
    ones = theano.tensor.constant(numpy.ones(10))
    # Use fewer terms under DebugMode, which is much slower to compile.
    n = 85
    if theano.config.mode in ["DebugMode", "DEBUG_MODE"]:
        n = 10

    for i in range(n):
        f = (cst_m05 * sd ** cst_m2 * (ones - means[i]) ** cst_2 +
             cst_05 * tensor.log(cst_05 * (sd ** cst_m2) / numpy.pi))
        factors.append(tensor.sum(f))

    logp = tensor.add(*factors)

    vars = [sd, means]
    # Compiling the gradient triggers the big fused Composite; this
    # would previously fail at pickling/C-code-generation time.
    dlogp = function(vars, [theano.grad(logp, v) for v in vars])
    dlogp(2, numpy.random.rand(n))
def speed_fusion(self, shared_fn=shared, gpu=False, s=None):
"""
param type s: a slice object
......@@ -1676,8 +1706,8 @@ class test_local_subtensor_lift(unittest.TestCase):
f = function([x, y, z], tensor.exp(x + y + z)[0], mode=mode_opt)
prog = f.maker.fgraph.toposort()
assert isinstance(prog[1].op, tensor.DimShuffle)
assert isinstance(prog[0].op, tensor.Subtensor) # first subtensor
assert isinstance(prog[0].op, tensor.DimShuffle)
assert isinstance(prog[1].op, tensor.Subtensor) # first subtensor
assert isinstance(prog[2].op, tensor.Subtensor) # first subtensor
assert isinstance(prog[3].op.scalar_op, theano.scalar.
Composite) # Composite{add,add}
......@@ -1693,8 +1723,8 @@ class test_local_subtensor_lift(unittest.TestCase):
f = function([x, y, z], tensor.exp(x + y + z)[0:2], mode=mode_opt)
prog = f.maker.fgraph.toposort()
assert isinstance(prog[1].op, tensor.DimShuffle)
assert isinstance(prog[0].op, tensor.Subtensor) # first subtensor
assert isinstance(prog[0].op, tensor.DimShuffle)
assert isinstance(prog[1].op, tensor.Subtensor) # first subtensor
assert isinstance(prog[2].op, tensor.Subtensor) # first subtensor
assert isinstance(prog[3].op.scalar_op, theano.scalar.
Composite) # Composite{add,add}
......@@ -3402,7 +3432,7 @@ class T_local_erfc(unittest.TestCase):
assert len(f.maker.fgraph.apply_nodes) == 1, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert len(f.maker.fgraph.toposort()[0].fgraph.toposort()[
0].op.scalar_op.fgraph.apply_nodes)==2,len(f.maker.fgraph.toposort()[0].fgraph.toposort()[0].op.scalar_op.fgraph.apply_nodes)
0].op.scalar_op.fgraph.apply_nodes)==22,len(f.maker.fgraph.toposort()[0].fgraph.toposort()[0].op.scalar_op.fgraph.apply_nodes)
#TODO: fix this problem
if theano.config.floatX=="float32" and theano.config.mode in ["DebugMode", "DEBUG_MODE"]:
raise KnownFailureTest(
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment