Commit 42d2026e authored by lamblin

Merge pull request #1138 from pascanur/bugNicolas

Fix two bugs reported by Nicolas
@@ -52,10 +52,10 @@ class Optimizer(object):
def apply(self, fgraph):
"""WRITEME
-Applies the optimization to the provided L{FunctionGraph}. It may use all
-the methods defined by the L{FunctionGraph}. If the L{Optimizer} needs
-to use a certain tool, such as an L{InstanceFinder}, it can do
-so in its L{add_requirements} method.
+Applies the optimization to the provided L{FunctionGraph}. It may
+use all the methods defined by the L{FunctionGraph}. If the
+L{Optimizer} needs to use a certain tool, such as an
+L{InstanceFinder}, it can do so in its L{add_requirements} method.
"""
pass
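For readers new to this API, here is a minimal sketch of the interface the docstring describes; `NoopOptimizer` is a hypothetical toy subclass for illustration, not an optimizer shipped with Theano.

```python
# A minimal sketch of the Optimizer interface documented above.
# NoopOptimizer is a made-up example class, not part of Theano.
from theano.gof.opt import Optimizer

class NoopOptimizer(Optimizer):
    def add_requirements(self, fgraph):
        # Request any tools apply() depends on, e.g. an InstanceFinder.
        pass

    def apply(self, fgraph):
        # Inspect or rewrite fgraph's nodes here; this toy does nothing.
        pass
```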
@@ -208,7 +208,6 @@ class SeqOptimizer(Optimizer, list):
nb_node_after, sub_profs) = prof
blanc = (' ' * level)
print >> stream, blanc, "SeqOptimizer",
if hasattr(opts, "name"):
print >> stream, blanc, opts.name,
@@ -217,7 +216,8 @@ class SeqOptimizer(Optimizer, list):
print >> stream, (" time %.3fs for %d/%d nodes"
" before/after optimization" % (
sum(prof), nb_node_before, nb_node_after))
print >> stream, blanc, " %.3fs for fgraph.validate()" % (validate_time)
print >> stream, \
blanc, " %.3fs for fgraph.validate()" % (validate_time)
if level == 0:
print >> stream, blanc, " time - (name, class, index)"
ll = []
@@ -289,7 +289,8 @@ class SeqOptimizer(Optimizer, list):
p = prof2
new_t[idx] += p[1][p[0].index(l)]
if hasattr(l, 'merge_profile'):
-assert len(p[5][p[0].index(l)]) == len(new_sub_profile[idx])
+assert len(p[5][p[0].index(l)]) == \
+    len(new_sub_profile[idx])
new_sub_profile[idx] = l.merge_profile(
new_sub_profile[idx], p[5][p[0].index(l)])
else:
@@ -468,7 +469,8 @@ class MergeFeature(object):
if node in self.nodes_seen:
return
-# These asserts ensure that the fgraph has set the clients field properly.
+# These asserts ensure that the fgraph has set the clients field
+# properly.
# The clients should at least contain `node` itself!
if node.inputs:
assert len(node.inputs[0].clients) > 0
@@ -677,7 +679,8 @@ class LocalOptimizer(object):
def add_requirements(self, fgraph):
"""
-If this local optimization wants to add some requirements to the fgraph,
+If this local optimization wants to add some requirements to the
+fgraph,
This is the place to do it.
"""
# Added by default
@@ -755,7 +758,8 @@ class _LocalOpKeyOptGroup(LocalOptGroup):
def __init__(self, optimizers):
if any(not hasattr(opt, 'op_key') for opt in optimizers):
raise TypeError("All LocalOptimizers passed here must have an op_key method.")
raise TypeError(
"All LocalOptimizers passed here must have an op_key method.")
CompositeLocalOptimizer.__init__(self, optimizers)
def op_key(self):
@@ -1133,8 +1137,8 @@ class NavigatorOptimizer(Optimizer):
def attach_updater(self, fgraph, importer, pruner, chin=None):
"""
-Install some FunctionGraph listeners to help the navigator deal with the
-ignore_trees-related functionality.
+Install some FunctionGraph listeners to help the navigator deal with
+the ignore_trees-related functionality.
:param importer: function that will be called whenever
optimizations add stuff to the graph.
@@ -1522,7 +1526,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
count_opt.append((time_lopts[opt], count, opt))
if count_opt:
-print >> stream, blanc, 'times applied - optimizer (only those applied):'
+print >> stream, blanc, \
+    'times applied - optimizer (only those applied):'
count_opt.sort()
for (t, count, opt) in count_opt[::-1]:
print >> stream, blanc, ' %.3fs - %d - %s' % (
@@ -1591,6 +1596,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
### Utilities ###
#################
def _check_chain(r, chain):
"""WRITEME"""
@@ -1633,8 +1639,8 @@ def check_chain(r, *chain):
def pre_greedy_local_optimizer(list_optimizations, out):
'''
This function traverses the computation graph described by all
-``node`` in the graph before the variable out but that are not in the fgraph.
-it applies each of the local_optimizations on the traversed graph.
+``node`` in the graph before the variable out but that are not in the
+fgraph. It applies each of the local_optimizations on the traversed graph.
Its main use is to apply locally constant folding when generating
the graph of the indices of a subtensor.
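As a small hedged example of the use case the docstring mentions, the snippet below builds a subtensor whose index graph can be constant-folded before the surrounding fgraph ever sees it; the variable names are illustrative, not part of the commit.

```python
# Sketch of the motivating case: a subtensor whose index graph is
# foldable to a constant (names are illustrative only).
import theano.tensor as tensor

x = tensor.vector('x')
idx = tensor.constant(2) + tensor.constant(3)  # foldable to 5
y = x[idx]  # pre-greedy local optimization can fold `idx` early
```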
@@ -1651,6 +1657,7 @@ def pre_greedy_local_optimizer(list_optimizations, out):
if not getattr(out, 'owner', None):
return [out], optimized_vars
node = out.owner
if hasattr(node, 'fgraph'):
return node.outputs, optimized_vars
for idx, inp in enumerate(node.inputs):
@@ -1685,10 +1692,13 @@ def pre_greedy_local_optimizer(list_optimizations, out):
else:
break
return results, optimized_vars
+if out.owner:
+    out_index = out.owner.outputs.index(out)
+else:
+    out_index = 0
final_outs, optimized_nodes = local_recursive_function(
list_optimizations, out, {}, 0)
-return final_outs[0]
+return final_outs[out_index]
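A toy illustration of why the fix above matters, in plain Python with made-up `ToyApply`/`ToyVar` stand-ins for Theano's Apply and Variable: when `out` is not the first output of its owner, `final_outs[0]` would silently return a sibling output.

```python
# Plain-Python sketch of the out_index fix; ToyApply/ToyVar are
# hypothetical stand-ins, not Theano classes.
class ToyApply(object):
    def __init__(self):
        self.outputs = []

class ToyVar(object):
    def __init__(self, owner):
        self.owner = owner

apply_node = ToyApply()
a, b = ToyVar(apply_node), ToyVar(apply_node)
apply_node.outputs = [a, b]

out = b
out_index = out.owner.outputs.index(out) if out.owner else 0
assert out_index == 1  # picks `b` itself, not its sibling `a`
```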
############
......
@@ -1334,6 +1334,18 @@ class Scan(PureOp):
tmp = ils
if any([x is not None for x in tmp]):
connection_pattern[iidx + 1][oidx] = True
+# Applying Floyd-Warshall to find all paths connecting inputs to
+# outputs. Note that if `x` is an input to `y_t` and `y_tm1` is an
+# input to `z_t` then `x` is an input to `z_t`.
+n_outs = len(node.outputs)
+for steps in xrange(n_outs):
+    for iidx in xrange(n_outs):
+        for jidx in xrange(n_outs):
+            j_inp_idx = self.get_input_pos(jidx) + 1
+            if connection_pattern[j_inp_idx][iidx] == True:
+                for k in xrange(len(connection_pattern)):
+                    if connection_pattern[k][iidx]:
+                        connection_pattern[k][jidx] = True
return connection_pattern
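The added loop is a boolean Floyd-Warshall pass. Below is a self-contained sketch of the same transitive-closure idea on a plain reachability matrix, without Scan's mapping from outputs to their recurrent inputs.

```python
# Transitive closure by Floyd-Warshall over a boolean matrix:
# if x reaches y and y reaches z, then x reaches z.
reach = [
    [True,  True,  False],  # x reaches itself and y
    [False, True,  True],   # y reaches itself and z
    [False, False, True],   # z reaches only itself
]
n = len(reach)
for k in range(n):          # intermediate node
    for i in range(n):
        for j in range(n):
            if reach[i][k] and reach[k][j]:
                reach[i][j] = True
assert reach[0][2]  # x now reaches z through y
```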
### GRAD FUNCTION
@@ -1371,17 +1383,53 @@ class Scan(PureOp):
self.inner_mitsot_outs(self_outputs) +
self.inner_sitsot_outs(self_outputs) +
self.inner_nitsot_outs(self_outputs))
+scan_node = outs[0].owner
+connection_pattern = self.connection_pattern(scan_node)
+
+def get_inp_idx(iidx):
+    if iidx < self.n_seqs:
+        return 1 + iidx
+    oidx = 1 + self.n_seqs
+    iidx = iidx - self.n_seqs
+    for taps in self.mitmot_taps():
+        if len(taps) > iidx:
+            return oidx
+        else:
+            oidx += 1
+            iidx -= len(taps)
+    for taps in self.mitsot_taps():
+        if len(taps) > iidx:
+            return oidx
+        else:
+            oidx += 1
+            iidx -= len(taps)
+    if iidx < self.info['n_sit_sot']:
+        return oidx + iidx
+    else:
+        return oidx + iidx + self.info['n_nit_sot']
+
+def get_out_idx(iidx):
+    oidx = 0
+    for taps in self.mitmot_out_taps():
+        if len(taps) > iidx:
+            return oidx
+        else:
+            oidx += 1
+            iidx -= len(taps)
+    return oidx + iidx
def compute_gradient(y, g_y):
if 'int' in str(g_y.dtype):
raise TypeError("Gradients may never be integers but g_y "
"has type " + str(g_y.type))
-wrt = [x for x in theano.gof.graph.inputs([y])
-      if x in diff_inputs]
-grads = gradient.grad(
-    cost=None,
-    known_grads={y: g_y},
+odx = get_out_idx(self_outputs.index(y))
+wrt = [x for x in theano.gof.graph.inputs([y])
+      if (x in diff_inputs) and
+      (connection_pattern[get_inp_idx(self_inputs.index(x))][odx])]
+grads = gradient.grad(
+    cost = None,
+    known_grads = {y : g_y },
wrt=wrt, consider_constant=wrt,
disconnected_inputs='ignore',
return_disconnected='None')
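A minimal sketch of the `known_grads` pattern used here, assuming a working Theano install; the toy graph below is ours, not part of the commit. Instead of differentiating a scalar cost, `grad` propagates an externally supplied gradient `g_y` backwards from `y`.

```python
# Sketch of theano.grad with known_grads (illustrative toy graph).
import numpy
import theano
import theano.tensor as tensor

x = tensor.vector('x')
y = 2 * x
g_y = tensor.ones_like(y)  # pretend this gradient flows into y
g_x = theano.grad(cost=None, known_grads={y: g_y}, wrt=[x])[0]
f = theano.function([x], g_x)
print(f(numpy.ones(3, dtype=theano.config.floatX)))  # [ 2.  2.  2.]
```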
@@ -1757,6 +1805,20 @@ class Scan(PureOp):
'Depends on a shared variable'))
else:
gradients.append(x[-1])
+# Mask disconnected gradients
+# Ideally we would want to assert that the gradients we are
+# replacing do indeed evaluate to 0, though that is not practical
+# from a computational point of view
+# The gradients of scan are computed replacing Disconnected with 0,
+# because through the recurrence they can become nonzero
+for idx in xrange(len(gradients)):
+    disconnected = True
+    for kdx in xrange(len(node.outputs)):
+        if connection_pattern[idx][kdx] and \
+                not isinstance(dC_douts[kdx].type, DisconnectedType):
+            disconnected = False
+    if disconnected:
+        gradients[idx] = DisconnectedType()()
return gradients
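A plain-Python sketch of the masking rule the block above implements, with made-up stand-in values: an input's gradient survives only if that input is connected to at least one output whose incoming gradient is itself not disconnected.

```python
# Toy version of the disconnected-gradient mask (no Theano types;
# None stands in for DisconnectedType()()).
connection_pattern = [
    [True, False],  # input 0 only feeds output 0
    [False, True],  # input 1 only feeds output 1
]
douts_disconnected = [False, True]  # no gradient flows into output 1

gradients = ['g_input0', 'g_input1']
for idx in range(len(gradients)):
    disconnected = True
    for kdx in range(len(douts_disconnected)):
        if connection_pattern[idx][kdx] and not douts_disconnected[kdx]:
            disconnected = False
    if disconnected:
        gradients[idx] = None
assert gradients == ['g_input0', None]
```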
def R_op(self, inputs, eval_points):
......
@@ -2212,8 +2212,9 @@ class T_Scan(unittest.TestCase):
cost = expr.sum()
d_cost_wrt_W = tensor.grad(cost, [W])
-f = theano.function([W, inpt], d_cost_wrt_W,
-    givens=OrderedDict([(initial, theano.shared(numpy.zeros(5)))]))
+f = theano.function(
+    [W, inpt], d_cost_wrt_W,
+    givens=OrderedDict([(initial, theano.shared(numpy.zeros(5)))]))
rval = numpy.asarray([[5187989] * 5] * 5, dtype=theano.config.floatX)
arg1 = numpy.ones((5, 5), dtype=theano.config.floatX)
@@ -3170,7 +3171,8 @@ class T_Scan(unittest.TestCase):
shared_var = theano.shared(numpy.float32(1.))
def inner_fn():
-return [], OrderedDict([(shared_var, shared_var + numpy.float32(1.))])
+return [], OrderedDict(
+    [(shared_var, shared_var + numpy.float32(1.))])
_, updates = theano.scan(inner_fn,
n_steps=10,
truncate_gradient=-1,
@@ -3243,7 +3245,8 @@ class T_Scan(unittest.TestCase):
seq = tensor.matrix()
initial_value = theano.shared(numpy.zeros((4, 1),
dtype=theano.config.floatX))
-outputs_info = [OrderedDict([('initial', initial_value), ('taps', [-4])]), None]
+outputs_info = [OrderedDict(
+    [('initial', initial_value), ('taps', [-4])]), None]
results, updates = theano.scan(fn=onestep,
sequences=seq,
outputs_info=outputs_info)
@@ -3263,7 +3266,8 @@ class T_Scan(unittest.TestCase):
seq = tensor.matrix()
initial_value = theano.shared(numpy.zeros((4, 1),
dtype=theano.config.floatX))
-outputs_info = [OrderedDict([('initial', initial_value), ('taps', [-4])]), None]
+outputs_info = [OrderedDict([('initial', initial_value),
+                             ('taps', [-4])]), None]
results, _ = theano.scan(fn=onestep,
sequences=seq,
outputs_info=outputs_info)
@@ -3279,8 +3283,7 @@ class T_Scan(unittest.TestCase):
x_tm1.name = 'x'
y_tm1.name = 'y'
z_tm1.name = 'z'
-return x_tm1 ** 2, x_tm1 + y_tm1, x_tm1 + 1
+return x_tm1 ** 2, y_tm1, x_tm1 + 1
x0 = tensor.vector('X')
y0 = tensor.vector('y0')
z0 = tensor.vector('Z')
@@ -3295,10 +3298,36 @@ class T_Scan(unittest.TestCase):
cost = x.sum()
self.assertRaises(ValueError, tensor.grad, cost, y0)
+def test_disconnected_gradient(self):
+    v = tensor.vector('v')
+    m = tensor.matrix('m')
+    u0 = tensor.zeros((7,))
+
+    [u, m2], _ = theano.scan(lambda _, u: [u, v],
+                             sequences=m,
+                             outputs_info=[u0, None])
+    # This used to raise an exception with older versions because for a
+    # disconnected gradient a non-disconnected type was returned
+    tensor.grad((m * m2).sum(), v)
+
+def test_pregreedy_optimizer(self):
+    W = tensor.zeros((5, 4))
+    bv = tensor.zeros((5,))
+    bh = tensor.zeros((4,))
+    v = tensor.matrix('v')
+    (bv_t, bh_t), _ = theano.scan(lambda _: [bv, bh], sequences=v,
+                                  outputs_info=[None, None])
+    chain, _ = theano.scan(
+        lambda x: tensor.dot(tensor.dot(x, W) + bh_t, W.T) + bv_t,
+        outputs_info=v,
+        n_steps=2)
+    theano.function([v], chain)(numpy.zeros((3, 5)))
def test_savemem_does_not_duplicate_number_of_scan_nodes(self):
var = tensor.ones(())
-values, _ = theano.scan(lambda x: ([x], (), theano.scan_module.until(x)),
-                        outputs_info=[var], n_steps=2)
+values, _ = theano.scan(lambda x: ([x], (),
+                                   theano.scan_module.until(x)),
+                        outputs_info=[var], n_steps=2)
tmp_fn = theano.function([var], values)
scan_nodes = [x for x in tmp_fn.maker.fgraph.toposort()
@@ -3371,7 +3400,6 @@ class T_Scan(unittest.TestCase):
assert numpy.allclose(outs[2], v_w + 3)
assert numpy.allclose(sh.get_value(), v_w + 4)
def test_speed():
#
# This function prints out the speed of very simple recurrent
@@ -3726,7 +3754,8 @@ def test_compute_test_value():
x = tensor.vector('x')
xv = numpy.ones(3, dtype=theano.config.floatX)
x.tag.test_value = xv
-y = theano.shared(numpy.arange(3, dtype=theano.config.floatX), name='y')
+y = theano.shared(numpy.arange(3, dtype=theano.config.floatX),
+                  name='y')
z, _ = theano.scan(
fn=lambda u, v: u + v,
sequences=[x, y])
......