Commit 42d2026e authored by lamblin

Merge pull request #1138 from pascanur/bugNicolas

Fix two bugs reported by Nicolas
@@ -52,10 +52,10 @@ class Optimizer(object):
     def apply(self, fgraph):
         """WRITEME
-        Applies the optimization to the provided L{FunctionGraph}. It may use all
-        the methods defined by the L{FunctionGraph}. If the L{Optimizer} needs
-        to use a certain tool, such as an L{InstanceFinder}, it can do
-        so in its L{add_requirements} method.
+        Applies the optimization to the provided L{FunctionGraph}. It may
+        use all the methods defined by the L{FunctionGraph}. If the
+        L{Optimizer} needs to use a certain tool, such as an
+        L{InstanceFinder}, it can do so in its L{add_requirements} method.
         """
         pass
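
A minimal sketch of an Optimizer subclass, using only the apply/add_requirements interface the docstring above describes (the class name and body are hypothetical, not part of this commit):

    from theano.gof.opt import Optimizer

    class CountNodesOptimizer(Optimizer):
        # Hypothetical example: inspects the graph, modifies nothing.
        def add_requirements(self, fgraph):
            # The place to request tools such as an InstanceFinder,
            # e.g. via fgraph.attach_feature(...).
            pass

        def apply(self, fgraph):
            # Free to use any method defined by the FunctionGraph.
            return len(fgraph.apply_nodes)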
@@ -208,7 +208,6 @@ class SeqOptimizer(Optimizer, list):
                  nb_node_after, sub_profs) = prof
         blanc = (' ' * level)
         print >> stream, blanc, "SeqOptimizer",
-
         if hasattr(opts, "name"):
             print >> stream, blanc, opts.name,
@@ -217,7 +216,8 @@ class SeqOptimizer(Optimizer, list):
             print >> stream, (" time %.3fs for %d/%d nodes"
                               " before/after optimization" % (
                                   sum(prof), nb_node_before, nb_node_after))
-            print >> stream, blanc, " %.3fs for fgraph.validate()" % (validate_time)
+            print >> stream, \
+                blanc, " %.3fs for fgraph.validate()" % (validate_time)
             if level == 0:
                 print >> stream, blanc, " time - (name, class, index)"
             ll = []
@@ -289,7 +289,8 @@ class SeqOptimizer(Optimizer, list):
                         p = prof2
                     new_t[idx] += p[1][p[0].index(l)]
                     if hasattr(l, 'merge_profile'):
-                        assert len(p[5][p[0].index(l)]) == len(new_sub_profile[idx])
+                        assert len(p[5][p[0].index(l)]) == \
+                               len(new_sub_profile[idx])
                         new_sub_profile[idx] = l.merge_profile(
                             new_sub_profile[idx], p[5][p[0].index(l)])
                     else:
@@ -468,7 +469,8 @@ class MergeFeature(object):
         if node in self.nodes_seen:
             return
-        # These asserts ensure that the fgraph has set the clients field properly.
+        # These asserts ensure that the fgraph has set the clients field
+        # properly.
         # The clients should at least contain `node` itself!
         if node.inputs:
             assert len(node.inputs[0].clients) > 0
@@ -677,7 +679,8 @@ class LocalOptimizer(object):
     def add_requirements(self, fgraph):
         """
-        If this local optimization wants to add some requirements to the fgraph,
+        If this local optimization wants to add some requirements to the
+        fgraph,
         This is the place to do it.
         """
         # Added by default
@@ -755,7 +758,8 @@ class _LocalOpKeyOptGroup(LocalOptGroup):
     def __init__(self, optimizers):
         if any(not hasattr(opt, 'op_key'), optimizers):
-            raise TypeError("All LocalOptimizers passed here must have an op_key method.")
+            raise TypeError(
+                "All LocalOptimizers passed here must have an op_key method.")
         CompositeLocalOptimizer.__init__(self, optimizers)

     def op_key(self):
@@ -1133,8 +1137,8 @@ class NavigatorOptimizer(Optimizer):
     def attach_updater(self, fgraph, importer, pruner, chin=None):
         """
-        Install some FunctionGraph listeners to help the navigator deal with the
-        ignore_trees-related functionality.
+        Install some FunctionGraph listeners to help the navigator deal with
+        the ignore_trees-related functionality.

         :param importer: function that will be called whenever when
             optimizations add stuff to the graph.
@@ -1522,7 +1526,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                 count_opt.append((time_lopts[opt], count, opt))

         if count_opt:
-            print >> stream, blanc, 'times applied - optimizer (only those applied):'
+            print >> stream, blanc, \
+                'times applied - optimizer (only those applied):'
             count_opt.sort()
             for (t, count, opt) in count_opt[::-1]:
                 print >> stream, blanc, ' %.3fs - %d - %s' % (
@@ -1591,6 +1596,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
 ### Utilities ###
 #################

+
 def _check_chain(r, chain):
     """WRITEME"""
@@ -1633,8 +1639,8 @@ def check_chain(r, *chain):
 def pre_greedy_local_optimizer(list_optimizations, out):
     '''
     This function traverses the computation graph described by all
-    ``node`` in the graph before the variable out but that are not in the fgraph.
-    it applies each of the local_optimizations on the traversed graph.
+    ``node`` in the graph before the variable out but that are not in the
+    fgraph. it applies each of the local_optimizations on the traversed graph.

     Its main use is to apply locally constant folding when generating
     the graph of the indices of a subtensor.
@@ -1651,6 +1657,7 @@ def pre_greedy_local_optimizer(list_optimizations, out):
         if not getattr(out, 'owner', None):
             return [out], optimized_vars
         node = out.owner
+
         if hasattr(node, 'fgraph'):
             return node.outputs, optimized_vars
         for idx, inp in enumerate(node.inputs):
@@ -1685,10 +1692,13 @@ def pre_greedy_local_optimizer(list_optimizations, out):
             else:
                 break
         return results, optimized_vars

+    if out.owner:
+        out_index = out.owner.outputs.index(out)
+    else:
+        out_index = 0
     final_outs, optimized_nodes = local_recursive_function(
         list_optimizations, out, {}, 0)
-    return final_outs[0]
+    return final_outs[out_index]
 ############
...
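
The last hunk above fixes one of the two reported bugs: pre_greedy_local_optimizer used to return final_outs[0] even when out was not the first output of its owner node. A self-contained sketch of the indexing logic, with hypothetical stand-in classes rather than Theano's Variable/Apply:

    class Node(object):                # stand-in for an Apply node
        def __init__(self, outputs):
            self.outputs = outputs

    class Var(object):                 # stand-in for a Variable
        owner = None

    a, b = Var(), Var()
    node = Node([a, b])
    a.owner = node
    b.owner = node

    out = b                            # `out` is the second output
    out_index = out.owner.outputs.index(out) if out.owner else 0
    assert out_index == 1              # final_outs[out_index], not [0]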
@@ -1334,6 +1334,18 @@ class Scan(PureOp):
                 tmp = ils
                 if any([x is not None for x in tmp]):
                     connection_pattern[iidx + 1][oidx] = True

+        # Applying Floyd-Warshall to find all paths connecting inputs to
+        # outputs. Note that if `x` is an input to `y_t` and `y_tm1` is an
+        # input to `z_t` then `x` is an input to `z_t`.
+        n_outs = len(node.outputs)
+        for steps in xrange(n_outs):
+            for iidx in xrange(n_outs):
+                for jidx in xrange(n_outs):
+                    j_inp_idx = self.get_input_pos(jidx) + 1
+                    if connection_pattern[j_inp_idx][iidx] == True:
+                        for k in xrange(len(connection_pattern)):
+                            if connection_pattern[k][iidx]:
+                                connection_pattern[k][jidx] = True
+
         return connection_pattern
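
The comment in the block above describes a Floyd-Warshall-style transitive closure over the connection pattern. The same idea as a self-contained sketch on a plain boolean reachability matrix (illustrative names, not Scan's internals):

    def transitive_closure(reach):
        # reach[i][j] is True when i directly reaches j; afterwards it
        # is True whenever any path connects i to j.
        n = len(reach)
        for k in range(n):
            for i in range(n):
                if reach[i][k]:
                    for j in range(n):
                        if reach[k][j]:
                            reach[i][j] = True
        return reach

    # x feeds y_t, and y_tm1 feeds z_t, so x transitively feeds z_t.
    r = [[False, True, False],
         [False, False, True],
         [False, False, False]]
    assert transitive_closure(r)[0][2]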
     ### GRAD FUNCTION
@@ -1371,17 +1383,53 @@ class Scan(PureOp):
                     self.inner_mitsot_outs(self_outputs) +
                     self.inner_sitsot_outs(self_outputs) +
                     self.inner_nitsot_outs(self_outputs))

+        scan_node = outs[0].owner
+        connection_pattern = self.connection_pattern(scan_node)
+
+        def get_inp_idx(iidx):
+            if iidx < self.n_seqs:
+                return 1 + iidx
+            oidx = 1 + self.n_seqs
+            iidx = iidx - self.n_seqs
+            for taps in self.mitmot_taps():
+                if len(taps) > iidx:
+                    return oidx
+                else:
+                    oidx += 1
+                    iidx -= len(taps)
+            for taps in self.mitsot_taps():
+                if len(taps) > iidx:
+                    return oidx
+                else:
+                    oidx += 1
+                    iidx -= len(taps)
+            if iidx < self.info['n_sit_sot']:
+                return oidx + iidx
+            else:
+                return oidx + iidx + self.info['n_nit_sot']
+
+        def get_out_idx(iidx):
+            oidx = 0
+            for taps in self.mitmot_out_taps():
+                if len(taps) > iidx:
+                    return oidx
+                else:
+                    oidx += 1
+                    iidx -= len(taps)
+            return oidx + iidx
+
         def compute_gradient(y, g_y):
             if 'int' in str(g_y.dtype):
                 raise TypeError("Gradients may never be integers but g_y "
                                 "has type " + str(g_y.type))
+            odx = get_out_idx(self_outputs.index(y))
             wrt = [x for x in theano.gof.graph.inputs([y])
-                   if x in diff_inputs]
+                   if (x in diff_inputs) and
+                   (connection_pattern[get_inp_idx(self_inputs.index(x))][odx])]
             grads = gradient.grad(
-                cost=None,
-                known_grads={y: g_y},
+                cost = None,
+                known_grads = {y : g_y },
                 wrt=wrt, consider_constant=wrt,
                 disconnected_inputs='ignore',
                 return_disconnected='None')
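
get_inp_idx above maps an index into Scan's flat list of inner inputs to the matching row of connection_pattern: slot 0 is the n_steps argument, sequences take one slot each, and a recurrence with several taps packs several inner inputs into one outer slot. A toy version of that counting, with assumed tap lists rather than Scan's real bookkeeping:

    def flat_to_outer(iidx, n_seqs, tap_lists):
        if iidx < n_seqs:              # sequences: one slot each,
            return 1 + iidx            # offset by the n_steps argument
        oidx = 1 + n_seqs
        iidx -= n_seqs
        for taps in tap_lists:         # each recurrence consumes
            if len(taps) > iidx:       # len(taps) inner inputs but
                return oidx            # only one outer slot
            oidx += 1
            iidx -= len(taps)
        return oidx + iidx

    assert flat_to_outer(0, n_seqs=1, tap_lists=[[-2, -1]]) == 1  # sequence
    assert flat_to_outer(1, n_seqs=1, tap_lists=[[-2, -1]]) == 2  # tap -2
    assert flat_to_outer(2, n_seqs=1, tap_lists=[[-2, -1]]) == 2  # tap -1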
@@ -1757,6 +1805,20 @@ class Scan(PureOp):
                     'Depends on a shared variable'))
             else:
                 gradients.append(x[-1])

+        # Mask disconnected gradients
+        # Ideally we would want to assert that the gradients we are
+        # replacing do indeed evaluate to 0, though that is not practical
+        # from a computational point of view
+        # The gradients of scan are computed replacing Disconnected with 0,
+        # because through the recurrence they can become nonzero
+        for idx in xrange(len(gradients)):
+            disconnected = True
+            for kdx in xrange(len(node.outputs)):
+                if connection_pattern[idx][kdx] and \
+                   not isinstance(dC_douts[kdx].type, DisconnectedType):
+                    disconnected = False
+            if disconnected:
+                gradients[idx] = DisconnectedType()()
         return gradients

     def R_op(self, inputs, eval_points):
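
The masking loop above reports an input's gradient as disconnected only when every output that input is connected to received a disconnected output gradient. The same rule as a standalone sketch, with None standing in for DisconnectedType()():

    def mask_disconnected(gradients, connection_pattern, out_disconnected):
        # Keep a gradient only if some connected output got a real gradient.
        for idx in range(len(gradients)):
            connected = any(
                connection_pattern[idx][kdx] and not out_disconnected[kdx]
                for kdx in range(len(out_disconnected)))
            if not connected:
                gradients[idx] = None   # stand-in for DisconnectedType()()
        return gradients

    grads = mask_disconnected(['g0', 'g1'],
                              [[True, False], [False, True]],
                              out_disconnected=[False, True])
    assert grads == ['g0', None]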
...
@@ -2212,7 +2212,8 @@ class T_Scan(unittest.TestCase):
         cost = expr.sum()
         d_cost_wrt_W = tensor.grad(cost, [W])

-        f = theano.function([W, inpt], d_cost_wrt_W,
+        f = theano.function(
+            [W, inpt], d_cost_wrt_W,
             givens=OrderedDict([(initial, theano.shared(numpy.zeros(5)))]))

         rval = numpy.asarray([[5187989] * 5] * 5, dtype=theano.config.floatX)
@@ -3170,7 +3171,8 @@ class T_Scan(unittest.TestCase):
         shared_var = theano.shared(numpy.float32(1.))

         def inner_fn():
-            return [], OrderedDict([(shared_var, shared_var + numpy.float32(1.))])
+            return [], OrderedDict(
+                [(shared_var, shared_var + numpy.float32(1.))])

         _, updates = theano.scan(inner_fn,
                                  n_steps=10,
                                  truncate_gradient=-1,
@@ -3243,7 +3245,8 @@ class T_Scan(unittest.TestCase):
         seq = tensor.matrix()
         initial_value = theano.shared(numpy.zeros((4, 1),
                                                   dtype=theano.config.floatX))
-        outputs_info = [OrderedDict([('initial', initial_value), ('taps', [-4])]), None]
+        outputs_info = [OrderedDict(
+            [('initial', initial_value), ('taps', [-4])]), None]
         results, updates = theano.scan(fn=onestep,
                                        sequences=seq,
                                        outputs_info=outputs_info)
@@ -3263,7 +3266,8 @@ class T_Scan(unittest.TestCase):
         seq = tensor.matrix()
         initial_value = theano.shared(numpy.zeros((4, 1),
                                                   dtype=theano.config.floatX))
-        outputs_info = [OrderedDict([('initial', initial_value), ('taps', [-4])]), None]
+        outputs_info = [OrderedDict([('initial', initial_value),
+                                     ('taps', [-4])]), None]
         results, _ = theano.scan(fn=onestep,
                                  sequences=seq,
                                  outputs_info=outputs_info)
@@ -3279,8 +3283,7 @@ class T_Scan(unittest.TestCase):
             x_tm1.name = 'x'
             y_tm1.name = 'y'
             z_tm1.name = 'z'
-            return x_tm1 ** 2, x_tm1 + y_tm1, x_tm1 + 1
-
+            return x_tm1 ** 2, y_tm1, x_tm1 + 1
         x0 = tensor.vector('X')
         y0 = tensor.vector('y0')
         z0 = tensor.vector('Z')
@@ -3295,9 +3298,35 @@ class T_Scan(unittest.TestCase):
         cost = x.sum()
         self.assertRaises(ValueError, tensor.grad, cost, y0)

+    def test_disconnected_gradient(self):
+        v = tensor.vector('v')
+        m = tensor.matrix('m')
+        u0 = tensor.zeros((7,))
+
+        [u, m2], _ = theano.scan(lambda _, u: [u, v],
+                                 sequences=m,
+                                 outputs_info=[u0, None])
+        # This used to raise an exception with older versions because for a
+        # disconnected gradient a non disconnected type was returned
+        tensor.grad((m * m2).sum(), v)
+
+    def test_pregreedy_optimizer(self):
+        W = tensor.zeros((5, 4))
+        bv = tensor.zeros((5,))
+        bh = tensor.zeros((4,))
+        v = tensor.matrix('v')
+        (bv_t, bh_t), _ = theano.scan(lambda _: [bv, bh], sequences=v,
+                                      outputs_info=[None, None])
+        chain, _ = theano.scan(
+            lambda x: tensor.dot(tensor.dot(x, W) + bh_t, W.T) + bv_t,
+            outputs_info=v,
+            n_steps=2)
+        theano.function([v], chain)(numpy.zeros((3, 5)))
+
     def test_savemem_does_not_duplicate_number_of_scan_nodes(self):
         var = tensor.ones(())
-        values, _ = theano.scan(lambda x: ([x], (), theano.scan_module.until(x)),
+        values, _ = theano.scan(lambda x: ([x], (),
+                                           theano.scan_module.until(x)),
                                 outputs_info=[var], n_steps=2)

         tmp_fn = theano.function([var], values)
@@ -3371,7 +3400,6 @@ class T_Scan(unittest.TestCase):
         assert numpy.allclose(outs[2], v_w + 3)
         assert numpy.allclose(sh.get_value(), v_w + 4)

-
 def test_speed():
     #
     # This function prints out the speed of very simple recurrent
@@ -3726,7 +3754,8 @@ def test_compute_test_value():
     x = tensor.vector('x')
     xv = numpy.ones(3, dtype=theano.config.floatX)
     x.tag.test_value = xv
-    y = theano.shared(numpy.arange(3, dtype=theano.config.floatX), name='y')
+    y = theano.shared(numpy.arange(3, dtype=theano.config.floatX),
+                      name='y')
     z, _ = theano.scan(
         fn=lambda u, v: u + v,
         sequences=[x, y])
...