Commit 42d2026e authored by lamblin

Merge pull request #1138 from pascanur/bugNicolas

Fix two bugs reported by Nicolas
@@ -52,10 +52,10 @@ class Optimizer(object):
def apply(self, fgraph):
"""WRITEME
-Applies the optimization to the provided L{FunctionGraph}. It may use all
-the methods defined by the L{FunctionGraph}. If the L{Optimizer} needs
-to use a certain tool, such as an L{InstanceFinder}, it can do
-so in its L{add_requirements} method.
+Applies the optimization to the provided L{FunctionGraph}. It may
+use all the methods defined by the L{FunctionGraph}. If the
+L{Optimizer} needs to use a certain tool, such as an
+L{InstanceFinder}, it can do so in its L{add_requirements} method.
"""
pass
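For readers new to this API, here is a minimal sketch of the interface the docstring describes; `NoopOptimizer` is a hypothetical toy subclass for illustration, not an optimizer shipped with Theano.

```python
# A minimal sketch of the Optimizer interface documented above.
# NoopOptimizer is a made-up example class, not part of Theano.
from theano.gof.opt import Optimizer

class NoopOptimizer(Optimizer):
    def add_requirements(self, fgraph):
        # Request any tools apply() depends on, e.g. an InstanceFinder.
        pass

    def apply(self, fgraph):
        # Inspect or rewrite fgraph's nodes here; this toy does nothing.
        pass
```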
@@ -208,7 +208,6 @@ class SeqOptimizer(Optimizer, list):
nb_node_after, sub_profs) = prof
blanc = (' ' * level)
print >> stream, blanc, "SeqOptimizer",
if hasattr(opts, "name"):
print >> stream, blanc, opts.name,
@@ -217,7 +216,8 @@ class SeqOptimizer(Optimizer, list):
print >> stream, (" time %.3fs for %d/%d nodes"
" before/after optimization" % (
sum(prof), nb_node_before, nb_node_after))
print >> stream, blanc, " %.3fs for fgraph.validate()" % (validate_time)
print >> stream, \
blanc, " %.3fs for fgraph.validate()" % (validate_time)
if level == 0:
print >> stream, blanc, " time - (name, class, index)"
ll = []
@@ -289,7 +289,8 @@ class SeqOptimizer(Optimizer, list):
p = prof2
new_t[idx] += p[1][p[0].index(l)]
if hasattr(l, 'merge_profile'):
-assert len(p[5][p[0].index(l)]) == len(new_sub_profile[idx])
+assert len(p[5][p[0].index(l)]) == \
+    len(new_sub_profile[idx])
new_sub_profile[idx] = l.merge_profile(
new_sub_profile[idx], p[5][p[0].index(l)])
else:
@@ -468,7 +469,8 @@ class MergeFeature(object):
if node in self.nodes_seen:
return
-# These asserts ensure that the fgraph has set the clients field properly.
+# These asserts ensure that the fgraph has set the clients field
+# properly.
# The clients should at least contain `node` itself!
if node.inputs:
assert len(node.inputs[0].clients) > 0
@@ -677,7 +679,8 @@ class LocalOptimizer(object):
def add_requirements(self, fgraph):
"""
-If this local optimization wants to add some requirements to the fgraph,
+If this local optimization wants to add some requirements to the
+fgraph,
This is the place to do it.
"""
# Added by default
@@ -755,7 +758,8 @@ class _LocalOpKeyOptGroup(LocalOptGroup):
def __init__(self, optimizers):
if any(not hasattr(opt, 'op_key') for opt in optimizers):
raise TypeError("All LocalOptimizers passed here must have an op_key method.")
raise TypeError(
"All LocalOptimizers passed here must have an op_key method.")
CompositeLocalOptimizer.__init__(self, optimizers)
def op_key(self):
@@ -1133,8 +1137,8 @@ class NavigatorOptimizer(Optimizer):
def attach_updater(self, fgraph, importer, pruner, chin=None):
"""
-Install some FunctionGraph listeners to help the navigator deal with the
-ignore_trees-related functionality.
+Install some FunctionGraph listeners to help the navigator deal with
+the ignore_trees-related functionality.
:param importer: function that will be called whenever
optimizations add stuff to the graph.
@@ -1522,7 +1526,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
count_opt.append((time_lopts[opt], count, opt))
if count_opt:
-print >> stream, blanc, 'times applied - optimizer (only those applied):'
+print >> stream, blanc, \
+    'times applied - optimizer (only those applied):'
count_opt.sort()
for (t, count, opt) in count_opt[::-1]:
print >> stream, blanc, ' %.3fs - %d - %s' % (
@@ -1591,6 +1596,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
### Utilities ###
#################
def _check_chain(r, chain):
"""WRITEME"""
@@ -1633,8 +1639,8 @@ def check_chain(r, *chain):
def pre_greedy_local_optimizer(list_optimizations, out):
'''
This function traverses the computation graph described by all
-``node`` in the graph before the variable out but that are not in the fgraph.
-it applies each of the local_optimizations on the traversed graph.
+``node`` in the graph before the variable out but that are not in the
+fgraph. It applies each of the local_optimizations on the traversed graph.
Its main use is to apply locally constant folding when generating
the graph of the indices of a subtensor.
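As a small hedged example of the use case the docstring mentions, the snippet below builds a subtensor whose index graph can be constant-folded before the surrounding fgraph ever sees it; the variable names are illustrative, not part of the commit.

```python
# Sketch of the motivating case: a subtensor whose index graph is
# foldable to a constant (names are illustrative only).
import theano.tensor as tensor

x = tensor.vector('x')
idx = tensor.constant(2) + tensor.constant(3)  # foldable to 5
y = x[idx]  # pre-greedy local optimization can fold `idx` early
```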
@@ -1651,6 +1657,7 @@ def pre_greedy_local_optimizer(list_optimizations, out):
if not getattr(out, 'owner', None):
return [out], optimized_vars
node = out.owner
if hasattr(node, 'fgraph'):
return node.outputs, optimized_vars
for idx, inp in enumerate(node.inputs):
@@ -1685,10 +1692,13 @@ def pre_greedy_local_optimizer(list_optimizations, out):
else:
break
return results, optimized_vars
+if out.owner:
+    out_index = out.owner.outputs.index(out)
+else:
+    out_index = 0
final_outs, optimized_nodes = local_recursive_function(
list_optimizations, out, {}, 0)
-return final_outs[0]
+return final_outs[out_index]
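A toy illustration of why the fix above matters, in plain Python with made-up `ToyApply`/`ToyVar` stand-ins for Theano's Apply and Variable: when `out` is not the first output of its owner, `final_outs[0]` would silently return a sibling output.

```python
# Plain-Python sketch of the out_index fix; ToyApply/ToyVar are
# hypothetical stand-ins, not Theano classes.
class ToyApply(object):
    def __init__(self):
        self.outputs = []

class ToyVar(object):
    def __init__(self, owner):
        self.owner = owner

apply_node = ToyApply()
a, b = ToyVar(apply_node), ToyVar(apply_node)
apply_node.outputs = [a, b]

out = b
out_index = out.owner.outputs.index(out) if out.owner else 0
assert out_index == 1  # picks `b` itself, not its sibling `a`
```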
############
......
@@ -1334,6 +1334,18 @@ class Scan(PureOp):
tmp = ils
if any([x is not None for x in tmp]):
connection_pattern[iidx + 1][oidx] = True
+# Applying Floyd-Warshall to find all paths connecting inputs to
+# outputs. Note that if `x` is an input to `y_t` and `y_tm1` is an
+# input to `z_t` then `x` is an input to `z_t`.
+n_outs = len(node.outputs)
+for steps in xrange(n_outs):
+    for iidx in xrange(n_outs):
+        for jidx in xrange(n_outs):
+            j_inp_idx = self.get_input_pos(jidx) + 1
+            if connection_pattern[j_inp_idx][iidx] == True:
+                for k in xrange(len(connection_pattern)):
+                    if connection_pattern[k][iidx]:
+                        connection_pattern[k][jidx] = True
return connection_pattern
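The added loop is a boolean Floyd-Warshall pass. Below is a self-contained sketch of the same transitive-closure idea on a plain reachability matrix, without Scan's mapping from outputs to their recurrent inputs.

```python
# Transitive closure by Floyd-Warshall over a boolean matrix:
# if x reaches y and y reaches z, then x reaches z.
reach = [
    [True,  True,  False],  # x reaches itself and y
    [False, True,  True],   # y reaches itself and z
    [False, False, True],   # z reaches only itself
]
n = len(reach)
for k in range(n):          # intermediate node
    for i in range(n):
        for j in range(n):
            if reach[i][k] and reach[k][j]:
                reach[i][j] = True
assert reach[0][2]  # x now reaches z through y
```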
### GRAD FUNCTION
@@ -1371,17 +1383,53 @@ class Scan(PureOp):
self.inner_mitsot_outs(self_outputs) +
self.inner_sitsot_outs(self_outputs) +
self.inner_nitsot_outs(self_outputs))
+scan_node = outs[0].owner
+connection_pattern = self.connection_pattern(scan_node)
+
+def get_inp_idx(iidx):
+    if iidx < self.n_seqs:
+        return 1 + iidx
+    oidx = 1 + self.n_seqs
+    iidx = iidx - self.n_seqs
+    for taps in self.mitmot_taps():
+        if len(taps) > iidx:
+            return oidx
+        else:
+            oidx += 1
+            iidx -= len(taps)
+    for taps in self.mitsot_taps():
+        if len(taps) > iidx:
+            return oidx
+        else:
+            oidx += 1
+            iidx -= len(taps)
+    if iidx < self.info['n_sit_sot']:
+        return oidx + iidx
+    else:
+        return oidx + iidx + self.info['n_nit_sot']
+
+def get_out_idx(iidx):
+    oidx = 0
+    for taps in self.mitmot_out_taps():
+        if len(taps) > iidx:
+            return oidx
+        else:
+            oidx += 1
+            iidx -= len(taps)
+    return oidx + iidx
def compute_gradient(y, g_y):
if 'int' in str(g_y.dtype):
raise TypeError("Gradients may never be integers but g_y "
"has type " + str(g_y.type))
-wrt = [x for x in theano.gof.graph.inputs([y])
-      if x in diff_inputs]
-grads = gradient.grad(
-    cost=None,
-    known_grads={y: g_y},
+odx = get_out_idx(self_outputs.index(y))
+wrt = [x for x in theano.gof.graph.inputs([y])
+      if (x in diff_inputs) and
+      (connection_pattern[get_inp_idx(self_inputs.index(x))][odx])]
+grads = gradient.grad(
+    cost = None,
+    known_grads = {y : g_y },
wrt=wrt, consider_constant=wrt,
disconnected_inputs='ignore',
return_disconnected='None')
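A minimal sketch of the `known_grads` pattern used here, assuming a working Theano install; the toy graph below is ours, not part of the commit. Instead of differentiating a scalar cost, `grad` propagates an externally supplied gradient `g_y` backwards from `y`.

```python
# Sketch of theano.grad with known_grads (illustrative toy graph).
import numpy
import theano
import theano.tensor as tensor

x = tensor.vector('x')
y = 2 * x
g_y = tensor.ones_like(y)  # pretend this gradient flows into y
g_x = theano.grad(cost=None, known_grads={y: g_y}, wrt=[x])[0]
f = theano.function([x], g_x)
print(f(numpy.ones(3, dtype=theano.config.floatX)))  # [ 2.  2.  2.]
```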
@@ -1757,6 +1805,20 @@ class Scan(PureOp):
'Depends on a shared variable'))
else:
gradients.append(x[-1])
+# Mask disconnected gradients
+# Ideally we would want to assert that the gradients we are
+# replacing do indeed evaluate to 0, though that is not practical
+# from a computational point of view
+# The gradients of scan are computed replacing Disconnected with 0,
+# because through the recurrence they can become nonzero
+for idx in xrange(len(gradients)):
+    disconnected = True
+    for kdx in xrange(len(node.outputs)):
+        if connection_pattern[idx][kdx] and \
+                not isinstance(dC_douts[kdx].type, DisconnectedType):
+            disconnected = False
+    if disconnected:
+        gradients[idx] = DisconnectedType()()
return gradients
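A plain-Python sketch of the masking rule the block above implements, with made-up stand-in values: an input's gradient survives only if that input is connected to at least one output whose incoming gradient is itself not disconnected.

```python
# Toy version of the disconnected-gradient mask (no Theano types;
# None stands in for DisconnectedType()()).
connection_pattern = [
    [True, False],  # input 0 only feeds output 0
    [False, True],  # input 1 only feeds output 1
]
douts_disconnected = [False, True]  # no gradient flows into output 1

gradients = ['g_input0', 'g_input1']
for idx in range(len(gradients)):
    disconnected = True
    for kdx in range(len(douts_disconnected)):
        if connection_pattern[idx][kdx] and not douts_disconnected[kdx]:
            disconnected = False
    if disconnected:
        gradients[idx] = None
assert gradients == ['g_input0', None]
```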
def R_op(self, inputs, eval_points):
......
@@ -2212,8 +2212,9 @@ class T_Scan(unittest.TestCase):
cost = expr.sum()
d_cost_wrt_W = tensor.grad(cost, [W])
-f = theano.function([W, inpt], d_cost_wrt_W,
-    givens=OrderedDict([(initial, theano.shared(numpy.zeros(5)))]))
+f = theano.function(
+    [W, inpt], d_cost_wrt_W,
+    givens=OrderedDict([(initial, theano.shared(numpy.zeros(5)))]))
rval = numpy.asarray([[5187989] * 5] * 5, dtype=theano.config.floatX)
arg1 = numpy.ones((5, 5), dtype=theano.config.floatX)
@@ -3170,7 +3171,8 @@ class T_Scan(unittest.TestCase):
shared_var = theano.shared(numpy.float32(1.))
def inner_fn():
-return [], OrderedDict([(shared_var, shared_var + numpy.float32(1.))])
+return [], OrderedDict(
+    [(shared_var, shared_var + numpy.float32(1.))])
_, updates = theano.scan(inner_fn,
n_steps=10,
truncate_gradient=-1,
@@ -3243,7 +3245,8 @@ class T_Scan(unittest.TestCase):
seq = tensor.matrix()
initial_value = theano.shared(numpy.zeros((4, 1),
dtype=theano.config.floatX))
-outputs_info = [OrderedDict([('initial', initial_value), ('taps', [-4])]), None]
+outputs_info = [OrderedDict(
+    [('initial', initial_value), ('taps', [-4])]), None]
results, updates = theano.scan(fn=onestep,
sequences=seq,
outputs_info=outputs_info)
@@ -3263,7 +3266,8 @@ class T_Scan(unittest.TestCase):
seq = tensor.matrix()
initial_value = theano.shared(numpy.zeros((4, 1),
dtype=theano.config.floatX))
-outputs_info = [OrderedDict([('initial', initial_value), ('taps', [-4])]), None]
+outputs_info = [OrderedDict([('initial', initial_value),
+                             ('taps', [-4])]), None]
results, _ = theano.scan(fn=onestep,
sequences=seq,
outputs_info=outputs_info)
@@ -3279,8 +3283,7 @@ class T_Scan(unittest.TestCase):
x_tm1.name = 'x'
y_tm1.name = 'y'
z_tm1.name = 'z'
-return x_tm1 ** 2, x_tm1 + y_tm1, x_tm1 + 1
+return x_tm1 ** 2, y_tm1, x_tm1 + 1
x0 = tensor.vector('X')
y0 = tensor.vector('y0')
z0 = tensor.vector('Z')
@@ -3295,10 +3298,36 @@ class T_Scan(unittest.TestCase):
cost = x.sum()
self.assertRaises(ValueError, tensor.grad, cost, y0)
+def test_disconnected_gradient(self):
+    v = tensor.vector('v')
+    m = tensor.matrix('m')
+    u0 = tensor.zeros((7,))
+
+    [u, m2], _ = theano.scan(lambda _, u: [u, v],
+                             sequences=m,
+                             outputs_info=[u0, None])
+    # This used to raise an exception with older versions because for a
+    # disconnected gradient a non-disconnected type was returned
+    tensor.grad((m * m2).sum(), v)
+
+def test_pregreedy_optimizer(self):
+    W = tensor.zeros((5, 4))
+    bv = tensor.zeros((5,))
+    bh = tensor.zeros((4,))
+    v = tensor.matrix('v')
+    (bv_t, bh_t), _ = theano.scan(lambda _: [bv, bh], sequences=v,
+                                  outputs_info=[None, None])
+    chain, _ = theano.scan(
+        lambda x: tensor.dot(tensor.dot(x, W) + bh_t, W.T) + bv_t,
+        outputs_info=v,
+        n_steps=2)
+    theano.function([v], chain)(numpy.zeros((3, 5)))
def test_savemem_does_not_duplicate_number_of_scan_nodes(self):
var = tensor.ones(())
-values, _ = theano.scan(lambda x: ([x], (), theano.scan_module.until(x)),
-                        outputs_info=[var], n_steps=2)
+values, _ = theano.scan(lambda x: ([x], (),
+                                   theano.scan_module.until(x)),
+                        outputs_info=[var], n_steps=2)
tmp_fn = theano.function([var], values)
scan_nodes = [x for x in tmp_fn.maker.fgraph.toposort()
@@ -3371,7 +3400,6 @@ class T_Scan(unittest.TestCase):
assert numpy.allclose(outs[2], v_w + 3)
assert numpy.allclose(sh.get_value(), v_w + 4)
def test_speed():
#
# This function prints out the speed of very simple recurrent
@@ -3726,7 +3754,8 @@ def test_compute_test_value():
x = tensor.vector('x')
xv = numpy.ones(3, dtype=theano.config.floatX)
x.tag.test_value = xv
-y = theano.shared(numpy.arange(3, dtype=theano.config.floatX), name='y')
+y = theano.shared(numpy.arange(3, dtype=theano.config.floatX),
+                  name='y')
z, _ = theano.scan(
fn=lambda u, v: u + v,
sequences=[x, y])
......