Merge pull request #2323 from carriepl/scan_push_out_sum_of_dot

Scan push out sum of dot

Merge pull request #2323 from carriepl/scan_push_out_sum_of_dot
06cc52d7 · Frédéric Bastien · f3fc3be2 · eb1d9063 · 06cc52d7 · 06cc52d7
--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -1427,6 +1427,11 @@ class Scan(PureOp):
        else:
            grad_steps = inputs[0]

+        # Restrict the number of grad steps according to
+        # self.truncate_gradient
+        if self.truncate_gradient != -1:
+            grad_steps = tensor.minimum(grad_steps, self.truncate_gradient)
+
        rval = scan_utils.reconstruct_graph(self.inputs,
                                            self.outputs)
        self_inputs = rval[0]
@@ -1652,6 +1657,10 @@ class Scan(PureOp):
        outer_inp_seqs += [x[::-1][:-1] for x in self.outer_sitsot_outs(outs)]
        outer_inp_seqs += [x[::-1] for x in self.outer_nitsot_outs(outs)]

+        # Restrict the length of the outer sequences to the number of grad
+        # steps
+        outer_inp_seqs = [seq[:grad_steps] for seq in outer_inp_seqs]
+
        inner_inp_seqs = self.inner_seqs(self_inputs)
        inner_inp_seqs += self.inner_mitmot(self_inputs)
        inner_inp_seqs += self.inner_mitsot(self_inputs)
@@ -1820,9 +1829,6 @@ class Scan(PureOp):
            ins_pos += 1
            n_mitmot_inps += 2

-        if self.truncate_gradient != -1:
-            grad_steps = tensor.minimum(grad_steps, self.truncate_gradient)
-
        n_nit_sot = self.n_seqs
        inner_out_nitsot = dC_dinps_t[:self.n_seqs]
        inner_out_sitsot = dC_dinps_t[ins_pos:]

--- a/theano/scan_module/scan_opt.py
+++ b/theano/scan_module/scan_opt.py
@@ -605,11 +605,6 @@ class PushOutScanOutput(gof.Optimizer):

    def process_node(self, fgraph, node):

-        clean_inputs, clean_outputs = scan_utils.reconstruct_graph(
-            node.op.inputs, node.op.outputs)
-
-        local_fgraph = gof.FunctionGraph(clean_inputs, clean_outputs, clone=False)
-
        op = node.op

        # Use scan_args to parse the inputs and outputs of scan for ease of
@@ -617,29 +612,21 @@ class PushOutScanOutput(gof.Optimizer):
        args = scan_args(node.inputs, node.outputs,
                         node.op.inputs, node.op.outputs, node.op.info)

-        # Obtain the list containing the indices, in clean_outputs, of the
-        # scan op's outputs that are nit_sot (not fed back to the inner fct.)
-        nitsot_outs = op.inner_nitsot_outs(node.outputs)
-        idx_nitsot_outs = [node.outputs.index(i) for i in nitsot_outs]
-
-        # Construct the list of non_sequences to simplify a few things
-        inner_non_seqs = op.inner_non_seqs(clean_inputs)
-        outer_non_seqs = op.outer_non_seqs(node.inputs)
-        assert len(inner_non_seqs) == len(outer_non_seqs)
-
-        inner_seqs = op.inner_seqs(clean_inputs)
-        outer_seqs = op.outer_seqs(node.inputs)
+        local_fgraph = gof.FunctionGraph(args.inner_inputs,
+                                         args.inner_outputs,
+                                         clone=False)

        new_scan_node = None

        for nd in local_fgraph.toposort():

            if (isinstance(nd.op, theano.tensor.Dot) and
-                  nd.out in clean_outputs):
+                nd.out in args.inner_out_nit_sot):

                """
                The following optimization involves pushing out, after the
-                scan, a Dot where one input is one of scan's input with ndim=2
+                scan, a Dot whose output is nitsot (not fed back to the inner
+                graph) and where one input is one of scan's input with ndim=2
                and the other is an intermediate variable in the Scan inner
                graph with ndim=1.

@@ -648,29 +635,11 @@ class PushOutScanOutput(gof.Optimizer):
                concatenating the vectors into a matrix.
                """

-                # Go through clean_outputs and pick one that is
-                # - Equal to the output of the tensor.Dot
-                # - Nit_sot : not fed back to the inner graph because applying
-                #   the optimization in that case would alter the results of
-                #   the function
-                # - Used by something outside of the graph to avoid applying
-                #   the optimization needlessly
-                idx_dot_output = -1
-                for i in range(len(clean_outputs)):
-
-                    is_dot_output = (nd.out == clean_outputs[i])
-                    is_nitsot_output = i in idx_nitsot_outs
-                    used_in_outer_graph = (len(node.outputs[i].clients) > 0)
-
-                    if (is_dot_output and is_nitsot_output and
-                        used_in_outer_graph):
-
-                        idx_dot_output = i
-                        break
-
-                if idx_dot_output == -1:
-                    # The dot has no output that fits the requirements for
-                    # this optimization. Move on to the next node.
+                # Ensure that the output of the Dot is used in the outer
+                # graph to avoid applying the optimization needlessly
+                dot_out_nitsot_idx = args.inner_out_nit_sot.index(nd.out)
+                outer_dot_output = args.outer_out_nit_sot[dot_out_nitsot_idx]
+                if len(outer_dot_output.clients) == 0:
                    continue

                """
@@ -684,75 +653,39 @@ class PushOutScanOutput(gof.Optimizer):
                idx_vector_input = -1

                if (nd.inputs[0].ndim == 2 and
-                    (nd.inputs[0] in inner_non_seqs or
+                    (nd.inputs[0] in args.inner_in_non_seqs or
                     isinstance(nd.inputs[0], tensor.Constant)) and
                    nd.inputs[1].ndim == 1 and
-                      (nd.inputs[1] in inner_seqs or
-                       nd.inputs[1] not in clean_inputs)):
+                      (nd.inputs[1] in args.inner_in_seqs or
+                       nd.inputs[1] not in args.inner_inputs)):

                    valid_inputs = True
                    idx_matrix_input = 0
                    idx_vector_input = 1

                elif (nd.inputs[1].ndim == 2 and
-                      (nd.inputs[1] in inner_non_seqs or
+                      (nd.inputs[1] in args.inner_in_non_seqs or
                       isinstance(nd.inputs[1], tensor.Constant)) and
                      nd.inputs[0].ndim == 1 and
-                      (nd.inputs[0] in inner_seqs or
-                       nd.inputs[0] not in clean_inputs)):
+                      (nd.inputs[0] in args.inner_in_seqs or
+                       nd.inputs[0] not in args.inner_inputs)):

                    valid_inputs = True
                    idx_matrix_input = 1
                    idx_vector_input = 0

+
                if valid_inputs:
                    # The optimization can be applied on the current Dot

-                    # Create a copy of the Dot's matrix input outside
-                    # of scan
-                    inner_matrix_input = nd.inputs[idx_matrix_input]
-                    if inner_matrix_input in inner_non_seqs:
-                        _idx = inner_non_seqs.index(inner_matrix_input)
-                        outer_matrix_input = outer_non_seqs[_idx]
-                    elif isinstance(inner_matrix_input, theano.Constant):
-                        outer_matrix_input = inner_matrix_input.clone()
-                    else:
-                        # Should not have happened
-                        raise Exception(
-                            ('Error in the `scan_pushout_seq_'
-                             'operations`. The optimization tries '
-                             'to move some computation fron scan '
-                             'which is not allowed to move. Report '
-                             'this on theano-users list'),
-                             inner_matrix_input)
-
-                    # If the vector_input is already a nit_sot output of the
-                    # scan, get a reference to the corresponding outer output.
-                    # Otherwise, add it as a new nit_sot output and then get a
-                    # reference to it
-                    if nd.inputs[idx_vector_input] in inner_seqs:
-                        _idx = inner_seqs.index(nd.inputs[idx_vector_input])
-                        outer_vector_input = outer_seqs[_idx]
-
-                    elif nd.inputs[idx_vector_input] in nitsot_outs:
-                        # Figure out which scan output corresponds the vector
-                        # input
-                        inner_vector_input = nd.inputs[idx_vector_input]
-                        vector_input_nitsot_idx = args.inner_out_nit_sot.index(inner_vector_input)
-                        outer_vector_input = args.outer_out_nit_sot[vector_input_nitsot_idx]
-
-                    else:
-                        # Add the vector_input as a new nitsot output to scan
-                        new_output_inner = nd.inputs[idx_vector_input]
-                        new_scan_node, idx_old_outputs, idx_new_output = self.add_nitsot_outputs(
-                                                                                        fgraph, node,
-                                                                                        clean_inputs,
-                                                                                        clean_outputs,
-                                                                                        new_output_inner)
-                        outer_vector_input = new_scan_node.outputs[idx_new_output]
-
-                        node = new_scan_node
-                        idx_dot_output = idx_old_outputs[idx_dot_output]
+                    # Move out of scan the two inputs to the Dot
+                    (outer_vars,
+                     new_scan_node,
+                     new_scan_args) = self.push_out_inner_vars(fgraph,
+                                                               nd.inputs,
+                                                               node, args)
+                    outer_vector_input = outer_vars[idx_vector_input]
+                    outer_matrix_input = outer_vars[idx_matrix_input]

                    # Perform the Dot outside of scan
                    if idx_matrix_input == 0:
@@ -766,79 +699,220 @@ class PushOutScanOutput(gof.Optimizer):

                    # Modify the outer graph to add the outer Dot
                    fgraph.replace_all([
-                           (node.outputs[idx_dot_output],
+                           (new_scan_args.outer_out_nit_sot[dot_out_nitsot_idx],
                            outer_dot_output)],
                           reason="scanOp_pushout_output")

                    break

+            elif (isinstance(nd.op, theano.tensor.elemwise.Elemwise) and
+                  isinstance(nd.op.nfunc, numpy.ufunc) and
+                  nd.op.nfunc.__name__ == 'add' and
+                  nd.out in args.inner_out_sit_sot and
+                  self.inner_sitsot_only_last_step_used(nd.out, args)):
+
+                # Ensure that one of the inputs to the add is the output of
+                # the add from a previous iteration of the inner function
+                sitsot_idx = args.inner_out_sit_sot.index(nd.out)
+                if args.inner_in_sit_sot[sitsot_idx] in nd.inputs:
+
+                    # Ensure that the other input to the add is a dot product
+                    # between 2 matrices which will become a tensor3 and a
+                    # matrix if pushed outside of the scan. Also make sure
+                    # that the output of the Dot is ONLY used by the 'add'
+                    # otherwise doing a Dot in the outer graph will only
+                    # duplicate computation.
+
+                    sitsot_in_idx = nd.inputs.index(args.inner_in_sit_sot[sitsot_idx])
+
+                    dot_in_idx = 1 - sitsot_in_idx # 0 if sitsot_in_idx==1,
+                                                   # 1 if sitsot_in_idx==0
+                    dot_input = nd.inputs[dot_in_idx]
+
+                    if (dot_input.owner is not None and
+                        isinstance(dot_input.owner.op, theano.tensor.Dot) and
+                        len(dot_input.clients) == 1 and
+                        dot_input.owner.inputs[0].ndim == 2 and
+                        dot_input.owner.inputs[1].ndim == 2 and
+                        self.get_outer_ndim(dot_input.owner.inputs[0], args) == 3 and
+                        self.get_outer_ndim(dot_input.owner.inputs[1], args) == 3):
+
+                        # The optimization can be applied in this case.
+
+                        # Move out of scan the two inputs to the Dot and
+                        # perform a dot outside of scan on these two inputs
+                        inner_dot_inputs = nd.inputs[dot_in_idx].owner.inputs
+                        (outer_dot_inputs,
+                         new_scan_node,
+                         new_scan_args) = self.push_out_inner_vars(fgraph,
+                                                                   inner_dot_inputs,
+                                                                   node, args)
+
+
+                        # Collapse some of the dimensions of the tensors
+                        # so that they become matrices. This is because a
+                        # dot is usually faster on two large matrices than
+                        # a bunch of small ones
+                        outer_dot_inputs[0] = theano.tensor.flatten(
+                                       outer_dot_inputs[0].dimshuffle(1,0,2),
+                                       outdim=2)
+
+                        shape_input1 = theano.tensor.shape(outer_dot_inputs[1])
+                        outer_dot_inputs[1] = outer_dot_inputs[1].reshape((shape_input1[0] *
+                                                                           shape_input1[1],
+                                                                           shape_input1[2]))
+
+                        # Perform the dot on the newly obtained matrices and
+                        # add the initial value
+                        outer_dot_output = theano.tensor.dot(*outer_dot_inputs)
+                        init_value = new_scan_args.outer_in_sit_sot[sitsot_idx][0]
+                        replacement = outer_dot_output + init_value
+
+                        # Alter the outer graph to use the output of the
+                        # external Dot instead of the output of scan
+                        # Modify the outer graph to add the outer Dot
+                        outer_sitsot = new_scan_args.outer_out_sit_sot[sitsot_idx]
+                        subtensor_node = outer_sitsot.clients[0][0]
+                        outer_sitsot_last_step = subtensor_node.outputs[0]
+
+                        fgraph.replace_all([
+                            (outer_sitsot_last_step, replacement)],
+                            reason="scanOp_pushout_output")
+
+                        break
+
        return new_scan_node

-    def add_nitsot_outputs(self, fgraph, scan_node, clean_inputs,
-                                    clean_outputs, new_output_inner):
+    def inner_sitsot_only_last_step_used(self, var, scan_args):
        """
-        Create a new scan that takes the same inputs as scan_node and produces
-        the same output as well as the provided output new_output_inner
+        Given an inner sit_sot output of scan, return True iff the outer
+        sit_sot output has only one client and that client is a Subtensor
+        instance that takes only the last step (last element along the first
+        axis).
        """
+        idx = scan_args.inner_out_sit_sot.index(var)
+        outer_var = scan_args.outer_out_sit_sot[idx]

-        # Compute the index at which to insert the new output. For a scan Op,
-        # the outputs follow the ordering : mit_mot, mit_sot, sis_sot, nit_sot
-        # and shared_outs
-        output_insert_idx = (scan_node.op.info['n_mit_mot'] +
-                             scan_node.op.info['n_mit_sot'] +
-                             scan_node.op.info['n_sit_sot'] +
-                             scan_node.op.info['n_nit_sot'])
-
-
-        # Compile list of new inputs and outputs for the new Scan op
-        _nw_op_ins = clean_inputs
-        _nw_op_outs = (scan_utils.clone(clean_outputs[:output_insert_idx]) +
-                       [new_output_inner] +
-                       scan_utils.clone(clean_outputs[output_insert_idx:]))
-        nw_op_ins, nw_op_outs = scan_utils.reconstruct_graph(_nw_op_ins,
-                                                             _nw_op_outs)
-
-        # Compile a list containing, for every output of the old scan op,
-        # what its output index will be under the new scan op
-        nw_op_output_indices = [i + int(i>output_insert_idx)
-                                for i in range(output_insert_idx)]
-
-        # Construct the new Scan op
-        nw_info = scan_node.op.info.copy()
-        nw_info['n_nit_sot'] += 1
-        nw_scan = scan_op.Scan(nw_op_ins, nw_op_outs, nw_info)
-
-        # Assemble the lists of inputs for the node that will apply the new
-        # scan op by inserting an initial value for the new input in the
-        # at the right position in the list of inputs for the old node.
-        nw_node_input_idx = (scan_node.op.info['n_seqs'] +
-                             scan_node.op.info['n_mit_mot'] +
-                             scan_node.op.info['n_mit_sot'] +
-                             scan_node.op.info['n_sit_sot'] +
-                             scan_node.op.info['n_shared_outs'] +
-                             scan_node.op.info['n_nit_sot'])
+        if len(outer_var.clients) == 1:

-        # (the initial value is the nb of steps to store. For a nistot,
-        # it should be the number of steps performed by scan)
-        nw_node_input_init_value = scan_node.inputs[0]
+            client = outer_var.clients[0][0]
+
+            if (client != 'output' and
+                isinstance(client.op, theano.tensor.Subtensor) and
+                isinstance(client.inputs[1], theano.Constant) and
+                client.inputs[1].ndim == 0 and
+                client.inputs[1].value == -1):
+
+                return True

-        nw_node_inputs = (scan_node.inputs[:nw_node_input_idx] +
-                          [nw_node_input_init_value] +
-                          scan_node.inputs[nw_node_input_idx:])
+        return False

-        # Build the Scan's apply node
-        nw_node = nw_scan(*nw_node_inputs, **dict(return_list=True))[0].owner
+    def get_outer_ndim(self, var, scan_args):

-        nw_node_old_outputs = (nw_node.outputs[:output_insert_idx] +
-                               nw_node.outputs[output_insert_idx+1:])
+        # Given a variable, determine the number of dimensions it would have if
+        # it was pushed out of scan
+        if (var in scan_args.inner_in_non_seqs or
+            isinstance(var, theano.Constant)):

-        # Make sure the outputs of the new scan op are used instead of the old
-        fgraph.replace_all(
-            zip(scan_node.outputs, nw_node_old_outputs),
-            reason='scanOp_pushout_output')
+            outer_ndim = var.ndim
+        else:
+            outer_ndim = var.ndim + 1
+
+        return outer_ndim
+
+    def push_out_inner_vars(self, fgraph, inner_vars, old_scan_node,
+                            old_scan_args):
+
+        outer_vars = [None] * len(inner_vars)
+        new_scan_node = old_scan_node
+        new_scan_args = old_scan_args
+
+        # For the inner_vars that already exist in the outer graph,
+        # simply obtain a reference to them
+        for idx in range(len(inner_vars)):
+
+            var = inner_vars[idx]
+
+            if var in old_scan_args.inner_in_seqs:
+                idx_seq = old_scan_args.inner_in_seqs.index(var)
+                outer_vars[idx] = old_scan_args.outer_in_seqs[idx_seq]
+
+            elif var in old_scan_args.inner_in_non_seqs:
+                idx_non_seq = old_scan_args.inner_in_non_seqs.index(var)
+                outer_vars[idx] = old_scan_args.outer_in_non_seqs[idx_non_seq]

-        return nw_node, nw_op_output_indices, output_insert_idx
+            elif isinstance(var, theano.Constant):
+                outer_vars[idx] = var.clone()

+            elif var in old_scan_args.inner_out_nit_sot:
+                idx_nitsot = old_scan_args.inner_out_nit_sot.index(var)
+                outer_vars[idx] = old_scan_args.outer_out_nit_sot[idx_nitsot]
+
+        # For the inner_vars that don't already exist in the outer graph, add
+        # them as new nitsot outputs to the scan node.
+        idx_add_as_nitsots = [i for i in range(len(outer_vars))
+                              if outer_vars[i] == None]
+        add_as_nitsots = [inner_vars[idx] for idx in idx_add_as_nitsots]
+
+        if len(add_as_nitsots) > 0:
+
+            new_scan_node = self.add_nitsot_outputs(fgraph,old_scan_node,
+                                                    old_scan_args,
+                                                    add_as_nitsots)
+
+            new_scan_args = scan_args(new_scan_node.inputs,
+                                      new_scan_node.outputs,
+                                      new_scan_node.op.inputs,
+                                      new_scan_node.op.outputs,
+                                      new_scan_node.op.info)
+
+            new_outs = new_scan_args.outer_out_nit_sot[-len(add_as_nitsots):]
+            for i in range(len(new_outs)):
+                outer_vars[idx_add_as_nitsots[i]] = new_outs[i]
+
+        return outer_vars, new_scan_node, new_scan_args
+
+    def add_nitsot_outputs(self, fgraph, old_scan_node,
+                           old_scan_args, new_outputs_inner):
+
+        nb_new_outs = len(new_outputs_inner)
+
+        # Create the initial values for the new nitsot outputs
+        # (the initial value is the nb of steps to store. For a nitsot,
+        # it should be the number of steps performed by scan)
+        new_nitsots_initial_value = [old_scan_node.inputs[0]
+                                     for i in range(nb_new_outs)]
+
+        # Create the scan_args corresponding to the new scan op to
+        # create
+        new_scan_args = copy.copy(old_scan_args)
+        new_scan_args.inner_out_nit_sot.extend(new_outputs_inner)
+        new_scan_args.outer_in_nit_sot.extend(new_nitsots_initial_value)
+
+        # Create the scan op from the scan_args
+        new_scan_op = scan_op.Scan(new_scan_args.inner_inputs,
+                                   new_scan_args.inner_outputs,
+                                   new_scan_args.info)
+
+        # Create the Apply node for the scan op
+        new_scan_node = new_scan_op(*new_scan_args.outer_inputs,
+                                    **dict(return_list=True))[0].owner
+
+        # Modify the outer graph to make sure the outputs of the new scan are
+        # used instead of the outputs of the old scan
+        new_node_new_outputs_idx = (len(old_scan_args.outer_outputs) -
+                                    len(old_scan_args.outer_out_shared))
+
+        new_node_old_outputs = (
+                new_scan_node.outputs[:new_node_new_outputs_idx] +
+                new_scan_node.outputs[new_node_new_outputs_idx+nb_new_outs:])
+
+        fgraph.replace_all_validate_remove(
+            zip(old_scan_node.outputs, new_node_old_outputs),
+            remove=[old_scan_node],
+            reason='scanOp_pushout_output')
+
+        return new_scan_node

 class ScanInplaceOptimizer(Optimizer):
    """Graph optimizer for Scan(makes it run inplace)"""

--- a/theano/scan_module/tests/test_scan_opt.py
+++ b/theano/scan_module/tests/test_scan_opt.py
@@ -157,12 +157,11 @@ class TestPushOutScanOutputDot(object):

        # Compile the function twice, once with the optimization and once
        # without
-        f_opt = theano.function([v, m], T.jacobian(output, v))
+        opt_mode = mode.including("scan")
+        f_opt = theano.function([v, m], T.jacobian(output, v), mode=opt_mode)

-        default_mode = theano.compile.get_default_mode()
-        default_mode.excluding("scanOp_pushout_output")
-        f_no_opt = theano.function([v, m], T.jacobian(output, v),
-                                   mode=default_mode)
+        no_opt_mode = mode.excluding("scanOp_pushout_output")
+        f_no_opt = theano.function([v, m], T.jacobian(output, v), mode=no_opt_mode)

        # Ensure that the optimization was performed correctly in f_opt
        # The inner function of scan should have only one output and it should
@@ -248,11 +247,11 @@ class TestPushOutScanOutputDot(object):

        # Compile the function twice, once with the optimization and once
        # without
-        f_opt = theano.function([a, b], outputs)
+        opt_mode = mode.including("scan")
+        f_opt = theano.function([a, b], outputs, mode=opt_mode)

-        default_mode = theano.compile.get_default_mode()
-        default_mode.excluding("scanOp_pushout_output")
-        f_no_opt = theano.function([a, b], outputs, mode=default_mode)
+        no_opt_mode = mode.excluding("scanOp_pushout_output")
+        f_no_opt = theano.function([a, b], outputs, mode=no_opt_mode)

        # Ensure that the optimization was performed correctly in f_opt
        # The inner function of scan should have only one output and it should
@@ -272,3 +271,150 @@ class TestPushOutScanOutputDot(object):

        utt.assert_allclose(output_opt[0], output_no_opt[0])
        utt.assert_allclose(output_opt[1], output_no_opt[1])
+
+class TestPushOutSumOfDot():
+    """
+    Test case for the PushOutScanOutput optimizer in the case where the scan
+    is used to compute the sum over the dot products between the corresponding
+    elements of two lists of matrices.
+    """
+
+    def test_machine_translation(self):
+        """
+        This test case comes from https://github.com/rizar/scan-grad-speed and
+        is an example of actual computation done with scan in the context of
+        machine translation
+
+        'dim' has been reduced from 1000 to 5 to make the test run faster
+        """
+
+        # Parameters from an actual machine translation run
+        batch_size = 80
+        seq_len = 50
+        n_words = 80 * 50
+        dim = 5
+
+        # Weight matrices
+        U = theano.shared(numpy.random.normal(size=(dim, dim),
+                                              scale=0.0001).astype(config.floatX))
+        U.name = 'U'
+        V = theano.shared(U.get_value())
+        V.name = 'V'
+        W = theano.shared(U.get_value())
+        W.name = 'W'
+
+        # Variables and their values
+        x = T.tensor3('x')
+        x_value = numpy.random.normal(size=(seq_len, batch_size, dim),
+                                      scale=0.0001).astype(config.floatX)
+
+        ri = T.tensor3('ri')
+        ri_value = x_value
+
+        zi = T.tensor3('zi')
+        zi_value = x_value
+
+        init = T.alloc(numpy.cast[config.floatX](0), batch_size, dim)
+        def rnn_step1(
+                # sequences
+                x, ri, zi,
+                # outputs_info
+                h):
+            pre_r = ri + h.dot(U)
+            pre_z = zi + h.dot(V)
+            r = T.nnet.sigmoid(pre_r)
+            z = T.nnet.sigmoid(pre_z)
+
+            after_r = r * h
+            pre_h = x + after_r.dot(W)
+            new_h = T.tanh(pre_h)
+
+            res_h = z * new_h + (1 - z) * h
+            return res_h
+
+
+        # Compile the function twice, once with the optimization and once
+        # without
+        opt_mode = mode.including("scan")
+        h, _ = theano.scan(rnn_step1, sequences=[x, ri, zi], n_steps=seq_len,
+                           outputs_info=init, name='fpass1', mode=opt_mode)
+        cost = h[-1].sum()
+        grad1 = T.grad(cost, [U, V, W])
+        f_opt = theano.function(inputs=[x, ri, zi], outputs=grad1,
+                                mode=opt_mode)
+
+        no_opt_mode = mode.excluding("scanOp_pushout_output")
+        h, _ = theano.scan(rnn_step1, sequences=[x, ri, zi], n_steps=seq_len,
+                outputs_info=init, name='fpass1', mode=no_opt_mode)
+        cost = h[-1].sum()
+        grad1 = T.grad(cost, [U, V, W])
+        f_no_opt = theano.function(inputs=[x, ri, zi], outputs=grad1,
+                                   mode=no_opt_mode)
+
+        # Validate that the optimization has been applied
+        scan_node_grad = [node for node in f_opt.maker.fgraph.toposort()
+                     if isinstance(node.op, Scan)][1]
+
+        for output in scan_node_grad.op.outputs:
+            assert not (isinstance(output.owner.op, T.elemwise.Elemwise) and
+                        any([isinstance(i, T.Dot) for i
+                             in output.owner.inputs]))
+
+        # Compare the outputs of the two functions on the same input data.
+        f_opt_output = f_opt(x_value, ri_value, zi_value)
+        f_no_opt_output = f_no_opt(x_value, ri_value, zi_value)
+        utt.assert_allclose(f_opt_output, f_no_opt_output)
+
+    def test_non_zero_init(self):
+        """
+        Test the case where the initial value for the sitsot output is
+        non-zero
+        """
+
+        input1 = T.tensor3()
+        input2 = T.tensor3()
+        input3 = T.tensor3()
+
+        W = theano.shared(numpy.random.normal(size=(4, 5))).astype(config.floatX)
+        U = theano.shared(numpy.random.normal(size=(6, 7))).astype(config.floatX)
+
+        def inner_fct(seq1, seq2, seq3, previous_output):
+            temp1 = T.dot(seq1, W) + seq3
+            temp2 = T.dot(seq2, U)
+            dot_output = T.dot(temp1, temp2)
+            return previous_output + dot_output
+
+        init = T.as_tensor_variable(numpy.random.normal(size=(3,7)))
+
+        # Compile the function twice, once with the optimization and once
+        # without
+        opt_mode = mode.including("scan")
+        h, _ = theano.scan(inner_fct,
+                sequences=[input1, input2, input3],
+                outputs_info=init,
+                mode=opt_mode)
+        output = h[-1]
+        f_opt = theano.function([input1, input2, input3], output,
+                                mode=opt_mode)
+
+        no_opt_mode = mode.excluding("scanOp_pushout_output")
+        h, _ = theano.scan(inner_fct,
+                sequences=[input1, input2, input3],
+                outputs_info=init,
+                mode=no_opt_mode)
+        output = h[-1]
+        f_no_opt = theano.function([input1, input2, input3], output,
+                                    mode=no_opt_mode)
+
+        # Ensure that the optimization has been applied for f_opt
+        # TODO
+
+        # Compare the outputs of the 2 functions
+        input1_value = numpy.random.random((2, 3, 4)).astype(config.floatX)
+        input2_value = numpy.random.random((2, 5, 6)).astype(config.floatX)
+        input3_value = numpy.random.random((2, 3, 5)).astype(config.floatX)
+
+        output_opt = f_opt(input1_value, input2_value, input3_value)
+        output_no_opt = f_no_opt(input1_value, input2_value, input3_value)
+
+        utt.assert_allclose(output_opt, output_no_opt)