Commit da3c8070 authored by Frédéric Bastien

Merge pull request #3367 from carriepl/scan_mitmot_prealloc

Scan mitmot prealloc
@@ -268,7 +268,6 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
                           "input.")
     # compute some features of the arguments:
-    uses_In = any([isinstance(i, In) for i in inputs])
     uses_tuple = any([isinstance(i, (list, tuple)) for i in inputs])
     uses_updates = bool(updates)
     uses_givens = bool(givens)
@@ -280,7 +279,7 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
                 (hasattr(i, 'mutable') and i.mutable))):
             check_for_aliased_inputs = True
-    if uses_In or uses_tuple:
+    if uses_tuple:
         # we must use old semantics in this case.
         if profile:
             raise NotImplementedError("profiling not supported in old-style "
......
@@ -159,10 +159,22 @@ def std_fgraph(input_specs, output_specs, accept_inplace=False):
     """
     orig_inputs = [spec.variable for spec in input_specs]
-    updates = [spec.update for spec in input_specs if spec.update]
+
+    # Extract the updates and the mapping between update outputs and
+    # the updated inputs.
+    updates = []
+    update_mapping = {}
+    out_idx = len(output_specs)
+    for inp_idx in range(len(input_specs)):
+        if input_specs[inp_idx].update:
+            updates.append(input_specs[inp_idx].update)
+            update_mapping[out_idx] = inp_idx
+            out_idx += 1
+
     orig_outputs = [spec.variable for spec in output_specs] + updates

-    fgraph = gof.fg.FunctionGraph(orig_inputs, orig_outputs)
+    fgraph = gof.fg.FunctionGraph(orig_inputs, orig_outputs,
+                                  update_mapping=update_mapping)

     for node in fgraph.apply_nodes:
         if getattr(node.op, 'destroy_map', None):
......
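A toy rederivation of the mapping built above (stand-in values, not the real specs): with one declared output and an update on the input at index 1, the update expression becomes fgraph output 1 and the mapping records which input it refreshes.

    # Sketch: update_mapping maps fgraph output index -> updated input index.
    output_specs = ['out0']             # one declared output (stand-in)
    updates_of = {1: 'inp1_update'}     # input 1 carries an update (stand-in)

    updates = []
    update_mapping = {}
    out_idx = len(output_specs)
    for inp_idx in range(3):            # pretend there are 3 inputs
        if inp_idx in updates_of:
            updates.append(updates_of[inp_idx])
            update_mapping[out_idx] = inp_idx
            out_idx += 1

    assert update_mapping == {1: 1}     # output 1 is the new value of input 1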
@@ -69,6 +69,13 @@ class SymbolicInput(object):
         if self.name is not None and not isinstance(self.name, string_types):
             raise TypeError("name must be a string! (got: %s)" % self.name)
         self.update = update
+        if update is not None:
+            if not variable.type == update.type:
+                raise TypeError("Variable '%s' has type %s but an update of "
+                                "type %s. The type of the update should be "
+                                "the same as the type of the variable" %
+                                (variable, variable.type, update.type))
+
         if (mutable is not None):
             self.mutable = mutable
         else:
......
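The new check in action (a small sketch mirroring test_in_update_wrong_dtype further down):

    import theano
    from theano.compile.io import In

    a = theano.tensor.dscalar('a')
    b = theano.tensor.dvector('b')
    try:
        In(a, update=b)      # dvector update on a dscalar input
    except TypeError as e:
        print(e)             # the update's type must match the variable's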
@@ -161,18 +161,17 @@ class AddDestroyHandler(gof.Optimizer):
         fgraph.attach_feature(gof.DestroyHandler())


-class AddNoOutputFromInplace(gof.Optimizer):
+class AddFeatureOptimizer(gof.Optimizer):
     """
-    This optimizer adds to the fgraph a feature that will prevent outputs
-    of a fgraph to be created by performing inplace operations on intermediary
-    variables. This is useful when the outputs of the fgraph are preallocated
-    to prevent useless copying of the data. Currently, scan preallocates its
-    outputs.
+    This optimizer adds a provided feature to the function graph.
     """

+    def __init__(self, feature):
+        self.feature = feature
+
     def add_requirements(self, fgraph):
-        super(AddNoOutputFromInplace, self).add_requirements(fgraph)
-        fgraph.attach_feature(gof.NoOutputFromInplace())
+        super(AddFeatureOptimizer, self).add_requirements(fgraph)
+        fgraph.attach_feature(self.feature)


 class PrintCurrentFunctionGraph(gof.Optimizer):
@@ -229,9 +228,6 @@ optdb.register('specialize_device', gof.EquilibriumDB(),
 optdb.register('merge2', gof.MergeOptimizer(),
                49, 'fast_run', 'merge')

-optdb.register('add_no_output_from_inplace', AddNoOutputFromInplace(),
-               49.4)
-
 optdb.register('add_destroy_handler', AddDestroyHandler(),
                49.5, 'fast_run', 'inplace')
@@ -321,19 +317,44 @@ class Mode(object):
                                              self.provided_optimizer)
         # N.B. opt might be a Query instance, not sure what else it might be...
         #     string? Optimizer? OptDB? who knows???
-        return self.__class__(linker=link, optimizer=opt.including(*tags))
+        return self.clone(optimizer=opt.including(*tags))
+
+    def register(self, *optimizations):
+        """Adds new optimization instances to a mode.
+
+        This method adds new optimization instances to a compilation mode. It
+        works like the `including()` method but takes as inputs optimization
+        instances to add instead of tags.
+
+        Parameters
+        ----------
+        optimizations :
+            Every element of `optimizations` is a tuple containing an
+            optimization instance and a floating point value indicating the
+            position at which to insert the optimization in the mode.
+
+        Returns
+        -------
+        Mode
+            Copy of the current Mode which includes the provided
+            optimizations.
+        """
+        link, opt = self.get_linker_optimizer(self.provided_linker,
+                                              self.provided_optimizer)
+        return self.clone(optimizer=opt.register(*optimizations))

     def excluding(self, *tags):
         link, opt = self.get_linker_optimizer(self.provided_linker,
                                               self.provided_optimizer)
-        return self.__class__(linker=link, optimizer=opt.excluding(*tags))
+        return self.clone(optimizer=opt.excluding(*tags))

     def requiring(self, *tags):
         link, opt = self.get_linker_optimizer(self.provided_linker,
                                               self.provided_optimizer)
-        return self.__class__(linker=link, optimizer=opt.requiring(*tags))
+        return self.clone(optimizer=opt.requiring(*tags))

-    def clone(self, link_kwargs=None, **kwargs):
+    def clone(self, link_kwargs=None, optimizer="", **kwargs):
         """
         Create a new instance of this Mode.
@@ -342,10 +363,14 @@ class Mode(object):
         arguments.

         """
+        if link_kwargs is None:
+            link_kwargs = {}
         new_linker = self.linker.clone(**link_kwargs)
-        new_optimizer = self.provided_optimizer
+
+        if optimizer == "":
+            optimizer = self.provided_optimizer
         new_mode = type(self)(linker=new_linker,
-                              optimizer=new_optimizer)
+                              optimizer=optimizer)
         return new_mode
......
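A quick sketch of the new API (this is the exact pattern the updated test below uses; 49.9 places the optimization just ahead of the inplace optimizations registered at position 50):

    from theano.compile.mode import Mode, AddFeatureOptimizer
    from theano.gof.toolbox import NoOutputFromInplace

    opt = AddFeatureOptimizer(NoOutputFromInplace())
    mode = Mode(linker="cvm", optimizer="fast_run").register((opt, 49.9))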
@@ -74,25 +74,7 @@ class MonitorMode(Mode):
         if self.post_func is not None:
             self.post_func(i, node, fn)

-    def including(self, *tags):
-        ret = super(MonitorMode, self).including(*tags)
-        ret.pre_func = self.pre_func
-        ret.post_func = self.post_func
-        return ret
-
-    def excluding(self, *tags):
-        ret = super(MonitorMode, self).excluding(*tags)
-        ret.pre_func = self.pre_func
-        ret.post_func = self.post_func
-        return ret
-
-    def requiring(self, *tags):
-        ret = super(MonitorMode, self).requiring(*tags)
-        ret.pre_func = self.pre_func
-        ret.post_func = self.post_func
-        return ret
-
-    def clone(self, link_kwargs=None, **kwargs):
+    def clone(self, link_kwargs=None, optimizer="", **kwargs):
         """
         Create a new instance of this Mode.
@@ -100,10 +82,12 @@ class MonitorMode(Mode):
         ignored, because ProfileMode needs to use its own linker.

         """
+        if optimizer == "":
+            optimizer = self.provided_optimizer
         new_mode = type(self)(pre_func=self.pre_func,
                               post_func=self.post_func,
                               linker=None,
-                              optimizer=self.provided_optimizer)
+                              optimizer=optimizer)
         return new_mode
......
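Since `including`, `excluding` and `requiring` now all route through `clone()`, the monitoring callbacks survive those calls without the removed boilerplate (a small sketch, assuming the usual `theano.compile.monitormode` import path):

    from theano.compile.monitormode import MonitorMode

    def print_it(i, node, fn):
        print(i, node)

    mode = MonitorMode(post_func=print_it)
    mode2 = mode.excluding('inplace')
    assert mode2.post_func is print_it  # preserved through clone()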
@@ -478,7 +478,19 @@ def pfunc(params, outputs=None, mode=None, updates=None, givens=None,
                                  'theano.clone(f(x), replace={x: g(x)}))`.'
                                  % x)

-    output_vars = rebuild_collect_shared(outputs,
+    # Extend the outputs with the updates on input variables so they are also
+    # cloned
+    additional_outputs = [i.update for i in inputs if i.update]
+    if outputs is None:
+        out_list = []
+    else:
+        if isinstance(outputs, (list, tuple)):
+            out_list = list(outputs)
+        else:
+            out_list = [outputs]
+    extended_outputs = out_list + additional_outputs
+
+    output_vars = rebuild_collect_shared(extended_outputs,
                                          in_variables,
                                          replace=givens,
                                          updates=updates,
@@ -486,12 +498,25 @@ def pfunc(params, outputs=None, mode=None, updates=None, givens=None,
                                          copy_inputs_over=True,
                                          no_default_updates=no_default_updates)
     # extracting the arguments
-    input_variables, cloned_outputs, other_stuff = output_vars
+    input_variables, cloned_extended_outputs, other_stuff = output_vars
     clone_d, update_d, update_expr, shared_inputs = other_stuff

+    # Recover only the clones of the original outputs
+    if outputs is None:
+        cloned_outputs = []
+    else:
+        if isinstance(outputs, (list, tuple)):
+            cloned_outputs = cloned_extended_outputs[:len(outputs)]
+        else:
+            cloned_outputs = cloned_extended_outputs[0]
+
     for i, iv in zip(inputs, input_variables):
         i.variable = iv

+        # If needed, replace the input's update by its cloned equivalent
+        if i.update:
+            i.update = clone_d[i.update]
+
     for sv in shared_inputs:
         # pass value of None
         # value will be stored in the resulting functions' defaults
@@ -526,6 +551,8 @@ def _pfunc_param_to_in(param, strict=False, allow_downcast=None):
                   borrow=param.borrow,
                   allow_downcast=param.allow_downcast,
                   implicit=param.implicit)
+    elif isinstance(param, In):
+        return param
     raise TypeError('Unknown parameter type: %s' % type(param))
......
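End to end, the cloned updates give `In` inputs persistent state (this mirrors `test_in_update` below):

    import theano
    from theano.compile.io import In

    a = theano.tensor.dscalar('a')
    f = theano.function([In(a, value=0.0, update=a + 1)], a)
    assert f() == 0.0
    assert f() == 1.0  # the input's state was updated by the previous call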
@@ -2,10 +2,12 @@ import six.moves.cPickle as pickle
 import os
 import shutil
 import tempfile
+import unittest

 import numpy

 import theano
+from theano.compile.io import In


 def test_function_dump():
@@ -26,3 +28,167 @@ def test_function_dump():
     fct2 = theano.function(**l)
     x = [1, 2, 3]
     assert numpy.allclose(fct1(x), fct2(x))
class TestFunctionIn(unittest.TestCase):
def test_in_strict(self):
a = theano.tensor.dvector()
b = theano.shared(7)
out = a + b
f = theano.function([In(a, strict=False)], out)
# works, rand generates float64 by default
f(numpy.random.rand(8))
# works, casting is allowed
f(numpy.array([1, 2, 3, 4], dtype='int32'))
f = theano.function([In(a, strict=True)], out)
try:
# fails, f expects float64
f(numpy.array([1, 2, 3, 4], dtype='int32'))
except TypeError:
pass
def test_explicit_shared_input(self):
# This is not a test of the In class per se, but the In class relies
# on the fact that shared variables cannot be explicit inputs
a = theano.shared(1.0)
self.assertRaises(TypeError, theano.function, [a], a + 1)
def test_in_shared_variable(self):
        # Ensure that an error is raised if In is used to wrap
        # a shared variable
a = theano.shared(1.0)
a_wrapped = In(a, update=a + 1)
self.assertRaises(TypeError, theano.function, [a_wrapped])
def test_in_mutable(self):
a = theano.tensor.dvector()
a_out = a * 2 # assuming the op which makes this "in place" triggers
# using mutable=True will let f change the value in aval
f = theano.function([In(a, mutable=True)], a_out, mode='FAST_RUN')
aval = numpy.random.rand(10)
aval2 = aval.copy()
assert numpy.all(f(aval) == (aval2 * 2))
assert not numpy.all(aval == aval2)
# using mutable=False should leave the input untouched
f = theano.function([In(a, mutable=False)], a_out, mode='FAST_RUN')
aval = numpy.random.rand(10)
aval2 = aval.copy()
assert numpy.all(f(aval) == (aval2 * 2))
assert numpy.all(aval == aval2)
def test_in_update(self):
a = theano.tensor.dscalar('a')
f = theano.function([In(a, value=0.0, update=a + 1)], a,
mode='FAST_RUN')
# Ensure that, through the executions of the function, the state of the
# input is persistent and is updated as it should
assert f() == 0.0
assert f() == 1.0
assert f() == 2.0
def test_in_update_wrong_dtype(self):
        # Ensure that an error is raised if an In-wrapped variable has
# an update of a different type
a = theano.tensor.dscalar('a')
b = theano.tensor.dvector('b')
self.assertRaises(TypeError, In, a, update=b)
def test_in_update_shared(self):
# Test that using both In() with updates and shared variables with
# updates in the same function behaves as expected
shared_var = theano.shared(1.0)
a = theano.tensor.dscalar('a')
a_wrapped = In(a, value=0.0, update=shared_var)
f = theano.function([a_wrapped], [], updates={shared_var: a},
mode='FAST_RUN')
# Ensure that, through the executions of the function, the state of
# the input and the shared variable are appropriate (after N execution,
# the values have swapped N times). This allows testing that the
# changes occur at the same time and one doesn't overwrite the other.
for i in range(5):
f()
assert numpy.allclose(shared_var.get_value(), i % 2)
def test_in_allow_downcast_int(self):
a = theano.tensor.wvector('a') # int16
b = theano.tensor.bvector('b') # int8
c = theano.tensor.bscalar('c') # int8
f = theano.function([In(a, allow_downcast=True),
In(b, allow_downcast=False),
In(c, allow_downcast=None)],
(a + b + c))
# Both values are in range. Since they're not ndarrays (but lists),
# they will be converted, and their value checked.
assert numpy.all(f([3], [6], 1) == 10)
# Values are in range, but a dtype too large has explicitly been given
# For performance reasons, no check of the data is explicitly performed
# (It might be OK to change this in the future.)
self.assertRaises(TypeError, f, [3], numpy.array([6], dtype='int16'),
1)
# Value too big for a, silently ignored
assert numpy.all(f([2 ** 20], numpy.ones(1, dtype='int8'), 1) == 2)
# Value too big for b, raises TypeError
self.assertRaises(TypeError, f, [3], [312], 1)
# Value too big for c, raises TypeError
self.assertRaises(TypeError, f, [3], [6], 806)
def test_in_allow_downcast_floatX(self):
a = theano.tensor.fscalar('a')
b = theano.tensor.fscalar('b')
c = theano.tensor.fscalar('c')
f = theano.function([In(a, allow_downcast=True),
In(b, allow_downcast=False),
In(c, allow_downcast=None)],
(a + b + c))
# If the values can be accurately represented, everything is OK
assert numpy.all(f(0, 0, 0) == 0)
# If allow_downcast is True, idem
assert numpy.allclose(f(0.1, 0, 0), 0.1)
# If allow_downcast is False, nope
self.assertRaises(TypeError, f, 0, 0.1, 0)
# If allow_downcast is None, it should work iff floatX=float32
if theano.config.floatX == 'float32':
assert numpy.allclose(f(0, 0, 0.1), 0.1)
else:
self.assertRaises(TypeError, f, 0, 0, 0.1)
def test_in_allow_downcast_vector_floatX(self):
a = theano.tensor.fvector('a')
b = theano.tensor.fvector('b')
c = theano.tensor.fvector('c')
f = theano.function([In(a, allow_downcast=True),
In(b, allow_downcast=False),
In(c, allow_downcast=None)],
(a + b + c))
# If the values can be accurately represented, everything is OK
z = [0]
assert numpy.all(f(z, z, z) == 0)
# If allow_downcast is True, idem
assert numpy.allclose(f([0.1], z, z), 0.1)
# If allow_downcast is False, nope
self.assertRaises(TypeError, f, z, [0.1], z)
# If allow_downcast is None, like False
self.assertRaises(TypeError, f, z, z, [0.1])
 import theano
-from theano.compile.mode import Mode
+from theano.compile.mode import Mode, AddFeatureOptimizer
+from theano.gof.toolbox import NoOutputFromInplace
 import theano.tensor as T
@@ -18,8 +19,8 @@ def test_no_output_from_implace():

     # Ensure that the elemwise op that produces the output is not inplace when
     # using a mode that includes the optimization
-    mode_opt = Mode(linker="cvm", optimizer="fast_run")
-    mode_opt = mode_opt.including("add_no_output_from_inplace")
+    opt = AddFeatureOptimizer(NoOutputFromInplace())
+    mode_opt = Mode(linker="cvm", optimizer="fast_run").register((opt, 49.9))

     fct_opt = theano.function([x, y], b, mode=mode_opt)
     op = fct_opt.maker.fgraph.outputs[0].owner.op
......
@@ -109,7 +109,25 @@ class FunctionGraph(utils.object2):
     """

-    def __init__(self, inputs, outputs, features=None, clone=True):
+    def __init__(self, inputs, outputs, features=None, clone=True,
+                 update_mapping=None):
+        """
+        Create a FunctionGraph which operates on the subgraph bounded by the
+        inputs and outputs sets.
+
+        Parameters
+        ----------
+        inputs : list of variables
+            Input nodes of the graph, usually declared by the user.
+        outputs : list of variables
+            Output nodes of the graph.
+        clone : boolean
+            If true, we will clone the graph. This is useful to remove the
+            constant cache problem.
+        update_mapping : dictionary
+            Mapping between the inputs with updates and the outputs
+            corresponding to their updates.
+        """
         if clone:
             inputs, outputs = graph.clone(inputs, outputs)
@@ -157,6 +175,7 @@ class FunctionGraph(utils.object2):
         self.node_locks = {}
         self.variable_locks = {}
         self.profile = None
+        self.update_mapping = update_mapping

     def add_input(self, input):
         if input not in self.inputs:
......
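A direct construction showing where the mapping ends up (a sketch; in practice `std_fgraph` derives it from the `In` specs). Output 1 here is declared to be the new value of input 0:

    import theano.tensor as T
    from theano.gof.fg import FunctionGraph

    x = T.dscalar('x')
    fg = FunctionGraph([x], [x * 2, x + 1], update_mapping={1: 0})
    assert fg.update_mapping == {1: 0}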
 from __future__ import print_function
+import copy
 import sys

 import numpy
@@ -117,12 +118,16 @@ multiple time in a DB. Tryed to register "%s" again under the new name "%s".
         add = OrderedSet()
         for obj in variables:
             if isinstance(obj, DB):
-                sq = q.subquery.get(obj.name, q)
-                if sq:
-                    replacement = obj.query(sq)
-                    replacement.name = obj.name
-                    remove.add(obj)
-                    add.add(replacement)
+                def_sub_query = q
+                if q.extra_optimizations:
+                    def_sub_query = copy.copy(q)
+                    def_sub_query.extra_optimizations = []
+                sq = q.subquery.get(obj.name, def_sub_query)
+
+                replacement = obj.query(sq)
+                replacement.name = obj.name
+                remove.add(obj)
+                add.add(replacement)
         variables.difference_update(remove)
         variables.update(add)
         return variables
@@ -173,12 +178,16 @@ class Query(object):
     """

     def __init__(self, include, require=None, exclude=None,
-                 subquery=None, position_cutoff=None):
+                 subquery=None, position_cutoff=None,
+                 extra_optimizations=None):
         self.include = OrderedSet(include)
         self.require = require or OrderedSet()
         self.exclude = exclude or OrderedSet()
         self.subquery = subquery or {}
         self.position_cutoff = position_cutoff
+        if extra_optimizations is None:
+            extra_optimizations = []
+        self.extra_optimizations = extra_optimizations
         if isinstance(self.require, (list, tuple)):
             self.require = OrderedSet(self.require)
         if isinstance(self.exclude, (list, tuple)):
@@ -186,9 +195,14 @@ class Query(object):
     def __str__(self):
         return ("Query{inc=%s,ex=%s,require=%s,subquery=%s,"
-                "position_cutoff=%d}" %
+                "position_cutoff=%d,extra_opts=%s}" %
                 (self.include, self.exclude, self.require, self.subquery,
-                 self.position_cutoff))
+                 self.position_cutoff, self.extra_optimizations))
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        if not hasattr(self, 'extra_optimizations'):
+            self.extra_optimizations = []

     # add all opt with this tag
     def including(self, *tags):
@@ -196,7 +210,8 @@ class Query(object):
                      self.require,
                      self.exclude,
                      self.subquery,
-                     self.position_cutoff)
+                     self.position_cutoff,
+                     self.extra_optimizations)

     # remove all opt with this tag
     def excluding(self, *tags):
@@ -204,7 +219,8 @@ class Query(object):
                      self.require,
                      self.exclude.union(tags),
                      self.subquery,
-                     self.position_cutoff)
+                     self.position_cutoff,
+                     self.extra_optimizations)

     # keep only opt with this tag.
     def requiring(self, *tags):
@@ -212,7 +228,16 @@ class Query(object):
                      self.require.union(tags),
                      self.exclude,
                      self.subquery,
-                     self.position_cutoff)
+                     self.position_cutoff,
+                     self.extra_optimizations)
+
+    def register(self, *optimizations):
+        return Query(self.include,
+                     self.require,
+                     self.exclude,
+                     self.subquery,
+                     self.position_cutoff,
+                     self.extra_optimizations + list(optimizations))
 class EquilibriumDB(DB):
@@ -242,8 +267,6 @@ class EquilibriumDB(DB):
         self.__final__ = {}

     def register(self, name, obj, *tags, **kwtags):
-        # if name == 'cut_gpua_constant_transfers':
-        #     import ipdb;ipdb.set_trace()
         if 'final_opt' in kwtags:
             final_opt = kwtags['final_opt']
             kwtags.pop('final_opt', None)
@@ -306,19 +329,33 @@ class SequenceDB(DB):
         position_cutoff = kwtags.pop('position_cutoff',
                                      config.optdb.position_cutoff)
+        position_dict = self.__position__

         if len(tags) >= 1 and isinstance(tags[0], Query):
             # the call to super should have raise an error with a good message
             assert len(tags) == 1
             if getattr(tags[0], 'position_cutoff', None):
                 position_cutoff = tags[0].position_cutoff
-        opts = [o for o in opts if self.__position__[o.name] < position_cutoff]
-        # We want to sort by position and then if collision by name
-        # for deterministic optimization. Since Python 2.2, sort is
-        # stable, so sort by name first, then by position. This give
-        # the order we want.
-        opts.sort(key=lambda obj: obj.name)
-        opts.sort(key=lambda obj: self.__position__[obj.name])
+
+            # The Query instance might contain extra optimizations which need
+            # to be added to the sequence of optimizations (don't alter the
+            # original dictionary)
+            if len(tags[0].extra_optimizations) > 0:
+                position_dict = position_dict.copy()
+
+                for extra_opt in tags[0].extra_optimizations:
+                    # Give a name to the extra optimization (include both the
+                    # class name for descriptiveness and id to avoid name
+                    # collisions)
+                    opt, position = extra_opt
+                    opt.name = "%s_%i" % (opt.__class__, id(opt))
+
+                    # Add the extra optimization to the optimization sequence
+                    if position < position_cutoff:
+                        opts.add(opt)
+                        position_dict[opt.name] = position
+
+        opts = [o for o in opts if position_dict[o.name] < position_cutoff]
+        opts.sort(key=lambda obj: (position_dict[obj.name], obj.name))

         kwargs = {}
         if self.failure_callback:
             kwargs["failure_callback"] = self.failure_callback
......
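Putting the two pieces together (a sketch; `OPT_FAST_RUN` is the Query behind the fast_run optimizer and `optdb` the global SequenceDB, both defined in theano.compile.mode): `Query.register()` records the (optimizer, position) pairs, and `SequenceDB.query()` splices them into the returned sequence at the requested positions.

    from theano.compile.mode import optdb, OPT_FAST_RUN, AddFeatureOptimizer
    from theano.gof.toolbox import NoOutputFromInplace

    q = OPT_FAST_RUN.register((AddFeatureOptimizer(NoOutputFromInplace()), 49.9))
    assert len(q.extra_optimizations) == 1
    seq_opt = optdb.query(q)  # the extra optimization is inserted at 49.9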
@@ -440,10 +440,18 @@ class PreserveNames(Feature):

 class NoOutputFromInplace(Feature):

+    def __init__(self, first_output_idx=0, last_output_idx=None):
+        self.first_idx = first_output_idx
+        self.last_idx = last_output_idx
+
     def validate(self, fgraph):
         if not hasattr(fgraph, 'destroyers'):
             return True
-        for out in list(fgraph.outputs):
+
+        outputs_to_validate = list(fgraph.outputs)[self.first_idx:
+                                                   self.last_idx]
+
+        for out in outputs_to_validate:
             if out.owner is None:
                 continue
......
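The feature is now ranged; Scan uses this to shield only its mitsot/sitsot/nitsot outputs while the wrapped mitmot updates remain free to be inplace (the indices below are illustrative):

    from theano.gof.toolbox import NoOutputFromInplace

    # Outputs 2, 3 and 4 of the fgraph may not be produced by inplace ops;
    # everything outside [2, 5) is unconstrained.
    feature = NoOutputFromInplace(first_output_idx=2, last_output_idx=5)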
The following hunks update a stored patch against the Cython-generated C code. Only the hunk offsets (@@ -5808 → @@ -6667, @@ -7337 → @@ -8237, @@ -7346 → @@ -8246, @@ -7376 → @@ -8285, @@ -7400 → @@ -8307) and a renumbered temporary (__pyx_t_4 → __pyx_t_3) change because the C file was regenerated; the substance of the patch — replacing direct struct access with the NumPy API — is unchanged:

 - __pyx_t_3 = ((PyObject *)__pyx_v_self->descr);
 + __pyx_t_3 = ((PyObject *)PyArray_DESCR(__pyx_v_self));
 - Py_XDECREF(__pyx_v_arr->base);
 + Py_XDECREF(PyArray_BASE(__pyx_v_arr));
 - __pyx_v_arr->base = __pyx_v_baseptr;
 + #else
 + PyArray_SetBaseObject(__pyx_v_arr, __pyx_v_baseptr);
 + #endif
 - __pyx_t_1 = ((__pyx_v_arr->base == NULL) != 0);
 + __pyx_t_1 = ((PyArray_BASE(__pyx_v_arr) == NULL) != 0);
......
@@ -56,6 +56,7 @@ __authors__ = ("Razvan Pascanu "
 __copyright__ = "(c) 2010, Universite de Montreal"
 __contact__ = "Razvan Pascanu <r.pascanu@gmail>"

+import copy
 import itertools
 import logging
 import time
@@ -66,10 +67,12 @@ from six.moves import xrange

 import theano
 from theano.compat import exc_message
-from theano.compile import function, Param, Out
+from theano.compile import function, In, Param, Out
+from theano.compile.mode import AddFeatureOptimizer
 from theano import compile, config, gradient, gof, tensor
 from theano.gof import PureOp, Apply
 from theano.gof.graph import io_connection_pattern
+from theano.gof.toolbox import NoOutputFromInplace
 from theano.compat import OrderedDict, izip
 from theano.tensor import TensorType
 from theano.tensor.opt import Shape_i
@@ -193,16 +196,6 @@ class Scan(PureOp):
                 link_kwargs=dict(allow_gc=self.allow_gc),
                 message=message)

-        # Now that scan has its mode instance, if memory pre-allocation is
-        # activated for the outputs, we activate the optimization
-        # add_no_output_from_inplace in this mode instance. This will prevent
-        # Scan from producing outputs by means of inplace operations and
-        # therefore allow it to pre-allocate memory storage for the outputs,
-        # avoiding needless copies.
-        if theano.config.scan.allow_output_prealloc:
-            self.mode_instance = self.mode_instance.including(
-                "add_no_output_from_inplace")
-
         if not hasattr(self, 'name') or self.name is None:
             self.name = 'scan_fn'
         # to have a fair __eq__ comparison later on, we update the info with
@@ -319,6 +312,12 @@ class Scan(PureOp):
         # Generate the mappings between inner and outer inputs and outputs
         # if they haven't already been generated.
         self.var_mappings = self.get_oinp_iinp_iout_oout_mappings()

+        if (hasattr(self, 'fn') and
+                not hasattr(self, 'thunk_mit_mot_out_slices')):
+            # The thunk has been compiled before the mit_mot preallocation
+            # feature was implemented. Mark every mit_mot output tap as not
+            # having been preallocated
+            self.mitmots_preallocated = [False] * self.n_mit_mot_outs
+
         # Ensure that the graph associated with the inner function is valid.
         self.validate_inner_graph()
@@ -746,17 +745,91 @@ class Scan(PureOp):
                   self.n_mit_sot +
                   self.n_sit_sot +
                   self.n_nit_sot)

         if theano.config.scan.allow_output_prealloc:
-            wrapped_inputs = [Param(x, borrow=False) for x in
-                              self.inputs]
+
+            # Go through the mitmots. Whenever a mitmot has a tap both as an
+            # input and an output, wrap the input such that the corresponding
+            # output variable becomes an update to be performed on it, possibly
+            # inplace at the end of the function's execution.
+            wrapped_inputs = [In(x, borrow=False)
+                              for x in self.inputs[:self.n_seqs]]
+            new_outputs = [x for x in self.outputs]
+            preallocated_mitmot_outs = []
+            new_mit_mot_out_slices = copy.deepcopy(self.mit_mot_out_slices)
+
+            input_idx = self.n_seqs
+            for mitmot_idx in range(self.n_mit_mot):
+                for inp_tap in self.tap_array[mitmot_idx]:
+                    if inp_tap in self.mit_mot_out_slices[mitmot_idx]:
+                        # Figure out the index of the corresponding output
+                        output_idx = sum([len(m) for m in
+                                          self.mit_mot_out_slices[:mitmot_idx]])
+                        output_idx += self.mit_mot_out_slices[mitmot_idx].index(inp_tap)
+
+                        # Make it so the input is automatically updated to the
+                        # output value, possibly inplace, at the end of the
+                        # function execution
+                        wrapped_inp = In(variable=self.inputs[input_idx],
+                                         update=self.outputs[output_idx])
+                        wrapped_inputs.append(wrapped_inp)
+                        preallocated_mitmot_outs.append(output_idx)
+                        new_mit_mot_out_slices[mitmot_idx].remove(inp_tap)
+                    else:
+                        # Wrap the corresponding input as usual. Leave the
+                        # output as-is.
+                        wrapped_inputs.append(In(self.inputs[input_idx],
+                                                 borrow=False))
+                    input_idx += 1
+
+            # Wrap the inputs not associated to mitmots and wrap the remaining
+            # outputs
+            wrapped_inputs += [In(x, borrow=False) for x in
+                               self.inputs[input_idx:]]
             wrapped_outputs = [Out(x, borrow=True) for x in
-                               self.outputs[:slices]]
+                               new_outputs[:slices]]
+            wrapped_outputs += new_outputs[slices:]
+
+            # Remove now useless outputs from the output list (start from the
+            # end to avoid altering the indices of the other outputs to be
+            # deleted)
+            preallocated_mitmot_outs.sort()
+            for p in preallocated_mitmot_outs[::-1]:
+                del wrapped_outputs[p]
+
+            # Store the list of mitmot output taps that have been altered
+            # so they can be preallocated
+            self.mitmots_preallocated = [i in preallocated_mitmot_outs
+                                         for i in range(self.n_mit_mot_outs)]
+
+            # Add an optimization to the compilation mode to attach a feature
+            # to the function graph just before the inplace optimizations are
+            # applied (inplace optimizations start at position 50 so the
+            # optimization to attach the feature is registered at position 49.9
+            # so that it runs before them). This feature will prevent mitsot,
+            # sitsot and nitsot outputs from being computed inplace (to allow
+            # their preallocation).
+            mitsot_start = self.n_mit_mot_outs - len(preallocated_mitmot_outs)
+            nitsot_end = (mitsot_start + self.n_mit_sot + self.n_sit_sot +
+                          self.n_nit_sot)
+            feature = NoOutputFromInplace(mitsot_start, nitsot_end)
+            opt = AddFeatureOptimizer(feature)
+            compilation_mode = self.mode_instance.register((opt, 49.9))
+
         else:
+            # Output preallocation is not activated. Mark every mitmot output
+            # tap as not being preallocated
+            self.mitmots_preallocated = [False] * self.n_mit_mot_outs
+
             wrapped_inputs = [Param(x, borrow=True) for x in
                               self.inputs]
             wrapped_outputs = [Out(x, borrow=False) for x in
                                self.outputs[:slices]]
-        wrapped_outputs += self.outputs[slices:]
+            wrapped_outputs += self.outputs[slices:]
+            compilation_mode = self.mode_instance

         profile = None
         if (theano.config.profile or
             (isinstance(self.profile, (string_types, bool, int))
@@ -772,7 +845,7 @@ class Scan(PureOp):
         if not getattr(self, 'fn', None):
             self.fn = function(wrapped_inputs,
                                wrapped_outputs,
-                               mode=self.mode_instance,
+                               mode=compilation_mode,
                                name=self.name,
                                profile=profile,
                                on_unused_input='ignore')
@@ -810,6 +883,8 @@ class Scan(PureOp):
                                          dtype='int32')
         cython_vector_outs = numpy.asarray(self.vector_outs,
                                            dtype='int32')
+        cython_mitmots_preallocated = numpy.asarray(self.mitmots_preallocated,
+                                                    dtype='int32')

         if hasattr(self, 'destroy_map'):
             cython_destroy_map = [x in self.destroy_map
@@ -837,6 +912,7 @@ class Scan(PureOp):
                         cython_vector_outs,
                         cython_mit_mot_out_slices,
                         cython_mit_mot_out_nslices,
+                        cython_mitmots_preallocated,
                         self.fn.fn,
                         self.fn,
                         cython_destroy_map,
@@ -1099,6 +1175,9 @@ class Scan(PureOp):
         offset = self.nit_sot_arg_offset + self.n_nit_sot
         other_args = args[offset:]
         input_storage = self.fn.input_storage
+        old_input_storage = [None] * len(input_storage)
+        old_input_data = [None] * len(input_storage)
+        input_reused = [None] * len(input_storage)
         output_storage = self.fn.output_storage
         old_output_storage = [None] * len(output_storage)
         old_output_data = [None] * len(output_storage)
@@ -1151,11 +1230,13 @@ class Scan(PureOp):

             # 4. collecting slices where the output should be stored

             # 4.1. Collect slices for mitmots
+            offset = 0
             for idx in xrange(self.n_mit_mot_outs):
-                output_storage[idx].storage[0] = None
+                if not self.mitmots_preallocated[idx]:
+                    output_storage[offset].storage[0] = None
+                    offset += 1

             # 4.2. Collect slices for mitsots, sitsots and nitsots
-            offset = self.n_mit_mot_outs
             if i != 0:
                 for idx in xrange(self.n_outs + self.n_nit_sot -
                                   self.n_mit_mot):
@@ -1199,7 +1280,24 @@ class Scan(PureOp):
                 else:
                     old_output_data[idx] = None

-            # 5. compute outputs
+            # 4.6. Keep a reference to the variables (ndarrays, CudaNdarrays,
+            # etc) currently in the input_storage to be able to compare them
+            # with the content of the input_storage after the execution of the
+            # function. Also keep pointers to their data to be able to detect
+            # cases where outputs reused the allocated object but alter the
+            # memory region they refer to.
+            for idx in xrange(len(input_storage)):
+                var = input_storage[idx].storage[0]
+                old_input_storage[idx] = var
+                if hasattr(var, 'gpudata'):
+                    old_input_data[idx] = var.gpudata
+                elif hasattr(var, 'data'):
+                    old_input_data[idx] = var.data
+                else:
+                    old_input_data[idx] = None
+
+            # 5.1 compute outputs
             t0_fn = time.time()

             try:
try: try:
...@@ -1228,8 +1326,20 @@ class Scan(PureOp): ...@@ -1228,8 +1326,20 @@ class Scan(PureOp):
pdx = offset + self.n_shared_outs pdx = offset + self.n_shared_outs
cond = output_storage[pdx].storage[0] == 0 cond = output_storage[pdx].storage[0] == 0
# Check which of the pre-allocated outputs (if applicable) have # 5.2. By calling fn() directly instead of calling the theano
# been reused by the inner function # function, it is possible that the updates have not been
# performed. Perform the updates if needed.
offset_out = len(output_storage) - 1
if getattr(fn, 'need_update_inputs', True):
# Update the inputs that have an update function
for inp, storage in zip(self.fn.maker.expanded_inputs[::-1],
self.fn.input_storage[::-1]):
if inp.update is not None:
storage.data = output_storage[offset_out].data
offset_out -= 1
# 5.3. Check which of the pre-allocated outputs (if applicable)
# have been reused by the inner function
for idx in xrange(len(output_storage)): for idx in xrange(len(output_storage)):
# If the storage map does not contain the same object, then # If the storage map does not contain the same object, then
# the pre-allocated output has not been reused # the pre-allocated output has not been reused
@@ -1251,16 +1361,61 @@ class Scan(PureOp):
                 else:
                     output_reused[idx] = False

+            # 5.4 Check which of the input storage have been modified by the
+            # inner function
+            for idx in xrange(len(input_storage)):
+                # If the storage map does not contain the same object, then
+                # the input storage has not been reused
+                new_var = input_storage[idx].storage[0]
+                if old_input_storage[idx] is new_var:
+                    # The input is only considered as having been reused if
+                    # it still points to the same data as it did before the
+                    # execution of the inner function
+                    if old_input_data[idx] is None:
+                        input_reused[idx] = False
+                    else:
+                        if hasattr(new_var, 'gpudata'):
+                            input_reused[idx] = (new_var.gpudata ==
+                                                 old_input_data[idx])
+                        elif hasattr(new_var, 'data'):
+                            input_reused[idx] = (new_var.data ==
+                                                 old_input_data[idx])
+                        else:
+                            input_reused[idx] = False
+
             t_fn += dt_fn
             offset_out = 0

-            # 5.1 Copy over the values for mit_mot outputs
+            # 5.5 Copy over the values for mit_mot outputs
+            mitmot_inp_offset = self.n_seqs
+            mitmot_out_idx = 0
             for j in xrange(self.n_mit_mot):
                 for k in self.mit_mot_out_slices[j]:
-                    outs[j][0][k + pos[j]] = \
-                        output_storage[offset_out].storage[0]
-                    offset_out += 1
+                    if self.mitmots_preallocated[mitmot_out_idx]:
+                        # This output tap has been preallocated. If the
+                        # corresponding input storage has been replaced,
+                        # recover the value as usual. Otherwise, the input was
+                        # modified inplace and nothing needs to be done.
+                        inp_idx = (mitmot_inp_offset +
+                                   self.tap_array[j].index(k))
+                        if not input_reused[inp_idx]:
+                            outs[j][0][k + pos[j]] = \
+                                input_storage[inp_idx].storage[0]
+                    else:
+                        # This output tap has not been preallocated, recover
+                        # its value as usual
+                        outs[j][0][k + pos[j]] = \
+                            output_storage[offset_out].storage[0]
+                        offset_out += 1
+                    mitmot_out_idx += 1
+
+                mitmot_inp_offset += len(self.tap_array[j])

-            # 5.2 Copy over the values for mit_sot/sit_sot outputs
+            # 5.6 Copy over the values for mit_sot/sit_sot outputs
             begin = self.n_mit_mot
             end = self.n_outs
             offset_out -= self.n_mit_mot
@@ -1271,7 +1426,7 @@ class Scan(PureOp):
                         outs[j][0][pos[j]] = \
                             output_storage[offset_out + j].storage[0]

-            # 5.3 Copy over the values for nit_sot outputs
+            # 5.7 Copy over the values for nit_sot outputs
             begin = end
             end += self.n_nit_sot
             for j in xrange(begin, end):
@@ -1295,7 +1450,7 @@ class Scan(PureOp):
                     outs[j][0][pos[j]] = \
                         output_storage[j + offset_out].storage[0]

-            # 5.4 Copy over the values for outputs corresponding to shared
+            # 5.8 Copy over the values for outputs corresponding to shared
             # variables
             begin = end
             end += self.n_shared_outs
@@ -1552,7 +1707,7 @@ class Scan(PureOp):
         return connection_pattern

     def get_oinp_iinp_iout_oout_mappings(self):
         """
        Compute and return dictionary mappings between the inputs and
        outputs of the inner function and the inputs and outputs of the Scan
        node in the outer graph.
@@ -2016,6 +2171,7 @@ class Scan(PureOp):
             undefined_msg = None
             through_shared = False
             disconnected = True
+
             for jdx in xrange(len(self.mit_mot_out_slices[idx])):
                 inner_inp_mitmot.append(dC_dXts[out_pos])
                 mitmot_inp_taps[idx].append(-self.mit_mot_out_slices[idx][jdx])
@@ -2023,7 +2179,13 @@ class Scan(PureOp):
                 out_pos += 1

             for jdx in xrange(len(self.tap_array[idx])):
-                inner_inp_mitmot.append(dC_dXtm1s[ins_pos - self.n_seqs])
+                tap = -self.tap_array[idx][jdx]
+
+                # Only create a new inner input if there is not already one
+                # associated with this input tap
+                if tap not in mitmot_inp_taps[idx]:
+                    inner_inp_mitmot.append(dC_dXtm1s[ins_pos - self.n_seqs])
+
                 if isinstance(dC_dinps_t[ins_pos].type, NullType):
                     # We cannot use Null in the inner graph, so we
                     # use a zero tensor of the appropriate shape instead.
@@ -2032,7 +2194,23 @@ class Scan(PureOp):
                                    dtype=theano.config.floatX))
                     undefined_msg = dC_dinps_t[ins_pos].type.why_null
                 else:
-                    inner_out_mitmot.append(dC_dinps_t[ins_pos])
+                    new_inner_out_mitmot = dC_dinps_t[ins_pos]
+
+                    # If there is already an inner input associated with that
+                    # input tap, make sure the computation of the new output
+                    # uses it instead of the input it's currently using
+                    if tap in mitmot_inp_taps[idx]:
+                        to_replace = dC_dXtm1s[ins_pos - self.n_seqs]
+                        replacement_idx = (len(mitmot_inp_taps[idx]) -
+                                           mitmot_inp_taps[idx].index(tap))
+                        replacement = inner_inp_mitmot[-replacement_idx]
+                        new_inner_out_mitmot = theano.clone(
+                            new_inner_out_mitmot,
+                            replace=[(to_replace, replacement)])
+
+                    inner_out_mitmot.append(new_inner_out_mitmot)

                 if not disconnected_dC_dinps_t[ins_pos]:
                     disconnected = False
@@ -2041,12 +2219,15 @@ class Scan(PureOp):
                     if _sh in gof.graph.inputs([dC_dinps_t[ins_pos]]):
                         through_shared = True

-                n_mitmot_inps += 1
                 ins_pos += 1
                 n_mitmot_outs += 1
-                mitmot_inp_taps[idx].append(-self.tap_array[idx][jdx])
                 mitmot_out_taps[idx].append(-self.tap_array[idx][jdx])

+                # Only add the tap as a new input tap if needed
+                if tap not in mitmot_inp_taps[idx]:
+                    n_mitmot_inps += 1
+                    mitmot_inp_taps[idx].append(-self.tap_array[idx][jdx])
+
             if undefined_msg:
                 type_outs.append(undefined_msg)
             elif through_shared:
......
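Reduced to its essence, the wrapping applied in `make_fn` turns a tap that is both read and written into an input update, which the update-aware inplace ordering (below, in `inplace_elemwise_optimizer_op`) can then perform in place (illustrative variables, not Scan internals):

    import numpy
    import theano
    from theano.compile.io import In

    x_tap = theano.tensor.dvector()      # inner input reading a mitmot tap
    new_tap = x_tap * 2                  # inner output writing the same tap
    f = theano.function([In(x_tap, value=numpy.ones(3), update=new_tap)], [])
    f(); f()                             # the tap's storage now holds 4.0s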
(The diff of the regenerated scan_perform C source is too large to display here.)
@@ -62,7 +62,7 @@ import copy

 def get_version():
-    return 0.286
+    return 0.287

 @cython.boundscheck(False)
 def perform(
@@ -82,6 +82,7 @@ def perform(
     numpy.ndarray[numpy.int32_t,ndim=1] vector_outs,
     numpy.ndarray[numpy.int32_t,ndim=2] mit_mot_out_slices,
     numpy.ndarray[numpy.int32_t,ndim=1] mit_mot_out_nslices,
+    numpy.ndarray[numpy.int32_t,ndim=1] mitmots_preallocated,
     fn,
     fnct,
     numpy.ndarray[numpy.int32_t,ndim=1] destroy_map,
@@ -183,7 +184,7 @@ def perform(
     cdef unsigned int idx
     cdef unsigned int i
     cdef unsigned int j
-    cdef unsigned int k
+    cdef int k
     cdef unsigned int kdx
     cdef unsigned int tdx
     cdef unsigned int pdx
@@ -194,6 +195,7 @@ def perform(
     cdef unsigned int len_output_storage = (n_mit_mot_outs + n_mit_sot +
                                             n_sit_sot + n_nit_sot +
                                             n_shared_outs)

+    cdef int input_reused[500] # max 500 inputs
     cdef int output_reused[500] # max 500 outputs
@@ -254,6 +256,9 @@ def perform(
     offset = nit_sot_arg_offset + n_nit_sot
     other_args = args[offset:]
     input_storage = fnct.input_storage
+    len_input_storage = len(input_storage)
+    old_input_storage = [None] * len_input_storage
+    old_input_data = [None] * len_input_storage
     output_storage = fnct.output_storage
     old_output_storage = [None] * len_output_storage
     old_output_data = [None] * len_output_storage
@@ -312,11 +317,13 @@ def perform(

         # 4. collecting slices where the output should be stored

         # 4.1. Collect slices for mitmots
+        offset = 0
         for idx in range(n_mit_mot_outs):
-            output_storage[idx].storage[0] = None
+            if not mitmots_preallocated[<unsigned int>idx]:
+                output_storage[<unsigned int>offset].storage[0] = None
+                offset += 1

         # 4.2. Collect slices for mitsots, sitsots and nitsots
-        offset = n_mit_mot_outs
         if i != 0:
             for idx in range(n_outs + n_nit_sot - n_mit_mot):
                 if ( store_steps[<unsigned int>(idx+n_mit_mot)] == 1 or
@@ -358,7 +365,24 @@ def perform(
             else:
                 old_output_data[idx] = None

-        # 5. compute outputs
+        # 4.6. Keep a reference to the variables (ndarrays, CudaNdarrays,
+        # etc) currently in the input_storage to be able to compare them
+        # with the content of the input_storage after the execution of the
+        # function. Also keep pointers to their data to be able to detect
+        # cases where outputs reused the allocated object but alter the
+        # memory region they refer to.
+        for idx in range(len_input_storage):
+            var = input_storage[idx].storage[0]
+            old_input_storage[idx] = var
+            if hasattr(var, 'gpudata'):
+                old_input_data[idx] = var.gpudata
+            elif hasattr(var, 'data'):
+                old_input_data[idx] = var.data
+            else:
+                old_input_data[idx] = None
+
+        # 5.1 compute outputs
         t0_fn = time.time()

         try:
try: try:
...@@ -379,8 +403,20 @@ def perform( ...@@ -379,8 +403,20 @@ def perform(
pdx = offset + n_shared_outs pdx = offset + n_shared_outs
cond = output_storage[pdx].storage[0] == 0 cond = output_storage[pdx].storage[0] == 0
# Check which of the pre-allocated outputs (if applicable) have # 5.2. By calling fn() directly instead of calling the theano
# been reused by the inner function # function, it is possible that the updates have not been
# performed. Perform the updates if needed.
offset_out = len(output_storage) - 1
if getattr(fn, 'need_update_inputs', True):
# Update the inputs that have an update function
for inp, storage in zip(self.fn.maker.expanded_inputs[::-1],
self.fn.input_storage[::-1]):
if inp.update is not None:
storage.data = output_storage[offset_out].data
offset_out -= 1
# 5.3. Check which of the pre-allocated outputs (if applicable)
# have been reused by the inner function
for idx in range(len_output_storage): for idx in range(len_output_storage):
# If the storage map does not contain the same object, then # If the storage map does not contain the same object, then
# the pre-allocated output has not been reused # the pre-allocated output has not been reused
...@@ -402,15 +438,58 @@ def perform( ...@@ -402,15 +438,58 @@ def perform(
else: else:
output_reused[idx] = False output_reused[idx] = False
+        # 5.4. Check which of the input storage have been modified by the
+        # inner function
+        for idx in range(len_input_storage):
+            # If the storage map does not contain the same object, then
+            # the input storage has not been reused
+            new_var = input_storage[idx].storage[0]
+            if old_input_storage[idx] is new_var:
+                # The input is only considered as having been reused if it
+                # still points to the same data as it did before the
+                # execution of the inner function
+                if old_input_data[idx] is None:
+                    input_reused[idx] = False
+                else:
+                    if hasattr(new_var, 'gpudata'):
+                        input_reused[idx] = (new_var.gpudata ==
+                                             old_input_data[idx])
+                    elif hasattr(new_var, 'data'):
+                        input_reused[idx] = (new_var.data ==
+                                             old_input_data[idx])
+                    else:
+                        input_reused[idx] = False
+
         offset_out = 0

-        # 5.1 Copy over the values for mit_mot outputs
-        for j in range(n_mit_mot):
-            for kdx in range(mit_mot_out_nslices[j]):
-                k = mit_mot_out_slices[j,kdx]
-                outs[j][0][<unsigned int>(k+pos[j])] = output_storage[offset_out].storage[0]
-                offset_out += 1
+        # 5.5 Copy over the values for mit_mot outputs
+        mitmot_inp_offset = n_seqs
+        mitmot_out_idx = 0
+        for j in range(n_mit_mot):
+            for kdx in range(mit_mot_out_nslices[j]):
+                k = mit_mot_out_slices[j,kdx]
+                if mitmots_preallocated[<unsigned int>mitmot_out_idx]:
+                    # This output tap has been preallocated. If the
+                    # corresponding input storage has been replaced,
+                    # recover the value as usual. Otherwise, the input was
+                    # modified inplace and nothing needs to be done.
+                    # Locate the tap among this mitmot's input taps.
+                    inp_idx = mitmot_inp_offset
+                    for tdx in range(tap_array_len[j]):
+                        if tap_array[j,tdx] == k:
+                            inp_idx = mitmot_inp_offset + tdx
+                            break
+                    if not input_reused[inp_idx]:
+                        outs[j][0][<unsigned int>(k + pos[j])] = \
+                            input_storage[<unsigned int>inp_idx].storage[0]
+                else:
+                    # This output tap has not been preallocated, recover
+                    # its value as usual
+                    outs[j][0][<unsigned int>(k + pos[j])] = \
+                        output_storage[<unsigned int>offset_out].storage[0]
+                    offset_out += 1
+                mitmot_out_idx += 1
+
+            mitmot_inp_offset += tap_array_len[j]

-        # 5.2 Copy over the values for mit_sot/sit_sot outputs
+        # 5.6 Copy over the values for mit_sot/sit_sot outputs
         begin = n_mit_mot
         end = n_outs
         offset_out -= n_mit_mot
@@ -421,7 +500,7 @@ def perform(
                 outs[j][0][pos[j]] = output_storage[<unsigned int>(offset_out+j)].storage[0]

-        # 5.3 Copy over the values for nit_sot outputs
+        # 5.7 Copy over the values for nit_sot outputs
         begin = end
         end += n_nit_sot
         for j in range(begin,end):
@@ -443,8 +522,7 @@ def perform(
                 not output_reused[<unsigned int>(offset_out+j)]):
                 outs[j][0][pos[j]] = output_storage[j+offset_out].storage[0]

-
-        # 5.4 Copy over the values for outputs corresponding to shared
+        # 5.8 Copy over the values for outputs corresponding to shared
         # variables
         begin = end
         end += n_shared_outs
@@ -456,8 +534,6 @@ def perform(
             pos[idx] = (pos[idx]+1)%store_steps[idx]
         i = i + 1

-
-
     # 6. Check if you need to re-order output buffers
     begin = n_mit_mot
     end = n_outs + n_nit_sot
......
@@ -17,7 +17,7 @@ from theano.gof import cmodule

 _logger = logging.getLogger('theano.scan_module.scan_perform')

-version = 0.286  # must match constant returned in function get_version()
+version = 0.287  # must match constant returned in function get_version()

 need_reload = False
......
@@ -711,7 +711,7 @@ class T_Scan(unittest.TestCase):

         def inner_fct(mitsot_m2, mitsot_m1, sitsot):
             total = mitsot_m2 + mitsot_m1 + sitsot
-            output = total ** 2
+            output = total ** 1.05
             return output, output

         inputs = [tensor.matrix(), tensor.vector()]
@@ -729,6 +729,52 @@ class T_Scan(unittest.TestCase):
         sum_of_grads = sum([g.sum() for g in gradients])
         second_gradients = theano.grad(sum_of_grads, inputs[0])
def test_verify_second_grad_sitsot(self):
def get_sum_of_grad(inp):
scan_outputs, updates = theano.scan(fn=lambda x: x * 2,
outputs_info=[inp],
n_steps=5)
# Take the gradient of each output wrt its corresponding initial
# state
return theano.grad(scan_outputs.sum(), inp).sum()
# Call verify_grad to ensure the correctness of the second gradients
floatX = theano.config.floatX
inputs_test_values = [numpy.random.random((3)).astype(floatX)]
theano.tests.unittest_tools.verify_grad(get_sum_of_grad,
inputs_test_values)
def test_verify_second_grad_mitsot1(self):
def inner_fct(mitsot_m2, sitsot):
total = mitsot_m2 + sitsot
output = total ** 1.02
return output, output
def get_sum_of_grad(input0, input1):
outputs_info = [dict(initial=input0, taps=[-2]), input1]
scan_outputs, updates = theano.scan(fn=inner_fct,
outputs_info=outputs_info,
n_steps=3)
# Take the gradient of each output wrt its corresponding initial
# state
gradients = [theano.grad(scan_outputs[0].sum(), input0),
theano.grad(scan_outputs[1].sum(), input1)]
return gradients[0].sum() + gradients[1].sum()
# Call verify_grad to ensure the correctness of the second gradients
floatX = theano.config.floatX
inputs_test_values = [numpy.random.random((2, 3)).astype(floatX),
numpy.random.random((3)).astype(floatX)]
theano.tests.unittest_tools.verify_grad(get_sum_of_grad,
inputs_test_values)
    def test_grad_two_scans(self):
        # data input & output
......
@@ -291,6 +291,11 @@ def inplace_elemwise_optimizer_op(OP):
         nb_change_no_validate = 0
         chk = fgraph.checkpoint()

+        if fgraph.update_mapping:
+            update_outs = [fgraph.outputs[i] for i in fgraph.update_mapping]
+        else:
+            update_outs = []
+
         for node in list(graph.io_toposort(fgraph.inputs, fgraph.outputs)):
             op = node.op
             # gpuarray GpuElemwise inherit from Elemwise
@@ -326,7 +331,59 @@ def inplace_elemwise_optimizer_op(OP):
             raised_warning = not verbose

             for candidate_output in candidate_outputs:
-                for candidate_input in candidate_inputs:
+
+                # If the output of the node can be established as an update
+                # output of the fgraph, visit the candidate_inputs in an order
+                # that will improve the chances of making the node operate
+                # inplace on the input it's meant to update
+                candidate_out_var = node.outputs[candidate_output]
+                sorted_candidate_inputs = candidate_inputs
+
+                if candidate_out_var in update_outs:
+
+                    # The candidate output is an update. Sort the
+                    # variables in candidate_inputs in the following order:
+                    # - Vars corresponding to the actual updated input
+                    #   (best case scenario is for the node that produces
+                    #   an update to operate inplace on the variable to
+                    #   update)
+                    # - Vars computed inplace on the updated input (second
+                    #   best scenario is for the node to work inplace on
+                    #   a variable obtained by a chain of inplace on the
+                    #   variable to update. In some cases, this will be
+                    #   equivalent to operating inplace on the variable to
+                    #   update)
+                    # - Remaining variables
+                    updated_inputs = []
+                    for i, f_out in enumerate(fgraph.outputs):
+                        if (f_out is candidate_out_var and
+                                i in fgraph.update_mapping):
+                            updated_inp_idx = fgraph.update_mapping[i]
+                            updated_inputs.append(fgraph.inputs[updated_inp_idx])
+
+                    updated_vars = []
+                    vars_from_inplace = []
+                    other_vars = []
+                    for inp_idx in candidate_inputs:
+                        inp = node.inputs[inp_idx]
+                        if inp in updated_inputs:
+                            # the candidate input is the actual updated input
+                            updated_vars.append(inp_idx)
+                        elif (hasattr(fgraph, 'destroy_handler') and
+                              inp.owner and
+                              any([fgraph.destroy_handler.root_destroyer.get(up_inp, None) is inp.owner
+                                   for up_inp in updated_inputs])):
+                            # the candidate input is a variable computed
+                            # inplace on the updated input via a sequence of
+                            # one or more inplace operations
+                            vars_from_inplace.append(inp_idx)
+                        else:
+                            other_vars.append(inp_idx)
+
+                    sorted_candidate_inputs = (updated_vars +
+                                               vars_from_inplace + other_vars)
+
+                for candidate_input in sorted_candidate_inputs:
                     # remove inputs that don't have the same dtype as the output
                     if node.inputs[candidate_input].type != node.outputs[
                         candidate_output].type:
......
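The bucketing above, isolated (illustrative stand-ins for the fgraph bookkeeping): updated inputs come first, then inputs produced from them by inplace chains, then the rest.

    def sort_candidates(candidates, updated_inputs, inplace_on_updated):
        # candidates: list of (input_index, input_variable) pairs
        updated_vars, vars_from_inplace, other_vars = [], [], []
        for inp_idx, inp in candidates:
            if inp in updated_inputs:
                updated_vars.append(inp_idx)
            elif inp in inplace_on_updated:
                vars_from_inplace.append(inp_idx)
            else:
                other_vars.append(inp_idx)
        return updated_vars + vars_from_inplace + other_vars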