Merge pull request #1095 from goodfeli/determinism

Ready to merge: Determinism fix

Merge pull request #1095 from goodfeli/determinism
8a1272bd · nouiz · 5965057f · 7ccbc028 · 8a1272bd · 8a1272bd
--- a/theano/__init__.py
+++ b/theano/__init__.py
@@ -88,7 +88,7 @@ from printing import \
 import scan_module
 from scan_module import scan, map, reduce, foldl, foldr, clone
-from updates import Updates
+from updates import Updates, OrderedUpdates
 import tensor
 import scalar

--- a/theano/compile/function.py
+++ b/theano/compile/function.py
@@ -12,6 +12,8 @@ from function_module import orig_function
 from profiling import ProfileStats
 from pfunc import pfunc
 from numpy import any  # to work in python 2.4
+import warnings
+from theano import gof
 def function(inputs, outputs=None, mode=None, updates=None, givens=None,
             no_default_updates=False, accept_inplace=False, name=None,
@@ -30,7 +32,7 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
    :type mode: string or `Mode` instance.
    :param mode: compilation mode
-    :type updates: iterable over pairs (shared_variable, new_expression). List, tuple or dict.
+    :type updates: iterable over pairs (shared_variable, new_expression). List, tuple or OrderedDict.
    :param updates: update the values for SharedVariable inputs according to these expressions
    :type givens: iterable over pairs (Var1, Var2) of Variables. List, tuple or dict.  The Var1
@@ -128,7 +130,7 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
                 def opt_log1p(node):
                    if not isinstance(node.op,Elemwise):
                       return
-                    if not isinstance(node.op.scalar_op, log,):
+                    if not isinstance(node.op.scalar_op, log):
                       return
                    inp = node.inputs[0]
                    if not inp.owner:
@@ -159,10 +161,18 @@ def function(inputs, outputs=None, mode=None, updates=None, givens=None,
    """
-    #tuple are used in some tests, as we accepted them in the past
-    #I prefer to allow it as they act the same as list for what they are used.
    if updates is None:
        updates = []
+    if isinstance(updates, dict) and \
+            not isinstance(updates, gof.python25.OrderedDict):
+        # Only warn (do not reject): plain dicts are still accepted, but
+        # the caller is told that their iteration order makes the result
+        # non-deterministic.
+        warnings.warn("Expected OrderedDict, got " + str(type(updates)) +
+            ". Using a standard dictionary here results in "
+            "non-deterministic behavior. You should use an OrderedDict"
+            " if you are using python2.7 or use a list of (shared, update)"
+            " pairs. Do not just convert your dictionary to this type before"
+            " the call as the conversion will still be non-deterministic.")
    if givens is None:
        givens = []
    if not isinstance(inputs, (list, tuple)):

--- a/theano/compile/tests/test_pfunc.py
+++ b/theano/compile/tests/test_pfunc.py
@@ -626,8 +626,15 @@ class Test_pfunc(unittest.TestCase):
        # The order of the variables is not determined, so we try
        # both shared variables.
-        f = theano.function([], [], updates={a: a, b: (2 * b)})
+        # TODO: explain the above comment. By "not determined" does
-        g = theano.function([], [], updates={a: (a * 2), b: b})
+        # this mean "not deterministic"?
+        # This test originally wrote the updates using dictionaries,
+        # and iterating over the dictionary was not deterministic.
+        # Is that all the comment above meant, or is the CVM intended
+        # to add extra non-determinism? Or is the CVM meant to
+        # deterministically but arbitrarily pick an order for the updates?
+        f = theano.function([], [], updates=[(a, a), (b, (2 * b))])
+        g = theano.function([], [], updates=[(a, (a * 2)), (b, b)])
        f()
        assert a.get_value(borrow=True).shape == (), a.get_value()
@@ -642,10 +649,10 @@ class Test_pfunc(unittest.TestCase):
        a = shared(1., 'a')
        b = shared(numpy.ones((2, 3)), 'b')
-        # The order of the variables is not determined, so we try
+        # See comment in test_update_same about why we try both
-        # both shared variables.
+        # shared variables.
-        f = theano.function([], [], updates={a: a, b: (2 * b - b)})
+        f = theano.function([], [], updates=[(a, a), (b, (2 * b - b))])
-        g = theano.function([], [], updates={a: (a * 2 - a), b: b})
+        g = theano.function([], [], updates=[(a, (a * 2 - a)), (b, b)])
        f()
        assert a.get_value(borrow=True).shape == (), a.get_value()

--- a/theano/gof/null_type.py
+++ b/theano/gof/null_type.py
@@ -35,3 +35,6 @@ class NullType(Type):
    def __hash__(self, other):
        return hash(type(self))
+    def __str__(self):
+        """Return the fixed string 'NullType' (no instance-specific data)."""
+        return 'NullType'
--- a/theano/gof/python25.py
+++ b/theano/gof/python25.py
@@ -162,7 +162,7 @@ else:
 if sys.version_info[:2] < (2, 7):
    # The following implementation of OrderedDict compatible with python 2.4
-    # was taked from http://pypi.python.org/pypi/ordereddict/1.1
+    # was taken from http://pypi.python.org/pypi/ordereddict/1.1
    # It is under the MIT license.
    # Copyright (c) 2009 Raymond Hettinger

--- a/theano/gradient.py
+++ b/theano/gradient.py
@@ -20,6 +20,7 @@ import theano
 from itertools import izip
 from theano import gof
 from theano.gof import Variable
+from theano.gof.python25 import OrderedDict
 from theano.gof.python25 import all
 import theano.gof.utils
 from theano.gof.null_type import NullType
@@ -144,6 +145,9 @@ class DisconnectedType(theano.gof.type.Type):
                " a symbolic placeholder."
            ))
+    def __str__(self):
+        """Return the fixed string 'DisconnectedType' (no instance-specific data)."""
+        return 'DisconnectedType'
 ########################
 # R Operator
@@ -211,7 +215,7 @@ def Rop(f, wrt, eval_points):
            # Tensor, Sparse and CudaNdArray have the ndim attribute
            pass
-    seen_nodes = {}
+    seen_nodes = OrderedDict()
    def _traverse(node):
        """ TODO: writeme """
@@ -432,14 +436,14 @@ def grad(cost, wrt, consider_constant=None,
    if known_grads is not None:
        outputs.extend(known_grads.keys())
-    var_to_node_to_idx = _populate_var_to_node_to_idx(
+    var_to_app_to_idx = _populate_var_to_app_to_idx(
            outputs, wrt, consider_constant)
    # build a dict mapping var to the gradient of cost with respect to var
-    grad_dict = {}
+    grad_dict = OrderedDict()
    if known_grads is None:
-        known_grads = {}
+        known_grads = OrderedDict()
    # The gradient of the cost is 1 unless specified otherwise by known_grads.
    if cost is not None:
@@ -501,10 +505,10 @@ def grad(cost, wrt, consider_constant=None,
    # variables that do not influence the cost have zero gradient.
    # if wrt is such a variable, populate the grad_dict with this info
-    # so that wrt not being in var_to_node_to_idx won't cause an error below
+    # so that wrt not being in var_to_app_to_idx won't cause an error below
    # according to the flag, possibly raise an error if wrt is disconnected
    for elem in wrt:
-        if elem not in var_to_node_to_idx and elem is not cost \
+        if elem not in var_to_app_to_idx and elem is not cost \
                and elem not in grad_dict:
            handle_disconnected(elem)
            grad_dict[elem] = DisconnectedType()()
@@ -521,7 +525,7 @@ def grad(cost, wrt, consider_constant=None,
        if hasattr(g.type, 'dtype'):
            assert g.type.dtype in tensor.float_dtypes
-    rval = _populate_grad_dict(var_to_node_to_idx,
+    rval = _populate_grad_dict(var_to_app_to_idx,
            grad_dict, wrt, cost_name)
    for i in xrange(len(rval)):
@@ -579,7 +583,7 @@ def _node_to_pattern(node):
    return connection_pattern
-def _populate_var_to_node_to_idx(outputs, wrt, consider_constant):
+def _populate_var_to_app_to_idx(outputs, wrt, consider_constant):
    """
    Helper function for grad function.
@@ -638,7 +642,7 @@ def _populate_var_to_node_to_idx(outputs, wrt, consider_constant):
    # var_to_app_to_idx[var][node] = [i,j] means node has
    # var as input at positions i and j
-    var_to_app_to_idx = {}
+    var_to_app_to_idx = OrderedDict()
    # Set of variables that have been added to their true parents
    # ('true' here means that the elements of the variable are a function
@@ -676,7 +680,13 @@ def _populate_var_to_node_to_idx(outputs, wrt, consider_constant):
                    continue
                if ipt not in var_to_app_to_idx:
-                    var_to_app_to_idx[ipt] = {}
+                    # This object here *must* be an OrderedDict, because
+                    # we iterate over its keys when adding up the terms of
+                    # the gradient on ipt. If it is a regular dict, the grad
+                    # method will return something that is analytically correct,
+                    # but whose order of doing additions depends on the memory
+                    # location of the apply nodes.
+                    var_to_app_to_idx[ipt] = OrderedDict()
                app_to_idx = var_to_app_to_idx[ipt]
                if app not in app_to_idx:
                    app_to_idx[app] = []
@@ -731,12 +741,12 @@ class DisconnectedInputError(ValueError):
    disconnected_inputs='raise'.
    """
-def _populate_grad_dict(var_to_node_to_idx,
+def _populate_grad_dict(var_to_app_to_idx,
        grad_dict, wrt, cost_name=None):
    """
        Helper function for grad function.
-        var_to_node_to_idx: a dictionary mapping a variable to
+        var_to_app_to_idx: a dictionary mapping a variable to
                a second dictionary.
                the second dictionary maps apply nodes acting on
                this variable to the variable's index in the apply
@@ -761,7 +771,7 @@ def _populate_grad_dict(var_to_node_to_idx,
    """
    # build a dict mapping node to the terms node contributes to each of
    # its inputs' gradients
-    term_dict = {}
+    term_dict = OrderedDict()
    def access_term_cache(node):
        """ Populates term_dict[node] and returns it """
@@ -1001,15 +1011,17 @@ def _populate_grad_dict(var_to_node_to_idx,
            #cache the result
            term_dict[node] = input_grads
        return term_dict[node]
    # populate grad_dict[var] and return it
    def access_grad_cache(var):
        if var not in grad_dict:
            # If var is not in grad_dict already, we must compute it
-            if var in var_to_node_to_idx:
+            if var in var_to_app_to_idx:
                terms = []
-                node_to_idx = var_to_node_to_idx[var]
+                node_to_idx = var_to_app_to_idx[var]
                for node in node_to_idx:
                    for idx in node_to_idx[node]:

--- a/theano/printing.py
+++ b/theano/printing.py
@@ -8,6 +8,8 @@ import logging
 import os
 import StringIO
 import sys
+# Not available on all platforms
+hashlib = None
 import numpy
@@ -1069,3 +1071,78 @@ def min_informative_str(obj, indent_level=0,
    rval = indent + prefix + name
    return rval
+def var_descriptor(obj, _prev_obs=None, _tag_generator=None):
+    """
+    Returns a string, with no endlines, fully specifying
+    how a variable is computed. Does not include any memory
+    location dependent information such as the id of a node.
+
+    :param obj: the object to describe. Objects exposing `__array__` are
+        described by their strides and an md5 digest; objects with a
+        non-None `name` by that name; objects with a non-None `owner` by
+        the owner's op applied to the descriptors of its inputs; anything
+        else by `str(obj)`.
+    :param _prev_obs: internal, used by the recursion. Maps `id(obj)` to
+        the tag of every object already described, so that a repeated
+        object is rendered as a `<tag>` back-reference.
+    :param _tag_generator: internal, used by the recursion. Object whose
+        `get_tag()` returns the tag for the next newly described object.
+
+    :raises RuntimeError: if hashlib cannot be imported.
+    :raises AssertionError: if no address-free string could be produced.
+    """
+
+    # hashlib is a module-level name that starts out as None and is
+    # imported lazily here. It has to be declared global: without this the
+    # `import hashlib` below makes the name local to the function, and the
+    # `hashlib is None` test raises UnboundLocalError on every call.
+    global hashlib
+    if hashlib is None:
+        try:
+            import hashlib
+        except ImportError:
+            raise RuntimeError("Can't run var_descriptor because hashlib is not available.")
+    if _prev_obs is None:
+        _prev_obs = {}
+    if id(obj) in _prev_obs:
+        # Already described earlier in this traversal: refer back to it.
+        tag = _prev_obs[id(obj)]
+        return '<' + tag + '>'
+    if _tag_generator is None:
+        _tag_generator = _TagGenerator()
+    cur_tag = _tag_generator.get_tag()
+    _prev_obs[id(obj)] = cur_tag
+    if hasattr(obj, '__array__'):
+        # hashlib hashes only the contents of the buffer, but
+        # it can have different semantics depending on the strides
+        # of the ndarray
+        name = '<ndarray:'
+        name += 'strides=['+','.join(str(stride) for stride in obj.strides)+']'
+        name += ',digest='+hashlib.md5(obj).hexdigest()+'>'
+    elif hasattr(obj, 'name') and obj.name is not None:
+        name = obj.name
+    elif hasattr(obj, 'owner') and obj.owner is not None:
+        name = str(obj.owner.op) + '('
+        name += ','.join(var_descriptor(ipt,
+                    _prev_obs=_prev_obs, _tag_generator=_tag_generator) for ipt
+                    in obj.owner.inputs)
+        name += ')'
+    else:
+        name = str(obj)
+        if ' at 0x' in name:
+            # The __str__ method is encoding the object's id in its str
+            name = position_independent_str(obj)
+            if ' at 0x' in name:
+                # Raise explicitly rather than print + `assert False`, so
+                # the check also fires when asserts are stripped (-O) and
+                # the offending string travels with the exception.
+                raise AssertionError(
+                    "Could not build a position independent string: " + name)
+    prefix = cur_tag + '='
+    rval = prefix + name
+    return rval
+def position_independent_str(obj):
+    """
+    Returns a string for obj built only from its type, for use when
+    str(obj) embeds a memory address.
+
+    Only theano Variables are handled: the result is
+    'theano_var{type=' + str(obj.type) + '}'. Any other kind of object
+    raises NotImplementedError.
+
+    NOTE(review): the result is only address-free if str(obj.type) is;
+    this function does not check that itself.
+    """
+    if isinstance(obj, theano.gof.graph.Variable):
+        rval = 'theano_var'
+        rval += '{type='+str(obj.type)+'}'
+    else:
+        raise NotImplementedError()
+    return rval
--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -869,5 +869,5 @@ def test_stack_rows_segfault_070312():
    out = theano.shared(numpy.random.rand(1, 2, 2, 3).astype('float32'))
    op = theano.tensor.nnet.conv.ConvOp(imshp=(80, 96, 96), kshp=(9, 9),
            nkern=1, bsize=1)
-    f = theano.function([], [], updates={out: op(img, kern)})
+    f = theano.function([], [], updates=[(out, op(img, kern))])
    f()
--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -106,7 +106,7 @@ def test_alloc_memset_0():
 def test_gpuspecifyshape():
    x = cuda.shared_constructor(numpy.ones(3,dtype='float32'), 'x')
    m = theano.tensor.specify_shape(x + numpy.float32(1), (3,))
-    f = theano.function([], updates={x:m * numpy.float32(2)},
+    f = theano.function([], updates=[(x, m * numpy.float32(2))],
                        mode=mode_with_gpu)
    l = f.maker.fgraph.toposort()
    assert not numpy.any([isinstance(x.op, cuda.HostFromGpu) for x in l])

--- a/theano/sandbox/cuda/tests/test_var.py
+++ b/theano/sandbox/cuda/tests/test_var.py
@@ -60,11 +60,11 @@ class T_updates(unittest.TestCase):
        data = numpy.float32([1, 2, 3, 4])
        x = f32sc(data)
        y = x ** 2
-        f = theano.function([], y, updates={x: x + 1})
+        f = theano.function([], y, updates=[(x, x + 1)])
        f()
        # Test that we can update with a CudaVariable
-        f = theano.function([], y, updates={x: cuda.gpu_from_host(x + 1)})
+        f = theano.function([], y, updates=[(x, cuda.gpu_from_host(x + 1))])
        f()
    def test_2(self):
@@ -74,7 +74,7 @@ class T_updates(unittest.TestCase):
                value=numpy.zeros((10, 10), 'float32'))
        x = tensor.fmatrix('x')
-        output_updates = {output_var: x ** 2}
+        output_updates = [(output_var, x ** 2)]
        output_givens = {x: data}
        output_func = theano.function(inputs=[], outputs=[],
                updates=output_updates, givens=output_givens)
@@ -89,8 +89,8 @@ class T_updates(unittest.TestCase):
        # the update_var has type matrix, and the update expression
        # is a broadcasted scalar, and that should not be allowed.
        self.assertRaises(TypeError, theano.function, inputs=[], outputs=[],
-                          updates={output_var:
+                          updates=[(output_var,
-                                   output_var.sum()})
+                                   output_var.sum())])
    def test_err_broadcast(self):
        # Test that we raise a good error message when we don't
@@ -101,8 +101,8 @@ class T_updates(unittest.TestCase):
        # the update_var has type matrix, and the update expression
        # is a broadcasted scalar, and that should not be allowed.
        self.assertRaises(TypeError, theano.function, inputs=[], outputs=[],
-                          updates={output_var:
+                          updates=[(output_var,
-                                   output_var.sum().dimshuffle('x', 'x')})
+                                   output_var.sum().dimshuffle('x', 'x'))])
    def test_broadcast(self):
        # Test that we can rebroadcast
@@ -111,11 +111,11 @@ class T_updates(unittest.TestCase):
        up = tensor.unbroadcast(output_var.sum().dimshuffle('x', 'x'), 0, 1)
        output_func = theano.function(inputs=[], outputs=[],
-                                      updates={output_var: up})
+                                      updates=[(output_var, up)])
        output_func()
        up = tensor.patternbroadcast(output_var.sum().dimshuffle('x', 'x'),
                                     output_var.type.broadcastable)
        output_func = theano.function(inputs=[], outputs=[],
-                                      updates={output_var: up})
+                                      updates=[(output_var, up)])
        output_func()
--- a/theano/sandbox/scan.py
+++ b/theano/sandbox/scan.py
@@ -13,14 +13,16 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
 import itertools
 import logging
 import numpy
+import warnings
 from theano.compile import SharedVariable, function
 from theano import compile
 from theano import gof
+from theano.gof.python25 import OrderedDict
 from theano.tensor import opt
 from theano import tensor
 from theano import config
-from theano.updates import Updates
+from theano.updates import OrderedUpdates
 from theano.scan_module import scan_op
@@ -147,7 +149,7 @@ def scan(fn,
    n_seqs = len(seqs)
    n_outs = len(outs_info)
-    return_steps = {}
+    return_steps = OrderedDict()
    # wrap outputs info in a dictionary if they are not already in one
    for i in xrange(n_outs):
        if outs_info[i] is not None:
@@ -242,7 +244,7 @@ def scan(fn,
    mit_sot_inner_inputs = []
    mit_sot_inner_slices = []
    mit_sot_inner_outputs = []
-    mit_sot_return_steps = {}
+    mit_sot_return_steps = OrderedDict()
    mit_sot_tap_array = []
    mit_sot_rightOrder = []
@@ -251,7 +253,7 @@ def scan(fn,
    sit_sot_inner_inputs = []
    sit_sot_inner_slices = []
    sit_sot_inner_outputs = []
-    sit_sot_return_steps = {}
+    sit_sot_return_steps = OrderedDict()
    sit_sot_rightOrder = []
    nit_sot_steps = []
    # go through outputs picking up time slices as needed
@@ -398,7 +400,8 @@ def scan(fn,
                      not isinstance(arg, tensor.Constant))]
    # when we apply the lambda expression we get a mixture of update rules
    # and outputs that needs to be separated
-    condition, outputs, updates = scan_utils.get_updates_and_outputs(fn(*args))
+    lambda_result = fn(*args)
+    condition, outputs, updates = scan_utils.get_updates_and_outputs(lambda_result)
    if condition is not None:
        as_while = True
    else:
@@ -464,6 +467,13 @@ def scan(fn,
    dummy_outs = outputs
    if condition is not None:
        dummy_outs.append(condition)
+    # If we use a regular dict here, the results are non-deterministic
+    if not isinstance(updates, (list, tuple)):
+        if isinstance(updates, dict) and \
+            not isinstance(updates, gof.python25.OrderedDict):
+                warnings.warn("Using non-deterministic dictionary.")
    dummy_f = function(dummy_args,
                       dummy_outs,
                       updates=updates,
@@ -508,7 +518,7 @@ def scan(fn,
            sit_sot_inner_outputs.append(outputs[i])
    ## Step 5.3 Outputs that correspond to update rules of shared variables
-    givens = {}
+    givens = OrderedDict()
    n_shared_outs = 0
    shared_scan_inputs = []
    shared_inner_inputs = []
@@ -527,7 +537,7 @@ def scan(fn,
    ## Step 5.4 Outputs with no taps used in the input
    n_nit_sot = 0
    nit_sot_inner_outputs = []
-    nit_sot_return_steps = {}
+    nit_sot_return_steps = OrderedDict()
    nit_sot_rightOrder = []
    for i, out in enumerate(outs_info):
        if not 'taps' in out:
@@ -582,7 +592,7 @@ def scan(fn,
                  shared_inner_outputs)
    if condition is not None:
        inner_outs.append(condition)
-    new_givens = {}
+    new_givens = OrderedDict()
    for w, w_copy in givens.iteritems():
        new_givens[w] = w.type.filter_variable(w_copy)
@@ -593,7 +603,7 @@ def scan(fn,
    ##
    tap_array = mit_sot_tap_array + [[-1] for x in xrange(n_sit_sot)]
-    info = {}
+    info = OrderedDict()
    info['tap_array'] = tap_array
    info['n_seqs'] = n_seqs
@@ -607,7 +617,7 @@ def scan(fn,
    info['truncate_gradient'] = -1
    info['name'] = name
    info['mode'] = mode
-    info['destroy_map'] = {}
+    info['destroy_map'] = OrderedDict()
    info['inplace'] = False
    info['gpu'] = False
    info['as_while'] = as_while
@@ -641,7 +651,7 @@ def scan(fn,
    ###         and so on ...
    ##
-    update_map = Updates()
+    update_map = OrderedUpdates()
    offset = n_mit_mot
    offsets = [abs(numpy.min(x)) for x in mit_sot_tap_array]
@@ -675,4 +685,5 @@ def scan(fn,
    elif len(scan_out_list) == 0:
        scan_out_list = None
+    assert isinstance(update_map, dict) and 'Ordered' in str(type(update_map))
    return (scan_out_list, update_map)
--- a/theano/sandbox/scan_module/scan.py
+++ b/theano/sandbox/scan_module/scan.py
@@ -46,17 +46,12 @@ from itertools import izip
 import logging
 import numpy
-from theano.compile import SharedVariable, function
-from theano import compile
 from theano import gof
 from theano.tensor import opt, TensorVariable
 from theano.tensor.sharedvar import TensorSharedVariable
 from theano import tensor
-from theano import config
-from theano.updates import Updates
 from theano.scalar.sharedvar import shared as scalar_shared
 from theano.compile.pfunc import rebuild_collect_shared
-import theano
 import scan_op
 import scan_utils

--- a/theano/scan_module/scan.py
+++ b/theano/scan_module/scan.py
@@ -52,8 +52,9 @@ from theano import gof
 from theano.tensor import opt
 from theano import tensor
 from theano import config
-from theano.updates import Updates
+from theano.updates import OrderedUpdates
 from theano.compile import ops
+from theano.gof.python25 import OrderedDict
 import scan_op
@@ -376,11 +377,11 @@ def scan(fn,
    n_seqs = len(seqs)
    n_outs = len(outs_info)
-    return_steps = {}
+    return_steps = OrderedDict()
    # wrap sequences in a dictionary if they are not already dictionaries
    for i in xrange(n_seqs):
        if not isinstance(seqs[i], dict):
-            seqs[i] = dict(input=seqs[i], taps=[0])
+            seqs[i] = OrderedDict([('input', seqs[i]), ('taps', [0])])
        elif seqs[i].get('taps', None):
            seqs[i]['taps'] = wrap_into_list(seqs[i]['taps'])
        elif seqs[i].get('taps', True) is None:
@@ -402,7 +403,7 @@ def scan(fn,
            if not isinstance(outs_info[i], dict):
                # by default any output has a tap value of -1
-                outs_info[i] = dict(initial=outs_info[i], taps=[-1])
+                outs_info[i] = OrderedDict([('initial', outs_info[i]), ('taps', [-1])])
            elif (not outs_info[i].get('initial', None) and
                    outs_info[i].get('taps', None)):
                # ^ no initial state but taps provided
@@ -421,8 +422,8 @@ def scan(fn,
                outs_info[i]['taps'] = [-1]
        else:
            # if a None is provided as the output info we replace it
-            # with an empty dict() to simplify handling
+            # with an empty OrderedDict() to simplify handling
-            outs_info[i] = dict()
+            outs_info[i] = OrderedDict()
    ##
    ###   Step 2. Generate inputs and outputs of the inner functions
@@ -565,7 +566,7 @@ def scan(fn,
    mit_sot_inner_inputs = []
    mit_sot_inner_slices = []
    mit_sot_inner_outputs = []
-    mit_sot_return_steps = {}
+    mit_sot_return_steps = OrderedDict()
    mit_sot_tap_array = []
    mit_sot_rightOrder = []
@@ -574,7 +575,7 @@ def scan(fn,
    sit_sot_inner_inputs = []
    sit_sot_inner_slices = []
    sit_sot_inner_outputs = []
-    sit_sot_return_steps = {}
+    sit_sot_return_steps = OrderedDict()
    sit_sot_rightOrder = []
    # go through outputs picking up time slices as needed
@@ -777,7 +778,7 @@ def scan(fn,
    # as non sequences at the end of our args
    fake_nonseqs = [x.type() for x in non_seqs]
    fake_outputs = scan_utils.clone(outputs,
-                                    replace=dict(zip(non_seqs,
+                                    replace=OrderedDict(zip(non_seqs,
                                                     fake_nonseqs)))
    all_inputs = itertools.ifilter(
        lambda x: (isinstance(x, gof.Variable) and
@@ -825,7 +826,7 @@ def scan(fn,
        n_outs = len(dummy_f.maker.outputs)
        if as_while:
            n_outs = n_outs - 1
-        outs_info = [dict() for x in xrange(n_outs)]
+        outs_info = [OrderedDict() for x in xrange(n_outs)]
    ## Step 5.1 Outputs with taps different then -1
@@ -839,7 +840,7 @@ def scan(fn,
            sit_sot_inner_outputs.append(outputs[i])
    ## Step 5.3 Outputs that correspond to update rules of shared variables
-    givens = {}
+    givens = OrderedDict()
    n_shared_outs = 0
    shared_scan_inputs = []
    shared_inner_inputs = []
@@ -879,7 +880,7 @@ def scan(fn,
    ## Step 5.4 Outputs with no taps used in the input
    n_nit_sot = 0
    nit_sot_inner_outputs = []
-    nit_sot_return_steps = {}
+    nit_sot_return_steps = OrderedDict()
    nit_sot_rightOrder = []
    for i, out in enumerate(outs_info):
        if not 'taps' in out:
@@ -902,7 +903,7 @@ def scan(fn,
                         if (not isinstance(arg, SharedVariable) and
                             not isinstance(arg, tensor.Constant))]
-    givens.update(dict(zip(other_scan_args, other_inner_args)))
+    givens.update(OrderedDict(zip(other_scan_args, other_inner_args)))
    other_shared_scan_args = [arg.variable for arg
                        in dummy_f.maker.expanded_inputs
                        if (isinstance(arg.variable, SharedVariable) and
@@ -911,7 +912,7 @@ def scan(fn,
                        in dummy_f.maker.expanded_inputs
                        if (isinstance(arg.variable, SharedVariable) and
                            not arg.update)]
-    givens.update(dict(zip(other_shared_scan_args,
+    givens.update(OrderedDict(zip(other_shared_scan_args,
                           other_shared_inner_args)))
    ##
@@ -943,7 +944,7 @@ def scan(fn,
        # replace w with w_copy, where w is CudaNdarray
        # and w_copy is TensorType. This is caused because shared
        # variables are put on GPU right aways >:| ,
-        new_givens = {}
+        new_givens = OrderedDict()
        for w, w_copy in givens.iteritems():
            if (isinstance(w.type, cuda.CudaNdarrayType)
@@ -962,7 +963,7 @@ def scan(fn,
    ##
    tap_array = mit_sot_tap_array + [[-1] for x in xrange(n_sit_sot)]
-    info = {}
+    info = OrderedDict()
    info['tap_array'] = tap_array
    info['n_seqs'] = n_seqs
@@ -976,7 +977,7 @@ def scan(fn,
    info['truncate_gradient'] = truncate_gradient
    info['name'] = name
    info['mode'] = mode
-    info['destroy_map'] = {}
+    info['destroy_map'] = OrderedDict()
    info['gpu'] = False
    info['as_while'] = as_while
    info['profile'] = profile
@@ -1012,7 +1013,7 @@ def scan(fn,
    ###         and so on ...
    ##
-    update_map = Updates()
+    update_map = OrderedUpdates()
    def remove_dimensions(outs, steps_return, offsets=None):
        out_ls = []

--- a/theano/scan_module/scan_utils.py
+++ b/theano/scan_module/scan_utils.py
@@ -18,12 +18,13 @@ import logging
 from itertools import izip
 import numpy
+import warnings
 import theano
 from theano.compile.pfunc import rebuild_collect_shared
 from theano import gof
 from theano import tensor, scalar
-from theano.gof.python25 import all
+from theano.gof.python25 import all, OrderedDict
 from theano.tensor.basic import get_constant_value
@@ -181,12 +182,17 @@ def clone(output,
 def get_updates_and_outputs(ls):
    """
-    This function tries to recognize the updates dictionary, the
+    This function tries to recognize the updates OrderedDict, the
    list of outputs and the stopping condition returned by the
    lambda expression and arrange them in a predefined order
+    WRITEME: what is the type of ls? how is it formatted?
+            if it's not in the predefined order already, how does
+            this function know how to put it in that order?
    """
    def is_outputs(elem):
        if (isinstance(elem, (list, tuple)) and
            all([isinstance(x, theano.Variable) for x in elem])):
@@ -197,6 +203,11 @@ def get_updates_and_outputs(ls):
    def is_updates(elem):
        if isinstance(elem, dict):
+            # Make sure the updates will be applied in a deterministic order
+            if not isinstance(elem, gof.python25.OrderedDict):
+                warnings.warn("Expected OrderedDict or OrderedUpdates, got "\
+                        +str(type(elem))+". This can make your script non-"
+                        "deterministic.")
            return True
        # Dictionaries can be given as lists of tuples
        if (isinstance(elem, (list, tuple)) and
@@ -240,12 +251,13 @@ def get_updates_and_outputs(ls):
                'variables (or `theano.scan_module.until` objects for '
                'conditions). In particular if you need to use constant '
                'values, you can use `tensor.constant` to turn them into '
-                'Theano variables.')
+                 'Theano variables.')
    if is_outputs(ls):
-        return None, _list(ls), {}
+        return None, _list(ls), OrderedDict()
    if is_updates(ls):
-        return None, [], dict(ls)
+        return None, [], OrderedDict(ls)
    error_msg = ('Scan cannot parse the return value of your lambda '
                 'expression, which is: %s' % (ls,))
    if not isinstance(ls, (list, tuple)):
@@ -258,16 +270,16 @@ def get_updates_and_outputs(ls):
    if len(ls) == 2:
        if is_outputs(ls[0]):
            if is_updates(ls[1]):
-                return (None, _list(ls[0]), dict(ls[1]))
+                return (None, _list(ls[0]), OrderedDict(ls[1]))
            elif is_condition(ls[1]):
-                return (ls[1].condition, _list(ls[0]), {})
+                return (ls[1].condition, _list(ls[0]), OrderedDict())
            else:
                raise ValueError(error_msg)
        elif is_updates(ls[0]):
            if is_outputs(ls[1]):
                raise ValueError(deprecation_msg)
            elif is_condition(ls[1]):
-                return (ls[1].condition, [], dict(ls[0]))
+                return (ls[1].condition, [], OrderedDict(ls[0]))
            else:
                raise ValueError(error_msg)
        else:
@@ -276,7 +288,7 @@ def get_updates_and_outputs(ls):
        if is_outputs(ls[0]):
            if is_updates(ls[1]):
                if is_condition(ls[2]):
-                    return (ls[2].condition, _list(ls[0]), dict(ls[1]))
+                    return (ls[2].condition, _list(ls[0]), OrderedDict(ls[1]))
                else:
                    raise ValueError(error_msg)
            else:

--- a/theano/scan_module/tests/test_scan.py
+++ b/theano/scan_module/tests/test_scan.py
@@ -16,6 +16,7 @@ from theano.compile.pfunc import rebuild_collect_shared
 from theano.gof.python25 import any
 from theano.tests  import unittest_tools as utt
 import theano.scalar.sharedvar
+from theano.gof.python25 import OrderedDict
 from numpy.testing.noseclasses import KnownFailureTest
@@ -1009,7 +1010,7 @@ class T_Scan(unittest.TestCase):
        x0 = theano.tensor.constant(x0)
        to_replace = outputs[0].owner.inputs[0].owner.inputs[1]
        outputs = theano.clone(outputs,
-                               replace={to_replace: x0})
+                               replace=[(to_replace, x0)])
        mode = theano.compile.mode.get_mode(None).including('inplace')
        f9 = theano.function([],
                             outputs,
@@ -1299,7 +1300,7 @@ class T_Scan(unittest.TestCase):
        state = theano.shared(v_state, 'vstate')
        def f_2():
-            return {state: 2 * state}
+            return OrderedDict([(state, 2 * state)])
        n_steps = theano.tensor.iscalar('nstep')
        output, updates = theano.scan(f_2,
                                      [],
@@ -1829,7 +1830,7 @@ class T_Scan(unittest.TestCase):
        X = theano.shared(numpy.array(1))
        out, updates = theano.scan(
-            lambda: {X: X + 1},
+            lambda: OrderedDict([(X, (X + 1))]),
            outputs_info=[],
            non_sequences=[],
            sequences=[],
@@ -1844,7 +1845,7 @@ class T_Scan(unittest.TestCase):
        y = theano.shared(numpy.array(1))
        out, updates = theano.scan(
-            lambda: {x: x + 1, y: x},
+            lambda: OrderedDict([(x, x + 1), (y, x)]),
            outputs_info=[],
            non_sequences=[],
            sequences=[],
@@ -1880,11 +1881,11 @@ class T_Scan(unittest.TestCase):
        b = theano.shared(numpy.random.rand(5, 4))
        def inner_func(a):
-            return a + 1, {b: 2 * b}
+            return a + 1, OrderedDict([(b, 2 * b)])
        out, updates = theano.scan(
            inner_func,
-            outputs_info=[{'initial': init_a}],
+            outputs_info=[OrderedDict([('initial', init_a)])],
            n_steps=1)
        out = out[-1]
        assert out.type.ndim == a.type.ndim
@@ -1967,7 +1968,7 @@ class T_Scan(unittest.TestCase):
        f1 = z * (x + y) ** 2 + 5
        f2 = theano.clone(f1,
-                          replace={y: y2},
+                          replace=OrderedDict([(y, y2)]),
                          strict=True,
                          copy_inputs=True)
        f2_inp = theano.gof.graph.inputs([f2])
@@ -1986,7 +1987,7 @@ class T_Scan(unittest.TestCase):
        f1 = z * (x + y) ** 2 + 5
        f2 = theano.clone(f1,
-                          replace={y: y2},
+                          replace=OrderedDict([(y, y2)]),
                          strict=False,
                          copy_inputs=True)
        f2_inp = theano.gof.graph.inputs([f2])
@@ -2005,7 +2006,7 @@ class T_Scan(unittest.TestCase):
        f1 = z * (x + y) ** 2 + 5
        f2 = theano.clone(f1,
-                          replace={y: y2},
+                          replace=[(y, y2)],
                          strict=True,
                          copy_inputs=False)
        f2_inp = theano.gof.graph.inputs([f2])
@@ -2024,7 +2025,7 @@ class T_Scan(unittest.TestCase):
        f1 = z * (x + y) ** 2 + 5
        f2 = theano.clone(f1,
-                          replace={y: y2},
+                          replace=[(y, y2)],
                          strict=False,
                          copy_inputs=False)
        f2_inp = theano.gof.graph.inputs([f2])
@@ -2204,15 +2205,15 @@ class T_Scan(unittest.TestCase):
        v2 = theano.shared(numpy.ones((5, 5), dtype=theano.config.floatX))
        shapef = theano.function([W],
                                 expr,
-                                 givens={initial: v1,
+                                 givens=OrderedDict([(initial, v1),
-                                         inpt: v2})
+                                         (inpt, v2)]))
        # First execution to cache n_steps
        shapef(numpy.ones((5, 5), dtype=theano.config.floatX))
        cost = expr.sum()
        d_cost_wrt_W = tensor.grad(cost, [W])
        f = theano.function([W, inpt], d_cost_wrt_W,
-                             givens={initial: theano.shared(numpy.zeros(5))})
+                             givens=OrderedDict([(initial, theano.shared(numpy.zeros(5)))]))
        rval = numpy.asarray([[5187989] * 5] * 5, dtype=theano.config.floatX)
        arg1 = numpy.ones((5, 5), dtype=theano.config.floatX)
@@ -3166,7 +3167,7 @@ class T_Scan(unittest.TestCase):
        shared_var = theano.shared(numpy.float32(1.))
        def inner_fn():
-            return [], {shared_var: shared_var + numpy.float32(1.)}
+            return [], OrderedDict([(shared_var, shared_var + numpy.float32(1.))])
        _, updates = theano.scan(inner_fn,
                                 n_steps=10,
                                 truncate_gradient=-1,
@@ -3239,7 +3240,7 @@ class T_Scan(unittest.TestCase):
        seq = tensor.matrix()
        initial_value = theano.shared(numpy.zeros((4, 1),
                                                  dtype=theano.config.floatX))
-        outputs_info = [{'initial': initial_value, 'taps': [-4]}, None]
+        outputs_info = [OrderedDict([('initial', initial_value), ('taps', [-4])]), None]
        results, updates = theano.scan(fn=onestep,
                                       sequences=seq,
                                       outputs_info=outputs_info)
@@ -3259,13 +3260,13 @@ class T_Scan(unittest.TestCase):
        seq = tensor.matrix()
        initial_value = theano.shared(numpy.zeros((4, 1),
                                                  dtype=theano.config.floatX))
-        outputs_info = [{'initial': initial_value, 'taps': [-4]}, None]
+        outputs_info = [OrderedDict([('initial', initial_value), ('taps', [-4])]), None]
        results, _ = theano.scan(fn=onestep,
                                       sequences=seq,
                                       outputs_info=outputs_info)
        sharedvar = theano.shared(numpy.zeros((1, 1),
                                              dtype=theano.config.floatX))
-        updates = {sharedvar: results[0][-1:]}
+        updates = OrderedDict([(sharedvar, results[0][-1:])])
        f = theano.function([seq], results[1], updates=updates)
        assert numpy.all(exp_out == f(inp))
@@ -3354,9 +3355,9 @@ def test_speed():
        theano.printing.debugprint(s_rinc)
        f = theano.function([],
                            [],
-                            updates={
+                            updates=OrderedDict([
-                                s_i: s_i + 1,
+                                (s_i, s_i + 1),
-                                shared_r: s_rinc},
+                                (shared_r, s_rinc)]),
                           mode=theano.Mode(linker='cvm'))
        f._check_for_aliased_inputs = False
        t2 = time.time()
@@ -3430,9 +3431,9 @@ def test_speed_rnn():
                        w)),
                tolerate_inplace_aliasing=True)
        f = theano.function([], [],
-                updates={
+                updates=OrderedDict([
-                    s_i: s_i + 1,
+                    (s_i, s_i + 1),
-                    shared_r: s_rinc},
+                    (shared_r, s_rinc)]),
                mode=theano.Mode(linker='cvm'))
        #theano.printing.debugprint(f)
        f_fn = f.fn
@@ -3495,9 +3496,9 @@ def test_speed_batchrnn():
                tolerate_inplace_aliasing=True)
        f = theano.function([],
                            [],
-                            updates={
+                            updates=[
-                                s_i: s_i + 1,
+                                (s_i, s_i + 1),
-                                shared_r: s_rinc},
+                                (shared_r, s_rinc)],
                mode=theano.Mode(linker='cvm'))
        #theano.printing.debugprint(f)
        f_fn = f.fn

--- a/theano/sparse/tests/test_basic.py
+++ b/theano/sparse/tests/test_basic.py
@@ -1219,7 +1219,7 @@ class UsmmTests(unittest.TestCase):
            mode = theano.compile.mode.get_default_mode().excluding('fusion')
            if inplace:
-                updates = {z: z - a * theano.sparse.dot(x, y)}
+                updates = [(z, z - a * theano.sparse.dot(x, y))]
                f_a = theano.function([a, x, y], [],
                                      updates=updates,
                                      mode=mode)

--- a/theano/tensor/tests/mlp_test.py
+++ b/theano/tensor/tests/mlp_test.py
@@ -9,7 +9,7 @@ import numpy
 import theano
 import theano.tensor as T
-from theano.gof.python25 import any
+from theano.gof.python25 import any, OrderedDict
 def gen_data():
@@ -293,7 +293,7 @@ def test_mlp():
    # TODO: refine that and include only those
    mode = theano.compile.get_default_mode().including('fast_run')
-    updates2 = {}
+    updates2 = OrderedDict()
    updates2[classifier.hiddenLayer.params[0]]=T.grad(cost,classifier.hiddenLayer.params[0])
    train_model =theano.function( inputs = [index],

--- a/theano/tensor/tests/test_blas.py
+++ b/theano/tensor/tests/test_blas.py
@@ -185,8 +185,8 @@ class t_gemm(TestCase):
        l2_reg = T.constant(0.0001).astype(config.floatX)
        #test constant merge with gemm
-        f = theano.function([a, b], updates={s: lr1 * T.dot(a, b) +
+        f = theano.function([a, b], updates=[(s, lr1 * T.dot(a, b) +
-                                                l2_reg * lr2 * s},
+                                                l2_reg * lr2 * s)],
                            mode=mode_not_fast_compile).maker.fgraph.toposort()
        #[Gemm{inplace}(<TensorType(float64, matrix)>, 0.01,
        # <TensorType(float64, matrix)>, <TensorType(float64, matrix)>,
@@ -195,8 +195,8 @@ class t_gemm(TestCase):
        assert f[0].op == gemm_inplace
        #test factored scalar with merge
-        f = theano.function([a, b], updates={s: lr1 * (T.dot(a, b) -
+        f = theano.function([a, b], updates=[(s, lr1 * (T.dot(a, b) -
-                                                        l2_reg * s)},
+                                                        l2_reg * s))],
                            mode=mode_not_fast_compile).maker.fgraph.toposort()
        #[Gemm{inplace}(<TensorType(float64, matrix)>, 0.01,
        # <TensorType(float64, matrix)>, <TensorType(float64, matrix)>,
@@ -206,7 +206,7 @@ class t_gemm(TestCase):
        #test factored scalar with merge and neg
        f = theano.function([a, b],
-                            updates={s: s - lr1 * (s * .0002 + T.dot(a, b))},
+                            updates=[(s, s - lr1 * (s * .0002 + T.dot(a, b)))],
                            mode=mode_not_fast_compile).maker.fgraph.toposort()
        #[Gemm{inplace}(<TensorType(float64, matrix)>, -0.01,
        # <TensorType(float64, matrix)>, <TensorType(float64, matrix)>,
@@ -368,7 +368,7 @@ class t_gemm(TestCase):
                tz_i = gemm_no_inplace(tz[:, :, i], ta, tx[
                    :, :, i], ty[:, :, i], tb)
                g_i = theano.function([], tz_i,
-                        updates={tz: T.set_subtensor(tz[:, :, i], tz_i)},
+                        updates=[(tz, T.set_subtensor(tz[:, :, i], tz_i))],
                        mode=compile.Mode(optimizer=None, linker=l))
                for j in xrange(3):
                    g_i()
@@ -801,7 +801,7 @@ def test_gemm_unrolled():
            cur_V = update_V(cur_H)
            cur_H = update_H(cur_V)
-        unrolled_theano = theano.function([], updates={V: cur_V, H: cur_H},
+        unrolled_theano = theano.function([], updates=[(V, cur_V), (H, cur_H)],
                                   name='unrolled_theano')
        nb_dot = sum([1 for node in unrolled_theano.maker.fgraph.toposort()
                      if isinstance(node.op, (theano.tensor.Dot,
@@ -1032,7 +1032,7 @@ def test_dot_w_self():
    p = T.dot(A, A) * B
    grad = T.grad(T.mean(p), A)
-    f = theano.function([B], p, updates={A: A - grad})
+    f = theano.function([B], p, updates=[(A, A - grad)])
    # tests correctness in debugmode
    f(numpy.asarray([[0, 1], [2, 3]], dtype=config.floatX))
@@ -1119,7 +1119,7 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
        assert topo[0].op.inplace == False
        #test the inplace version
-        g = theano.function([], [], updates={v2: v2 + theano.dot(m, v1)},
+        g = theano.function([], [], updates=[(v2, v2 + theano.dot(m, v1))],
                            mode=mode_blas_opt)
        # Assert they produce the same output
@@ -1169,7 +1169,7 @@ class TestGemv(TestCase, unittest_tools.TestOptimizationMixin):
        assert topo[-1].op.inplace == False
        #test the inplace version
-        g = theano.function([], [], updates={v2: v2 + theano.dot(v1, m)},
+        g = theano.function([], [], updates=[(v2, v2 + theano.dot(v1, m))],
                            mode=mode_blas_opt)
        # Assert they produce the same output
@@ -1575,7 +1575,7 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
    def function(self, inputs, outputs, updates=None):
        if updates is None:
-            updates = {}
+            updates = []
        return theano.function(inputs, outputs, self.mode, updates=updates)
    def b(self, bval):
@@ -1691,8 +1691,8 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
    def test_inplace(self):
        A = self.shared(numpy.random.rand(4, 5).astype(self.dtype))
        f = self.function([self.x, self.y], [],
-                          updates={A: A + T.constant(0.1, dtype=self.dtype) *
+                          updates=[(A, A + T.constant(0.1, dtype=self.dtype) *
-                                   T.outer(self.x, self.y)})
+                                   T.outer(self.x, self.y))])
        self.assertFunctionContains(f, self.ger_destructive)
        f(numpy.random.rand(4).astype(self.dtype),
          numpy.random.rand(5).astype(self.dtype))
@@ -1731,15 +1731,15 @@ class TestBlasStrides(TestCase):
        bt_dev = b_t.get_value(borrow=False, return_internal_type=True)
        ct_dev = c_t.get_value(borrow=False, return_internal_type=True)
-        f_nn = theano.function([], [], updates={a: tensor.dot(b, c)},
+        f_nn = theano.function([], [], updates=[(a, tensor.dot(b, c))],
                mode=self.mode)
        #print 'class name:', self.__class__.__name__
        #theano.printing.debugprint(f_nn)
-        f_nt = theano.function([], [], updates={a: tensor.dot(b, c_t.T)},
+        f_nt = theano.function([], [], updates=[(a, tensor.dot(b, c_t.T))],
                mode=self.mode)
-        f_tn = theano.function([], [], updates={a: tensor.dot(b_t.T, c)},
+        f_tn = theano.function([], [], updates=[(a, tensor.dot(b_t.T, c))],
                mode=self.mode)
-        f_tt = theano.function([], [], updates={a: tensor.dot(b_t.T, c_t.T)},
+        f_tt = theano.function([], [], updates=[(a, tensor.dot(b_t.T, c_t.T))],
                mode=self.mode)
        # Try with all stride patterns, and all transposed pattern
@@ -1802,14 +1802,14 @@ class TestBlasStrides(TestCase):
        bt_dev = b_t.get_value(borrow=False, return_internal_type=True)
        ct_dev = c_t.get_value(borrow=False, return_internal_type=True)
-        f_nn = theano.function([], [], updates={a: l * tensor.dot(b, c)},
+        f_nn = theano.function([], [], updates=[(a, l * tensor.dot(b, c))],
                mode=self.mode)
-        f_nt = theano.function([], [], updates={a: l * tensor.dot(b, c_t.T)},
+        f_nt = theano.function([], [], updates=[(a, l * tensor.dot(b, c_t.T))],
                mode=self.mode)
-        f_tn = theano.function([], [], updates={a: l * tensor.dot(b_t.T, c)},
+        f_tn = theano.function([], [], updates=[(a, l * tensor.dot(b_t.T, c))],
                mode=self.mode)
        f_tt = theano.function([], [],
-                updates={a: l * tensor.dot(b_t.T, c_t.T)},
+                updates=[(a, l * tensor.dot(b_t.T, c_t.T))],
                mode=self.mode)
        # Try with all stride patterns, and all transposed pattern
@@ -1875,28 +1875,28 @@ class TestBlasStrides(TestCase):
        ct_dev = c_t.get_value(borrow=False, return_internal_type=True)
        f_nnn = theano.function([], [],
-                updates={a: (l * a + tensor.dot(b, c))},
+                updates=[(a, (l * a + tensor.dot(b, c)))],
                mode=self.mode)
        f_nnt = theano.function([], [],
-                updates={a: (l * a + tensor.dot(b, c_t.T))},
+                updates=[(a, (l * a + tensor.dot(b, c_t.T)))],
                mode=self.mode)
        f_ntn = theano.function([], [],
-                updates={a: (l * a + tensor.dot(b_t.T, c))},
+                updates=[(a, (l * a + tensor.dot(b_t.T, c)))],
                mode=self.mode)
        f_ntt = theano.function([], [],
-                updates={a: (l * a + tensor.dot(b_t.T, c_t.T))},
+                updates=[(a, (l * a + tensor.dot(b_t.T, c_t.T)))],
                mode=self.mode)
        f_tnn = theano.function([], [],
-                updates={a_t: (l * a_t + tensor.dot(b, c).T)},
+                updates=[(a_t, (l * a_t + tensor.dot(b, c).T))],
                mode=self.mode)
        f_tnt = theano.function([], [],
-                updates={a_t: (l * a_t + tensor.dot(b, c_t.T).T)},
+                updates=[(a_t, (l * a_t + tensor.dot(b, c_t.T).T))],
                mode=self.mode)
        f_ttn = theano.function([], [],
-                updates={a_t: (l * a_t + tensor.dot(b_t.T, c).T)},
+                updates=[(a_t, (l * a_t + tensor.dot(b_t.T, c).T))],
                mode=self.mode)
        f_ttt = theano.function([], [],
-                updates={a_t: (l * a_t + tensor.dot(b_t.T, c_t.T).T)},
+                updates=[(a_t, (l * a_t + tensor.dot(b_t.T, c_t.T).T))],
                mode=self.mode)
        # Try with all stride patterns, and all transposed pattern
@@ -1985,11 +1985,11 @@ class TestBlasStrides(TestCase):
        b_dev = b.get_value(borrow=False, return_internal_type=True)
        c_dev = c.get_value(borrow=False, return_internal_type=True)
-        f_n = theano.function([], [], updates={a: (a + l * tensor.dot(b, c))},
+        f_n = theano.function([], [], updates=[(a, (a + l * tensor.dot(b, c)))],
                mode=self.mode)
        f_t = theano.function([], [],
-                updates={a: (a + l * tensor.dot(b_t.T, c))},
+                updates=[(a, (a + l * tensor.dot(b_t.T, c)))],
                mode=self.mode)
        # Try with all stride patterns, and all transposed pattern
@@ -2041,11 +2041,11 @@ class TestBlasStrides(TestCase):
        c_dev = c.get_value(borrow=False, return_internal_type=True)
        f_n = theano.function([], [],
-                updates={a: (a + l * tensor.outer(b, c))},
+                updates=[(a, (a + l * tensor.outer(b, c)))],
                mode=self.mode)
        f_t = theano.function([], [],
-                updates={a_t: (a_t + l * tensor.outer(b, c).T)},
+                updates=[(a_t, (a_t + l * tensor.outer(b, c).T))],
                mode=self.mode)
        # Try with all stride patterns, and all transposed patterns

--- a/theano/tensor/tests/test_blas_c.py
+++ b/theano/tensor/tests/test_blas_c.py
@@ -185,7 +185,7 @@ class TestCGemv(TestCase, TestOptimizationMixin):
        #test the inplace version
        g = theano.function([], [],
-                updates={v2: v2 + theano.dot(m, v1)},
+                updates=[(v2, v2 + theano.dot(m, v1))],
                mode=self.mode)
        # Assert they produce the same output

--- a/theano/tensor/tests/test_sharedvar.py
+++ b/theano/tensor/tests/test_sharedvar.py
@@ -526,8 +526,8 @@ def makeSharedTester(shared_constructor_,
            s = self.cast_value(s)
            s_shared = self.shared_constructor(s)
            f = theano.function([],
-                                updates={s_shared:theano.dot(a_shared,b_shared)
+                                updates=[(s_shared, theano.dot(a_shared,b_shared)
-                                         +s_shared})
+                                         +s_shared)])
            topo=f.maker.fgraph.toposort()
            f()
            #[Gemm{inplace}(<TensorType(float64, matrix)>, 0.01, <TensorType(float64, matrix)>, <TensorType(float64, matrix)>, 2e-06)]
@@ -541,8 +541,8 @@ def makeSharedTester(shared_constructor_,
            #now test with the specify shape op in the output
            f = theano.function([], s_shared.shape,
-                                updates={s_shared:theano.dot(a_shared,b_shared)
+                                updates=[(s_shared, theano.dot(a_shared,b_shared)
-                                         +s_shared_specify})
+                                         +s_shared_specify)])
            topo=f.maker.fgraph.toposort()
            shp=f()
            assert numpy.all(shp == (40,40))
@@ -557,8 +557,8 @@ def makeSharedTester(shared_constructor_,
                    b_shared.get_value(borrow=True).shape)
            f = theano.function([], s_shared.shape,
-                                updates={s_shared:theano.dot(a_shared,b_shared)
+                                updates=[(s_shared, theano.dot(a_shared,b_shared)
-                                         +s_shared_specify})
+                                         +s_shared_specify)])
            topo=f.maker.fgraph.toposort()
            shp=f()
            assert numpy.all(shp == (40,40))

--- a/theano/tests/diverse_tests.py
+++ b/theano/tests/diverse_tests.py
@@ -8,8 +8,8 @@ import numpy.random
 from theano.tests  import unittest_tools as utt
 '''
-  Different tests that are not connected to any particular Op, or functionality of 
+  Different tests that are not connected to any particular Op, or functionality of
-  Theano. Here will go for example code that we will publish in papers, that we 
+  Theano. Here will go for example code that we will publish in papers, that we
  should ensure that it will remain operational
 '''
@@ -55,7 +55,7 @@ class T_scipy(unittest.TestCase):
        train = function(
            inputs=[x,y],
            outputs=[prediction, xent],
-            updates={w:w-0.1*gw, b:b-0.1*gb})
+            updates=[(w, w-0.1*gw), (b, b-0.1*gb)])
        predict = function(inputs=[x], outputs=prediction)
        N = 4

--- a/theano/tests/test_updates.py
+++ b/theano/tests/test_updates.py
 import unittest
 import theano
-from theano.updates import Updates
+from theano.updates import OrderedUpdates
 import theano.tensor as T
 class test_ifelse(unittest.TestCase):
    def test_updates_init(self):
-        self.assertRaises(TypeError, Updates, dict(d=3))
+        self.assertRaises(TypeError, OrderedUpdates, dict(d=3))
        sv = theano.shared('asdf')
-        Updates({sv:3})
+        OrderedUpdates({sv:3})
    def test_updates_setitem(self):
        ok = True
-        up = Updates()
+        up = OrderedUpdates()
        sv = theano.shared('asdf')
        # keys have to be SharedVariables
@@ -27,8 +27,8 @@ class test_ifelse(unittest.TestCase):
    def test_updates_add(self):
-        up1 = Updates()
+        up1 = OrderedUpdates()
-        up2 = Updates()
+        up2 = OrderedUpdates()
        a = theano.shared('a')
        b = theano.shared('b')

--- a/theano/updates.py
+++ b/theano/updates.py
@@ -8,23 +8,27 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>"
 __docformat__ = "restructuredtext en"
+from theano.gof.python25 import OrderedDict
 from theano.compile.sharedvalue import SharedVariable
 import logging
 logger = logging.getLogger('theano.updates')
+import warnings
-class Updates(dict):
+# Must be an OrderedDict or updates will be applied in a non-deterministic order
+class OrderedUpdates(OrderedDict):
    """
    Dict-like mapping from SharedVariable keys to their new values.
    This mapping supports the use of the "+" operator for the union of updates.
    """
    def __init__(self, *key, **kwargs):
-        ret = super(Updates, self).__init__(*key, **kwargs)
+        ret = super(OrderedUpdates, self).__init__(*key, **kwargs)
        for key in self:
            if not isinstance(key, SharedVariable):
                raise TypeError(
-                    'Updates keys must inherit from SharedVariable',
+                    'OrderedUpdates keys must inherit from SharedVariable',
                    key)
        return ret
@@ -38,12 +42,14 @@ class Updates(dict):
            # value. Should it be cast to a GPU value right away?  Should
            # literals be transformed into constants immediately?
-            return super(Updates, self).__setitem__(key, value)
+            return super(OrderedUpdates, self).__setitem__(key, value)
        else:
-            raise TypeError('Updates keys must inherit from SharedVariable',
+            raise TypeError('OrderedUpdates keys must inherit from SharedVariable',
                    key)
-    def update(self, other):
+    def update(self, other=None):
+        if other is None:
+            return
        for key, val in dict(other).iteritems():
            if key in self:
                if self[key] == val:
@@ -52,13 +58,17 @@ class Updates(dict):
            self[key] = val  # __setitem__ does type-checking
    def __add__(self, other):
-        rval = Updates()
+        rval = OrderedUpdates()
        rval.update(self)
        rval.update(other)
        return rval
    def __radd__(other, self):
-        rval = Updates()
+        rval = OrderedUpdates()
        rval.update(other)
        rval.update(self)
        return rval
+def Updates(*key, **kwargs):
+    warnings.warn("Updates is deprecated. Switch to OrderedUpdates.")
+    return OrderedUpdates(*key, **kwargs)