提交 ca79f02e authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged -- no conflict

...@@ -35,3 +35,5 @@ theano/version.py ...@@ -35,3 +35,5 @@ theano/version.py
theano/version.py.out theano/version.py.out
distribute-*.egg distribute-*.egg
distribute-*.tar.gz distribute-*.tar.gz
out1
out2
...@@ -11,8 +11,6 @@ How should you write your algorithm to make the most of what Theano can do? ...@@ -11,8 +11,6 @@ How should you write your algorithm to make the most of what Theano can do?
Limitations Limitations
----------- -----------
- Conditional control flow is possible but currently not efficient. The current implementation will evaluate both sides of an ``if`` construct (see :func:`tensor.switch`).
- While- or for-Loops within an expression graph are supported, but only via - While- or for-Loops within an expression graph are supported, but only via
the :func:`theano.scan` op (which puts restrictions on how the loop body can the :func:`theano.scan` op (which puts restrictions on how the loop body can
interact with the rest of the graph). interact with the rest of the graph).
......
"""Provides `DebugMode`, an evaluation mode for debugging theano internals.""" """Provides `DebugMode`, an evaluation mode for debugging theano internals.
:TODO: add support for IfElse Op, LazyLinker, PureOp, etc.
"""
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
import time, copy, sys, copy_reg, gc, os import time, copy, sys, copy_reg, gc, os
...@@ -1552,7 +1556,8 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions ...@@ -1552,7 +1556,8 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions
def __init__(self, inputs, outputs, optimizer, mode, def __init__(self, inputs, outputs, optimizer, mode,
accept_inplace = False, accept_inplace = False,
function_builder = Function): function_builder = Function,
profile=None):
""" """
:type inputs: a list of SymbolicInput instances :type inputs: a list of SymbolicInput instances
...@@ -1567,7 +1572,7 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions ...@@ -1567,7 +1572,7 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions
:note: this function sets TensorType.filter_checks_isfinite when `mode.check_isfinite` is True :note: this function sets TensorType.filter_checks_isfinite when `mode.check_isfinite` is True
""" """
self.profile = profile
# Handle the case where inputs and/or outputs is a single Variable (not in a list) # Handle the case where inputs and/or outputs is a single Variable (not in a list)
unpack_single = False unpack_single = False
return_none = False return_none = False
......
...@@ -7,12 +7,13 @@ _logger = logging.getLogger('theano.compile.function') ...@@ -7,12 +7,13 @@ _logger = logging.getLogger('theano.compile.function')
from io import In from io import In
from function_module import orig_function from function_module import orig_function
from profiling import ProfileStats
from pfunc import pfunc from pfunc import pfunc
from numpy import any #for to work in python 2.4 from numpy import any #for to work in python 2.4
def function(inputs, outputs=None, mode=None, updates=[], givens=[], def function(inputs, outputs=None, mode=None, updates=[], givens=[],
no_default_updates=False, accept_inplace=False, name=None, no_default_updates=False, accept_inplace=False, name=None,
rebuild_strict=True, allow_input_downcast=None): rebuild_strict=True, allow_input_downcast=None, profile=None):
""" """
Return a callable object that will calculate `outputs` from `inputs`. Return a callable object that will calculate `outputs` from `inputs`.
...@@ -62,6 +63,11 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[], ...@@ -62,6 +63,11 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[],
precise, type. None (default) is almost like False, but allows precise, type. None (default) is almost like False, but allows
downcasting of Python float scalars to floatX. downcasting of Python float scalars to floatX.
:type profile: None, True, or ProfileStats instance
:param profile: accumulate profiling information into a given ProfileStats
instance. If argument is `True` then a new ProfileStats instance will be
used. This profiling object will be available via self.profile.
:note: Regarding givens: Be careful to make sure that these substitutions are :note: Regarding givens: Be careful to make sure that these substitutions are
independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in
another expression is undefined. Replacements specified with givens are different from another expression is undefined. Replacements specified with givens are different from
...@@ -88,6 +94,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[], ...@@ -88,6 +94,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[],
if uses_In or uses_tuple: if uses_In or uses_tuple:
# we must use old semantics in this case. # we must use old semantics in this case.
if profile:
raise NotImplementedError('profiling not supported in old-style function')
if uses_updates or uses_givens: if uses_updates or uses_givens:
raise NotImplementedError("In() instances and tuple inputs triggers the old semantics, which disallow using updates and givens") raise NotImplementedError("In() instances and tuple inputs triggers the old semantics, which disallow using updates and givens")
fn = orig_function(inputs, outputs, fn = orig_function(inputs, outputs,
...@@ -102,7 +110,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[], ...@@ -102,7 +110,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[],
no_default_updates=no_default_updates, no_default_updates=no_default_updates,
accept_inplace=accept_inplace,name=name, accept_inplace=accept_inplace,name=name,
rebuild_strict=rebuild_strict, rebuild_strict=rebuild_strict,
allow_input_downcast=allow_input_downcast) allow_input_downcast=allow_input_downcast,
profile=profile)
# We need to add the flag check_aliased inputs if we have any mutable or # We need to add the flag check_aliased inputs if we have any mutable or
# borrowed used defined inputs # borrowed used defined inputs
fn._check_for_aliased_inputs = check_for_aliased_inputs fn._check_for_aliased_inputs = check_for_aliased_inputs
......
...@@ -5,6 +5,7 @@ __docformat__ = "restructuredtext en" ...@@ -5,6 +5,7 @@ __docformat__ = "restructuredtext en"
import copy import copy
import copy_reg import copy_reg
import cPickle
import itertools import itertools
import time import time
...@@ -15,7 +16,7 @@ from theano import gof ...@@ -15,7 +16,7 @@ from theano import gof
from theano.gof.python25 import partial from theano.gof.python25 import partial
import mode as mode_module import mode as mode_module
from io import In, SymbolicInput, SymbolicInputKit, SymbolicOutput from io import In, SymbolicInput, SymbolicInputKit, SymbolicOutput
from theano.configdefaults import config
import logging import logging
_logger = logging.getLogger('theano.compile.function_module') _logger = logging.getLogger('theano.compile.function_module')
...@@ -331,6 +332,7 @@ class Function(object): ...@@ -331,6 +332,7 @@ class Function(object):
self.unpack_single = unpack_single self.unpack_single = unpack_single
self.return_none = return_none self.return_none = return_none
self.maker = maker self.maker = maker
self.profile = None # reassigned in FunctionMaker.create
# We will be popping stuff off this `containers` object. It is a copy. # We will be popping stuff off this `containers` object. It is a copy.
containers = list(self.input_storage) containers = list(self.input_storage)
...@@ -495,6 +497,7 @@ class Function(object): ...@@ -495,6 +497,7 @@ class Function(object):
return cpy return cpy
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
profile = self.profile
t0 = time.time() t0 = time.time()
# Reinitialize each container's 'provided' counter # Reinitialize each container's 'provided' counter
...@@ -536,8 +539,7 @@ class Function(object): ...@@ -536,8 +539,7 @@ class Function(object):
for k, arg in kwargs.iteritems(): for k, arg in kwargs.iteritems():
self[k] = arg self[k] = arg
if (not hasattr(self, '_check_for_aliased_inputs') or
if ( not hasattr(self, '_check_for_aliased_inputs') or
self._check_for_aliased_inputs): self._check_for_aliased_inputs):
## Collect aliased inputs among the storage space ## Collect aliased inputs among the storage space
args_share_memory = [] args_share_memory = []
...@@ -592,9 +594,21 @@ class Function(object): ...@@ -592,9 +594,21 @@ class Function(object):
self.inv_finder[c])) self.inv_finder[c]))
# Do the actual work # Do the actual work
if profile:
t0_fn = time.time() t0_fn = time.time()
try:
self.fn() self.fn()
dt_fn = time.time() - t0_fn except:
if hasattr(self.fn, 'position_of_error'):
# this is a new vm-provided function
# the C VM needs this because the exception manipulation
# done by raise_with_op is not implemented in C.
gof.vm.raise_with_op(self.fn.nodes[self.fn.position_of_error])
else:
# old-style linkers raise their own exceptions
raise
if profile:
profile.vm_call_time += time.time() - t0_fn
# Retrieve the values that were computed # Retrieve the values that were computed
outputs = [x.data for x in self.output_storage] outputs = [x.data for x in self.output_storage]
...@@ -626,20 +640,18 @@ class Function(object): ...@@ -626,20 +640,18 @@ class Function(object):
if isinstance(value, gof.Container): if isinstance(value, gof.Container):
value = value.storage[0] value = value.storage[0]
self[i] = value self[i] = value
# #
# NOTE: This logic needs to be replicated in # NOTE: This logic needs to be replicated in
# scan. # scan.
# grep for 'PROFILE_CODE' # grep for 'PROFILE_CODE'
# #
if profile:
dt_call=time.time()-t0 dt_call=time.time()-t0
if hasattr(self.maker.mode,'fct_call_time'): profile.fct_callcount += 1
self.maker.mode.fct_call_time[self] += dt_call profile.fct_call_time += dt_call
self.maker.mode.fct_call[self] += 1 if hasattr(self.fn, 'update_profile'):
self.fn.update_profile(profile)
self.maker.mode.call_time += dt_call
self.maker.mode.fn_time += dt_fn
if self.return_none: if self.return_none:
return None return None
...@@ -687,9 +699,10 @@ def _pickle_Function(f): ...@@ -687,9 +699,10 @@ def _pickle_Function(f):
if (i < j) and isinstance(d_i, numpy.ndarray) and isinstance(d_j, numpy.ndarray): if (i < j) and isinstance(d_i, numpy.ndarray) and isinstance(d_j, numpy.ndarray):
if numpy.may_share_memory(d_i, d_j): if numpy.may_share_memory(d_i, d_j):
if f.pickle_aliased_memory_strategy == 'warn': if f.pickle_aliased_memory_strategy == 'warn':
_logger.warning('aliased relationship between Function arguments ' _logger.warning(('aliased relationship between'
'will not be preserved by un-pickling operation') ' Function arguments %s, %s'
#_logger.debug(str([d_i, d_j, id(d_i), id(d_j)])) ' will not be preserved by un-pickling'
' operation') %(str(d_i), str(d_j)))
else: else:
raise AliasedMemoryError(d_i, d_j) raise AliasedMemoryError(d_i, d_j)
...@@ -893,7 +906,8 @@ class FunctionMaker(object): ...@@ -893,7 +906,8 @@ class FunctionMaker(object):
raise TypeError("Unknown output type: %s (%s)", type(output), output) raise TypeError("Unknown output type: %s (%s)", type(output), output)
def __init__(self, inputs, outputs, def __init__(self, inputs, outputs,
mode = None, accept_inplace = False, function_builder = Function): mode = None, accept_inplace = False, function_builder = Function,
profile=None):
""" """
:type inputs: a list of SymbolicInput instances :type inputs: a list of SymbolicInput instances
...@@ -908,10 +922,20 @@ class FunctionMaker(object): ...@@ -908,10 +922,20 @@ class FunctionMaker(object):
:param accept_inplace: True iff it is acceptable to have inplace operations :param accept_inplace: True iff it is acceptable to have inplace operations
in the graph from the inputs to the outputs in the graph from the inputs to the outputs
""" """
mode = mode_module.get_mode(mode) mode = mode_module.get_mode(mode)
# figure out which profile object to use (if any)
# to help with forward-porting ProfileMode,
# we allow ProfileMode to provide a ProfileStats object
# using this somewhat awkward mechanism.
mode_profile = getattr(mode, 'profile', None)
if (profile is not None) and (mode_profile is not None):
raise TypeError(
'profile passed via both "mode" and "profile" arguments')
self.profile = profile = profile or mode_profile
# Handle the case where inputs and/or outputs is a single Variable (not in a list) # Handle the case where inputs and/or outputs is a single Variable (not in a list)
self.orig_outputs = outputs
unpack_single = False unpack_single = False
return_none = False return_none = False
if outputs is None: if outputs is None:
...@@ -951,7 +975,8 @@ class FunctionMaker(object): ...@@ -951,7 +975,8 @@ class FunctionMaker(object):
end_optimizer = time.time() end_optimizer = time.time()
finally: finally:
theano.config.compute_test_value = compute_test_value_orig theano.config.compute_test_value = compute_test_value_orig
mode.optimizer_time += end_optimizer - start_optimizer if profile:
profile.optimizer_time += end_optimizer - start_optimizer
_logger.debug('Optimizing took %f seconds' % (end_optimizer - start_optimizer)) _logger.debug('Optimizing took %f seconds' % (end_optimizer - start_optimizer))
#Add deep copy to respect the memory interface #Add deep copy to respect the memory interface
...@@ -1031,36 +1056,39 @@ class FunctionMaker(object): ...@@ -1031,36 +1056,39 @@ class FunctionMaker(object):
_fn, _i, _o = self.linker.make_thunk(input_storage = input_storage_lists) _fn, _i, _o = self.linker.make_thunk(input_storage = input_storage_lists)
end_linker = time.time() end_linker = time.time()
_logger.debug('Linker took %f seconds' % (end_linker - start_linker)) _logger.debug('Linker took %f seconds' % (end_linker - start_linker))
self.mode.linker_time += end_linker - start_linker if self.profile:
self.profile.linker_time += end_linker - start_linker
_fn.time_thunks = profile.flag_time_thunks
fn = self.function_builder(_fn, _i, _o, self.indices, self.outputs, defaults, self.unpack_single, self.return_none, self) fn = self.function_builder(_fn, _i, _o, self.indices, self.outputs, defaults, self.unpack_single, self.return_none, self)
return fn return fn
def _pickle_FunctionMaker(self):
kwargs = dict(
inputs = self.inputs,
outputs = self.orig_outputs,
mode = self.mode,
accept_inplace = self.accept_inplace,
function_builder = self.function_builder,
profile = self.profile,
)
return (_constructor_FunctionMaker, (kwargs,))
def _pickle_FunctionMaker(fm): def _constructor_FunctionMaker(kwargs):
if fm.return_none: return FunctionMaker(**kwargs)
outputs = None
else:
if fm.unpack_single:
outputs = fm.outputs[0]
else:
outputs = fm.outputs
#backport
#outputs = None if fm.return_none else (fm.outputs[0] if fm.unpack_single else fm.outputs)
rval = (_constructor_FunctionMaker, (fm.inputs, outputs, fm.mode, fm.accept_inplace))
return rval
def _constructor_FunctionMaker(*args):
return FunctionMaker(*args)
copy_reg.pickle(FunctionMaker, _pickle_FunctionMaker) copy_reg.pickle(FunctionMaker, _pickle_FunctionMaker)
def _pickle_slice(s):
return (slice, (s.start, s.stop, s.step))
copy_reg.pickle(slice, _pickle_slice)
try:
# Someone wrote this at one point, and I'm guessing it's because the default
# pickling mechanism doesn't work... so I'm adding a try/except around it.
# This way if the default implementation works we can just use it.
cPickle.dumps(slice(0, 10, 100))
except:
def _pickle_slice(s):
return (slice, (s.start, s.stop, s.step))
copy_reg.pickle(slice, _pickle_slice)
__checkers = [] __checkers = []
...@@ -1077,7 +1105,7 @@ def check_equal(x, y): ...@@ -1077,7 +1105,7 @@ def check_equal(x, y):
def register_checker(checker): def register_checker(checker):
__checkers.insert(0, checker) __checkers.insert(0, checker)
def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None): def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None, profile=None):
""" """
Return a Function that will calculate the outputs from the inputs. Return a Function that will calculate the outputs from the inputs.
...@@ -1105,6 +1133,8 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None) ...@@ -1105,6 +1133,8 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None)
:param accept_inplace: True iff the graph can contain inplace operations prior to the :param accept_inplace: True iff the graph can contain inplace operations prior to the
optimization phase (default is False) optimization phase (default is False)
:param profile: None or ProfileStats instance
""" """
#Every element of the input list will be upgraded to an `In` instance if necessary, #Every element of the input list will be upgraded to an `In` instance if necessary,
...@@ -1130,8 +1160,16 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None) ...@@ -1130,8 +1160,16 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None)
if not mode: if not mode:
raise ValueError("Please provide at least one mode.") raise ValueError("Please provide at least one mode.")
elif len(mode) == 1: elif len(mode) == 1:
fn = FunctionMaker(inputs, outputs, mode[0], accept_inplace = accept_inplace).create(defaults) fn = FunctionMaker(
inputs,
outputs,
mode[0],
accept_inplace = accept_inplace,
profile=profile).create(
defaults)
else: else:
if profile:
raise NotImplementedError('profiling not implemented in this kind of mode')
#return a different kind of function #return a different kind of function
def dup_defaults(): def dup_defaults():
# TODO This may need to be changed to use containers as defaults. # TODO This may need to be changed to use containers as defaults.
...@@ -1153,19 +1191,18 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None) ...@@ -1153,19 +1191,18 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None)
fn = maker1.create(defaults) fn = maker1.create(defaults)
else: else:
Maker = getattr(mode, 'function_maker', FunctionMaker) Maker = getattr(mode, 'function_maker', FunctionMaker)
fn = Maker(inputs, outputs, mode, accept_inplace = accept_inplace).create(defaults) fn = Maker(inputs,
outputs,
mode,
accept_inplace = accept_inplace,
profile=profile).create(
defaults)
t2 = time.time() t2 = time.time()
if hasattr(mode, 'compile_time'): if profile:
mode.compile_time+=t2-t1 profile.compile_time+=t2-t1
fn.name = name fn.name = name
if hasattr(mode,'fct_call_time'):
mode.fct_call_time.setdefault(fn,0)
if hasattr(mode,'fct_call'):
mode.fct_call.setdefault(fn,0)
return fn return fn
......
...@@ -4,7 +4,9 @@ import os, logging ...@@ -4,7 +4,9 @@ import os, logging
import numpy, theano import numpy, theano
from theano import gof from theano import gof
from theano.configparser import config, AddConfigVar, StrParam import theano.gof.vm
from theano.configparser import config, AddConfigVar, StrParam, EnumStr
_logger = logging.getLogger('theano.compile.mode') _logger = logging.getLogger('theano.compile.mode')
...@@ -55,7 +57,11 @@ predefined_linkers = { ...@@ -55,7 +57,11 @@ predefined_linkers = {
'c' : gof.CLinker(), 'c' : gof.CLinker(),
'c|py' : gof.OpWiseCLinker(allow_gc=True), 'c|py' : gof.OpWiseCLinker(allow_gc=True),
'c|py_nogc' : gof.OpWiseCLinker(allow_gc=False), 'c|py_nogc' : gof.OpWiseCLinker(allow_gc=False),
'c&py' : gof.DualLinker(checker = check_equal) 'c&py' : gof.DualLinker(checker = check_equal),
'vm' : gof.vm.VM_Linker(allow_gc=True, use_cloop=False),
'cvm' : gof.vm.VM_Linker(allow_gc=True, use_cloop=True),
'vm_nogc' : gof.vm.VM_Linker(allow_gc=False, use_cloop=False),
'cvm_nogc': gof.vm.VM_Linker(allow_gc=False, use_cloop=True),
} }
...@@ -249,6 +255,7 @@ class Mode(object): ...@@ -249,6 +255,7 @@ class Mode(object):
self._optimizer = optimizer self._optimizer = optimizer
self.call_time = 0 self.call_time = 0
self.fn_time = 0 self.fn_time = 0
linker.mode = self #TODO: WHY IS THIS HERE?
self.optimizer_time = 0 self.optimizer_time = 0
self.linker_time = 0 self.linker_time = 0
...@@ -290,15 +297,27 @@ class Mode(object): ...@@ -290,15 +297,27 @@ class Mode(object):
FAST_COMPILE = Mode('py', 'fast_compile') FAST_COMPILE = Mode('py', 'fast_compile')
FAST_RUN = Mode('c|py', 'fast_run') FAST_RUN = Mode('c|py', 'fast_run')
FAST_RUN_NOGC = Mode("c|py_nogc", 'fast_run') FAST_RUN_NOGC = Mode("c|py_nogc", 'fast_run')
SANITY_CHECK = [Mode('c|py', None),
Mode('c|py', 'fast_run')]
STABILIZE = Mode("c|py", OPT_STABILIZE) STABILIZE = Mode("c|py", OPT_STABILIZE)
predefined_modes = {'FAST_COMPILE': FAST_COMPILE, predefined_modes = {'FAST_COMPILE': FAST_COMPILE,
'FAST_RUN': FAST_RUN, 'FAST_RUN': FAST_RUN,
'FAST_RUN_NOGC':FAST_RUN_NOGC, 'FAST_RUN_NOGC':FAST_RUN_NOGC,
'SANITY_CHECK': SANITY_CHECK, 'STABILIZE': STABILIZE,
'STABILIZE': STABILIZE} 'VM':Mode('vm', 'fast_run'),
'VM_NOGC':Mode('vm_nogc', 'fast_run'),
'CVM':Mode('cvm', 'fast_run'),
'CVM_NOGC':Mode('cvm_nogc', 'fast_run'),
}
#Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
#The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
#The old all capital letter way of working is deprecated as it is not scalable.
AddConfigVar('mode',
"Default compilation mode",
EnumStr(*(predefined_modes.keys() + [
'Mode','DEBUG_MODE', 'PROFILE_MODE'])),
in_c_key=False)
instanciated_default_mode=None instanciated_default_mode=None
def get_mode(orig_string): def get_mode(orig_string):
...@@ -329,7 +348,7 @@ def get_mode(orig_string): ...@@ -329,7 +348,7 @@ def get_mode(orig_string):
ret = DebugMode(optimizer=config.optimizer) ret = DebugMode(optimizer=config.optimizer)
else: else:
# The import is needed in case string is ProfileMode # The import is needed in case string is ProfileMode
from profilemode import ProfileMode from profilemode import ProfileMode,prof_mode_instance_to_print
ret = eval(string+'(linker=config.linker, optimizer=config.optimizer)') ret = eval(string+'(linker=config.linker, optimizer=config.optimizer)')
elif predefined_modes.has_key(string): elif predefined_modes.has_key(string):
ret = predefined_modes[string] ret = predefined_modes[string]
...@@ -349,7 +368,6 @@ def get_mode(orig_string): ...@@ -349,7 +368,6 @@ def get_mode(orig_string):
#must tell python to print the summary at the end. #must tell python to print the summary at the end.
if string == 'ProfileMode': if string == 'ProfileMode':
#need to import later to break circular dependency. #need to import later to break circular dependency.
from profilemode import prof_mode_instance_to_print
prof_mode_instance_to_print.append(ret) prof_mode_instance_to_print.append(ret)
return ret return ret
...@@ -365,3 +383,4 @@ def register_mode(name, mode): ...@@ -365,3 +383,4 @@ def register_mode(name, mode):
if name in predefined_modes: if name in predefined_modes:
raise ValueError('Mode name already taken: %s' % name) raise ValueError('Mode name already taken: %s' % name)
predefined_modes[name] = mode predefined_modes[name] = mode
"""Provide a simple user friendly API """ """Provide a simple user friendly API """
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import numpy # for backport to 2.4, to get any().
from profiling import ProfileStats
from theano.gof import Container, Variable, generic, graph, Constant, Value from theano.gof import Container, Variable, generic, graph, Constant, Value
from theano.compile import orig_function, In, Out from theano.compile import orig_function, In, Out
from theano.compile.sharedvalue import SharedVariable, shared from theano.compile.sharedvalue import SharedVariable, shared
import numpy # for backport to 2.4, to get any(). from theano import config
def rebuild_collect_shared( outputs def rebuild_collect_shared( outputs
, inputs = None , inputs = None
...@@ -292,7 +295,8 @@ class Param(object): ...@@ -292,7 +295,8 @@ class Param(object):
def pfunc(params, outputs=None, mode=None, updates=[], givens=[], def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
no_default_updates=False, accept_inplace=False, name=None, no_default_updates=False, accept_inplace=False, name=None,
rebuild_strict=True, allow_input_downcast=None): rebuild_strict=True, allow_input_downcast=None,
profile=None):
"""Function-constructor for graphs with shared variables. """Function-constructor for graphs with shared variables.
:type params: list of either Variable or Param instances. :type params: list of either Variable or Param instances.
...@@ -319,11 +323,9 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -319,11 +323,9 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
If False (default), perform them all. Else, perform automatic updates on all Variables If False (default), perform them all. Else, perform automatic updates on all Variables
that are neither in "updates" nor in "no_default_updates". that are neither in "updates" nor in "no_default_updates".
:param name: an optional name for this fct. If used, the profile mode will print the time spent in this fct. :type name: None or string
:param name: attaches a name to the Profiling result of this function when
:rtype: theano.compile.Function using ProfileMode (will be deprecated).
:returns: a callable object that will compute the outputs (given the inputs)
and update the implicit function arguments according to the `updates`.
:type allow_input_downcast: Boolean :type allow_input_downcast: Boolean
:param allow_input_downcast: True means that the values passed as :param allow_input_downcast: True means that the values passed as
...@@ -333,6 +335,21 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -333,6 +335,21 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
precise, type. None (default) is almost like False, but allows precise, type. None (default) is almost like False, but allows
downcasting of Python float scalars to floatX. downcasting of Python float scalars to floatX.
:type profile: None, True, str, or ProfileStats instance
:param profile: accumulate profiling information into a given ProfileStats
instance. None is the default, and means to use the value of
config.profile.
If argument is `True` then a new ProfileStats instance will be
used. If argument is a string, a new ProfileStats instance will be created
with that string as its `message` attribute. This profiling object will be
available via self.profile.
:rtype: theano.compile.Function
:returns: a callable object that will compute the outputs (given the inputs)
and update the implicit function arguments according to the `updates`.
:note: Regarding givens: Be careful to make sure that these substitutions are :note: Regarding givens: Be careful to make sure that these substitutions are
independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in
another expression is undefined. Replacements specified with givens are different from another expression is undefined. Replacements specified with givens are different from
...@@ -354,6 +371,17 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -354,6 +371,17 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
# Then it clones the outputs and the update expressions. This rebuilds a computation graph # Then it clones the outputs and the update expressions. This rebuilds a computation graph
# from the inputs and the givens. # from the inputs and the givens.
# #
if profile is None:
profile = config.profile
# profile -> True or False
if profile == True:
profile = ProfileStats(message=name)
# profile -> object
if type(profile) == str:
profile = ProfileStats(message=profile)
# profile is typically either False or an object at this point.
# No need to block other objects being passed through though. It might be
# useful.
if not isinstance(params,(list,tuple)): if not isinstance(params,(list,tuple)):
raise Exception("in pfunc() the first argument must be a list or a tuple") raise Exception("in pfunc() the first argument must be a list or a tuple")
...@@ -393,7 +421,7 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -393,7 +421,7 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
inputs.append(si) inputs.append(si)
return orig_function(inputs, cloned_outputs, mode, return orig_function(inputs, cloned_outputs, mode,
accept_inplace=accept_inplace, name=name) accept_inplace=accept_inplace, name=name, profile=profile)
def _pfunc_param_to_in(param, strict=False, allow_downcast=None): def _pfunc_param_to_in(param, strict=False, allow_downcast=None):
......
...@@ -8,6 +8,8 @@ from theano.configparser import config, AddConfigVar, IntParam, BoolParam ...@@ -8,6 +8,8 @@ from theano.configparser import config, AddConfigVar, IntParam, BoolParam
from theano.compile.function_module import FunctionMaker from theano.compile.function_module import FunctionMaker
run_cthunk = None # Will be imported only when needed. run_cthunk = None # Will be imported only when needed.
from profiling import ProfileStats
import_time = time.time() import_time = time.time()
AddConfigVar('ProfileMode.n_apply_to_print', AddConfigVar('ProfileMode.n_apply_to_print',
...@@ -34,24 +36,53 @@ AddConfigVar('ProfileMode.profile_memory', ...@@ -34,24 +36,53 @@ AddConfigVar('ProfileMode.profile_memory',
class Profile_Maker(FunctionMaker): class Profile_Maker(FunctionMaker):
def create(self, input_storage=None, trustme=False): def create(self, input_storage=None, trustme=False):
ret = super(Profile_Maker,self).create(input_storage, trustme) ret = super(Profile_Maker,self).create(input_storage, trustme)
# create a function-specific storage container for profiling info
profile = ProfileStats(atexit_print=False)
self.mode.profile_stats[ret] = profile
ret.profile = profile
#initialize the timers
for i, node in enumerate(ret.maker.env.toposort()): for i, node in enumerate(ret.maker.env.toposort()):
self.mode.apply_time[(i,node)]=0.0 profile.apply_time[node]=0.0
assert len(ret.fn.thunk_groups[i])==1 profile.outputs_size[node]=[0.0] * len(node.outputs)
self.mode.op_cimpl[node.op] = hasattr(ret.fn.thunk_groups[i][0],'cthunk')
# a thunk_group is a list of the thunks from each linker
# corresponding to the i'th position in the toposort.
assert len(ret.fn.thunk_groups[i])==1
profile.apply_cimpl[node] = hasattr(
ret.fn.thunk_groups[i][0],
'cthunk')
# Here we replace the linker function.
# This ugliness makes WrapLinker (an object that *generates*
# functions and is not function-specific) work with ProfileStats
# objects which are function-specific.
#capture old fn in closure. This is important since new_fn is about to
#take its place as ret.fn.
ret_fn = ret.fn
def new_fn():
self.mode.apply_time = self.mode.profile_stats[ret].apply_time
self.mode.outputs_size = self.mode.profile_stats[ret].outputs_size
ret_fn()
# delete the old apply_time variable
# because it doesn't mean the same thing anymore.
# This prevents old code from looking like it still works.
del self.mode.apply_time
del self.mode.outputs_size
ret.fn = new_fn
return ret return ret
class ProfileMode(Mode): class ProfileMode(Mode):
def __init__(self, linker=config.linker, optimizer=config.optimizer): def __init__(self, linker=config.linker, optimizer=config.optimizer):
apply_time = {}
op_cimpl = {}
compile_time = 0 #time passed in theano.function()
fct_call_time = {}#time passed inside theano fct call including op time.
fct_call = {}
message="" message=""
outputs_size={} profile_stats={}
self.__setstate__((linker, optimizer, apply_time, op_cimpl, self.__setstate__((linker,
compile_time, fct_call_time, fct_call, message, outputs_size)) optimizer,
message,
profile_stats))
def function_maker(self, i,o,m, *args, **kwargs): def function_maker(self, i,o,m, *args, **kwargs):
"""Return an instance of `Profiler_Maker` which init the count""" """Return an instance of `Profiler_Maker` which init the count"""
...@@ -59,28 +90,24 @@ class ProfileMode(Mode): ...@@ -59,28 +90,24 @@ class ProfileMode(Mode):
assert m is self assert m is self
return Profile_Maker(i, o, self, *args, **kwargs) return Profile_Maker(i, o, self, *args, **kwargs)
local_time = property(lambda self: [sum(self.apply_time.values())]) def __get_local_time(self):
rval = 0
for ps in self.profile_stats.values():
rval += sum(ps.apply_time.values())
return rval
local_time = property(__get_local_time)
def __getstate__(self): def __getstate__(self):
#print "__getstate__",self.provided_linker,self.provided_optimizer #print "__getstate__",self.provided_linker,self.provided_optimizer
return (self.provided_linker, self.provided_optimizer, self.apply_time, return (self.provided_linker,
self.op_cimpl, self.compile_time, self.fct_call_time, self.provided_optimizer,
self.fct_call, self.message, self.outputs_size) self.message,
self.profile_stats)
def __setstate__(self, state): def __setstate__(self, state):
linker, optimizer, apply_time, op_cimpl, compile_time, \ linker, optimizer, message, profile_stats = state
fct_call_time, fct_call, message, outputs_size = state self.message = message
self.apply_time = apply_time self.profile_stats = profile_stats
self.op_cimpl = op_cimpl
self.compile_time = compile_time
self.fct_call_time = fct_call_time
self.fct_call = fct_call
self.call_time = 0
self.fn_time = 0
self.optimizer_time = 0
self.linker_time = 0
self.message = ""
self.outputs_size = outputs_size
def profile_thunk(i, node, th): def profile_thunk(i, node, th):
""" Profile only the execution time """ Profile only the execution time
...@@ -102,7 +129,7 @@ class ProfileMode(Mode): ...@@ -102,7 +129,7 @@ class ProfileMode(Mode):
th() th()
dt = time.time() - t0 dt = time.time() - t0
apply_time[(i,node)] += dt self.apply_time[node] += max(dt, 1e-14)
def profile_thunk2(i, node, th): def profile_thunk2(i, node, th):
...@@ -149,8 +176,8 @@ class ProfileMode(Mode): ...@@ -149,8 +176,8 @@ class ProfileMode(Mode):
else: else:
raise Exception("Can't determine the memory size of dtype",o[0].dtype) raise Exception("Can't determine the memory size of dtype",o[0].dtype)
size.append(s) size.append(s)
outputs_size[node]=size self.outputs_size[node]=size
apply_time[(i,node)] += dt self.apply_time[node] += max(dt, 1e-14)
self.provided_linker = linker self.provided_linker = linker
...@@ -182,22 +209,44 @@ class ProfileMode(Mode): ...@@ -182,22 +209,44 @@ class ProfileMode(Mode):
Currently there is n_apply_to_print, n_ops_to_print and min_memory_size Currently there is n_apply_to_print, n_ops_to_print and min_memory_size
that are accepted. that are accepted.
""" """
compile_time = sum([ps.compile_time for ps in self.profile_stats.values()])
fct_call = dict([(fn, ps.fct_callcount)
for (fn, ps) in self.profile_stats.items()])
fct_call_time = dict([(fn, ps.fct_call_time)
for (fn, ps) in self.profile_stats.items()])
apply_time = {}
for fn, ps in self.profile_stats.items():
for (i, node) in enumerate(fn.maker.env.toposort()):
apply_time[(i, node)] = ps.apply_time[node]
for (i,n),t in apply_time.items():
if t == 0:
print i, n
op_cimpl = {}
outputs_size = {}
for fn, ps in self.profile_stats.items():
op_cimpl.update(ps.apply_cimpl)
compile_time = self.compile_time
fct_call_time = self.fct_call_time
fct_call = self.fct_call
apply_time = self.apply_time
op_cimpl = self.op_cimpl
message = self.message message = self.message
outputs_size = self.outputs_size
other_time = {'linker_time':self.linker_time, outputs_size = {}
'optimizer_time':self.optimizer_time} for fn, ps in self.profile_stats.items():
outputs_size.update(ps.outputs_size)
other_time = dict(
linker_time = sum(
[ps.linker_time for ps in self.profile_stats.values()]),
optimizer_time = sum(
[ps.optimizer_time for ps in self.profile_stats.values()]))
self.print_summary_("print_summary", compile_time, fct_call_time, fct_call, self.print_summary_("print_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size, other_time, apply_time, op_cimpl, message, outputs_size,
self.local_time, other_time,
**kwargs) **kwargs)
def print_diff_summary(self, other, **kwargs): def print_diff_summary(self, other, **kwargs):
""" As print_summary, but print the difference on two different profile mode. """ As print_summary, but print the difference on two different profile mode.
TODO: Also we don't print the Apply-wise summary as it don't work for now. TODO: Also we don't print the Apply-wise summary as it don't work for now.
...@@ -240,7 +289,7 @@ class ProfileMode(Mode): ...@@ -240,7 +289,7 @@ class ProfileMode(Mode):
@staticmethod @staticmethod
def print_summary_(fct_name, compile_time, fct_call_time, fct_call, def print_summary_(fct_name, compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size, apply_time, op_cimpl, message, outputs_size,
other_time, local_time, other_time,
n_apply_to_print=config.ProfileMode.n_apply_to_print, n_apply_to_print=config.ProfileMode.n_apply_to_print,
n_ops_to_print=config.ProfileMode.n_ops_to_print, n_ops_to_print=config.ProfileMode.n_ops_to_print,
print_apply=True, print_apply=True,
...@@ -256,7 +305,6 @@ class ProfileMode(Mode): ...@@ -256,7 +305,6 @@ class ProfileMode(Mode):
whose outputs memory size is lower then that. whose outputs memory size is lower then that.
""" """
local_time = sum(apply_time.values())
total_time = time.time() - import_time total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values()) total_fct_time = sum(fct_call_time.values())
total_fct_call = sum(fct_call.values()) total_fct_call = sum(fct_call.values())
...@@ -312,7 +360,7 @@ class ProfileMode(Mode): ...@@ -312,7 +360,7 @@ class ProfileMode(Mode):
op_time[op]+=t op_time[op]+=t
nb_call = [v for k,v in fct_call.items() if k.maker.env is a.env][0] nb_call = [v for k,v in fct_call.items() if k.maker.env is a.env][0]
if t==0: if t==0:
assert nb_call == 0 assert nb_call == 0, nb_call
else: else:
op_call[op] += nb_call op_call[op] += nb_call
op_apply[op] += 1 op_apply[op] += 1
...@@ -429,8 +477,8 @@ class ProfileMode(Mode): ...@@ -429,8 +477,8 @@ class ProfileMode(Mode):
else: else:
fct_memory={}#env->dict(node->(outputs size)) fct_memory={}#env->dict(node->(outputs size))
var_mem = {} var_mem = {}
for node,val in outputs_size.items(): for node, val in outputs_size.items():
fct_memory.setdefault(node.env,{}) fct_memory.setdefault(node.env, {})
fct_memory[node.env][node]=val fct_memory[node.env][node]=val
for out,v in zip(node.outputs,val): for out,v in zip(node.outputs,val):
var_mem[out]=v var_mem[out]=v
...@@ -600,7 +648,7 @@ def atexit_print_default_profile_mode(): ...@@ -600,7 +648,7 @@ def atexit_print_default_profile_mode():
config.mode=PROFILE_MODE config.mode=PROFILE_MODE
""" """
for prof_mode in prof_mode_instance_to_print: for prof_mode in prof_mode_instance_to_print:
if sum(prof_mode.apply_time.values())>0: if prof_mode.local_time>0:
prof_mode.print_summary() prof_mode.print_summary()
#Register atexit_print_default_profile_mode to have the summary of the #Register atexit_print_default_profile_mode to have the summary of the
......
"""ProfileStats object for runtime and memory profiling.
"""
#
# TODO: measure memory usage like ProfileMode did
# TODO: put the optimization tips into a tips section??
# TODO: add tip to use specify_shape (is specify_shape even in library doc?)
# TODO: ensure field width for string fields makes columns line up
# TODO: what to do about 'diff summary'? (ask Fred?)
#
__authors__ = "James Bergstra"
__reviewer__ = "Razvan Pascanu"
__copyright__ = "(c) 2011, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
import atexit
import sys
import theano
from theano.configparser import AddConfigVar, StrParam, BoolParam
config = theano.config
_atexit_print_list = []
_atexit_print_file = sys.stderr
AddConfigVar('profiling.time_thunks',
"""Time individual thunks when profiling""",
BoolParam(True))
def _atexit_print_fn():
"""Print ProfileStat objects in _atexit_print_list to _atexit_print_file
"""
for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0:
ps.summary(file=_atexit_print_file)
else:
print 'Skipping empty Profile'
atexit.register(_atexit_print_fn)
class ProfileStats(object):
    """
    Object to store runtime and memory profiling information for all of
    Theano's operations: compilation, optimization, execution.
    """
    #
    # Note on implementation:
    # Class variables are used here so that each one can be
    # documented and initialized together.
    # dictionary variables are initialized with None.
    #

    compile_time = 0.0
    # Total time spent in body of orig_function,
    # dominated by graph optimization and compilation of C
    #

    fct_call_time = 0.0
    # The total time spent in Function.__call__
    #

    fct_callcount = 0
    # Number of calls to Function.__call__
    #

    vm_call_time = 0.0
    # Total time spent in Function.fn.__call__
    #

    apply_time = None
    # dict from node -> float runtime
    #

    apply_callcount = None
    # dict from node -> number of executions
    #

    apply_cimpl = None
    # dict from node -> bool (1 if c, 0 if py)
    #

    message = None
    # pretty string to print in summary, to identify this output
    #

    outputs_size = None
    # node -> size of allocated output
    #

    optimizer_time = 0.0
    # time spent optimizing graph (FunctionMaker.__init__)

    linker_time = 0.0
    # time spent linking graph (FunctionMaker.create)

    # param is called flag_time_thunks because most other attributes with time
    # in the name are times *of* something, rather than configuration flags.
    def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
        """
        :param atexit_print: bool. True means that this object will be
            printed to stderr (using .summary()) at the end of the program.
        :param flag_time_thunks: bool or None. Whether to time individual
            thunks; None means defer to the config.profiling.time_thunks
            flag.
        :param kwargs: misc initializers. These should (but need not) match
            the names of the class vars declared in this class.
        """
        self.apply_callcount = {}
        # NOTE(review): 'output_size' looks like a typo duplicate of
        # 'outputs_size' (initialized below); nothing visible here reads
        # it -- confirm against callers before removing.
        self.output_size = {}
        self.apply_time = {}
        self.apply_cimpl = {}
        self.outputs_size = {}
        if flag_time_thunks is None:
            self.flag_time_thunks = config.profiling.time_thunks
        else:
            self.flag_time_thunks = flag_time_thunks
        # Allow callers to override any of the class-level defaults above.
        self.__dict__.update(kwargs)
        if atexit_print:
            # Queue this object for the atexit summary dump
            # (see _atexit_print_fn).
            global _atexit_print_list
            _atexit_print_list.append(self)
def op_time(self):
"""dict op -> total time on thunks"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, t in self.apply_time.items():
rval.setdefault(node.op, 0)
rval[node.op] += t
return rval
def op_callcount(self):
"""dict op -> total number of thunk calls"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += count
return rval
def op_nodes(self):
"""dict op -> total number of nodes"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += 1
return rval
    def op_impl(self):
        """Return a dict mapping each Op to 'C ' if its nodes ran the C
        implementation, or 'Py' if they ran the Python one.
        """
        # Implementation info is stored by node; summarize by Op on demand.
        # If the same Op appears with both implementations, whichever node
        # is visited last wins (dict iteration order is arbitrary here).
        rval = {}
        for node in self.apply_callcount:
            if self.apply_cimpl[node]:
                rval[node.op] = 'C '
            else:
                rval[node.op] = 'Py'
        return rval
def op_flops(self):
"""dict op -> total number of flops"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
return rval #TODO: continue here
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += 1
return rval
for a,t in op_time.items():
if hasattr(a,'flops'):
op_flops[a]=a.flops*op_call[a]/t/1e6
flops_msg=''
if op_flops:
flops_msg=' <MFlops/s>'
print '\nHACK WARNING: we print the flops for some OP, but the logic don\' always work. You need to know the internal of Theano to make it work correctly. Otherwise don\'t use!'
print '\nOp-wise summary: <%% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> %s <nb_call> <nb apply> <Op name>'%(flops_msg)
def summary_ops(self, file=sys.stderr, N=None):
if self.apply_time:
local_time = sum(self.apply_time.values())
else:
local_time = 0
if local_time == 0:
print >> file, ('ProfileMode.summary_ops: total time 0'
' (did you forget to enable counters?)')
return
op_time = self.op_time()
op_call = self.op_callcount()
op_apply = self.op_nodes()
op_flops = self.op_flops()
op_impl = self.op_impl()
if N is None:
N = len(self.op_flops)
otimes = [(t*100/local_time,
t,
op,
op_impl.get(op, ' '),
op_call.get(op, 0),
op_apply.get(op,0))
for op, t in op_time.items()]
otimes.sort()
otimes.reverse()
tot=0
print >> file, 'Ops'
print >> file, '---'
print >> file, '<% time> <cumulative %%> <apply time> <cumulative seconds> <time per call> <nb_call> <Op name>'
for f,t,a,impl,nb_call,nb_apply in otimes[:N]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
if op_flops:
print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (
f, ftot, t, tot, t/nb_call, impl, op_flops.get(a,-1), nb_call, nb_apply, a)
else:
print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
print >>file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
%(max(0, len(otimes)-N),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
print >> file, ''
    def summary_nodes(self, file=sys.stderr, N=None):
        """Print an Apply-node-wise breakdown of thunk run time to `file`.

        :param file: open file object to print the report to.
        :param N: number of Apply nodes to print, worst offenders first;
            None means print all of them (``[:None]`` is a full slice).
        """
        if self.apply_time:
            local_time = sum(self.apply_time.values())
        else:
            local_time = 0
        if local_time == 0:
            print >> file, ('ProfileMode.summary_nodes: total time 0'
                    ' (did you forget to enable counters?)')
            return
        print >> file, 'Thunks'
        print >> file, '------'
        print >> file, '<% time> <cumulative %%> <apply time> <cumulative seconds> <time per call> <nb_call> <Apply Op name>'
        # One tuple per node: (percent of local_time, seconds, node,
        # nb calls); sorted so the worst offenders come first.
        atimes = [(
            t*100/local_time,
            t,
            a,
            self.apply_callcount[a])
            for a, t in self.apply_time.items()]
        atimes.sort()
        atimes.reverse()
        tot = 0
        for (f, t, a, nb_call) in atimes[:N]:
            # Note: t is folded into the cumulative total before the
            # nb_call==0 skip below.
            tot += t
            ftot = tot*100/local_time
            if nb_call == 0:
                continue
            print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%(
                f, ftot, t, tot, t/nb_call, nb_call, str(a))
        print >> file, ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
            %(max(0, len(atimes)-N),
            sum(f for f, t, a, nb_call in atimes[N:]),
            sum(t for f, t, a, nb_call in atimes[N:]))
        print >> file, ''
    def summary_function(self, file):
        """Print the function-level profile (call count and wall-clock
        times) to `file`."""
        print >> file, 'Function profiling'
        print >> file, '=================='
        print >> file, ' Message: %s'%self.message
        print >> file, ' Time in %i calls to Function.__call__: %es' % (
            self.fct_callcount, self.fct_call_time)
        if self.fct_call_time > 0:
            # Guarded so the percentage divisions below cannot divide by
            # zero when the function was never called/timed.
            print >> file, ' Time in Function.fn.__call__: %es (%.3f%%)' %(
                self.vm_call_time, 100*self.vm_call_time / self.fct_call_time)
            local_time = sum(self.apply_time.values())
            if local_time > 0:
                print >> file, ' Time in thunks: %es (%.3f%%)' %(
                    local_time, 100*local_time / self.fct_call_time)
        print >> file, ''
    def summary(self, file=sys.stderr, n_ops_to_print=20, n_applies_to_print=20):
        """Print the full profile report: function-level stats, then the
        per-Op and per-Apply-node breakdowns.

        The breakdowns are printed only when thunk timing was actually
        collected (see flag_time_thunks / config profiling.time_thunks).
        """
        self.summary_function(file)
        local_time = sum(self.apply_time.values())
        if local_time > 0:
            self.summary_ops(file, n_ops_to_print)
            self.summary_nodes(file, n_applies_to_print)
        else:
            print >> file, " No node time accumulated (hint: try config profiling.time_thunks=1)"
if 0: # old code still to be ported from ProfileMode
def long_print(self, file=sys.stderr, fct_name=None, message=None,
n_apply_to_print=15, n_ops_to_print=20, print_apply=False):
"""
Print a readable summary of the stats.
param: n_apply_to_print the number of apply to print. Default 15.
param: n_ops_to_print the number of ops to print. Default 20.
"""
local_time = sum(self.apply_time.values())
print ''
print 'ProfileMode.long_print()'
print 'name = %s'%fct_name
print 'msg = %s'%message
print '---------------------------'
print ''
print 'Total time spent running thunks: %.3fs'% local_time
sop_time={}
sop_call={}
sop_op = {}
sop_c={} #map each op class to Bool. True iff all applies were done in c.
for a,t in op_time.items():
typ = type(a)
sop_time.setdefault(typ,0)
sop_time[typ]+=t
sop_op.setdefault(typ,0)
sop_op[typ]+=1
sop_c.setdefault(typ,True)
sop_c[typ]=sop_c[typ] and op_cimpl.get(a, False)
sop_call[typ]=sop_call.get(typ,0)+op_call[a]
print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> <nb_call> <nb_op> <nb_op> <Op name>'
sotimes = [(t*100/local_time, t, a, sop_c[a], sop_call[a], sop_op[a]) for a, t in sop_time.items()]
sotimes.sort()
sotimes.reverse()
tot=0
for f,t,a,ci, nb_call, nb_op in sotimes[:n_ops_to_print]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
if ci:
msg = '*'
else:
msg = ' '
print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_op, a)
print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(sotimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]),
sum(t for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]))
total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values())
total_fct_call = sum(fct_call.values())
other_time = total_time - total_fct_time - compile_time
print
print 'Theano fct summary: <% total fct time> <total time> <time per call> <nb call> <fct name>'
for key in fct_call.keys():
if fct_call[key]>0:
print ' %4.1f%% %.3fs %.2es %d %s'%(fct_call_time[key]/total_fct_time*100 ,fct_call_time[key],
fct_call_time[key]/fct_call[key], fct_call[key], key.name)
else:
print ' NOT CALLED',key.name
if total_fct_time>0:
time_pr_in_fct=local_time/total_fct_time*100
time_per_call=total_fct_time/total_fct_call
else:
time_pr_in_fct=0
time_per_call=0
print
print 'Time since import %.3fs'%(total_time)
print 'Compile time: %.3fs %.1f%%'%(compile_time, compile_time/total_time*100)
print 'Theano fct call %.3fs %.1f%%'%(total_fct_time,total_fct_time/total_time*100)
print ' Theano Op time (included in fct call, Time spent running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)'% (local_time,local_time/total_time*100, time_pr_in_fct)
print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100)
print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call)
print
print "List of apply that don't have float64 as input but have float64 in outputs. Usefull to know if we forgot some cast when using floatX=float32 or gpu code."
print '<Apply> <Apply position> <fct name> <inputs type> <outputs type>'
for fct in fct_call.keys():
for idx, node in enumerate(fct.maker.env.toposort()):
if any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.outputs) and not any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.inputs):
print str(node), idx, fct.name, str([getattr(i,'dtype',None) for i in node.inputs]),str([getattr(i,'dtype',None) for i in node.outputs])
if any([x[2].__name__.startswith("Gpu") for x in sotimes]):
cpu=[]
gpu=[]
trans=[]
for so in sotimes:
if so[2].__name__ in ["HostFromGpu", "GpuFromHost"]:
trans.append(so)
elif so[2].__name__.startswith("Gpu"):
gpu.append(so)
else:
cpu.append(so)
sum_cpu=sum(so[1] for so in cpu)
sum_gpu=sum(so[1] for so in gpu)
sum_trans=sum(so[1] for so in trans)
print
print "Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op"%(
sum_cpu, sum_cpu/local_time*100, sum_gpu, sum_gpu/local_time*100, sum_trans, sum_trans/local_time*100)
print "Theano function input that are float64"
print "<fct name> <input name> <input type> <str input>"
for fct in fct_call.keys():
for i in fct.input_storage:
if hasattr(i.type, 'dtype') and i.type.dtype=='float64':
print fct.name, i.name, i.type, i
if outputs_size:
fct_memory={}#env->dict(node->(outputs size))
var_mem = {}
for node,val in outputs_size.items():
fct_memory.setdefault(node.env,{})
fct_memory[node.env][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
for env,nodes_mem in fct_memory.iteritems():
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.env is env][0].name
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = env.toposort()
computed, last_user = gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in env.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
for key,val in items[:n_apply_to_print]:
code = ['c']*len(node.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
print ' ... (remaining %i Apply account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(nodes_mem)-n_ops_to_print),
sum(sum(val) for key, val in items[n_ops_to_print:]),
sum(sum(val) for key, val in items[n_ops_to_print:])/size_sum)
print
print "Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list). Test them first as they are not guaranteed to always provide a speedup."
from theano import tensor as T
from theano.tensor.raw_random import RandomFunction
import theano
import theano.scalar as scal
scalar_op_amdlibm_no_speed_up = [scal.LT, scal.GT, scal.LE, scal.GE, scal.EQ, scal.NEQ, scal.InRange, scal.Switch, scal.OR, scal.XOR, scal.AND, scal.Invert, scal.Maximum, scal.Minimum, scal.Add, scal.Mul, scal.Sub, scal.TrueDiv, scal.IntDiv, scal.Clip, scal.First, scal.Second, scal.Identity, scal.Cast, scal.Sgn, scal.Neg, scal.Inv, scal.Sqr ]
scalar_op_amdlibm_speed_up = [scal.Mod, scal.Pow, scal.Ceil, scal.Floor, scal.RoundHalfToEven, scal.RoundHalfAwayFromZero, scal.Log, scal.Log2, scal.Log10, scal.Log1p, scal.Exp, scal.Sqrt, scal.Abs, scal.Cos, scal.Sin, scal.Tan, scal.Tanh, scal.Cosh, scal.Sinh, T.nnet.sigm.ScalarSigmoid, T.nnet.sigm.ScalarSoftplus ]#Abs, Mod in float{32,64} only
def get_scalar_ops(s):
if isinstance(s, theano.scalar.Composite):
l = []
for node in s.env.toposort():
l+=get_scalar_ops(node.op)
return l
else: return [s]
def list_scalar_op(op):
if isinstance(op.scalar_op, theano.scalar.Composite):
return get_scalar_ops(op.scalar_op)
else: return [op.scalar_op]
def amdlibm_speed_up(op):
if not isinstance(op, T.Elemwise):
return False
else:
l = list_scalar_op(op)
for s_op in l:
if s_op.__class__ in scalar_op_amdlibm_speed_up:
return True
elif s_op.__class__ not in scalar_op_amdlibm_no_speed_up:
import pdb;pdb.set_trace()
print "We don't know if amdlibm will accelerate this scalar op.", s_op
return False
def exp_float32_op(op):
if not isinstance(op, T.Elemwise):
return False
else:
l = list_scalar_op(op)
return any([s_op.__class__ in [scal.Exp] for s_op in l])
#tip 1
if config.floatX=='float64':
print " - Try the Theano flag floatX=float32"
#tip 2
if not config.lib.amdlibm and any([amdlibm_speed_up(a.op) for i,a in apply_time]):
print " - Try installing amdlibm and set the Theano flag lib.amdlibm=True. This speed up only some Elemwise operation."
#tip 3
if not config.lib.amdlibm and any([exp_float32_op(a.op) and a.inputs[0].dtype=='float32' for i,a in apply_time]):
print " - With the default gcc libm, exp in float32 is slower then in float64! Try Theano flags floatX=float64 or install amdlibm and set the theano flags lib.amdlibm=True"
#tip 4
for a, t in apply_time.iteritems():
node = a
if isinstance(node.op, T.Dot) and all([ len(i.type.broadcastable)==2 for i in node.inputs]):
print " - You have a dot operation that was not optimized to dot22 that is faster. Make sure the inputs are float32 or 64 and are the same for both input. Currently they are:",[i.type for i in node.inputs]
#tip 5
for a, t in apply_time.iteritems():
node = a
if isinstance(node.op, RandomFunction):
print " - Replace the default random number generator by 'from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams' as this is is faster. It is still experimental, but seam to work correctly."
if config.device.startswith("gpu"):
print " - MRG_RandomStreams is the only random number supported on the GPU."
break
def print_summary(self,
n_apply_to_print=config.ProfileMode.n_apply_to_print,
n_ops_to_print=config.ProfileMode.n_ops_to_print):
""" Print 3 summary that show where the time is spend. The first show an Apply-wise summary, the second show an Op-wise summary, the third show an type-Op-wise summary.
The Apply-wise summary print the timing information for the worst offending Apply nodes. This corresponds to individual Op applications within your graph which take the longest to execute (so if you use dot twice, you will see two entries there).
The Op-wise summary print the execution time of all Apply nodes executing the same Op are grouped together and the total execution time per Op is shown (so if you use dot twice, you will see only one entry there corresponding to the sum of the time spent in each of them). If two Op have different hash value, they will be separate.
The type-Op-wise summary group the result by type of op. So event if two Op have different hash value, they will be merged.
Their is an hack with the Op-wise summary. Go see it if you want to know more.
:param n_apply_to_print: the number of apply to print. Default 15, or n_ops_to_print flag.
:param n_ops_to_print: the number of ops to print. Default 20, or n_apply_to_print flag.
"""
fct_call_time = self.mode.fct_call_time
fct_call = self.mode.fct_call
apply_time = self.apply_time
op_cimpl = self.op_cimpl
message = self.message
outputs_size = self.outputs_size
self.print_summary_("print_summary",
None,
None,
None,
apply_time,
op_cimpl,
message,
outputs_size,
n_apply_to_print,
n_ops_to_print)
def print_diff_summary(self, other, n_apply_to_print=15, n_ops_to_print=20):
""" As print_summary, but print the difference on two different profile mode.
TODO: Also we don't print the Apply-wise summary as it don't work for now.
TODO: make comparaison with gpu code.
:param other: the other instance of ProfileMode that we want to be compared to.
:param n_apply_to_print: the number of apply to print. Default 15.
:param n_ops_to_print: the number of ops to print. Default 20.
"""
def diff_dict(a_time,b_time_):
r = {}
b_time = copy.copy(b_time_)
for a,ta in a_time.items():
r.setdefault(a,0)
tb = b_time.pop(a,0)
r[a]+=ta-tb
#they are missing in a
for a,t in b_time.items():
r.setdefault(a,0)
r[a]+=t
return r
compile_time = self.compile_time-other.compile_time
fct_call_time = diff_dict(self.fct_call_time,other.fct_call_time)
fct_call = diff_dict(self.fct_call,other.fct_call)
apply_time = diff_dict(self.apply_time, other.apply_time)
op_cimpl = self.op_cimpl and other.op_cimpl
message = self.message
outputs_size = diff_dict(self.outputs_size,other.outputs_size)
self.print_summary_("print_diff_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size,
n_apply_to_print=n_apply_to_print,
n_ops_to_print=n_ops_to_print, print_apply=False)
"""
Test compilation modes
"""
from nose.plugins.skip import SkipTest
import unittest
import theano
import numpy
import random
import numpy.random
from theano.tests import unittest_tools as utt
import theano.tensor as T
class T_bunch_of_modes(unittest.TestCase):
    """Smoke-test every registered compilation mode."""

    def test1(self):
        # this is a quick test after the LazyLinker branch merge
        # to check that all the current modes can still be used.
        linker_classes_involved = []
        # Iterate every mode name registered on the 'mode' config variable.
        for modename in theano.config.__class__.__dict__['mode'].all:
            x = T.matrix()
            y = T.vector()
            f = theano.function([x, y], x + y, mode=modename)
            # test that it runs something
            f([[1, 2], [3, 4]], [5, 6])
            linker_classes_involved.append(f.maker.mode.linker.__class__)
            print 'MODE:', modename, f.maker.mode.linker, 'stop'
        # regression check:
        # there should be
        # - VM_Linker
        # - OpWiseCLinker (FAST_RUN)
        # - WrapLinker (PROFILE_MODE)
        # - PerformLinker (FAST_COMPILE)
        # - DebugMode's Linker (DEBUG_MODE)
        assert 5 == len(set(linker_classes_involved))
if __name__ == '__main__':
unittest.main()
...@@ -65,15 +65,6 @@ AddConfigVar('force_device', ...@@ -65,15 +65,6 @@ AddConfigVar('force_device',
BoolParam(False, allow_override=False), BoolParam(False, allow_override=False),
in_c_key=False) in_c_key=False)
#Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
#The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
#The old all capital letter way of working is deprecated as it is not scalable.
AddConfigVar('mode',
"Default compilation mode",
EnumStr('Mode', 'ProfileMode', 'DebugMode', 'FAST_RUN',
'FAST_COMPILE', 'PROFILE_MODE', 'DEBUG_MODE'),
in_c_key=False)
# Test whether or not gcc is present: disable C code if it is not. # Test whether or not gcc is present: disable C code if it is not.
# Using the dummy file descriptor below is a workaround for a crash experienced # Using the dummy file descriptor below is a workaround for a crash experienced
# in an unusual Python 2.4.4 Windows environment with the default stdin=None. # in an unusual Python 2.4.4 Windows environment with the default stdin=None.
...@@ -84,13 +75,15 @@ try: ...@@ -84,13 +75,15 @@ try:
# Keep the default linker the same as the one for the mode FAST_RUN # Keep the default linker the same as the one for the mode FAST_RUN
AddConfigVar('linker', AddConfigVar('linker',
"Default linker used if the theano flags mode is Mode or ProfileMode", "Default linker used if the theano flags mode is Mode or ProfileMode",
EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py'), EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py',
'vm', 'cvm', 'vm_nogc', 'cvm_nogc'),
in_c_key=False) in_c_key=False)
except OSError: except OSError:
# gcc is not present, linker should default to python only # gcc is not present, linker should default to python only
AddConfigVar('linker', AddConfigVar('linker',
"Default linker used if the theano flags mode is Mode or ProfileMode", "Default linker used if the theano flags mode is Mode or ProfileMode",
EnumStr('py', 'c|py', 'c', 'c|py_nogc', 'c&py'), EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py',
'vm', 'cvm', 'vm_nogc', 'cvm_nogc'),
in_c_key=False) in_c_key=False)
warning('GCC not detected ! Theano will be unable to execute optimized '+ warning('GCC not detected ! Theano will be unable to execute optimized '+
'C-implementations (for both CPU and GPU) and will default to '+ 'C-implementations (for both CPU and GPU) and will default to '+
...@@ -145,10 +138,6 @@ AddConfigVar('op.set_flops', ...@@ -145,10 +138,6 @@ AddConfigVar('op.set_flops',
BoolParam(False), BoolParam(False),
in_c_key=False) in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False))
AddConfigVar('gpuelemwise.sync', AddConfigVar('gpuelemwise.sync',
"when true, wait that the gpu fct finished and check it error code.", "when true, wait that the gpu fct finished and check it error code.",
BoolParam(True)) BoolParam(True))
......
...@@ -146,7 +146,7 @@ from link import \ ...@@ -146,7 +146,7 @@ from link import \
Container, Linker, LocalLinker, PerformLinker, WrapLinker, WrapLinkerMany Container, Linker, LocalLinker, PerformLinker, WrapLinker, WrapLinkerMany
from op import \ from op import \
Op Op, PureOp
from opt import (Optimizer, optimizer, SeqOptimizer, from opt import (Optimizer, optimizer, SeqOptimizer,
MergeOptimizer, MergeOptMerge, MergeOptimizer, MergeOptMerge,
......
...@@ -1312,6 +1312,7 @@ def gcc_module_compile_str(module_name, src_code, location=None, include_dirs=[] ...@@ -1312,6 +1312,7 @@ def gcc_module_compile_str(module_name, src_code, location=None, include_dirs=[]
#DSE Patch 1 for supporting OSX frameworks; add -framework Python #DSE Patch 1 for supporting OSX frameworks; add -framework Python
if sys.platform=='darwin' : if sys.platform=='darwin' :
preargs.extend(['-undefined','dynamic_lookup']) preargs.extend(['-undefined','dynamic_lookup'])
python_inc = distutils.sysconfig.get_python_inc()
# link with the framework library *if specifically requested* # link with the framework library *if specifically requested*
# config.mac_framework_link is by default False, since on some mac # config.mac_framework_link is by default False, since on some mac
# installs linking with -framework causes a Bus Error # installs linking with -framework causes a Bus Error
......
...@@ -311,6 +311,9 @@ class Env(utils.object2): ...@@ -311,6 +311,9 @@ class Env(utils.object2):
self.__import_r__([new_r]) self.__import_r__([new_r])
self.__add_clients__(new_r, [(node, i)]) self.__add_clients__(new_r, [(node, i)])
prune = self.__remove_clients__(r, [(node, i)], False) prune = self.__remove_clients__(r, [(node, i)], False)
# Precondition: the substitution is semantically valid
# However it may introduce cycles to the graph, in which case the
# transaction will be reverted later.
self.execute_callbacks('on_change_input', node, i, r, new_r, reason=reason) self.execute_callbacks('on_change_input', node, i, r, new_r, reason=reason)
if prune: if prune:
...@@ -438,16 +441,32 @@ class Env(utils.object2): ...@@ -438,16 +441,32 @@ class Env(utils.object2):
if len(self.nodes) < 2: if len(self.nodes) < 2:
# optimization # optimization
# when there are 0 or 1 nodes, no sorting is necessary # when there are 0 or 1 nodes, no sorting is necessary
# This special case happens a lot because the OpWiseCLinker produces
# 1-element graphs.
return list(self.nodes) return list(self.nodes)
env = self env = self
ords = {} ords = self.orderings()
for feature in env._features:
if hasattr(feature, 'orderings'):
for op, prereqs in feature.orderings(env).items():
ords.setdefault(op, []).extend(prereqs)
order = graph.io_toposort(env.inputs, env.outputs, ords) order = graph.io_toposort(env.inputs, env.outputs, ords)
return order return order
def orderings(self):
"""
Return dict d s.t. d[node] is a list of nodes that must be evaluated
before node itself can be evaluated.
This is used primarily by the destroy_handler feature to ensure that all
clients of any destroyed inputs have already computed their outputs.
"""
ords = {}
for feature in self._features:
if hasattr(feature, 'orderings'):
for node, prereqs in feature.orderings(self).items():
ords.setdefault(node, []).extend(prereqs)
# eliminate duplicate prereqs
for (node,prereqs) in ords.items():
ords[node] = list(set(prereqs))
return ords
def nclients(self, r): def nclients(self, r):
"""WRITEME Same as len(self.clients(r)).""" """WRITEME Same as len(self.clients(r))."""
return len(self.clients(r)) return len(self.clients(r))
......
#include <Python.h>
#include "structmember.h"
/**
TODO:
- Check max supported depth of recursion
- CLazyLinker should add context information to errors caught during evaluation. Say what node we were on, add the traceback attached to the node.
 - Clear containers of fully-used intermediate results if allow_gc is 1
- Add timers for profiling
- Add support for profiling space used.
*/
#include <time.h>
/* Convert a timeval to seconds, expressed as a double.
 * If `tv` is NULL, the current wall-clock time is sampled instead. */
static double pytime(const struct timeval * tv)
{
  struct timeval now;
  if (tv == NULL)
  {
    gettimeofday(&now, NULL);
    tv = &now;
  }
  return (double) tv->tv_sec + (double) tv->tv_usec / 1000000.0;
}
/**
CLazyLinker
*/
/* State of one compiled lazy-evaluation "program": Python-side lists plus
   flat C arrays describing the graph topology, filled in by
   CLazyLinker_init and released by CLazyLinker_dealloc. */
typedef struct {
  PyObject_HEAD
  /* Type-specific fields go here. */
  PyObject * nodes; // the python list of nodes
  PyObject * thunks; // python list of thunks
  PyObject * pre_call_clear; //list of cells to clear on call.
  int allow_gc; // nonzero: intermediate storage may be reclaimed
  Py_ssize_t n_applies; // number of apply nodes (== len(nodes))
  int n_vars; // number of variables in the graph
  int * var_computed; // 1 or 0 for every variable
  PyObject ** var_computed_cells; // per-variable [flag] compute-map cells (owned refs)
  Py_ssize_t n_output_vars;
  Py_ssize_t * output_vars; // variables that *must* be evaluated by call
  int * is_lazy; // 1 or 0 for every thunk
  Py_ssize_t * var_owner; // nodes[[var_owner[var_idx]]] is var[var_idx]->owner
  int * var_has_owner; // 1 or 0
  Py_ssize_t * node_n_inputs;  // per-node input count
  Py_ssize_t * node_n_outputs; // per-node output count
  Py_ssize_t ** node_inputs;   // per-node array of input variable indices
  Py_ssize_t ** node_outputs;  // per-node array of output variable indices
  Py_ssize_t * node_inputs_outputs_base; // node_inputs and node_outputs point into this
  Py_ssize_t * node_n_prereqs;
  Py_ssize_t ** node_prereqs; // per-node array of prerequisite variable indices
  void ** thunk_cptr_fn;   // C entry point of each thunk, NULL for pure-Python thunks
  void ** thunk_cptr_data; // data argument passed to thunk_cptr_fn[i]
  PyObject * call_times;  // per-node accumulated seconds (list of floats)
  PyObject * call_counts; // per-node call counter (list of ints)
  int do_timing; // nonzero: accumulate call_times/call_counts
  int position_of_error; // -1 for no error, otw the index into `thunks` that failed.
} CLazyLinker;
/* Free all C-side arrays and drop the Python references taken in init.
   Safe on a partially initialized instance: every pointer starts NULL,
   and free(NULL) / Py_XDECREF(NULL) are no-ops. */
static void
CLazyLinker_dealloc(PyObject* _self)
{
  CLazyLinker* self = (CLazyLinker *) _self;
  free(self->thunk_cptr_fn);
  free(self->thunk_cptr_data);
  free(self->is_lazy);
  // node_prereqs rows are malloc'd individually; free them before the table.
  if (self->node_n_prereqs)
  {
    for (int i = 0; i < self->n_applies; ++i)
    {
      free(self->node_prereqs[i]);
    }
  }
  free(self->node_n_prereqs);
  free(self->node_prereqs);
  // node_inputs[i]/node_outputs[i] point INTO node_inputs_outputs_base,
  // so only the base buffer and the two pointer tables are freed.
  free(self->node_inputs_outputs_base);
  free(self->node_n_inputs);
  free(self->node_n_outputs);
  free(self->node_inputs);
  free(self->node_outputs);
  free(self->var_owner);
  free(self->var_has_owner);
  free(self->var_computed);
  // Each compute-map cell was INCREF'd in init; release those references.
  if (self->var_computed_cells)
  {
    for (int i = 0; i < self->n_vars; ++i)
    {
      Py_DECREF(self->var_computed_cells[i]);
    }
  }
  free(self->var_computed_cells);
  free(self->output_vars);
  Py_XDECREF(self->nodes);
  Py_XDECREF(self->thunks);
  Py_XDECREF(self->call_times);
  Py_XDECREF(self->call_counts);
  Py_XDECREF(self->pre_call_clear);
  self->ob_type->tp_free((PyObject*)self);
}
/* Allocate a CLazyLinker and put every field into a safe default state,
 * so that dealloc is valid even if init never runs or fails early. */
static PyObject *
CLazyLinker_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
  CLazyLinker * self = (CLazyLinker *) type->tp_alloc(type, 0);
  if (self == NULL)
    return NULL;

  /* Python-object members. */
  self->nodes = NULL;
  self->thunks = NULL;
  self->pre_call_clear = NULL;
  self->call_times = NULL;
  self->call_counts = NULL;

  /* Graph-shape counters. */
  self->n_applies = 0;
  self->n_vars = 0;
  self->n_output_vars = 0;

  /* C-side arrays, all unallocated. */
  self->var_computed = NULL;
  self->var_computed_cells = NULL;
  self->output_vars = NULL;
  self->is_lazy = NULL;
  self->var_owner = NULL;
  self->var_has_owner = NULL;
  self->node_n_inputs = NULL;
  self->node_n_outputs = NULL;
  self->node_inputs = NULL;
  self->node_outputs = NULL;
  self->node_inputs_outputs_base = NULL;
  self->node_prereqs = NULL;
  self->node_n_prereqs = NULL;
  self->thunk_cptr_data = NULL;
  self->thunk_cptr_fn = NULL;

  /* Flags. */
  self->allow_gc = 1;
  self->do_timing = 0;
  self->position_of_error = -1;

  return (PyObject *) self;
}
/* Unpack the Python-level description of the graph into the C arrays of
   `self`.  Called from Python with parallel lists describing nodes,
   thunks, per-node input/output index ranges, variable ownership, lazy
   flags, prerequisite lists and the set of output variables.
   Returns 0 on success, -1 on failure. */
static int
CLazyLinker_init(CLazyLinker *self, PyObject *args, PyObject *kwds)
{
  static char *kwlist[] = {
    (char*)"nodes",
    (char*)"thunks",
    (char*)"pre_call_clear",
    (char*)"allow_gc",
    (char*)"call_counts",
    (char*)"call_times",
    (char*)"compute_map_list",
    (char*)"base_input_output_list",
    (char*)"node_n_inputs",
    (char*)"node_n_outputs",
    (char*)"node_input_offset",
    (char*)"node_output_offset",
    (char*)"var_owner",
    (char*)"is_lazy_list",
    (char*)"output_vars",
    (char*)"node_prereqs",
    (char*)"node_output_size",
    NULL};

  PyObject *compute_map_list=NULL,
    *base_input_output_list=NULL,
    *node_n_inputs=NULL,
    *node_n_outputs=NULL,
    *node_input_offset=NULL,
    *node_output_offset=NULL,
    *var_owner=NULL,
    *is_lazy=NULL,
    *output_vars=NULL,
    *node_prereqs=NULL,
    *node_output_size=NULL;

  assert(!self->nodes);
  // "O" codes store borrowed references directly into self; they are
  // promoted to owned references by the Py_INCREFs just below.
  if (! PyArg_ParseTupleAndKeywords(args, kwds, "OOOiOOOOOOOOOOOOO", kwlist,
                                    &self->nodes,
                                    &self->thunks,
                                    &self->pre_call_clear,
                                    &self->allow_gc,
                                    &self->call_counts,
                                    &self->call_times,
                                    &compute_map_list,
                                    &base_input_output_list,
                                    &node_n_inputs,
                                    &node_n_outputs,
                                    &node_input_offset,
                                    &node_output_offset,
                                    &var_owner,
                                    &is_lazy,
                                    &output_vars,
                                    &node_prereqs,
                                    &node_output_size
                                    ))
    return -1;
  Py_INCREF(self->nodes);
  Py_INCREF(self->thunks);
  Py_INCREF(self->pre_call_clear);
  Py_INCREF(self->call_counts);
  Py_INCREF(self->call_times);

  Py_ssize_t n_applies = PyList_Size(self->nodes);
  self->n_applies = n_applies;
  self->n_vars = PyList_Size(var_owner);

  // NOTE(review): these consistency checks return -1 WITHOUT setting a
  // Python exception, which CPython reports as a generic SystemError.
  if (PyList_Size(self->thunks) != n_applies) return -1;
  if (PyList_Size(self->call_counts) != n_applies) return -1;
  if (PyList_Size(self->call_times) != n_applies) return -1;

  // allocated and initialize thunk_cptr_data and thunk_cptr_fn
  // NOTE(review): malloc results are only checked via assert() here and
  // below; under -DNDEBUG a failed allocation would go unnoticed.
  if (n_applies)
  {
    self->thunk_cptr_data = (void**)malloc(n_applies * sizeof(void*));
    self->thunk_cptr_fn = (void**)malloc(n_applies * sizeof(void*));
    self->is_lazy = (int*)malloc(n_applies * sizeof(int));
    self->node_prereqs = (Py_ssize_t**)malloc(n_applies*sizeof(Py_ssize_t*));
    self->node_n_prereqs = (Py_ssize_t*)malloc(n_applies*sizeof(Py_ssize_t));
    assert(self->node_prereqs);
    assert(self->node_n_prereqs);
    assert(self->is_lazy);
    assert(self->thunk_cptr_fn);
    assert(self->thunk_cptr_data);

    // init these basic arrays
    for (int i = 0; i < n_applies; ++i)
    {
      self->thunk_cptr_data[i] = NULL;
      self->thunk_cptr_fn[i] = NULL;
      self->is_lazy[i] = 1;
      self->node_prereqs[i] = NULL;
      self->node_n_prereqs[i] = 0;
    }

    for (int i = 0; i < n_applies; ++i)
    {
      PyObject * thunk = PyList_GetItem(self->thunks, i);
      //thunk is borrowed
      // A thunk with a "cthunk" attribute wraps a compiled C function;
      // cache its entry point and data pointer for the c_call fast path.
      if (PyObject_HasAttrString(thunk, "cthunk"))
      {
        PyObject * cthunk = PyObject_GetAttrString(thunk, "cthunk");
        //new reference
        assert (cthunk && PyCObject_Check(cthunk));
        self->thunk_cptr_fn[i] = PyCObject_AsVoidPtr(cthunk);
        self->thunk_cptr_data[i] = PyCObject_GetDesc(cthunk);
        Py_DECREF(cthunk);
        // cthunk is kept alive by membership in self->thunks
      }
      else
      {
        self->thunk_cptr_fn[i] = NULL;
        self->thunk_cptr_data[i] = NULL;
      }

      PyObject * el_i = PyList_GetItem(is_lazy, i);
      self->is_lazy[i] = PyNumber_AsSsize_t(el_i, NULL);

      /* now get the prereqs */
      el_i = PyList_GetItem(node_prereqs, i);
      assert (PyList_Check(el_i));
      self->node_n_prereqs[i] = PyList_Size(el_i);
      if (self->node_n_prereqs[i])
      {
        self->node_prereqs[i] = (Py_ssize_t*)malloc(
                          PyList_Size(el_i)*sizeof(Py_ssize_t));
        for (int j = 0; j < PyList_Size(el_i); ++j)
        {
          PyObject * el_ij = PyList_GetItem(el_i, j);
          Py_ssize_t N = PyNumber_AsSsize_t(el_ij, PyExc_IndexError);
          if (PyErr_Occurred())
            return -1;
          // N < n. variables
          assert(N < PyList_Size(var_owner));
          self->node_prereqs[i][j] = N;
        }
      }
    }
  }
  // Flattened per-node input/output variable indices: node_inputs[i] and
  // node_outputs[i] are offsets into one shared base buffer.
  if (PyList_Check(base_input_output_list))
  {
    Py_ssize_t n_inputs_outputs_base = PyList_Size(base_input_output_list);
    self->node_inputs_outputs_base = (Py_ssize_t*)malloc(n_inputs_outputs_base*sizeof(Py_ssize_t));
    assert(self->node_inputs_outputs_base);
    for (int i = 0; i < n_inputs_outputs_base; ++i)
    {
      PyObject *el_i = PyList_GetItem(base_input_output_list, i);
      Py_ssize_t idx = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      self->node_inputs_outputs_base[i] = idx;
    }
    self->node_n_inputs = (Py_ssize_t*)malloc(n_applies*sizeof(Py_ssize_t));
    assert(self->node_n_inputs);
    self->node_n_outputs = (Py_ssize_t*)malloc(n_applies*sizeof(Py_ssize_t));
    assert(self->node_n_outputs);
    self->node_inputs = (Py_ssize_t**)malloc(n_applies*sizeof(Py_ssize_t*));
    assert(self->node_inputs);
    self->node_outputs = (Py_ssize_t**)malloc(n_applies*sizeof(Py_ssize_t*));
    assert(self->node_outputs);
    for (int i = 0; i < n_applies; ++i)
    {
      Py_ssize_t N;
      N = PyNumber_AsSsize_t(PyList_GetItem(node_n_inputs, i),PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      assert (N <= n_inputs_outputs_base);
      self->node_n_inputs[i] = N;
      N = PyNumber_AsSsize_t(PyList_GetItem(node_n_outputs, i),PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      assert (N <= n_inputs_outputs_base);
      self->node_n_outputs[i] = N;
      N = PyNumber_AsSsize_t(PyList_GetItem(node_input_offset, i),PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      assert (N <= n_inputs_outputs_base);
      self->node_inputs[i] = &self->node_inputs_outputs_base[N];
      N = PyNumber_AsSsize_t(PyList_GetItem(node_output_offset, i),PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      assert (N <= n_inputs_outputs_base);
      self->node_outputs[i] = &self->node_inputs_outputs_base[N];
    }
  }
  else
  {
    PyErr_SetString(PyExc_TypeError, "base_input_output_list must be list");
    return -1;
  }

  // allocation for var_owner
  if (PyList_Check(var_owner))
  {
    self->var_owner = (Py_ssize_t*)malloc(self->n_vars*sizeof(Py_ssize_t));
    self->var_has_owner = (int*)malloc(self->n_vars*sizeof(int));
    self->var_computed = (int*)malloc(self->n_vars*sizeof(int));
    self->var_computed_cells = (PyObject**)malloc(self->n_vars*sizeof(PyObject*));
    for (int i = 0; i < self->n_vars; ++i)
    {
      PyObject * el_i = PyList_GetItem(var_owner, i);
      if (el_i == Py_None)
      {
        // graph inputs/constants have no owner node
        self->var_has_owner[i] = 0;
      }
      else
      {
        Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
        if (PyErr_Occurred()) return -1;
        assert (N <= n_applies);
        self->var_owner[i] = N;
        self->var_has_owner[i] = 1;
      }
      // keep an owned reference to each compute-map cell
      self->var_computed_cells[i] = PyList_GetItem(compute_map_list, i);
      Py_INCREF(self->var_computed_cells[i]);
    }
  }
  else
  {
    PyErr_SetString(PyExc_TypeError, "var_owner must be list");
    return -1;
  }

  //output vars
  if (PyList_Check(output_vars))
  {
    self->n_output_vars = PyList_Size(output_vars);
    self->output_vars = (Py_ssize_t*)malloc(self->n_output_vars*sizeof(Py_ssize_t));
    assert(self->output_vars);
    for (int i = 0; i < self->n_output_vars; ++i)
    {
      PyObject * el_i = PyList_GetItem(output_vars, i);
      Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      assert (N <= self->n_vars);
      self->output_vars[i] = N;
    }
  }
  else
  {
    PyErr_SetString(PyExc_TypeError, "output_vars must be list");
    return -1;
  }
  return 0;
}
/* Record the index of the first thunk that failed.  Later failures (for
 * example while unwinding the recursion) do not overwrite the original
 * position: only the first assignment after a reset to -1 sticks. */
static void set_position_of_error(CLazyLinker * self, int owner_idx)
{
  if (self->position_of_error != -1)
    return;
  self->position_of_error = owner_idx;
}
/* Invoke the Python thunk for node `node_idx` and return its result
 * (new reference, or NULL on exception).  The thunk itself is a borrowed
 * reference from self->thunks.  When timing is enabled, wall-clock time
 * is accumulated into call_times and call_counts is incremented. */
static PyObject * pycall(CLazyLinker * self, Py_ssize_t node_idx, int verbose)
{
  PyObject * thunk = PyList_GetItem(self->thunks, node_idx);
  if (!self->do_timing)
  {
    if (verbose) fprintf(stderr, "calling via Python (node %i)\n", (int)node_idx);
    return PyObject_CallObject(thunk, NULL);
  }
  double t_start = pytime(NULL);
  if (verbose) fprintf(stderr, "calling via Python (node %i)\n", (int)node_idx);
  PyObject * rval = PyObject_CallObject(thunk, NULL);
  double t_end = pytime(NULL);
  double prev = PyFloat_AsDouble(PyList_GetItem(self->call_times, node_idx));
  PyList_SetItem(self->call_times, node_idx,
                 PyFloat_FromDouble(prev + (t_end - t_start)));
  long n_calls = PyInt_AsLong(PyList_GetItem(self->call_counts, node_idx));
  PyList_SetItem(self->call_counts, node_idx, PyInt_FromLong(n_calls + 1));
  return rval;
}
/* Invoke the compiled C thunk for node `node_idx`.  Returns the thunk's
   error code (0 on success).  On failure, the exception that the
   CLinker-generated code stashed in its __ERROR list is re-raised as the
   current Python exception, and the failing position is recorded. */
static int c_call(CLazyLinker * self, Py_ssize_t node_idx, int verbose)
{
  void * ptr_addr = self->thunk_cptr_fn[node_idx];
  int (*fn)(void*) = (int (*)(void*))(ptr_addr);
  if (verbose) fprintf(stderr, "calling non-lazy shortcut (node %i)\n", (int)node_idx);
  int err = 0;
  if (self->do_timing)
  {
    double t0 = pytime(NULL);
    err = fn(self->thunk_cptr_data[node_idx]);
    double t1 = pytime(NULL);
    // accumulate wall time and bump the per-node call counter
    double ti = PyFloat_AsDouble(PyList_GetItem(self->call_times, node_idx));
    PyList_SetItem(self->call_times, node_idx, PyFloat_FromDouble(t1 - t0 + ti));
    PyObject * count = PyList_GetItem(self->call_counts, node_idx);
    long icount = PyInt_AsLong(count);
    PyList_SetItem(self->call_counts, node_idx, PyInt_FromLong(icount+1));
  }
  else
  {
    err = fn(self->thunk_cptr_data[node_idx]);
  }
  if (err)
  {
    // cast the argument to a PyList (as described near line 226 of cc.py)
    PyObject * __ERROR = ((PyObject**)self->thunk_cptr_data[node_idx])[0];
    assert (PyList_Check(__ERROR));
    assert (PyList_Size(__ERROR) == 3);
    PyObject * err_type = PyList_GetItem(__ERROR, 0); //stolen ref
    PyObject * err_msg = PyList_GetItem(__ERROR, 1); //stolen ref
    PyObject * err_trace = PyList_GetItem(__ERROR, 2); //stolen ref
    // Replace the slots with None so the list no longer owns the error
    // objects; their references are handed to PyErr_Restore below.
    PyList_SET_ITEM(__ERROR, 0, Py_None); Py_INCREF(Py_None); //clobbers old ref
    PyList_SET_ITEM(__ERROR, 1, Py_None); Py_INCREF(Py_None); //clobbers old ref
    PyList_SET_ITEM(__ERROR, 2, Py_None); Py_INCREF(Py_None); //clobbers old ref
    assert(!PyErr_Occurred()); // because CLinker hid the exception in __ERROR aka data
    PyErr_Restore(err_type, err_msg, err_trace); //steals refs to args
  }
  if (err) set_position_of_error(self, node_idx);
  return err;
}
/* Recursively ensure that variable `var_idx` has been computed.
 *
 * Walks backwards through the graph: first evaluates the owner node's
 * prerequisites, then the node itself.  Lazy thunks follow a protocol in
 * which the thunk returns a list of input positions it needs evaluated
 * before it can run again, or an exhausted list once output is written.
 * `one` and `zero` are the shared PyInt objects written into the
 * compute-map cells.  Returns 0 on success, nonzero on error (with a
 * Python exception set and position_of_error recorded).
 *
 * Fixes vs. previous revision:
 *  - "lazy thunk should list" error message completed;
 *  - rval no longer leaks when a non-lazy thunk returns a bad object;
 *  - position_of_error is only recorded when an error actually occurred,
 *    so a successful lazy re-evaluation no longer poisons it for later
 *    genuine errors in the same call.
 */
static
int lazy_rec_eval(CLazyLinker * self, Py_ssize_t var_idx, PyObject*one, PyObject*zero)
{
  int verbose = 0;
  if (verbose) fprintf(stderr, "lazy_rec computing %i\n", (int)var_idx);
  int err = 0;
  // Already computed, or a graph input with no owner: nothing to do.
  if (self->var_computed[var_idx] || !self->var_has_owner[var_idx])
  {
    return 0;
  }
  else
  {
    Py_ssize_t owner_idx = self->var_owner[var_idx];

    // STEP 1: compute the pre-requirements of the node
    for (int i = 0; i < self->node_n_prereqs[owner_idx]; ++i)
    {
      Py_ssize_t prereq_idx = self->node_prereqs[owner_idx][i];
      if (!self->var_computed[prereq_idx])
      {
        err = lazy_rec_eval(self, prereq_idx, one, zero);
        if (err) return err;
      }
      assert (self->var_computed[prereq_idx]);
    }

    // STEP 2: compute the node itself
    if (self->is_lazy[owner_idx])
    {
      // update the compute_map cells corresponding to the inputs of this thunk
      for (int i = 0; i < self->node_n_inputs[owner_idx] && (!err); ++i)
      {
        int in_idx = self->node_inputs[owner_idx][i];
        if (self->var_computed[in_idx])
        {
          Py_INCREF(one);
          err = PyList_SetItem(self->var_computed_cells[in_idx], 0, one);
        }
        else
        {
          Py_INCREF(zero);
          err = PyList_SetItem(self->var_computed_cells[in_idx], 0, zero);
        }
      }
      if (err)
      {
        set_position_of_error(self, owner_idx);
        return err;
      }
      PyObject * rval = pycall(self, owner_idx, verbose);
      // refcounting - rval is new ref
      //TODO: to prevent infinite loops
      // - consider check that a thunk does not ask for an input that is already computed
      if (rval) //call returned normally (no exception)
      {
        //update the computed-ness of any output cells
        for (int i = 0; i < self->node_n_outputs[owner_idx]; ++i)
        {
          int out_idx = self->node_outputs[owner_idx][i];
          PyObject * el_i = PyList_GetItem(self->var_computed_cells[out_idx], 0);
          Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
          if (PyErr_Occurred())
          {
            Py_DECREF(rval);
            set_position_of_error(self, owner_idx);
            return -1;
          }
          assert (N==0 || N==1);
          self->var_computed[out_idx] = N;
        }
        if (!self->var_computed[var_idx])
        {
          // The thunk did not compute our variable yet: it returned the
          // list of input positions it needs evaluated first.
          if (PyList_Check(rval))
          {
            if (PyList_Size(rval))
            {
              for (int i = 0; i < PyList_Size(rval) && (!err); ++i)
              {
                PyObject * el_i = PyList_GetItem(rval, i);
                Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
                if (PyErr_Occurred())
                {
                  err = 1;
                }
                else
                {
                  assert (N <= self->node_n_inputs[owner_idx]);
                  Py_ssize_t input_idx = self->node_inputs[owner_idx][N];
                  err = lazy_rec_eval(self, input_idx, one, zero);
                }
              }
              // requested inputs are now ready: retry this variable
              if (!err)
                err = lazy_rec_eval(self, var_idx, one, zero);
            }
            else
            {
              PyErr_SetString(PyExc_ValueError,
                              "lazy thunk returned empty list without computing output");
              err = 1;
              set_position_of_error(self, owner_idx);
            }
            Py_DECREF(rval);
            // BUGFIX: previously recorded unconditionally, which also ran
            // on the success path and froze position_of_error too early.
            if (err) set_position_of_error(self, owner_idx);
            return err;
          }
          else // don't know what it returned, but it wasn't right.
          {
            //TODO: More helpful error to help find *which node* made this
            // bad thunk
            PyErr_SetString(PyExc_TypeError,
                            "lazy thunk should return a list");
            Py_DECREF(rval);
            set_position_of_error(self, owner_idx);
            return 1;
          }
        }
        Py_DECREF(rval);
      }
      else // pycall returned NULL (internal error)
      {
        assert (PyErr_Occurred());
        set_position_of_error(self, owner_idx);
        return 1;
      }
    }
    else //owner is not a lazy op. Ensure all inputs are evaluated.
    {
      // loop over inputs to owner
      // call lazy_rec_eval on each one that is not computed.
      // if there's an error, pass it up the stack
      for (int i = 0; i < self->node_n_inputs[owner_idx]; ++i)
      {
        Py_ssize_t input_idx = self->node_inputs[owner_idx][i];
        if (!self->var_computed[input_idx])
        {
          err = lazy_rec_eval(self, input_idx, one, zero);
          if (err) return err;
        }
        assert (self->var_computed[input_idx]);
      }

      // call the thunk for this owner.
      if (self->thunk_cptr_fn[owner_idx])
      {
        err = c_call(self, owner_idx, verbose);
      }
      else
      {
        PyObject * rval = pycall(self, owner_idx, verbose);
        //rval is new ref
        if (rval) //pycall returned normally (no exception)
        {
          if (rval == Py_None)
          {
            Py_DECREF(rval); //ignore a return of None
          }
          else if (PyList_Check(rval))
          {
            PyErr_SetString(PyExc_TypeError,
                            "non-lazy thunk should return None, not list");
            err = 1;
            set_position_of_error(self, owner_idx);
            Py_DECREF(rval);
          }
          else // don't know what it returned, but it wasn't right.
          {
            PyErr_SetObject(PyExc_TypeError, rval);
            err = 1;
            set_position_of_error(self, owner_idx);
            // BUGFIX: PyErr_SetObject does not steal rval; drop our ref.
            Py_DECREF(rval);
          }
        }
        else // pycall returned NULL (internal error)
        {
          err = 1;
          set_position_of_error(self, owner_idx);
        }
      }
    }

    // loop over all outputs and mark them as computed
    for (int i = 0; i < self->node_n_outputs[owner_idx] && (!err); ++i)
    {
      self->var_computed[self->node_outputs[owner_idx][i]] = 1;
    }
  }
  return err;
}
/* tp_call entry point: run the whole program.
   Optional keyword `time_thunks` (int) toggles per-thunk timing.
   Clears the pre_call_clear cells and the computed flags of all owned
   variables, then lazily evaluates every output variable.
   Returns None on success, NULL (with exception set) on failure. */
PyObject *
CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
{
  CLazyLinker * self = (CLazyLinker*)_self;
  static char *kwlist[] = {(char*)"time_thunks", NULL};
  if (! PyArg_ParseTupleAndKeywords(args, kwds, "|i", kwlist,
                                    &self->do_timing))
    return NULL;
  int err = 0;
  self->position_of_error = -1;  // reset error marker for this call
  // shared flag objects written into the compute-map cells below
  PyObject * one = PyInt_FromLong(1);
  PyObject * zero = PyInt_FromLong(0);

  //clear storage of pre_call_clear elements
  // NOTE(review): PyList_Size is called before the PyList_Check assert;
  // order looks inverted but is harmless when the argument is a list.
  Py_ssize_t n_pre_call_clear = PyList_Size(self->pre_call_clear);
  assert(PyList_Check(self->pre_call_clear));
  for (int i = 0; i < n_pre_call_clear; ++i)
  {
    PyObject * el_i = PyList_GetItem(self->pre_call_clear, i);
    Py_INCREF(Py_None);
    PyList_SetItem(el_i, 0, Py_None);
  }
  //clear the computed flag out of all non-input vars
  for (int i = 0; i < self->n_vars; ++i)
  {
    // inputs (no owner) start computed; everything else must be evaluated
    self->var_computed[i] = !self->var_has_owner[i];
    if (self->var_computed[i])
    {
      Py_INCREF(one);
      PyList_SetItem(self->var_computed_cells[i], 0, one);
    }
    else
    {
      Py_INCREF(zero);
      PyList_SetItem(self->var_computed_cells[i], 0, zero);
    }
  }

  // evaluate every requested output, stopping at the first error
  for (int i = 0; i < self->n_output_vars && (!err); ++i)
  {
    err = lazy_rec_eval(self, self->output_vars[i], one, zero);
  }

  Py_DECREF(one);
  Py_DECREF(zero);
  if (err) return NULL;
  Py_INCREF(Py_None);
  return Py_None;
}
#if 0
// Disabled: no methods are currently exposed on CLazyLinker (tp_methods
// is 0 in the type object); kept as a template for future additions.
static PyMethodDef CLazyLinker_methods[] = {
  {
    //"name", (PyCFunction)CLazyLinker_accept, METH_VARARGS, "Return the name, combining the first and last name"
  },
  {NULL}  /* Sentinel */
};
#endif
/* Attributes exposed to Python on CLazyLinker instances. */
static PyMemberDef CLazyLinker_members[] = {
  {(char*)"nodes", T_OBJECT_EX, offsetof(CLazyLinker, nodes), 0,
   (char*)"list of nodes"},
  {(char*)"thunks", T_OBJECT_EX, offsetof(CLazyLinker, thunks), 0,
   (char*)"list of thunks in program"},
  {(char*)"call_counts", T_OBJECT_EX, offsetof(CLazyLinker, call_counts), 0,
   (char*)"number of calls of each thunk"},
  {(char*)"call_times", T_OBJECT_EX, offsetof(CLazyLinker, call_times), 0,
   (char*)"total runtime in each thunk"},
  {(char*)"position_of_error", T_INT, offsetof(CLazyLinker, position_of_error), 0,
   (char*)"position of failed thunk"},
  {(char*)"time_thunks", T_INT, offsetof(CLazyLinker, do_timing), 0,
   (char*)"bool: nonzero means call will time thunks"},
  {NULL}  /* Sentinel */
};
/* Type object for CLazyLinker; instances are created from Python and
   called like functions (tp_call runs the compiled program). */
static PyTypeObject lazylinker_ext_CLazyLinkerType = {
    PyObject_HEAD_INIT(NULL)
    0,                         /*ob_size*/
    "lazylinker_ext.CLazyLinker",             /*tp_name*/
    sizeof(CLazyLinker),             /*tp_basicsize*/
    0,                         /*tp_itemsize*/
    CLazyLinker_dealloc,       /*tp_dealloc*/
    0,                         /*tp_print*/
    0,                         /*tp_getattr*/
    0,                         /*tp_setattr*/
    0,                         /*tp_compare*/
    0,                         /*tp_repr*/
    0,                         /*tp_as_number*/
    0,                         /*tp_as_sequence*/
    0,                         /*tp_as_mapping*/
    0,                         /*tp_hash */
    CLazyLinker_call,          /*tp_call*/
    0,                         /*tp_str*/
    0,                         /*tp_getattro*/
    0,                         /*tp_setattro*/
    0,                         /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
    "CLazyLinker object",      /* tp_doc */
    0,                         /* tp_traverse */
    0,                         /* tp_clear */
    0,                         /* tp_richcompare */
    0,                         /* tp_weaklistoffset */
    0,                         /* tp_iter */
    0,                         /* tp_iternext */
    0,//CLazyLinker_methods,   /* tp_methods */
    CLazyLinker_members,       /* tp_members */
    0,                         /* tp_getset */
    0,                         /* tp_base */
    0,                         /* tp_dict */
    0,                         /* tp_descr_get */
    0,                         /* tp_descr_set */
    0,                         /* tp_dictoffset */
    (initproc)CLazyLinker_init,/* tp_init */
    0,                         /* tp_alloc */
    // NOTE(review): this slot is overwritten with PyType_GenericNew in
    // initlazylinker_ext below, so CLazyLinker_new never actually runs.
    CLazyLinker_new,           /* tp_new */
};
/* No module-level functions: only the CLazyLinker type is exported. */
static PyMethodDef lazylinker_ext_methods[] = {
    {NULL}  /* Sentinel */
};
#ifndef PyMODINIT_FUNC  /* declarations for DLL import/export */
#define PyMODINIT_FUNC void
#endif
/* Python 2 module entry point: register the CLazyLinker extension type. */
PyMODINIT_FUNC
initlazylinker_ext(void)
{
    PyObject* m;

    // NOTE(review): this replaces the CLazyLinker_new set in the type
    // struct with PyType_GenericNew; tp_alloc still zero-fills the
    // instance, but the non-zero defaults of CLazyLinker_new (allow_gc=1,
    // position_of_error=-1) are skipped -- confirm this is intended.
    lazylinker_ext_CLazyLinkerType.tp_new = PyType_GenericNew;
    if (PyType_Ready(&lazylinker_ext_CLazyLinkerType) < 0)
        return;
    m = Py_InitModule3("lazylinker_ext", lazylinker_ext_methods,
                       "Example module that creates an extension type.");
    Py_INCREF(&lazylinker_ext_CLazyLinkerType);
    PyModule_AddObject(m, "CLazyLinker", (PyObject *)&lazylinker_ext_CLazyLinkerType);
}
import os
import theano
from theano import config
from theano.gof.compilelock import get_lock, release_lock
from theano.gof import cmodule
# Compile the lazylinker C extension under the compile lock, then import it.
get_lock()
try:
    dirname = 'lazylinker_ext'
    # Location of the C source shipped alongside this module.
    cfile = os.path.join(theano.__path__[0], 'gof', 'lazylinker_c.c')
    # Read the source; close the handle explicitly instead of leaking the
    # anonymous file object until garbage collection.
    cfile_fd = open(cfile)
    try:
        code = cfile_fd.read()
    finally:
        cfile_fd.close()
    loc = os.path.join(config.compiledir, dirname)
    if not os.path.exists(loc):
        os.mkdir(loc)
    cmodule.gcc_module_compile_str(dirname, code, location=loc)
    from lazylinker_ext.lazylinker_ext import *
finally:
    # Release lock on compilation directory.
    release_lock()
...@@ -3,18 +3,21 @@ ...@@ -3,18 +3,21 @@
The `Op` class is the base interface for all operations The `Op` class is the base interface for all operations
compatible with `gof`'s :doc:`graph` routines. compatible with `gof`'s :doc:`graph` routines.
""" """
__authors__ = "theano-dev"
__copyright__ = "(c) 2010, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
import logging
from theano import config from theano import config
import graph import graph
import numpy import numpy
import utils import utils
import warnings import warnings
import logging
from theano import config
from env import Env from env import Env
import graph
import cc import cc
......
from copy import deepcopy
import numpy
from theano.gof.op import PureOp
from theano.gof import Apply, generic, Container
from theano.gof.link import LocalLinker, map_storage, add_clear_storage
from theano import function, Mode
from theano.lazycond import ifelse
import theano.tensor as T
class IfElseIfElseIf(PureOp):
    """Lazy four-way conditional, used to exercise the lazy linker.

    Given inputs (c1, t1, c2, t2, c3, t3, f3) it computes::

        t1 if c1 else (t2 if c2 else (t3 if c3 else f3))

    evaluating only the condition and branch values actually needed.
    """

    def __init__(self, inplace=False):
        self.inplace=inplace # check destroyhandler and others to ensure that a view_map with
        #multiple inputs can work
        assert not self.inplace

    def make_node(self, c1, t1, c2,t2,c3,t3,f3):
        # All branch values must share one type; the output takes it too.
        assert t1.type == f3.type
        assert t2.type == t3.type
        assert t3.type == f3.type
        return Apply(self, [c1,t1,c2,t2,c3,t3,f3], [t1.type()])

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        # compute_map cells: cell[0] is 1 once the variable is computed.
        input_computed = [compute_map[v] for v in node.inputs]
        output_computed = [compute_map[v] for v in node.outputs]
        # storage_map cells: cell[0] holds the variable's value.
        input_registers = [storage_map[v] for v in node.inputs]
        output_registers = [storage_map[v] for v in node.outputs]

        outtype = node.outputs[0].type

        def thunk():
            # Lazy thunk protocol: return [i] to request that input i be
            # computed first; return [] once the output has been written.
            if not input_computed[0][0]:
                return [0]
            else:
                truthval = input_registers[0][0]
                if truthval:
                    if not input_computed[1][0]:
                        return [1]
                    else:
                        output_computed[0][0]=1
                        output_registers[0][0]=outtype.filter(deepcopy(input_registers[1][0]))
                        return []
                else:
                    if not input_computed[2][0]:
                        return [2]
                    else:
                        truthval = input_registers[2][0]
                        if truthval:
                            if not input_computed[3][0]:
                                return [3]
                            else:
                                output_computed[0][0] = 1
                                output_registers[0][0] = outtype.filter(deepcopy(input_registers[3][0]))
                                return []
                        else:
                            if not input_computed[4][0]:
                                return [4]
                            else:
                                truthval = input_registers[4][0]
                                if truthval:
                                    if not input_computed[5][0]:
                                        return [5]
                                    else:
                                        output_computed[0][0] = 1
                                        output_registers[0][0] = outtype.filter(deepcopy(input_registers[5][0]))
                                        return []
                                else:
                                    if not input_computed[6][0]:
                                        return [6]
                                    else:
                                        output_computed[0][0] = 1
                                        output_registers[0][0] = outtype.filter(deepcopy(input_registers[6][0]))
                                        return []
        thunk.lazy = True
        return thunk
class NotImplementedOp(PureOp):
    """Op whose thunk raises ``NotImplementedOp.E`` whenever it runs.

    Used to verify that lazy evaluation really skips untaken branches:
    if this Op's node is ever evaluated, the test sees the exception.
    """

    class E(Exception):
        """Raised when a NotImplementedOp node is evaluated."""

    def make_node(self, x):
        # Single input; the output has the same type.
        return Apply(self, [x], [x.type()])

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        op = self

        def thunk():
            raise op.E()

        # Non-lazy: the VM calls it eagerly once all inputs are ready.
        thunk.lazy = False
        return thunk
def test_ifelse():
a = generic()
b = generic()
c = generic()
notimpl = NotImplementedOp()
f = function([a,b,c], ifelse(a, notimpl(b), c),
mode=Mode(linker='vm', optimizer='fast_run'))
try:
print "case 1"
f( True, 'a', 'b')
assert False
except NotImplementedOp.E:
pass
print "... passed"
print "case 2"
print f( False, 'a', 'b')
assert f( False, 'a', 'b') == 'b'
print "... passed"
def more_complex_test():
notimpl = NotImplementedOp()
ifelseifelseif = IfElseIfElseIf()
x1 = T.scalar('x1')
x2 = T.scalar('x2')
c1 = generic('c1')
c2 = generic('c2')
t1 = ifelse(c1,x1,notimpl(x2))
t1.name = 't1'
t2 = t1*10
t2.name = 't2'
t3 = ifelse(c2,t2, x1+t1)
t3.name = 't3'
t4 = ifelseifelseif(T.eq(x1,x2), x1, T.eq(x1,5), x2, c2, t3, t3+0.5)
t4.name = 't4'
f = function([c1,c2,x1,x2], t4, mode=Mode(linker='vm', optimizer='fast_run'))
print f(1, 0, numpy.array(10,dtype=x1.dtype),0)
assert f(1,0,numpy.array(10,dtype=x1.dtype),0) == 20.5
print '... passed'
# Allow running this test module directly as a script.
if __name__ == '__main__':
    more_complex_test()
import gc
import sys
import time
try:
import line_profiler
except ImportError:
pass
import numpy
from theano import function
from theano.gof import vm,link, OpWiseCLinker
from theano.compile import Mode
from theano import tensor
from theano.lazycond import ifelse
import theano
def test_speed():
    """Benchmark (not a correctness test): print seconds per thousand
    additions for several linkers and for raw numpy."""

    def build_graph(x, depth=5):
        # Chain of `depth` symbolic additions: z = z + z, repeated.
        z = x
        for d in range(depth):
            z = (z + z)
        return z

    def numpy_version(x, depth):
        # Same chain evaluated eagerly with numpy, for a baseline.
        z = x
        for d in xrange(depth):
            z = (z+z)
        return z

    def time_numpy():
        steps_a = 5
        steps_b = 100
        x = numpy.asarray([2.0, 3.0], dtype=theano.config.floatX)

        # warm-up call, so the timed runs below exclude first-call costs
        numpy_version(x, steps_a)
        t0 = time.time()
        print numpy_version(x, steps_a)
        t1 = time.time()
        t2 = time.time()
        print numpy_version(x, steps_b)
        t3 = time.time()

        t_a = t1 - t0
        t_b = t3 - t2

        # Subtracting the short run removes the constant per-call
        # overhead, leaving the marginal cost per additional step.
        print "%s takes %f s/Kop" % (
            'numpy',
            (1000*(t_b-t_a) / (steps_b - steps_a)))

    def time_linker(name, linker):
        steps_a = 5
        steps_b = 100
        x = tensor.vector()
        a = build_graph(x,steps_a)
        b = build_graph(x,steps_b)

        f_a = function([x], a,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_a speed test %s'%name,
                )
        f_b = function([x], b,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_b speed test %s'%name,
                )

        # First call of each function is a warm-up; the second is timed.
        print f_a([2.0, 3.0])
        t0 = time.time()
        print f_a([2.0, 3.0])
        t1 = time.time()

        print f_b([2.0, 3.0])
        t2 = time.time()
        print f_b([2.0, 3.0])
        t3 = time.time()

        t_a = t1 - t0
        t_b = t3 - t2

        print "%s takes %f s/Kop" % (
            name,
            (1000*(t_b-t_a) / (steps_b - steps_a)))

    time_linker('c|py', OpWiseCLinker)
    time_linker('vmLinker', vm.VM_Linker)
    time_linker('vmLinker_nogc', lambda : vm.VM_Linker(allow_gc=False))
    time_linker('vmLinker_CLOOP', lambda : vm.VM_Linker(allow_gc=False,
                                                        use_cloop=True))
    time_numpy()
def test_speed_lazy():
    """Benchmark (not a correctness test): time the VM linkers on a chain
    of lazy ``ifelse`` nodes, printing seconds per thousand ops."""

    def build_graph(x, depth=5):
        # Chain of `depth` lazy conditionals: z = -z if z > 0 else z.
        z = x
        for d in range(depth):
            z = ifelse(z> 0, -z, z)
        return z

    def time_linker(name, linker):
        steps_a = 10
        steps_b = 100
        x = tensor.vector()
        a = build_graph(x, steps_a)
        b = build_graph(x, steps_b)

        f_a = function([x], a,
                mode=Mode(optimizer=None,
                    linker=linker()),
                #profile='f_a lazy ifelse %s'%name,
                )
        f_b = function([x], b,
                mode=Mode(optimizer=None,
                    linker=linker()),
                #profile='f_b lazy ifelse %s'%name,
                )

        # First call of each function is a warm-up; the second is timed.
        print f_a([2.0])
        t0 = time.time()
        print f_a([2.0])
        t1 = time.time()

        print f_b([2.0])
        t2 = time.time()
        print f_b([2.0])
        t3 = time.time()

        t_a = t1 - t0
        t_b = t3 - t2

        # Marginal cost per step, scaled to a thousand operations.
        print "%s takes %f s/Kop" % (
            name,
            (1000*(t_b-t_a) / (steps_b - steps_a)))

    time_linker('vmLinker', vm.VM_Linker)
    time_linker('vmLinker_nogc', lambda : vm.VM_Linker(allow_gc=False))
    time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False,
                                                    use_cloop=True))
run_memory_usage_tests = False
if run_memory_usage_tests:
# these are not normal unit tests, do not run them as part of standard
# suite. I ran them while looking at top, and stopped when memory usage was
# stable.
    def test_leak2():
        # Manual leak check: wrap many numpy arrays in CudaNdarray and
        # verify the numpy refcount is unchanged; watch `top` for growth.
        import theano.sandbox.cuda as cuda
        for i in xrange(1000000):
            n = numpy.asarray([2.3, 4.5], dtype='f')
            c = sys.getrefcount(n)
            a = cuda.CudaNdarray(n)
            # wrapping must not change the refcount of the source array
            assert c == sys.getrefcount(n)
            # periodic progress marker + forced collections
            if not i % 1000:
                print '.',
                print gc.collect(),
                print gc.collect()
            sys.stdout.flush()
def test_no_leak_many_graphs():
# Verify no memory leaks when creating and deleting a lot of functions
# This isn't really a unit test, you have to run it and look at top to see
# if there's a leak
for i in xrange(10000):
x = tensor.vector()
z = x
for d in range(10):
z = tensor.sin(-z+ 1)
f = function([x], z, mode=Mode(optimizer=None, linker='cvm'))
if not i % 100:
print gc.collect()
sys.stdout.flush()
gc.collect()
if 1:
f([2.0])
f([3.0])
f([4.0])
f([5.0])
def test_no_leak_many_call_lazy():
# Verify no memory leaks when calling a function a lot of times
# This isn't really a unit test, you have to run it and look at top to see
# if there's a leak
def build_graph(x, depth=5):
z = x
for d in range(depth):
z = ifelse(z> 0, -z, z)
return z
def time_linker(name, linker):
steps_a = 10
x = tensor.vector()
a = build_graph(x, steps_a)
f_a = function([x], a,
mode=Mode(optimizer=None,
linker=linker()))
for i in xrange(100000):
f_a([2.0])
if 0: # this doesn't seem to work, prints 0 for everything
import resource
pre = resource.getrusage(resource.RUSAGE_SELF)
post = resource.getrusage(resource.RUSAGE_SELF)
print pre.ru_ixrss, post.ru_ixrss
print pre.ru_idrss, post.ru_idrss
print pre.ru_maxrss, post.ru_maxrss
time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False, use_cloop=True))
def test_no_leak_many_call_nonlazy():
# Verify no memory leaks when calling a function a lot of times
# This isn't really a unit test, you have to run it and look at top to see
# if there's a leak
def build_graph(x, depth=5):
z = x
for d in range(depth):
z = tensor.sin(-z+1)
return z
def time_linker(name, linker):
steps_a = 10
x = tensor.vector()
a = build_graph(x,steps_a)
f_a = function([x], a,
mode=Mode(optimizer=None,
linker=linker()))
for i in xrange(500000):
f_a([2.0])
time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False, use_cloop=True))
"""
VMs that run Theano graph computations.
"""
import sys
import time
import link
import traceback
from theano.gof.python25 import all
import theano
config = theano.config
from theano.configparser import config, AddConfigVar, BoolParam
from theano import config
AddConfigVar('profile',
"If VM should collect profile information",
BoolParam(False))
def raise_with_op(op, exc_info = None):
    """Re-raise the current (or given) exception, annotated with `op`.

    The exception value gains a ``__thunk_trace__`` attribute (the graph
    construction trace of `op`, when available), and `op` itself plus its
    position in the toposort are appended to ``exc_value.args`` so the
    failing Apply node can be identified from the traceback.

    :param op: the Apply node whose thunk raised
    :param exc_info: a ``sys.exc_info()`` triple; defaults to the exception
        currently being handled
    """
    if exc_info is None:
        exc_info = sys.exc_info()
    exc_type, exc_value, exc_trace = exc_info
    if exc_type == KeyboardInterrupt:
        # print a simple traceback from KeyboardInterrupt: do not annotate
        raise exc_type, exc_value, exc_trace
    try:
        trace = op.tag.trace
    except AttributeError:
        # the node was built without a recorded construction trace
        trace = ()
    exc_value.__thunk_trace__ = trace
    exc_value.args += (op, )
    if op in op.env.toposort():
        # also record where in the execution order the failure happened
        exc_value.args += ('Sequence id of Apply node='+str(op.env.toposort().index(op)),)
    # re-raise with the original traceback preserved (Python 2 syntax)
    raise exc_type, exc_value, exc_trace
class VM(object):
    """
    A VM object evaluates a Theano program with its __call__ method.

    Attributes:

    call_counts - list of integers, one for each thunk. call_counts[i] is
        the number of times thunks[i] was called in the course of
        computations performed by call_with_timers().
    call_times - list of floats, one for each thunk. call_times[i] is the
        amount of runtime spent on thunks[i] in the course of computations
        performed by call_with_timers().
    """
    def __init__(self, nodes, thunks, pre_call_clear):
        """
        Allocate a virtual machine.

        nodes - a list of nodes in toposort order
        thunks - a list of thunks to execute those nodes, in toposort order
        pre_call_clear - a list of containers to empty at the beginning of
            each call.
        """
        if len(nodes) != len(thunks):
            raise ValueError('nodes and thunks must have the same length')
        self.nodes = nodes
        self.thunks = thunks
        self.pre_call_clear = pre_call_clear
        self.call_counts = [0] * len(nodes)
        self.call_times = [0] * len(nodes)
        # subclasses only record per-thunk timing when this flag is set
        self.time_thunks = False

    def __call__(self):
        """
        Run the machine.

        Postcondition - all output variables have been computed.  VMs vary
        in what exactly this means and how it is done.
        """
        raise NotImplementedError('override me')

    def clear_storage(self):
        """
        Free any internal references to temporary variables.

        Free internal variables and outputs.  Essentially, free as much
        memory as possible without interfering with the ability to evaluate
        subsequent calls.
        """
        raise NotImplementedError('override me')

    def update_profile(self, profile):
        """Accumulate this VM's timing counters into `profile`, then reset
        them so the next round of calls starts from zero."""
        for node, thunk, t, c in zip(self.nodes, self.thunks,
                self.call_times, self.call_counts):
            profile.apply_time.setdefault(node, 0.0)
            profile.apply_time[node] += t
            profile.apply_callcount.setdefault(node, 0)
            # accumulate like apply_time does; plain assignment here
            # discarded the counts recorded by earlier update_profile calls
            profile.apply_callcount[node] += c
            profile.apply_cimpl[node] = hasattr(thunk, 'cthunk')
        # clear the timer info out of the buffers
        for i in range(len(self.call_times)):
            self.call_times[i] = 0.0
            self.call_counts[i] = 0
class Loop(VM):
    """
    Run every thunk once, first to last, in Python.

    Intermediate results are never garbage collected, so all storage stays
    allocated across the whole call.
    """
    def __call__(self):
        # empty the no-recycling containers before computing anything
        for container in self.pre_call_clear:
            container[0] = None
        node = None
        if self.time_thunks:
            try:
                for idx, (thunk, node) in enumerate(
                        zip(self.thunks, self.nodes)):
                    start = time.time()
                    thunk()
                    self.call_times[idx] += time.time() - start
                    self.call_counts[idx] += 1
            except:
                # `node` is the Apply whose thunk just raised
                raise_with_op(node)
        else:
            try:
                for thunk, node in zip(self.thunks, self.nodes):
                    thunk()
            except:
                raise_with_op(node)
class LoopGC(VM):
    """
    Run every thunk once, first to last, in Python, freeing each
    intermediate result as soon as its last consumer has executed.
    """
    def __init__(self, nodes, thunks, pre_call_clear, post_thunk_clear):
        """
        post_thunk_clear - for each node, the storage cells to empty right
        after that node's thunk has run.
        """
        super(LoopGC, self).__init__(nodes, thunks, pre_call_clear)
        self.post_thunk_clear = post_thunk_clear
        if not (len(nodes) == len(thunks) == len(post_thunk_clear)):
            raise ValueError()

    def __call__(self):
        # empty the no-recycling containers before computing anything
        for container in self.pre_call_clear:
            container[0] = None
        node = None
        steps = zip(self.thunks, self.nodes, self.post_thunk_clear)
        if self.time_thunks:
            try:
                for idx, (thunk, node, old_storage) in enumerate(steps):
                    start = time.time()
                    thunk()
                    self.call_counts[idx] += 1
                    self.call_times[idx] += time.time() - start
                    # this node was the last consumer of these cells
                    for cell in old_storage:
                        cell[0] = None
            except:
                raise_with_op(node)
        else:
            try:
                for thunk, node, old_storage in steps:
                    thunk()
                    for cell in old_storage:
                        cell[0] = None
            except:
                raise_with_op(node)
class Stack(VM):
    """
    Finish-to-start evaluation order of thunks.

    This supports lazy evaluation of subtrees and partial
    computations of graphs when only some inputs have changed.
    """
    def __init__(self, nodes, thunks, pre_call_clear,
            storage_map, compute_map,
            env, allow_gc):
        """
        storage_map - dict: variable -> one-element list holding its value
        compute_map - dict: variable -> one-element list holding an
            "already computed" flag
        env - the graph whose outputs must be computed
        allow_gc - free intermediate values once nothing else needs them
        """
        super(Stack, self).__init__(nodes, thunks, pre_call_clear)
        self.allow_gc = allow_gc
        self.message = ""
        # evaluation starts from the nodes that produce the graph outputs
        self.base_apply_stack = [o.owner for o in env.outputs if o.owner]
        self.outputs = env.outputs
        self.storage_map = storage_map
        self.apply_time = {}
        self.outputs_size = {}
        self.compute_map = compute_map
        self.node_idx = node_idx = {}
        ords = env.orderings()
        for i, node in enumerate(self.nodes):
            node_idx[node] = i
            self.apply_time[node] = 0
            self.outputs_size[node] = []
            # variables that must be computed before this node may run,
            # as imposed by e.g. the destroy handler
            node.destroy_dependencies = []
            if node in ords:
                for prereq in ords[node]:
                    node.destroy_dependencies += prereq.outputs
        # dependencies[k] lists the variables whose computation consumes k;
        # used by the garbage collector below to decide when k's storage
        # may be freed
        dependencies = self.dependencies = {}
        for k in storage_map:
            dependencies[k] = []
            if k.owner and k.clients:
                ls = []
                for cl in k.clients:
                    # 'output' is a pseudo-client marking graph outputs.
                    # Compare with != : using `is not` on a string literal
                    # relies on interning and is not guaranteed correct.
                    if cl[0] != 'output':
                        ls += cl[0].outputs
                dependencies[k] += ls
        # Bytes per element, keyed by the last 3 characters of the dtype
        # name: int8/uint8 -> 'nt8', int16/float16 -> 't16',
        # int32/float32 -> 't32', int64/float64 -> 't64',
        # complex64 -> 'x64', complex128 -> '128'.
        # (A registration of a nonexistent `atexit_print_all` method via
        # the never-imported `atexit` module was removed here: it crashed
        # whenever config.profile was enabled.)
        self.memory_size_map = {"nt8": 1, "t16": 2, "t32": 4, "t64": 8,
                "x64": 8, "128": 16}

    def _output_sizes(self, thunk):
        """Return the memory footprint, in bytes, of each output of
        `thunk`, with -1 when it cannot be determined.

        Note: for an inplace op this over-reports, since no new memory was
        actually requested.
        """
        sizes = []
        for o in thunk.outputs:
            if not hasattr(o[0], 'size'):
                sizes.append(-1)
                continue
            # look the element size up with .get() so an unknown dtype
            # yields the -1 sentinel instead of a KeyError
            nbytes = self.memory_size_map.get(str(o[0].dtype)[-3:])
            if nbytes is None:
                sizes.append(-1)
            else:
                sizes.append(o[0].size * nbytes)
        return sizes

    def _gc_inputs(self, current_apply):
        """Free the storage of `current_apply`'s inputs that no pending
        computation still needs (and that are not graph outputs)."""
        for i in current_apply.inputs:
            if (self.dependencies[i] and i.owner
                    and i not in self.outputs):
                empty_storage_map = True
                for x in self.dependencies[i]:
                    if not self.compute_map[x][0]:
                        empty_storage_map = False
                        break
                if empty_storage_map:
                    self.storage_map[i][0] = None

    def __call__(self):
        storage_map = self.storage_map
        compute_map = self.compute_map
        thunks = self.thunks
        for k in self.storage_map:
            compute_map[k][0] = (k.owner is None)

        # apply_stack contains nodes
        apply_stack = list(self.base_apply_stack)
        last_apply_stack_len = -1

        while apply_stack:
            # Make sure something happened last time round.  This is just
            # a safety check to make sure the op is written correctly:
            # apply_stack should either decrease in length by one (a thunk
            # successfully applied), or increase in length (added
            # dependencies over and above the original).
            # NB: this doesn't catch cycles (would be too expensive/slow),
            # just stalls.
            apply_stack_len = len(apply_stack)
            assert apply_stack_len != last_apply_stack_len
            last_apply_stack_len = apply_stack_len

            current_apply = apply_stack.pop()

            # Use these for loops + breaks to short circuit evaluation.
            # This is a significant performance point.
            computed_ins = True
            for i in current_apply.inputs:
                if not compute_map[i][0]:
                    computed_ins = False
                    break
            computed_outs = True
            for o in current_apply.outputs:
                if not compute_map[o][0]:
                    computed_outs = False
                    break
            if computed_ins:
                for d in current_apply.destroy_dependencies:
                    if not compute_map[d][0]:
                        computed_ins = False
                        break

            if not thunks[self.node_idx[current_apply]].lazy:
                # Check if all inputs are in place.  If so compute the
                # thunk and leave it off the apply_stack.  If not, put it
                # back and schedule the nodes that will produce the
                # missing inputs.
                if computed_ins and not computed_outs:
                    try:
                        t0 = time.time()
                        thunks[self.node_idx[current_apply]]()
                        if config.profile:
                            dt = time.time() - t0
                            self.apply_time[current_apply] += dt
                            # memory footprint of this op's outputs
                            self.outputs_size[current_apply] = \
                                    self._output_sizes(
                                        thunks[self.node_idx[current_apply]])
                    except Exception:
                        raise_with_op(current_apply)
                    for o in current_apply.outputs:
                        compute_map[o][0] = 1
                    # Garbage Collection -> check if anybody else uses
                    # this input
                    if self.allow_gc:
                        self._gc_inputs(current_apply)
                elif not computed_ins:
                    apply_stack.append(current_apply)
                    apply_stack.extend(inp.owner
                            for inp in current_apply.inputs if inp.owner)
                    apply_stack.extend(inp.owner
                            for inp in current_apply.destroy_dependencies
                            if inp.owner)
            elif not computed_outs:
                # A lazy thunk runs even with missing inputs: it returns
                # the indices of the inputs it still requires.
                try:
                    t0 = time.time()
                    requires = thunks[self.node_idx[current_apply]]()
                    dt = time.time() - t0
                    self.apply_time[current_apply] += dt
                except Exception:
                    raise_with_op(current_apply)
                if requires:
                    for r in requires:
                        # We are not done with this op: put it back and
                        # schedule the owners of the inputs we are missing.
                        apply_stack.append(current_apply)
                        if current_apply.inputs[r].owner:
                            apply_stack.append(
                                    current_apply.inputs[r].owner)
                else:
                    if config.profile:
                        self.outputs_size[current_apply] = \
                                self._output_sizes(
                                    thunks[self.node_idx[current_apply]])
                    if self.allow_gc:
                        self._gc_inputs(current_apply)
# The compiled (C) implementation of the lazy evaluation loop.  It is
# optional: when the extension module cannot be imported, only the pure
# Python VMs above are available.
try:
    import lazylinker_c

    class CVM(lazylinker_c.CLazyLinker, VM):
        def __init__(self, *args, **kwargs):
            # All evaluation state lives in the C struct: initialize only
            # the C base class.
            lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs)
            # skip VM.__init__
except ImportError:
    pass
class VM_Linker(link.LocalLinker):
    """
    Class that satisfies the Linker interface by acting as a VM factory.

    From an accepted Env it builds either a compiled CVM (when
    ``use_cloop`` is set and the lazylinker_c extension is available) or
    one of the Python VMs above: Loop / LoopGC when no thunk is lazy,
    Stack otherwise.
    """
    def __init__(self, allow_gc=True, use_cloop = False):
        # allow_gc - free intermediate storage as soon as no later node
        #     needs it
        # use_cloop - drive evaluation with the compiled CVM loop
        self.env = None
        self.allow_gc = allow_gc
        self.use_cloop=use_cloop

    def accept(self, env, no_recycling = []):
        """
        :param env: a PerformLinker can have accepted one Env instance at a time.
        :param no_recycling: WRITEME
        :returns: self (TODO: WHY? Who calls this function?)
        """
        if self.env is not None and self.env is not env:
            # this linker is already bound to another Env: delegate the new
            # one to a fresh linker of the same type
            return type(self)().accept(env, no_recycling)
        self.env = env
        self.no_recycling = no_recycling
        return self

    def make_vm(self, nodes, thunks,
            input_storage, output_storage, storage_map,
            post_thunk_clear,
            computed,
            compute_map
            ):
        """Instantiate the VM that will evaluate `nodes` via `thunks`.

        Selects CVM, Loop, LoopGC or Stack depending on ``use_cloop``,
        ``allow_gc``, and whether any thunk is lazy.
        """
        pre_call_clear = [storage_map[v] for v in self.no_recycling]
        if self.use_cloop:
            # create a map from nodes to ints and vars to ints
            nodes_idx = {}
            vars_idx = {}
            for i, node in enumerate(nodes):
                nodes_idx[node] = i
                for v in node.inputs + node.outputs:
                    vars_idx.setdefault(v, len(vars_idx))
            for v in self.env.inputs + self.env.outputs:
                vars_idx.setdefault(v, len(vars_idx))
            nodes_idx_inv = {}
            vars_idx_inv = {}
            for (node,i) in nodes_idx.items():
                nodes_idx_inv[i] = node
            for (var,i) in vars_idx.items():
                vars_idx_inv[i] = var
            # put storage_map and compute_map into a int-based scheme
            n_applies = len(nodes)
            storage_map_list = [storage_map[vars_idx_inv[i]]
                    for i in range(len(vars_idx_inv))]
            compute_map_list = [compute_map[vars_idx_inv[i]]
                    for i in range(len(vars_idx_inv))]
            if nodes:
                assert type(storage_map_list[0]) is list
                assert type(compute_map_list[0]) is list
            # build the pointers to node inputs and offsets
            base_input_output_list = []
            node_n_inputs = []
            node_n_outputs = []
            node_input_offset = []
            node_output_offset = []
            for node in nodes:
                inputs_idx = [vars_idx[v] for v in node.inputs]
                outputs_idx = [vars_idx[v] for v in node.outputs]
                node_n_inputs.append(len(inputs_idx))
                node_n_outputs.append(len(outputs_idx))
                node_input_offset.append(len(base_input_output_list))
                base_input_output_list.extend(inputs_idx)
                node_output_offset.append(len(base_input_output_list))
                base_input_output_list.extend(outputs_idx)
            # build the var owner array
            var_owner = [None]*len(vars_idx)
            for (var,i) in vars_idx.items():
                if var.owner:
                    var_owner[i] = nodes_idx[var.owner]
            is_lazy_list = [int(th.lazy) for th in thunks]
            output_vars = [vars_idx[v] for v in self.env.outputs]
            # builds the list of prereqs induced by e.g. destroy_handler
            ords = self.env.orderings()
            node_prereqs = []
            node_output_size = []
            for i, node in enumerate(nodes):
                node_output_size.append(0)
                prereq_var_idxs = []
                for prereq_node in ords.get(node,[]):
                    prereq_var_idxs.extend(
                            [vars_idx[v] for v in prereq_node.outputs])
                prereq_var_idxs = list(set(prereq_var_idxs))
                prereq_var_idxs.sort() # TODO: why sort?
                node_prereqs.append(prereq_var_idxs)
            # sanity check: constructing the CVM must not change the
            # refcount of the lists handed over to it
            c0 = sys.getrefcount(node_n_inputs)
            vm = CVM(
                    nodes,
                    thunks,
                    pre_call_clear,
                    allow_gc=self.allow_gc,
                    call_counts=[0]*len(nodes),
                    call_times=[0.0]*len(nodes),
                    compute_map_list=compute_map_list,
                    base_input_output_list=base_input_output_list,
                    node_n_inputs=node_n_inputs,
                    node_n_outputs=node_n_outputs,
                    node_input_offset=node_input_offset,
                    node_output_offset=node_output_offset,
                    var_owner=var_owner,
                    is_lazy_list=is_lazy_list,
                    output_vars=output_vars,
                    node_prereqs=node_prereqs,
                    node_output_size=node_output_size,
                    )
            assert c0 == sys.getrefcount(node_n_inputs)
        else:
            if all([(not th.lazy) for th in thunks]):
                # there is no conditional in the graph
                if self.allow_gc:
                    vm = LoopGC(
                            nodes,
                            thunks,
                            pre_call_clear,
                            post_thunk_clear)
                else:
                    vm = Loop(
                            nodes,
                            thunks,
                            pre_call_clear)
            else:
                # lazy thunks require the finish-to-start evaluator
                vm = Stack(
                        nodes, thunks, pre_call_clear,
                        storage_map, compute_map,
                        self.env, self.allow_gc
                        )
        return vm

    def make_all(self, profiler = None, input_storage = None, output_storage = None):
        """Build storage, thunks and a VM for the accepted Env.

        Returns the (vm, input containers, output containers, thunks,
        order) tuple expected by the Linker interface.
        """
        env = self.env
        order = list(env.toposort())
        no_recycling = self.no_recycling
        input_storage, output_storage, storage_map = link.map_storage(
                env, order, input_storage, output_storage)
        # graph inputs are considered computed from the start
        compute_map = {}
        for k in storage_map:
            compute_map[k] = [k.owner is None]
        thunks = [node.op.make_thunk(node,
                    storage_map,
                    compute_map,
                    no_recycling)
                for node in order]
        computed, last_user = link.gc_helper(order)
        if self.allow_gc:
            post_thunk_clear = []
            for node in order:
                clear_after_this_thunk = []
                for input in node.inputs:
                    # free an input right after this node when this node is
                    # its last user and it is not a graph output
                    if ((input in computed)
                            and (input not in env.outputs)
                            and (node == last_user[input])):
                        clear_after_this_thunk.append(storage_map[input])
                post_thunk_clear.append(clear_after_this_thunk)
        else:
            post_thunk_clear = None
        vm = self.make_vm(order, thunks,
                input_storage, output_storage, storage_map,
                post_thunk_clear,
                computed,
                compute_map
                )
        return (vm,
                [link.Container(input, storage)
                    for input, storage in zip(env.inputs, input_storage)],
                [link.Container(output, storage, True)
                    for output, storage in zip(env.outputs, output_storage)],
                thunks,
                order)
"""
IfElse is an Op that works with the LazyLinker to support conditional graph evaluation.
:TODO: Add text to library documentation describing the IfElse Op.
"""
from copy import deepcopy
from theano.gof import PureOp, Apply, generic, Container
import theano.tensor
import gof
from compile import optdb
from tensor import opt
@gof.local_optimizer([None])
def ifelse_make_inplace(node):
    """Replace a copying IfElse node by its in-place (view) version.

    :returns: the outputs of a new ``IfElse(as_view=True)`` node when
        `node` is a non-view IfElse, otherwise False so the optimizer
        keeps looking.
    """
    op = node.op
    if isinstance(op, IfElse) and not op.as_view:
        # (an unconditional debug print that fired on every application of
        # this optimization was removed here)
        return IfElse(as_view=True,
                gpu=op.gpu, name=op.name).make_node(*node.inputs).outputs
    return False
# Run late (position 95) in fast_run, so the in-place substitution happens
# only after the rest of the graph has stabilized.
optdb.register('ifelse_make_inplace', opt.in2out(ifelse_make_inplace,
    ignore_newtrees=True), 95, 'fast_run', 'inplace')
class IfElse(PureOp):
    """
    Op that works with LazyLinker to support conditional graph evaluation.

    Example usage:

        ``rval = ifelse(tf, rval_if_true, rval_if_false)``

    :note:
        Other Linkers (ALL other linkers right now) are INCOMPATIBLE with
        this Op, they will produce functions that FAIL TO EXECUTE.
    """
    def __init__(self, as_view=False, gpu = False, name = None):
        if as_view:
            # check destroyhandler and others to ensure that a view_map
            # with multiple inputs can work
            self.view_map = {0: [1]}
            #raise NotImplementedError('IfElse must copy for now')
        self.as_view = as_view
        self.gpu = gpu
        self.name = name

    def make_node(self, c, t, f):
        # either branch may become the output, so both must share a type
        if t.type != f.type:
            raise TypeError(
                'IfElse requires same types for true and false args',
                (t.type, f.type))
        return Apply(self, [c, t, f], [t.type()])

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        out_type = node.outputs[0].type
        cond, true_in, false_in = node.inputs
        out = node.outputs[0]

        def thunk():
            # Lazy protocol: return the list of input indices still
            # required, or [] once the output has been stored.
            if not compute_map[cond][0]:
                return [0]
            if storage_map[cond][0]:
                if not compute_map[true_in][0]:
                    return [1]
                compute_map[out][0] = 1
                if self.as_view:
                    result = out_type.filter(storage_map[true_in][0])
                else:
                    result = out_type.filter(
                            deepcopy(storage_map[true_in][0]))
                storage_map[out][0] = result
                return []
            if not compute_map[false_in][0]:
                return [2]
            # can't view both outputs unless destroyhandler improves
            compute_map[out][0] = 1
            storage_map[out][0] = out_type.filter(
                    deepcopy(storage_map[false_in][0]))
            return []

        thunk.lazy = True
        thunk.inputs = [storage_map[v] for v in node.inputs]
        thunk.outputs = [storage_map[v] for v in node.outputs]
        return thunk
ifelse = IfElse()
...@@ -391,7 +391,7 @@ default_colorCodes = {'GpuFromHost' : 'red', ...@@ -391,7 +391,7 @@ default_colorCodes = {'GpuFromHost' : 'red',
'HostFromGpu' : 'red', 'HostFromGpu' : 'red',
'Scan' : 'yellow', 'Scan' : 'yellow',
'Shape' : 'cyan', 'Shape' : 'cyan',
'Cond' : 'magenta', 'IfElse' : 'magenta',
'Elemwise': '#FFAABB', 'Elemwise': '#FFAABB',
'Subtensor': '#FFAAFF'} 'Subtensor': '#FFAAFF'}
...@@ -473,10 +473,10 @@ def pydotprint(fct, outfile=None, ...@@ -473,10 +473,10 @@ def pydotprint(fct, outfile=None,
c3 = pd.Cluster('Middle') c3 = pd.Cluster('Middle')
cond = None cond = None
for node in fct_env.toposort(): for node in fct_env.toposort():
if node.op.__class__.__name__=='Cond' and node.op.name == cond_highlight: if node.op.__class__.__name__=='IfElse' and node.op.name == cond_highlight:
cond = node cond = node
if cond is None: if cond is None:
_warn("pydotprint: cond_highlight is set but there is no Cond node in the graph") _warn("pydotprint: cond_highlight is set but there is no IfElse node in the graph")
cond_highlight = None cond_highlight = None
if cond_highlight is not None: if cond_highlight is not None:
......
import atexit, logging, os, stat, sys import atexit, logging, os, stat, sys
from theano.compile import optdb from theano.compile import optdb
from theano import config
from theano.gof.cmodule import get_lib_extension from theano.gof.cmodule import get_lib_extension
from theano.configparser import config, AddConfigVar, StrParam
import nvcc_compiler import nvcc_compiler
_logger_name = 'theano.sandbox.cuda' _logger_name = 'theano.sandbox.cuda'
...@@ -20,6 +20,22 @@ def debug(*msg): ...@@ -20,6 +20,22 @@ def debug(*msg):
_logger.debug('DEBUG (%s): %s'% ( _logger_name, _logger.debug('DEBUG (%s): %s'% ( _logger_name,
' '.join(str(m) for m in msg))) ' '.join(str(m) for m in msg)))
AddConfigVar('cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking dynamically
compiled modules. If AUTO, if nvcc is in the path, it will use one of
this parent directory. Otherwise /usr/local/cuda. Leave empty to
prevent extra linker directives.
Default: environment variable "CUDA_ROOT" or else "AUTO".
""",
StrParam(os.getenv('CUDA_ROOT', "AUTO")))
if config.cuda.root == "AUTO":
# set nvcc_path correctly and get the version
nvcc_compiler.set_cuda_root()
#is_nvcc_available called here to initialize global vars in nvcc_compiler module
nvcc_compiler.is_nvcc_available()
# Compile cuda_ndarray.cu # Compile cuda_ndarray.cu
# This need that nvcc (part of cuda) is installed. If it is not, a warning is # This need that nvcc (part of cuda) is installed. If it is not, a warning is
......
...@@ -7,20 +7,7 @@ import commands ...@@ -7,20 +7,7 @@ import commands
_logger=logging.getLogger("theano.sandbox.cuda.nvcc_compiler") _logger=logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
_logger.setLevel(logging.WARN) _logger.setLevel(logging.WARN)
from theano.configparser import config, AddConfigVar, StrParam from theano.configparser import config, AddConfigVar, StrParam, BoolParam
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc in this directory",
StrParam(""))
AddConfigVar('cuda.nvccflags',
"Extra compiler flags for nvcc",
StrParam(""))
AddConfigVar('cuda.root',
"The directory with bin/, lib/, include/ for cuda utilities. Used to put this directory of nvidia lib in the compiled libraire. Usefull when people forget to update there LD_LIBRARY_PATH and LIBRARY_PATH environment variable. If AUTO, if nvcc is in the path, it will use one of this parent directory. Otherwise /usr/local/cuda. If empty, won't appen the directory in the compiled library",
StrParam(os.getenv('CUDA_ROOT', "AUTO")))
def error(*args): def error(*args):
#sys.stderr.write('ERROR:'+ ' '.join(str(a) for a in args)+'\n') #sys.stderr.write('ERROR:'+ ' '.join(str(a) for a in args)+'\n')
...@@ -35,6 +22,18 @@ def debug(*args): ...@@ -35,6 +22,18 @@ def debug(*args):
#sys.stderr.write('DEBUG:'+ ' '.join(str(a) for a in args)+'\n') #sys.stderr.write('DEBUG:'+ ' '.join(str(a) for a in args)+'\n')
_logger.debug("DEBUG: "+' '.join(str(a) for a in args)) _logger.debug("DEBUG: "+' '.join(str(a) for a in args))
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc in this directory",
StrParam(""))
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
StrParam(""))
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False))
nvcc_path = 'nvcc' nvcc_path = 'nvcc'
nvcc_version = None nvcc_version = None
def is_nvcc_available(): def is_nvcc_available():
...@@ -66,11 +65,6 @@ def set_cuda_root(): ...@@ -66,11 +65,6 @@ def set_cuda_root():
config.cuda.root = os.path.split(dir)[0] config.cuda.root = os.path.split(dir)[0]
return return
if config.cuda.root == "AUTO":
set_cuda_root()
is_nvcc_available()#to set nvcc_path correctly and get the version
rpath_defaults = [] rpath_defaults = []
def add_standard_rpath(rpath): def add_standard_rpath(rpath):
rpath_defaults.append(rpath) rpath_defaults.append(rpath)
...@@ -183,11 +177,9 @@ def nvcc_module_compile_str( ...@@ -183,11 +177,9 @@ def nvcc_module_compile_str(
if sys.platform != 'darwin': if sys.platform != 'darwin':
# the 64bit CUDA libs are in the same files as are named by the function above # the 64bit CUDA libs are in the same files as are named by the function above
rpaths.append(os.path.join(config.cuda.root,'lib64')) rpaths.append(os.path.join(config.cuda.root,'lib64'))
for rpath in rpaths: for rpath in rpaths:
cmd.extend(['-Xlinker',','.join(['-rpath',rpath])]) cmd.extend(['-Xlinker',','.join(['-rpath',rpath])])
nvccflags = [flag for flag in config.cuda.nvccflags.split(' ') if flag] cmd.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
cmd.extend(nvccflags)
cmd.extend('-I%s'%idir for idir in include_dirs) cmd.extend('-I%s'%idir for idir in include_dirs)
cmd.extend(['-o',lib_filename]) cmd.extend(['-o',lib_filename])
cmd.append(os.path.split(cppfilename)[-1]) cmd.append(os.path.split(cppfilename)[-1])
......
...@@ -270,6 +270,48 @@ def local_gpu_dot_to_dot22(node): ...@@ -270,6 +270,48 @@ def local_gpu_dot_to_dot22(node):
shape_out))] shape_out))]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_lazy_ifelse(node):
    """Move a lazy IfElse onto the GPU when its data already lives there.

        gpu_from_host(ifelse(c, t, f)) -> gpu_ifelse(c, gpu(t), gpu(f))
        ifelse(c, host_from_gpu(...), ...)
            -> host_from_gpu(gpu_ifelse(c, gpu(t), gpu(f)))

    Both branches are moved to the GPU; the condition always stays on
    the host.  (The previous docstring was copied by mistake from the
    dot22 optimization.)
    """
    import theano
    # theano.lazycond is an optional module: do nothing when absent
    if hasattr(theano, "lazycond"):
        gpu_ifelse = theano.lazycond.IfElse(gpu=True)
        if node.op == gpu_from_host:
            # the result of an ifelse is being shipped to the GPU:
            # compute the ifelse there instead
            host_input = node.inputs[0]
            if (host_input.owner
                    and host_input.owner.op == theano.lazycond.ifelse):
                c, t, f = host_input.owner.inputs
                if not isinstance(f.type, CudaNdarrayType):
                    f = gpu_from_host(f)
                if not isinstance(t.type, CudaNdarrayType):
                    t = gpu_from_host(t)
                if isinstance(c.type, CudaNdarrayType):
                    # the condition is a host-side decision
                    c = host_from_gpu(c)
                return [gpu_ifelse(c, t, f)]
        if node.op == theano.lazycond.ifelse:
            # some input already comes from the GPU: compute there and
            # ship only the result back to the host
            if any(i.owner and i.owner.op == host_from_gpu
                    for i in node.inputs):
                c, t, f = node.inputs
                if not isinstance(f.type, CudaNdarrayType):
                    f = gpu_from_host(f)
                if not isinstance(t.type, CudaNdarrayType):
                    t = gpu_from_host(t)
                if isinstance(c.type, CudaNdarrayType):
                    c = host_from_gpu(c)
                return [host_from_gpu(gpu_ifelse(c, t, f))]
    return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
......
...@@ -567,7 +567,7 @@ class ScanMerge(gof.Optimizer): ...@@ -567,7 +567,7 @@ class ScanMerge(gof.Optimizer):
def apply(self, env): def apply(self, env):
nodelist = list(env.toposort()) nodelist = list(env.toposort())
cond_nodes = [ x for x in nodelist if x.op.__class__.__name__=='Cond'] cond_nodes = [ x for x in nodelist if x.op.__class__.__name__=='IfElse']
scan_nodes = [ x for x in nodelist if x.op.__class__.__name__=='Scan'] scan_nodes = [ x for x in nodelist if x.op.__class__.__name__=='Scan']
# Having lazy ifs in the graph complicates a bit things, and for # Having lazy ifs in the graph complicates a bit things, and for
......
...@@ -133,6 +133,79 @@ def sp_ones_like(x): ...@@ -133,6 +133,79 @@ def sp_ones_like(x):
data, indices, indptr, shape = csm_properties(x) #TODO: don't restrict to CSM formats data, indices, indptr, shape = csm_properties(x) #TODO: don't restrict to CSM formats
return CSM(format=x.format)(tensor.ones_like(data), indices, indptr, shape) return CSM(format=x.format)(tensor.ones_like(data), indices, indptr, shape)
class _sparse_py_operators:
T = property(lambda self: transpose(self), doc = "Return aliased transpose of self (read-only)")
def __neg__(self): return neg(self)
def __add__(left, right): return add(left, right)
def __radd__(right, left): return add(left, right)
def __sub__(left, right): return sub(left, right)
def __rsub__(right, left): return sub(left, right)
def __mul__(left, right): return mul(left, right)
def __rmul__(left, right): return mul(left, right)
#extra pseudo-operator symbols
def __dot__(left, right): return structured_dot(left, right)
def __rdot__(right, left): return structured_dot(left, right)
#N.B. THIS IS COMMENTED OUT ON PURPOSE!!!
# Discussion with Fred & James (at least, and maybe others before)
# we decided that casting from a sparse to dense should be explicit
# because it's usually something you want to be pretty careful about,
# and not to do by accident.
#def _as_TensorVariable(self):
# return dense_from_sparse(self)
shape = property(lambda self: tensor.shape(dense_from_sparse(self))) # don't worry!
# ... the plan is that the ShapeFeature in tensor.opt will do shape propagation
# ... and remove the dense_from_sparse from the graph. This will *NOT* actually expand
# ... your sparse matrix just to get the shape.
ndim = property(lambda self: self.type.ndim)
dtype = property(lambda self: self.type.dtype)
class SparseVariable(gof.Variable, _sparse_py_operators):
dtype = property(lambda self: self.type.dtype)
format = property(lambda self: self.type.format)
def __str__(self):
return '%s{%s,%s}'%(
self.__class__.__name__,
self.format,
self.dtype)
def __repr__(self):
return str(self)
class SparseConstantSignature(tuple):
def __eq__(self, other):
(a, b), (x,y) = self, other
return a == x \
and (b.dtype == y.dtype)\
and (type(b) == type(y))\
and (b.shape == y.shape)\
and (abs(b-y).sum() < 1e-6 * b.nnz)
def __hash__(self):
(a,b) = self
return hash(type(self)) ^ hash(a) ^ hash(type(b))
class SparseConstant(gof.Constant, _sparse_py_operators):
dtype = property(lambda self: self.type.dtype)
format = property(lambda self: self.type.format)
def signature(self):
assert self.data is not None
return SparseConstantSignature((self.type, self.data))
def __str__(self):
return '%s{%s,%s,shape=%s,nnz=%s}'%(
self.__class__.__name__,
self.format,
self.dtype,
self.data.shape,
self.data.nnz)
def __repr__(self):
return str(self)
class SparseValue(gof.Value, _sparse_py_operators):
dtype = property(lambda self: self.type.dtype)
format = property(lambda self: self.type.format)
class SparseType(gof.Type): class SparseType(gof.Type):
""" """
...@@ -149,6 +222,9 @@ class SparseType(gof.Type): ...@@ -149,6 +222,9 @@ class SparseType(gof.Type):
dtype_set = set(['int', 'int8', 'int16','int32', 'int64', 'float32', 'float64', 'complex64','complex128']) dtype_set = set(['int', 'int8', 'int16','int32', 'int64', 'float32', 'float64', 'complex64','complex128'])
ndim = 2 ndim = 2
Variable = SparseVariable
Constant = SparseConstant
def __init__(self, format, dtype): def __init__(self, format, dtype):
""" """
Fundamental way to create a sparse node. Fundamental way to create a sparse node.
...@@ -248,65 +324,6 @@ csr_dmatrix = SparseType(format='csr', dtype='float64') ...@@ -248,65 +324,6 @@ csr_dmatrix = SparseType(format='csr', dtype='float64')
csc_fmatrix = SparseType(format='csc', dtype='float32') csc_fmatrix = SparseType(format='csc', dtype='float32')
csr_fmatrix = SparseType(format='csr', dtype='float32') csr_fmatrix = SparseType(format='csr', dtype='float32')
class _sparse_py_operators:
T = property(lambda self: transpose(self), doc = "Return aliased transpose of self (read-only)")
def __neg__(self): return neg(self)
def __add__(left, right): return add(left, right)
def __radd__(right, left): return add(left, right)
def __sub__(left, right): return sub(left, right)
def __rsub__(right, left): return sub(left, right)
def __mul__(left, right): return mul(left, right)
def __rmul__(left, right): return mul(left, right)
#extra pseudo-operator symbols
def __dot__(left, right): return structured_dot(left, right)
def __rdot__(right, left): return structured_dot(left, right)
#N.B. THIS IS COMMENTED OUT ON PURPOSE!!!
# Discussion with Fred & James (at least, and maybe others before)
# we decided that casting from a sparse to dense should be explicit
# because it's usually something you want to be pretty careful about,
# and not to do by accident.
#def _as_TensorVariable(self):
# return dense_from_sparse(self)
shape = property(lambda self: tensor.shape(dense_from_sparse(self))) # don't worry!
# ... the plan is that the ShapeFeature in tensor.opt will do shape propagation
# ... and remove the dense_from_sparse from the graph. This will *NOT* actually expand
# ... your sparse matrix just to get the shape.
ndim = property(lambda self: self.type.ndim)
dtype = property(lambda self: self.type.dtype)
class SparseVariable(gof.Variable, _sparse_py_operators):
dtype = property(lambda self: self.type.dtype)
format = property(lambda self: self.type.format)
class SparseConstantSignature(tuple):
    """Hashable signature for a sparse constant: a pair ``(type, data)``.

    Two signatures compare equal when the first elements compare equal and
    the two sparse matrices share dtype, concrete class, and shape, and
    their contents differ by at most a small tolerance.
    """

    def __eq__(self, other):
        (a, b), (x, y) = self, other
        # Cheap structural checks first; they also guard the subtraction
        # below against shape mismatches.
        if not (a == x
                and b.dtype == y.dtype
                and type(b) == type(y)
                and b.shape == y.shape):
            return False
        # Content comparison with a tolerance scaled by the number of
        # stored elements.  Use <= (not <) so that two matrices with no
        # stored elements (nnz == 0, hence a zero tolerance) still compare
        # equal when their difference is exactly zero.
        return abs(b - y).sum() <= 1e-6 * b.nnz

    def __hash__(self):
        # Hash only the type and the data's class; __eq__ performs the
        # expensive content comparison.
        (a, b) = self
        return hash(type(self)) ^ hash(a) ^ hash(type(b))
class SparseConstant(gof.Constant, _sparse_py_operators):
    """Graph constant wrapping a concrete sparse matrix value."""

    @property
    def dtype(self):
        return self.type.dtype

    @property
    def format(self):
        return self.type.format

    def signature(self):
        """Return a SparseConstantSignature built from (self.type, self.data)."""
        assert self.data is not None
        return SparseConstantSignature((self.type, self.data))
class SparseValue(gof.Value, _sparse_py_operators):
    """Graph value container for a sparse matrix (non-constant)."""

    @property
    def dtype(self):
        return self.type.dtype

    @property
    def format(self):
        return self.type.format
# CONSTRUCTION # CONSTRUCTION
class CSMProperties(gof.Op): class CSMProperties(gof.Op):
"""Extract all of .data .indices and .indptr""" """Extract all of .data .indices and .indptr"""
......
...@@ -937,6 +937,9 @@ def _gemm_from_node2(node): ...@@ -937,6 +937,9 @@ def _gemm_from_node2(node):
lst = _factor_canonicalized(lst) lst = _factor_canonicalized(lst)
rval = _gemm_from_factored_list(lst) rval = _gemm_from_factored_list(lst)
#print "RVAL", rval #print "RVAL", rval
# THIS GOT COMMENTED OUT AT SOME POINT - ASK P.Lamblin maybe why?
#if rval:
# assert rval[0].type == node.outputs[0].type, (rval[0].type, node.outputs[0].type)
if rval and (rval[0].type == node.outputs[0].type): if rval and (rval[0].type == node.outputs[0].type):
return rval return rval
......
...@@ -3057,30 +3057,33 @@ def constant_folding(node): ...@@ -3057,30 +3057,33 @@ def constant_folding(node):
for input in node.inputs: for input in node.inputs:
if not isinstance(input, Constant): if not isinstance(input, Constant):
return False return False
try: #condition: all inputs are constant
storage = [[None] for output in node.outputs]
node.op.perform(node, [x.data for x in node.inputs], storage) storage_map=dict([(i,[i.data]) for i in node.inputs])
except MethodNotDefined: compute_map=dict([(i,[True]) for i in node.inputs])
tmp_inputs = [x.type() for x in node.inputs] for o in node.outputs:
f = compile.function( storage_map[o] = [None]
inputs=tmp_inputs, compute_map[o] = [False]
outputs=node.op.make_node(*tmp_inputs).outputs,
mode=compile.Mode(linker='c|py',optimizer=None)) thunk = node.op.make_thunk(node, storage_map, compute_map,
xvals = f(*[x.data for x in node.inputs]) no_recycling=[])
storage = [[xv] for xv in xvals]
required = thunk()
msg = [] assert not required # a node whose inputs are all provided should always
assert len(storage) == len(node.outputs) # return successfully
for s, output in zip(storage, node.outputs):
rval = []
for output in node.outputs:
assert compute_map[output][0], (output, storage_map[output][0])
try: try:
constant = output.type.Constant constant = output.type.Constant
except: except AttributeError:
constant = Constant constant = Constant
msg += [constant(output.type, s[0])] rval.append(constant(output.type, storage_map[output][0]))
return msg return rval
register_canonicalize(constant_folding, 'fast_compile') register_canonicalize(constant_folding, 'fast_compile')
register_stabilize(constant_folding) # because register_stabilize(constant_folding)
register_specialize(constant_folding) register_specialize(constant_folding)
def _is_1(expr): def _is_1(expr):
......
...@@ -20,7 +20,7 @@ def test_no_reuse(): ...@@ -20,7 +20,7 @@ def test_no_reuse():
return return
assert not 'should not get here' assert not 'should not get here'
def test_gc(): def test_gc_never_pickles_temporaries():
x = T.dvector() x = T.dvector()
#print >> sys.stderr, 'BUILDING GRAPH' #print >> sys.stderr, 'BUILDING GRAPH'
...@@ -32,32 +32,63 @@ def test_gc(): ...@@ -32,32 +32,63 @@ def test_gc():
optimizer=None optimizer=None
optimizer='fast_run' optimizer='fast_run'
for f_linker, g_linker in [ for f_linker, g_linker in [
(theano.PerformLinker(allow_gc = True), theano.PerformLinker(allow_gc=False)), (theano.PerformLinker(allow_gc = True), theano.PerformLinker(allow_gc=False)),
(theano.OpWiseCLinker(allow_gc = True), theano.OpWiseCLinker(allow_gc=False))]: (theano.OpWiseCLinker(allow_gc = True), theano.OpWiseCLinker(allow_gc=False))]:
#f_linker has garbage collection
#g_linker has no garbage collection
#print >> sys.stderr, 'COMPILING' #print >> sys.stderr, 'COMPILING'
f = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=f_linker)) f = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=f_linker))
g = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=g_linker))
len_pre_f = len(cPickle.dumps(f))
len_pre_g = len(cPickle.dumps(g))
# should be no difference at first
# In future, FunctionMaker might pickle linker-dependent stuff and make
# this assertion fail.
assert len_pre_f == len_pre_g
def a(fn):
return len(cPickle.dumps(fn.maker))
assert a(f) == a(f) # some sanity checks on the pickling mechanism
assert a(g) == a(g) # some sanity checks on the pickling mechanism
g = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=f_linker)) def b(fn):
return len(
cPickle.dumps(
theano.compile.function_module._pickle_Function(
fn)))
assert b(f) == b(f) # some sanity checks on the pickling mechanism
pre_f = cPickle.dumps(f) def c(fn):
pre_g = cPickle.dumps(g) return len(cPickle.dumps(fn))
assert c(f) == c(f) # some sanity checks on the pickling mechanism
assert c(g) == c(g) # some sanity checks on the pickling mechanism
#print >> sys.stderr, 'RUNNING'
# now run the function once to create temporaries within the no-gc
# linker
f(numpy.ones(100, dtype='float64')) f(numpy.ones(100, dtype='float64'))
g(numpy.ones(100, dtype='float64')) g(numpy.ones(100, dtype='float64'))
# serialize the functions again
post_f = cPickle.dumps(f) post_f = cPickle.dumps(f)
post_g = cPickle.dumps(g) post_g = cPickle.dumps(g)
#because allow_gc should leave the function un-changed by calling
assert len(pre_f) == len(post_f)
#because temporaries that weren't collected shouldn't be pickled anyway
len_post_f = len(post_f) len_post_f = len(post_f)
len_post_g = len(post_g) len_post_g = len(post_g)
assert len_post_f == len_post_g
#assert that f() didn't cause the function to grow
# allow_gc should leave the function un-changed by calling
assert len_pre_f == len_post_f
#assert that g() didn't cause g to grow
# because temporaries that weren't collected shouldn't be pickled anyway
assert len_post_f == len_post_g, (f_linker, len_post_f, len_post_g)
def test_merge_opt_runtime(): def test_merge_opt_runtime():
......
...@@ -49,11 +49,14 @@ class T_random_function(unittest.TestCase): ...@@ -49,11 +49,14 @@ class T_random_function(unittest.TestCase):
rng_R = random_state_type() rng_R = random_state_type()
# use make_node to override some of the self.args # use make_node to override some of the self.args
post_r2, out2 = rf2(rng_R, (4,), -2, 2) post_r2, out2 = rf2(rng_R, (4,), -2, 2) # NOT INPLACE
post_r2_4, out2_4 = rf2(rng_R, (4,), -4.0, 2) post_r4, out4 = rf4(rng_R, (4,), -4, 4) # INPLACE
post_r2_4_4, out2_4_4 = rf2(rng_R, (4,), -4.0, 4.0) post_r2_4, out2_4 = rf2(rng_R, (4,), -4.0, 2) # NOT INPLACE
post_r4, out4 = rf4(rng_R, (4,), -4, 4) post_r2_4_4, out2_4_4 = rf2(rng_R, (4,), -4.0, 4.0) # NOT INPLACE
# configure out4 to be computed inplace
# The update expression means that the random state rng_R will
# be maintained by post_r4
f = compile.function( f = compile.function(
[compile.In(rng_R, [compile.In(rng_R,
value=numpy.random.RandomState(utt.fetch_seed()), value=numpy.random.RandomState(utt.fetch_seed()),
...@@ -65,9 +68,25 @@ class T_random_function(unittest.TestCase): ...@@ -65,9 +68,25 @@ class T_random_function(unittest.TestCase):
f2, f4, f2_4, f2_4_4 = f() f2, f4, f2_4, f2_4_4 = f()
f2b, f4b, f2_4b, f2_4_4b = f() f2b, f4b, f2_4b, f2_4_4b = f()
assert numpy.allclose(f2*2, f4) print f2
assert numpy.allclose(f2_4_4, f4) print f4
assert not numpy.allclose(f4, f4b) print f2_4
print f2_4_4
#print f2b
#print f4b
#print f2_4b
#print f2_4_4b
# setting bounds is same as multiplying by 2
assert numpy.allclose(f2*2, f4), (f2, f4)
# retrieving from non-inplace generator
# is same as inplace one for first call
assert numpy.allclose(f2_4_4, f4), (f2_4_4, f4)
# f4 changes from call to call, that the update has worked
assert not numpy.allclose(f4, f4b), (f4, f4b)
def test_inplace_optimization(self): def test_inplace_optimization(self):
"""Test that FAST_RUN includes the random_make_inplace optimization""" """Test that FAST_RUN includes the random_make_inplace optimization"""
......
...@@ -13,19 +13,32 @@ from theano.tests import unittest_tools as utt ...@@ -13,19 +13,32 @@ from theano.tests import unittest_tools as utt
should ensure that it will remain operational should ensure that it will remain operational
''' '''
class T_diverse(unittest.TestCase): class T_scipy(unittest.TestCase):
def setUp(self): def setUp(self):
utt.seed_rng() utt.seed_rng()
self.orig_floatX = theano.config.floatX
def tearDown(self):
theano.config.floatX = self.orig_floatX
def scipy_paper_example1(self): def test_scipy_paper_example1(self):
a = theano.tensor.vector('a') # declare variable a = theano.tensor.vector('a') # declare variable
b = a + a**10 # build expression b = a + a**10 # build expression
f = theano.function([a], b) # compile function f = theano.function([a], b) # compile function
assert numpy.all(f([0,1,2]) == numpy.array([0,2,1026])) assert numpy.all(f([0,1,2]) == numpy.array([0,2,1026]))
def scipy_papaer_example2(self): def test_scipy_paper_example2(self):
''' This just sees if things compile well and if they run ''' ''' This just sees if things compile well and if they run '''
# PREAMPBLE
T = theano.tensor
shared = theano.shared
function = theano.function
rng = numpy.random
theano.config.floatX='float64'
#
# ACTUAL SCRIPT FROM PAPER
x = T.matrix() x = T.matrix()
y = T.vector() y = T.vector()
w = shared(rng.randn(100)) w = shared(rng.randn(100))
...@@ -52,6 +65,7 @@ class T_diverse(unittest.TestCase): ...@@ -52,6 +65,7 @@ class T_diverse(unittest.TestCase):
for i in range(training_steps): for i in range(training_steps):
pred, err = train(D[0], D[1]) pred, err = train(D[0], D[1])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论