提交 ca79f02e authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged -- no conflict

...@@ -35,3 +35,5 @@ theano/version.py ...@@ -35,3 +35,5 @@ theano/version.py
theano/version.py.out theano/version.py.out
distribute-*.egg distribute-*.egg
distribute-*.tar.gz distribute-*.tar.gz
out1
out2
...@@ -11,8 +11,6 @@ How should you write your algorithm to make the most of what Theano can do? ...@@ -11,8 +11,6 @@ How should you write your algorithm to make the most of what Theano can do?
Limitations Limitations
----------- -----------
- Conditional control flow is possible but currently not efficient. The current implementation will evaluate both sides of an ``if`` construct (see :func:`tensor.switch`).
- While- or for-Loops within an expression graph are supported, but only via - While- or for-Loops within an expression graph are supported, but only via
the :func:`theano.scan` op (which puts restrictions on how the loop body can the :func:`theano.scan` op (which puts restrictions on how the loop body can
interact with the rest of the graph). interact with the rest of the graph).
......
"""Provides `DebugMode`, an evaluation mode for debugging theano internals.""" """Provides `DebugMode`, an evaluation mode for debugging theano internals.
:TODO: add support for IfElse Op, LazyLinker, PureOp, etc.
"""
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
import time, copy, sys, copy_reg, gc, os import time, copy, sys, copy_reg, gc, os
...@@ -1552,7 +1556,8 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions ...@@ -1552,7 +1556,8 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions
def __init__(self, inputs, outputs, optimizer, mode, def __init__(self, inputs, outputs, optimizer, mode,
accept_inplace = False, accept_inplace = False,
function_builder = Function): function_builder = Function,
profile=None):
""" """
:type inputs: a list of SymbolicInput instances :type inputs: a list of SymbolicInput instances
...@@ -1567,7 +1572,7 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions ...@@ -1567,7 +1572,7 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions
:note: this function sets TensorType.filter_checks_isfinite when `mode.check_isfinite` is True :note: this function sets TensorType.filter_checks_isfinite when `mode.check_isfinite` is True
""" """
self.profile = profile
# Handle the case where inputs and/or outputs is a single Variable (not in a list) # Handle the case where inputs and/or outputs is a single Variable (not in a list)
unpack_single = False unpack_single = False
return_none = False return_none = False
......
...@@ -7,12 +7,13 @@ _logger = logging.getLogger('theano.compile.function') ...@@ -7,12 +7,13 @@ _logger = logging.getLogger('theano.compile.function')
from io import In from io import In
from function_module import orig_function from function_module import orig_function
from profiling import ProfileStats
from pfunc import pfunc from pfunc import pfunc
from numpy import any #for to work in python 2.4 from numpy import any #for to work in python 2.4
def function(inputs, outputs=None, mode=None, updates=[], givens=[], def function(inputs, outputs=None, mode=None, updates=[], givens=[],
no_default_updates=False, accept_inplace=False, name=None, no_default_updates=False, accept_inplace=False, name=None,
rebuild_strict=True, allow_input_downcast=None): rebuild_strict=True, allow_input_downcast=None, profile=None):
""" """
Return a callable object that will calculate `outputs` from `inputs`. Return a callable object that will calculate `outputs` from `inputs`.
...@@ -62,6 +63,11 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[], ...@@ -62,6 +63,11 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[],
precise, type. None (default) is almost like False, but allows precise, type. None (default) is almost like False, but allows
downcasting of Python float scalars to floatX. downcasting of Python float scalars to floatX.
:type profile: None, True, or ProfileStats instance
:param profile: accumulate profiling information into a given ProfileStats
instance. If argument is `True` then a new ProfileStats instance will be
used. This profiling object will be available via self.profile.
:note: Regarding givens: Be careful to make sure that these substitutions are :note: Regarding givens: Be careful to make sure that these substitutions are
independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in
another expression is undefined. Replacements specified with givens are different from another expression is undefined. Replacements specified with givens are different from
...@@ -88,6 +94,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[], ...@@ -88,6 +94,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[],
if uses_In or uses_tuple: if uses_In or uses_tuple:
# we must use old semantics in this case. # we must use old semantics in this case.
if profile:
raise NotImplementedError('profiling not supported in old-style function')
if uses_updates or uses_givens: if uses_updates or uses_givens:
raise NotImplementedError("In() instances and tuple inputs triggers the old semantics, which disallow using updates and givens") raise NotImplementedError("In() instances and tuple inputs triggers the old semantics, which disallow using updates and givens")
fn = orig_function(inputs, outputs, fn = orig_function(inputs, outputs,
...@@ -102,7 +110,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[], ...@@ -102,7 +110,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[],
no_default_updates=no_default_updates, no_default_updates=no_default_updates,
accept_inplace=accept_inplace,name=name, accept_inplace=accept_inplace,name=name,
rebuild_strict=rebuild_strict, rebuild_strict=rebuild_strict,
allow_input_downcast=allow_input_downcast) allow_input_downcast=allow_input_downcast,
profile=profile)
# We need to add the flag check_aliased inputs if we have any mutable or # We need to add the flag check_aliased inputs if we have any mutable or
# borrowed used defined inputs # borrowed used defined inputs
fn._check_for_aliased_inputs = check_for_aliased_inputs fn._check_for_aliased_inputs = check_for_aliased_inputs
......
...@@ -4,7 +4,9 @@ import os, logging ...@@ -4,7 +4,9 @@ import os, logging
import numpy, theano import numpy, theano
from theano import gof from theano import gof
from theano.configparser import config, AddConfigVar, StrParam import theano.gof.vm
from theano.configparser import config, AddConfigVar, StrParam, EnumStr
_logger = logging.getLogger('theano.compile.mode') _logger = logging.getLogger('theano.compile.mode')
...@@ -55,7 +57,11 @@ predefined_linkers = { ...@@ -55,7 +57,11 @@ predefined_linkers = {
'c' : gof.CLinker(), 'c' : gof.CLinker(),
'c|py' : gof.OpWiseCLinker(allow_gc=True), 'c|py' : gof.OpWiseCLinker(allow_gc=True),
'c|py_nogc' : gof.OpWiseCLinker(allow_gc=False), 'c|py_nogc' : gof.OpWiseCLinker(allow_gc=False),
'c&py' : gof.DualLinker(checker = check_equal) 'c&py' : gof.DualLinker(checker = check_equal),
'vm' : gof.vm.VM_Linker(allow_gc=True, use_cloop=False),
'cvm' : gof.vm.VM_Linker(allow_gc=True, use_cloop=True),
'vm_nogc' : gof.vm.VM_Linker(allow_gc=False, use_cloop=False),
'cvm_nogc': gof.vm.VM_Linker(allow_gc=False, use_cloop=True),
} }
...@@ -249,6 +255,7 @@ class Mode(object): ...@@ -249,6 +255,7 @@ class Mode(object):
self._optimizer = optimizer self._optimizer = optimizer
self.call_time = 0 self.call_time = 0
self.fn_time = 0 self.fn_time = 0
linker.mode = self #TODO: WHY IS THIS HERE?
self.optimizer_time = 0 self.optimizer_time = 0
self.linker_time = 0 self.linker_time = 0
...@@ -290,15 +297,27 @@ class Mode(object): ...@@ -290,15 +297,27 @@ class Mode(object):
FAST_COMPILE = Mode('py', 'fast_compile') FAST_COMPILE = Mode('py', 'fast_compile')
FAST_RUN = Mode('c|py', 'fast_run') FAST_RUN = Mode('c|py', 'fast_run')
FAST_RUN_NOGC = Mode("c|py_nogc", 'fast_run') FAST_RUN_NOGC = Mode("c|py_nogc", 'fast_run')
SANITY_CHECK = [Mode('c|py', None),
Mode('c|py', 'fast_run')]
STABILIZE = Mode("c|py", OPT_STABILIZE) STABILIZE = Mode("c|py", OPT_STABILIZE)
predefined_modes = {'FAST_COMPILE': FAST_COMPILE, predefined_modes = {'FAST_COMPILE': FAST_COMPILE,
'FAST_RUN': FAST_RUN, 'FAST_RUN': FAST_RUN,
'FAST_RUN_NOGC':FAST_RUN_NOGC, 'FAST_RUN_NOGC':FAST_RUN_NOGC,
'SANITY_CHECK': SANITY_CHECK, 'STABILIZE': STABILIZE,
'STABILIZE': STABILIZE} 'VM':Mode('vm', 'fast_run'),
'VM_NOGC':Mode('vm_nogc', 'fast_run'),
'CVM':Mode('cvm', 'fast_run'),
'CVM_NOGC':Mode('cvm_nogc', 'fast_run'),
}
#Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
#The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
#The old all capital letter way of working is deprecated as it is not scalable.
AddConfigVar('mode',
"Default compilation mode",
EnumStr(*(predefined_modes.keys() + [
'Mode','DEBUG_MODE', 'PROFILE_MODE'])),
in_c_key=False)
instanciated_default_mode=None instanciated_default_mode=None
def get_mode(orig_string): def get_mode(orig_string):
...@@ -329,7 +348,7 @@ def get_mode(orig_string): ...@@ -329,7 +348,7 @@ def get_mode(orig_string):
ret = DebugMode(optimizer=config.optimizer) ret = DebugMode(optimizer=config.optimizer)
else: else:
# The import is needed in case string is ProfileMode # The import is needed in case string is ProfileMode
from profilemode import ProfileMode from profilemode import ProfileMode,prof_mode_instance_to_print
ret = eval(string+'(linker=config.linker, optimizer=config.optimizer)') ret = eval(string+'(linker=config.linker, optimizer=config.optimizer)')
elif predefined_modes.has_key(string): elif predefined_modes.has_key(string):
ret = predefined_modes[string] ret = predefined_modes[string]
...@@ -349,7 +368,6 @@ def get_mode(orig_string): ...@@ -349,7 +368,6 @@ def get_mode(orig_string):
#must tell python to print the summary at the end. #must tell python to print the summary at the end.
if string == 'ProfileMode': if string == 'ProfileMode':
#need to import later to break circular dependency. #need to import later to break circular dependency.
from profilemode import prof_mode_instance_to_print
prof_mode_instance_to_print.append(ret) prof_mode_instance_to_print.append(ret)
return ret return ret
...@@ -365,3 +383,4 @@ def register_mode(name, mode): ...@@ -365,3 +383,4 @@ def register_mode(name, mode):
if name in predefined_modes: if name in predefined_modes:
raise ValueError('Mode name already taken: %s' % name) raise ValueError('Mode name already taken: %s' % name)
predefined_modes[name] = mode predefined_modes[name] = mode
"""Provide a simple user friendly API """ """Provide a simple user friendly API """
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import numpy # for backport to 2.4, to get any().
from profiling import ProfileStats
from theano.gof import Container, Variable, generic, graph, Constant, Value from theano.gof import Container, Variable, generic, graph, Constant, Value
from theano.compile import orig_function, In, Out from theano.compile import orig_function, In, Out
from theano.compile.sharedvalue import SharedVariable, shared from theano.compile.sharedvalue import SharedVariable, shared
import numpy # for backport to 2.4, to get any(). from theano import config
def rebuild_collect_shared( outputs def rebuild_collect_shared( outputs
, inputs = None , inputs = None
...@@ -292,7 +295,8 @@ class Param(object): ...@@ -292,7 +295,8 @@ class Param(object):
def pfunc(params, outputs=None, mode=None, updates=[], givens=[], def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
no_default_updates=False, accept_inplace=False, name=None, no_default_updates=False, accept_inplace=False, name=None,
rebuild_strict=True, allow_input_downcast=None): rebuild_strict=True, allow_input_downcast=None,
profile=None):
"""Function-constructor for graphs with shared variables. """Function-constructor for graphs with shared variables.
:type params: list of either Variable or Param instances. :type params: list of either Variable or Param instances.
...@@ -319,11 +323,9 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -319,11 +323,9 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
If False (default), perform them all. Else, perform automatic updates on all Variables If False (default), perform them all. Else, perform automatic updates on all Variables
that are neither in "updates" nor in "no_default_updates". that are neither in "updates" nor in "no_default_updates".
:param name: an optional name for this fct. If used, the profile mode will print the time spent in this fct. :type name: None or string
:param name: attaches a name to the Profiling result of this function when
:rtype: theano.compile.Function using ProfileMode (will be deprecated).
:returns: a callable object that will compute the outputs (given the inputs)
and update the implicit function arguments according to the `updates`.
:type allow_input_downcast: Boolean :type allow_input_downcast: Boolean
:param allow_input_downcast: True means that the values passed as :param allow_input_downcast: True means that the values passed as
...@@ -333,6 +335,21 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -333,6 +335,21 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
precise, type. None (default) is almost like False, but allows precise, type. None (default) is almost like False, but allows
downcasting of Python float scalars to floatX. downcasting of Python float scalars to floatX.
:type profile: None, True, str, or ProfileStats instance
:param profile: accumulate profiling information into a given ProfileStats
instance. None is the default, and means to use the value of
config.profile.
If argument is `True` then a new ProfileStats instance will be
used. If argument is a string, a new ProfileStats instance will be created
with that string as its `message` attribute. This profiling object will be
available via self.profile.
:rtype: theano.compile.Function
:returns: a callable object that will compute the outputs (given the inputs)
and update the implicit function arguments according to the `updates`.
:note: Regarding givens: Be careful to make sure that these substitutions are :note: Regarding givens: Be careful to make sure that these substitutions are
independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in
another expression is undefined. Replacements specified with givens are different from another expression is undefined. Replacements specified with givens are different from
...@@ -354,6 +371,17 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -354,6 +371,17 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
# Then it clones the outputs and the update expressions. This rebuilds a computation graph # Then it clones the outputs and the update expressions. This rebuilds a computation graph
# from the inputs and the givens. # from the inputs and the givens.
# #
if profile is None:
profile = config.profile
# profile -> True or False
if profile == True:
profile = ProfileStats(message=name)
# profile -> object
if type(profile) == str:
profile = ProfileStats(message=profile)
# profile is typically either False or an object at this point.
# No need to block other objects being passed through though. It might be
# useful.
if not isinstance(params,(list,tuple)): if not isinstance(params,(list,tuple)):
raise Exception("in pfunc() the first argument must be a list or a tuple") raise Exception("in pfunc() the first argument must be a list or a tuple")
...@@ -393,7 +421,7 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -393,7 +421,7 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
inputs.append(si) inputs.append(si)
return orig_function(inputs, cloned_outputs, mode, return orig_function(inputs, cloned_outputs, mode,
accept_inplace=accept_inplace, name=name) accept_inplace=accept_inplace, name=name, profile=profile)
def _pfunc_param_to_in(param, strict=False, allow_downcast=None): def _pfunc_param_to_in(param, strict=False, allow_downcast=None):
......
...@@ -8,6 +8,8 @@ from theano.configparser import config, AddConfigVar, IntParam, BoolParam ...@@ -8,6 +8,8 @@ from theano.configparser import config, AddConfigVar, IntParam, BoolParam
from theano.compile.function_module import FunctionMaker from theano.compile.function_module import FunctionMaker
run_cthunk = None # Will be imported only when needed. run_cthunk = None # Will be imported only when needed.
from profiling import ProfileStats
import_time = time.time() import_time = time.time()
AddConfigVar('ProfileMode.n_apply_to_print', AddConfigVar('ProfileMode.n_apply_to_print',
...@@ -34,24 +36,53 @@ AddConfigVar('ProfileMode.profile_memory', ...@@ -34,24 +36,53 @@ AddConfigVar('ProfileMode.profile_memory',
class Profile_Maker(FunctionMaker): class Profile_Maker(FunctionMaker):
def create(self, input_storage=None, trustme=False): def create(self, input_storage=None, trustme=False):
ret = super(Profile_Maker,self).create(input_storage, trustme) ret = super(Profile_Maker,self).create(input_storage, trustme)
# create a function-specific storage container for profiling info
profile = ProfileStats(atexit_print=False)
self.mode.profile_stats[ret] = profile
ret.profile = profile
#initialize the timers
for i, node in enumerate(ret.maker.env.toposort()): for i, node in enumerate(ret.maker.env.toposort()):
self.mode.apply_time[(i,node)]=0.0 profile.apply_time[node]=0.0
assert len(ret.fn.thunk_groups[i])==1 profile.outputs_size[node]=[0.0] * len(node.outputs)
self.mode.op_cimpl[node.op] = hasattr(ret.fn.thunk_groups[i][0],'cthunk')
# a thunk_group is a list of the thunks from each linker
# corresponding to the i'th position in the toposort.
assert len(ret.fn.thunk_groups[i])==1
profile.apply_cimpl[node] = hasattr(
ret.fn.thunk_groups[i][0],
'cthunk')
# Here we replace the linker function.
# This ugliness makes WrapLinker (an object that *generates*
# functions and is not function-specific) work with ProfileStats
# objects which are function-specific.
#capture old fn in closure. This is important since new_fn is about to
#take its place as ret.fn.
ret_fn = ret.fn
def new_fn():
self.mode.apply_time = self.mode.profile_stats[ret].apply_time
self.mode.outputs_size = self.mode.profile_stats[ret].outputs_size
ret_fn()
# delete the old apply_time variable
# because it doesn't mean the same thing anymore.
# This prevents old code from looking like it still works.
del self.mode.apply_time
del self.mode.outputs_size
ret.fn = new_fn
return ret return ret
class ProfileMode(Mode): class ProfileMode(Mode):
def __init__(self, linker=config.linker, optimizer=config.optimizer): def __init__(self, linker=config.linker, optimizer=config.optimizer):
apply_time = {}
op_cimpl = {}
compile_time = 0 #time passed in theano.function()
fct_call_time = {}#time passed inside theano fct call including op time.
fct_call = {}
message="" message=""
outputs_size={} profile_stats={}
self.__setstate__((linker, optimizer, apply_time, op_cimpl, self.__setstate__((linker,
compile_time, fct_call_time, fct_call, message, outputs_size)) optimizer,
message,
profile_stats))
def function_maker(self, i,o,m, *args, **kwargs): def function_maker(self, i,o,m, *args, **kwargs):
"""Return an instance of `Profiler_Maker` which init the count""" """Return an instance of `Profiler_Maker` which init the count"""
...@@ -59,28 +90,24 @@ class ProfileMode(Mode): ...@@ -59,28 +90,24 @@ class ProfileMode(Mode):
assert m is self assert m is self
return Profile_Maker(i, o, self, *args, **kwargs) return Profile_Maker(i, o, self, *args, **kwargs)
local_time = property(lambda self: [sum(self.apply_time.values())]) def __get_local_time(self):
rval = 0
for ps in self.profile_stats.values():
rval += sum(ps.apply_time.values())
return rval
local_time = property(__get_local_time)
def __getstate__(self): def __getstate__(self):
#print "__getstate__",self.provided_linker,self.provided_optimizer #print "__getstate__",self.provided_linker,self.provided_optimizer
return (self.provided_linker, self.provided_optimizer, self.apply_time, return (self.provided_linker,
self.op_cimpl, self.compile_time, self.fct_call_time, self.provided_optimizer,
self.fct_call, self.message, self.outputs_size) self.message,
self.profile_stats)
def __setstate__(self, state): def __setstate__(self, state):
linker, optimizer, apply_time, op_cimpl, compile_time, \ linker, optimizer, message, profile_stats = state
fct_call_time, fct_call, message, outputs_size = state self.message = message
self.apply_time = apply_time self.profile_stats = profile_stats
self.op_cimpl = op_cimpl
self.compile_time = compile_time
self.fct_call_time = fct_call_time
self.fct_call = fct_call
self.call_time = 0
self.fn_time = 0
self.optimizer_time = 0
self.linker_time = 0
self.message = ""
self.outputs_size = outputs_size
def profile_thunk(i, node, th): def profile_thunk(i, node, th):
""" Profile only the execution time """ Profile only the execution time
...@@ -102,7 +129,7 @@ class ProfileMode(Mode): ...@@ -102,7 +129,7 @@ class ProfileMode(Mode):
th() th()
dt = time.time() - t0 dt = time.time() - t0
apply_time[(i,node)] += dt self.apply_time[node] += max(dt, 1e-14)
def profile_thunk2(i, node, th): def profile_thunk2(i, node, th):
...@@ -149,8 +176,8 @@ class ProfileMode(Mode): ...@@ -149,8 +176,8 @@ class ProfileMode(Mode):
else: else:
raise Exception("Can't determine the memory size of dtype",o[0].dtype) raise Exception("Can't determine the memory size of dtype",o[0].dtype)
size.append(s) size.append(s)
outputs_size[node]=size self.outputs_size[node]=size
apply_time[(i,node)] += dt self.apply_time[node] += max(dt, 1e-14)
self.provided_linker = linker self.provided_linker = linker
...@@ -182,22 +209,44 @@ class ProfileMode(Mode): ...@@ -182,22 +209,44 @@ class ProfileMode(Mode):
Currently there is n_apply_to_print, n_ops_to_print and min_memory_size Currently there is n_apply_to_print, n_ops_to_print and min_memory_size
that are accepted. that are accepted.
""" """
compile_time = sum([ps.compile_time for ps in self.profile_stats.values()])
fct_call = dict([(fn, ps.fct_callcount)
for (fn, ps) in self.profile_stats.items()])
fct_call_time = dict([(fn, ps.fct_call_time)
for (fn, ps) in self.profile_stats.items()])
apply_time = {}
for fn, ps in self.profile_stats.items():
for (i, node) in enumerate(fn.maker.env.toposort()):
apply_time[(i, node)] = ps.apply_time[node]
for (i,n),t in apply_time.items():
if t == 0:
print i, n
op_cimpl = {}
outputs_size = {}
for fn, ps in self.profile_stats.items():
op_cimpl.update(ps.apply_cimpl)
compile_time = self.compile_time
fct_call_time = self.fct_call_time
fct_call = self.fct_call
apply_time = self.apply_time
op_cimpl = self.op_cimpl
message = self.message message = self.message
outputs_size = self.outputs_size
other_time = {'linker_time':self.linker_time, outputs_size = {}
'optimizer_time':self.optimizer_time} for fn, ps in self.profile_stats.items():
outputs_size.update(ps.outputs_size)
other_time = dict(
linker_time = sum(
[ps.linker_time for ps in self.profile_stats.values()]),
optimizer_time = sum(
[ps.optimizer_time for ps in self.profile_stats.values()]))
self.print_summary_("print_summary", compile_time, fct_call_time, fct_call, self.print_summary_("print_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size, other_time, apply_time, op_cimpl, message, outputs_size,
self.local_time, other_time,
**kwargs) **kwargs)
def print_diff_summary(self, other, **kwargs): def print_diff_summary(self, other, **kwargs):
""" As print_summary, but print the difference on two different profile mode. """ As print_summary, but print the difference on two different profile mode.
TODO: Also we don't print the Apply-wise summary as it don't work for now. TODO: Also we don't print the Apply-wise summary as it don't work for now.
...@@ -240,7 +289,7 @@ class ProfileMode(Mode): ...@@ -240,7 +289,7 @@ class ProfileMode(Mode):
@staticmethod @staticmethod
def print_summary_(fct_name, compile_time, fct_call_time, fct_call, def print_summary_(fct_name, compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size, apply_time, op_cimpl, message, outputs_size,
other_time, local_time, other_time,
n_apply_to_print=config.ProfileMode.n_apply_to_print, n_apply_to_print=config.ProfileMode.n_apply_to_print,
n_ops_to_print=config.ProfileMode.n_ops_to_print, n_ops_to_print=config.ProfileMode.n_ops_to_print,
print_apply=True, print_apply=True,
...@@ -256,7 +305,6 @@ class ProfileMode(Mode): ...@@ -256,7 +305,6 @@ class ProfileMode(Mode):
whose outputs memory size is lower then that. whose outputs memory size is lower then that.
""" """
local_time = sum(apply_time.values())
total_time = time.time() - import_time total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values()) total_fct_time = sum(fct_call_time.values())
total_fct_call = sum(fct_call.values()) total_fct_call = sum(fct_call.values())
...@@ -312,7 +360,7 @@ class ProfileMode(Mode): ...@@ -312,7 +360,7 @@ class ProfileMode(Mode):
op_time[op]+=t op_time[op]+=t
nb_call = [v for k,v in fct_call.items() if k.maker.env is a.env][0] nb_call = [v for k,v in fct_call.items() if k.maker.env is a.env][0]
if t==0: if t==0:
assert nb_call == 0 assert nb_call == 0, nb_call
else: else:
op_call[op] += nb_call op_call[op] += nb_call
op_apply[op] += 1 op_apply[op] += 1
...@@ -429,8 +477,8 @@ class ProfileMode(Mode): ...@@ -429,8 +477,8 @@ class ProfileMode(Mode):
else: else:
fct_memory={}#env->dict(node->(outputs size)) fct_memory={}#env->dict(node->(outputs size))
var_mem = {} var_mem = {}
for node,val in outputs_size.items(): for node, val in outputs_size.items():
fct_memory.setdefault(node.env,{}) fct_memory.setdefault(node.env, {})
fct_memory[node.env][node]=val fct_memory[node.env][node]=val
for out,v in zip(node.outputs,val): for out,v in zip(node.outputs,val):
var_mem[out]=v var_mem[out]=v
...@@ -600,7 +648,7 @@ def atexit_print_default_profile_mode(): ...@@ -600,7 +648,7 @@ def atexit_print_default_profile_mode():
config.mode=PROFILE_MODE config.mode=PROFILE_MODE
""" """
for prof_mode in prof_mode_instance_to_print: for prof_mode in prof_mode_instance_to_print:
if sum(prof_mode.apply_time.values())>0: if prof_mode.local_time>0:
prof_mode.print_summary() prof_mode.print_summary()
#Register atexit_print_default_profile_mode to have the summary of the #Register atexit_print_default_profile_mode to have the summary of the
......
差异被折叠。
"""
Test compilation modes
"""
from nose.plugins.skip import SkipTest
import unittest
import theano
import numpy
import random
import numpy.random
from theano.tests import unittest_tools as utt
import theano.tensor as T
class T_bunch_of_modes(unittest.TestCase):
def test1(self):
# this is a quick test after the LazyLinker branch merge
# to check that all the current modes can still be used.
linker_classes_involved = []
for modename in theano.config.__class__.__dict__['mode'].all:
x = T.matrix()
y = T.vector()
f = theano.function([x,y], x+y, mode=modename)
# test that it runs something
f([[1,2],[3,4]], [5, 6])
linker_classes_involved.append(f.maker.mode.linker.__class__)
print 'MODE:', modename, f.maker.mode.linker, 'stop'
# regression check:
# there should be
# - VM_Linker
# - OpWiseCLinker (FAST_RUN)
# - WrapLinker (PROFILE_MODE)
# - PerformLinker (FAST_COMPILE)
# - DebugMode's Linker (DEBUG_MODE)
assert 5 == len(set(linker_classes_involved))
if __name__ == '__main__':
unittest.main()
...@@ -65,15 +65,6 @@ AddConfigVar('force_device', ...@@ -65,15 +65,6 @@ AddConfigVar('force_device',
BoolParam(False, allow_override=False), BoolParam(False, allow_override=False),
in_c_key=False) in_c_key=False)
#Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
#The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
#The old all capital letter way of working is deprecated as it is not scalable.
AddConfigVar('mode',
"Default compilation mode",
EnumStr('Mode', 'ProfileMode', 'DebugMode', 'FAST_RUN',
'FAST_COMPILE', 'PROFILE_MODE', 'DEBUG_MODE'),
in_c_key=False)
# Test whether or not gcc is present: disable C code if it is not. # Test whether or not gcc is present: disable C code if it is not.
# Using the dummy file descriptor below is a workaround for a crash experienced # Using the dummy file descriptor below is a workaround for a crash experienced
# in an unusual Python 2.4.4 Windows environment with the default stdin=None. # in an unusual Python 2.4.4 Windows environment with the default stdin=None.
...@@ -84,13 +75,15 @@ try: ...@@ -84,13 +75,15 @@ try:
# Keep the default linker the same as the one for the mode FAST_RUN # Keep the default linker the same as the one for the mode FAST_RUN
AddConfigVar('linker', AddConfigVar('linker',
"Default linker used if the theano flags mode is Mode or ProfileMode", "Default linker used if the theano flags mode is Mode or ProfileMode",
EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py'), EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py',
'vm', 'cvm', 'vm_nogc', 'cvm_nogc'),
in_c_key=False) in_c_key=False)
except OSError: except OSError:
# gcc is not present, linker should default to python only # gcc is not present, linker should default to python only
AddConfigVar('linker', AddConfigVar('linker',
"Default linker used if the theano flags mode is Mode or ProfileMode", "Default linker used if the theano flags mode is Mode or ProfileMode",
EnumStr('py', 'c|py', 'c', 'c|py_nogc', 'c&py'), EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py',
'vm', 'cvm', 'vm_nogc', 'cvm_nogc'),
in_c_key=False) in_c_key=False)
warning('GCC not detected ! Theano will be unable to execute optimized '+ warning('GCC not detected ! Theano will be unable to execute optimized '+
'C-implementations (for both CPU and GPU) and will default to '+ 'C-implementations (for both CPU and GPU) and will default to '+
...@@ -145,10 +138,6 @@ AddConfigVar('op.set_flops', ...@@ -145,10 +138,6 @@ AddConfigVar('op.set_flops',
BoolParam(False), BoolParam(False),
in_c_key=False) in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False))
AddConfigVar('gpuelemwise.sync', AddConfigVar('gpuelemwise.sync',
"when true, wait that the gpu fct finished and check it error code.", "when true, wait that the gpu fct finished and check it error code.",
BoolParam(True)) BoolParam(True))
......
...@@ -146,7 +146,7 @@ from link import \ ...@@ -146,7 +146,7 @@ from link import \
Container, Linker, LocalLinker, PerformLinker, WrapLinker, WrapLinkerMany Container, Linker, LocalLinker, PerformLinker, WrapLinker, WrapLinkerMany
from op import \ from op import \
Op Op, PureOp
from opt import (Optimizer, optimizer, SeqOptimizer, from opt import (Optimizer, optimizer, SeqOptimizer,
MergeOptimizer, MergeOptMerge, MergeOptimizer, MergeOptMerge,
......
...@@ -1312,6 +1312,7 @@ def gcc_module_compile_str(module_name, src_code, location=None, include_dirs=[] ...@@ -1312,6 +1312,7 @@ def gcc_module_compile_str(module_name, src_code, location=None, include_dirs=[]
#DSE Patch 1 for supporting OSX frameworks; add -framework Python #DSE Patch 1 for supporting OSX frameworks; add -framework Python
if sys.platform=='darwin' : if sys.platform=='darwin' :
preargs.extend(['-undefined','dynamic_lookup']) preargs.extend(['-undefined','dynamic_lookup'])
python_inc = distutils.sysconfig.get_python_inc()
# link with the framework library *if specifically requested* # link with the framework library *if specifically requested*
# config.mac_framework_link is by default False, since on some mac # config.mac_framework_link is by default False, since on some mac
# installs linking with -framework causes a Bus Error # installs linking with -framework causes a Bus Error
......
...@@ -311,6 +311,9 @@ class Env(utils.object2): ...@@ -311,6 +311,9 @@ class Env(utils.object2):
self.__import_r__([new_r]) self.__import_r__([new_r])
self.__add_clients__(new_r, [(node, i)]) self.__add_clients__(new_r, [(node, i)])
prune = self.__remove_clients__(r, [(node, i)], False) prune = self.__remove_clients__(r, [(node, i)], False)
# Precondition: the substitution is semantically valid
# However it may introduce cycles to the graph, in which case the
# transaction will be reverted later.
self.execute_callbacks('on_change_input', node, i, r, new_r, reason=reason) self.execute_callbacks('on_change_input', node, i, r, new_r, reason=reason)
if prune: if prune:
...@@ -438,16 +441,32 @@ class Env(utils.object2): ...@@ -438,16 +441,32 @@ class Env(utils.object2):
if len(self.nodes) < 2: if len(self.nodes) < 2:
# optimization # optimization
# when there are 0 or 1 nodes, no sorting is necessary # when there are 0 or 1 nodes, no sorting is necessary
# This special case happens a lot because the OpWiseCLinker produces
# 1-element graphs.
return list(self.nodes) return list(self.nodes)
env = self env = self
ords = {} ords = self.orderings()
for feature in env._features:
if hasattr(feature, 'orderings'):
for op, prereqs in feature.orderings(env).items():
ords.setdefault(op, []).extend(prereqs)
order = graph.io_toposort(env.inputs, env.outputs, ords) order = graph.io_toposort(env.inputs, env.outputs, ords)
return order return order
    def orderings(self):
        """
        Return dict d s.t. d[node] is a list of nodes that must be evaluated
        before node itself can be evaluated.

        This is used primarily by the destroy_handler feature to ensure that
        all clients of any destroyed inputs have already computed their
        outputs.
        """
        ords = {}
        # Merge the ordering constraints contributed by every feature that
        # implements an orderings() hook (e.g. the destroy handler).
        for feature in self._features:
            if hasattr(feature, 'orderings'):
                for node, prereqs in feature.orderings(self).items():
                    ords.setdefault(node, []).extend(prereqs)
        # eliminate duplicate prereqs
        for (node,prereqs) in ords.items():
            ords[node] = list(set(prereqs))
        return ords
def nclients(self, r): def nclients(self, r):
"""WRITEME Same as len(self.clients(r)).""" """WRITEME Same as len(self.clients(r))."""
return len(self.clients(r)) return len(self.clients(r))
......
差异被折叠。
# Build-and-load script for the lazy linker's C extension ('lazylinker_ext').
# Compiles theano/gof/lazylinker_c.c into the per-user compiledir, then
# re-exports the resulting module's symbols into this namespace.
import os
import theano
from theano import config
from theano.gof.compilelock import get_lock, release_lock
from theano.gof import cmodule

# Hold the compilation lock so that concurrent theano processes do not
# compile into the shared compiledir at the same time.
get_lock()
try:
    dirname = 'lazylinker_ext'
    # The C source ships inside the installed theano package.
    cfile = os.path.join(theano.__path__[0], 'gof', 'lazylinker_c.c')
    code = open(cfile).read()
    loc = os.path.join(config.compiledir, dirname)
    if not os.path.exists(loc):
        os.mkdir(loc)
    cmodule.gcc_module_compile_str(dirname, code, location=loc)
    # Import everything from the freshly compiled module.
    from lazylinker_ext.lazylinker_ext import *
finally:
    # Release lock on compilation directory.
    release_lock()
...@@ -3,18 +3,21 @@ ...@@ -3,18 +3,21 @@
The `Op` class is the base interface for all operations The `Op` class is the base interface for all operations
compatible with `gof`'s :doc:`graph` routines. compatible with `gof`'s :doc:`graph` routines.
""" """
__authors__ = "theano-dev"
__copyright__ = "(c) 2010, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
import logging
from theano import config from theano import config
import graph import graph
import numpy import numpy
import utils import utils
import warnings import warnings
import logging
from theano import config
from env import Env from env import Env
import graph
import cc import cc
......
from copy import deepcopy
import numpy
from theano.gof.op import PureOp
from theano.gof import Apply, generic, Container
from theano.gof.link import LocalLinker, map_storage, add_clear_storage
from theano import function, Mode
from theano.lazycond import ifelse
import theano.tensor as T
class IfElseIfElseIf(PureOp):
    """Lazy four-way conditional: if c1: t1 elif c2: t2 elif c3: t3 else: f3.

    Test Op exercising the lazy-thunk protocol of the VM linkers: the thunk
    returns the indices of the inputs it still needs, so only the branch that
    is actually selected ever gets computed.
    """

    def __init__(self, inplace=False):
        # check destroyhandler and others to ensure that a view_map with
        # multiple inputs can work
        self.inplace=inplace
        # Only the copying (non-inplace) path is implemented.
        assert not self.inplace

    def make_node(self, c1, t1, c2,t2,c3,t3,f3):
        # All branch values must share one type; it becomes the output type.
        assert t1.type == f3.type
        assert t2.type == t3.type
        assert t3.type == f3.type
        return Apply(self, [c1,t1,c2,t2,c3,t3,f3], [t1.type()])

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        # One shared [flag] / [value] cell per input and output; the VM
        # mutates these same cells.
        input_computed = [compute_map[v] for v in node.inputs]
        output_computed = [compute_map[v] for v in node.outputs]
        input_registers = [storage_map[v] for v in node.inputs]
        output_registers = [storage_map[v] for v in node.outputs]
        outtype = node.outputs[0].type

        def thunk():
            # Lazy protocol: returning [i] asks the VM to compute input i and
            # call this thunk again; returning [] means the output is ready.
            if not input_computed[0][0]:
                return [0]
            else:
                truthval = input_registers[0][0]
                if truthval:
                    if not input_computed[1][0]:
                        return [1]
                    else:
                        output_computed[0][0]=1
                        # deepcopy: output must not alias the branch storage
                        output_registers[0][0]=outtype.filter(deepcopy(input_registers[1][0]))
                        return []
                else:
                    if not input_computed[2][0]:
                        return [2]
                    else:
                        truthval = input_registers[2][0]
                        if truthval:
                            if not input_computed[3][0]:
                                return [3]
                            else:
                                output_computed[0][0] = 1
                                output_registers[0][0] = outtype.filter(deepcopy(input_registers[3][0]))
                                return []
                        else:
                            if not input_computed[4][0]:
                                return [4]
                            else:
                                truthval = input_registers[4][0]
                                if truthval:
                                    if not input_computed[5][0]:
                                        return [5]
                                    else:
                                        output_computed[0][0] = 1
                                        output_registers[0][0] = outtype.filter(deepcopy(input_registers[5][0]))
                                        return []
                                else:
                                    if not input_computed[6][0]:
                                        return [6]
                                    else:
                                        output_computed[0][0] = 1
                                        output_registers[0][0] = outtype.filter(deepcopy(input_registers[6][0]))
                                        return []
        thunk.lazy = True
        return thunk
class NotImplementedOp(PureOp):
    """Identity-typed Op whose thunk always raises ``NotImplementedOp.E``.

    Used in tests to mark graph branches that must never be evaluated:
    if lazy evaluation works, the exception is never raised.
    """

    class E(Exception):
        """Raised whenever the thunk of this Op is actually executed."""
        pass

    def make_node(self, x):
        # The single output has exactly the type of the single input.
        out = x.type()
        return Apply(self, [x], [out])

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        def raising_thunk():
            raise self.E()
        # Not lazy: the VM runs it eagerly once its inputs are ready.
        raising_thunk.lazy = False
        return raising_thunk
def test_ifelse():
    """ifelse must evaluate only the selected branch: the NotImplementedOp
    branch raises if it is ever computed."""
    a = generic()
    b = generic()
    c = generic()
    notimpl = NotImplementedOp()
    # The 'vm' linker is required: ifelse only works with lazy-capable linkers.
    f = function([a,b,c], ifelse(a, notimpl(b), c),
            mode=Mode(linker='vm', optimizer='fast_run'))
    try:
        # case 1: condition True selects the NotImplementedOp branch -> raise
        print "case 1"
        f( True, 'a', 'b')
        assert False
    except NotImplementedOp.E:
        pass
    print "... passed"
    # case 2: condition False selects the plain branch; the NotImplementedOp
    # branch must be skipped entirely.
    print "case 2"
    print f( False, 'a', 'b')
    assert f( False, 'a', 'b') == 'b'
    print "... passed"
def more_complex_test():
    """Nested lazy conditionals (ifelse fed into IfElseIfElseIf) must compute
    only the selected branches; NotImplementedOp guards a dead branch."""
    notimpl = NotImplementedOp()
    ifelseifelseif = IfElseIfElseIf()
    x1 = T.scalar('x1')
    x2 = T.scalar('x2')
    c1 = generic('c1')
    c2 = generic('c2')
    t1 = ifelse(c1,x1,notimpl(x2))
    t1.name = 't1'
    t2 = t1*10
    t2.name = 't2'
    t3 = ifelse(c2,t2, x1+t1)
    t3.name = 't3'
    t4 = ifelseifelseif(T.eq(x1,x2), x1, T.eq(x1,5), x2, c2, t3, t3+0.5)
    t4.name = 't4'
    # 'vm' linker required for lazy evaluation of the conditionals.
    f = function([c1,c2,x1,x2], t4, mode=Mode(linker='vm', optimizer='fast_run'))
    # c1=1, c2=0, x1=10, x2=0: hand-checked expected value is 20.5.
    print f(1, 0, numpy.array(10,dtype=x1.dtype),0)
    assert f(1,0,numpy.array(10,dtype=x1.dtype),0) == 20.5
    print '... passed'


if __name__ == '__main__':
    more_complex_test()
import gc
import sys
import time
try:
import line_profiler
except ImportError:
pass
import numpy
from theano import function
from theano.gof import vm,link, OpWiseCLinker
from theano.compile import Mode
from theano import tensor
from theano.lazycond import ifelse
import theano
def test_speed():
    """Benchmark (not a correctness test): time a chain of additions under
    several linkers and against a plain numpy loop, printing s/Kop figures."""
    def build_graph(x, depth=5):
        # Symbolic chain of `depth` additions.
        z = x
        for d in range(depth):
            z = (z + z)
        return z

    def numpy_version(x, depth):
        # The same computation done eagerly in numpy, as a baseline.
        z = x
        for d in xrange(depth):
            z = (z+z)
        return z

    def time_numpy():
        steps_a = 5
        steps_b = 100
        x = numpy.asarray([2.0, 3.0], dtype=theano.config.floatX)
        numpy_version(x, steps_a)   # warm-up call, excluded from the timing
        t0 = time.time()
        print numpy_version(x, steps_a)
        t1 = time.time()
        t2 = time.time()
        print numpy_version(x, steps_b)
        t3 = time.time()
        t_a = t1 - t0
        t_b = t3 - t2
        # Per-op cost estimated from the size difference of the two runs.
        print "%s takes %f s/Kop" % (
                'numpy',
                (1000*(t_b-t_a) / (steps_b - steps_a)))

    def time_linker(name, linker):
        steps_a = 5
        steps_b = 100
        x = tensor.vector()
        a = build_graph(x,steps_a)
        b = build_graph(x,steps_b)
        f_a = function([x], a,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_a speed test %s'%name,
                )
        f_b = function([x], b,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_b speed test %s'%name,
                )
        # First calls are warm-up; only the second call of each fn is timed.
        print f_a([2.0, 3.0])
        t0 = time.time()
        print f_a([2.0, 3.0])
        t1 = time.time()
        print f_b([2.0, 3.0])
        t2 = time.time()
        print f_b([2.0, 3.0])
        t3 = time.time()
        t_a = t1 - t0
        t_b = t3 - t2
        print "%s takes %f s/Kop" % (
                name,
                (1000*(t_b-t_a) / (steps_b - steps_a)))

    time_linker('c|py', OpWiseCLinker)
    time_linker('vmLinker', vm.VM_Linker)
    time_linker('vmLinker_nogc', lambda : vm.VM_Linker(allow_gc=False))
    time_linker('vmLinker_CLOOP', lambda : vm.VM_Linker(allow_gc=False,
        use_cloop=True))
    time_numpy()
def test_speed_lazy():
    """Benchmark (not a correctness test): time a chain of lazy ifelse ops
    under the different VM linkers, printing s/Kop figures."""
    def build_graph(x, depth=5):
        # Chain of `depth` lazy conditionals; only one branch per level runs.
        z = x
        for d in range(depth):
            z = ifelse(z> 0, -z, z)
        return z

    def time_linker(name, linker):
        steps_a = 10
        steps_b = 100
        x = tensor.vector()
        a = build_graph(x, steps_a)
        b = build_graph(x, steps_b)
        f_a = function([x], a,
                mode=Mode(optimizer=None,
                    linker=linker()),
                #profile='f_a lazy ifelse %s'%name,
                )
        f_b = function([x], b,
                mode=Mode(optimizer=None,
                    linker=linker()),
                #profile='f_b lazy ifelse %s'%name,
                )
        # First calls are warm-up; only the second call of each fn is timed.
        print f_a([2.0])
        t0 = time.time()
        print f_a([2.0])
        t1 = time.time()
        print f_b([2.0])
        t2 = time.time()
        print f_b([2.0])
        t3 = time.time()
        t_a = t1 - t0
        t_b = t3 - t2
        # Per-op cost estimated from the size difference of the two graphs.
        print "%s takes %f s/Kop" % (
                name,
                (1000*(t_b-t_a) / (steps_b - steps_a)))

    time_linker('vmLinker', vm.VM_Linker)
    time_linker('vmLinker_nogc', lambda : vm.VM_Linker(allow_gc=False))
    time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False,
        use_cloop=True))
# Manual memory-stress tests, disabled by default; flip this flag and watch
# the process RSS (e.g. in `top`) to look for leaks.
run_memory_usage_tests = False
if run_memory_usage_tests:
    # these are not normal unit tests, do not run them as part of standard
    # suite. I ran them while looking at top, and stopped when memory usage
    # was stable.
    def test_leak2():
        # Repeatedly wrap a numpy array in a CudaNdarray and assert that the
        # wrapper leaves the source array's refcount unchanged.
        import theano.sandbox.cuda as cuda
        for i in xrange(1000000):
            n = numpy.asarray([2.3, 4.5], dtype='f')
            c = sys.getrefcount(n)
            a = cuda.CudaNdarray(n)
            assert c == sys.getrefcount(n)
            if not i % 1000:
                # Progress dot plus collected-object counts.
                print '.',
                print gc.collect(),
                print gc.collect()
            sys.stdout.flush()

    def test_no_leak_many_graphs():
        # Verify no memory leaks when creating and deleting a lot of
        # functions.  This isn't really a unit test: you have to run it and
        # look at top to see if there's a leak.
        for i in xrange(10000):
            x = tensor.vector()
            z = x
            for d in range(10):
                z = tensor.sin(-z+ 1)
            f = function([x], z, mode=Mode(optimizer=None, linker='cvm'))
            if not i % 100:
                print gc.collect()
                sys.stdout.flush()
            gc.collect()
            if 1:
                # Also exercise each compiled function a few times.
                f([2.0])
                f([3.0])
                f([4.0])
                f([5.0])

    def test_no_leak_many_call_lazy():
        # Verify no memory leaks when calling a function a lot of times.
        # This isn't really a unit test: you have to run it and look at top
        # to see if there's a leak.
        def build_graph(x, depth=5):
            z = x
            for d in range(depth):
                z = ifelse(z> 0, -z, z)
            return z

        def time_linker(name, linker):
            steps_a = 10
            x = tensor.vector()
            a = build_graph(x, steps_a)
            f_a = function([x], a,
                    mode=Mode(optimizer=None,
                        linker=linker()))
            for i in xrange(100000):
                f_a([2.0])
            if 0: # this doesn't seem to work, prints 0 for everything
                import resource
                pre = resource.getrusage(resource.RUSAGE_SELF)
                post = resource.getrusage(resource.RUSAGE_SELF)
                print pre.ru_ixrss, post.ru_ixrss
                print pre.ru_idrss, post.ru_idrss
                print pre.ru_maxrss, post.ru_maxrss
        time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False, use_cloop=True))

    def test_no_leak_many_call_nonlazy():
        # Verify no memory leaks when calling a function a lot of times.
        # This isn't really a unit test: you have to run it and look at top
        # to see if there's a leak.
        def build_graph(x, depth=5):
            z = x
            for d in range(depth):
                z = tensor.sin(-z+1)
            return z

        def time_linker(name, linker):
            steps_a = 10
            x = tensor.vector()
            a = build_graph(x,steps_a)
            f_a = function([x], a,
                    mode=Mode(optimizer=None,
                        linker=linker()))
            for i in xrange(500000):
                f_a([2.0])
        time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False, use_cloop=True))
差异被折叠。
"""
IfElse is an Op that works with the LazyLinker to support conditional graph evaluation.
:TODO: Add text to library documentation describing the IfElse Op.
"""
from copy import deepcopy
from theano.gof import PureOp, Apply, generic, Container
import theano.tensor
import gof
from compile import optdb
from tensor import opt
@gof.local_optimizer([None])
def ifelse_make_inplace(node):
    """Graph optimization: replace a copying IfElse with its view (as_view)
    variant, which aliases the true-branch storage instead of deep-copying.

    Returns the replacement outputs, or False when the node is not a
    copying IfElse.
    """
    op = node.op
    if isinstance(op, IfElse) and not op.as_view:
        # Removed stray debug print that polluted stdout each time this
        # optimization fired.
        return IfElse(as_view=True,
                gpu=op.gpu, name=op.name).make_node(*node.inputs).outputs
    return False

# Run late (position 95) so it only fires after the fast_run rewrites,
# like other inplace optimizations.
optdb.register('ifelse_make_inplace', opt.in2out(ifelse_make_inplace,
    ignore_newtrees=True), 95, 'fast_run', 'inplace')
class IfElse(PureOp):
    """
    Op that works with LazyLinker to support conditional graph evaluation.

    Example usage:

        ``rval = ifelse(tf, rval_if_true, rval_if_false)``

    :param as_view: when True, the output is a view of the true branch
        (``view_map = {0: [1]}``); the false branch is always deep-copied.
    :param gpu: marker consulted by the GPU optimizations (which build
        ``IfElse(gpu=True)`` replacements); not used by the host thunk here.
    :param name: optional name, used e.g. by printing/highlighting tools.

    :note:
        Other Linkers (ALL other linkers right now) are INCOMPATIBLE with this
        Op, they will produce functions that FAIL TO EXECUTE.
    """
    def __init__(self, as_view=False, gpu = False, name = None):
        if as_view:
            # check destroyhandler and others to ensure that a view_map with
            # multiple inputs can work
            view_map = {}
            view_map[0] = [1]   # output 0 may alias input 1 (true branch)
            self.view_map = view_map
            #raise NotImplementedError('IfElse must copy for now')
        self.as_view=as_view
        self.gpu = gpu
        self.name = name

    def make_node(self, c, t, f):
        # Both branches must have the identical type; it becomes the output
        # type.
        if t.type != f.type:
            raise TypeError(
                    'IfElse requires same types for true and false args',
                    (t.type, f.type))
        return Apply(self, [c,t,f], [t.type()])

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        outtype = node.outputs[0].type
        c,t,f = node.inputs
        output = node.outputs[0]

        def thunk():
            # Lazy protocol: return the indices of inputs that must still be
            # computed before this thunk can finish; [] means "done".
            if not compute_map[c][0]:
                return [0]
            else:
                truthval = storage_map[c][0]
                if truthval:
                    if not compute_map[t][0]:
                        return [1]
                    else:
                        compute_map[output][0]=1
                        if self.as_view:
                            # view: alias the true branch's storage directly
                            oval = outtype.filter(storage_map[t][0])
                        else:
                            oval = outtype.filter(
                                    deepcopy(storage_map[t][0]))
                        storage_map[output][0] = oval
                        return []
                else:
                    if not compute_map[f][0]:
                        return [2]
                    else:
                        # can't view both outputs unless destroyhandler
                        # improves
                        compute_map[output][0]=1
                        oval = outtype.filter(
                                deepcopy(storage_map[f][0]))
                        storage_map[output][0]=oval
                        return []
        thunk.lazy = True
        thunk.inputs = [storage_map[v] for v in node.inputs]
        thunk.outputs = [storage_map[v] for v in node.outputs]
        return thunk

# Default, copying (non-view) instance used to build conditionals.
ifelse = IfElse()
...@@ -391,7 +391,7 @@ default_colorCodes = {'GpuFromHost' : 'red', ...@@ -391,7 +391,7 @@ default_colorCodes = {'GpuFromHost' : 'red',
'HostFromGpu' : 'red', 'HostFromGpu' : 'red',
'Scan' : 'yellow', 'Scan' : 'yellow',
'Shape' : 'cyan', 'Shape' : 'cyan',
'Cond' : 'magenta', 'IfElse' : 'magenta',
'Elemwise': '#FFAABB', 'Elemwise': '#FFAABB',
'Subtensor': '#FFAAFF'} 'Subtensor': '#FFAAFF'}
...@@ -473,10 +473,10 @@ def pydotprint(fct, outfile=None, ...@@ -473,10 +473,10 @@ def pydotprint(fct, outfile=None,
c3 = pd.Cluster('Middle') c3 = pd.Cluster('Middle')
cond = None cond = None
for node in fct_env.toposort(): for node in fct_env.toposort():
if node.op.__class__.__name__=='Cond' and node.op.name == cond_highlight: if node.op.__class__.__name__=='IfElse' and node.op.name == cond_highlight:
cond = node cond = node
if cond is None: if cond is None:
_warn("pydotprint: cond_highlight is set but there is no Cond node in the graph") _warn("pydotprint: cond_highlight is set but there is no IfElse node in the graph")
cond_highlight = None cond_highlight = None
if cond_highlight is not None: if cond_highlight is not None:
......
import atexit, logging, os, stat, sys import atexit, logging, os, stat, sys
from theano.compile import optdb from theano.compile import optdb
from theano import config
from theano.gof.cmodule import get_lib_extension from theano.gof.cmodule import get_lib_extension
from theano.configparser import config, AddConfigVar, StrParam
import nvcc_compiler import nvcc_compiler
_logger_name = 'theano.sandbox.cuda' _logger_name = 'theano.sandbox.cuda'
...@@ -20,6 +20,22 @@ def debug(*msg): ...@@ -20,6 +20,22 @@ def debug(*msg):
_logger.debug('DEBUG (%s): %s'% ( _logger_name, _logger.debug('DEBUG (%s): %s'% ( _logger_name,
' '.join(str(m) for m in msg))) ' '.join(str(m) for m in msg)))
# Configuration and one-time setup for locating the CUDA toolkit.
AddConfigVar('cuda.root',
        """directory with bin/, lib/, include/ for cuda utilities.
        This directory is included via -L and -rpath when linking dynamically
        compiled modules. If AUTO, if nvcc is in the path, it will use one of
        this parent directory. Otherwise /usr/local/cuda. Leave empty to
        prevent extra linker directives.
        Default: environment variable "CUDA_ROOT" or else "AUTO".
        """,
        StrParam(os.getenv('CUDA_ROOT', "AUTO")))

if config.cuda.root == "AUTO":
    # set nvcc_path correctly and get the version
    nvcc_compiler.set_cuda_root()

# is_nvcc_available called here to initialize global vars in nvcc_compiler
# module
nvcc_compiler.is_nvcc_available()
# Compile cuda_ndarray.cu # Compile cuda_ndarray.cu
# This need that nvcc (part of cuda) is installed. If it is not, a warning is # This need that nvcc (part of cuda) is installed. If it is not, a warning is
......
...@@ -7,20 +7,7 @@ import commands ...@@ -7,20 +7,7 @@ import commands
_logger=logging.getLogger("theano.sandbox.cuda.nvcc_compiler") _logger=logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
_logger.setLevel(logging.WARN) _logger.setLevel(logging.WARN)
from theano.configparser import config, AddConfigVar, StrParam from theano.configparser import config, AddConfigVar, StrParam, BoolParam
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc in this directory",
StrParam(""))
AddConfigVar('cuda.nvccflags',
"Extra compiler flags for nvcc",
StrParam(""))
AddConfigVar('cuda.root',
"The directory with bin/, lib/, include/ for cuda utilities. Used to put this directory of nvidia lib in the compiled libraire. Usefull when people forget to update there LD_LIBRARY_PATH and LIBRARY_PATH environment variable. If AUTO, if nvcc is in the path, it will use one of this parent directory. Otherwise /usr/local/cuda. If empty, won't appen the directory in the compiled library",
StrParam(os.getenv('CUDA_ROOT', "AUTO")))
def error(*args): def error(*args):
#sys.stderr.write('ERROR:'+ ' '.join(str(a) for a in args)+'\n') #sys.stderr.write('ERROR:'+ ' '.join(str(a) for a in args)+'\n')
...@@ -35,6 +22,18 @@ def debug(*args): ...@@ -35,6 +22,18 @@ def debug(*args):
#sys.stderr.write('DEBUG:'+ ' '.join(str(a) for a in args)+'\n') #sys.stderr.write('DEBUG:'+ ' '.join(str(a) for a in args)+'\n')
_logger.debug("DEBUG: "+' '.join(str(a) for a in args)) _logger.debug("DEBUG: "+' '.join(str(a) for a in args))
# nvcc-specific configuration flags.
AddConfigVar('nvcc.compiler_bindir',
        "If defined, nvcc compiler driver will seek g++ and gcc in this directory",
        StrParam(""))

# Extra flags are split on spaces and appended to the nvcc command line.
AddConfigVar('nvcc.flags',
        "Extra compiler flags for nvcc",
        StrParam(""))

# NOTE(review): description left empty upstream; presumably toggles nvcc
# fast-math code generation -- confirm before documenting the help string.
AddConfigVar('nvcc.fastmath',
        "",
        BoolParam(False))
nvcc_path = 'nvcc' nvcc_path = 'nvcc'
nvcc_version = None nvcc_version = None
def is_nvcc_available(): def is_nvcc_available():
...@@ -66,11 +65,6 @@ def set_cuda_root(): ...@@ -66,11 +65,6 @@ def set_cuda_root():
config.cuda.root = os.path.split(dir)[0] config.cuda.root = os.path.split(dir)[0]
return return
if config.cuda.root == "AUTO":
set_cuda_root()
is_nvcc_available()#to set nvcc_path correctly and get the version
rpath_defaults = [] rpath_defaults = []
def add_standard_rpath(rpath): def add_standard_rpath(rpath):
rpath_defaults.append(rpath) rpath_defaults.append(rpath)
...@@ -183,11 +177,9 @@ def nvcc_module_compile_str( ...@@ -183,11 +177,9 @@ def nvcc_module_compile_str(
if sys.platform != 'darwin': if sys.platform != 'darwin':
# the 64bit CUDA libs are in the same files as are named by the function above # the 64bit CUDA libs are in the same files as are named by the function above
rpaths.append(os.path.join(config.cuda.root,'lib64')) rpaths.append(os.path.join(config.cuda.root,'lib64'))
for rpath in rpaths: for rpath in rpaths:
cmd.extend(['-Xlinker',','.join(['-rpath',rpath])]) cmd.extend(['-Xlinker',','.join(['-rpath',rpath])])
nvccflags = [flag for flag in config.cuda.nvccflags.split(' ') if flag] cmd.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
cmd.extend(nvccflags)
cmd.extend('-I%s'%idir for idir in include_dirs) cmd.extend('-I%s'%idir for idir in include_dirs)
cmd.extend(['-o',lib_filename]) cmd.extend(['-o',lib_filename])
cmd.append(os.path.split(cppfilename)[-1]) cmd.append(os.path.split(cppfilename)[-1])
......
...@@ -270,6 +270,48 @@ def local_gpu_dot_to_dot22(node): ...@@ -270,6 +270,48 @@ def local_gpu_dot_to_dot22(node):
shape_out))] shape_out))]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_lazy_ifelse(node):
    """
    Move a lazy ifelse to the GPU when it touches GPU transfers:

        gpu_from_host(ifelse(c, t, f)) -> gpu_ifelse(c, gpu(t), gpu(f))
        ifelse(..., host_from_gpu(x), ...) -> host_from_gpu(gpu_ifelse(...))
    """
    import theano
    # Guard: theano.lazycond may be absent from this build.
    if hasattr(theano,"lazycond"):
        gpu_ifelse = theano.lazycond.IfElse(gpu = True)

        if node.op == gpu_from_host:
            host_input = node.inputs[0]
            if (host_input.owner
                    and host_input.owner.op == theano.lazycond.ifelse):
                c, t, f = host_input.owner.inputs
                # Move both branch values to the GPU...
                if not isinstance(f.type,CudaNdarrayType):
                    f = gpu_from_host(f)
                if not isinstance(t.type,CudaNdarrayType):
                    t = gpu_from_host(t)
                # ...but keep the condition on the host.
                if isinstance(c.type,CudaNdarrayType):
                    c = host_from_gpu(c)
                return [gpu_ifelse(c, t, f)]

        if node.op == theano.lazycond.ifelse:
            # If any input already comes back from the GPU, lift the whole
            # ifelse to the GPU and transfer only the result.
            if numpy.any([(i.owner and i.owner.op == host_from_gpu) for i in node.inputs]):
                c, t, f = node.inputs
                if not isinstance(f.type,CudaNdarrayType):
                    f = gpu_from_host(f)
                if not isinstance(t.type,CudaNdarrayType):
                    t = gpu_from_host(t)
                if isinstance(c.type,CudaNdarrayType):
                    c = host_from_gpu(c)
                return [host_from_gpu(gpu_ifelse(c, t, f))]
    return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
......
...@@ -567,7 +567,7 @@ class ScanMerge(gof.Optimizer): ...@@ -567,7 +567,7 @@ class ScanMerge(gof.Optimizer):
def apply(self, env): def apply(self, env):
nodelist = list(env.toposort()) nodelist = list(env.toposort())
cond_nodes = [ x for x in nodelist if x.op.__class__.__name__=='Cond'] cond_nodes = [ x for x in nodelist if x.op.__class__.__name__=='IfElse']
scan_nodes = [ x for x in nodelist if x.op.__class__.__name__=='Scan'] scan_nodes = [ x for x in nodelist if x.op.__class__.__name__=='Scan']
# Having lazy ifs in the graph complicates a bit things, and for # Having lazy ifs in the graph complicates a bit things, and for
......
...@@ -133,6 +133,79 @@ def sp_ones_like(x): ...@@ -133,6 +133,79 @@ def sp_ones_like(x):
data, indices, indptr, shape = csm_properties(x) #TODO: don't restrict to CSM formats data, indices, indptr, shape = csm_properties(x) #TODO: don't restrict to CSM formats
return CSM(format=x.format)(tensor.ones_like(data), indices, indptr, shape) return CSM(format=x.format)(tensor.ones_like(data), indices, indptr, shape)
class _sparse_py_operators:
    # Mixin giving sparse Variables/Constants the usual Python operator
    # syntax by delegating to the module-level sparse Ops.
    T = property(lambda self: transpose(self), doc = "Return aliased transpose of self (read-only)")
    def __neg__(self): return neg(self)
    def __add__(left, right): return add(left, right)
    def __radd__(right, left): return add(left, right)
    def __sub__(left, right): return sub(left, right)
    def __rsub__(right, left): return sub(left, right)
    def __mul__(left, right): return mul(left, right)
    def __rmul__(left, right): return mul(left, right)

    #extra pseudo-operator symbols
    def __dot__(left, right): return structured_dot(left, right)
    def __rdot__(right, left): return structured_dot(left, right)

    #N.B. THIS IS COMMENTED OUT ON PURPOSE!!!
    # Discussion with Fred & James (at least, and maybe others before)
    # we decided that casting from a sparse to dense should be explicit
    # because it's usually something you want to be pretty careful about,
    # and not to do by accident.
    #def _as_TensorVariable(self):
    #    return dense_from_sparse(self)

    shape = property(lambda self: tensor.shape(dense_from_sparse(self))) # don't worry!
    # ... the plan is that the ShapeFeature in tensor.opt will do shape propagation
    # ... and remove the dense_from_sparse from the graph. This will *NOT* actually expand
    # ... your sparse matrix just to get the shape.
    ndim = property(lambda self: self.type.ndim)
    dtype = property(lambda self: self.type.dtype)
class SparseVariable(gof.Variable, _sparse_py_operators):
    """Symbolic sparse variable; renders as ``ClassName{format,dtype}``."""

    dtype = property(lambda self: self.type.dtype)
    format = property(lambda self: self.type.format)

    def __str__(self):
        # e.g. "SparseVariable{csc,float64}"
        return '%s{%s,%s}' % (self.__class__.__name__, self.format, self.dtype)

    def __repr__(self):
        return self.__str__()
class SparseConstantSignature(tuple):
    """Hashable (type, data) pair used as the merge signature of a sparse
    constant.

    Two signatures compare equal when the types match, the data objects
    agree on class/dtype/shape, and the element-wise difference is
    negligible relative to the number of stored elements.
    """

    def __eq__(self, other):
        this_type, this_data = self
        other_type, other_data = other
        return (this_type == other_type
                and this_data.dtype == other_data.dtype
                and type(this_data) == type(other_data)
                and this_data.shape == other_data.shape
                # tolerance scales with nnz, the stored-element count
                and abs(this_data - other_data).sum() < 1e-6 * this_data.nnz)

    def __hash__(self):
        sig_type, sig_data = self
        # Hash only type information: data contents are checked by __eq__.
        return hash(type(self)) ^ hash(sig_type) ^ hash(type(sig_data))
class SparseConstant(gof.Constant, _sparse_py_operators):
    # Graph constant whose ``self.data`` holds the actual sparse matrix
    # (presumably a scipy.sparse matrix -- it must expose .shape and .nnz).
    dtype = property(lambda self: self.type.dtype)
    format = property(lambda self: self.type.format)

    def signature(self):
        # Hashable signature used to decide whether two constants can be
        # merged; see SparseConstantSignature.
        assert self.data is not None
        return SparseConstantSignature((self.type, self.data))

    def __str__(self):
        return '%s{%s,%s,shape=%s,nnz=%s}'%(
                self.__class__.__name__,
                self.format,
                self.dtype,
                self.data.shape,
                self.data.nnz)

    def __repr__(self):
        return str(self)
class SparseValue(gof.Value, _sparse_py_operators):
    # Value counterpart of SparseVariable, with the same convenience
    # properties delegating to the type.
    dtype = property(lambda self: self.type.dtype)
    format = property(lambda self: self.type.format)
class SparseType(gof.Type): class SparseType(gof.Type):
""" """
...@@ -149,6 +222,9 @@ class SparseType(gof.Type): ...@@ -149,6 +222,9 @@ class SparseType(gof.Type):
dtype_set = set(['int', 'int8', 'int16','int32', 'int64', 'float32', 'float64', 'complex64','complex128']) dtype_set = set(['int', 'int8', 'int16','int32', 'int64', 'float32', 'float64', 'complex64','complex128'])
ndim = 2 ndim = 2
Variable = SparseVariable
Constant = SparseConstant
def __init__(self, format, dtype): def __init__(self, format, dtype):
""" """
Fundamental way to create a sparse node. Fundamental way to create a sparse node.
...@@ -248,65 +324,6 @@ csr_dmatrix = SparseType(format='csr', dtype='float64') ...@@ -248,65 +324,6 @@ csr_dmatrix = SparseType(format='csr', dtype='float64')
csc_fmatrix = SparseType(format='csc', dtype='float32') csc_fmatrix = SparseType(format='csc', dtype='float32')
csr_fmatrix = SparseType(format='csr', dtype='float32') csr_fmatrix = SparseType(format='csr', dtype='float32')
class _sparse_py_operators:
T = property(lambda self: transpose(self), doc = "Return aliased transpose of self (read-only)")
def __neg__(self): return neg(self)
def __add__(left, right): return add(left, right)
def __radd__(right, left): return add(left, right)
def __sub__(left, right): return sub(left, right)
def __rsub__(right, left): return sub(left, right)
def __mul__(left, right): return mul(left, right)
def __rmul__(left, right): return mul(left, right)
#extra pseudo-operator symbols
def __dot__(left, right): return structured_dot(left, right)
def __rdot__(right, left): return structured_dot(left, right)
#N.B. THIS IS COMMENTED OUT ON PURPOSE!!!
# Discussion with Fred & James (at least, and maybe others before)
# we decided that casting from a sparse to dense should be explicit
# because it's usually something you want to be pretty careful about,
# and not to do by accident.
#def _as_TensorVariable(self):
# return dense_from_sparse(self)
shape = property(lambda self: tensor.shape(dense_from_sparse(self))) # don't worry!
# ... the plan is that the ShapeFeature in tensor.opt will do shape propagation
# ... and remove the dense_from_sparse from the graph. This will *NOT* actually expand
# ... your sparse matrix just to get the shape.
ndim = property(lambda self: self.type.ndim)
dtype = property(lambda self: self.type.dtype)
class SparseVariable(gof.Variable, _sparse_py_operators):
dtype = property(lambda self: self.type.dtype)
format = property(lambda self: self.type.format)
class SparseConstantSignature(tuple):
def __eq__(self, other):
(a, b), (x,y) = self, other
return a == x \
and (b.dtype == y.dtype)\
and (type(b) == type(y))\
and (b.shape == y.shape)\
and (abs(b-y).sum() < 1e-6 * b.nnz)
def __hash__(self):
(a,b) = self
return hash(type(self)) ^ hash(a) ^ hash(type(b))
class SparseConstant(gof.Constant, _sparse_py_operators):
dtype = property(lambda self: self.type.dtype)
format = property(lambda self: self.type.format)
def signature(self):
assert self.data is not None
return SparseConstantSignature((self.type, self.data))
class SparseValue(gof.Value, _sparse_py_operators):
dtype = property(lambda self: self.type.dtype)
format = property(lambda self: self.type.format)
# CONSTRUCTION # CONSTRUCTION
class CSMProperties(gof.Op): class CSMProperties(gof.Op):
"""Extract all of .data .indices and .indptr""" """Extract all of .data .indices and .indptr"""
......
...@@ -937,6 +937,9 @@ def _gemm_from_node2(node): ...@@ -937,6 +937,9 @@ def _gemm_from_node2(node):
lst = _factor_canonicalized(lst) lst = _factor_canonicalized(lst)
rval = _gemm_from_factored_list(lst) rval = _gemm_from_factored_list(lst)
#print "RVAL", rval #print "RVAL", rval
# THIS GOT COMMENTED OUT AT SOME POINT - ASK P.Lamblin maybe why?
#if rval:
# assert rval[0].type == node.outputs[0].type, (rval[0].type, node.outputs[0].type)
if rval and (rval[0].type == node.outputs[0].type): if rval and (rval[0].type == node.outputs[0].type):
return rval return rval
......
...@@ -3057,30 +3057,33 @@ def constant_folding(node): ...@@ -3057,30 +3057,33 @@ def constant_folding(node):
for input in node.inputs: for input in node.inputs:
if not isinstance(input, Constant): if not isinstance(input, Constant):
return False return False
try: #condition: all inputs are constant
storage = [[None] for output in node.outputs]
node.op.perform(node, [x.data for x in node.inputs], storage) storage_map=dict([(i,[i.data]) for i in node.inputs])
except MethodNotDefined: compute_map=dict([(i,[True]) for i in node.inputs])
tmp_inputs = [x.type() for x in node.inputs] for o in node.outputs:
f = compile.function( storage_map[o] = [None]
inputs=tmp_inputs, compute_map[o] = [False]
outputs=node.op.make_node(*tmp_inputs).outputs,
mode=compile.Mode(linker='c|py',optimizer=None)) thunk = node.op.make_thunk(node, storage_map, compute_map,
xvals = f(*[x.data for x in node.inputs]) no_recycling=[])
storage = [[xv] for xv in xvals]
required = thunk()
msg = [] assert not required # a node whose inputs are all provided should always
assert len(storage) == len(node.outputs) # return successfully
for s, output in zip(storage, node.outputs):
rval = []
for output in node.outputs:
assert compute_map[output][0], (output, storage_map[output][0])
try: try:
constant = output.type.Constant constant = output.type.Constant
except: except AttributeError:
constant = Constant constant = Constant
msg += [constant(output.type, s[0])] rval.append(constant(output.type, storage_map[output][0]))
return msg return rval
register_canonicalize(constant_folding, 'fast_compile') register_canonicalize(constant_folding, 'fast_compile')
register_stabilize(constant_folding) # because register_stabilize(constant_folding)
register_specialize(constant_folding) register_specialize(constant_folding)
def _is_1(expr): def _is_1(expr):
......
...@@ -20,7 +20,7 @@ def test_no_reuse(): ...@@ -20,7 +20,7 @@ def test_no_reuse():
return return
assert not 'should not get here' assert not 'should not get here'
def test_gc(): def test_gc_never_pickles_temporaries():
x = T.dvector() x = T.dvector()
#print >> sys.stderr, 'BUILDING GRAPH' #print >> sys.stderr, 'BUILDING GRAPH'
...@@ -32,32 +32,63 @@ def test_gc(): ...@@ -32,32 +32,63 @@ def test_gc():
optimizer=None optimizer=None
optimizer='fast_run' optimizer='fast_run'
for f_linker, g_linker in [ for f_linker, g_linker in [
(theano.PerformLinker(allow_gc = True), theano.PerformLinker(allow_gc=False)), (theano.PerformLinker(allow_gc = True), theano.PerformLinker(allow_gc=False)),
(theano.OpWiseCLinker(allow_gc = True), theano.OpWiseCLinker(allow_gc=False))]: (theano.OpWiseCLinker(allow_gc = True), theano.OpWiseCLinker(allow_gc=False))]:
#f_linker has garbage collection
#g_linker has no garbage collection
#print >> sys.stderr, 'COMPILING' #print >> sys.stderr, 'COMPILING'
f = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=f_linker)) f = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=f_linker))
g = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=g_linker))
len_pre_f = len(cPickle.dumps(f))
len_pre_g = len(cPickle.dumps(g))
# should be no difference at first
# In future, FunctionMaker might pickle linker-dependent stuff and make
# this assertion fail.
assert len_pre_f == len_pre_g
def a(fn):
return len(cPickle.dumps(fn.maker))
assert a(f) == a(f) # some sanity checks on the pickling mechanism
assert a(g) == a(g) # some sanity checks on the pickling mechanism
g = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=f_linker)) def b(fn):
return len(
cPickle.dumps(
theano.compile.function_module._pickle_Function(
fn)))
assert b(f) == b(f) # some sanity checks on the pickling mechanism
pre_f = cPickle.dumps(f) def c(fn):
pre_g = cPickle.dumps(g) return len(cPickle.dumps(fn))
assert c(f) == c(f) # some sanity checks on the pickling mechanism
assert c(g) == c(g) # some sanity checks on the pickling mechanism
#print >> sys.stderr, 'RUNNING'
# now run the function once to create temporaries within the no-gc
# linker
f(numpy.ones(100, dtype='float64')) f(numpy.ones(100, dtype='float64'))
g(numpy.ones(100, dtype='float64')) g(numpy.ones(100, dtype='float64'))
# serialize the functions again
post_f = cPickle.dumps(f) post_f = cPickle.dumps(f)
post_g = cPickle.dumps(g) post_g = cPickle.dumps(g)
#because allow_gc should leave the function un-changed by calling
assert len(pre_f) == len(post_f)
#because temporaries that weren't collected shouldn't be pickled anyway
len_post_f = len(post_f) len_post_f = len(post_f)
len_post_g = len(post_g) len_post_g = len(post_g)
assert len_post_f == len_post_g
#assert that f() didn't cause the function to grow
# allow_gc should leave the function un-changed by calling
assert len_pre_f == len_post_f
#assert that g() didn't cause g to grow
# because temporaries that weren't collected shouldn't be pickled anyway
assert len_post_f == len_post_g, (f_linker, len_post_f, len_post_g)
def test_merge_opt_runtime(): def test_merge_opt_runtime():
......
...@@ -49,11 +49,14 @@ class T_random_function(unittest.TestCase): ...@@ -49,11 +49,14 @@ class T_random_function(unittest.TestCase):
rng_R = random_state_type() rng_R = random_state_type()
# use make_node to override some of the self.args # use make_node to override some of the self.args
post_r2, out2 = rf2(rng_R, (4,), -2, 2) post_r2, out2 = rf2(rng_R, (4,), -2, 2) # NOT INPLACE
post_r2_4, out2_4 = rf2(rng_R, (4,), -4.0, 2) post_r4, out4 = rf4(rng_R, (4,), -4, 4) # INPLACE
post_r2_4_4, out2_4_4 = rf2(rng_R, (4,), -4.0, 4.0) post_r2_4, out2_4 = rf2(rng_R, (4,), -4.0, 2) # NOT INPLACE
post_r4, out4 = rf4(rng_R, (4,), -4, 4) post_r2_4_4, out2_4_4 = rf2(rng_R, (4,), -4.0, 4.0) # NOT INPLACE
# configure out4 to be computed inplace
# The update expression means that the random state rng_R will
# be maintained by post_r4
f = compile.function( f = compile.function(
[compile.In(rng_R, [compile.In(rng_R,
value=numpy.random.RandomState(utt.fetch_seed()), value=numpy.random.RandomState(utt.fetch_seed()),
...@@ -65,9 +68,25 @@ class T_random_function(unittest.TestCase): ...@@ -65,9 +68,25 @@ class T_random_function(unittest.TestCase):
f2, f4, f2_4, f2_4_4 = f() f2, f4, f2_4, f2_4_4 = f()
f2b, f4b, f2_4b, f2_4_4b = f() f2b, f4b, f2_4b, f2_4_4b = f()
assert numpy.allclose(f2*2, f4) print f2
assert numpy.allclose(f2_4_4, f4) print f4
assert not numpy.allclose(f4, f4b) print f2_4
print f2_4_4
#print f2b
#print f4b
#print f2_4b
#print f2_4_4b
# setting bounds is same as multiplying by 2
assert numpy.allclose(f2*2, f4), (f2, f4)
# retrieving from non-inplace generator
# is same as inplace one for first call
assert numpy.allclose(f2_4_4, f4), (f2_4_4, f4)
# f4 changes from call to call, that the update has worked
assert not numpy.allclose(f4, f4b), (f4, f4b)
def test_inplace_optimization(self): def test_inplace_optimization(self):
"""Test that FAST_RUN includes the random_make_inplace optimization""" """Test that FAST_RUN includes the random_make_inplace optimization"""
......
...@@ -13,19 +13,32 @@ from theano.tests import unittest_tools as utt ...@@ -13,19 +13,32 @@ from theano.tests import unittest_tools as utt
should ensure that it will remain operational should ensure that it will remain operational
''' '''
class T_diverse(unittest.TestCase): class T_scipy(unittest.TestCase):
def setUp(self): def setUp(self):
utt.seed_rng() utt.seed_rng()
self.orig_floatX = theano.config.floatX
def tearDown(self):
theano.config.floatX = self.orig_floatX
def scipy_paper_example1(self): def test_scipy_paper_example1(self):
a = theano.tensor.vector('a') # declare variable a = theano.tensor.vector('a') # declare variable
b = a + a**10 # build expression b = a + a**10 # build expression
f = theano.function([a], b) # compile function f = theano.function([a], b) # compile function
assert numpy.all(f([0,1,2]) == numpy.array([0,2,1026])) assert numpy.all(f([0,1,2]) == numpy.array([0,2,1026]))
def scipy_papaer_example2(self): def test_scipy_paper_example2(self):
''' This just sees if things compile well and if they run ''' ''' This just sees if things compile well and if they run '''
# PREAMPBLE
T = theano.tensor
shared = theano.shared
function = theano.function
rng = numpy.random
theano.config.floatX='float64'
#
# ACTUAL SCRIPT FROM PAPER
x = T.matrix() x = T.matrix()
y = T.vector() y = T.vector()
w = shared(rng.randn(100)) w = shared(rng.randn(100))
...@@ -52,6 +65,7 @@ class T_diverse(unittest.TestCase): ...@@ -52,6 +65,7 @@ class T_diverse(unittest.TestCase):
for i in range(training_steps): for i in range(training_steps):
pred, err = train(D[0], D[1]) pred, err = train(D[0], D[1])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论