提交 cd644635 authored 作者: James Bergstra's avatar James Bergstra

merge

Debugging with a customized so-called StepMode
==============================================
One convenient trick I've found for debugging programs that run with Theano is to
use what I call a 'StepMode'. There is no such StepMode in the standard library because
its purpose is to be customized ('hacked') to investigate what your own particular program is doing.
.. code-block:: python
from theano.gof.link import WrapLinkerMany
from theano.compile.mode import (Mode, register_mode, predefined_modes, predefined_linkers,
predefined_optimizers, default_linker, default_optimizer)
class StepMode(Mode):
def __init__(self, linker=default_linker, optimizer=default_optimizer):
def blah(i, node, th):
# This function will be run for each node in your compiled program.
# here you can inspect all the values as they are computed,
# ... you can even change them !
# 'i' is the execution position in the serialized graph
# node is the symbolic Apply instance
# th is a callable thing that will compute the node.
print i, node, len(th.inputs)
# the symbolic inputs of the node are in node.inputs
# the j'th non-symbolic input of the node is in th.inputs[j][0]
th() # call the function to actually 'run' the graph
# the symbolic outputs of the node are in node.outputs
# the j'th non-symbolic output of the node is in th.outputs[j][0]
print type(th.outputs[0][0])
if i == 39:
print 'this node is weird...', th.outputs[0][0]
self.provided_linker = linker
self.provided_optimizer = optimizer
if isinstance(linker, str) or linker is None:
linker = predefined_linkers[linker]
self.linker = WrapLinkerMany([linker], [blah])
if isinstance(optimizer, str) or optimizer is None:
optimizer = predefined_optimizers[optimizer]
self._optimizer = optimizer
The way to use it is like this:
.. code-block:: python
fn = function(inputs, outputs, mode=StepMode())
When you call fn, your function in the stepmode will be called for each node in the compiled
program. You can print out some or all of the values, you can change them in mid-execution.
You can see where bizarre values are first occurring in your computations. It's a very
powerful way to understand your program's execution.
Remember, if you give names to your variables, then printing nodes will give you a better idea of
where in the calculations you are.
......@@ -15,4 +15,5 @@ Advanced Topics (under construction)
ccodegen
function
module
debugging_with_stepmode
......@@ -22,7 +22,59 @@ from io import *
import logging
_logger = logging.getLogger('theano.compile.function_module')
def view_map_root(v):
    """Return the variable that v is ultimately a view of.

    Follows view_map/destroy_map annotations backward through the graph,
    starting at v, until reaching a variable that is not a view of any
    other variable, and returns that root variable.
    """
    while v.owner is not None:
        apply_node = v.owner
        out_index = apply_node.outputs.index(v)
        vmap = getattr(apply_node.op, 'view_map', {})
        dmap = getattr(apply_node.op, 'destroy_map', {})
        viewed_inputs = vmap.get(out_index, []) + dmap.get(out_index, [])
        if len(viewed_inputs) > 1:
            # an output that views/destroys several inputs is not supported
            raise NotImplementedError()
        if not viewed_inputs:
            # v is not a view of anything: it is the root
            return v
        # step to the input that v is a view of, and keep walking
        v = apply_node.inputs[viewed_inputs[0]]
    return v
def view_tree_set(v, treeset):
"""Add to `treeset` all variables that are views of v, given that v is not a view"""
treeset.add(v)
for cl, v_input_pos_to_cl in v.clients:
if cl == 'output':
continue
vmap = getattr(cl.op, 'view_map', {})
dmap = getattr(cl.op, 'destroy_map', {})
for opos, iposlist in vmap.items() + dmap.items():
if v_input_pos_to_cl in iposlist:
if cl.outputs[opos] not in treeset:
view_tree_set(cl.outputs[opos], treeset)
def infer_reuse_pattern(env, outputs_to_disown):
    """
    Given an env and a list of variables, return the set of all variables which may
    share the same underlying data storage as any of the specified variables.  Used
    internally by function, FunctionMaker.

    This set is also referred to as no_recycling sometimes, especially by linker code.

    :param env: the Env containing the graph (unused directly here, kept for the
        established call signature)
    :param outputs_to_disown: variables whose storage must not be recycled
    :rtype: set of Variable
    """
    rval = set()
    for o in outputs_to_disown:
        # collect every variable in the view-tree rooted at o's view root
        view_tree_set(view_map_root(o), rval)
    # remove from rval all of the inputs, constants, values: they have no owner,
    # and their storage is managed by the caller rather than by the linker.
    rval = set(r for r in rval if r.owner is not None)
    # NOTE: an always-on 'if 1:' debug block that cross-checked the result against
    # the deprecated _old_infer_reuse_pattern on every call has been removed;
    # it added overhead to every function compilation.
    return rval
def _old_infer_reuse_pattern(env, outputs_to_disown):
"""
Given an env and a list of variables, returns the list of all
variables which may share the same underlying data storage as any of
......@@ -39,18 +91,8 @@ def infer_reuse_pattern(env, outputs_to_disown):
do_not_reuse.append(r)
node = r.owner
op = node.op
if hasattr(op, 'destroy_map'):
dmap = op.destroy_map
else:
dmap = {}
if hasattr(op, 'view_map'):
vmap = op.view_map
else:
vmap = {}
#backport
#dmap = op.destroy_map if hasattr(op, 'destroy_map') else {}
#vmap = op.view_map if hasattr(op, 'view_map') else {}
dmap = getattr(op, 'destroy_map', {})
vmap = getattr(op, 'view_map', {})
for l in dmap.values() + vmap.values():
for i in l:
walk(node.inputs[i])
......@@ -515,6 +557,7 @@ class SanityCheckFunction(Function):
super(SanityCheckFunction, self).__init__(*args, **kwargs)
self.others = others
self.check_equal = check_equal
# DEPRECATED? Is this just for DualLinker?
def __setitem__(self, item, value):
super(SanityCheckFunction, self).__setitem__(item, value)
......@@ -739,6 +782,7 @@ class FunctionMaker(object):
input_storage_lists.append([input_storage_i])
defaults.append((self.required[i], self.refeed[i], input_storage_i))
# Get a function instance
_fn, _i, _o = self.linker.make_thunk(input_storage = input_storage_lists)
fn = self.function_builder(_fn, _i, _o, self.indices, self.outputs, defaults, self.unpack_single, self.return_none, self)
......@@ -791,7 +835,7 @@ def register_checker(checker):
def function(inputs, outputs, mode=None, accept_inplace = False):
"""
Return a function calculating the outputs from the inputs.
Return a Function that will calculate the outputs from the inputs.
:param inputs: list of `SymbolicInput` or `In` instances
......@@ -804,61 +848,41 @@ def function(inputs, outputs, mode=None, accept_inplace = False):
Currently, the library provides the following mode strings:
- SANITY_CHECK TODO: NotImplemented
- FAST_COMPILE (apply only optimization that are fast to apply)
- FAST_RUN (default) (optimize without too much time)
- EXPENSIVE_OPTIMIZATION TODO: NotImplemented
- FAST_COMPILE (minimal optimization)
- PROFILE_MODE : allow to print a profile mode with mode.print_summary
- DEBUG_MODE : make all the check that we taught of(compare python and c,...)
- DEBUG_MODE : verify many internal conditions that are normally assumed (SLOW)
:param accept_inplace: True iff the graph can contain inplace operations prior to the
optimization phase (default is False)
Every element of the input list will be upgraded to an `In` instance if necessary,
using the rules implemented by the `convert_function_input` function.
Similarly, every element of the output list will be upgraded to an
`Out` instance if necessary:
* a `Variable` instance r will be upgraded like `Out`(r)
Random Numbers
--------------
"""
If your computation involves random numbers, then you have to pass the `RandomKit` as an
input argument. That RandomKit must have a name to be able to seed the generator. To seed
the generator, use the `__setitem__` method:
#Every element of the input list will be upgraded to an `In` instance if necessary,
#using the rules implemented by the `convert_function_input` function.
..code-block: python
f[<kitname>] = seed #re-seed the elements of a RandomKit
#Similarly, every element of the output list will be upgraded to an
#`Out` instance if necessary:
"""
t1 = time.time()
if mode is None:
mode = mode_module.default_mode
#backport
#mode = mode if mode is not None else mode_module.default_mode
mode = mode_module.default_mode
inputs = map(convert_function_input, inputs)
if outputs is not None:
if isinstance(outputs, (list, tuple)):
outputs = map(FunctionMaker.wrap_out, outputs)
else:
outputs = FunctionMaker.wrap_out(outputs)
#backport
#outputs = map(FunctionMaker.wrap_out, outputs) if isinstance(outputs, (list, tuple)) else FunctionMaker.wrap_out(outputs)
if isinstance(outputs, (list, tuple)):
outputs = map(FunctionMaker.wrap_out, outputs)
else:
outputs = FunctionMaker.wrap_out(outputs)
defaults = [getattr(input, 'value', None) for input in inputs]
mode = mode_module.predefined_modes.get(mode, mode)
if isinstance(mode, (list, tuple)): # "mode comparison" semantics
_logger.warning('Passing multiple modes is deprecated (20091019)')
if not mode:
raise ValueError("Please provide at least one mode.")
elif len(mode) == 1:
......
"""WRITEME
"""
import os, logging
import numpy
import os
import scipy.sparse as sp
from theano import gof
_logger = logging.getLogger('theano.compile.mode')
def check_equal(x, y):
"""
Returns True iff x[0] and y[0] are equal (checks the dtype and
......@@ -74,9 +79,51 @@ def register_optimizer(name, opt):
raise ValueError('Optimizer name already taken: %s' % name)
predefined_optimizers[name] = opt
class OutputGuard(gof.Op):
    """Identity Op that declares (via destroy_map) that it destroys its input.

    Inserted in front of graph outputs (see AddDestroyHandler) so that the
    DestroyHandler will reject any later attempt by an inplace Op to destroy
    those outputs.  At runtime it simply passes its input through unchanged.
    """
    # claims to destroy input 0 and to view input 0 in output 0
    destroy_map = {0:[0]}
    view_map = {0:[0]}
    def make_node(self, x):
        # the output has exactly the same type as the input
        return gof.Apply(self, [x], [x.type()])
    def __eq__(self, other):
        # all OutputGuard instances are interchangeable
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def perform(self, node, (x,), (z,)):
        # identity: forward the input object to the output storage
        z[0] = x
    def __str__(self):
        return '%s' % self.__class__.__name__
    def c_code(self, node, nodename, (x,), (z,), sub):
        # C implementation of the identity: hand the Python reference from x to z
        return """
Py_XDECREF(%(z)s);
%(z)s = %(x)s;
Py_XINCREF(%(z)s);
""" %locals()
    def c_code_cache_version(self):
        return (1,)
_output_guard = OutputGuard()
class AddDestroyHandler(gof.Optimizer):
    """This optimizer performs two important functions:
    1) It has a 'requirement' of the destroyhandler.  This means that the env will include it
    as a feature for this optimization, and keep this feature enabled for subsequent
    optimizations.  All optimizations that work inplace on any of their inputs must run *after*
    this optimization to ensure that the DestroyHandler has been included in the env.
    2) It tries to replace each output with an Op that purports to destroy it (but it won't, I
    promise).  If this replacement succeeds it means that there is a bug in theano.  It should
    not be possible to destroy outputs.
    """
    def apply(self, env):
        pass  # NOTE(review): stray no-op -- presumably leftover from an earlier empty body; confirm
        for o in env.outputs:
            try:
                # if the DestroyHandler accepts this replacement, the output was
                # destroyable, which should never happen: warn loudly.
                env.replace_validate(o, _output_guard(o), reason='output_guard')
                _logger.warning("Output variable %s required output_guard,"
                        " how was this output left unprotected against destructive operations?"
                        % o)
            except gof.InconsistencyError:
                #this output is already impossible to destroy. no guard necessary
                pass
    def add_requirements(self, env):
        # attach the DestroyHandler feature to the env so it stays active for
        # all subsequent (inplace) optimizations
        super(AddDestroyHandler, self).add_requirements(env)
        env.extend(gof.DestroyHandler())
......
......@@ -21,7 +21,7 @@ class ProfileMode(Mode):
op_time, op_cimpl, op_call, compile_time))
def __getstate__(self):
print "__getstate__",self.provided_linker,self.provided_optimizer
#print "__getstate__",self.provided_linker,self.provided_optimizer
return (self.provided_linker, self.provided_optimizer, self.local_time,
self.apply_time, self.apply_call,
self.op_time, self.op_cimpl, self.op_call, self.compile_time)
......@@ -255,7 +255,8 @@ def atexit_print_default_profile_mode():
THEANO_DEFAULT_MODE=PROFILE_MODE
"""
prof_mode=predefined_modes["PROFILE_MODE"]
if prof_mode.local_time[0]>0: prof_mode.print_summary()
if prof_mode.local_time[0]>0:
prof_mode.print_summary()
#Register atexit_print_default_profile_mode to have the summary of the
#predefined mode PROFILE_MODE if it is used printed when the program terminate.
......
......@@ -737,18 +737,19 @@ def test_pickle_aliased_memory():
m.x[0,0] = 3.14
assert m.y[0,0] == 3.14
import StringIO
import StringIO, logging
sio = StringIO.StringIO()
handler = logging.StreamHandler(sio)
logging.getLogger('theano.compile.function_module').addHandler(handler)
try:
m.f.pickle_aliased_memory_strategy = 'warn'
m.g.pickle_aliased_memory_strategy = 'warn'
m_dup = cPickle.loads(cPickle.dumps(m))
assert sio.getvalue().startswith('aliased relat')
finally:
logging.getLogger('theano.compile.function_module').removeHandler(handler)
old_stderr = sys.stderr
sys.stderr = sio
m.f.pickle_aliased_memory_strategy = 'warn'
m.g.pickle_aliased_memory_strategy = 'warn'
m_dup = cPickle.loads(cPickle.dumps(m))
sys.stderr = old_stderr
assert sio.getvalue().startswith('WARNING: aliased relat')
try:
m.f.pickle_aliased_memory_strategy = 'raise'
m.g.pickle_aliased_memory_strategy = 'raise'
......
......@@ -585,9 +585,12 @@ class CLinker(link.Linker):
"""
ret = ["-O3"]
# this is the param the -ffast-math activate. I put the explicitly as FillMissing must disable some of them. Putting -ffast-math would make it disable all other parameter at the same time.
ret += ["-fno-math-errno", "-funsafe-math-optimizations",
"-fno-signaling-nans", "-fcx-limited-range",
"-fno-rounding-math", "-ffinite-math-only",
ret += ["-fno-math-errno",
#"-funsafe-math-optimizations",
#"-fno-signaling-nans",
#"-fcx-limited-range",
#"-fno-rounding-math",
#"-ffinite-math-only",
"-Wno-unused-label",#the current code generate label event if they are not used. Could use gcc attribute for those label only
"-Wno-unused-variable",#idem as the precedent
"-Wno-write-strings",#generated by our code generator...
......@@ -758,38 +761,61 @@ class CLinker(link.Linker):
return res
def cmodule_key(self):
"""Return a complete hashable signature of the module we compiled
"""Return a complete hashable signature of the module we compiled.
This function must have the property that no two programs that compute different things
yield the same key.
The key returned by this function is of the form (version, signature)
The signature has the following form:
{{{
'CLinker.cmodule_key', compilation args, libraries,
op0, (input0.type, input1.type, input0 pos, input1 pos)
op1, (...)
(op0, input_signature0, output_signature0),
(op1, input_signature1, output_signature1),
...
opK, (...)
}}}
(opK, input_signatureK, output_signatureK),
}}}
The signature is a tuple, some elements of which are sub-tuples.
The signature is a tuple of tuples.
The outer tuple has a brief header, followed by elements for every node in the
topological ordering of `self.env`.
If the Op of any Apply in the Env does not have c_code_cache_ok()==True, then this
function raises a KeyError exception.
The outer tuple has one element for every node in the topological ordering of
`self.env`.
Input Signature
---------------
The inner tuple has one element for the op used at that node, and one element for the
inputs to that node. The inputs are identified by their type and "graph position"
Each input signature is a tuple with an element for each input to the corresponding
Apply node.
Each element identifies the type of the node input, and the nature of that input in the
graph.
The graph position of a typical variable is encoded by integer pairs ``(a,b)``:
The nature of a typical variable is encoded by integer pairs ``((a,b),c)``:
``a`` is the topological position of the input's owner (-1 for graph inputs),
``b`` is the index of the variable in the owner's output list.
``c`` is a flag indicating whether the variable is in the no_recycling set.
If a variable is also a graph output, then its position in the outputs list is also
bundled with this tuple (after the b).
The graph position of a Constant instance is defined as its signature, together with
The nature of a Constant instance is defined as its signature, together with
two integers: the topological position of the first Apply using that Constant instance,
and the lowest index into that Apply's inputs that refers to that Constant. (These two
integers are a surrogate for the id() of the Constant. The integers are important
because merge-able constants have the same signature, but require separate containers
in C code.)
in C code.) The membership in no_recycling is also included in the signature.
If the Op of any Apply in the Env does not have c_code_cache_ok()==True, then this
function raises a KeyError exception.
Output Signature
----------------
The outputs of a node are entirely determined by the node's Op and the nature of the
inputs, but the set of outputs that may be re-used by the computation (the elements of
self.no_recycling) can affect the code that is generated.
The format of each Op's output signature is simply a list of booleans, indicating
whether each output is in the no_recycling set.
"""
return self.cmodule_key_(self.env, self.no_recycling,
......@@ -797,68 +823,81 @@ class CLinker(link.Linker):
libraries=self.libraries()
)
@staticmethod
def cmodule_key_(env, no_recycling, compile_args=None, libraries=None):
def cmodule_key_(env, no_recycling, compile_args=[], libraries=[]):
"""
Do the actual computation of cmodule_key in a static method
to allow it to be reused in scalar.Composite.__eq__
"""
order = list(env.toposort())
env_computed_set = set()
env_inputs_dict = dict((i, [-1, pos]) for pos, i in enumerate(env.inputs))
#set of variables that have been computed by nodes we have
# seen 'so far' in the loop below
env_computed_set = set()
env_inputs_dict = dict((i, (-1, pos)) for pos, i in enumerate(env.inputs))
constant_ids = dict()
op_pos = {} # Apply -> topological position
rval = ['CLinker.cmodule_key'] # will be cast to tuple on return
if compile_args is not None: rval.append(tuple(compile_args))
if libraries is not None: rval.append(tuple(libraries))
version = []
# assert that every input to every node is one of'
# - an env input
# - an output from a node in the Env
# - a Constant
# first we put the header, compile_args, library names into the signature
sig = ['CLinker.cmodule_key'] # will be cast to tuple on return
if compile_args is not None: sig.append(tuple(compile_args))
if libraries is not None: sig.append(tuple(libraries))
def in_sig(i, topological_pos, i_idx):
# assert that every input to every node is one of'
# - an env input
# - an output from a node in the Env
# - a Constant
# It is important that a variable (i)
# yield a 'position' that reflects its role in code_gen()
def graphpos(i, topological_pos, i_idx):
rval = []
# It is important that a variable (i)
# yield a 'position' that reflects its role in code_gen()
if isinstance(i, graph.Constant): #orphans
if id(i) not in constant_ids:
constant_ids[id(i)] = [i.signature(), topological_pos, i_idx]
rval += constant_ids[id(i)]
constant_ids[id(i)] = (i.signature(), topological_pos, i_idx)
isig = constant_ids[id(i)]
#print 'SIGNATURE', i.signature()
#return i.signature()
elif i in env_inputs_dict: #inputs
rval += env_inputs_dict[i]
isig = env_inputs_dict[i]
else:
if i.owner is None:
assert all( all(out is not None for out in o.outputs) for o in order)
assert all( input.owner is None for input in env.inputs)
raise Exception('what is this?', (i, type(i), i.clients, env))
if i in env.outputs:
rval += [op_pos[i.owner], # outputs
isig = (op_pos[i.owner], # outputs
i.owner.outputs.index(i),
env.outputs.index(i)]
env.outputs.index(i))
else:
rval += [op_pos[i.owner], i.owner.outputs.index(i)] # temps
assert rval
rval.append(i in no_recycling)
return tuple(rval)
isig = (op_pos[i.owner], i.owner.outputs.index(i)) # temps
return (isig, i in no_recycling)
version = []
for node_pos, node in enumerate(order):
version.append(node.op.c_code_cache_version_apply(node))
for i in node.inputs:
version.append(i.type.c_code_cache_version())
for o in node.outputs:
version.append(o.type.c_code_cache_version())
rval.append((node.op, tuple((i.type, graphpos(i, node_pos, ipos))
for ipos,i in enumerate(node.inputs))))
#add the signature for this node
sig.append((
node.op,
tuple((i.type, in_sig(i, node_pos, ipos))
for ipos,i in enumerate(node.inputs)),
tuple(o in no_recycling for o in node.outputs)))
op_pos[node] = node_pos
env_computed_set.update(node.outputs)
#crystalize the signature and version
sig = tuple(sig)
version = tuple(version)
for v in version:
if not v: #one of the ops or types here is unversioned
return ((), tuple(rval))
return tuple(version), tuple(rval)
if not v:
# one of the ops or types here is unversioned,
# so this env is entirely unversioned
return ((), sig)
return version, sig
def compile_cmodule(self, location=None):
"""
......
......@@ -257,9 +257,13 @@ class ModuleCache(object):
warning(("The __eq__ and __hash__ functions are broken for some element"
" in the following two keys. The cache mechanism will say that"
" graphs like this need recompiling, when they could have been"
" retrieved):"))
" retrieved:"))
warning("Key 0:", k0)
warning("Entry 0:", self.entry_from_key[k0])
warning("hash 0:", hash(k0))
warning("Key 1:", k1)
warning("Entry 1:", self.entry_from_key[k1])
warning("hash 1:", hash(k1))
def refresh(self):
"""Update self.entry_from_key by walking the cache directory structure.
......
......@@ -260,15 +260,15 @@ def streamline(env, thunks, order, post_thunk_old_storage = None, no_recycling =
(len(thunks), len(post_thunk_old_storage)))
def streamline_default_f():
for x in no_recycling:
x[0] = None
try:
for thunk, node, old_storage in zip(thunks, order, post_thunk_old_storage):
thunk()
for old_s in old_storage:
old_s[0] = None
except:
raise_with_op(node)
for x in no_recycling:
x[0] = None
try:
for thunk, node, old_storage in zip(thunks, order, post_thunk_old_storage):
thunk()
for old_s in old_storage:
old_s[0] = None
except:
raise_with_op(node)
f = streamline_default_f
elif nice_errors:
thunk_node_list = zip(thunks, order)
......
......@@ -787,19 +787,17 @@ class Pow(BinaryScalarOp):
return "%(z)s = pow(%(x)s, %(y)s);" % locals()
def grad(self, (x, y), (gz, )):
if x.type in grad_types:
first_part = gz * y * x**(y - 1)
first_part = gz * y * x**(y - 1)
else:
first_part = None
first_part = None
if y.type in grad_types:
second_part = gz * log(x) * x**y
second_part = gz * log(x) * x**y
else:
second_part = None
second_part = None
return (first_part, second_part)
#return (gz * y * x**(y - 1) if x.type in grad_types else None,
# gz * log(x) * x**y if y.type in grad_types else None)
pow = Pow(upcast_out, name = 'pow')
class Clip(ScalarOp):
......
......@@ -180,7 +180,24 @@ def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
assert len(bcastable) == ndim
try:
return rtype(TensorType(dtype = x_.dtype, broadcastable = bcastable), x_, name=name)
if rtype is TensorConstant:
if 0:
# put the shape into the type
# This is disabled because if a tensor has shape, then the following fails:
# theano.lvector == as_tensor_variable([0,1]).type
# I think the solution is that we should implement something more like
# compatability instead of equality in our Type comparisons... but we're not
# there yet.
x_shape = x_.shape
else:
x_shape = None
return rtype(
TensorType(dtype = x_.dtype, broadcastable = bcastable, shape=x_shape),
x_, name=name)
else:
# leave the shape out of the type
return rtype(TensorType(dtype = x_.dtype, broadcastable = bcastable), x_, name=name)
except:
raise TypeError("Could not convert %s to TensorType" % x, type(x))
......@@ -236,7 +253,7 @@ class TensorType(Type):
When this is True, strict filtering rejects data containing NaN or Inf entries. (Used in `DebugMode`)
"""
def __init__(self, dtype, broadcastable, name = None):
def __init__(self, dtype, broadcastable, name = None, shape=None):
"""Initialize self.dtype and self.broadcastable.
:Parameters:
......@@ -256,6 +273,20 @@ class TensorType(Type):
self.broadcastable = tuple(broadcastable)
self.dtype_specs() # error checking is done there
self.name = name
if shape is None:
self.shape = tuple((1 if b else None) for b in self.broadcastable)
else:
self.shape = tuple(shape)
if len(self.shape) != len(self.broadcastable):
raise ValueError('shape and broadcastable must have equal lengths', (self.shape,
self.broadcastable))
def __setstate__(self, dct):
self.__dict__.update(dct)
#add shape when unpickling old pickled things
if 'shape' not in dct:
self.shape = tuple(1 if b else None for b in self.broadcastable)
def filter(self, data, strict = False):
"""Convert `data` to something which can be associated to a `TensorVariable`.
......@@ -273,6 +304,11 @@ class TensorType(Type):
raise TypeError("%s expected a ndarray object with %s dimensions (got %s)." % (self, self.ndim, data.ndim))
if self.filter_checks_isfinite and (not numpy.all(numpy.isfinite(data))):
raise TypeError("non-finite elements not allowed")
for si, di in zip(self.shape, data.shape):
if not (si is None or si == di):
raise TypeError('%s requires ndarray with shape matching %s (got %s)'%(
self, self.shape, data.shape))
return data
else:
data = numpy.asarray(data, dtype = self.dtype)
......@@ -311,7 +347,9 @@ class TensorType(Type):
def __eq__(self, other):
"""Compare True iff other is the same kind of TensorType"""
return type(self) == type(other) and other.dtype == self.dtype and other.broadcastable == self.broadcastable
return type(self) == type(other) and other.dtype == self.dtype \
and other.broadcastable == self.broadcastable \
and other.shape == self.shape
@staticmethod
def values_eq(a, b):
......@@ -382,7 +420,7 @@ class TensorType(Type):
def __hash__(self):
"""Hash equal for same kinds of TensorType"""
return hashtype(self) ^ hash(self.dtype) ^ hash(self.broadcastable)
return hashtype(self) ^ hash(self.dtype) ^ hash(self.broadcastable) ^ hash(self.shape)
ndim = property(lambda self: len(self.broadcastable), doc = "number of dimensions")
"""Number of dimensions
......@@ -405,6 +443,8 @@ class TensorType(Type):
def __str__(self):
if self.name:
return self.name
elif not all(None == si for si in self.shape):
return 'TensorType{%s, %s}' % (self.dtype, self.shape)
else:
b = self.broadcastable
named_broadcastable = {(): 'scalar',
......@@ -782,7 +822,6 @@ class _tensor_py_operators:
dtype = property(lambda self: self.type.dtype)
""" The dtype of this tensor. """
#extra pseudo-operator symbols
def __dot__(left, right): return dot(left, right)
def __rdot__(right, left): return dot(left, right)
......@@ -806,6 +845,14 @@ class _tensor_py_operators:
"""See `theano.tensor.var`"""
return var(self, axis)
def min(self, axis=None):
"""See `theano.tensor.min`"""
return min(self, axis)
def max(self, axis=None):
"""See `theano.tensor.max`"""
return max(self, axis)
#TO TRUMP NUMPY OPERATORS
__array_priority__ = 1000
......@@ -1051,11 +1098,25 @@ class Shape(Op):
out[0] = numpy.asarray(x.shape, dtype = 'int64')
def grad(self, (x,), (gz,)):
return [None]
@_redefine_asRoutine(Shape())
_shape = Shape()
@constructor
def shape(a):
pass
"""Return the shape tuple of a TensorType Variable, it may be either symbolic or nonsymbolic.
pprint.assign(shape, printing.MemberPrinter('shape'))
If the shape of the expression is not known at graph-construction time, then a symbolic
lvector will be returned, corresponding to the actual shape at graph-execution time.
"""
va = as_tensor_variable(a)
#print 'HERE', va, va.type
if None in va.type.shape:
# Some shape components are unknown at this time
return _shape(va)
else:
# all shape components are known at compile time, so we return
# a tuple directly. This tuple is like the numpy.ndarray.shape tuple.
return va.type.shape
pprint.assign(_shape, printing.MemberPrinter('shape'))
class MaxAndArgmax(Op):
......@@ -2352,7 +2413,7 @@ def get_vector_length(v):
return join.vec_length(v)
except ValueError:
pass
if v.owner and v.owner.op == shape:
if v.owner and v.owner.op == _shape:
return v.owner.inputs[0].type.ndim
raise ValueError("length not known")
......@@ -2806,6 +2867,11 @@ def grad(cost, wrt, g_cost=None, consider_constant=[], warn_type=False):
if not isinstance(cost, TensorVariable):
raise TypeError('In tensor.grad(), cost argument should be a TensorVariable.', cost)
if cost.type.ndim:
_warn('the passing of a non-scalar cost to theano.tensor.grad() is deprecated.'
' Use the lower-level '
'theano.gradient if you really want to do this')
if g_cost is None:
g_cost = ones_like(cost)
inputs = gof.graph.inputs([cost])
......
......@@ -18,6 +18,7 @@ from theano import compile #to register the optimizer built by this file
from theano.tensor.blas_headers import cblas_header_text, blas_header_text
_logger = logging.getLogger('theano.tensor.blas')
_logger.setLevel(logging.INFO)
def debug(*msg):
    """Log the space-joined string forms of *msg* at DEBUG level."""
    _logger.debug(' '.join(str(m) for m in msg))


def info(*msg):
    """Log the space-joined string forms of *msg* at INFO level."""
    _logger.info(' '.join(str(m) for m in msg))


def warn(*msg):
    """Log the space-joined string forms of *msg* at WARNING level."""
    _logger.warn(' '.join(str(m) for m in msg))
......@@ -604,10 +605,15 @@ class Dot22(GemmRelated):
This is a specialization of the more general Dot()
"""
def make_node(self, x, y):
assert _is_real_matrix(x)
assert y.type == x.type #makes sure y is a matrix
if not _is_real_matrix(x):
raise TypeError(x)
if not _is_real_matrix(x):
raise TypeError(y)
if y.type.dtype != x.type.dtype:
raise TypeError('dtype mismatch to Dot22')
out_shape = (x.type.shape[0], y.type.shape[1])
bz = [False, False]
outputs = [T.tensor(x.type.dtype, bz)]
outputs = [T.tensor(x.type.dtype, bz, shape=out_shape)]
return Apply(self, [x,y], outputs)
def perform(self, node, (x, y), (z, )):
......@@ -660,10 +666,10 @@ _dot22 = Dot22()
def local_dot_to_dot22(node):
if node.op == T.dot:
x,y = node.inputs
if _is_real_matrix(x) and y.type == x.type:
if _is_real_matrix(x) and _is_real_matrix(y) and y.type.dtype == x.type.dtype:
return [_dot22(*node.inputs)]
else:
info('Not optimizing dot with inputs', x, y)
info('Not optimizing dot with inputs', x, y, x.type, y.type)
else:
return False
register_specialize(local_dot_to_dot22)
......
......@@ -142,9 +142,6 @@ class SoftmaxWithBias(gof.Op):
return ['<iostream>','<cmath>']
@staticmethod
def c_code_cache_version():
return (4,)
@staticmethod
def c_code_template():
# this implementation was lifted from
# /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx
......@@ -180,7 +177,7 @@ class SoftmaxWithBias(gof.Op):
}
if ((%(x)s->dimensions[1] != %(b)s->dimensions[0]))
{
PyErr_Format(PyExc_ValueError, "number of columns in x (%%i) does not match length of b (%%i)",
PyErr_Format(PyExc_ValueError, "number of columns in x (%%zi) does not match length of b (%%zi)",
%(x)s->dimensions[1], %(b)s->dimensions[0]);
%(fail)s;
}
......@@ -236,20 +233,6 @@ class SoftmaxWithBias(gof.Op):
sum += sm_ij;
sm_i[j * Ssm] = sm_ij;
}
//std::cout << "\\n";
if (std::isinf(sum))
{
//that was our best...
PyErr_SetString(PyExc_ValueError, "softmax is impossible (inf)!");
%(fail)s;
}
if (0.0 == sum)
{
//that was our best...
PyErr_SetString(PyExc_ValueError, "softmax is impossible (zero)!");
%(fail)s;
}
//cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n);
double sum_inv = 1.0 / sum;
......@@ -271,6 +254,10 @@ class SoftmaxWithBias(gof.Op):
code_template = ''.join(self.c_code_template())
return code_template % dict(locals(), **sub)
@staticmethod
def c_code_cache_version():
return (5,)
softmax_with_bias = SoftmaxWithBias()
......
......@@ -196,20 +196,20 @@ def local_shape_lift_sum(node):
register_canonicalize(local_shape_lift_sum, 'shape_lift')
@gof.local_optimizer([T.shape, T.dot])
@gof.local_optimizer([T._shape, T.dot])
def local_shape_lift_dot(node):
"""
shape(dot(a, b)) -> [shape(a)[0], shape(b)[1]]
"""
if not opt.check_chain(node, T.shape, T.dot):
if not opt.check_chain(node, T._shape, T.dot):
return False
a, b = node.inputs[0].owner.inputs
if a.type.ndim == 2 and b.type.ndim == 2:
return T.make_lvector.make_node(T.shape(a)[0], T.shape(b)[1]).outputs
return T.make_lvector.make_node(T._shape(a)[0], T._shape(b)[1]).outputs
elif a.type.ndim == 1 and b.type.ndim == 2:
return T.make_lvector.make_node(T.shape(b)[1]).outputs
return T.make_lvector.make_node(T._shape(b)[1]).outputs
elif a.type.ndim == 2 and b.type.ndim == 1:
return T.make_lvector.make_node(T.shape(a)[0]).outputs
return T.make_lvector.make_node(T._shape(a)[0]).outputs
elif a.type.ndim == 1 and b.type.ndim == 1:
return T.make_lvector.make_node().outputs
else:
......
......@@ -163,7 +163,8 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(env)
assert env.outputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias
assert str(env.outputs[0].owner.op) == 'OutputGuard'
assert env.outputs[0].owner.inputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias
def test_softmax_optimizations_w_bias(self):
x = tensor.matrix('x')
......@@ -186,9 +187,10 @@ class T_CrossentropyCategorical1Hot(unittest.TestCase):
theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(env)
assert len(env.toposort()) == 1
assert len(env.toposort()) == 2
assert env.outputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias
assert str(env.outputs[0].owner.op) == 'OutputGuard'
assert env.outputs[0].owner.inputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias
def test_softmax_grad_optimizations(self):
......@@ -249,7 +251,7 @@ def test_argmax_pushdown():
#print 'AFTER'
#for node in env.toposort():
#print node.op
assert len(env.toposort()) == 1
assert len(env.toposort()) == 2 # an output_guard is second
assert env.toposort()[0].op == tensor._max_and_argmax
def test_argmax_pushdown_bias():
......@@ -263,10 +265,14 @@ def test_argmax_pushdown_bias():
theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(env)
#print 'AFTER'
#for node in env.toposort():
#print node.op
assert len(env.toposort()) == 3
print 'AFTER'
for node in env.toposort():
print node.op
assert len(env.toposort()) == 4
assert isinstance(env.toposort()[0].op, tensor.DimShuffle)
assert isinstance(env.toposort()[1].op, tensor.Elemwise)
assert isinstance(env.toposort()[2].op, tensor.MaxAndArgmax)
assert str(env.toposort()[3].op) == 'OutputGuard'
def test_asymptotic_32():
"""
......
......@@ -246,16 +246,20 @@ class test_canonize(unittest.TestCase):
#We must be sure that the Canonizer is working, but that we don't have other
# optimisation that could hide bug in the Canonizer as local_elemwise_fusion
mode=compile.mode.predefined_modes[compile.mode.default_mode]
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
for id, [g, sym_inputs, val_inputs, nb_elemwise, out_dtype] in enumerate(cases):
f = compile.function(list(sym_inputs), g,
#we need the optimisation enabled, debug do this.
mode=mode)
out = f(*val_inputs)
assert(len(f.maker.env.toposort())==nb_elemwise)
assert(out_dtype==out.dtype)
old_optimizer = mode._optimizer
try:
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
for id, [g, sym_inputs, val_inputs, nb_elemwise, out_dtype] in enumerate(cases):
f = compile.function(list(sym_inputs), g,
#we need the optimisation enabled, debug do this.
mode=mode)
out = f(*val_inputs)
assert(len(f.maker.env.toposort())==nb_elemwise)
assert(out_dtype==out.dtype)
finally:
mode._optimizer = old_optimizer
def test_elemwise_multiple_inputs_optimisation2(self):
"""
......@@ -367,130 +371,134 @@ class test_canonize(unittest.TestCase):
#We must be sure that the Canonizer is working, but that we don't have other
# optimisation that could hide bug in the Canonizer as local_elemwise_fusion
mode=compile.mode.predefined_modes[compile.mode.default_mode]
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
#test x / x -> 1
for id, (g, sym_inputs, val_inputs, out_dtype) in enumerate([(fx/fx,[fx],[fxv],'float32'),
(dx/dx,[dx],[dxv],'float64'),
(fv/fv,[fv],[fvv],'float32'),
(dv/dv,[dv],[dvv],'float64'),
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert (out==numpy.ones(shp, dtype=out_dtype)).all()
topo=f.maker.env.toposort()
assert len(topo)==1
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Second)
assert len(topo[0].inputs)==2
assert(out_dtype==out.dtype)
#test (x * y) / x -> y
for id,(g, sym_inputs, val_inputs, nb_elemwise, out_dtype) in enumerate([
((dx*dy)/dx,[dx,dy],[dxv,dyv],0,'float64'),
((fx*fy)/fx,[fx,fy],[fxv,fyv],0,'float32'),
((dv*dy)/dv,[dv,dy],[dvv,dyv],0,'float64'),
((fv*fy)/fv,[fv,fy],[fvv,fyv],0,'float32'),
#must broadcast as their is a dimshuffle in the computation
((dx*dv)/dx,[dx,dv],[dxv,dvv],1,'float64'),
#topo: [Elemwise{second,no_inplace}(x, <TensorType(float64, row)>)]
((fx*fv)/fx,[fx,fv],[fxv,fvv],1,'float32')
#topo: [Elemwise{second,no_inplace}(x, <TensorType(float32, row)>)]
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,val_inputs[1])
topo=f.maker.env.toposort()
assert len(topo)==nb_elemwise
assert(out_dtype==out.dtype)
#test x / y / x -> 1 / y
for id,(g, sym_inputs, val_inputs, nb_elemwise, out_dtype) in enumerate([
((dx/dy)/dx,[dx,dy],[dxv,dyv],1,'float64'),
((fx/fy)/fx,[fx,fy],[fxv,fyv],1,'float32'),
((dv/dy)/dv,[dv,dy],[dvv,dyv],1,'float64'),
((fv/fy)/fv,[fv,fy],[fvv,fyv],1,'float32'),
#must broadcast as their is a dimshuffle in the computation
((dx/dv)/dx,[dx,dv],[dxv,dvv],2,'float64'),
#topo: [Elemwise{inv,no_inplace}(<TensorType(float64, row)>), Elemwise{second,no_inplace}(x, Elemwise{inv,no_inplace}.0)]
((fx/fv)/fx,[fx,fv],[fxv,fvv],2,'float32'),
#topo:[Elemwise{inv,no_inplace}(<TensorType(float32, row)>), Elemwise{second,no_inplace}(x, Elemwise{inv,no_inplace}.0)]
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,(1/val_inputs[1]))
topo=f.maker.env.toposort()
assert len(topo)==nb_elemwise
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,(theano.scalar.basic.Inv, theano.scalar.basic.TrueDiv))
assert(out_dtype==out.dtype)
#test (a / b) * (b / c) * (c / d) -> a / d
for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
((dx / dy) * (dy / dz) * (dz / dw),[dx,dy,dz,dw],[dxv,dyv,dzv,dwv],'float64'),
((fx / fy) * (fy / fz) * (fz / fw),[fx,fy,fz,fw],[fxv,fyv,fzv,fwv],'float32'),
((dv / dy) * (dy / dz) * (dz / dw),[dv,dy,dz,dw],[dvv,dyv,dzv,dwv],'float64'),
((fv / fy) * (fy / fz) * (fz / fw),[fv,fy,fz,fw],[fvv,fyv,fzv,fwv],'float32'),
((dx / dv) * (dv / dz) * (dz / dw),[dx,dv,dz,dw],[dxv,dvv,dzv,dwv],'float64'),
((fx / fv) * (fv / fz) * (fz / fw),[fx,fv,fz,fw],[fxv,fvv,fzv,fwv],'float32'),
((dx / dy) * (dy / dv) * (dv / dw),[dx,dy,dv,dw],[dxv,dyv,dvv,dwv],'float64'),
((fx / fy) * (fy / fv) * (fv / fw),[fx,fy,fv,fw],[fxv,fyv,fvv,fwv],'float32'),
((dx / dy) * (dy / dz) * (dz / dv),[dx,dy,dz,dv],[dxv,dyv,dzv,dvv],'float64'),
((fx / fy) * (fy / fz) * (fz / fv),[fx,fy,fz,fv],[fxv,fyv,fzv,fvv],'float32'),
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,(val_inputs[0]/val_inputs[3]))
topo=f.maker.env.toposort()
assert len(topo)==1
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.TrueDiv)
assert len(topo[0].inputs)==2
assert(out_dtype==out.dtype)
#test (2.0 * x) / (4.0 * y) -> (0.5 * x) / y
for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
(((2.0*dx)/(4.0*dy)),[dx,dy],[dxv,dyv],'float64'),
(((2.0*fx)/(4.0*fy)),[fx,fy],[fxv,fyv],'float32'),
(((2.0*dv)/(4.0*dy)),[dv,dy],[dvv,dyv],'float64'),
(((2.0*fv)/(4.0*fy)),[fv,fy],[fvv,fyv],'float32'),
(((2.0*dx)/(4.0*dv)),[dx,dv],[dxv,dvv],'float64'),
(((2.0*fx)/(4.0*fv)),[fx,fv],[fxv,fvv],'float32'),
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,(0.5*val_inputs[0]/val_inputs[1]))
topo=f.maker.env.toposort()
assert len(topo)==2
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Mul)
assert len(topo[0].inputs)==2
assert isinstance(topo[1].op,(T.Elemwise,))
assert isinstance(topo[1].op.scalar_op,theano.scalar.basic.TrueDiv)
assert len(topo[1].inputs)==2
assert(out_dtype==out.dtype)
#test 2 * x / 2 -> x
for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
((2*dx)/2,[dx],[dxv],'float64'),
((2*fx)/2,[fx],[fxv],'float32'),
((2*dv)/2,[dv],[dvv],'float64'),
((2*fv)/2,[fv],[fvv],'float32'),
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,val_inputs[0])
topo=f.maker.env.toposort()
assert len(topo)==0
assert(out_dtype==out.dtype)
old_optimizer = mode._optimizer
try:
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
#test x / x -> 1
for id, (g, sym_inputs, val_inputs, out_dtype) in enumerate([(fx/fx,[fx],[fxv],'float32'),
(dx/dx,[dx],[dxv],'float64'),
(fv/fv,[fv],[fvv],'float32'),
(dv/dv,[dv],[dvv],'float64'),
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert (out==numpy.ones(shp, dtype=out_dtype)).all()
topo=f.maker.env.toposort()
assert len(topo)==1
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Second)
assert len(topo[0].inputs)==2
assert(out_dtype==out.dtype)
#test (x * y) / x -> y
for id,(g, sym_inputs, val_inputs, nb_elemwise, out_dtype) in enumerate([
((dx*dy)/dx,[dx,dy],[dxv,dyv],0,'float64'),
((fx*fy)/fx,[fx,fy],[fxv,fyv],0,'float32'),
((dv*dy)/dv,[dv,dy],[dvv,dyv],0,'float64'),
((fv*fy)/fv,[fv,fy],[fvv,fyv],0,'float32'),
#must broadcast as their is a dimshuffle in the computation
((dx*dv)/dx,[dx,dv],[dxv,dvv],1,'float64'),
#topo: [Elemwise{second,no_inplace}(x, <TensorType(float64, row)>)]
((fx*fv)/fx,[fx,fv],[fxv,fvv],1,'float32')
#topo: [Elemwise{second,no_inplace}(x, <TensorType(float32, row)>)]
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,val_inputs[1])
topo=f.maker.env.toposort()
assert len(topo)==nb_elemwise
assert(out_dtype==out.dtype)
#test x / y / x -> 1 / y
for id,(g, sym_inputs, val_inputs, nb_elemwise, out_dtype) in enumerate([
((dx/dy)/dx,[dx,dy],[dxv,dyv],1,'float64'),
((fx/fy)/fx,[fx,fy],[fxv,fyv],1,'float32'),
((dv/dy)/dv,[dv,dy],[dvv,dyv],1,'float64'),
((fv/fy)/fv,[fv,fy],[fvv,fyv],1,'float32'),
#must broadcast as their is a dimshuffle in the computation
((dx/dv)/dx,[dx,dv],[dxv,dvv],2,'float64'),
#topo: [Elemwise{inv,no_inplace}(<TensorType(float64, row)>), Elemwise{second,no_inplace}(x, Elemwise{inv,no_inplace}.0)]
((fx/fv)/fx,[fx,fv],[fxv,fvv],2,'float32'),
#topo:[Elemwise{inv,no_inplace}(<TensorType(float32, row)>), Elemwise{second,no_inplace}(x, Elemwise{inv,no_inplace}.0)]
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,(1/val_inputs[1]))
topo=f.maker.env.toposort()
assert len(topo)==nb_elemwise
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,(theano.scalar.basic.Inv, theano.scalar.basic.TrueDiv))
assert(out_dtype==out.dtype)
#test (a / b) * (b / c) * (c / d) -> a / d
for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
((dx / dy) * (dy / dz) * (dz / dw),[dx,dy,dz,dw],[dxv,dyv,dzv,dwv],'float64'),
((fx / fy) * (fy / fz) * (fz / fw),[fx,fy,fz,fw],[fxv,fyv,fzv,fwv],'float32'),
((dv / dy) * (dy / dz) * (dz / dw),[dv,dy,dz,dw],[dvv,dyv,dzv,dwv],'float64'),
((fv / fy) * (fy / fz) * (fz / fw),[fv,fy,fz,fw],[fvv,fyv,fzv,fwv],'float32'),
((dx / dv) * (dv / dz) * (dz / dw),[dx,dv,dz,dw],[dxv,dvv,dzv,dwv],'float64'),
((fx / fv) * (fv / fz) * (fz / fw),[fx,fv,fz,fw],[fxv,fvv,fzv,fwv],'float32'),
((dx / dy) * (dy / dv) * (dv / dw),[dx,dy,dv,dw],[dxv,dyv,dvv,dwv],'float64'),
((fx / fy) * (fy / fv) * (fv / fw),[fx,fy,fv,fw],[fxv,fyv,fvv,fwv],'float32'),
((dx / dy) * (dy / dz) * (dz / dv),[dx,dy,dz,dv],[dxv,dyv,dzv,dvv],'float64'),
((fx / fy) * (fy / fz) * (fz / fv),[fx,fy,fz,fv],[fxv,fyv,fzv,fvv],'float32'),
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,(val_inputs[0]/val_inputs[3]))
topo=f.maker.env.toposort()
assert len(topo)==1
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.TrueDiv)
assert len(topo[0].inputs)==2
assert(out_dtype==out.dtype)
#test (2.0 * x) / (4.0 * y) -> (0.5 * x) / y
for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
(((2.0*dx)/(4.0*dy)),[dx,dy],[dxv,dyv],'float64'),
(((2.0*fx)/(4.0*fy)),[fx,fy],[fxv,fyv],'float32'),
(((2.0*dv)/(4.0*dy)),[dv,dy],[dvv,dyv],'float64'),
(((2.0*fv)/(4.0*fy)),[fv,fy],[fvv,fyv],'float32'),
(((2.0*dx)/(4.0*dv)),[dx,dv],[dxv,dvv],'float64'),
(((2.0*fx)/(4.0*fv)),[fx,fv],[fxv,fvv],'float32'),
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,(0.5*val_inputs[0]/val_inputs[1]))
topo=f.maker.env.toposort()
assert len(topo)==2
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Mul)
assert len(topo[0].inputs)==2
assert isinstance(topo[1].op,(T.Elemwise,))
assert isinstance(topo[1].op.scalar_op,theano.scalar.basic.TrueDiv)
assert len(topo[1].inputs)==2
assert(out_dtype==out.dtype)
#test 2 * x / 2 -> x
for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
((2*dx)/2,[dx],[dxv],'float64'),
((2*fx)/2,[fx],[fxv],'float32'),
((2*dv)/2,[dv],[dvv],'float64'),
((2*fv)/2,[fv],[fvv],'float32'),
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,val_inputs[0])
topo=f.maker.env.toposort()
assert len(topo)==0
assert(out_dtype==out.dtype)
finally:
mode._optimizer = old_optimizer
def test_multiple_case_that_fail(self):
......@@ -510,43 +518,48 @@ class test_canonize(unittest.TestCase):
#We must be sure that the Canonizer is working, but that we don't have other
# optimisation that could hide bug in the Canonizer as local_elemwise_fusion
mode=compile.mode.predefined_modes[compile.mode.default_mode]
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
#test fail!
#test x / y / z -> x / (y * z)
for (g, sym_inputs, val_inputs, out_dtype) in [
((dx/dy)/dz,[dx,dy,dz],[dxv,dyv,dzv],'float64'),
((fx/fy)/fz,[fx,fy,fz],[fxv,fyv,fzv],'float32')
]:
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,val_inputs[0]/val_inputs[1]/val_inputs[2])
topo=f.maker.env.toposort()
print topo
assert len(topo)==2
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Inv)
assert len(topo[0].inputs)==1
assert(out_dtype==out.dtype)
#test x / (y / z) -> (x * z) / y
for (g, sym_inputs, val_inputs, out_dtype) in [
(dx/(dy/dz),[dx,dy,dz],[dxv,dyv,dzv],'float64'),
(fx/(fy/fz),[fx,fy,fz],[fxv,fyv,fzv],'float32')
]:
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,val_inputs[0]/(val_inputs[1]/val_inputs[2]))
topo=f.maker.env.toposort()
print topo
assert len(topo)==2
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Inv)
assert len(topo[0].inputs)==1
assert(out_dtype==out.dtype)
old_optimizer = mode._optimizer
try:
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
#test fail!
#test x / y / z -> x / (y * z)
for (g, sym_inputs, val_inputs, out_dtype) in [
((dx/dy)/dz,[dx,dy,dz],[dxv,dyv,dzv],'float64'),
((fx/fy)/fz,[fx,fy,fz],[fxv,fyv,fzv],'float32')
]:
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,val_inputs[0]/val_inputs[1]/val_inputs[2])
topo=f.maker.env.toposort()
print topo
assert len(topo)==2
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Inv)
assert len(topo[0].inputs)==1
assert(out_dtype==out.dtype)
#test x / (y / z) -> (x * z) / y
for (g, sym_inputs, val_inputs, out_dtype) in [
(dx/(dy/dz),[dx,dy,dz],[dxv,dyv,dzv],'float64'),
(fx/(fy/fz),[fx,fy,fz],[fxv,fyv,fzv],'float32')
]:
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,val_inputs[0]/(val_inputs[1]/val_inputs[2]))
topo=f.maker.env.toposort()
print topo
assert len(topo)==2
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Inv)
assert len(topo[0].inputs)==1
assert(out_dtype==out.dtype)
finally:
mode._optimizer = old_optimizer
def test_dont_merge_if_multiple_client(self):
""" test those case take from the comment in Canonizer
......@@ -571,10 +584,16 @@ def test_local_shape_lift_dot():
for y in [fvector, fmatrix]:
i = x()
j = y()
print 'I SHAPE', i.type.shape
print 'J SHAPE', j.type.shape
d = shape(dot(i,j))
g = Env([i,j], [d])
gof.TopoOptimizer(gof.LocalOptGroup(local_shape_lift_dot), order='out_to_in').optimize(g)
assert pprint(g.outputs[0]) == args_to_result[(x,y)]
if x is fvector and y is fvector:
assert d == ()
else:
g = Env([i,j], [d])
gof.TopoOptimizer(gof.LocalOptGroup(local_shape_lift_dot), order='out_to_in').optimize(g)
print pprint(g.outputs[0]), args_to_result[(x,y)]
assert pprint(g.outputs[0]) == args_to_result[(x,y)]
# def test_plusmin(self):
# x, y, z = inputs()
......@@ -982,23 +1001,27 @@ class test_fusion(unittest.TestCase):
#Follow up. Clinker do the same... second cause?
mode2=compile.Mode(linker(), copy.copy(compile.mode.OPT_FAST_RUN))
# mode2=copy.copy(compile.mode.predefined_modes['FAST_RUN'])
mode2._optimizer=mode2._optimizer.excluding('local_elemwise_fusion')
# mode2=compile.Mode(gof.OpWiseCLinker(allow_gc=True), compile.mode.OPT_FAST_COMPILE)
if s is None:
s=slice(0,49)
#s=slice(49,59)
nb_repeat=10
print "test with linker", str(linker)
times1=self.do(mode1, shared_fn, shp, gpu=gpu, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
times2=self.do(mode2, shared_fn, shp, gpu=gpu, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
print "times1 FAST_RUN optimisation"
print times1, times1.min(), times1.max(), times1.sum()
print "times2 FAST_RUN optimisation without local_elemwise_fusion"
print times2, times2.min(), times2.max(), times2.sum()
d=times2/times1
# d.sort()
print "times2/times1",d,d.min(), d.max(), d.mean(), d.std()
old_optimizer = mode2._optimizer
try:
mode2._optimizer=mode2._optimizer.excluding('local_elemwise_fusion')
# mode2=compile.Mode(gof.OpWiseCLinker(allow_gc=True), compile.mode.OPT_FAST_COMPILE)
if s is None:
s=slice(0,49)
#s=slice(49,59)
nb_repeat=10
print "test with linker", str(linker)
times1=self.do(mode1, shared_fn, shp, gpu=gpu, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
times2=self.do(mode2, shared_fn, shp, gpu=gpu, nb_repeat=nb_repeat, assert_len_topo=False,slice=s)
print "times1 FAST_RUN optimisation"
print times1, times1.min(), times1.max(), times1.sum()
print "times2 FAST_RUN optimisation without local_elemwise_fusion"
print times2, times2.min(), times2.max(), times2.sum()
d=times2/times1
# d.sort()
print "times2/times1",d,d.min(), d.max(), d.mean(), d.std()
finally:
mode2._optimizer = old_optimizer
def speed_fusion_gpu(self):
import theano_cuda_ndarray as tcn
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论