Commit af804be5, authored by James Bergstra

merge

......@@ -197,6 +197,11 @@ class BadOptimization(DebugModeError):
print >> ssio, " Mean Abs Diff: ", numpy.mean(numpy.absolute(nv-ov))
print >> ssio, " Median Abs Diff: ", numpy.median(numpy.absolute(nv-ov))
print >> ssio, " Std Abs Diff: ", numpy.std(numpy.absolute(nv-ov))
reldiff = numpy.absolute(nv-ov) / (numpy.absolute(nv)+numpy.absolute(ov))
print >> ssio, " Max Rel Diff: ", numpy.max(reldiff)
print >> ssio, " Mean Rel Diff: ", numpy.mean(reldiff)
print >> ssio, " Median Rel Diff: ", numpy.median(reldiff)
print >> ssio, " Std Rel Diff: ", numpy.std(reldiff)
# only if all succeeds to we add anything to sio
print >> sio, ssio.getvalue()
except:
......@@ -349,14 +354,17 @@ def debugprint(r, prefix='', depth=-1, done=None, file=sys.stdout):
# this variable is the output of computation,
# so just print out the apply
a = r.owner
print >> file, prefix, a.op, id(a)
if len(a.outputs) == 1:
print >> file, '%s%s [@%i]' % (prefix, a.op, id(r))
else:
print >> file, '%s%s.%i [@%i]' % (prefix, a.op, a.outputs.index(r), id(r))
if id(a) not in done:
done.add(id(a))
for i in a.inputs:
debugprint(i, prefix+' ', depth=depth-1, done=done, file=file)
debugprint(i, prefix+' |', depth=depth-1, done=done, file=file)
else:
#this is a variable
print >> file, prefix, r, id(r)
print >> file, '%s%s [@%i]' % (prefix, r, id(r))
return file
......
......@@ -116,7 +116,7 @@ class AddDestroyHandler(gof.Optimizer):
for o in env.outputs:
try:
env.replace_validate(o, _output_guard(o), reason='output_guard')
_logger.warning("Output variable %s required output_guard,"
_logger.info("Output variable %s required output_guard,"
" how was this output left unprotected against destructive operations?"
% o)
except gof.InconsistencyError:
......@@ -127,12 +127,22 @@ class AddDestroyHandler(gof.Optimizer):
env.extend(gof.DestroyHandler())
# The canonical optimization pipeline.  Each entry is registered at a float
# position; positions fix the execution order regardless of which tags a
# query selects.  (The diff residue registered several names twice — once
# with the old positions and once with the new ones — which would collide;
# only the new registrations are kept.)
optdb = gof.SequenceDB()
optdb.register('merge1', gof.MergeOptimizer(),
        0, 'fast_run', 'fast_compile')
optdb.register('canonicalize', gof.EquilibriumDB(),  # rearranges elemwise expressions
        1, 'fast_run')
optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=True),
        1.2, 'fast_run', 'fast_compile')
optdb.register('stabilize', gof.EquilibriumDB(),  # replace unstable subgraphs
        1.5, 'fast_run')
optdb.register('specialize', gof.EquilibriumDB(),  # misc special cases for speed
        2, 'fast_run')
optdb.register('merge2', gof.MergeOptimizer(),  # especially constant merge
        49, 'fast_run')
optdb.register('add_destroy_handler', AddDestroyHandler(),
        49.5, 'fast_run', 'inplace')
optdb.register('merge3', gof.MergeOptimizer(),  # final pass just to make sure
        100, 'fast_run')
class Mode(object):
......@@ -153,6 +163,12 @@ class Mode(object):
def __init__(self, linker = config.linker, optimizer = config.optimizer):
    # Delegates to __setstate__ so construction and unpickling share one path.
    self.__setstate__((linker, optimizer))

# NOTE(review) — attribute map, per the code visible here; confirm against the
# rest of the class:
#self.provided_optimizer - typically the `optimizer` arg.  But if the `optimizer` arg is
#    a keyword corresponding to a predefined Query, then this stores the query
#self._optimizer - typically same as provided_optimizer??
#self.__get_optimizer - returns self._optimizer (possibly querying optdb with self._optimizer)
#self.optimizer - property that returns __get_optimizer()

def __getstate__(self):
    # Pickled state mirrors the constructor arguments.
    return (self.provided_linker, self.provided_optimizer)
......@@ -218,7 +234,7 @@ predefined_modes = {'FAST_COMPILE': FAST_COMPILE,
def get_mode(string):
    """Return the Mode named by `string`.

    :param string: a key of `predefined_modes`, None (use config.mode),
        or an already-constructed Mode instance (returned unchanged).
    :raises Exception: if `string` is a str but not a predefined mode name.
    """
    if string is None:
        string = config.mode
    if not isinstance(string, str):
        return string  # it is hopefully already a mode...
    # `in` replaces the Python-2-only dict.has_key; also fixes the garbled
    # error message ("predefixed mode exist").
    if string not in predefined_modes:
        raise Exception("No predefined mode exists for string: %s" % string)
    return predefined_modes[string]
......
......@@ -197,12 +197,19 @@ class _metadict:
class MergeOptimizer(Optimizer):
"""WRITEME
Merges parts of the graph that are identical, i.e. parts that
take the same inputs and carry out the asme computations so we
can avoid doing them more than once. Also merges variables that
are constant.
"""
Merges parts of the graph that are identical and redundant.
The basic principle is that if two Applies have ops that compare equal, and identical
inputs, then they do not both need to be computed. The clients of one are transfered to
the other and one of them is removed from the graph. This procedure is carried out in
input->output order through the graph.
The first step of merging is constant-merging, so that all clients of an int(1) for example,
are transfered to a particular instance of int(1).
"""
def __init__(self, skip_const_merge=False):
    # skip_const_merge: when True, apply() will not run the constant-merge
    # pass (see apply()); node merging still runs.
    self.skip_const_merge = skip_const_merge

def add_requirements(self, env):
    # Merging works by replacing variables, so the env needs the
    # replace-with-validation feature installed.
    env.extend(toolbox.ReplaceValidate())
......@@ -230,41 +237,6 @@ class MergeOptimizer(Optimizer):
const_sig[c] = sig
const_sig_inv[sig] = c
def exptime_apply_node_merge(self, env):
    """Merge duplicate Apply nodes in `env` by structural identity.

    Each variable gets an identifier; an Apply's identifier is its op plus
    the identifiers of its inputs.  Two Applies with equal identifiers are
    duplicates: the outputs of one are replaced by the outputs of the other.
    (Name suggests exponential time — TODO confirm why; see node_cid tuples.)
    """
    # we clear the dicts because the Constants signatures are not necessarily hashable
    # and it's more efficient to give them an integer like the other Variables
    symbol_idx = {} #variable -> int
    symbol_idx_inv = {} #int -> variable (inverse of symbol_idx)

    #add all graph sources to the symbol_idx dictionaries (arbitrary order)
    for i, r in enumerate(r for r in env.variables if r.owner is None):
        symbol_idx[r] = i
        symbol_idx_inv[i] = r

    for node in _list_of_nodes(env):
        # identifier for this Apply: (op, identifiers of all inputs)
        node_cid = (node.op, tuple([symbol_idx[input] for input in node.inputs]))
        #print 'NODE', node, node_cid
        dup = symbol_idx_inv.get(node_cid, None)
        success = False
        if dup is not None:
            success = True
            pairs = zip(node.outputs, dup.outputs)
            for output, new_output in pairs:
                # keep a human-readable name if the survivor has none
                if output.name and not new_output.name:
                    new_output.name = output.name
            try:
                env.replace_all_validate(pairs, reason='Merge (exptime)')
            except InconsistencyError, e:
                # replacement rejected; fall through and register this node
                success = False
        if not success:
            # node was not merged: record it and its outputs so later
            # duplicates can be merged into it
            symbol_idx[node] = node_cid
            symbol_idx_inv[node_cid] = node
            for i, output in enumerate(node.outputs):
                ref = (i, node_cid)
                symbol_idx[output] = ref
                symbol_idx_inv[ref] = output
def apply_node_merge(self, env):
# we clear the dicts because the Constants signatures are not necessarily hashable
# and it's more efficient to give them an integer like the other Variables
......@@ -316,7 +288,8 @@ class MergeOptimizer(Optimizer):
#TODO: Consider splitting this into a separate optimizer (SeqOptimizer)
def apply(self, env):
    """Run the merge passes on `env`.

    Constant merging runs first, unless this optimizer was built with
    skip_const_merge=True; Apply-node merging always runs afterwards.
    (The diff residue contained both the old unconditional call and the new
    conditional one, which would have merged constants twice.)
    """
    if not self.skip_const_merge:
        self.apply_constant_merge(env)
    self.apply_node_merge(env)
merge_optimizer = MergeOptimizer()
......@@ -541,7 +514,7 @@ class PatternSub(LocalOptimizer):
PatternSub((subtract, (add, 'x', 'y'), 'y'), 'x')
PatternSub((power, 'x', Constant(double, 2.0)), (square, 'x'))
PatternSub((boggle, {'pattern': 'x',
'constraint': lambda env, expr: expr.type == scrabble}),
'constraint': lambda expr: expr.type == scrabble}),
(scrabble, 'x'))
"""
......@@ -789,7 +762,10 @@ class NavigatorOptimizer(Optimizer):
raise
if replacements is False or replacements is None:
return False
assert len(node.outputs) == len(replacements)
if not isinstance(replacements, (tuple, list)):
raise TypeError('Optimizer %s gave wrong type of replacement' % lopt)
if len(node.outputs) != len(replacements):
raise ValueError('Optimizer %s gave wrong number of replacements' % lopt)
repl_pairs = zip(node.outputs, replacements)
try:
env.replace_all_validate(repl_pairs, reason=lopt)
......@@ -904,8 +880,13 @@ class EquilibriumOptimizer(NavigatorOptimizer):
max_depth = None,
max_use_ratio = None):
"""
:param local_optimizers: list or set of local optimizations to apply until
equilibrium.
:param max_use_ratio: each optimizer can be applied at most (size of graph * this number)
:param max_depth: TODO what does this do? (EquilibriumDB sets it to 5)
"""
super(EquilibriumOptimizer, self).__init__(
......@@ -916,6 +897,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.local_optimizers = local_optimizers
self.max_depth = max_depth
self.max_use_ratio = max_use_ratio
assert self.max_use_ratio is not None, 'max_use_ratio has to be a number'
def apply(self, env, start_from = None):
if start_from is None:
......@@ -960,7 +942,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
changed |= lopt_change
finally:
self.detach_updater(env, u)
self.detach_updater(env, u)
self.detach_updater(env, u) #TODO: erase this line, it's redundant at best
if max_use_abort:
print >> sys.stderr, "WARNING: EquilibriumOptimizer max'ed out"
......
......@@ -26,7 +26,7 @@ class DB(object):
# It is an instance of a DB.In the tests for example,
# this is not always the case.
if not isinstance(obj, (DB, opt.Optimizer, opt.LocalOptimizer)):
raise Exception('Triing to register an optimizer that don\'t herite from theano.gof.opt.Optimizer or theano.gof.opt.LocalOptimizer', obj)
raise TypeError('Object cannot be registered in OptDB', obj)
if self.name is not None:
tags = tags + (self.name,)
......@@ -132,6 +132,18 @@ class Query(object):
class EquilibriumDB(DB):
"""A set of potential optimizations which should be applied in an arbitrary order until
equilibrium is reached.
Canonicalize, Stabilize, and Specialize are all equilibrium optimizations.
.. note::
It seems like this might be supposed to contain LocalOptimizer instances rather than
optimizer instances, because whatever is selected by the query is passed to
EquilibriumOptimizer and EquilibriumOptimizer requires LocalOptimizer instances.
"""
def query(self, *tags, **kwtags):
opts = super(EquilibriumDB, self).query(*tags, **kwtags)
......@@ -142,27 +154,45 @@ class EquilibriumDB(DB):
class SequenceDB(DB):
"""A sequence of potential optimizations.
Retrieve a sequence of optimizations (a SeqOptimizer) by calling query().
Each potential optimization is registered with a floating-point position.
No matter which optimizations are selected by a query, they are carried out in order of
increasing position.
The optdb itself (`theano.compile.mode.optdb`), from which (among many other tags) fast_run
and fast_compile optimizers are drawn is a SequenceDB.
"""
def __init__(self, failure_callback = opt.SeqOptimizer.warn):
super(SequenceDB, self).__init__()
self.__priority__ = {}
self.__position__ = {}
self.failure_callback = failure_callback
def register(self, name, obj, position, *tags):
    """Register `obj` under `name` at a floating-point `position`.

    `position` fixes where the optimization runs in the sequence returned
    by query(), no matter which tags selected it.  (Merged the braided
    old `priority` / new `position` variants of this method; only
    `__position__` is kept, matching query() below.)
    """
    super(SequenceDB, self).register(name, obj, *tags)
    self.__position__[name] = position
def query(self, *tags, **kwtags):
    """Return a SeqOptimizer of the selected optimizations, in position order.

    :type position_cutoff: float or int
    :param position_cutoff: only optimizations with position strictly less
        than the cutoff are returned (default: no cutoff).
    """
    position_cutoff = kwtags.pop('position_cutoff', float('inf'))
    opts = super(SequenceDB, self).query(*tags, **kwtags)
    # filter by cutoff, then order by registered position
    opts = [o for o in opts if self.__position__[o.name] < position_cutoff]
    opts.sort(key=lambda obj: self.__position__[obj.name])
    return opt.SeqOptimizer(opts, failure_callback=self.failure_callback)
def print_summary(self, stream=sys.stdout):
    """Write a human-readable dump of this DB to `stream`.

    (Dropped the stale `__priority__` line left over from the old version
    of this class; only `__position__` exists after the rename.)
    """
    print >> stream, "SequenceDB (id %i)" % id(self)
    print >> stream, "  position", self.__position__
    print >> stream, "  names", self._names
    print >> stream, "  db", self.__db__
def __str__(self):
sio = StringIO.StringIO()
self.print_summary(sio)
......
......@@ -7,9 +7,52 @@ import sys,os
from theano import config
from gof import Op, Apply
from theano.gof.python25 import any
from theano.compile import Function, debugmode
#We import the debugprint here to have all printing of graph available from this module
from theano.compile.debugmode import debugprint
def debugprint(obj, depth=-1, file=None):
    """Print a computation graph to `file`.

    :type obj: Variable, Apply, or Function instance
    :param obj: symbolic thing to print
    :type depth: integer
    :param depth: print graph to this depth (-1 for unlimited)
    :type file: None or file-like object
    :param file: print to this file (None means sys.stdout)
    :raises TypeError: if `obj` is none of the supported types
    :rtype: None or file-like object
    :returns: the `file` argument

    Each line printed represents a Variable in the graph.
    The indentation of each line corresponds to its depth in the symbolic
    graph.  The first part of the text identifies whether it is an input
    (if a name or type is printed) or the output of some Apply (in which
    case the Op is printed).  The second part of the text is the memory
    location of the Variable.
    If a Variable is encountered multiple times in the depth-first search,
    it is only printed recursively the first time.  Later, just the
    Variable and its memory location are printed.
    If an Apply has multiple outputs, then a '.N' suffix will be appended
    to the Apply's identifier, to indicate which output a line corresponds
    to.
    """
    _file = sys.stdout if file is None else file
    # `done` is shared across the outputs so each Apply is expanded once
    done = set()
    results_to_print = []
    if isinstance(obj, gof.Variable):
        results_to_print.append(obj)
    elif isinstance(obj, gof.Apply):
        results_to_print.extend(obj.outputs)
    elif isinstance(obj, Function):
        results_to_print.extend(obj.maker.env.outputs)
    else:
        # previously an unsupported object silently printed nothing
        raise TypeError('debugprint cannot print an object of this type', obj)
    for r in results_to_print:
        debugmode.debugprint(r, depth=depth, done=done, file=_file)
    if file is None:
        _file.flush()
    return file
class Print(Op):
"""This identity-like Op has the side effect of printing a message followed by its inputs
......@@ -329,7 +372,7 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
if var.name is not None:
varstr = var.name
elif isinstance(var,gof.Constant):
varstr = str(var.data)
varstr = '%s [%s]'% (str(var.data) , str(var.type))
elif var in input_update and input_update[var].variable.name is not None:
varstr = input_update[var].variable.name
else:
......
......@@ -3,7 +3,7 @@ import numpy
import theano
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor
from theano.compile import shared, SharedVariable, shared_constructor
from theano.compile import shared, SharedVariable
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import filter as type_support_filter
......@@ -68,6 +68,11 @@ CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable
def cuda_shared_constructor(value, name, strict=False, broadcastable=None):
"""SharedVariable Constructor for TensorType"""
# THIS CONSTRUCTOR TRIES TO CAST VALUE TO A FLOAT32, WHICH THEN GOES ONTO THE CARD
# SO INT shared vars, float64 shared vars, etc. all end up on the card.
# THIS IS NOT THE DEFAULT BEHAVIOUR THAT WE WANT.
# SEE float32_shared_constructor
#TODO: what should strict mean in this context, since we always have to make a copy?
if strict:
_value = value
......
......@@ -20,8 +20,9 @@ Special cases:
Often a for loop can be expressed as a ``scan()`` operation, and ``scan`` is
the closest that theano comes to looping. The advantage of using ``scan``
over for loops is that it allows you to express the loop symbolically. The
Scan Op should always be used by applying the ``scan`` function.
over for loops is that it allows the number of iterations to be a part of the symbolic graph.
The Scan Op should always be used by applying the ``scan`` function.
"""
__docformat__ = 'restructedtext en'
......@@ -60,7 +61,8 @@ def hash_listsDictsTuples(x):
def scan(fn, sequences, initial_states, non_sequences, inplace_map={}, \
sequences_taps={}, outputs_taps = {}, n_steps = 0, \
truncate_gradient = -1, go_backwards = False, mode = 'FAST_RUN'):
truncate_gradient = -1, go_backwards = False,
mode = None):
'''Function that constructs and applies a Scan op
:param fn: Function that describes the operations involved in one step of scan
......
......@@ -4,7 +4,7 @@ __docformat__ = "restructuredtext en"
import __builtin__
import sys # for sys.maxint
from theano.configparser import config
from theano.configparser import config, AddConfigVar, BoolParam
import traceback #for overriding Op.__call__
if sys.version_info >= (2,5):
import functools
......@@ -570,7 +570,6 @@ class TensorType(Type):
# input received.
return """
%(name)s = NULL;
type_num_%(name)s = ((PyArrayObject*)py_%(name)s)->descr->type_num; //we expect %(type_num)s
if (py_%(name)s == Py_None) {
// We can either fail here or set %(name)s to NULL and rely on Ops using
// tensors to handle the NULL case, but if they fail to do so they'll end up
......@@ -578,18 +577,17 @@ class TensorType(Type):
PyErr_SetString(PyExc_ValueError, "expected an ndarray, not None");
%(fail)s
}
else if (!PyArray_Check(py_%(name)s)) {
if (!PyArray_Check(py_%(name)s)) {
PyErr_SetString(PyExc_ValueError, "expected an ndarray");
%(fail)s
}
else if (type_num_%(name)s != %(type_num)s) {
type_num_%(name)s = ((PyArrayObject*)py_%(name)s)->descr->type_num; //we expect %(type_num)s
if (type_num_%(name)s != %(type_num)s) {
PyErr_SetString(PyExc_ValueError, "expected %(type_num)s");
%(fail)s
}
else {
%(name)s = (PyArrayObject*)(py_%(name)s);
Py_XINCREF(%(name)s);
}
%(name)s = (PyArrayObject*)(py_%(name)s);
Py_XINCREF(%(name)s);
""" % dict(sub, name = name, type_num = self.dtype_specs()[2])
def c_cleanup(self, name, sub):
......@@ -631,7 +629,7 @@ class TensorType(Type):
def c_code_cache_version(self):
    """Version tuple for the C code cache.

    Bumped to 2 in this commit, matching the change to c_extract (type_num
    is now read only after PyArray_Check succeeds).  Returns () — caching
    disabled — when the underlying scalar type is unversioned.
    """
    scalar_version = scal.Scalar(self.dtype).c_code_cache_version()
    if scalar_version:
        return (2,) + scalar_version
    else:
        return ()
......@@ -943,7 +941,13 @@ class _tensor_py_operators:
break
if advanced:
return AdvancedSubtensor(args)(self, *args)
if config.experimental.advanced_indexing:
if len(args) == 1:
return AdvancedSubtensor1()(self, *args)
else:
return AdvancedSubtensor(args)(self, *args)
else:
return AdvancedSubtensor(args)(self, *args)
else:
return Subtensor(args)(self, *Subtensor.collapse(args, lambda entry: isinstance(entry, Variable)))
......@@ -1029,6 +1033,7 @@ class TensorConstantSignature(tuple):
except:
return False
#N.B. compare shape to ensure no broadcasting in ==
#N.B. compare elementwise last because it is the most expensive check
return (t0 == t1) and (d0.shape == d1.shape) \
and (self.sum == other.sum) and (numpy.all(d0 == d1))
def __hash__(self):
......@@ -1294,9 +1299,15 @@ def shape(a):
pprint.assign(_shape, printing.MemberPrinter('shape'))
class MaxAndArgmax(Op):
"""Calculate the max and argmax over a given axis"""
"""Calculate the max and argmax over a given axis.
.. note::
If axis is None it means to calculate the max over the last dimension which is
DIFFERENT FROM NUMPY!!
"""
nin=2 # tensor, axis
nout=2 # max val, max idx
E_axis = 'invalid axis'
......@@ -1307,7 +1318,8 @@ class MaxAndArgmax(Op):
axis = x.type.ndim - 1
axis = _as_tensor_variable(axis)
inputs = [x, axis]
broadcastable = [False] * (x.type.ndim - 1) #TODO: be less conservative
#TODO: figure things out if axis is a constant
broadcastable = [False] * (x.type.ndim - 1)
outputs = [tensor(x.type.dtype, broadcastable,name='max'),
tensor('int32', broadcastable,name='argmax')]
return Apply(self, inputs, outputs)
......@@ -1666,60 +1678,113 @@ def zeros_like(model):
#return Zeros(model.type.ndim)(shape(model))
return fill(model, constant(0.0, dtype=model.type.dtype))
class Filler(gof.Op):
if 0:
## COMMENTED OUT FEB 17 2010
## TODO (DOCUMENT AND WRITE TESTS) OR DELETE
class Filler(gof.Op):
"""WRITEME"""
def __init__(self, value, ndim, dtype = 'float64'):
self.value = value
self.ndim = ndim
self.dtype = dtype
self.type = TensorType(dtype = dtype,
broadcastable = (False,)*ndim)
def make_node(self, dims):
dims = as_tensor_variable(dims)
return gof.Apply(self, [dims], [self.type()])
def perform(self, node, (dims,), (out,)):
if out[0] is not None:
out[0].resize(dims, refcheck = 0)
out[0].fill(self.value)
else:
if self.value == 0:
out[0] = numpy.zeros(dims, dtype = self.dtype)
elif self.value == 1:
out[0] = numpy.ones(dims, dtype = self.dtype)
else:
out[0] = numpy.ones(dims, dtype = self.dtype) * self.value
def grad(self, (dims,), (gout,)):
return None,
def __eq__(self, other):
return type(self) == type(other) and self.ndim == other.ndim and self.dtype == other.dtype
def __hash__(self):
return hash(self.ndim) ^ hash(self.dtype)
Zeros = partial(Filler, 0)
"""WRITEME"""
Ones = partial(Filler, 1)
"""WRITEME"""
def __init__(self, value, ndim, dtype = 'float64'):
self.value = value
self.ndim = ndim
self.dtype = dtype
self.type = TensorType(dtype = dtype,
broadcastable = (False,)*ndim)
def make_node(self, dims):
dims = as_tensor_variable(dims)
return gof.Apply(self, [dims], [self.type()])
@constructor
def zero():
"""
Return a scalar zero, e.g. for initializing sums.
"""
return Zeros(0)([])
def perform(self, node, (dims,), (out,)):
if out[0] is not None:
out[0].resize(dims, refcheck = 0)
out[0].fill(self.value)
else:
if self.value == 0:
out[0] = numpy.zeros(dims, dtype = self.dtype)
elif self.value == 1:
out[0] = numpy.ones(dims, dtype = self.dtype)
else:
out[0] = numpy.ones(dims, dtype = self.dtype) * self.value
@constructor
def one():
"""WRITEME"""
return Ones(0)([])
def grad(self, (dims,), (gout,)):
return None,
pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Filler) and r.owner.op.value == 0, printing.FunctionPrinter('zeros'))
pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Filler) and r.owner.op.value == 1, printing.FunctionPrinter('ones'))
def __eq__(self, other):
return type(self) == type(other) and self.ndim == other.ndim and self.dtype == other.dtype
class Alloc(gof.Op):
"""Create a Tensor from an initial value and a desired shape
def __hash__(self):
return hash(self.ndim) ^ hash(self.dtype)
alloc(value, shape0, shape1, ..., shapeN)
Zeros = partial(Filler, 0)
"""WRITEME"""
Returns an N-dimensional tensor initialized by `value` using something equivalent to
>>> z = numpy.zeros(shape, value.dtype)
>>> z += value
Ones = partial(Filler, 1)
"""WRITEME"""
The result has N dimensions, has the dtype of `value` and is obtained by broadcasting value
over the output ndarray.
@constructor
def zero():
"""
Return a scalar zero, e.g. for initializing sums.
This Op is used to replace fill() during optimizations because after shapes are lifted,
the first argument to fill can often be pruned from the graph.
"""
return Zeros(0)([])
def __init__(self, dtype):
self.dtype = dtype
@constructor
def one():
"""WRITEME"""
return Ones(0)([])
def __eq__(self, other):
return type(self) == type(other) and self.dtype == other.dtype
def __hash__(self):
return hash(type(self)) ^ hash(self.dtype)
def __str__(self):
return '%s{%s}' % (self.__class__.__name__, self.dtype)
def make_node(self, value, *shape):
v = as_tensor_variable(value)
sh = [as_tensor_variable(s) for s in shape]
bcast = []
for s in sh:
if s.type.dtype[:3] not in ('int', 'uin'):
raise TypeError('Shape arguments must be integers', s)
# if s is constant 1, then we're broadcastable in that dim
bcast.append(isinstance(s, TensorConstant) and (s.data == 1))
otype = TensorType(dtype=self.dtype, broadcastable=bcast)
return gof.Apply(self, [v]+sh, [otype()])
def perform(self, node, inputs, (out,)):
v = inputs[0]
sh = tuple([int(i) for i in inputs[1:]])
if out[0] is None or out[0].shape != sh:
out[0] = numpy.zeros(sh, dtype=self.dtype)
out[0][...] += v # broadcast v to fill us up
def grad(self, inputs, (gout,)):
return [None for i in inputs]
pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Filler) and r.owner.op.value == 0, printing.FunctionPrinter('zeros'))
pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Filler) and r.owner.op.value == 1, printing.FunctionPrinter('ones'))
@_redefine(elemwise.Elemwise(scal.identity))
def tensor_copy(a):
......@@ -1841,33 +1906,36 @@ def var(input, axis = None):
#return the mean sqr
return mean(centered_input**2, axis)
class Repeat(gof.Op):
def make_node(self, input, repeats, axis):
assert isinstance(input.type, TensorType)
assert repeats.type == iscalar
assert axis.type == iscalar
broadcastable = []
for i,x in enumerate(input.broadcastable):
if i==axis:
broadcastable += [False]
else:
broadcastable += [x]
type = TensorType(dtype = input.type.dtype, broadcastable = \
broadcastable)
#backport
#type = TensorType(dtype = input.type.dtype,
# broadcastable = [False if i==axis else x for i, x in enumerate(input.broadcastable)])
return gof.Apply(self, [inputs, repeats, axis], [type()])
def perform(self, node, (input, repeats, axis), (out, )):
out[0] = numpy.repeat(input, repeats, axis)
def grad(self, (input, repeats, axis), (gout, )):
return add.grad((input, gout), (gout,))[:1]
repeat = Repeat()
if 0:
## COMMENTED OUT FEB 17 2010
## TODO (DOCUMENT AND WRITE TESTS) OR DELETE
class Repeat(gof.Op):
def make_node(self, input, repeats, axis):
assert isinstance(input.type, TensorType)
assert repeats.type == iscalar
assert axis.type == iscalar
broadcastable = []
for i,x in enumerate(input.broadcastable):
if i==axis:
broadcastable += [False]
else:
broadcastable += [x]
type = TensorType(dtype = input.type.dtype, broadcastable = \
broadcastable)
#backport
#type = TensorType(dtype = input.type.dtype,
# broadcastable = [False if i==axis else x for i, x in enumerate(input.broadcastable)])
return gof.Apply(self, [inputs, repeats, axis], [type()])
def perform(self, node, (input, repeats, axis), (out, )):
out[0] = numpy.repeat(input, repeats, axis)
def grad(self, (input, repeats, axis), (gout, )):
return add.grad((input, gout), (gout,))[:1]
repeat = Repeat()
class Default(gof.Op):
"""
......@@ -2489,6 +2557,10 @@ class Join(Op):
join(2, x, y, z) # WRONG: the axis has to be an index into the shape
join(0, x, u) # WRONG: joined tensors must have the same rank
"""
def __eq__(self, other):
    # Join carries no parameters, so all instances are interchangeable:
    # compare by class only (enables graph merging of Join nodes).
    return type(self) == type(other)
def __hash__(self):
    # must be consistent with __eq__ above: hash on the class
    return hash(type(self))
def make_node(self, *axis_and_tensors):
"""
......@@ -2765,37 +2837,6 @@ else:
pass
class MakeVector(Op):
    """Pack N scalar inputs of a fixed scalar type into a rank-1 tensor."""
    def __init__(self, stype):
        # stype: the scalar Type that every input must match (asserted in
        # make_node)
        self.stype = stype
    def make_node(self, *inputs):
        inputs = map(as_tensor_variable, inputs)
        assert all(a.type == self.stype for a in inputs)
        return Apply(self, inputs, [TensorType(broadcastable = (False,),
            dtype = self.stype.dtype)()])
    def perform(self, node, inputs, (out,)):
        # stack the scalar values into a single 1-d ndarray
        out[0] = numpy.asarray(inputs)
    def grad(self, inputs, (gout,)):
        # gradient not defined for the scalar inputs
        return [None]*len(inputs)

make_lvector = MakeVector(lscalar)
"""Build an lvector (1-d int64 tensor) from lscalar arguments."""

class MakeVectorPrinter:
    # pretty-printer: renders a MakeVector apply as "[a, b, ...]"
    def process(self, r, pstate):
        if r.owner is None:
            raise TypeError("Can only print make_vector.")
        elif isinstance(r.owner.op, MakeVector):
            return "[%s]" % ", ".join(pstate.pprinter.process(input, pstate.clone(precedence = 1000)) for input in r.owner.inputs)
        else:
            raise TypeError("Can only print make_vector.")

pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, MakeVector), MakeVectorPrinter())
class Reshape(Op):
"""Perform a reshape operation of the input x to the new shape shp.
The number of dimensions to which to reshape to (ndim) must be known at graph
......@@ -3138,6 +3179,37 @@ def inverse_permutation(perm):
# Should reproduce numpy's behaviour:
# http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#advanced-indexing
# Config flag gating AdvancedSubtensor1 (see __getitem__); off by default
# because the functionality is not yet well tested.
AddConfigVar('experimental.advanced_indexing',
        "enable not-well-tested advanced indexing functionality",
        BoolParam(False))
class AdvancedSubtensor1(Op):
    """Implement x[ilist] where ilist is a vector of integers."""

    def __hash__(self):
        return hash(type(self))

    def __eq__(self, other):
        # BUGFIX: the original was missing `return`, so __eq__ always
        # returned None (falsy) — no two instances ever compared equal,
        # defeating graph merging and breaking the __eq__/__hash__ contract.
        return type(self) == type(other)

    def make_node(self, x, ilist):
        """Build the Apply; x and ilist are coerced to tensor variables.

        :raises TypeError: if ilist is not an integer vector, or x is a
            scalar or broadcastable along its first dimension.
        """
        x_ = as_tensor_variable(x)
        ilist_ = as_tensor_variable(ilist)
        if ilist_.type.dtype[:3] not in ('int', 'uin'):
            raise TypeError('index must be integers')
        if ilist_.type.broadcastable != (False,):
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
        if x_.type.broadcastable[0]:
            # the caller should have made a copy of x len(ilist) times
            raise TypeError('cannot index into a broadcastable dimension')
        return gof.Apply(self, [x_, ilist_], [x_.type()])

    def perform(self, node, inputs, output_storage):
        # numpy integer-array (advanced) indexing does the gather + copy.
        # (Signature modernized away from Py2 tuple-parameter unpacking;
        # call-compatible with the original.)
        x, i = inputs
        (out,) = output_storage
        out[0] = x[i]
class AdvancedSubtensor(Op):
"""Return a subtensor copy, using advanced indexing.
"""
......@@ -3308,6 +3380,18 @@ class Dot(Op):
rval = dot(gz, y.T), dot(x.T, gz)
return cast(rval[0], x.dtype), cast(rval[1], y.dtype)
def infer_shape(self, node, shapes):
    """Output shape of dot(x, y) for the supported 1-d/2-d combinations.

    :param shapes: (xshp, yshp) — the shapes of the two inputs
    :returns: a one-element list holding the output shape tuple
    :raises NotImplementedError: for ndim combinations other than 1/2 x 1/2

    (Signature modernized away from Py2 tuple-parameter unpacking; callers
    pass the shapes as one positional tuple, so this is call-compatible.)
    """
    xshp, yshp = shapes
    x, y = node.inputs
    if x.ndim == 2 and y.ndim == 2:
        return [(xshp[0], yshp[1])]   # matrix . matrix
    if x.ndim == 1 and y.ndim == 2:
        return [(yshp[1],)]           # vector . matrix
    if x.ndim == 2 and y.ndim == 1:
        return [(xshp[0],)]           # matrix . vector
    if x.ndim == 1 and y.ndim == 1:
        return [()]                   # vector . vector -> scalar
    raise NotImplementedError()
def __str__(self):
    # name used when printing graphs containing this op
    return "dot"
dot = Dot()
......
......@@ -6,10 +6,9 @@ import numpy.distutils
from theano.configparser import config, AddConfigVar, StrParam
from theano.gof import (utils, Op, Apply, view_roots, PatternSub, DestroyHandler,
SeqOptimizer, local_optimizer, Optimizer, LocalOptimizer, OpKeyOptimizer,
InconsistencyError, toolbox)
InconsistencyError, toolbox, SequenceDB, EquilibriumOptimizer)
from theano.printing import pprint, FunctionPrinter
from theano.tensor.opt import register_specialize, out2in, insert_inplace_optimizer
# opt.py
from theano.compile.mode import optdb
import basic as T
......@@ -30,7 +29,6 @@ AddConfigVar('blas.ldflags',
"lib[s] to include for [Fortran] level-3 blas implementation",
StrParam(default_blas_ldflags()))
_logger = logging.getLogger('theano.tensor.blas')
_logger.setLevel(logging.WARN)
def debug(*msg): _logger.debug(' '.join(str(m) for m in msg))
......@@ -391,12 +389,22 @@ class Gemm(GemmRelated):
def c_code_cache_version(self):
return (1,) + self.build_gemm_version()
gemm = Gemm()
class PseudoGemm(Op):
    # should be replaced by Gemm
    # Pure-python placeholder computing a*dot(x, y) + b*z via numpy;
    # the optimizer is expected to swap it for the real (inplace) Gemm.
    def __eq__(self, other):
        # no parameters: all instances are equivalent, compare by class
        return type(self) == type(other)
    def __hash__(self):
        # consistent with __eq__: hash on the class
        return hash(type(self))
    def make_node(self, *args):
        # output gets the type of the first input (z)
        inputs = [T.as_tensor_variable(i) for i in args]
        return Apply(self, inputs, [inputs[0].type()])
    def perform(self, node, (z, a, x, y, b), (zout, )):
        zout[0] = a * numpy.dot(x,y) + b * z
gemm_inplace = Gemm()
pprint.assign(gemm, FunctionPrinter('gemm'))
pprint.assign(gemm_inplace, FunctionPrinter('gemm_inplace'))
def res_is_a(node, op, maxclients=None):
if maxclients is not None:
retval = (len(node.clients) <= maxclients)
......@@ -597,6 +605,7 @@ class GemmOptimizer(Optimizer):
while did_something:
nodelist = list(env.toposort())
did_something = False
nodelist.reverse()
for node in nodelist:
new_outputs = _gemm_from_node(node)
if new_outputs:
......@@ -611,10 +620,6 @@ class GemmOptimizer(Optimizer):
#TODO: retry other applications of gemm (see comment in _gemm_from_node
pass
#neede to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
compile.optdb.register('inplace_gemm', GemmOptimizer(), 70.00, 'fast_run', 'inplace', 'gemm')
class Dot22(GemmRelated):
"""Compute a matrix-matrix product.
This is a specialization of the more general Dot()
......@@ -689,5 +694,34 @@ def local_dot_to_dot22(node):
info('Not optimizing dot with inputs', x, y, x.type, y.type)
else:
return False
register_specialize(local_dot_to_dot22)
@local_optimizer([gemm])
def local_inplace_gemm(node):
    # Replace a non-destructive gemm Apply with its inplace variant
    # (registered later under the 'inplace' tag, after the destroy handler).
    if node.op == gemm:
        return [gemm_inplace(*node.inputs)]
#################################
#
# Set up the BlasOpt optimizer
#
#################################
blas_optdb = SequenceDB()
# run after numerical stability optimizations (1.5)
optdb.register('BlasOpt', blas_optdb, 1.7, 'fast_run')
# run before specialize (2.0) because specialize is basically a free-for-all that makes the
# graph crazy.
blas_optdb.register('local_dot_to_dot22',
EquilibriumOptimizer([local_dot_to_dot22], max_use_ratio=5),
0, 'fast_run')
blas_optdb.register('local_dot_to_gemm', GemmOptimizer(), 10, 'fast_run')
# After destroyhandler is in but before we try to make elemwise things inplace
# Try to make gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
optdb.register('InplaceBlasOpt',
EquilibriumOptimizer([local_inplace_gemm], max_use_ratio=5),
70.0, 'fast_run', 'inplace')
......@@ -197,6 +197,18 @@ class DimShuffle(Op):
storage[0] = numpy.asarray(res) #asarray puts scalars back into array
def infer_shape(self, node, (ishp,)):
    """Compute the output shape of this DimShuffle from its input shape.

    `ishp` has one entry per input dimension.  The result mirrors what
    the Op does to the data: dropped dimensions are removed, the
    remainder is permuted by `self.shuffle`, and length-1 broadcastable
    dimensions are inserted at the `self.augment` positions.
    """
    ishp = list(ishp)
    # delete from the highest index first so earlier positions stay valid
    for drop in reversed(self.drop):
        del ishp[drop]
    # transpose
    rval = [ishp[i] for i in self.shuffle]
    # augment
    for augm in self.augment:
        rval.insert(augm, 1)
    return [rval]
def c_code(self, node, name, (input,), (res,), sub):
basename = input + '__view_or_copy'
......@@ -613,6 +625,25 @@ class Elemwise(Op):
# the following should be used instead of the previous loop, unfortunately it tends to segfault
# self.ufunc(*(ufunc_args+[s[0] for s in output_storage]))
def infer_shape(self, node, i_shapes):
    """Infer the shape of each output of an Elemwise apply.

    For every output dimension: if the output is broadcastable there,
    the length is exactly 1; otherwise some input must be
    non-broadcastable in that dimension, and we take its length.

    :param i_shapes: one shape tuple per input of `node`
    :returns: a list with one shape list per output
    """
    rval = []
    for o in node.outputs:
        oshp = []
        for dim, b in enumerate(o.type.broadcastable):
            b_dim = None
            if b:  # this is broadcastable
                b_dim = 1
            else:  # there must be some input that is not broadcastable
                for ishp, i in zip(i_shapes, node.inputs):
                    if not i.type.broadcastable[dim]:
                        b_dim = ishp[dim]
                        break
            # Use `is not None` rather than truthiness: a dimension of
            # legitimate length 0 (or a symbolic scalar whose truth value
            # is undefined) used to trip the old `assert b_dim` check.
            assert b_dim is not None, \
                    'could not infer length of output dimension %i' % dim
            oshp.append(b_dim)
        rval.append(oshp)
    return rval
def _c_all(self, node, name, inames, onames, sub):
_inames = inames
_onames = onames
......@@ -764,10 +795,14 @@ class CAReduce(Op):
if scalar_op.nin not in [-1, 2] or scalar_op.nout != 1:
raise NotImplementedError("CAReduce only supports binary functions with a single output.")
self.scalar_op = scalar_op
if isinstance(axis, int):
self.axis = [axis]
else:
if axis is None:
self.axis = axis
elif isinstance(axis, int):
self.axis = (axis,)
else:
self.axis = list(set(axis))
self.axis.sort()
self.axis = tuple(self.axis)
self.ufunc = numpy.frompyfunc(scalar_op.impl, 2, 1)
# CAReduce output views input when reducing scalars
......@@ -834,6 +869,13 @@ class CAReduce(Op):
else:
output[0] = numpy.copy(variable)
def infer_shape(self, node, (ishape,)):
    """Shape of the reduction: the input shape with reduced axes removed.

    When `self.axis` is None every axis is reduced, so the output is a
    scalar with an empty shape tuple.
    """
    axis = self.axis
    if axis is None:
        return (),
    # keep only the dimensions that are not reduced over
    return [ishape[i] for (i,b) in enumerate(node.inputs[0].type.broadcastable) if i not in axis],
def _c_all(self, node, name, inames, onames, sub):
input = node.inputs[0]
......
from nnet import *
from sigm import softplus, sigmoid, sigmoid_inplace, scalar_sigmoid
......@@ -4,89 +4,14 @@
"""
from theano import gof
from theano import scalar
from theano import printing
from theano.printing import pprint
from theano.tensor import basic as tensor
from theano.tensor import elemwise
from theano.tensor import opt
from theano.compile import optdb
import numpy
############
#
# SCALAR OPS
#
class ScalarSigmoid(scalar.UnaryScalarOp):
@staticmethod
def st_impl(x):
if x < -30.0:
return 0.0
if x > 30.0:
return 1.0
return 1.0 / (1.0 + numpy.exp(-x))
def impl(self, x):
return ScalarSigmoid.st_impl(x)
def grad(self, (x,), (gz,)):
y = scalar_sigmoid(x)
return [gz * y * (1.0 - y)]
def c_code(self, node, name, (x,), (z,), sub):
if node.inputs[0].type == scalar.float32:
# These constants were obtained by looking at the output of python commands like:
# for i in xrange(750):
# print i, repr( theano._asarray(1.0, dtype=dt) / (theano._asarray(1.0, dtype=dt) + numpy.exp(-theano._asarray([i,-i], dtype=dt))))
# the boundary checks prevent us from generating inf
return """%(z)s = %(x)s < -88.0f ? 0.0 : %(x)s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-%(x)s));""" % locals()
elif node.inputs[0].type == scalar.float64:
return """%(z)s = %(x)s < -709.0 ? 0.0 : %(x)s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-%(x)s));""" % locals()
else:
raise NotImplementedError('only floatingpoint is implemented')
def c_code_cache_version(self):
v = super(ScalarSigmoid, self).c_code_cache_version()
if v:
return (2,) + v
else:
return v
scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid')
sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid')
pprint.assign(sigmoid, printing.FunctionPrinter('sigmoid'))
class ScalarSoftplus(scalar.UnaryScalarOp):
@staticmethod
def static_impl(x):
if x < -30.0:
return 0.0
if x > 30.0:
return x
return numpy.log1p(numpy.exp(x))
def impl(self, x):
return ScalarSoftplus.static_impl(x)
def grad(self, (x,), (gz,)):
return [gz * scalar_sigmoid(x)]
def c_code(self, node, name, (x,), (z,), sub):
if node.inputs[0].type == scalar.float32:
# These constants were obtained by looking at the output of python commands like:
# for i in xrange(750):
# print i, repr( numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
# the boundary checks prevent us from generating inf
return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals()
elif node.inputs[0].type == scalar.float64:
return """%(z)s = %(x)s < -745.0 ? 0.0 : %(x)s > 16.0 ? %(x)s : log1p(exp(%(x)s));""" % locals()
else:
raise NotImplementedError('only floatingpoint is implemented')
def c_code_cache_version(self):
v = super(ScalarSoftplus, self).c_code_cache_version()
if v:
return (2,) + v
else:
return v
scalar_softplus = ScalarSoftplus(scalar.upgrade_to_float, name='scalar_softplus')
softplus = elemwise.Elemwise(scalar_softplus, name='softplus')
pprint.assign(softplus, printing.FunctionPrinter('softplus'))
from .sigm import sigmoid, softplus
############
......@@ -1351,6 +1276,7 @@ def categorical_crossentropy(coding_dist, true_dist):
raise TypeError('rank mismatch between coding and true distributions')
from theano import scalar
class Prepend_scalar_constant_to_each_row(gof.Op):
def __init__(self, val = 0):
......@@ -1440,14 +1366,3 @@ prepend_scalar_to_each_row = Prepend_scalar_to_each_row()
prepend_0_to_each_row = Prepend_scalar_constant_to_each_row(0.)
prepend_1_to_each_row = Prepend_scalar_constant_to_each_row(1.)
logsigm_to_softplus = gof.PatternSub(
(tensor.log, (sigmoid, 'x')),
(tensor.neg, (softplus, (tensor.neg, 'x'))),
allow_multiple_clients = True)
log1msigm_to_softplus = gof.PatternSub(
(tensor.log, (tensor.sub, tensor.constant([[1.0]]), (sigmoid, 'x'))),
(tensor.neg, (softplus, 'x')),
allow_multiple_clients = True)
opt.register_specialize(logsigm_to_softplus, name = 'logsigm_to_softplus')
opt.register_specialize(log1msigm_to_softplus, name = 'log1msigm_to_softplus')
"""Ops and optimizations: sigmoid, softplus
These functions implement special cases of exp and log to improve numerical stability.
"""
import numpy
from theano import gof
from theano import scalar
from theano import printing
from theano.tensor import basic as tensor
from theano.printing import pprint
from theano.tensor import elemwise
from theano.tensor import opt
from theano.compile import optdb
############
#
# SCALAR OPS
#
class ScalarSigmoid(scalar.UnaryScalarOp):
    """Elementwise scalar logistic sigmoid: 1 / (1 + exp(-x)).

    The python implementation saturates far from zero so exp(-x) cannot
    overflow; the C implementation uses dtype-specific bounds for the
    same reason.
    """
    @staticmethod
    def st_impl(x):
        # saturate: sigmoid(-30) and sigmoid(30) are already 0/1 to
        # within float precision, and this avoids overflow in exp()
        if x < -30.0:
            return 0.0
        if x > 30.0:
            return 1.0
        return 1.0 / (1.0 + numpy.exp(-x))
    def impl(self, x):
        return ScalarSigmoid.st_impl(x)
    def grad(self, (x,), (gz,)):
        # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x))
        y = scalar_sigmoid(x)
        return [gz * y * (1.0 - y)]
    def c_code(self, node, name, (x,), (z,), sub):
        if node.inputs[0].type == scalar.float32:
            # These constants were obtained by looking at the output of python commands like:
            # for i in xrange(750):
            #     print i, repr( theano._asarray(1.0, dtype=dt) / (theano._asarray(1.0, dtype=dt) + numpy.exp(-theano._asarray([i,-i], dtype=dt))))
            # the boundary checks prevent us from generating inf
            return """%(z)s = %(x)s < -88.0f ? 0.0 : %(x)s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-%(x)s));""" % locals()
        elif node.inputs[0].type == scalar.float64:
            return """%(z)s = %(x)s < -709.0 ? 0.0 : %(x)s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-%(x)s));""" % locals()
        else:
            raise NotImplementedError('only floatingpoint is implemented')
    def c_code_cache_version(self):
        # bump the leading (2,) whenever the generated C code changes
        v = super(ScalarSigmoid, self).c_code_cache_version()
        if v:
            return (2,) + v
        else:
            return v
scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid')
sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid')
sigmoid_inplace = elemwise.Elemwise(
ScalarSigmoid(scalar.transfer_type(0)),
inplace_pattern={0:0},
name='sigmoid_inplace',
)
pprint.assign(sigmoid, printing.FunctionPrinter('sigmoid'))
class ScalarSoftplus(scalar.UnaryScalarOp):
    """Elementwise scalar softplus: log(1 + exp(x)).

    Saturates to 0 for very negative x and to x itself for very large x,
    avoiding overflow in exp() while staying within float precision.
    """
    @staticmethod
    def static_impl(x):
        if x < -30.0:
            return 0.0
        if x > 30.0:
            return x
        return numpy.log1p(numpy.exp(x))
    def impl(self, x):
        return ScalarSoftplus.static_impl(x)
    def grad(self, (x,), (gz,)):
        # d/dx softplus(x) = sigmoid(x)
        return [gz * scalar_sigmoid(x)]
    def c_code(self, node, name, (x,), (z,), sub):
        if node.inputs[0].type == scalar.float32:
            # These constants were obtained by looking at the output of python commands like:
            # for i in xrange(750):
            #     print i, repr( numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
            # the boundary checks prevent us from generating inf
            return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals()
        elif node.inputs[0].type == scalar.float64:
            return """%(z)s = %(x)s < -745.0 ? 0.0 : %(x)s > 16.0 ? %(x)s : log1p(exp(%(x)s));""" % locals()
        else:
            raise NotImplementedError('only floatingpoint is implemented')
    def c_code_cache_version(self):
        # bump the leading (2,) whenever the generated C code changes
        v = super(ScalarSoftplus, self).c_code_cache_version()
        if v:
            return (2,) + v
        else:
            return v
scalar_softplus = ScalarSoftplus(scalar.upgrade_to_float, name='scalar_softplus')
softplus = elemwise.Elemwise(scalar_softplus, name='softplus')
pprint.assign(softplus, printing.FunctionPrinter('softplus'))
logsigm_to_softplus = gof.PatternSub(
(tensor.log, (sigmoid, 'x')),
(tensor.neg, (softplus, (tensor.neg, 'x'))),
allow_multiple_clients = True)
def _is_1(expr):
    """Return True iff `expr` is a constant whose value is (close to) 1.

    :rtype: bool
    """
    try:
        value = opt.get_constant_value(expr)
    except TypeError:
        # `expr` is not a view of constant data
        return False
    return numpy.allclose(value, 1)
log1msigm_to_softplus = gof.PatternSub(
(tensor.log,
(tensor.sub,
dict(pattern='y', constraint = _is_1),
(sigmoid, 'x'))),
(tensor.neg, (softplus, 'x')),
allow_multiple_clients = True)
opt.register_stabilize(logsigm_to_softplus, name = 'logsigm_to_softplus')
opt.register_stabilize(log1msigm_to_softplus, name = 'log1msigm_to_softplus')
def is_1pexp(t):
    """If `t` has the form (1 + exp(x)), return (False, x); else None.

    The leading False is a "negated" flag kept for symmetry with
    `is_exp`, so both can feed `partition_num_or_denom`.
    """
    if t.owner and t.owner.op == tensor.add:
        scalars, scalar_inputs, nonconsts = \
                opt.scalarconsts_rest(t.owner.inputs)
        # scalar_inputs are potentially dimshuffled and fill'd scalars
        if len(nonconsts) == 1:
            maybe_exp = nonconsts[0]
            if maybe_exp.owner and maybe_exp.owner.op == tensor.exp:
                # BUGFIX: the constant part must actually sum to 1;
                # previously (c + exp(x)) matched for any constant c,
                # so e.g. 1/(2+exp(x)) was wrongly rewritten to sigm(-x).
                if numpy.allclose(numpy.sum(scalars), 1):
                    return False, maybe_exp.owner.inputs[0]
    return None
def is_exp(t):
    """If `t` is exp(x) or -exp(x), return (negated, x); otherwise None.

    `negated` tells whether the expression carried a leading negation.
    """
    negated = False
    if t.owner and t.owner.op == tensor.neg:
        negated = True
        t = t.owner.inputs[0]
    if t.owner and t.owner.op == tensor.exp:
        return negated, t.owner.inputs[0]
    return None
def partition_num_or_denom(r, f):
    """Split the multiplicative factors of `r` using classifier `f`.

    `f(term)` returns either None or a (negated, payload) pair.  Returns
    (payloads, unmatched_terms, sign_flipped) where `sign_flipped` is the
    xor of all the `negated` flags seen.
    """
    if r.owner and r.owner.op == tensor.mul:
        factors = r.owner.inputs
    else:
        factors = [r]

    payloads = []
    unmatched = []
    sign_flipped = False
    for factor in factors:
        verdict = f(factor)
        if verdict is None:
            unmatched.append(factor)
        else:
            negated, payload = verdict
            payloads.append(payload)
            sign_flipped ^= negated  # bit flip if negated is true
    return payloads, unmatched, sign_flipped
@opt.register_stabilize
@gof.local_optimizer([tensor.true_div])
def local_exp_over_1_plus_exp(node):
    """exp(x)/(1+exp(x)) -> sigm(x)
    c/(1+exp(x)) -> c*sigm(-x)
    """
    # this optimization should be done for numerical stability
    # so we don't care to check client counts
    if node.op == tensor.true_div:

        #find all the exp() terms in the numerator
        num, denom = node.inputs
        num_exp_x, num_rest, num_neg = partition_num_or_denom(num, is_exp)
        denom_1pexp, denom_rest, denom_neg = partition_num_or_denom(denom, is_1pexp)

        sigmoids = []
        for t in denom_1pexp:
            if t in num_exp_x:
                # case: exp(x) /(1+exp(x))
                sigmoids.append(sigmoid(t))
                del num_exp_x[num_exp_x.index(t)]
            else:
                # case: 1/(1+exp(x))
                sigmoids.append(sigmoid(-t))

        if not sigmoids: # we didn't find any.  abort
            return
        # put the new numerator together: the sigmoids, the leftover
        # (unmatched) exp factors, and the non-exp factors
        new_num = sigmoids + [tensor.exp(t) for t in num_exp_x] + num_rest
        if len(new_num) == 1:
            new_num = new_num[0]
        else:
            new_num = tensor.mul(*new_num)

        # an odd total number of negations flips the sign of the result
        if num_neg ^ denom_neg:
            new_num = -new_num

        # divide by whatever denominator factors were not (1+exp(..))
        if len(denom_rest) == 0:
            return [new_num]
        elif len(denom_rest) == 1:
            return [new_num / denom_rest[0]]
        else:
            return [new_num / tensor.mul(*denom_rest)]
@opt.register_stabilize
@gof.local_optimizer([tensor.mul])
def local_sigm_times_exp(node):
    """
    exp(x)*sigm(-x) -> sigm(x)
    exp(-x)*sigm(x) -> sigm(-x)

    (The old docstring said `-> -sigm(x)`, but exp(x)*sigm(-x) equals
    exp(x)/(1+exp(x)) = sigm(x); no sign flip is involved.)
    """
    # this is a numerical stability thing, so we dont check clients
    if node.op == tensor.mul:
        exp_x = []          # arguments u of factors exp(u)
        exp_minus_x = []    # arguments u of factors exp(-u)
        sigm_x = []         # arguments u of factors sigm(u)
        sigm_minus_x = []   # arguments u of factors sigm(-u)
        other = []
        neg = False
        for i in node.inputs:
            # strip any number of leading negations, tracking overall sign
            while i.owner and i.owner.op == tensor.neg:
                neg ^= True
                i = i.owner.inputs[0]
            if i.owner and i.owner.op == tensor.exp:
                exp_arg = i.owner.inputs[0]
                if exp_arg.owner and exp_arg.owner.op == tensor.neg:
                    exp_minus_x.append(exp_arg.owner.inputs[0])
                else:
                    exp_x.append(exp_arg)
            elif i.owner and i.owner.op == sigmoid:
                sigm_arg = i.owner.inputs[0]
                if sigm_arg.owner and sigm_arg.owner.op == tensor.neg:
                    sigm_minus_x.append(sigm_arg.owner.inputs[0])
                else:
                    sigm_x.append(sigm_arg)
            else:
                other.append(i)

        # remove matched pairs in exp_x and sigm_minus_x:
        # exp(x) * sigm(-x) == sigm(x)
        did_something = False
        for i in exp_x:
            if i in sigm_minus_x:
                del sigm_minus_x[sigm_minus_x.index(i)]
                other.append(sigmoid(i))
                did_something = True
            else:
                # BUGFIX: an unmatched factor must be re-wrapped in exp();
                # the old code appended the bare argument `i`, silently
                # replacing exp(i) by i in the rebuilt product.
                other.append(tensor.exp(i))

        # remove matched pairs in exp_minus_x and sigm_x:
        # exp(-x) * sigm(x) == sigm(-x)
        for i in exp_minus_x:
            if i in sigm_x:
                del sigm_x[sigm_x.index(i)]
                # BUGFIX: was `sigm(-i)` -- `sigm` is not defined in this
                # module, so this branch raised NameError whenever it ran.
                other.append(sigmoid(-i))
                did_something = True
            else:
                other.append(tensor.exp(-i))

        if did_something:
            terms = other + [sigmoid(x) for x in sigm_x] \
                    + [sigmoid(-x) for x in sigm_minus_x]
            if len(terms) > 1:
                rval = tensor.mul(*terms)
            else:
                rval = terms[0]
            if neg:
                return [-rval]
            else:
                return [rval]
@opt.register_stabilize
@gof.local_optimizer([tensor.inv])
def local_inv_1_plus_exp(node):
    """
    1/(1+exp(x)) -> sigm(-x)
    """
    # this optimization should be done for numerical stability
    # so we don't care to check client counts
    if node.op == tensor.inv:
        inv_arg = node.inputs[0]
        if inv_arg.owner and inv_arg.owner.op == tensor.add:
            scalars, scalar_inputs, nonconsts = \
                    opt.scalarconsts_rest(inv_arg.owner.inputs)
            # scalar_inputs are potentially dimshuffled and fill'd scalars
            if len(nonconsts) == 1:
                if nonconsts[0].owner and nonconsts[0].owner.op == tensor.exp:
                    # the additive constants must sum to exactly 1 for the
                    # sigmoid identity to hold
                    if scalars and numpy.allclose(numpy.sum(scalars), 1):
                        # re-apply any fill()s that wrapped the constants so
                        # the result keeps the original broadcast pattern
                        return opt._fill_chain(
                                sigmoid(tensor.neg(nonconsts[0].owner.inputs[0])),
                                scalar_inputs)
#@opt.register_canonicalize
@gof.local_optimizer([tensor.inv])
def local_1msigmoid(node):
"""
1-sigm(x) -> sigm(-x)
"""
if node.op == tensor.sub:
sub_l, sub_r = node.inputs
if len(sub_r.clients) > 1:
return # graph is using both sigm and 1-sigm
if sub_r.owner and sub_r.owner.op == sigmoid:
try:
val_l = opt.get_constant_value(sub_l)
except Exception, e:
return
if numpy.allclose(numpy.sum(val_l), 1):
return [sigmoid(-sub_r.owner.inputs[0])]
import unittest
import theano
from theano import tensor as T
from theano import gof
import numpy
from theano.tests import unittest_tools as utt
from theano.tensor.tests import test_basic as TT
from theano.tensor.nnet import *
class T_sigmoid(unittest.TestCase):
    """Numerically verify the gradient of the sigmoid elemwise op."""
    def setUp(self):
        # make verify_grad's random perturbations reproducible
        utt.seed_rng()

    def test_elemwise(self):
        utt.verify_grad(sigmoid, [numpy.random.rand(3,4)])
class T_softplus(unittest.TestCase):
    """Numerically verify the gradient of the softplus elemwise op."""
    def setUp(self):
        # make verify_grad's random perturbations reproducible
        utt.seed_rng()

    def test_elemwise(self):
        utt.verify_grad(softplus, [numpy.random.rand(3,4)])
class T_sigmoid_opts(unittest.TestCase):
    """Check that the sigmoid stabilization rewrites reduce each test
    graph to exactly the expected sequence of ops."""

    def test_exp_over_1_plus_exp(self):
        m = theano.config.mode
        if m == 'FAST_COMPILE':
            # the rewrites under test are tagged fast_run only
            m = 'FAST_RUN'
        x = T.dvector()

        # tests exp_over_1_plus_exp
        f = theano.function([x], T.exp(x)/(1+T.exp(x)), mode=m)
        #theano.printing.debugprint(f)
        assert [node.op for node in f.maker.env.toposort()] == [sigmoid]

        # tests inv_1_plus_exp
        f = theano.function([x], T.fill(x,1.0) / (1+T.exp(-x)), mode=m)
        #theano.printing.debugprint(f)
        assert [node.op for node in f.maker.env.toposort()] == [sigmoid]

        # tests inv_1_plus_exp with neg
        f = theano.function([x], T.fill(x,-1.0) / (1+T.exp(-x)), mode=m)
        #theano.printing.debugprint(f)
        assert [node.op for node in f.maker.env.toposort()] == [sigmoid,
                T.inplace.neg_inplace]

        # tests double inv_1_plus_exp with neg
        f = theano.function([x], (T.fill(x,-1.0)*T.exp(x)) / ((1+T.exp(x))*(1+T.exp(-x))), mode=m)
        #theano.printing.debugprint(f)
        assert [node.op for node in f.maker.env.toposort()] == [sigmoid,
                T.mul]

    def test_1msigmoid(self):
        m = theano.config.mode
        if m == 'FAST_COMPILE':
            m = 'FAST_RUN'
        x = T.fmatrix()

        # tests exp_over_1_plus_exp
        f = theano.function([x], 1 - T.exp(x)/(1+T.exp(x)), mode=m)
        theano.printing.debugprint(f)
        assert [node.op for node in f.maker.env.toposort()] == [tensor.neg, sigmoid_inplace]

        # tests inv_1_plus_exp
        f = theano.function([x], 1 - T.fill(x,1.0) / (1+T.exp(-x)), mode=m)
        theano.printing.debugprint(f)
        assert [node.op for node in f.maker.env.toposort()] == [tensor.neg,
                sigmoid_inplace]
......@@ -44,34 +44,65 @@ def _fill_chain(new_out, orig_inputs):
new_out = T.fill(i, new_out)
return [new_out]
def get_constant_value(v, fill=False):
"""return the constant value underlying variable `v`
def encompasses_broadcastable(b1, b2):
    """Return True iff a tensor with broadcastable pattern `b2` can be
    broadcast to the shape of one with pattern `b1`, and not vice versa.

    :param b1: the broadcastable attribute of a tensor type
    :param b2: the broadcastable attribute of a tensor type
    """
    if len(b2) > len(b1):
        return False
    # align b2 against the trailing dimensions of b1, numpy-style
    tail = b1[len(b1) - len(b2):]
    for v1, v2 in zip(tail, b2):
        # a dim broadcastable in b1 but not in b2 cannot be covered
        if v1 and not v2:
            return False
    return True
def merge_broadcastables(broadcastables):
    """Combine several broadcastable patterns dimension by dimension:
    a merged dim is broadcastable only if it is broadcastable in every
    pattern.  Truncates to the shortest pattern (zip semantics)."""
    merged = []
    for dims in zip(*broadcastables):
        merged.append(all(dims))
    return merged
def get_constant_value(v):
"""return the constant scalar(0-D) value underlying variable `v`
If v is the output of dimshuffles, fills, this function digs through them.
If `v` is not some view of constant data, then raise a TypeError.
if fill is True, then it returns (v, [...]) where the second term is a list of variables
that were used in the fill expressions
:note: There may be another function similar to this one in the code, but I'm not sure where it
is.
"""
if isinstance(v, gof.Constant):
if fill:
return v.data, []
return v.data
#TODO: consider checking for arrays of the form e.g. [1,1,1,1] where
# it is not a constant, but in some cases it *could* be replaced with one.
# Note that this would have an effect on the broadcasting of inputs and so on
try:
complex(v.data) #works for all numeric scalars
return v.data
except:
raise TypeError(v)
if v.owner and isinstance(v.owner.op, T.DimShuffle):
return get_constant_value(v.owner.inputs[0], fill=fill)
if fill:
if v.owner and v.owner.op == T.fill:
shape, val = v.owner.inputs
# fill(a,b) fills the shape of 'a' filled with 'b'
rval, rshapes = get_constant_value(val, fill=fill)
return rval, rshapes + [shape]
return get_constant_value(v.owner.inputs[0])
if v.owner and v.owner.op == T.fill:
shape, val = v.owner.inputs
# fill(a,b) fills the shape of 'a' filled with 'b'
return get_constant_value(val)
raise TypeError(v)
def scalarconsts_rest(inputs):
    """Partition a list of variables into two kinds:
    scalar constants, and the rest.

    :returns: (consts, origconsts, nonconsts) -- the unwrapped constant
        values, the original constant variables, and the remaining
        variables, all in input order.
    """
    consts = []
    origconsts = []
    nonconsts = []
    for i in inputs:
        try:
            v = get_constant_value(i)
            consts.append(v)
            origconsts.append(i)
        except Exception:
            # get_constant_value raises TypeError for non-constants; the
            # old bare `except:` also swallowed KeyboardInterrupt/SystemExit.
            nonconsts.append(i)
    return consts, origconsts, nonconsts
@gof.optimizer
def insert_inplace_optimizer(env):
"""
......@@ -124,6 +155,11 @@ def register_specialize(lopt, *tags, **kwargs):
compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
return lopt
def register_stabilize(lopt, *tags, **kwargs):
    """Register `lopt` in the 'stabilize' stage of the optimizer database
    under the 'fast_run' tag (plus any extra `tags`), and return it so it
    can be used as a decorator.  An explicit name may be passed via the
    `name` keyword; otherwise the optimizer's __name__ is used.
    """
    # pop with a default: the old `(kwargs and kwargs.pop('name'))` raised
    # KeyError when other keyword arguments were passed without 'name'
    name = kwargs.pop('name', None) or lopt.__name__
    compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
    return lopt
######################
# DimShuffle lifters #
######################
......@@ -164,130 +200,321 @@ register_canonicalize(local_dimshuffle_lift)
#################
# Shape lifters #
#################
#####################################
# ShapeFeature, Shape optimizations
#####################################
@gof.local_optimizer([T._shape, None])
def local_shape_lift_elemwise(node):
"""
shape(elemwise_op(..., x, ...)) -> shape(x)
class MakeVector(T.Op):
"""Concatenate a number of scalars together into a vector
Where x contains the maximal shape information.
This is a simple version of stack() that introduces far less cruft into the graph.
"""
if not opt.check_chain(node, T._shape, T.Elemwise):
return False
output = node.inputs[0]
parent = output.owner
for input in parent.inputs:
if input.type.broadcastable == output.type.broadcastable:
return T._shape(input),
return False
register_canonicalize(local_shape_lift_elemwise, 'shape_lift')
register_specialize(local_shape_lift_elemwise, 'shape_lift')
def __init__(self, dtype='int64'):
self.dtype = dtype
def __eq__(self, other):
return type(self) == type(other) and self.dtype == other.dtype
def __hash__(self):
return hash(type(self)) ^ hash(self.dtype)
def make_node(self, *inputs):
inputs = map(T.as_tensor_variable, inputs)
if not all(a.type == inputs[0].type for a in inputs):
raise TypeError('This MakeVector instance requires inputs of same type %s' %
inputs[0].type)
if inputs:
dtype = inputs[0].type.dtype
else:
dtype = self.dtype
#bcastable = (len(inputs) == 1)
bcastable = False
otype = T.TensorType(
broadcastable=(bcastable,),
dtype=dtype)
return T.Apply(self, inputs, [otype()])
def __str__(self):
return self.__class__.__name__
def perform(self, node, inputs, (out,)):
out[0] = T.numpy.asarray(inputs)
make_vector = MakeVector()
class MakeVectorPrinter:
    """Pretty-printer rendering a make_vector apply as "[a, b, c]"."""
    def process(self, r, pstate):
        # only variables produced by a MakeVector apply can be printed
        if r.owner is not None and isinstance(r.owner.op, MakeVector):
            pieces = [pstate.pprinter.process(elem, pstate.clone(precedence = 1000))
                      for elem in r.owner.inputs]
            return "[%s]" % ", ".join(pieces)
        raise TypeError("Can only print make_vector.")
T.pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, MakeVector), MakeVectorPrinter())
@gof.local_optimizer([T._shape, None])
def local_shape_lift_sum(node):
class Shape_i(T.Op):
"""
shape(sum{n}(x)) -> [shape(x)[0], ..., shape(x)[n-1], shape(x)[n+1], ...]
"""
if not opt.check_chain(node, T._shape, T.Sum):
return False
L{Op} to return the shape of a matrix.
input = node.inputs[0].owner.inputs[0]
axis = node.inputs[0].owner.op.axis
if axis is None:# or len(axis) != 1:
axis = range(input.type.ndim)
@note: Non-differentiable.
"""
def __init__(self, i):
self.i = i
def __hash__(self):
return hash(type(self)) ^ self.i
def __eq__(self, other):
return type(self) == type(other) and self.i == other.i
def __str__(self):
return '%s{%i}'%(self.__class__.__name__, self.i)
def make_node(self, x):
x = T.as_tensor_variable(x)
if x.ndim <= self.i:
raise TypeError('x has too few dimensions for Shape_i', (x, self.i))
return T.Apply(self, [x], [T.lscalar()])
def perform(self, node, (x, ), (out, )):
out[0] = theano._asarray(x.shape[self.i], dtype = 'int64')
def grad(self, (x,), (gz,)):
return [None]
class ShapeFeature(object):
"""Graph optimizer for removing all calls to shape()
This optimizer replaces all Shapes and Subtensors of Shapes with Shape_i and MakeVector
Ops.
This optimizer has several goals:
1. to 'lift' Shapes to as close to the inputs as possible.
2. to infer the shape of every node in the graph in terms of the input shapes.
3. remove all fills (T.second, T.fill) from the graph
Lifting shapes as close to the inputs as possible is important for canonicalization because
it is very bad form to have to compute something just to know how big it will be. Firstly,
it is a waste of time to compute such outputs. But it is important to get rid of these
outputs as early as possible in the compilation process because the
extra computations make it appear as if many internal graph nodes have multiple clients.
Many optimizations refuse to work on nodes with multiple clients.
Lifting is done by using an `<Op>.infer_shape` function if one is present, or else using a
conservative default. An Op that supports shape-lifting should define a
infer_shape(self, node, input_shapes) function. The argument input_shapes is a tuple
of tuples... there is an interior tuple for each input to the node. The tuple has as many
elements as dimensions. The element in position i of tuple j represents the i'th shape
component of the j'th input. The function should return a tuple of tuples. One output
tuple for each node.output. Again, the i'th element of the j'th output tuple represents
the output[j].shape[i] of the function. If an output is not a TensorType, then None should
be returned instead of a tuple for that output.
For example the infer_shape for a matrix-matrix product would accept
input_shapes=((x0,x1), (y0,y1)) and return ((x0, y1),).
Inferring the shape of internal nodes in the graph is important for doing size-driven
optimizations. If we know how big various intermediate results will be, we can estimate
the cost of many Ops accurately, and generate c-code that is specific [e.g. unrolled] to
particular sizes.
ish = T._shape(input)
return T.make_lvector.make_node(*(ish[i] for i in xrange(input.type.ndim) if i not in axis)).outputs
# return T.vertical_stack.make_node(ish[:axis], ish[axis+1:]).outputs
.. note::
register_canonicalize(local_shape_lift_sum, 'shape_lift')
Right now there is only the ConvOp that can really take advantage of this shape
inference, but it is worth it even just for the ConvOp. All that's necessary to do
shape inference is 1) to mark shared inputs as having a particular shape,
either via a .tag or some similar hacking; and 2) to add an optional Param() argument
to promise that inputs will have a certain shape (or even to have certain shapes in
certain dimensions).
@gof.local_optimizer([T._shape, T.dot])
def local_shape_lift_dot(node):
"""
shape(dot(a, b)) -> [shape(a)[0], shape(b)[1]]
"""
if not opt.check_chain(node, T._shape, T.dot):
return False
a, b = node.inputs[0].owner.inputs
if a.type.ndim == 2 and b.type.ndim == 2:
return T.make_lvector.make_node(T._shape(a)[0], T._shape(b)[1]).outputs
elif a.type.ndim == 1 and b.type.ndim == 2:
return T.make_lvector.make_node(T._shape(b)[1]).outputs
elif a.type.ndim == 2 and b.type.ndim == 1:
return T.make_lvector.make_node(T._shape(a)[0]).outputs
elif a.type.ndim == 1 and b.type.ndim == 1:
return T.make_lvector.make_node().outputs
else:
return False
register_canonicalize(local_shape_lift_dot, 'shape_lift')
# local_shape_lift = opt.LocalOptGroup(local_shape_lift_elemwise,
# local_shape_lift_sum,
# local_shape_lift_dot)
def shape_i(self, i):
def op_deco(r):
if r.type.broadcastable[i]:
return self.lscalar_one
else:
return Shape_i(i)(r)
return op_deco
def shape_tuple(self, r):
return tuple([self.shape_i(i)(r) for i in xrange(r.ndim)])
################
# Fill lifters #
################
def default_infer_shape(self, node, i_shapes):
rval = []
for r in node.outputs:
try:
rval.append(self.shape_tuple(r))
except AttributeError:
rval.append(None)
return rval
def unpack(self, s_i):
# unpack the s_i that the Op returned
assert s_i is not None
if s_i == 1:
# don't make the optimizer merge a zillion ones together
return self.lscalar_one
if type(s_i) is int:
# this shape is a constant
assert s_i >= 0
return T.constant(s_i, dtype='int64')
if type(s_i) in (tuple,list):
# this dimension is the same as many of the inputs
# which tells us that if one of the inputs is known,
# the others all become known.
# TODO: should be implemented in Elemwise, and Dot
#
# worst case, we loop over shape_of and replace things
raise NotImplementedError(s_i)
elif s_i.type == T.lscalar:
return s_i
else:
raise TypeError('Unsupported shape element', s_i)
def encompasses_broadcastable(b1, b2):
"""
Returns True if the broadcastable patterns b1 and b2 are such that b2 is
broadcasted to b1's shape and not the opposite.
def set_shape(self, r, s):
assert r not in self.shape_of
if s is None:
self.shape_of[r] = s
else:
self.shape_of[r] = tuple([self.unpack(s_i) for s_i in s])
:param b1: the broadcastable attribute of a tensor type
:param b2: the broadcastable attribute of a tensor type
"""
if len(b1) < len(b2):
return False
b1 = b1[-len(b2):]
return not any(v1 and not v2 for v1, v2 in zip(b1, b2))
def make_vector_shape(self, r):
return make_vector(*self.shape_of[r])
#
#
# Feature inteface
#
#
def on_attach(self, env):
assert not hasattr(env, 'shape_feature')
env.shape_feature = self
self.shape_of = {} # Variable -> tuple(scalars) or None (All tensor vars map to tuple)
self.lscalar_one = T.constant(1, dtype='int64')
assert self.lscalar_one.type == T.lscalar
for node in env.toposort():
self.on_import(env, node)
def on_import(self, env, node):
if node.outputs[0] in self.shape_of:
# this is a revert, not really an import
for r in node.outputs + node.inputs:
assert r in self.shape_of
return
for i, r in enumerate(node.inputs):
# make sure we have shapes for the inputs
if r not in self.shape_of:
try:
self.set_shape(r, self.shape_tuple(r))
except AttributeError:
self.set_shape(r, None ) # not a TensorType variable
def merge_broadcastables(broadcastables):
return [all(bcast) for bcast in zip(*broadcastables)]
try:
shape_infer = node.op.infer_shape
except AttributeError:
shape_infer = self.default_infer_shape
@gof.local_optimizer([T.fill, None])
def local_fill_lift(node):
try:
o_shapes = shape_infer(node, [self.shape_of[r] for r in node.inputs])
except Exception, e:
_logger.error('Failed to infer_shape from Op %s (i_shapes=%s): %s %s'% (node.op,
[self.shape_of[r] for r in node.inputs],
type(e), str(e)))
o_shapes = default_infer_shape(node, [self.shape_of[r] for r in node.inputs])
# this is packed information
# an element of o_shapes is either None or a tuple
# elements of the tuple can be either strings, or ints
assert len(o_shapes) == len(node.outputs)
for r, s in zip(node.outputs, o_shapes):
self.set_shape(r, s)
def on_change_input(self, env, mode, i, r, new_r):
# TODO:
# This tells us that r and new_r must have the same shape
# if we didn't know that the shapes are related, now we do.
pass
class ShapeOptimizer(Optimizer):
"""Optimizer that serves to add ShapeFeature as an env feature.
"""
fill(f(a), b) -> fill(a, b)
If a.type == f(a).type.
def __init__(self):
Optimizer.__init__(self)
fill(a, b) -> b
If a.type == b.type.
"""
if not opt.check_chain(node, T.fill):
return False
def add_requirements(self, env):
env.extend(ShapeFeature())
model, filling = node.inputs
def apply(self, env):
pass
mb, fb = model.type.broadcastable, filling.type.broadcastable
if model.type.dtype == filling.type.dtype and encompasses_broadcastable(fb, mb):
return False# [filling]
# -1 should make it run right before the first merge
theano.compile.mode.optdb.register('ShapeOpt', ShapeOptimizer(), -1, 'fast_run', 'fast_compile')
parent = model.owner
if parent is None or not isinstance(parent, T.Elemwise):
return False
for input in parent.inputs:
if input.type == model.type:
return [T.fill(input, filling)]
@register_specialize
@register_canonicalize
@gof.local_optimizer([T.fill])
def local_fill_to_alloc(node):
"""fill(s,v) -> alloc(v, shape(s))
return False
This is an important optimization because with the shape_to_shape_i optimization, the
dependency on 's' is often removed.
"""
if node.op == T.fill:
r, v = node.inputs
if v.type == node.outputs[0].type:
# this is a useless fill, erase it.
rval = [v]
elif v.type.broadcastable == node.outputs[0].type.broadcastable:
# this is a cast
rval = [T.cast(v, node.outputs[0].type.dtype)]
else:
# we are broadcasting v somehow
shape_of = node.env.shape_feature.shape_of
# TODO: cut out un-necessary dimshuffles of v
rval = [T.Alloc(node.outputs[0].dtype)(v, *shape_of[node.outputs[0]])]
assert rval[0].type == node.outputs[0].type
return rval
register_canonicalize(local_fill_lift, 'fill_lift')
@register_specialize
@register_canonicalize
@gof.local_optimizer([T._shape])
def local_shape_to_shape_i(node):
    """Replace shape(x) with a make_vector of the per-dimension Shape_i
    variables recorded by the env's ShapeFeature."""
    if node.op != T._shape:
        return
    feature = node.env.shape_feature
    return [feature.make_vector_shape(node.inputs[0])]
@register_specialize
@register_canonicalize
@gof.local_optimizer([T.Subtensor])
def local_subtensor_make_vector(node):
    """Lift constant indexing through make_vector:
    [a,b,c][0] -> a and [a,b,c][0:2] -> [a,b], for python-int indices,
    constant tensor indices, and slices.
    """
    # replace all subtensor(make_vector) like:
    # [a,b,c][0] -> a
    # [a,b,c][0:2] -> [a,b]
    # we can do this for constant indexes
    if isinstance(node.op, T.Subtensor):
        # NOTE(review): shape_feature is fetched but never used here
        shape_feature = node.env.shape_feature
        x = node.inputs[0]
        if x.owner and x.owner.op == make_vector:
            try:
                idx, = node.op.idx_list
            except:
                #'how can you have multiple indexes into a shape?'
                raise
            if isinstance(idx, int):
                # a python int: pick out the corresponding element
                return [x.owner.inputs[idx]]
            elif isinstance(idx, T.TensorVariable):
                # if it is a constant we can do something with it
                try:
                    v = get_constant_value(idx)
                    return [x.owner.inputs[v]]
                except:
                    # not constant: leave the graph unchanged
                    pass
            else:
                # it is a slice of ints and/or Variables
                #TODO: check subtensor to see if it can contain constant variables,
                #      and if it can, then try to unpack them.
                try:
                    # rebuild a (smaller) make_vector from the sliced inputs
                    return [make_vector(*x.owner.inputs.__getitem__(idx))]
                except TypeError:
                    # slice contained non-int entries; leave unchanged
                    pass
                except:
                    _logger.error('failed to index with "%s"' % str(idx))
                    raise
##################
# Subtensor opts #
......@@ -306,38 +533,6 @@ def local_subtensor_unary(node):
x_idx = node.op(u.owner.inputs[0], *idx)
return [u.owner.op(x_idx)]
@gof.local_optimizer([None, None])
def local_subtensor_make_vector(node):
    """
    [a,b,c][0] -> a
    [a,b,c][0:2] -> [a,b]

    If the index or slice is constant.
    """
    if not opt.check_chain(node, T.Subtensor, T.MakeVector):
        return False
    joined_r = node.inputs[0]
    try:
        # check that join is being used to join scalars
        veclen = T.join.vec_length(joined_r)
    except Exception:
        return False
    idxlist = node.op.idx_list
    if len(idxlist) != 1:
        return False
    idx = idxlist[0]
    if isinstance(idx, int):
        return [joined_r.owner.inputs[idx]]
    try:
        # idx is a slice object: take the matching sub-list of scalars.
        # BUGFIX: was `node.owner.inputs[0]` (Apply nodes have no .owner ->
        # AttributeError) and `__getslice__(idx)` (wrong arity: always
        # raised TypeError, so this path never fired); also the result must
        # be wrapped in a list, as local optimizers return lists.
        return [T.make_vector(*joined_r.owner.inputs[idx])]
    except TypeError:
        # slice contains symbolic bounds; cannot unpack statically
        return False
register_canonicalize(local_subtensor_make_vector)
@register_canonicalize
@gof.local_optimizer([None])
def local_IncSubtensor_serialize(node):
......@@ -581,13 +776,18 @@ class Canonizer(gof.LocalOptimizer):
# the dtype of the 'input' argument. The leaf-Variables of the graph covered by the
# recursion may be of any Variable type.
if len(input.clients) > 1:
# this logic is too conservative, but doing it is better than not doing it.
#
# we don't want to canonize a subgraph that we will need to compute anyway for the other clients.
# This check is too conservative because if the other clients are also in the subgraph we are canonizing,
# then we should [probably?] recurse anyway.
return [input], []
if 0:
# UPDATE: This logic makes it impossible to recognize some important patterns
# (e.g. variants on the x/x)
# and it is screwing up the RBM free energy gradient.
#TODO: review this
if len(input.clients) > 1:
# this logic is too conservative, but doing it is better than not doing it.
#
# we don't want to canonize a subgraph that we will need to compute anyway for the other clients.
# This check is too conservative because if the other clients are also in the subgraph we are canonizing,
# then we should [probably?] recurse anyway.
return [input], []
if input.owner is None or input.owner.op not in [self.main, self.inverse, self.reciprocal]:
if input.owner and isinstance(input.owner.op, T.DimShuffle):
......@@ -835,8 +1035,8 @@ class Canonizer(gof.LocalOptimizer):
# Here we make the canonical version of the graph around this node
# See the documentation of get_num_denum and simplify
orig_num, orig_denum = self.get_num_denum(node.outputs[0])
num, denum = list(orig_num), list(orig_denum)
num, denum = self.simplify(num, denum)
num, denum = self.simplify(list(orig_num), list(orig_denum))
def same(x, y):
    """True iff x and y are equal-length sequences whose items are
    element-wise equal (items may be numpy arrays)."""
    if len(x) != len(y):
        return False
    return all(N.all(a == b) for a, b in zip(x, y))
......@@ -935,6 +1135,60 @@ def local_sum_mul_by_scalar(node):
if thing_summed.owner and thing_summed.owner.op == T.neg:
return [T.neg(node.op(thing_summed.owner.inputs[0]))]
@register_canonicalize
@gof.local_optimizer([])
def local_sum_all_to_none(node):
    """Sum{0,1,...N} -> Sum{}"""
    if not isinstance(node.op, T.Sum):
        return
    axis = node.op.axis
    if axis is None:
        return
    # if every axis is named explicitly, rewrite with axis=None as a
    # canonical shorthand; this permits more merging
    x = node.inputs[0]
    if set(axis) == set(range(x.type.ndim)):
        return [T.Sum(axis=None)(x)]
@register_canonicalize
@gof.local_optimizer([])
def local_sum_sum(node):
    """Sum(Sum(x)) -> Sum(x) with a combined axis argument."""
    if not isinstance(node.op, T.Sum):
        return
    summed, = node.inputs
    if len(summed.clients) != 1:
        # the inner sum is used elsewhere; merging would duplicate work
        return
    if not (summed.owner and isinstance(summed.owner.op, T.Sum)):
        return
    if summed.owner.op.axis is None:
        # special case of local_cut_useless_reduce
        return [T.Sum(None)(summed.owner.inputs[0])]
    if node.op.axis is None:
        # we're summing up everything anyway so lets
        # do it all at once
        return [T.Sum(None)(summed.owner.inputs[0])]
    # figure out which dimensions of the original input are preserved
    alldims = range(summed.owner.inputs[0].type.ndim)
    # trim out the dimensions that were removed by the first sum
    # BUGFIX: the preserved positions are those NOT in the first sum's axis
    alldims = [d for i, d in enumerate(alldims)
               if i not in summed.owner.op.axis]
    # trim out the dimensions removed by the second sum; here `i` indexes
    # positions of the already-reduced intermediate tensor
    alldims = [d for i, d in enumerate(alldims) if i not in node.op.axis]
    # the combined Sum removes every original dimension that did not survive
    newaxis = [i for i in xrange(summed.owner.inputs[0].type.ndim)
               if i not in alldims]
    combined_sum = T.Sum(newaxis)
    return [combined_sum(summed.owner.inputs[0])]
@register_canonicalize
@gof.local_optimizer([])
def local_cut_useless_reduce(node):
    """Sum(a, axis=[]) -> a """
    if not isinstance(node.op, T.CAReduce):
        return
    reduced, = node.inputs
    # if the reduce were doing anything, the output type (ndim) would differ
    if reduced.type == node.outputs[0].type:
        return [reduced]
@gof.local_optimizer([T.mul])
def local_mul_to_neg(node):
if node.op == T.mul and N.all(local_mul_canonizer.get_constant(node.inputs[0]) == -1.0):
......@@ -1143,30 +1397,23 @@ register_specialize(local_add_specialize)
mul_canonizer = in2out(gof.LocalOptGroup(local_mul_canonizer, local_fill_cut, local_fill_sink))
@register_specialize
@register_stabilize
@gof.local_optimizer([T.log])
def local_log1p(node):
    """log(1 + x) -> log1p(x), for numerical stability near x == 0."""
    if node.op == T.log:
        log_arg, = node.inputs
        if log_arg.owner and log_arg.owner.op == T.add:
            scalars, scalar_inputs, nonconsts = \
                    scalarconsts_rest(log_arg.owner.inputs)
            # scalar_inputs are potentially dimshuffled and fill'd scalars
            if scalars and numpy.allclose(numpy.sum(scalars), 1):
                if not nonconsts:
                    pass  # leave for constant-merge
                # BUGFIX: this must be elif — with no non-constant terms the
                # old code fell through and called T.add() with no arguments
                elif len(nonconsts) == 1:
                    return _fill_chain(T.log1p(nonconsts[0]), scalar_inputs)
                else:
                    return _fill_chain(T.log1p(T.add(*nonconsts)),
                                       scalar_inputs)
def add_calculate(num, denum, aslist = False, out_type=None):
......
......@@ -136,10 +136,7 @@ class RandomFunction(gof.Op):
draw.
"""
if shape == () or shape == []:
shape = tensor.as_tensor_variable(shape, dtype='int64')
else:
shape = tensor.as_tensor_variable(shape, ndim=1)
shape = tensor.as_tensor_variable(shape, ndim=1)
assert shape.type.ndim == 1
assert (shape.type.dtype == 'int64') or (shape.type.dtype == 'int32')
if not isinstance(r.type, RandomStateType):
......@@ -158,6 +155,22 @@ class RandomFunction(gof.Op):
[r, shape] + args,
[r.type(), self.outtype()])
def infer_shape(self, node, i_shapes):
    """Return shapes for the two outputs (new rng, samples).

    The rng output has no tensor shape, so None is returned for it; the
    samples' shape is taken from the symbolic 'shape' input when possible.
    """
    r, shp = node.inputs[0:2]
    # if shp is a constant array of len 0, then it means 'automatic shape'
    unknown_shape = len(getattr(shp, 'data', [0,1,2])) == 0
    # if ndim_added == 0 and shape != () then the sample shape is exactly
    # the symbolic shape input
    if self.ndim_added == 0 and not unknown_shape:
        sample_shp = shp
    else:
        # if shape == () then it will depend on args
        # if ndim_added != 0 and shape != () then it will depend on args
        # NOTE(review): falling back to the output's own .shape gives the
        # shape machinery no static information — presumably a deliberate
        # "unknown" answer; confirm.
        sample_shp = node.outputs[1].shape
    return [None, [sample_shp[i] for i in xrange(node.outputs[1].ndim)]]
def perform(self, node, inputs, (rout, out)):
# Use self.fn to draw shape worth of random numbers.
# Numbers are drawn from r if self.inplace is True, and from a copy of r if
......
......@@ -89,7 +89,6 @@ class test_greedy_distribute(unittest.TestCase):
g = Env([a,b,c,d,x,y,z], [e])
##print pprint(g.outputs[0])
mul_canonizer.optimize(g)
gof.TopoOptimizer(gof.LocalOptGroup(local_fill_cut, local_fill_lift), order = 'out_to_in').optimize(g)
gof.TopoOptimizer(gof.LocalOptGroup(local_greedy_distributor), order = 'out_to_in').optimize(g)
##print pprint(g.outputs[0])
......@@ -136,7 +135,6 @@ class test_canonize(unittest.TestCase):
g = Env([x, y, z, a, b, c, d], [e])
print pprint(g.outputs[0])
mul_canonizer.optimize(g)
gof.TopoOptimizer(gof.LocalOptGroup(local_fill_cut, local_fill_lift), order = 'out_to_in').optimize(g)
print pprint(g.outputs[0])
def test_elemwise_multiple_inputs_optimisation(self):
......@@ -296,17 +294,17 @@ class test_canonize(unittest.TestCase):
def test_multiple_case(self):
""" test those case take from the comment in Canonizer
x / x -> 1
(x * y) / x -> y
x / y / x -> 1 / y
x / y / z -> x / (y * z)
x / (y / z) -> (x * z) / y
(a / b) * (b / c) * (c / d) -> a / d
(2.0 * x) / (4.0 * y) -> (0.5 * x) / y
2 * x / 2 -> x
with and without DimShuffle
TODO: with DimShuffle
"""
x / x -> 1
(x * y) / x -> y
x / y / x -> 1 / y
x / y / z -> x / (y * z)
x / (y / z) -> (x * z) / y
(a / b) * (b / c) * (c / d) -> a / d
(2.0 * x) / (4.0 * y) -> (0.5 * x) / y
2 * x / 2 -> x
with and without DimShuffle
TODO: with DimShuffle
"""
import theano.tensor, theano.compile
shp=(3,3)
......@@ -331,6 +329,7 @@ class test_canonize(unittest.TestCase):
old_optimizer = mode._optimizer
try:
mode._optimizer=gof.Query(["canonicalize"])
mode._optimizer=mode._optimizer.including('ShapeOpt')
mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
#test x / x -> 1
......@@ -344,10 +343,15 @@ class test_canonize(unittest.TestCase):
out = f(*val_inputs)
assert (out==numpy.ones(shp, dtype=out_dtype)).all()
topo=f.maker.env.toposort()
assert len(topo)==1
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Second)
assert len(topo[0].inputs)==2
if sym_inputs[0].broadcastable[0]:
assert len(topo)==2
assert isinstance(topo[0].op, Shape_i)
assert isinstance(topo[1].op, TT.Alloc)
else:
assert len(topo)==3
assert isinstance(topo[0].op, Shape_i)
assert isinstance(topo[1].op, Shape_i)
assert isinstance(topo[2].op, TT.Alloc)
assert(out_dtype==out.dtype)
#test (x * y) / x -> y
......@@ -365,10 +369,16 @@ class test_canonize(unittest.TestCase):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert(out_dtype==out.dtype)
assert numpy.allclose(out,val_inputs[1])
topo=f.maker.env.toposort()
assert len(topo)==nb_elemwise
assert(out_dtype==out.dtype)
print "ID TOPO", id, topo, sym_inputs
for r,t in f.maker.env.shape_feature.shape_of.items():
print ' ', r, t
if topo:
for node in topo[:-1]:
assert isinstance(node.op, Shape_i)
assert isinstance(topo[-1].op, TT.Alloc)
#test x / y / x -> 1 / y
for id,(g, sym_inputs, val_inputs, nb_elemwise, out_dtype) in enumerate([
......@@ -378,19 +388,21 @@ class test_canonize(unittest.TestCase):
((fv/fy)/fv,[fv,fy],[fvv,fyv],1,'float32'),
#must broadcast as their is a dimshuffle in the computation
((dx/dv)/dx,[dx,dv],[dxv,dvv],2,'float64'),
#topo: [Elemwise{inv,no_inplace}(<TensorType(float64, row)>), Elemwise{second,no_inplace}(x, Elemwise{inv,no_inplace}.0)]
((fx/fv)/fx,[fx,fv],[fxv,fvv],2,'float32'),
#topo:[Elemwise{inv,no_inplace}(<TensorType(float32, row)>), Elemwise{second,no_inplace}(x, Elemwise{inv,no_inplace}.0)]
((dx/dv)/dx,[dx,dv],[dxv,dvv],1,'float64'),
#topo: [Shape_i, Shape_i, Elemwise{inv,no_inplace}(<TensorType(float64, row)>), Alloc(...)]
((fx/fv)/fx,[fx,fv],[fxv,fvv],1,'float32'),
#topo:[Shape_i, Shape_i, Elemwise{inv,no_inplace}(<TensorType(float32, row)>), Alloc(...)]
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
assert numpy.allclose(out,(1/val_inputs[1]))
topo=f.maker.env.toposort()
assert len(topo)==nb_elemwise
assert isinstance(topo[0].op,(T.Elemwise,))
assert isinstance(topo[0].op.scalar_op,(theano.scalar.basic.Inv, theano.scalar.basic.TrueDiv))
print topo
elem = [t for t in topo if isinstance(t.op, T.Elemwise)]
assert len(elem)==nb_elemwise
assert isinstance(elem[0].op,(T.Elemwise,))
assert isinstance(elem[0].op.scalar_op,(theano.scalar.basic.Inv, theano.scalar.basic.TrueDiv))
assert(out_dtype==out.dtype)
#test (a / b) * (b / c) * (c / d) -> a / d
......@@ -529,29 +541,6 @@ def test_mixeddiv():
d = dscalar()
assert 0 == function([i,d], d*(i/(i+1)))(3, 1.0)
def test_local_shape_lift_dot():
    """Check that local_shape_lift_dot rewrites shape(dot(i, j)) into an
    expression over the inputs' shapes, for each vector/matrix combination."""
    # expected pretty-printed optimized graph for each (x, y) type pair
    args_to_result = {
        (fvector, fvector): "[]",
        (fvector, fmatrix): "[<TensorType(float32, matrix)>.shape[1]]",
        (fmatrix, fvector): "[<TensorType(float32, matrix)>.shape[0]]",
        (fmatrix, fmatrix): "[<TensorType(float32, matrix)>.shape[0], <TensorType(float32, matrix)>.shape[1]]",
    }
    for x in [fvector, fmatrix]:
        for y in [fvector, fmatrix]:
            i = x()
            j = y()
            print 'I SHAPE', i.type.shape
            print 'J SHAPE', j.type.shape
            d = shape(dot(i,j))
            if x is fvector and y is fvector:
                # vector . vector is a scalar, so its shape is the empty
                # tuple and there is no graph to optimize
                assert d == ()
            else:
                g = Env([i,j], [d])
                gof.TopoOptimizer(gof.LocalOptGroup(local_shape_lift_dot), order='out_to_in').optimize(g)
                print pprint(g.outputs[0]), args_to_result[(x,y)]
                assert pprint(g.outputs[0]) == args_to_result[(x,y)]
def test_const_type_in_mul_canonizer():
input = dmatrix()
w = dmatrix()
......@@ -915,11 +904,16 @@ def test_log1p():
# check trickier cases (and use different dtype)
y = fmatrix()
f = function([x,y], T.log(fill(y,1)+(x)), mode=m)
assert [node.op for node in f.maker.env.toposort()] == [T.DimShuffle([False], ['x', 0], True), T.log1p, T.fill]
print f.maker.env.toposort()
# the first three ops are Shape_i, Shape_i, and Dimshuffle
assert [node.op for node in f.maker.env.toposort()][3:] \
== [T.log1p, Alloc('float64')]
f = function([x,y], T.log(0+(x) + fill(y,1.0)), mode=m)
assert [node.op for node in f.maker.env.toposort()] == [T.DimShuffle([False], ['x', 0], True), T.log1p, T.fill]
assert [node.op for node in f.maker.env.toposort()][3:] \
== [T.log1p, Alloc('float64')]
f = function([x,y], T.log(2+(x) - fill(y,1.0)), mode=m)
assert [node.op for node in f.maker.env.toposort()] == [T.DimShuffle([False], ['x', 0], True), T.log1p, T.fill]
assert [node.op for node in f.maker.env.toposort()][3:] \
== [T.log1p, Alloc('float64')]
f([1e-7, 10], [[0, 0], [0, 0]]) #debugmode will verify values
......@@ -969,6 +963,51 @@ class test_local_subtensor_unary(unittest.TestCase):
f([[0,1],[2,3]], [4,5]) # let debugmode test something
def test_local_fill_useless():
    """Useless fill/second ops should be optimized away, leaving only the
    multiplication in the compiled graph."""
    m = theano.config.mode
    if m == 'FAST_COMPILE':
        m = 'FAST_RUN'
    x = dvector()
    y = dvector()
    z = lvector()
    # basic case
    f = function([x], T.fill(x,x)*2, mode=m)
    assert [node.op for node in f.maker.env.toposort()] == [T.mul]
    # basic case (second is the op underlying fill)
    f = function([x,y], T.second(y,x)*2, mode=m)
    assert [node.op for node in f.maker.env.toposort()] == [T.mul]
    # now with different type
    f = function([x,z], T.fill(z,x)*2, mode=m)
    assert [node.op for node in f.maker.env.toposort()] == [T.mul]
    # now cutting out the input ??
    f = function([x,y], T.fill(x,y)*2, mode=m)
    assert [node.op for node in f.maker.env.toposort()] == [T.mul]
    # now fill is serving as a cast
    # NOTE(review): this case is identical to the previous one — it was
    # probably meant to use inputs of differing dtypes; confirm intent.
    f = function([x,y], T.fill(x,y)*2, mode=m)
    assert [node.op for node in f.maker.env.toposort()] == [T.mul]
class test_shapeoptimizer(unittest.TestCase):
    """Checks that ShapeOptimizer answers shape queries without computing
    the underlying values."""
    def test0(self):
        # the shape of (v+m) is known from the inputs alone, so no `add`
        # node should survive in the compiled graph
        v = T.vector()
        m = T.matrix()
        f = function([v,m], (v+m).shape)
        for node in f.maker.env.toposort():
            assert node.op != T.add
    def test_constant(self):
        # a dimshuffled-in broadcastable dimension has constant length 1,
        # so the whole graph should fold away to a constant (empty toposort)
        v = T.vector()
        m = T.matrix()
        f = function([v,m], v.dimshuffle('x','x',0).shape[1])
        print f.maker.env.toposort()
        assert [] == f.maker.env.toposort()
if __name__ == '__main__':
    # unittest.main()
    # NOTE(review): only the memory-leak check runs directly; the name
    # 'tes_memory_leak' (missing 't') keeps it out of unittest discovery —
    # confirm this is intentional.
    test_fusion().tes_memory_leak()
......
......@@ -352,7 +352,7 @@ class T_SharedRandomStreams(unittest.TestCase):
def test_vector_arguments(self):
random = RandomStreams(utt.fetch_seed())
low = tensor.vector()
low = tensor.dvector()
out = random.uniform(low=low, high=1)
assert out.ndim == 1
f = function([low], out)
......@@ -402,8 +402,8 @@ class T_SharedRandomStreams(unittest.TestCase):
def test_broadcast_arguments(self):
random = RandomStreams(utt.fetch_seed())
low = tensor.vector()
high = tensor.col()
low = tensor.dvector()
high = tensor.dcol()
out = random.uniform(low=low, high=high)
assert out.ndim == 2
f = function([low, high], out)
......@@ -424,8 +424,8 @@ class T_SharedRandomStreams(unittest.TestCase):
def test_uniform_vector(self):
random = RandomStreams(utt.fetch_seed())
low = tensor.vector()
high = tensor.vector()
low = tensor.dvector()
high = tensor.dvector()
out = random.uniform(low=low, high=high)
assert out.ndim == 1
f = function([low, high], out)
......@@ -438,11 +438,15 @@ class T_SharedRandomStreams(unittest.TestCase):
# Arguments of size (3,)
val0 = f(low_val, high_val)
numpy_val0 = numpy_rng.uniform(low=low_val, high=high_val)
print 'THEANO', val0
print 'NUMPY', numpy_val0
assert numpy.all(val0 == numpy_val0)
# arguments of size (2,)
val1 = f(low_val[:-1], high_val[:-1])
numpy_val1 = numpy_rng.uniform(low=low_val[:-1], high=high_val[:-1])
print 'THEANO', val1
print 'NUMPY', numpy_val1
assert numpy.all(val1 == numpy_val1)
# Specifying the size explicitly
......@@ -486,8 +490,8 @@ class T_SharedRandomStreams(unittest.TestCase):
def test_normal_vector(self):
random = RandomStreams(utt.fetch_seed())
avg = tensor.vector()
std = tensor.vector()
avg = tensor.dvector()
std = tensor.dvector()
out = random.normal(avg=avg, std=std)
assert out.ndim == 1
f = function([avg, std], out)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论