merge

af804be5 · James Bergstra · 1364578a · 8963a3d1 · af804be5 · af804be5
--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -197,6 +197,11 @@ class BadOptimization(DebugModeError):
            print >> ssio, "  Mean Abs Diff: ", numpy.mean(numpy.absolute(nv-ov))
            print >> ssio, "  Median Abs Diff: ", numpy.median(numpy.absolute(nv-ov))
            print >> ssio, "  Std Abs Diff: ", numpy.std(numpy.absolute(nv-ov))
+            reldiff = numpy.absolute(nv-ov) / (numpy.absolute(nv)+numpy.absolute(ov))
+            print >> ssio, "  Max Rel Diff: ", numpy.max(reldiff)
+            print >> ssio, "  Mean Rel Diff: ", numpy.mean(reldiff)
+            print >> ssio, "  Median Rel Diff: ", numpy.median(reldiff)
+            print >> ssio, "  Std Rel Diff: ", numpy.std(reldiff)
            # only if all succeeds to we add anything to sio
            print >> sio, ssio.getvalue()                                    
        except:
@@ -349,14 +354,17 @@ def debugprint(r, prefix='', depth=-1, done=None, file=sys.stdout):
        # this variable is the output of computation,
        # so just print out the apply
        a = r.owner
-        print >> file, prefix, a.op, id(a)
+        if len(a.outputs) == 1:
+            print >> file, '%s%s [@%i]' % (prefix, a.op, id(r))
+        else:
+            print >> file, '%s%s.%i [@%i]' % (prefix, a.op, a.outputs.index(r), id(r))
        if id(a) not in done:
            done.add(id(a))
            for i in a.inputs:
-                debugprint(i, prefix+'  ', depth=depth-1, done=done, file=file)
+                debugprint(i, prefix+' |', depth=depth-1, done=done, file=file)
    else:
        #this is a variable
-        print >> file, prefix, r, id(r)
+        print >> file, '%s%s [@%i]' % (prefix, r, id(r))

    return file


--- a/theano/compile/mode.py
+++ b/theano/compile/mode.py
@@ -116,7 +116,7 @@ class AddDestroyHandler(gof.Optimizer):
        for o in env.outputs:
            try:
                env.replace_validate(o, _output_guard(o), reason='output_guard')
-                _logger.warning("Output variable %s required output_guard,"
+                _logger.info("Output variable %s required output_guard,"
                        " how was this output left unprotected against destructive operations?"
                        % o)
            except gof.InconsistencyError:
@@ -127,12 +127,22 @@ class AddDestroyHandler(gof.Optimizer):
        env.extend(gof.DestroyHandler())

 optdb = gof.SequenceDB()
-optdb.register('merge1', gof.MergeOptimizer(), 0, 'fast_run', 'fast_compile')
-optdb.register('canonicalize', gof.EquilibriumDB(), 1, 'fast_run')
-optdb.register('specialize', gof.EquilibriumDB(), 2, 'fast_run')
-optdb.register('merge2', gof.MergeOptimizer(), 49, 'fast_run')
-optdb.register('add_destroy_handler', AddDestroyHandler(), 49.5, 'fast_run', 'inplace')
-optdb.register('merge3', gof.MergeOptimizer(), 100, 'fast_run')
+optdb.register('merge1', gof.MergeOptimizer(), 
+        0, 'fast_run', 'fast_compile')
+optdb.register('canonicalize', gof.EquilibriumDB(),         # rearranges elemwise expressions
+        1, 'fast_run')
+optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=True),
+        1.2, 'fast_run', 'fast_compile')
+optdb.register('stabilize', gof.EquilibriumDB(),            # replace unstable subgraphs
+        1.5, 'fast_run')          
+optdb.register('specialize', gof.EquilibriumDB(),           # misc special cases for speed
+        2, 'fast_run')
+optdb.register('merge2', gof.MergeOptimizer(),              # especially constant merge
+        49, 'fast_run')
+optdb.register('add_destroy_handler', AddDestroyHandler(), 
+        49.5, 'fast_run', 'inplace')
+optdb.register('merge3', gof.MergeOptimizer(),              # final pass just to make sure
+        100, 'fast_run')


 class Mode(object):
@@ -153,6 +163,12 @@ class Mode(object):
    
    def __init__(self, linker = config.linker, optimizer = config.optimizer):
        self.__setstate__((linker, optimizer))
+        #self.provided_optimizer - typically the `optimizer` arg.  But if the `optimizer` arg is
+        #    a keyword corresponding to a predefined Query, then this stores the query
+        #self._optimizer - typically same as provided_optimizer??
+
+        #self.__get_optimizer - returns self._optimizer (possibly querying optdb with self._optimizer)
+        #self.optimizer - property that returns __get_optimizer()

    def __getstate__(self):
        return (self.provided_linker, self.provided_optimizer)
@@ -218,7 +234,7 @@ predefined_modes = {'FAST_COMPILE': FAST_COMPILE,

 def get_mode(string):
    if string is None: string = config.mode
-    if not isinstance(string, str): return string #it is already a mode...
+    if not isinstance(string, str): return string #it is hopefully already a mode...
    if not predefined_modes.has_key(string):
        raise Exception("No predefixed mode exist for string: %s"%string)
    return predefined_modes[string]

--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -197,12 +197,19 @@ class _metadict:


 class MergeOptimizer(Optimizer):
-    """WRITEME
-    Merges parts of the graph that are identical, i.e. parts that
-    take the same inputs and carry out the asme computations so we
-    can avoid doing them more than once. Also merges variables that
-    are constant.
    """
+    Merges parts of the graph that are identical and redundant.
+    
+    The basic principle is that if two Applies have ops that compare equal, and identical
+    inputs, then they do not both need to be computed.  The clients of one are transferred to
+    the other and one of them is removed from the graph.  This procedure is carried out in
+    input->output order through the graph.
+
+    The first step of merging is constant-merging, so that all clients of an int(1) for example,
+    are transferred to a particular instance of int(1).
+    """
+    def __init__(self, skip_const_merge=False):
+        self.skip_const_merge = skip_const_merge

    def add_requirements(self, env):
        env.extend(toolbox.ReplaceValidate())
@@ -230,41 +237,6 @@ class MergeOptimizer(Optimizer):
                    const_sig[c] = sig
                    const_sig_inv[sig] = c

-    def exptime_apply_node_merge(self, env):
-        # we clear the dicts because the Constants signatures are not necessarily hashable
-        # and it's more efficient to give them an integer like the other Variables
-
-        symbol_idx = {}       #variable -> int
-        symbol_idx_inv = {}   #int -> variable (inverse of symbol_idx)
-
-        #add all graph sources to the symbol_idx dictionaries (arbitrary order)
-        for i, r in enumerate(r for r in env.variables if r.owner is None):
-            symbol_idx[r] = i
-            symbol_idx_inv[i] = r
-
-        for node in _list_of_nodes(env):
-            node_cid = (node.op, tuple([symbol_idx[input] for input in node.inputs]))
-            #print 'NODE', node, node_cid
-            dup = symbol_idx_inv.get(node_cid, None)
-            success = False
-            if dup is not None:
-                success = True
-                pairs = zip(node.outputs, dup.outputs)
-                for output, new_output in pairs:
-                    if output.name and not new_output.name:
-                        new_output.name = output.name
-                try:
-                    env.replace_all_validate(pairs, reason='Merge (exptime)')
-                except InconsistencyError, e:
-                    success = False
-            if not success:
-                symbol_idx[node] = node_cid
-                symbol_idx_inv[node_cid] = node
-                for i, output in enumerate(node.outputs):
-                    ref = (i, node_cid)
-                    symbol_idx[output] = ref
-                    symbol_idx_inv[ref] = output
-    
    def apply_node_merge(self, env):
        # we clear the dicts because the Constants signatures are not necessarily hashable
        # and it's more efficient to give them an integer like the other Variables
@@ -316,7 +288,8 @@ class MergeOptimizer(Optimizer):

    #TODO: Consider splitting this into a separate optimizer (SeqOptimizer)
    def apply(self, env):
-        self.apply_constant_merge(env)
+        if not self.skip_const_merge:
+            self.apply_constant_merge(env)
        self.apply_node_merge(env)

 merge_optimizer = MergeOptimizer()
@@ -541,7 +514,7 @@ class PatternSub(LocalOptimizer):
     PatternSub((subtract, (add, 'x', 'y'), 'y'), 'x')
     PatternSub((power, 'x', Constant(double, 2.0)), (square, 'x'))
     PatternSub((boggle, {'pattern': 'x',
-                                'constraint': lambda env, expr: expr.type == scrabble}),
+                                'constraint': lambda expr: expr.type == scrabble}),
                      (scrabble, 'x'))
    """

@@ -789,7 +762,10 @@ class NavigatorOptimizer(Optimizer):
                raise
        if replacements is False or replacements is None:
            return False
-        assert len(node.outputs) == len(replacements)
+        if not isinstance(replacements, (tuple, list)):
+            raise TypeError('Optimizer %s gave wrong type of replacement' % lopt)
+        if len(node.outputs) != len(replacements):
+            raise ValueError('Optimizer %s gave wrong number of replacements' % lopt)
        repl_pairs = zip(node.outputs, replacements)
        try:
            env.replace_all_validate(repl_pairs, reason=lopt)
@@ -904,8 +880,13 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                 max_depth = None,
                 max_use_ratio = None):
        """
+        :param local_optimizers:  list or set of local optimizations to apply until
+            equilibrium.
+
        :param max_use_ratio: each optimizer can be applied at most (size of graph * this number)

+        :param max_depth: TODO what does this do? (EquilibriumDB sets it to 5)
+
        """

        super(EquilibriumOptimizer, self).__init__(
@@ -916,6 +897,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
        self.local_optimizers = local_optimizers
        self.max_depth = max_depth
        self.max_use_ratio = max_use_ratio
+        assert self.max_use_ratio is not None, 'max_use_ratio has to be a number'

    def apply(self, env, start_from = None):
        if start_from is None:
@@ -960,7 +942,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                            changed |= lopt_change
            finally:
                self.detach_updater(env, u)
-            self.detach_updater(env, u)
+            self.detach_updater(env, u) #TODO: erase this line, it's redundant at best
        if max_use_abort:
            print >> sys.stderr, "WARNING: EquilibriumOptimizer max'ed out"


--- a/theano/gof/optdb.py
+++ b/theano/gof/optdb.py
@@ -26,7 +26,7 @@ class DB(object):
        # It is an instance of a DB.In the tests for example,
        # this is not always the case.
        if not isinstance(obj, (DB, opt.Optimizer, opt.LocalOptimizer)):
-            raise Exception('Triing to register an optimizer that don\'t herite from theano.gof.opt.Optimizer or theano.gof.opt.LocalOptimizer', obj)
+            raise TypeError('Object cannot be registered in OptDB', obj)
            
        if self.name is not None:
            tags = tags + (self.name,)
@@ -132,6 +132,18 @@ class Query(object):


 class EquilibriumDB(DB):
+    """A set of potential optimizations which should be applied in an arbitrary order until
+    equilibrium is reached.
+
+    Canonicalize, Stabilize, and Specialize are all equilibrium optimizations.
+
+    .. note::
+        
+        It seems like this might be supposed to contain LocalOptimizer instances rather than
+        optimizer instances, because whatever is selected by the query is passed to
+        EquilibriumOptimizer and EquilibriumOptimizer requires LocalOptimizer instances.
+
+    """

    def query(self, *tags, **kwtags):
        opts = super(EquilibriumDB, self).query(*tags, **kwtags)
@@ -142,27 +154,45 @@ class EquilibriumDB(DB):


 class SequenceDB(DB):
+    """A sequence of potential optimizations.
+
+    Retrieve a sequence of optimizations (a SeqOptimizer) by calling query().
+
+    Each potential optimization is registered with a floating-point position.
+    No matter which optimizations are selected by a query, they are carried out in order of
+    increasing position.
+
+    The optdb itself (`theano.compile.mode.optdb`), from which (among many other tags) fast_run
+    and fast_compile optimizers are drawn, is a SequenceDB.
+
+    """

    def __init__(self, failure_callback = opt.SeqOptimizer.warn):
        super(SequenceDB, self).__init__()
-        self.__priority__ = {}
+        self.__position__ = {}
        self.failure_callback = failure_callback

-    def register(self, name, obj, priority, *tags):
+    def register(self, name, obj, position, *tags):
        super(SequenceDB, self).register(name, obj, *tags)
-        self.__priority__[name] = priority
+        self.__position__[name] = position

    def query(self, *tags, **kwtags):
+        """
+        :type position_cutoff: float or int
+        :param position_cutoff: only optimizations with position less than the cutoff are returned.
+        """
+        position_cutoff = kwtags.pop('position_cutoff', float('inf'))
        opts = super(SequenceDB, self).query(*tags, **kwtags)
-        opts = list(opts)
-        opts.sort(key = lambda obj: self.__priority__[obj.name])
+        opts = [o for o in opts if self.__position__[o.name] < position_cutoff]
+        opts.sort(key = lambda obj: self.__position__[obj.name])
        return opt.SeqOptimizer(opts, failure_callback = self.failure_callback)

    def print_summary(self, stream=sys.stdout):
        print >> stream, "SequenceDB (id %i)"%id(self)
-        print >> stream, "  priority", self.__priority__
+        print >> stream, "  position", self.__position__
        print >> stream, "  names", self._names
        print >> stream, "  db", self.__db__
+
    def __str__(self):
        sio = StringIO.StringIO()
        self.print_summary(sio)

--- a/theano/printing.py
+++ b/theano/printing.py
@@ -7,9 +7,52 @@ import sys,os
 from theano import config
 from gof import Op, Apply
 from theano.gof.python25 import any
+from theano.compile import Function, debugmode

-#We import the debugprint here to have all printing of graph available from this module
-from theano.compile.debugmode import debugprint
+
+def debugprint(obj, depth=-1, file=None):
+    """Print a computation graph to file
+
+    :type obj: Variable, Apply, or Function instance
+    :param obj: symbolic thing to print
+    :type depth: integer
+    :param depth: print graph to this depth (-1 for unlimited)
+    :type file: None or file-like object
+    :param file: print to this file (None means sys.stdout)
+
+    :rtype: None or file-like object
+    :returns: `file` argument
+
+    Each line printed represents a Variable in the graph.
+    The indentation of each line corresponds to its depth in the symbolic graph.
+    The first part of the text identifies whether it is an input (if a name or type is printed)
+    or the output of some Apply (in which case the Op is printed).
+    The second part of the text is the memory location of the Variable.
+
+    If a Variable is encountered multiple times in the depth-first search, it is only printed
+    recursively the first time.  Later, just the Variable and its memory location are printed.
+
+    If an Apply has multiple outputs, then a '.N' suffix will be appended to the Apply's
+    identifier, to indicate which output a line corresponds to.
+
+    """
+    if file is None:
+        _file = sys.stdout
+    else:
+        _file = file
+    done = set()
+    results_to_print = []
+    if isinstance(obj, gof.Variable):
+        results_to_print.append(obj)
+    elif isinstance(obj, gof.Apply):
+        results_to_print.extend(obj.outputs)
+    elif isinstance(obj, Function):
+        results_to_print.extend(obj.maker.env.outputs)
+    for r in results_to_print:
+        debugmode.debugprint(r, depth=depth, done=done, file=_file)
+    if file is None:
+        _file.flush()
+    return file

 class Print(Op):
    """This identity-like Op has the side effect of printing a message followed by its inputs
@@ -329,7 +372,7 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
        if var.name is not None:
            varstr = var.name
        elif isinstance(var,gof.Constant):
-            varstr = str(var.data)
+            varstr = '%s [%s]'% (str(var.data) , str(var.type))
        elif var in input_update and input_update[var].variable.name is not None:
            varstr = input_update[var].variable.name
        else:

--- a/theano/sandbox/cuda/var.py
+++ b/theano/sandbox/cuda/var.py
@@ -3,7 +3,7 @@ import numpy
 import theano
 from theano import Op, Type, Apply, Variable, Constant
 from theano import tensor
-from theano.compile import shared, SharedVariable, shared_constructor
+from theano.compile import shared, SharedVariable

 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda import filter as type_support_filter
@@ -68,6 +68,11 @@ CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable
 def cuda_shared_constructor(value, name, strict=False, broadcastable=None):
    """SharedVariable Constructor for TensorType"""

+    # THIS CONSTRUCTOR TRIES TO CAST VALUE TO A FLOAT32, WHICH THEN GOES ONTO THE CARD
+    # SO INT shared vars, float64 shared vars, etc. all end up on the card.
+    # THIS IS NOT THE DEFAULT BEHAVIOUR THAT WE WANT. 
+    # SEE float32_shared_constructor
+
    #TODO: what should strict mean in this context, since we always have to make a copy?
    if strict:
        _value = value

--- a/theano/scan.py
+++ b/theano/scan.py
@@ -20,8 +20,9 @@ Special cases:

 Often a for loop can be expressed as a ``scan()`` operation, and ``scan`` is
 the closest that theano comes to looping. The advantage of using ``scan`` 
-over for loops is that it allows you to express the loop symbolically. The 
-Scan Op should always be used by applying the ``scan`` function. 
+over for loops is that it allows the number of iterations to be a part of the symbolic graph. 
+
+The Scan Op should always be used by applying the ``scan`` function. 
 """ 
 __docformat__ = 'restructedtext en'

@@ -60,7 +61,8 @@ def hash_listsDictsTuples(x):

 def scan(fn, sequences, initial_states, non_sequences, inplace_map={}, \
         sequences_taps={}, outputs_taps = {}, n_steps = 0, \
-         truncate_gradient = -1, go_backwards = False, mode = 'FAST_RUN'):
+         truncate_gradient = -1, go_backwards = False, 
+         mode = None):
    '''Function that constructs and applies a Scan op

    :param fn: Function that describes the operations involved in one step of scan 

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -6,10 +6,9 @@ import numpy.distutils
 from theano.configparser import config, AddConfigVar, StrParam
 from theano.gof import (utils, Op, Apply, view_roots, PatternSub, DestroyHandler, 
        SeqOptimizer, local_optimizer, Optimizer, LocalOptimizer, OpKeyOptimizer, 
-        InconsistencyError, toolbox)
+        InconsistencyError, toolbox, SequenceDB, EquilibriumOptimizer)
 from theano.printing import pprint, FunctionPrinter
-from theano.tensor.opt import register_specialize, out2in, insert_inplace_optimizer
-# opt.py
+from theano.compile.mode import optdb

 import basic as T

@@ -30,7 +29,6 @@ AddConfigVar('blas.ldflags',
        "lib[s] to include for [Fortran] level-3 blas implementation",
        StrParam(default_blas_ldflags()))

-
 _logger = logging.getLogger('theano.tensor.blas')
 _logger.setLevel(logging.WARN)
 def debug(*msg): _logger.debug(' '.join(str(m) for m in msg))
@@ -391,12 +389,22 @@ class Gemm(GemmRelated):
    def c_code_cache_version(self):
        return (1,) + self.build_gemm_version()

-gemm = Gemm()
-    
-
-
-
+class PseudoGemm(Op):
+    # should be replaced by Gemm
+    def __eq__(self, other):
+        return type(self) == type(other)
+    def __hash__(self):
+        return hash(type(self))
+    def make_node(self, *args):
+        inputs = [T.as_tensor_variable(i) for i in args]
+        return Apply(self, inputs, [inputs[0].type()])
+    def perform(self, node, (z, a, x, y, b), (zout, )):
+        zout[0] = a * numpy.dot(x,y) + b *  z
+gemm = PseudoGemm()
+gemm_inplace = Gemm()
 pprint.assign(gemm, FunctionPrinter('gemm'))
+pprint.assign(gemm_inplace, FunctionPrinter('gemm_inplace'))
+
 def res_is_a(node, op, maxclients=None):
  if maxclients is not None:
    retval = (len(node.clients) <= maxclients)
@@ -597,6 +605,7 @@ class GemmOptimizer(Optimizer):
        while did_something:
            nodelist = list(env.toposort())
            did_something = False
+            nodelist.reverse()
            for node in nodelist:
                new_outputs = _gemm_from_node(node)
                if new_outputs:
@@ -611,10 +620,6 @@ class GemmOptimizer(Optimizer):
                        #TODO: retry other applications of gemm (see comment in _gemm_from_node
                        pass

-#neede to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
-compile.optdb.register('inplace_gemm', GemmOptimizer(), 70.00, 'fast_run', 'inplace', 'gemm')
-
-
 class Dot22(GemmRelated):
    """Compute a matrix-matrix product.
    This is a specialization of the more general Dot()
@@ -689,5 +694,34 @@ def local_dot_to_dot22(node):
            info('Not optimizing dot with inputs', x, y, x.type, y.type)
    else:
        return False
-register_specialize(local_dot_to_dot22)
+
+@local_optimizer([gemm])
+def local_inplace_gemm(node):
+    if node.op == gemm:
+        return [gemm_inplace(*node.inputs)]
+
+#################################
+#
+# Set up the BlasOpt optimizer
+#
+#################################
+
+blas_optdb = SequenceDB()
+
+# run after numerical stability optimizations (1.5)
+optdb.register('BlasOpt', blas_optdb, 1.7, 'fast_run')
+# run before specialize (2.0) because specialize is basically a free-for-all that makes the
+# graph crazy.
+
+blas_optdb.register('local_dot_to_dot22', 
+        EquilibriumOptimizer([local_dot_to_dot22], max_use_ratio=5),
+        0, 'fast_run')
+blas_optdb.register('local_dot_to_gemm', GemmOptimizer(), 10, 'fast_run')
+
+# After destroyhandler is in but before we try to make elemwise things inplace
+# Try to make gemm inplace
+# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
+optdb.register('InplaceBlasOpt', 
+        EquilibriumOptimizer([local_inplace_gemm], max_use_ratio=5), 
+        70.0, 'fast_run', 'inplace')

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -197,6 +197,18 @@ class DimShuffle(Op):

        storage[0] = numpy.asarray(res) #asarray puts scalars back into array

+    def infer_shape(self, node, (ishp,)):
+        ishp = list(ishp)
+        for drop in reversed(self.drop):
+            del ishp[drop]
+        # transpose
+        rval = [ishp[i] for i in self.shuffle]
+
+        # augment
+        for augm in self.augment:
+            rval.insert(augm, 1)
+        return [rval]
+
    def c_code(self, node, name, (input,), (res,), sub):
        basename = input + '__view_or_copy'

@@ -613,6 +625,25 @@ class Elemwise(Op):
        # the following should be used instead of the previous loop, unfortunately it tends to segfault
        # self.ufunc(*(ufunc_args+[s[0] for s in output_storage]))

+    def infer_shape(self, node, i_shapes):
+        rval = []
+        for o in node.outputs:
+            oshp = []
+            for dim, b in enumerate(o.type.broadcastable):
+                b_dim = None
+                if b: # this is broadcastable
+                    b_dim = 1
+                else: # there must be some input that is not broadcastable
+                    for ishp, i in zip(i_shapes,node.inputs):
+                        if not i.type.broadcastable[dim]:
+                            b_dim = ishp[dim]
+                            assert b_dim, 'AA'
+                            break
+                    assert b_dim, 'BB'
+                oshp.append(b_dim)
+            rval.append(oshp)
+        return rval
+
    def _c_all(self, node, name, inames, onames, sub):
        _inames = inames
        _onames = onames
@@ -764,10 +795,14 @@ class CAReduce(Op):
        if scalar_op.nin not in [-1, 2] or scalar_op.nout != 1:
            raise NotImplementedError("CAReduce only supports binary functions with a single output.")
        self.scalar_op = scalar_op
-        if isinstance(axis, int):
-            self.axis = [axis]
-        else:
+        if axis is None:
            self.axis = axis
+        elif isinstance(axis, int):
+            self.axis = (axis,)
+        else:
+            self.axis = list(set(axis))
+            self.axis.sort()
+            self.axis = tuple(self.axis)
        self.ufunc = numpy.frompyfunc(scalar_op.impl, 2, 1)

        # CAReduce output views input when reducing scalars
@@ -834,6 +869,13 @@ class CAReduce(Op):
        else:
            output[0] = numpy.copy(variable)

+    def infer_shape(self, node, (ishape,)):
+        axis = self.axis
+        if axis is None:
+            return (),
+        return [ishape[i] for (i,b) in enumerate(node.inputs[0].type.broadcastable) if i not in axis],
+
+
    def _c_all(self, node, name, inames, onames, sub):

        input = node.inputs[0]

--- a/theano/tensor/nnet/__init__.py
+++ b/theano/tensor/nnet/__init__.py
 from nnet import *
+from sigm import softplus, sigmoid, sigmoid_inplace, scalar_sigmoid
--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
@@ -4,89 +4,14 @@
 """

 from theano import gof
-from theano import scalar
 from theano import printing
-from theano.printing import pprint
 from theano.tensor import basic as tensor
 from theano.tensor import elemwise
 from theano.tensor import opt
 from theano.compile import optdb
 import numpy

-############
-#
-# SCALAR OPS
-#
-
-class ScalarSigmoid(scalar.UnaryScalarOp):
-    @staticmethod
-    def st_impl(x):
-        if x < -30.0:
-            return 0.0
-        if x > 30.0:
-            return 1.0 
-        return 1.0 / (1.0 + numpy.exp(-x))
-    def impl(self, x):
-        return ScalarSigmoid.st_impl(x)
-    def grad(self, (x,), (gz,)):
-        y = scalar_sigmoid(x)
-        return [gz * y * (1.0 - y)]
-    def c_code(self, node, name, (x,), (z,), sub):
-        if node.inputs[0].type == scalar.float32:
-            # These constants were obtained by looking at the output of python commands like:
-            #  for i in xrange(750):
-            #      print i, repr( theano._asarray(1.0, dtype=dt) / (theano._asarray(1.0, dtype=dt) + numpy.exp(-theano._asarray([i,-i], dtype=dt))))
-            # the boundary checks prevent us from generating inf
-            return """%(z)s = %(x)s < -88.0f ? 0.0 : %(x)s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-%(x)s));""" % locals()
-        elif node.inputs[0].type == scalar.float64:
-            return """%(z)s = %(x)s < -709.0 ? 0.0 : %(x)s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-%(x)s));""" % locals()
-        else:
-            raise NotImplementedError('only floatingpoint is implemented')
-    def c_code_cache_version(self):
-        v = super(ScalarSigmoid, self).c_code_cache_version()
-        if v:
-            return (2,) + v
-        else:
-            return v
-scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid')
-sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid')
-
-pprint.assign(sigmoid, printing.FunctionPrinter('sigmoid'))
-
-
-class ScalarSoftplus(scalar.UnaryScalarOp):
-    @staticmethod
-    def static_impl(x):
-        if x < -30.0:
-            return 0.0
-        if x > 30.0:
-            return x
-        return numpy.log1p(numpy.exp(x))
-    def impl(self, x):
-        return ScalarSoftplus.static_impl(x)
-    def grad(self, (x,), (gz,)):
-        return [gz * scalar_sigmoid(x)]
-    def c_code(self, node, name, (x,), (z,), sub):
-        if node.inputs[0].type == scalar.float32:
-            # These constants were obtained by looking at the output of python commands like:
-            #  for i in xrange(750):
-            #      print i, repr( numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
-            # the boundary checks prevent us from generating inf
-            return """%(z)s = %(x)s < -103.0f ? 0.0 : %(x)s > 14.0f ? %(x)s : log1p(exp(%(x)s));""" % locals()
-        elif node.inputs[0].type == scalar.float64:
-            return """%(z)s = %(x)s < -745.0 ? 0.0 : %(x)s > 16.0 ? %(x)s : log1p(exp(%(x)s));""" % locals()
-        else:
-            raise NotImplementedError('only floatingpoint is implemented')
-    def c_code_cache_version(self):
-        v = super(ScalarSoftplus, self).c_code_cache_version()
-        if v:
-            return (2,) + v
-        else:
-            return v
-scalar_softplus = ScalarSoftplus(scalar.upgrade_to_float, name='scalar_softplus')
-softplus = elemwise.Elemwise(scalar_softplus, name='softplus')
-
-pprint.assign(softplus, printing.FunctionPrinter('softplus'))
+from .sigm import sigmoid, softplus


 ############
@@ -1351,6 +1276,7 @@ def categorical_crossentropy(coding_dist, true_dist):
        raise TypeError('rank mismatch between coding and true distributions')


+from theano import scalar

 class Prepend_scalar_constant_to_each_row(gof.Op):
    def __init__(self, val = 0):
@@ -1440,14 +1366,3 @@ prepend_scalar_to_each_row = Prepend_scalar_to_each_row()
 prepend_0_to_each_row = Prepend_scalar_constant_to_each_row(0.)
 prepend_1_to_each_row = Prepend_scalar_constant_to_each_row(1.)

-logsigm_to_softplus = gof.PatternSub(
-    (tensor.log, (sigmoid, 'x')),
-    (tensor.neg, (softplus, (tensor.neg, 'x'))),
-    allow_multiple_clients = True)
-log1msigm_to_softplus = gof.PatternSub(
-    (tensor.log, (tensor.sub, tensor.constant([[1.0]]), (sigmoid, 'x'))),
-    (tensor.neg, (softplus, 'x')),
-    allow_multiple_clients = True)
-
-opt.register_specialize(logsigm_to_softplus, name = 'logsigm_to_softplus')
-opt.register_specialize(log1msigm_to_softplus, name = 'log1msigm_to_softplus')
--- a/theano/tensor/nnet/sigm.py
+++ b/theano/tensor/nnet/sigm.py
--- a/theano/tensor/nnet/tests/test_sigm.py
+++ b/theano/tensor/nnet/tests/test_sigm.py
+import unittest
+import theano
+from theano import tensor as T
+from theano import gof
+import numpy
+from theano.tests import unittest_tools as utt
+from theano.tensor.tests import test_basic as TT
+
+from theano.tensor.nnet import *
+
+
+class T_sigmoid(unittest.TestCase):
+    def setUp(self):
+        utt.seed_rng()
+    def test_elemwise(self):
+        utt.verify_grad(sigmoid, [numpy.random.rand(3,4)])
+
+class T_softplus(unittest.TestCase):
+    def setUp(self):
+        utt.seed_rng()
+    def test_elemwise(self):
+        utt.verify_grad(softplus, [numpy.random.rand(3,4)])
+
+
+class T_sigmoid_opts(unittest.TestCase):
+    def test_exp_over_1_plus_exp(self):
+        m = theano.config.mode
+        if m == 'FAST_COMPILE':
+            m = 'FAST_RUN'
+
+        x = T.dvector()
+
+        # tests exp_over_1_plus_exp
+        f = theano.function([x], T.exp(x)/(1+T.exp(x)), mode=m)
+        #theano.printing.debugprint(f)
+        assert [node.op for node in f.maker.env.toposort()] == [sigmoid]
+
+        # tests inv_1_plus_exp
+        f = theano.function([x], T.fill(x,1.0) / (1+T.exp(-x)), mode=m)
+        #theano.printing.debugprint(f)
+        assert [node.op for node in f.maker.env.toposort()] == [sigmoid]
+
+        # tests inv_1_plus_exp with neg
+        f = theano.function([x], T.fill(x,-1.0) / (1+T.exp(-x)), mode=m)
+        #theano.printing.debugprint(f)
+        assert [node.op for node in f.maker.env.toposort()] == [sigmoid, 
+                T.inplace.neg_inplace]
+
+        # tests double inv_1_plus_exp with neg
+        f = theano.function([x], (T.fill(x,-1.0)*T.exp(x)) / ((1+T.exp(x))*(1+T.exp(-x))), mode=m)
+        #theano.printing.debugprint(f)
+        assert [node.op for node in f.maker.env.toposort()] == [sigmoid, 
+                T.mul]
+
+    def test_1msigmoid(self):
+        m = theano.config.mode
+        if m == 'FAST_COMPILE':
+            m = 'FAST_RUN'
+
+        x = T.fmatrix()
+
+        # tests exp_over_1_plus_exp
+        f = theano.function([x], 1 - T.exp(x)/(1+T.exp(x)), mode=m)
+        theano.printing.debugprint(f)
+        assert [node.op for node in f.maker.env.toposort()] == [tensor.neg, sigmoid_inplace]
+
+        # tests inv_1_plus_exp
+        f = theano.function([x], 1 - T.fill(x,1.0) / (1+T.exp(-x)), mode=m)
+        theano.printing.debugprint(f)
+        assert [node.op for node in f.maker.env.toposort()] == [tensor.neg, 
+                sigmoid_inplace]
+
+
+
+
--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
--- a/theano/tensor/raw_random.py
+++ b/theano/tensor/raw_random.py
@@ -136,10 +136,7 @@ class RandomFunction(gof.Op):
        draw.

        """
-        if shape == () or shape == []:
-            shape = tensor.as_tensor_variable(shape, dtype='int64')
-        else:
-            shape = tensor.as_tensor_variable(shape, ndim=1)
+        shape = tensor.as_tensor_variable(shape, ndim=1)
        assert shape.type.ndim == 1
        assert (shape.type.dtype == 'int64') or (shape.type.dtype == 'int32')
        if not isinstance(r.type, RandomStateType):
@@ -158,6 +155,22 @@ class RandomFunction(gof.Op):
                         [r, shape] + args,
                         [r.type(), self.outtype()])

+    def infer_shape(self, node, i_shapes):
+        """Return [None, dims]: no shape for output 0, per-axis dims for output 1."""
+        _rstate, shp = node.inputs[0:2]
+        sample = node.outputs[1]
+        # A `shape` whose `data` has length zero means 'automatic shape'; when
+        # `shp` has no `data` attribute the 3-item default makes it count as known.
+        auto_shape = len(getattr(shp, 'data', [0,1,2])) == 0
+
+        if self.ndim_added != 0 or auto_shape:
+            # ndim_added is nonzero, or the shape is automatic: either way the
+            # result depends on args, so use the output's own symbolic shape.
+            sample_shp = sample.shape
+        else:
+            sample_shp = shp
+        return [None, [sample_shp[i] for i in xrange(sample.ndim)]]
+
    def perform(self, node, inputs, (rout, out)):
        # Use self.fn to draw shape worth of random numbers.
        # Numbers are drawn from r if self.inplace is True, and from a copy of r if

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -89,7 +89,6 @@ class test_greedy_distribute(unittest.TestCase):
        g = Env([a,b,c,d,x,y,z], [e])
        ##print pprint(g.outputs[0])
        mul_canonizer.optimize(g)
-        gof.TopoOptimizer(gof.LocalOptGroup(local_fill_cut, local_fill_lift), order = 'out_to_in').optimize(g)
        gof.TopoOptimizer(gof.LocalOptGroup(local_greedy_distributor), order = 'out_to_in').optimize(g)
        ##print pprint(g.outputs[0])
    
@@ -136,7 +135,6 @@ class test_canonize(unittest.TestCase):
        g = Env([x, y, z, a, b, c, d], [e])
        print pprint(g.outputs[0])
        mul_canonizer.optimize(g)
-        gof.TopoOptimizer(gof.LocalOptGroup(local_fill_cut, local_fill_lift), order = 'out_to_in').optimize(g)
        print pprint(g.outputs[0])

    def test_elemwise_multiple_inputs_optimisation(self):
@@ -296,17 +294,17 @@ class test_canonize(unittest.TestCase):
            
    def test_multiple_case(self):
        """ test those case take from the comment in Canonizer
-      x / x -> 1
-      (x * y) / x -> y
-      x / y / x -> 1 / y
-      x / y / z -> x / (y * z)
-      x / (y / z) -> (x * z) / y
-      (a / b) * (b / c) * (c / d) -> a / d
-      (2.0 * x) / (4.0 * y) -> (0.5 * x) / y
-      2 * x / 2 -> x
-      with and without DimShuffle
-      TODO: with DimShuffle
-      """
+        x / x -> 1
+        (x * y) / x -> y
+        x / y / x -> 1 / y
+        x / y / z -> x / (y * z)
+        x / (y / z) -> (x * z) / y
+        (a / b) * (b / c) * (c / d) -> a / d
+        (2.0 * x) / (4.0 * y) -> (0.5 * x) / y
+        2 * x / 2 -> x
+        with and without DimShuffle
+        TODO: with DimShuffle
+        """
        import theano.tensor, theano.compile

        shp=(3,3)
@@ -331,6 +329,7 @@ class test_canonize(unittest.TestCase):
        old_optimizer = mode._optimizer
        try:
            mode._optimizer=gof.Query(["canonicalize"])
+            mode._optimizer=mode._optimizer.including('ShapeOpt')
            mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')

            #test x / x -> 1
@@ -344,10 +343,15 @@ class test_canonize(unittest.TestCase):
                out = f(*val_inputs)
                assert (out==numpy.ones(shp, dtype=out_dtype)).all()
                topo=f.maker.env.toposort()
-                assert len(topo)==1
-                assert isinstance(topo[0].op,(T.Elemwise,))
-                assert isinstance(topo[0].op.scalar_op,theano.scalar.basic.Second)
-                assert len(topo[0].inputs)==2
+                if sym_inputs[0].broadcastable[0]:
+                    assert len(topo)==2
+                    assert isinstance(topo[0].op, Shape_i)
+                    assert isinstance(topo[1].op, TT.Alloc)
+                else:
+                    assert len(topo)==3
+                    assert isinstance(topo[0].op, Shape_i)
+                    assert isinstance(topo[1].op, Shape_i)
+                    assert isinstance(topo[2].op, TT.Alloc)
                assert(out_dtype==out.dtype)

            #test (x * y) / x -> y
@@ -365,10 +369,16 @@ class test_canonize(unittest.TestCase):
                f = compile.function(list(sym_inputs), g,
                                     mode=mode)
                out = f(*val_inputs)
+                assert(out_dtype==out.dtype)
                assert numpy.allclose(out,val_inputs[1])
                topo=f.maker.env.toposort()
-                assert len(topo)==nb_elemwise
-                assert(out_dtype==out.dtype)
+                print "ID TOPO", id, topo, sym_inputs
+                for r,t in f.maker.env.shape_feature.shape_of.items():
+                    print '  ', r, t
+                if topo:
+                    for node in topo[:-1]:
+                        assert isinstance(node.op, Shape_i)
+                    assert isinstance(topo[-1].op, TT.Alloc)

            #test x / y / x -> 1 / y
            for id,(g, sym_inputs, val_inputs, nb_elemwise, out_dtype) in enumerate([
@@ -378,19 +388,21 @@ class test_canonize(unittest.TestCase):
                                                           ((fv/fy)/fv,[fv,fy],[fvv,fyv],1,'float32'),
                            #must broadcast as their is a dimshuffle in the computation

-                                                           ((dx/dv)/dx,[dx,dv],[dxv,dvv],2,'float64'),
-    #topo:            [Elemwise{inv,no_inplace}(<TensorType(float64, row)>), Elemwise{second,no_inplace}(x, Elemwise{inv,no_inplace}.0)]
-                                                           ((fx/fv)/fx,[fx,fv],[fxv,fvv],2,'float32'),
-                #topo:[Elemwise{inv,no_inplace}(<TensorType(float32, row)>), Elemwise{second,no_inplace}(x, Elemwise{inv,no_inplace}.0)]
+                                                           ((dx/dv)/dx,[dx,dv],[dxv,dvv],1,'float64'),
+    #topo:            [Shape_i, Shape_i, Elemwise{inv,no_inplace}(<TensorType(float64, row)>), Alloc(...)]
+                                                           ((fx/fv)/fx,[fx,fv],[fxv,fvv],1,'float32'),
+                #topo:[Shape_i, Shape_i, Elemwise{inv,no_inplace}(<TensorType(float32, row)>), Alloc(...)]
                ]):
                f = compile.function(list(sym_inputs), g,
                                     mode=mode)
                out = f(*val_inputs)
                assert numpy.allclose(out,(1/val_inputs[1]))
                topo=f.maker.env.toposort()
-                assert len(topo)==nb_elemwise
-                assert isinstance(topo[0].op,(T.Elemwise,))
-                assert isinstance(topo[0].op.scalar_op,(theano.scalar.basic.Inv, theano.scalar.basic.TrueDiv))
+                print topo
+                elem = [t for t in topo if isinstance(t.op, T.Elemwise)]
+                assert len(elem)==nb_elemwise
+                assert isinstance(elem[0].op,(T.Elemwise,))
+                assert isinstance(elem[0].op.scalar_op,(theano.scalar.basic.Inv, theano.scalar.basic.TrueDiv))
                assert(out_dtype==out.dtype)

            #test (a / b) * (b / c) * (c / d) -> a / d
@@ -529,29 +541,6 @@ def test_mixeddiv():
    d = dscalar()
    assert 0 == function([i,d], d*(i/(i+1)))(3, 1.0)

-def test_local_shape_lift_dot():
-    args_to_result = {
-        (fvector, fvector): "[]",
-        (fvector, fmatrix): "[<TensorType(float32, matrix)>.shape[1]]",
-        (fmatrix, fvector): "[<TensorType(float32, matrix)>.shape[0]]",
-        (fmatrix, fmatrix): "[<TensorType(float32, matrix)>.shape[0], <TensorType(float32, matrix)>.shape[1]]",
-        }
-
-    for x in [fvector, fmatrix]:
-        for y in [fvector, fmatrix]:
-            i = x()
-            j = y()
-            print 'I SHAPE', i.type.shape
-            print 'J SHAPE', j.type.shape
-            d = shape(dot(i,j))
-            if x is fvector and y is fvector:
-                assert d == ()
-            else:
-                g = Env([i,j], [d])
-                gof.TopoOptimizer(gof.LocalOptGroup(local_shape_lift_dot), order='out_to_in').optimize(g)
-                print pprint(g.outputs[0]), args_to_result[(x,y)]
-                assert pprint(g.outputs[0]) == args_to_result[(x,y)]
-        
 def test_const_type_in_mul_canonizer():
    input = dmatrix()
    w = dmatrix()
@@ -915,11 +904,16 @@ def test_log1p():
    # check trickier cases (and use different dtype)
    y = fmatrix()
    f = function([x,y], T.log(fill(y,1)+(x)), mode=m)
-    assert [node.op for node in f.maker.env.toposort()] == [T.DimShuffle([False], ['x', 0], True), T.log1p, T.fill]
+    print f.maker.env.toposort()
+    # the first three ops are Shape_i, Shape_i, and Dimshuffle
+    assert [node.op for node in f.maker.env.toposort()][3:] \
+            == [T.log1p, Alloc('float64')]
    f = function([x,y], T.log(0+(x) + fill(y,1.0)), mode=m)
-    assert [node.op for node in f.maker.env.toposort()] == [T.DimShuffle([False], ['x', 0], True), T.log1p, T.fill]
+    assert [node.op for node in f.maker.env.toposort()][3:] \
+            == [T.log1p, Alloc('float64')]
    f = function([x,y], T.log(2+(x) - fill(y,1.0)), mode=m)
-    assert [node.op for node in f.maker.env.toposort()] == [T.DimShuffle([False], ['x', 0], True), T.log1p, T.fill]
+    assert [node.op for node in f.maker.env.toposort()][3:] \
+            == [T.log1p, Alloc('float64')]

    f([1e-7, 10], [[0, 0], [0, 0]]) #debugmode will verify values 
        
@@ -969,6 +963,51 @@ class test_local_subtensor_unary(unittest.TestCase):

        f([[0,1],[2,3]], [4,5]) # let debugmode test something

+def test_local_fill_useless():
+    """Compile several fill/second graphs and check that only a mul node remains."""
+    m = theano.config.mode
+    if m == 'FAST_COMPILE':
+        m = 'FAST_RUN'
+    x = dvector()
+    y = dvector()
+    z = lvector()
+
+    # basic case: the fill template and the filled value are the same variable
+    f = function([x], T.fill(x,x)*2, mode=m)
+    assert [node.op for node in f.maker.env.toposort()] == [T.mul]
+
+    # basic case again, spelled with T.second and two distinct dvector inputs
+    f = function([x,y], T.second(y,x)*2, mode=m)
+    assert [node.op for node in f.maker.env.toposort()] == [T.mul]
+
+    # now with different type: the template is an lvector, the value a dvector
+    f = function([x,z], T.fill(z,x)*2, mode=m)
+    assert [node.op for node in f.maker.env.toposort()] == [T.mul]
+
+    # now cutting out the input ?? (question marks are the original author's)
+    f = function([x,y], T.fill(x,y)*2, mode=m)
+    assert [node.op for node in f.maker.env.toposort()] == [T.mul]
+
+    # now fill is serving as a cast -- NOTE(review): same graph as the case above; confirm intent
+    f = function([x,y], T.fill(x,y)*2, mode=m)
+    assert [node.op for node in f.maker.env.toposort()] == [T.mul]
+
+class test_shapeoptimizer(unittest.TestCase):
+    def test0(self):
+        """Asking for (v+m).shape must not leave the add node in the graph."""
+        vec, mat = T.vector(), T.matrix()
+        f = function([vec, mat], (vec + mat).shape)
+        for apply_node in f.maker.env.toposort():
+            assert apply_node.op != T.add
+
+    def test_constant(self):
+        """The compiled graph for this shape entry must contain no apply nodes."""
+        vec, mat = T.vector(), T.matrix()
+        f = function([vec, mat], vec.dimshuffle('x','x',0).shape[1])
+        topo = f.maker.env.toposort()
+        print topo
+        assert [] == topo
+
 if __name__ == '__main__':
 #    unittest.main()
    test_fusion().tes_memory_leak()

--- a/theano/tensor/tests/test_shared_randomstreams.py
+++ b/theano/tensor/tests/test_shared_randomstreams.py
@@ -352,7 +352,7 @@ class T_SharedRandomStreams(unittest.TestCase):

    def test_vector_arguments(self):
        random = RandomStreams(utt.fetch_seed())
-        low = tensor.vector()
+        low = tensor.dvector()
        out = random.uniform(low=low, high=1)
        assert out.ndim == 1
        f = function([low], out)
@@ -402,8 +402,8 @@ class T_SharedRandomStreams(unittest.TestCase):

    def test_broadcast_arguments(self):
        random = RandomStreams(utt.fetch_seed())
-        low = tensor.vector()
-        high = tensor.col()
+        low = tensor.dvector()
+        high = tensor.dcol()
        out = random.uniform(low=low, high=high)
        assert out.ndim == 2
        f = function([low, high], out)
@@ -424,8 +424,8 @@ class T_SharedRandomStreams(unittest.TestCase):

    def test_uniform_vector(self):
        random = RandomStreams(utt.fetch_seed())
-        low = tensor.vector()
-        high = tensor.vector()
+        low = tensor.dvector()
+        high = tensor.dvector()
        out = random.uniform(low=low, high=high)
        assert out.ndim == 1
        f = function([low, high], out)
@@ -438,11 +438,15 @@ class T_SharedRandomStreams(unittest.TestCase):
        # Arguments of size (3,)
        val0 = f(low_val, high_val)
        numpy_val0 = numpy_rng.uniform(low=low_val, high=high_val)
+        print 'THEANO', val0
+        print 'NUMPY', numpy_val0
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        val1 = f(low_val[:-1], high_val[:-1])
        numpy_val1 = numpy_rng.uniform(low=low_val[:-1], high=high_val[:-1])
+        print 'THEANO', val1
+        print 'NUMPY', numpy_val1
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
@@ -486,8 +490,8 @@ class T_SharedRandomStreams(unittest.TestCase):

    def test_normal_vector(self):
        random = RandomStreams(utt.fetch_seed())
-        avg = tensor.vector()
-        std = tensor.vector()
+        avg = tensor.dvector()
+        std = tensor.dvector()
        out = random.normal(avg=avg, std=std)
        assert out.ndim == 1
        f = function([avg, std], out)