提交 d653a636 authored 作者: desjagui@atchoum.iro.umontreal.ca's avatar desjagui@atchoum.iro.umontreal.ca

merge

aa.x : aa.cc
g++ -O3 -ffast-math aa.cc -o aa.x -L${PUB_PREFIX}/lib -lgsl -lcblas -lgoto -lgfortran -lm
g++ -O3 -ffast-math aa.cc -o aa.x -L${PUB_PREFIX}/lib -lgsl ${THEANO_BLAS_LDFLAGS}
clean :
rm aa.x
......@@ -28,6 +28,7 @@ int main(int argc, char **argv)
int neg = strtol(argv[1], 0, 0);
int nout = strtol(argv[2], 0, 0);
int nin = nout;
int nhid = strtol(argv[3], 0, 0);
int niter = strtol(argv[4], 0, 0);
double lr = 0.01;
......@@ -35,8 +36,8 @@ int main(int argc, char **argv)
gsl_rng_set(rng, 234);
gsl_matrix * x = gsl_matrix_alloc(neg, nout);
gsl_matrix * w = gsl_matrix_alloc(nout, nhid);
gsl_matrix * x = gsl_matrix_alloc(neg, nin);
gsl_matrix * w = gsl_matrix_alloc(nin, nhid);
gsl_vector * a = gsl_vector_alloc(nhid);
gsl_vector * b = gsl_vector_alloc(nout);
gsl_matrix * xw = gsl_matrix_alloc(neg, nhid);
......@@ -59,11 +60,17 @@ int main(int argc, char **argv)
struct timeval tv0, tv1;
struct timeval tdot0, tdot1;
double time_of_dot = 0.0;
gettimeofday(&tv0, 0);
double err = 0.0;
for (int iter = 0; iter < niter; ++iter)
{
gettimeofday(&tdot0, 0);
gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, x, w, 0.0, xw);
gettimeofday(&tdot1, 0);
time_of_dot += pytime(&tdot1) - pytime(&tdot0);
for (int i = 0; i < neg; ++i)
for (int j = 0; j < nhid; ++j)
......@@ -72,7 +79,10 @@ int main(int argc, char **argv)
hid->data[i*nhid+j] = tanh(act);
}
gettimeofday(&tdot0, 0);
gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, hid, w, 0.0, hidwt);
gettimeofday(&tdot1, 0);
time_of_dot += pytime(&tdot1) - pytime(&tdot0);
for (int i = 0; i < nout; ++i) g_b->data[i] = 0.0;
err = 0.0;
......@@ -90,8 +100,11 @@ int main(int argc, char **argv)
if (1)
{
gettimeofday(&tdot0, 0);
gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, g_hidwt, w, 0.0, g_hid);
gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, g_hidwt, hid, 0.0, g_w);
gettimeofday(&tdot1, 0);
time_of_dot += pytime(&tdot1) - pytime(&tdot0);
for (int i = 0; i < neg; ++i)
......@@ -101,14 +114,19 @@ int main(int argc, char **argv)
a->data[j] -= lr * g_hid->data[i*nhid+j];
}
gettimeofday(&tdot0, 0);
gsl_blas_dgemm(CblasTrans, CblasNoTrans, -lr, x, g_hid, 1.0, w);
gettimeofday(&tdot1, 0);
time_of_dot += pytime(&tdot1) - pytime(&tdot0);
for (int i = 0; i < nout*nhid; ++i) w->data[i] -= lr * g_w->data[i];
}
}
gettimeofday(&tv1, 0);
fprintf(stdout, "took = %lfs to get err %lf\n", pytime(&tv1) - pytime(&tv0), 0.5 * err);
double total_time = pytime(&tv1) - pytime(&tv0);
fprintf(stdout, "took = %lfs to get err %lf\n", total_time, 0.5 * err);
fprintf(stdout, "... of which %.2lfs was spent in dgemm (fraction: %.2lf)\n", time_of_dot, time_of_dot / total_time);
//skip freeing
return 0;
}
......
......@@ -8,7 +8,15 @@ import theano
import theano.tensor as T
import theano.sandbox
import theano.sandbox.wraplinker
from theano.compile import module
from theano.compile import module, Mode
from theano.sandbox.wraplinker import ProfileMode
from theano import gof, Op, Apply
from theano.tensor import blas, opt
# numpy: aa_numpy.py
# c : aa.cc
if 0:
class Opt(object):
......@@ -130,32 +138,29 @@ if 0:
self.merge(env)
def linker(print_prog=False):
if 1:
print 'wtf?'
#return theano.gof.OpWiseCLinker()
imap = {None:'-'}
def blah(i, node, thunk):
imap[node] = str(i)
if print_prog:# and node.op.__class__ is T.DimShuffle:
if False and node.op == T.DimShuffle((), ['x', 'x'], inplace = True):
print node.op == T.DimShuffle((), ['x', 'x'], inplace = True),
print node.inputs[0], type(node.inputs[0]),
print node.inputs[0].equals(T.constant(2)),
outputs = node.outputs
inputs = theano.gof.graph.inputs(outputs)
print 'node ', i, node,
print ':'.join([imap[inp.owner] for inp in node.inputs])
#print theano.sandbox.pprint.pp.process_graph(inputs, outputs)
return theano.sandbox.wraplinker.WrapLinkerMany(
[theano.gof.OpWiseCLinker()],
[theano.sandbox.wraplinker.run_all
,blah
#,theano.sandbox.wraplinker.numpy_notall_isfinite
])
else:
return theano.gof.OpWiseCLinker()
def print_graph_linker(print_prog=True):
if 1:
imap = {None:'-'}
def blah(i, node, thunk):
imap[node] = str(i)
if print_prog:# and node.op.__class__ is T.DimShuffle:
if False and node.op == T.DimShuffle((), ['x', 'x'], inplace = True):
print node.op == T.DimShuffle((), ['x', 'x'], inplace = True),
print node.inputs[0], type(node.inputs[0]),
print node.inputs[0].equals(T.constant(2)),
outputs = node.outputs
inputs = theano.gof.graph.inputs(outputs)
print 'node ', i, node,
print ':'.join([imap[inp.owner] for inp in node.inputs])
#print theano.sandbox.pprint.pp.process_graph(inputs, outputs)
return theano.sandbox.wraplinker.WrapLinkerMany(
[theano.gof.OpWiseCLinker()],
[theano.sandbox.wraplinker.run_all
,blah
#,theano.sandbox.wraplinker.numpy_notall_isfinite
])
else:
return theano.gof.OpWiseCLinker()
class M(module.Module):
......@@ -167,11 +172,14 @@ class M(module.Module):
self.a = module.Member(T.vector('a')) # hid bias
self.b = module.Member(T.vector('b')) # output bias
hid = T.tanh(T.dot(x, self.w) + self.a)
self.hid = T.tanh(T.dot(x, self.w) + self.a)
hid = self.hid
out = T.tanh(T.dot(hid, self.w.T) + self.b)
self.out = T.tanh(T.dot(hid, self.w.T) + self.b)
out = self.out
err = 0.5 * T.sum((out - x)**2)
self.err = 0.5 * T.sum((out - x)**2)
err = self.err
params = [self.w, self.a, self.b]
......@@ -182,7 +190,13 @@ class M(module.Module):
self.step = module.Method([x], err, updates=dict(updates))
mod = M()
m = mod.make(mode='FAST_RUN')
mode = 'FAST_RUN'
#mode = ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
mode = Mode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker(nice_errors=True))
mode = Mode(optimizer='fast_run', linker='c')
mode = Mode(optimizer='fast_run', linker='c|py')
print mod.pretty(mode=mode)
m = mod.make(mode=mode)
neg, nout, nhid, niter = [int(a) for a in sys.argv[1:]]
rng = numpy.random.RandomState(342)
......@@ -196,4 +210,10 @@ t = time.time()
for i in xrange(niter):
err = m.step(x)
print 'time: ',time.time() - t, 'err: ', err
try:
mode.print_summary()
pass
except:
pass
#!/usr/bin/env python2.5
# Pure-numpy benchmark of a one-hidden-layer tied-weight autoencoder trained
# by plain gradient descent.  Companion to the C/GSL version (aa.cc): both
# report total wall-clock time plus the fraction spent inside matrix products,
# so the two implementations can be compared.
# Usage: aa_numpy.py <neg> <nout> <nhid> <niter>
from __future__ import absolute_import
import numpy as N
import sys
import time

# c: aa.cc

# neg: number of examples; nout: input/output width (nin == nout);
# nhid: hidden-layer width; niter: number of training iterations.
neg, nout, nhid, niter = [int(a) for a in sys.argv[1:]]

lr = 0.01  # fixed learning rate

rng = N.random.RandomState(342)  # fixed seed for reproducible timings

w = rng.rand(nout, nhid)      # weight matrix, tied: used as w (encode) and w.T (decode)
a = rng.randn(nhid) * 0.0     # hidden bias, initialized to zero
b = rng.randn(nout) * 0.0     # output bias, initialized to zero
x = (rng.rand(neg, nout)-0.5) * 1.5  # input batch, uniform in roughly [-0.75, 0.75]

dot_time = 0.0  # accumulated wall-clock time spent inside N.dot calls only
t = time.time()
for i in xrange(niter):
    # forward pass: hidden activations
    tt = time.time()
    d = N.dot(x, w)
    dot_time += time.time() - tt
    hid = N.tanh(d + a)
    # forward pass: reconstruction through the transposed (tied) weights
    tt = time.time()
    d = N.dot(hid, w.T)
    dot_time += time.time() - tt
    out = N.tanh(d + b)
    # squared-error reconstruction cost
    g_out = out - x
    err = 0.5 * N.sum(g_out**2)
    # backprop through the output tanh (tanh' = 1 - tanh^2)
    g_hidwt = g_out * (1.0 - out**2)
    b -= lr * N.sum(g_hidwt, axis=0)
    tt = time.time()
    g_hid = N.dot(g_hidwt, w)
    dot_time += time.time() - tt
    # backprop through the hidden tanh
    g_hidin = g_hid * (1.0 - hid**2)
    # the weight gradient has two terms because w is used in both the
    # decoder (g_hidwt.T . hid) and the encoder (x.T . g_hidin)
    tt = time.time()
    d = N.dot(g_hidwt.T, hid)
    dd = N.dot(x.T, g_hidin)
    dot_time += time.time() - tt
    gw = (d + dd)
    w -= lr * gw
    a -= lr * N.sum(g_hidin, axis=0)
total_time = time.time() - t
print 'time: ',total_time, 'err: ', err
print ' of which', dot_time, 'was spent on dot. Fraction:', dot_time / total_time
......@@ -89,8 +89,9 @@ Get the source and run the tests like this:
.. code-block:: bash
hg clone http://pylearn.org/hg/theano theano
cd theano
hg clone http://pylearn.org/hg/theano Theano
ln -s Theano/theano <someplace on your PYTHONPATH>/theano
cd Theano
nosetests
To update your library to the latest on pylearn.org, change directory (`cd`) to this `theano` folder and type
......
......@@ -664,6 +664,10 @@ class ComponentList(Composite):
return self.__class__(*[c.dup() for c in self._components])
def default_initialize(self, init = {}, **kwinit):
for k, initv in dict(init, **kwinit).iteritems():
self[k] = initv
class ComponentDictInstance(CompositeInstance):
"""
ComponentDictInstance is meant to be instantiated by ComponentDict.
......
......@@ -23,11 +23,12 @@ from op import \
from opt import \
Optimizer, optimizer, SeqOptimizer, \
MergeOptimizer, MergeOptMerge, \
LocalOptimizer, local_optimizer, LocalOptGroup, LocalOpKeyOptGroup, \
LocalOptimizer, local_optimizer, LocalOptGroup, \
OpSub, OpRemove, PatternSub, \
NavigatorOptimizer, TopoOptimizer, OpKeyOptimizer, EquilibriumOptimizer, \
NavigatorOptimizer, TopoOptimizer, EquilibriumOptimizer, \
keep_going, warn, \
InplaceOptimizer, PureThenInplaceOptimizer
#LocalOpKeyOptGroup, OpKeyOptimizer
from optdb import \
DB, Query, \
......
......@@ -686,7 +686,16 @@ class CLinker(link.Linker):
instantiate.customize.add_support_code(support_code)
instantiate.customize.add_support_code(self.struct_code)
instantiate.customize.add_support_code(static)
instantiate.customize.add_extra_compile_arg("-w")
for extra_arg in (
"-O2",
"-ffast-math",
#"-fprefetch-loop-arrays",
#"-ftree-vect-loop-version",
#"-ftree-loop-optimize",
#"-ftree-vectorize"):
"-w" #-w means supress all warnings
):
instantiate.customize.add_extra_compile_arg(extra_arg)
for arg in self.compile_args():
instantiate.customize.add_extra_compile_arg(arg)
for header in self.headers():
......@@ -739,6 +748,7 @@ def _execute(cthunk, init_tasks, tasks, error_storage):
exc_value = exc_type(_exc_value, task)
exc_value.__thunk_trace__ = trace # this can be used to retrieve the location the Op was declared
raise exc_type, exc_value, exc_trace
execute.cthunk = cthunk
return execute
......@@ -761,9 +771,12 @@ class OpWiseCLinker(link.LocalLinker):
__cache__ = {}
def __init__(self, fallback_on_perform = True):
def __init__(self,
fallback_on_perform = True,
nice_errors = True):
self.env = None
self.fallback_on_perform = fallback_on_perform
self.nice_errors = nice_errors
def accept(self, env, no_recycling = []):
if self.env is not None and self.env is not env:
......@@ -833,7 +846,9 @@ class OpWiseCLinker(link.LocalLinker):
else:
no_recycling = [storage_map[r] for r in no_recycling if r not in env.inputs]
f = link.streamline(env, thunks, order, no_recycling = no_recycling, profiler = profiler)
f = link.streamline(env, thunks, order,
no_recycling = no_recycling,
nice_errors = self.nice_errors)
return f, [link.Container(input, storage) for input, storage in zip(env.inputs, input_storage)], \
[link.Container(output, storage, True) for output, storage in zip(env.outputs, output_storage)], \
......@@ -841,7 +856,6 @@ class OpWiseCLinker(link.LocalLinker):
def _default_checker(x, y):
"""WRITEME
Default checker for DualLinker. This checks that the
......
......@@ -13,6 +13,7 @@ from collections import deque
import utils
_creation_idx = [0]
class Apply(utils.object2):
"""
......@@ -121,6 +122,13 @@ class Apply(utils.object2):
def __asapply__(self):
return self
def __hash__(self):
if not hasattr(self, '_creation_idx'):
self._creation_idx = _creation_idx[0]
_creation_idx[0] += 1
return self._creation_idx
def clone(self):
"""Duplicate this Apply instance with inputs = self.inputs.
......@@ -567,7 +575,10 @@ def general_toposort(r_out, deps, debug_print = False):
deps(i) should behave like a pure function (no funny business with internal state)
:note:
deps(i) can/should be cached by the deps function to be fast
deps(i) will be cached by this function (to be fast)
:note:
The order of the return value list is determined by the order of nodes returned by the deps() function.
"""
deps_cache = {}
def _deps(io):
......@@ -611,8 +622,9 @@ def general_toposort(r_out, deps, debug_print = False):
def io_toposort(i, o, orderings = {}):
"""WRITEME
"""
#the inputs are used only here in the function that decides what 'predecessors' to explore
iset = set(i)
def deps(obj):
def deps(obj):
rval = []
if obj not in iset:
if isinstance(obj, Result):
......
......@@ -5,6 +5,7 @@ from type import Type
import sys, traceback
from copy import copy
from cutils import run_cthunk
__excepthook = sys.excepthook
......@@ -225,9 +226,27 @@ def clear_storage_thunk(stg):
thunk.inputs = [stg]
return thunk
def streamline(env, thunks, order, no_recycling = [], profiler = None):
"""WRITEME"""
if profiler is None:
def streamline(env, thunks, order, no_recycling = [], profiler = None, nice_errors = True):
"""WRITEME
:param env:
:param thunks: the list of program instructions
:param order: the list of apply instances that gave rise to the thunks (same order as thunks)
:param no_recycling: storage elements that cannot be 'recycled' by repeatedly executing the
program. These storage elements are cleared before re-running.
:param profiler: deprecated
:param nice_errors: run in such a way that the double-traceback is printed. This costs a
bit of performance in the inner python loop.
"""
if profiler is not None:
raise NotImplementedError()
if nice_errors:
def f():
for x in no_recycling:
x[0] = None
......@@ -237,14 +256,13 @@ def streamline(env, thunks, order, no_recycling = [], profiler = None):
except:
raise_with_op(node)
else:
# don't worry about raise_with_op, just go a little faster.
#there is a mix of python and c thunks
def f():
for x in no_recycling:
x[0] = None
def g():
for thunk, node in zip(thunks, order):
profiler.profile_node(thunk, node)
profiler.profile_env(g, env)
f.profiler = profiler
for thunk in thunks:
thunk()
return f
class LocalLinker(Linker):
......
差异被折叠。
......@@ -4,16 +4,31 @@ import opt
class DB(object):
def __hash__(self):
if not hasattr(self, '_optimizer_idx'):
self._optimizer_idx = opt._optimizer_idx[0]
opt._optimizer_idx[0] += 1
return self._optimizer_idx
def __init__(self):
self.__db__ = defaultdict(set)
self._names = set()
def register(self, name, obj, *tags):
# N.B. obj is not an instance of class Optimizer.
# It is an instance of a DB.In the tests for example,
# this is not always the case.
if not isinstance(obj, (DB, opt.Optimizer, opt.LocalOptimizer)):
raise Exception('wtf', obj)
obj.name = name
if name in self.__db__:
raise ValueError('The name of the object cannot be an existing tag or the name of another existing object.', obj, name)
self.__db__[name] = set([obj])
self._names.add(name)
for tag in tags:
if tag in self._names:
raise ValueError('The tag of the object collides with a name.', obj, tag)
self.__db__[tag].add(obj)
def __query__(self, q):
......
if 0:
class _EquilibriumOptimizer(NavigatorOptimizer):
def __init__(self,
local_optimizers,
failure_callback = None,
max_depth = None,
max_use_ratio = None):
super(EquilibriumOptimizer, self).__init__(
None,
ignore_newtrees = False,
failure_callback = failure_callback)
self.local_optimizers = local_optimizers
self.max_depth = max_depth
self.max_use_ratio = max_use_ratio
self.tracks = defaultdict(list)
self.tracks0 = defaultdict(list)
max_depth = 0
for lopt in local_optimizers:
tracks = lopt.tracks()
for track in tracks:
max_depth = max(max_depth, len(track))
if self.max_depth is not None and max_depth > self.max_depth:
raise ValueError('One of the local optimizers exceeds the maximal depth.')
for i, op in enumerate(track):
if i == 0:
self.tracks0[op].append((track, i, lopt))
self.tracks[op].append((track, i, lopt))
def fetch_tracks(self, op):
return self.tracks[op] + self.tracks[None]
def fetch_tracks0(self, op):
return self.tracks0[op] + self.tracks0[None]
def backtrack(self, node, tasks):
candidates = self.fetch_tracks(node.op)
tracks = []
def filter(node, depth):
new_candidates = []
for candidate in candidates:
track, i, lopt = candidate
if i < depth:
pass
elif track[i-depth] in (None, node.op):
if i == depth:
tasks[node].append(lopt)
else:
tracks.append(candidate)
else:
new_candidates.append(candidate)
return new_candidates
depth = 0
nodes = [node]
while candidates:
for node in nodes:
candidates = filter(node, depth)
depth += 1
_nodes = nodes
nodes = reduce(list.__iadd__,
[reduce(list.__iadd__,
[[n for n, i in out.clients if not isinstance(n, str)] for out in node.outputs],
[]) for node in nodes],
[])
candidates = tracks
tracks = []
def apply(self, env):
tasks = defaultdict(list)
if self.max_use_ratio is not None:
max_uses = self.max_use_ratio * len(env.nodes)
runs = defaultdict(int)
else:
runs = None
def importer(node):
#print 'IMPORTING', node
self.backtrack(node, tasks)
def pruner(node):
try:
del tasks[node]
except KeyError:
pass
def chin(node, i, r, new_r):
if new_r.owner and not r.clients:
self.backtrack(new_r.owner, tasks)
# # == NOT IDEAL == #
# for node in env.nodes:
# importer(node)
for node in env.toposort():
tasks[node].extend(lopt for track, i, lopt in self.fetch_tracks0(node.op))
u = self.attach_updater(env, importer, pruner, chin)
print 'KEYS', map(hash, tasks.keys())
while tasks:
for node in tasks.iterkeys():
todo = tasks.pop(node)
break
for lopt in todo:
if runs is not None and runs[lopt] >= max_uses:
print >>sys.stderr, 'Warning: optimization exceeded its maximal use ratio: %s, %s' % (lopt, max_uses)
continue
success = self.process_node(env, node, lopt)
if success:
if runs is not None: runs[lopt] += 1
break
self.detach_updater(env, u)
# def match(self, node, candidates):
# candidates[:] = [candidate
# for candidate in candidates
# if candidate.current.op is None or candidate.current.op == node.op]
# for candidate in candidates:
# if candidate.current.inputs is not None:
# for in1, in2 in zip(candidate.current.inputs, node.inputs):
# if isinstance(in1, str):
# candidate.match[in1] = in2
# for client in node.clients:
# op = node.op
# patterns = self.pattern_base[(depth, op)].union(self.pattern_base[(depth, WILDCARD)])
# if not patterns:
# return patterns
# return self.match(node, depth + 1).intersection(patterns)
# def backtrack(self, node, q):
# for node2, i in node.clients:
# op2 = node2.op
......@@ -375,7 +375,7 @@ class TestEquilibrium(object):
x, y, z = map(MyResult, 'xyz')
e = op3(op4(x, y))
g = Env([x, y, z], [e])
print g
print 'before', g
sys.stderr = sys.stdout # display pesky warnings along with stdout
opt = EquilibriumOptimizer(
[PatternSub((op1, 'x', 'y'), (op2, 'x', 'y')),
......@@ -384,7 +384,7 @@ class TestEquilibrium(object):
],
max_use_ratio = 1. / len(g.nodes)) # each opt can only be applied once
opt.optimize(g)
print g
print 'after', g
assert str(g) == '[Op4(x, y)]'
......
from theano.gof.optdb import *
from unittest import TestCase


class Test_DB(TestCase):
    """Tests for the optimization database's registration rules (optdb.DB)."""

    def test_0(self):
        # DB.register only accepts DB / Optimizer / LocalOptimizer instances;
        # subclassing opt.Optimizer also provides the __hash__ that DB needs.
        class Opt(opt.Optimizer): #inheritance buys __hash__
            name = 'blah'

        db = DB()
        db.register('a', Opt())
        db.register('b', Opt())
        db.register('c', Opt(), 'z', 'asdf')  # 'z' and 'asdf' are tags

        # Re-using an existing name must raise ValueError ("The name ...").
        try:
            db.register('c', Opt()) #name taken
            self.fail()
        except ValueError, e:
            if e[0].startswith("The name"):
                pass
            else:
                raise
        except:
            self.fail()

        # A name that collides with an existing tag must also raise
        # the same "The name ..." ValueError.
        try:
            db.register('z', Opt()) #name collides with tag
            self.fail()
        except ValueError, e:
            if e[0].startswith("The name"):
                pass
            else:
                raise
        except:
            self.fail()

        # A fresh name whose *tag* collides with an existing name must raise
        # the "The tag ..." ValueError.
        try:
            db.register('u', Opt(), 'b') #name new but tag collides with name
            self.fail()
        except ValueError, e:
            if e[0].startswith("The tag"):
                pass
            else:
                raise
        except:
            self.fail()
......@@ -2,6 +2,7 @@ import gof #, gof.result
import numpy #for numeric_grad
from gof.python25 import all
import gof.utils
_msg_retType = 'op.grad(...) returned a non-list'
_msg_badlen = 'op.grad(...) returned wrong number of gradients'
......@@ -55,17 +56,17 @@ def grad_sources_inputs(sources, graph_inputs):
else:
gmap[r] = g_r
graph_outputs = gmap.keys()
graph_outputs = gof.utils.uniq([r for r,g in sources])
if graph_inputs is None:
graph_inputs = gof.graph.inputs(graph_outputs)
for node in gof.graph.io_toposort(graph_inputs, graph_outputs).__reversed__():
g_outputs = [gmap.get(o,None) for o in node.outputs]
#if all output gradients are None, continue
if all(map(lambda x:x is None, g_outputs)): continue
output_arg = g_outputs
input_arg = node.inputs
......
......@@ -235,17 +235,27 @@ class PPrinter:
else:
raise TypeError('Not enough arguments to call.')
special = dict(middle_dot = u"\u00B7",
big_sigma = u"\u03A3")
greek = dict(alpha = u"\u03B1",
beta = u"\u03B2",
gamma = u"\u03B3",
delta = u"\u03B4",
epsilon = u"\u03B5")
use_ascii = True
if use_ascii:
special = dict(middle_dot = "\dot",
big_sigma = "\Sigma")
greek = dict(alpha = "\alpha",
beta = "\beta",
gamma = "\gamma",
delta = "\delta",
epsilon = "\epsilon")
else:
special = dict(middle_dot = u"\u00B7",
big_sigma = u"\u03A3")
greek = dict(alpha = u"\u03B1",
beta = u"\u03B2",
gamma = u"\u03B3",
delta = u"\u03B4",
epsilon = u"\u03B5")
pprint = PPrinter()
......
......@@ -2,6 +2,7 @@ from __future__ import absolute_import
import time
import numpy
from ..gof.cutils import run_cthunk
from ..gof.link import WrapLinker
from ..compile.mode import Mode
......@@ -103,49 +104,82 @@ def DualLinker(linkers):
class ProfileMode(Mode):
def __init__(self, local_linker, optimizer=None):
def __init__(self, linker, optimizer=None):
local_time = [0.0]
apply_time = {}
op_time = {}
op_cimpl = {}
def blah(i, node, *thunks):
t0 = time.time()
for th in thunks:
th()
dt = time.time() - t0
if 0:
t0 = time.time()
for th in thunks:
th()
dt = time.time() - t0
elif 0: #more precise timing
for th in thunks:
t0 = time.time()
th()
dt = time.time() - t0
elif 1:
for th in thunks:
if hasattr(th, 'cthunk'):
t0 = time.time()
run_cthunk(th.cthunk)
dt = time.time() - t0
else:
t0 = time.time()
th()
dt = time.time() - t0
elif 1:
pass
else:
raise Exception('one of the cases has to run the thunks!')
local_time[0] += dt
apply_time[(i,node.op)] = apply_time.get((i,node.op), 0.0) + dt
op_time[node.op] = op_time.get(node.op, 0.0) + dt
op_cimpl[node.op] = hasattr(thunks[0], 'cthunk')
self.local_time = local_time
self.apply_time = apply_time
self.op_time = op_time
self.op_cimpl = op_cimpl
linker = WrapLinkerMany([local_linker], [blah])
wrap_linker = WrapLinkerMany([linker], [blah])
if optimizer:
Mode.__init__(self, linker, optimizer)
super(ProfileMode, self).__init__(wrap_linker, optimizer)
else:
Mode.__init__(self, linker)
super(ProfileMode, self).__init__(wrap_linker)
def print_summary(self):
local_time = self.local_time[0]
apply_time = self.apply_time
op_time = self.op_time
print 'local_time', local_time
print 'apply-wise times'
print ''
print 'ProfileMode.print_summary()'
print '---------------------------'
print ''
print 'local_time', local_time, '(Time spent running thunks)'
print 'Apply-wise summary: <fraction of local_time spent at this position> (<Apply position>, <Apply Op name>)'
atimes = [(t/local_time, (a[0], str(a[1]))) for a, t in apply_time.items()]
atimes.sort()
atimes.reverse()
for t,a in atimes[:15]:
print ' ', t, a
print ' ...' #show that we are ignoring applies that don't take much time
print 'op-wise times'
otimes = [(t/local_time, a) for a, t in op_time.items()]
print '\t%.3f\t%i\t%s' % (t, a[0], a[1])
print ' ... (remaining %i Apply instances account for %.2f of the runtime)'\
%(max(0, len(atimes)-15), sum(t for t, a in atimes[15:]))
n_ops_to_print = 20
print 'Op-wise summary: <fraction of local_time spent on this kind of Op> <Op name>'
otimes = [(t/local_time, a, self.op_cimpl[a]) for a, t in op_time.items()]
otimes.sort()
otimes.reverse()
for t,a in otimes[:15]:
print ' ', t, a
print ' ...' #show that we are ignoring applies that don't take much time
print sum(t for a,t in op_time.items())
for t,a,ci in otimes[:n_ops_to_print]:
print '\t%.3f\t%s %s' % (t, '*' if ci else ' ', a)
print ' ... (remaining %i Ops account for %.2f of the runtime)'\
%(max(0, len(otimes)-n_ops_to_print), sum(t for t, a, ci in
otimes[n_ops_to_print:]))
print '(*) Op is running a c implementation'
......@@ -2,6 +2,7 @@
from basic import *
import opt
import blas
import raw_random
from raw_random import \
......
差异被折叠。
差异被折叠。
差异被折叠。
......@@ -103,16 +103,18 @@ class DimShuffle(Op):
for i, b in enumerate(input_broadcastable):
if i not in new_order:
# we want to drop this dimension because it's not a value in new_order
if b == 1:
if b == 1: # 1 aka True
self.drop.append(i)
else:
# we cannot drop non-broadcastable dimensions
raise NotImplementedError("You cannot drop a non-broadcastable dimension.")
raise ValueError("You cannot drop a non-broadcastable dimension.")
else:
i2j[i] = j
j += 1
# transposition of non-broadcastable dimensions
# This is how the dimensions will be permuted, without accounting for the extra
# 'x' broadcastable dimensions to insert.
self.shuffle = [i2j[x] for x in new_order if x != 'x']
# list of dimensions of the output that are broadcastable and were not in the original input
......@@ -144,7 +146,8 @@ class DimShuffle(Op):
and self.input_broadcastable == other.input_broadcastable
def __hash__(self):
return hash(self.inplace) ^ hash(self.new_order) ^ hash(self.input_broadcastable)
return hash(type(self)) ^ hash(self.inplace) \
^ hash(self.new_order) ^ hash(self.input_broadcastable)
def __str__(self):
if self.inplace:
......@@ -175,13 +178,78 @@ class DimShuffle(Op):
storage[0] = res
def c_code(self, node, name, (input,), (res,), sub):
def statements(lst):
return ';\n'.join(lst) + ';'
nd_in = len(self.input_broadcastable)
nd_out = len(self.new_order)
check_input_nd = [('if (%(input)s->nd != ' + str(nd_in) + ')'
'{PyErr_SetString(PyExc_NotImplementedError, "input nd"); %(fail)s;}')]
clear_output = ['if (%(res)s) {Py_XDECREF(%(res)s);}']
shape_statements = ['npy_intp dimensions[%i]'%nd_out]
shape_statements += [('dimensions['+str(i)+'] = %(input)s->dimensions['+str(o)+']')
if o != 'x' else
('dimensions['+str(i)+'] = 1')
for i, o in enumerate(self.new_order)]
strides_statements = ['npy_intp strides[%i]'%nd_out]
strides_statements += [('strides['+str(i)+'] = %(input)s->strides['+str(o)+']')
if o != 'x' else
('strides['+str(i)+'] = 0')
for i, o in enumerate(self.new_order)]
if self.inplace:
get_base = ['{ PyArrayObject * base = %(input)s', 'Py_INCREF((PyObject*)base)']
else:
get_base = [('{ PyArrayObject * base = (PyArrayObject*)PyArray_FromAny((PyObject*)%(input)s, NULL,'
'0, 0, NPY_ALIGNED|NPY_ENSURECOPY, NULL)')]
alloc_output = [('%(res)s = (PyArrayObject*)PyArray_New(&PyArray_Type, '
'' + str(nd_out) + ', dimensions, '
'PyArray_TYPE(base), strides, '
'base->data, base->descr->elsize, '
'PyArray_FLAGS(base), NULL)'),
'%(res)s->base = (PyObject*)base',
'}']
full_code = statements(check_input_nd
+ clear_output
+ shape_statements
+ strides_statements
+ get_base
+ alloc_output)
if 0:
print 'C_CODE'
print ''
print self
print "IN BROAD", self.input_broadcastable
print "NEW ORDER", self.new_order
print "SHUFFLE", self.shuffle
print "AUGMENT", self.augment
print '------------'
print ''
print full_code
if 0:
import sys
sys.exit()
return full_code % dict(locals(), **sub)
def grad(self, (x, ), (gz, )):
gz = as_tensor(gz)
grad_order = ['x'] * len(x.type.broadcastable)
for i, v in enumerate(self.new_order):
if v != 'x':
grad_order[v] = i
return DimShuffle(gz.type.broadcastable, grad_order)(gz),
return [DimShuffle(gz.type.broadcastable, grad_order, inplace=True)(Elemwise(scalar.identity)(gz))]
......
from basic import _scal_elemwise, _transpose_inplace
from .basic import _scal_elemwise #, _transpose_inplace
from .. import scalar as scal
import elemwise
from .. import printing
......@@ -183,9 +183,11 @@ pprint.assign(div_inplace, printing.OperatorPrinter('/=', -1, 'left'))
pprint.assign(pow_inplace, printing.OperatorPrinter('**=', 1, 'right'))
transpose_inplace = _transpose_inplace
"""WRITEME"""
def transpose_inplace(x, **kwargs):
"""Perform a transpose on a tensor without copying the underlying storage"""
dims = range(x.ndim-1, -1, -1)
return elemwise.DimShuffle(x.broadcastable, dims, inplace=True)(x)
pprint.assign(transpose_inplace, printing.MemberPrinter('T'))
#pprint.assign(transpose_inplace, printing.MemberPrinter('T'))
......@@ -203,6 +203,7 @@ class SoftmaxWithBias(gof.Op):
for (j = 0; j < Nx[1]; ++j)
{
double row_ij = x_i[j * Sx] + b_i[j * Sb];
// std::cout << "1" << row_ij << "\\n";
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
}
......@@ -210,13 +211,23 @@ class SoftmaxWithBias(gof.Op):
for (j = 0; j < Nx[1]; ++j)
{
double row_ij = x_i[j * Sx] + b_i[j * Sb];
// std::cout << "2" << row_ij << "\\n";
double sm_ij = exp(row_ij - row_max);
// std::cout << "3" << sm_ij << "\\n";
sum += sm_ij;
sm_i[j * Ssm] = sm_ij;
}
if ( (0.0 == sum) || (std::isinf(sum)))
if (std::isinf(sum))
{
//that was our best...
PyErr_SetString(PyExc_ValueError, "softmax is impossible (inf)!");
%(fail)s;
}
if (0.0 == sum)
{
//that was our best...
PyErr_SetString(PyExc_ValueError, "softmax is impossible (zero)!");
%(fail)s;
}
......@@ -600,6 +611,7 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
}
if (y_i >= %(dx)s->dimensions[1])
{
PyErr_SetString(PyExc_ValueError, "y_i >= dx dimensions[1]");
%(fail)s;
}
dx_i[y_i * Sdx] -= dnll_i;
......
"""Tensor optimizations addressing the ops in basic.py
"""
# TODO: intelligent merge for mul/add
# TODO: 0*x -> 0
......@@ -30,28 +31,6 @@ def in2out(*local_opts, **kwargs):
**kwargs)
# gemm: (d,a,b,c,s) -> d = d*s + a*dot(b,c)
# Transforms d -= a * dot(b, c) into gemm(d, -a, b, c, 1.0)
gemm_pattern_1 = gof.PatternSub((T.sub,
'd',
(T.mul,
dict(pattern = (T.DimShuffle((), ['x', 'x'], inplace = True), 'a'),
allow_multiple_clients = True),
(T.dot, 'b', 'c'))),
(T.gemm, 'd', (T.neg, 'a'), 'b', 'c', T.constant(1.0)),
allow_multiple_clients = False)
# gemm: (d,a,b,c,s) -> d = d*s + a*dot(b,c)
# Transforms dot(a, b) into gemm(zeros(2)(hstack(shape(a)[:1], shape(b)[1:])), 1.0, a, b, 1.0)
# The construction of the 'gemm' node may fail if, for example, a and b are not both matrices.
dot_to_gemm = gof.PatternSub((T.dot, 'a', 'b'),
(T.gemm, (T.Zeros(2),
(T.stack,
(T.Subtensor([slice(0, 1)]), (T.shape, 'a')),
(T.Subtensor([slice(1, 2)]), (T.shape, 'b')))),
T.constant(1.0), 'a', 'b', T.constant(1.0)),
allow_multiple_clients = False)
def _insert_inplace_optimizer(env):
"""
......@@ -91,12 +70,6 @@ def _insert_inplace_optimizer(env):
break
insert_inplace_optimizer = gof.optimizer(_insert_inplace_optimizer)
inplace_optimizer = gof.InplaceOptimizer(
gof.SeqOptimizer(out2in(gemm_pattern_1),
insert_inplace_optimizer,
failure_callback = gof.warn))
compile.optdb.register('inplace_opt', inplace_optimizer, 99, 'fast_run', 'inplace')
def register_canonicalize(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__
......@@ -216,6 +189,13 @@ register_canonicalize(local_shape_lift_dot)
################
def encompasses_broadcastable(b1, b2):
"""
Returns True if the broadcastable patterns b1 and b2 are such that b2 is
broadcasted to b1's shape and not the opposite.
:param b1: the broadcastable attribute of a tensor type
:param b2: the broadcastable attribute of a tensor type
"""
if len(b1) < len(b2):
return False
b1 = b1[-len(b2):]
......@@ -330,6 +310,7 @@ def local_fill_cut(node):
register_canonicalize(local_fill_cut)
register_canonicalize(gof.OpRemove(T.tensor_copy), name='remove_tensor_copy' )
@gof.local_optimizer([None, T.fill])
def local_fill_sink(node):
......@@ -524,9 +505,30 @@ class Canonizer(gof.LocalOptimizer):
return False
new = self.merge_num_denum(num, denum)
if new.type != out.type:
if new.dtype != out.dtype:
#new = T.fill(out, new)
new = T.fill(out, T.Elemwise(scalar.Identity(scalar.specific_out(getattr(scalar, out.type.dtype))))(new))
elem_op = T.Elemwise(scalar.Identity(scalar.specific_out(getattr(scalar, out.type.dtype))))
new = T.fill(out, elem_op(new))
if new.broadcastable != out.broadcastable:
#this case is tricky... we need to provide exactly the same kind of broadcastable
#pattern, but only if legal...
dlen = len(new.broadcastable) - len(out.broadcastable)
if dlen > 0:
#try to take the leading ranks of new.broadcastable, which should be broadcastable
# ranks
#if this means skipping over nonbroadcastable ranks, then DimShuffle will fail
dimshuffle_op = T.DimShuffle(new.broadcastable,
range(dlen, len(new.broadcastable)))
new = dimshuffle_op(new)
elif dlen < 0:
#we have to boost up a scalar or something
dimshuffle_op = T.DimShuffle(new.broadcastable,
['x' for x in range(-dlen)] + range(0, len(new.broadcastable)))
new = dimshuffle_op(new)
# if our if's above worked, this should be true. OTW investigate.
assert new.type == out.type
return [new]
def __str__(self):
......@@ -550,6 +552,7 @@ def local_neg_to_mul(node):
return [-1 * node.inputs[0]]
else:
return False
register_canonicalize(local_neg_to_mul)
@gof.local_optimizer([T.mul])
def local_mul_to_neg(node):
......@@ -557,6 +560,7 @@ def local_mul_to_neg(node):
return [-local_mul_canonizer.merge_num_denum(node.inputs[1:], [])]
else:
return False
register_specialize(local_mul_to_neg)
@gof.local_optimizer([T.div])
def local_div_to_inv(node):
......@@ -564,10 +568,120 @@ def local_div_to_inv(node):
return [T.inv(local_mul_canonizer.merge_num_denum(node.inputs[1:], []))]
else:
return False
# NOTE(review): these registrations duplicate the ones appearing immediately
# after each optimizer's definition above (merge/diff residue). Registering
# the same local optimizer twice is redundant — verify and deduplicate.
register_canonicalize(local_neg_to_mul)
register_specialize(local_mul_to_neg)
register_specialize(local_div_to_inv)
@gof.local_optimizer([T.inv])
def local_inv_canon(node):
    """Canonicalization: rewrite ``inv(x)`` as ``pow(x, -1.0)`` so that all
    reciprocal-like expressions share a single canonical form."""
    if node.op != T.inv:
        return False
    base = node.inputs[0]
    return [T.pow(base, -1.0)]
register_canonicalize(local_inv_canon)
@gof.local_optimizer([T.pow])
def local_pow_canonicalize(node):
    """Canonicalization: simplify ``pow(x, 1)`` to ``x`` and ``pow(x, 0)``
    to ones, wrapped in ``fill`` so the broadcasted output shape of the
    original ``pow`` is preserved.

    Returns a one-element replacement list on success, ``False`` otherwise.
    """
    if node.op != T.pow:
        return False
    # Hoisted: the original computed get_constant twice for the same input.
    # get_constant may return None; N.all(None == c) is falsy, so both
    # branches below are simply skipped in that case.
    exponent = local_mul_canonizer.get_constant(node.inputs[1])
    if N.all(exponent == 1.0):
        # fill keeps the broadcasted size of the original pow output
        return [T.fill(node.inputs[1], node.inputs[0])]
    if N.all(exponent == 0.0):
        #extra fills here are to make sure the size of the output stays constant.
        return [T.fill(node.inputs[0], T.fill(node.inputs[1], 1.0))]
    # Defect fixed: the original fell through here and implicitly returned
    # None when node.op == T.pow but neither constant matched; be explicit
    # and consistent with the other local optimizers.
    return False
register_canonicalize(local_pow_canonicalize)
@gof.local_optimizer([T.pow])
def local_pow_specialize(node):
    """Specialization: replace ``pow(x, c)`` for special constant exponents
    c in {2, 1, 0, 0.5, -0.5, -1, -2} with cheaper equivalents (sqr, sqrt,
    inv, fill, ...).

    We are past canonicalization here, so no un-necessary fills are
    introduced; the rewrite is only applied when dropping the exponent
    cannot change the output's broadcast pattern.
    """
    if node.op != T.pow:
        return False
    xsym = node.inputs[0]
    ysym = node.inputs[1]
    y = local_mul_canonizer.get_constant(ysym)
    # Only rewrite when x's pattern encompasses y's, so removing ysym from
    # the graph cannot shrink the broadcasted output.
    if y is None or not encompasses_broadcastable(xsym.type.broadcastable,
                                                  ysym.type.broadcastable):
        return False
    if N.all(y == 2.0):
        return [T.sqr(xsym)]
    if N.all(y == 1.0):
        return [xsym]
    if N.all(y == 0.0):
        return [T.fill(xsym, 1.0)]
    if N.all(y == 0.5):
        return [T.sqrt(xsym)]
    if N.all(y == -0.5):
        return [T.inv(T.sqrt(xsym))]
    if N.all(y == -1.0):
        return [T.inv(xsym)]
    if N.all(y == -2.0):
        return [T.inv(T.sqr(xsym))]
    # Defect fixed: the original implicitly returned None when the exponent
    # matched none of the special cases; be explicit.
    return False
register_specialize(local_pow_specialize)
@gof.local_optimizer([T.mul])
def local_mul_specialize(node):
    """Specialization: strip constant factors of 1.0 and -1.0 from a ``mul``
    (folding the -1's into a single leading negation) and short-circuit a
    constant 0.0 factor to the zero input itself.

    We are past canonicalization here, so no un-necessary fills are
    introduced. Returns a one-element replacement list, or ``False``.
    """
    if node.op != T.mul:
        return False
    neg = False
    new_inputs = []
    for input in node.inputs:
        # get_constant may return None for non-constant inputs;
        # N.all(None == c) is falsy, so those fall to the else branch.
        y = local_mul_canonizer.get_constant(input)
        if N.all(y == 1.0):
            continue            # multiplying by 1 is a no-op: drop it
        elif N.all(y == -1.0):
            neg ^= True         # fold the sign out; track its parity
        elif N.all(y == 0.0):
            # anything times 0 is 0: return the zero input itself
            # NOTE(review): this keeps input's broadcast pattern, not the
            # output's — presumably safe at this stage; verify.
            return [input]
        else:
            new_inputs.append(input)
    if len(new_inputs) == len(node.inputs):
        return False            # nothing was removed
    if len(new_inputs) == 0:
        # Every input was a constant +/-1, so the product is +/-1 with the
        # accumulated sign parity.
        # BUG FIX: the original used the *last* constant seen
        # (-y.flatten()[0] if neg else y.flatten()[0]), which yields the
        # wrong sign when that constant is -1 (e.g. mul(-1.0) became +1.0).
        newval = -1.0 if neg else 1.0
        return [T.TensorConstant(T.Tensor(dtype=node.outputs[0].type.dtype,
            broadcastable=[True] * node.outputs[0].ndim), N.asarray(newval))]
    if len(new_inputs) == 1:
        return [-new_inputs[0]] if neg else new_inputs
    return [-T.mul(*new_inputs)] if neg else [T.mul(*new_inputs)]
register_specialize(local_mul_specialize)
# Disabled draft of a dedicated in-place transpose Op; superseded by the
# plan to generate C code for InplaceDimShuffle in general.
if 0: #TODO: replace this with a c version of any InplaceDimShuffle
    class _TransposeInplace(T.Op):
        # output 0 is a view of input 0 (transpose without a copy)
        view_map = {0: [0]}

        def make_node(self, input):
            # NOTE(review): reversed() returns an iterator, not a tuple —
            # presumably T.tensor accepts it; verify if this dead code is
            # ever revived.
            return T.Apply(self, [input],
                [T.tensor(dtype = input.type.dtype,
                broadcastable = reversed(input.type.broadcastable))])

        def perform(self, node, (x, ), (z, )):
            z[0] = x.T

        def c_code(self, node, name, (x, ), (z, ), sub):
            # C implementation: PyArray_Transpose returns a view; the old
            # output (if any) is released before being replaced.
            return """
            PyArrayObject* transposed = (PyArrayObject*)PyArray_Transpose(%(x)s, NULL);
            if (%(z)s) {
            Py_XDECREF(%(z)s);
            }
            %(z)s = transposed;
            """ % locals()

        def __str__(self):
            return "_TransposeInplace"

    _transpose_inplace = _TransposeInplace()

    @gof.local_optimizer([T.DimShuffle([False,False],[1,0],inplace=True)])
    def local_dimshuffle_transposeinplace(node):
        # replace a 2-d inplace (1,0)-DimShuffle with the dedicated Op
        if node.op == T.DimShuffle([False,False],[1,0],inplace=True):
            return [_transpose_inplace(node.inputs[0])]
        return False
    register_specialize(local_dimshuffle_transposeinplace)

# Register the Canonizer instance built above for mul/div canonicalization.
register_canonicalize(local_mul_canonizer, name = 'local_mul_canonizer')
......@@ -724,8 +838,10 @@ def constant_folding(node):
register_canonicalize(constant_folding)

# Canonicalize dot(x, y).T into dot(y.T, x.T): the transpose is pushed onto
# the operands instead of the product, exposing more gemm opportunities.
inplace_matrix_transpose = T.DimShuffle([False,False], [1,0], inplace=True)
local_transposed_dot = gof.PatternSub((inplace_matrix_transpose, (T.dot, 'x', 'y')),
        (T.dot, (inplace_matrix_transpose, 'y'), (inplace_matrix_transpose, 'x')))
register_canonicalize(local_transposed_dot, name='local_transposed_dot')
# def _math_optimizer():
......
......@@ -662,56 +662,6 @@ class T_max_and_argmax(unittest.TestCase):
self.failUnless(i.shape == (2,3))
class T_transpose(unittest.TestCase):
    """Checks that `transpose` lowers to `inplace.transpose_inplace`, that
    the compiled result has the transposed shape, and that (when going
    through `function`) the output does not alias the input buffer."""

    def _check_no_alias(self, arr, expected_shape, probe):
        """Compile transpose(arr), run it, then verify the output shape and
        that mutating the output leaves the input element at `probe`
        untouched (i.e. no aliasing through `function`)."""
        n = as_tensor(arr)
        t = transpose(n)
        self.failUnless(t.owner.op == inplace.transpose_inplace)
        f = function([n], t)
        out = f(n.data)
        self.failUnless(out.shape == expected_shape)
        # would show through in n.data if the buffers were aliased
        out += 55.0
        self.failUnless(n.data[probe] == 1.0)

    def test0(self):
        self._check_no_alias(numpy.ones(()), (), ())

    def test1(self):
        self._check_no_alias(numpy.ones(5), (5,), (0,))

    def test2(self):
        self._check_no_alias(numpy.ones((5, 3)), (3, 5), (0, 0))

    def test3(self):
        """Test transpose of tensor, inplace version"""
        n = as_tensor(numpy.ones((5, 3, 2)))
        t = inplace.transpose_inplace(n)
        self.failUnless(t.owner.op == inplace.transpose_inplace)
        f = function([n], t)
        out = f(n.data)
        self.failUnless(out.shape == (2, 3, 5))
        # the explicit inplace op IS allowed to alias, so the update shows up
        out += 55.0
        self.failUnless(n.data[0, 0, 0] == 56.0)

    def test_grad(self):
        verify_grad(self, inplace.transpose_inplace, [numpy.random.rand(2, 3)])
        verify_grad(self, inplace.transpose_inplace, [numpy.ones(3)])
class T_subtensor(unittest.TestCase):
def setUp(self):
Subtensor.debug = False
......@@ -1406,179 +1356,6 @@ class t_dot(unittest.TestCase):
#verify_grad(self, dot, [self.rand(), self.rand(2)])
#verify_grad(self, dot, [self.rand(), self.rand(2,5)])
class t_gemm(unittest.TestCase):
    """Compare the `gemm` Op (z <- b*z + a*dot(x, y)) against a numpy
    reference across linkers, coefficient values, argument ranks and
    transposed/strided inputs, and check its destroy map (only z may be
    overwritten)."""
    def setUp(self):
        # fixed seed so the random test matrices are reproducible
        numpy.random.seed(44)
        _approx_eq.debug = 0
        Gemm.debug = False

    @staticmethod
    def _gemm(z,a,x,y,b):
        # reference implementation: z <- b*z + a*dot(x, y), scalar a and b
        assert a.shape == ()
        assert b.shape == ()
        return b * z + a * numpy.dot(x,y)

    @staticmethod
    def rand(*args):
        return numpy.random.rand(*args)

    def cmp(self, z, a, x, y, b):
        # Run gemm through each linker and compare with the reference.
        def cmp_linker(z, a, x, y, b, l):
            z,a,x,y,b = [numpy.asarray(p) for p in z,a,x,y,b]
            z_orig = z.copy()
            tz,ta,tx,ty,tb = [as_tensor(p).type() for p in z,a,x,y,b]
            f = function([tz,ta,tx,ty,tb], gemm(tz,ta,tx,ty,tb), mode=compile.Mode(optimizer = None, linker = l))
            new_z = f(z,a,x,y,b)
            z_after = self._gemm(z_orig, a, x, y, b)
            # gemm is destructive: the returned array must be z itself
            self.failUnless(z is new_z)
            #print z_orig, z_after, z, type(z_orig), type(z_after), type(z)
            #_approx_eq.debug = 1
            self.failUnless(_approx_eq(z_after, z))
            if a == 0.0 and b == 1.0:
                # z <- 1*z + 0*dot(x,y) leaves z unchanged, so the in-place
                # mutation check below would spuriously fail: skip it
                return
            else:
                self.failIf(numpy.all(z_orig == z))
        cmp_linker(copy(z), a, x, y, b, 'c|py')
        cmp_linker(copy(z), a, x, y, b, 'c')
        cmp_linker(copy(z), a, x, y, b, 'py')

    def test0a(self):
        # rank-1 arguments must be rejected with E_rank
        Gemm.debug = True
        try:
            g = gemm([1.], 1., [1.], [1.], 1.)
        except ValueError, e:
            if e[0] is Gemm.E_rank:
                return
        self.fail()

    def test0(self):
        # rank-0 (scalar) arguments must be rejected with E_rank
        try:
            self.cmp(1., 0., 1.0, 1.0, 1.0)
        except ValueError, e:
            if e[0] is Gemm.E_rank:
                return
        self.fail()

    def test2(self):
        # mixed-rank arguments must be rejected with E_rank
        try:
            self.cmp(2., 1.0, [3,2,1.], [[1],[2],[3.]], 1.0)
        except ValueError, e:
            self.failUnless(e[0] == Gemm.E_rank)
            return
        self.fail()

    # tests 4-12: sweep (a, b) coefficient combinations on 3x5 * 5x4 products
    def test4(self):
        self.cmp(self.rand(3,4), 1.0, self.rand(3,5), self.rand(5,4), 0.0)
    def test5(self): self.cmp(self.rand(3,4), 1.0,
            self.rand(3,5), self.rand(5,4), 1.0)
    def test6(self): self.cmp(self.rand(3,4), 1.0,
            self.rand(3,5), self.rand(5,4), -1.0)
    def test7(self): self.cmp(self.rand(3,4), 0.0,
            self.rand(3,5), self.rand(5,4), 0.0)
    def test8(self): self.cmp(self.rand(3,4), 0.0,
            self.rand(3,5), self.rand(5,4), 0.6)
    def test9(self): self.cmp(self.rand(3,4), 0.0,
            self.rand(3,5), self.rand(5,4), -1.0)
    def test10(self):
        _approx_eq.debug = 1
        self.cmp(self.rand(3,4), -1.0, self.rand(3,5), self.rand(5,4), 0.0)
    def test11(self): self.cmp(self.rand(3,4), -1.0,
            self.rand(3,5), self.rand(5,4), 1.0)
    def test12(self): self.cmp(self.rand(3,4), -1.0,
            self.rand(3,5), self.rand(5,4), -1.0)

    def test_destroy_map0(self):
        """test that only first input can be overwritten"""
        Z = as_tensor(self.rand(2,2))
        try:
            gemm(Z, 1.0, Z, Z, 1.0)
        except ValueError, e:
            if e[0] == Gemm.E_z_uniq:
                return
        self.fail()

    def test_destroy_map1(self):
        """test that only first input can be overwritten"""
        Z = as_tensor(self.rand(2,2))
        A = as_tensor(self.rand(2,2))
        try:
            gemm(Z, 1.0, A, inplace.transpose_inplace(Z), 1.0)
        except ValueError, e:
            if e[0] == Gemm.E_z_uniq:
                return
        self.fail()

    def test_destroy_map2(self):
        """test that only first input can be overwritten"""
        Z = as_tensor(self.rand(2,2))
        A = as_tensor(self.rand(2,2))
        try:
            gemm(Z, 1.0, inplace.transpose_inplace(Z), A, 1.0)
        except ValueError, e:
            if e[0] == Gemm.E_z_uniq:
                return
        self.fail()

    def test_destroy_map3(self):
        """test that only first input can be overwritten"""
        Z = as_tensor(self.rand(2,2))
        A = as_tensor(self.rand(2,2))
        try:
            gemm(Z, 1.0, Z, A, 1.0)
        except ValueError, e:
            if e[0] == Gemm.E_z_uniq:
                return
        self.fail()

    def test_destroy_map4(self):
        """test that dot args can be aliased"""
        Z = value(self.rand(2,2))
        A = value(self.rand(2,2))
        eval_outputs([gemm(Z, 1.0, A, A, 1.0)])
        eval_outputs([gemm(Z, 1.0, A, A.T, 1.0)])

    def test_transposes(self):
        # three square matrices which are not contiguous
        A = self.rand(4,5)[:,:4]
        B = self.rand(4,5)[:,:4]
        C = self.rand(4,5)[:,:4]

        def t(z,x,y,a=1.0, b=0.0,l='c|py',dt='float64'):
            # run gemm on (possibly transposed/strided) views and compare
            # with the reference result computed up-front
            z,a,x,y,b = [numpy.asarray(p,dtype=dt) for p in z,a,x,y,b]
            z_orig = z.copy()
            z_after = self._gemm(z, a, x, y, b)
            tz,ta,tx,ty,tb = [value(p) for p in z,a,x,y,b]
            f = function([tz,ta,tx,ty,tb], gemm(tz,ta,tx,ty,tb), mode = compile.Mode(optimizer = None, linker=l))
            f(z, a, x, y, b)
            self.failUnless(_approx_eq(z_after, z), (z_orig, z_after, z, z_after - z))
            # the same product expressed through transposes must also work
            f(z.T, a, y.T, x.T, b)
            self.failUnless(_approx_eq(z_after, z))

        t(C,A,B)
        t(C.T, A, B)
        t(C, A.T, B, dt='float32')
        t(C, A, B.T)
        t(C.T, A.T, B)
        t(C, A.T, B.T, dt='float32')
        t(C.T, A, B.T)
        t(C.T, A.T, B.T, dt='float32')
        t(C, A[:,:2], B[:2, :])
        t(C.T, A[:,:2], B[:2, :], dt='float32')
        t(C, A[:2,:].T, B[:2, :])
        t(C.T, A[:2,:].T, B[:2, :], dt='float32')
        t(C, A[:2,:].T, B[:, :2].T)
        t(C.T, A[:2,:].T, B[:, :2].T)
        # a misaligned combination must be rejected
        try:
            t(C.T, A[:2,:], B[:, :2].T)
        except ValueError, e:
            if e[0].find('aligned') >= 0:
                return
        self.fail()
class T_tensorfromscalar(unittest.TestCase):
def test0(self):
s = scal.constant(56)
......
差异被折叠。
差异被折叠。
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论