Commit 13db39ca authored by bergstrj@iro.umontreal.ca

merged, changed gemm to use fortran blas instead of cblas

...@@ -120,6 +120,16 @@ class _test_Broadcast(unittest.TestCase): ...@@ -120,6 +120,16 @@ class _test_Broadcast(unittest.TestCase):
f(xv, yv) f(xv, yv)
assert (xv == yv).all() assert (xv == yv).all()
def test_weird_strides(self):
x = modes.build(Tensor('float64', [0, 0, 0, 0, 0], name = 'x'))
y = modes.build(Tensor('float64', [0, 0, 0, 0, 0], name = 'y'))
e = Broadcast(Add, (x, y)).out
f = gof.CLinker(env([x, y], [e])).make_function(inplace = False)
xv = numpy.random.rand(2, 2, 2, 2, 2)
yv = numpy.random.rand(2, 2, 2, 2, 2).transpose(4, 0, 3, 1, 2)
zv = xv + yv
assert (f(xv, yv) == zv).all()
class _test_CAReduce(unittest.TestCase): class _test_CAReduce(unittest.TestCase):
......
import unittest
import gof
from opt import *
import tensor
from tensor import Tensor
from gof import Env
from elemwise import DimShuffle
import numpy
import scalar_opt
def inputs(xbc = (0, 0), ybc = (0, 0), zbc = (0, 0)):
    """Build three float64 Tensor results named 'x', 'y' and 'z'.

    Each ``*bc`` argument is the broadcastable pattern handed to the
    corresponding Tensor.
    """
    specs = (('x', xbc), ('y', ybc), ('z', zbc))
    return tuple(Tensor(broadcastable = bc, dtype = 'float64', name = name)
                 for name, bc in specs)
# Convenience constructor: ds(x, new_order) builds a DimShuffle op and
# returns its output result.
ds = gof.op.constructor(DimShuffle)
class _test_inplace_opt(unittest.TestCase):
    """Checks that inplace_optimizer rewrites Broadcast ops into their
    inplace-annotated variants (shown as {0: 0} in the graph string)."""

    def test_straightforward(self):
        # A simple chain of adds: every Broadcast may become inplace.
        x, y, z = inputs()
        g = Env([x, y], [x + y + z])
        assert str(g) == "[Broadcast{Add}(Broadcast{Add}(x, y), z)]"
        inplace_optimizer.optimize(g)
        assert str(g) == "[Broadcast{Add}{0: 0}(Broadcast{Add}{0: 0}(x, y), z)]"

    def test_multiple_uses(self):
        # x and y feed two ops; only one of them may overwrite its input,
        # and either choice is acceptable.
        x, y, z = inputs()
        g = Env([x, y], [x + y, x * y])
        assert str(g) == "[Broadcast{Add}(x, y), Broadcast{Mul}(x, y)]"
        inplace_optimizer.optimize(g)
        acceptable = ("[Broadcast{Add}{0: 0}(x, y), Broadcast{Mul}(x, y)]",
                      "[Broadcast{Add}(x, y), Broadcast{Mul}{0: 0}(x, y)]")
        assert str(g) in acceptable

    def test_user_inplace(self):
        # An inplace op requested by the user must be left alone, and no
        # additional inplace annotation may be added on the same inputs.
        x, y, z = inputs()
        g = Env([x, y], [x + y, tensor.mul_inplace(x, y)])
        expected = "[Broadcast{Add}(x, y), Broadcast{Mul}{0: 0}(x, y)]"
        assert str(g) == expected
        inplace_optimizer.optimize(g)
        assert str(g) == expected
class _test_dimshuffle_lift(unittest.TestCase):
    """Tests for lift_dimshuffle: merging consecutive DimShuffles and
    lifting DimShuffles through Broadcast ops."""

    def test_double_transpose(self):
        # Transposing twice is the identity; both DimShuffles must vanish.
        x, y, z = inputs()
        e = ds(ds(x, (1, 0)), (1, 0))
        g = Env([x], [e])
        assert str(g) == "[DimShuffle{10}(DimShuffle{10}(x))]"
        lift_dimshuffle.optimize(g)
        assert str(g) == "[x]"

    def test_merge2(self):
        # Two stacked DimShuffles collapse into a single equivalent one.
        x, y, z = inputs()
        e = ds(ds(x, (1, 'x', 0)), (2, 0, 'x', 1))
        g = Env([x], [e])
        self.failUnless(str(g) == "[DimShuffle{20x1}(DimShuffle{1x0}(x))]", str(g))
        lift_dimshuffle.optimize(g)
        self.failUnless(str(g) == "[DimShuffle{01xx}(x)]", str(g))

    def test_elim3(self):
        # Three DimShuffles whose composition is the identity disappear.
        x, y, z = inputs()
        e = ds(ds(ds(x, (0, 'x', 1)), (2, 0, 'x', 1)), (1, 0))
        g = Env([x], [e])
        self.failUnless(str(g) == "[DimShuffle{10}(DimShuffle{20x1}(DimShuffle{0x1}(x)))]", str(g))
        lift_dimshuffle.optimize(g)
        self.failUnless(str(g) == "[x]", str(g))

    def test_lift(self):
        # Inputs of rank 1, 2 and 3: the implicit DimShuffles inserted by
        # broadcasting get lifted so they sit directly on the graph inputs.
        x, y, z = inputs([0]*1, [0]*2, [0]*3)
        e = x + y + z
        g = Env([x, y, z], [e])
        self.failUnless(str(g) == "[Broadcast{Add}(DimShuffle{x01}(Broadcast{Add}(DimShuffle{x0}(x), y)), z)]", str(g))
        lift_dimshuffle.optimize(g)
        self.failUnless(str(g) == "[Broadcast{Add}(Broadcast{Add}(DimShuffle{xx0}(x), DimShuffle{x01}(y)), z)]", str(g))
class _test_cliques(unittest.TestCase):
    """Tests for find_cliques: locating clusters of Broadcast ops."""

    def test_straightforward(self):
        # The dot product is not a Broadcast op, so it splits the graph
        # into two separate elementwise cliques.
        x, y, z = inputs()
        m = y * z
        d = tensor.dot(x, m)
        d.name = 'd'
        e = x + y + d
        g = Env([x, y, z], [e])
        cliques = find_cliques(g)
        assert len(cliques) == 2
        (i1, o1), (i2, o2) = cliques
        assert str(Env(i1, o1)) == "[Broadcast{Add}(Broadcast{Add}(x, y), d)]"
        assert str(Env(i2, o2)) == "[Broadcast{Mul}(y, z)]"
#         print g
#         for i, o in find_cliques(g):
#             print "-->", Env(i, [o])

    def test_broadcasting(self):
        # With lifted DimShuffles in the graph, partially-broadcastable
        # inputs split the clique unless through_broadcast is set.
        x, y, z = inputs([0]*1, [0]*2, [0]*3)
        e = x + y + z
        g = Env([x, y, z], [e])
        lift_dimshuffle.optimize(g)
        assert len(find_cliques(g, through_broadcast = True)) == 1
        assert len(find_cliques(g, through_broadcast = False)) == 2
#         print g
#         for i, o in find_cliques(g, True):
#             print "-->", Env(i, [o])
# class _test_clique_opt(unittest.TestCase):
# def test_straightforward(self):
# x, y, z = inputs()
# e = x ** 2.0 #x * x
# g = Env([x], [e])
# gof.ConstantFinder().optimize(g)
# opt = CliqueOptimizer(through_broadcast = False,
# scalar_optimizer = scalar_opt.opt2,
# make_composite = False)
# print g
# opt.optimize(g)
# print g
# def test_inplace(self):
# x, y, z = inputs()
# #e = tensor.add_inplace(x, y + z)
# e = x + tensor.add_inplace(y, z)
# g = Env([x, y, z], [e])
# opt = CliqueOptimizer(through_broadcast = False,
# scalar_optimizer = None,
# make_composite = True)
# print g
# opt.optimize(g)
# print g
# # print g.outputs[0].owner.c_code(['x', 'y', 'z'], ['e'], dict(fail = "FAIL;", id = 0))
# print gof.OpWiseCLinker(g).make_function()(numpy.ones((5, 5)), numpy.ones((5, 5)), numpy.ones((5, 5)))
# def test_straightforward(self):
# x, y, z = inputs()
# e = x + y + z
# g = Env([x, y, z], [e])
# opt = CliqueOptimizer(through_broadcast = False,
# scalar_optimizer = None,
# make_composite = True)
# print g
# opt.optimize(g)
# print g
# # print g.outputs[0].owner.c_code(['x', 'y', 'z'], ['e'], dict(fail = "FAIL;", id = 0))
# print gof.OpWiseCLinker(g).make_function()(numpy.ones((5, 5)), numpy.ones((5, 5)), numpy.ones((5, 5)))
# def test_straightforward2(self):
# x, y, z = inputs()
# m = y * z
# d = tensor.dot(x, m)
# d.name = 'd'
# e = x + y + d
# g = Env([x, y, z], [e])
# opt = CliqueOptimizer(through_broadcast = False,
# scalar_optimizer = None,
# make_composite = True)
# print g
# opt.optimize(g)
# print g
# # print g.outputs[0].owner.c_code(['x', 'y', 'z'], ['e'], dict(fail = "FAIL;", id = 0))
# print gof.OpWiseCLinker(g).make_function()(numpy.ones((5, 5)), numpy.ones((5, 5)), numpy.ones((5, 5)))
# Run this module's test cases when executed directly.
if __name__ == '__main__':
    unittest.main()
...@@ -27,6 +27,50 @@ class _test_ScalarOps(unittest.TestCase): ...@@ -27,6 +27,50 @@ class _test_ScalarOps(unittest.TestCase):
assert fn(1.0, 2.0) == 1.5 assert fn(1.0, 2.0) == 1.5
class _test_composite(unittest.TestCase):
def test_straightforward(self):
x, y, z = inputs()
e = mul(add(x, y), div(x, y))
C = composite([x, y], [e])
c = C(x, y)
# print c.c_code(['x', 'y'], ['z'], dict(id = 0))
c.perform()
assert c.outputs[0].data == 1.5
g = env([x, y], [c.out])
fn = gof.DualLinker(g).make_function()
assert fn(1.0, 2.0) == 1.5
def test_with_constants(self):
x, y, z = inputs()
e = mul(add(70.0, y), div(x, y))
C = composite([x, y], [e])
c = C(x, y)
assert "70.0" in c.c_code(['x', 'y'], ['z'], dict(id = 0))
# print c.c_code(['x', 'y'], ['z'], dict(id = 0))
c.perform()
assert c.outputs[0].data == 36.0
g = env([x, y], [c.out])
fn = gof.DualLinker(g).make_function()
assert fn(1.0, 2.0) == 36.0
def test_many_outputs(self):
x, y, z = inputs()
e0 = x + y + z
e1 = x + y * z
e2 = x / y
C = composite([x, y, z], [e0, e1, e2])
c = C(x, y, z)
# print c.c_code(['x', 'y', 'z'], ['out0', 'out1', 'out2'], dict(id = 0))
c.perform()
assert c.outputs[0].data == 6.0
assert c.outputs[1].data == 7.0
assert c.outputs[2].data == 0.5
g = env([x, y], c.outputs)
fn = gof.DualLinker(g).make_function()
assert fn(1.0, 2.0) == [6.0, 7.0, 0.5]
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
......
import unittest
from gof import Result, Op, Env, modes
import gof
from scalar import *
from scalar_opt import *
def inputs():
    """Return three fresh float64 Scalar results named 'x', 'y' and 'z'."""
    return tuple(Scalar('float64', name = name) for name in ('x', 'y', 'z'))
class _test_opts(unittest.TestCase):
    """Sanity checks for the scalar pattern optimizations."""

    def test_pow_to_sqr(self):
        # opt2 should rewrite Pow(x, 2.0) to Sqr(x) once the constant
        # 2.0 has been identified by ConstantFinder.
        x, y, z = inputs()
        g = Env([x], [x ** 2.0])
        assert str(g) == "[Pow(x, 2.0)]"
        gof.ConstantFinder().optimize(g)
        opt2.optimize(g)
        assert str(g) == "[Sqr(x)]"
# Run this module's test cases when executed directly.
if __name__ == '__main__':
    unittest.main()
...@@ -990,21 +990,21 @@ class t_gemm(unittest.TestCase): ...@@ -990,21 +990,21 @@ class t_gemm(unittest.TestCase):
def cmp(self, z, a, x, y, b): def cmp(self, z, a, x, y, b):
def cmp_linker(z, a, x, y, b, l): def cmp_linker(z, a, x, y, b, l):
z,a,x,y,b = [numpy.asarray(p) for p in z,a,x,y,b] z,a,x,y,b = [numpy.asarray(p) for p in z,a,x,y,b]
cz = z.copy() z_orig = z.copy()
tz,ta,tx,ty,tb = [astensor(p) for p in z,a,x,y,b] tz,ta,tx,ty,tb = [astensor(p) for p in z,a,x,y,b]
f = Function([tz,ta,tx,ty,tb], [gemm(tz,ta,tx,ty,tb)], linker_cls=l) f = Function([tz,ta,tx,ty,tb], [gemm(tz,ta,tx,ty,tb)], linker_cls=l)
new_z = f(z,a,x,y,b) new_z = f(z,a,x,y,b)
_z = self._gemm(cz, a, x, y, b) z_after = self._gemm(z_orig, a, x, y, b)
self.failUnless(z is new_z) self.failUnless(z is new_z)
#print cz, _z, z, type(cz), type(_z), type(z) #print z_orig, z_after, z, type(z_orig), type(z_after), type(z)
#_approx_eq.debug = 1 #_approx_eq.debug = 1
self.failUnless(_approx_eq(_z, z)) self.failUnless(_approx_eq(z_after, z))
if a == 0.0 and b == 1.0: if a == 0.0 and b == 1.0:
return return
else: else:
self.failIf(numpy.all(cz == z)) self.failIf(numpy.all(z_orig == z))
cmp_linker(copy(z), a, x, y, b, gof.cc.OpWiseCLinker) cmp_linker(copy(z), a, x, y, b, gof.cc.OpWiseCLinker)
#cmp_linker(copy(z), a, x, y, b, gof.cc.CLinker) #cmp_linker(copy(z), a, x, y, b, gof.cc.CLinker)
...@@ -1101,5 +1101,49 @@ class t_gemm(unittest.TestCase): ...@@ -1101,5 +1101,49 @@ class t_gemm(unittest.TestCase):
eval_outputs([gemm(Z, 1.0, A, A, 1.0)]) eval_outputs([gemm(Z, 1.0, A, A, 1.0)])
eval_outputs([gemm(Z, 1.0, A, A.T, 1.0)]) eval_outputs([gemm(Z, 1.0, A, A.T, 1.0)])
def test_transposes(self):
# three square matrices which are not contiguous
A = self.rand(4,5)[:,:4]
B = self.rand(4,5)[:,:4]
C = self.rand(4,5)[:,:4]
def t(z,x,y,a=1.0, b=0.0,l=gof.cc.OpWiseCLinker):
z,a,x,y,b = [numpy.asarray(p) for p in z,a,x,y,b]
z_orig = z.copy()
z_after = self._gemm(z, a, x, y, b)
tz,ta,tx,ty,tb = [astensor(p) for p in z,a,x,y,b]
f = Function([tz,ta,tx,ty,tb], [gemm(tz,ta,tx,ty,tb)], linker_cls=l)
f(z, a, x, y, b)
self.failUnless(_approx_eq(z_after, z), (z_orig, z_after, z))
f(z.T, a, y.T, x.T, b)
self.failUnless(_approx_eq(z_after, z))
t(C,A,B)
t(C.T, A, B)
t(C, A.T, B)
t(C, A, B.T)
t(C.T, A.T, B)
t(C, A.T, B.T)
t(C.T, A, B.T)
t(C.T, A.T, B.T)
t(C, A[:,:2], B[:2, :])
t(C.T, A[:,:2], B[:2, :])
t(C, A[:2,:].T, B[:2, :])
t(C.T, A[:2,:].T, B[:2, :])
t(C, A[:2,:].T, B[:, :2].T)
t(C.T, A[:2,:].T, B[:, :2].T)
try:
t(C.T, A[:2,:], B[:, :2].T)
except ValueError, e:
if e[0].find('aligned') >= 0:
return
self.fail()
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
The diff is collapsed.
...@@ -397,9 +397,6 @@ class CAReduce(Op): ...@@ -397,9 +397,6 @@ class CAReduce(Op):
if dimensions_to_reduce is None: if dimensions_to_reduce is None:
dimensions_to_reduce = range(len(inputs[0].broadcastable)) dimensions_to_reduce = range(len(inputs[0].broadcastable))
self.nin = 1
self.nout = 1
self.inputs = inputs self.inputs = inputs
self.outputs = [Tensor(dtype = inputs[0].dtype, self.outputs = [Tensor(dtype = inputs[0].dtype,
broadcastable = [x for i, x in enumerate(inputs[0].broadcastable) if i not in dimensions_to_reduce])] broadcastable = [x for i, x in enumerate(inputs[0].broadcastable) if i not in dimensions_to_reduce])]
......
...@@ -409,7 +409,7 @@ class CLinker(Linker): ...@@ -409,7 +409,7 @@ class CLinker(Linker):
elif result in self.orphans: elif result in self.orphans:
self.orphans.remove(result) self.orphans.remove(result)
continue continue
except AbstractFunctionError: except (AbstractFunctionError, NotImplementedError):
pass pass
# policy = [[what to declare in the struct, what to do at construction, what to do at destruction], # policy = [[what to declare in the struct, what to do at construction, what to do at destruction],
# [what to declare in each run, what to do at the beginning of each run, what to do at the end of each run]] # [what to declare in each run, what to do at the beginning of each run, what to do at the end of each run]]
......
...@@ -35,22 +35,23 @@ class Computed : """Memory has been allocated, contents are the owner's output." ...@@ -35,22 +35,23 @@ class Computed : """Memory has been allocated, contents are the owner's output."
############################ ############################
class Result(object): class Result(object):
"""Base class for storing L{Op} inputs and outputs """
Base class for storing L{Op} inputs and outputs
Attributes: Attributes:
_role - None or (owner, index) #or BrokenLink - _role - None or (owner, index) #or BrokenLink
_data - anything - _data - anything
state - one of (Empty, Allocated, Computed) - state - one of (Empty, Allocated, Computed)
name - string - name - string
Properties: Properties:
role - (rw) - role - (rw)
owner - (ro) - owner - (ro)
index - (ro) - index - (ro)
data - (rw) : calls data_filter when setting - data - (rw) : calls data_filter when setting
Abstract Methods: Abstract Methods:
data_filter - data_filter
""" """
__slots__ = ['_role', '_data', 'state', '_name', '_hash_id'] __slots__ = ['_role', '_data', 'state', '_name', '_hash_id']
...@@ -241,6 +242,13 @@ class Result(object): ...@@ -241,6 +242,13 @@ class Result(object):
def c_libraries(self): def c_libraries(self):
""" """
Return a list of libraries to link against to manipulate this L{Result}. Return a list of libraries to link against to manipulate this L{Result}.
For example: return ['gsl', 'gslcblas', 'm', 'fftw3', 'g2c'].
The compiler will search the directories specified by the environment
variable LD_LIBRARY_PATH. No option is provided for an Op to provide an
extra library directory because this would change the linking path for
other Ops in a potentially disasterous way.
""" """
raise AbstractFunctionError() raise AbstractFunctionError()
......
from gof import opt from gof import opt, Env
from elemwise import Broadcast import gof
from elemwise import Broadcast, DimShuffle
from gof.python25 import any, all
import scalar
class InplaceOptimizer(opt.OpSpecificOptimizer): class InplaceOptimizer(opt.OpSpecificOptimizer):
...@@ -26,30 +29,236 @@ class InplaceOptimizer(opt.OpSpecificOptimizer): ...@@ -26,30 +29,236 @@ class InplaceOptimizer(opt.OpSpecificOptimizer):
inplace_optimizer = InplaceOptimizer() inplace_optimizer = InplaceOptimizer()
# class ElemwisePatternOptimizer(opt.Optimizer):
# def __init__(self, scalar_opt): class DimShuffleLifter(opt.Optimizer):
# self. """
"Lifts" DimShuffle through Broadcast operations and merges
consecutive DimShuffles. Basically, applies the following
transformations on the whole graph:
DimShuffle(Broadcast(x, y)) => Broadcast(DimShuffle(x), DimShuffle(y))
DimShuffle(DimShuffle(x)) => DimShuffle(x)
# def find_elemwise_cliques(env, cross_broadcast = False): After this transform, clusters of Broadcast operations are
void of DimShuffle operations.
"""
def apply(self, env):
    """Walk the graph from the outputs, applying two rewrites:

    - DimShuffle(DimShuffle(x)) becomes a single DimShuffle with the
      composed order (or just x when the composition is the identity);
    - DimShuffle(Broadcast(...)) becomes Broadcast over DimShuffled
      inputs, i.e. the DimShuffle is lifted through the Broadcast.
    """
    seen = set()

    def lift(r):
        # Depth-first traversal over results; `seen` guards against
        # revisiting shared subgraphs.
        if r in seen:
            return
        seen.add(r)
        op = r.owner
        # Stop at graph leaves.  (Fixed: the original tested
        # `op in env.inputs` / `op in env.orphans()`, but those
        # collections hold results, not ops — cf. seek_from below.)
        if op is None or r in env.inputs or r in env.orphans():
            return
        if isinstance(op, DimShuffle):
            in_op = op.inputs[0].owner
            if isinstance(in_op, DimShuffle):
                # Compose the two shuffle orders into one.
                new_order = ['x' if x == 'x' else in_op.new_order[x]
                             for x in op.new_order]
                if new_order == range(len(new_order)):
                    # The composition is the identity: drop both shuffles.
                    repl = in_op.inputs[0]
                else:
                    repl = DimShuffle(in_op.inputs[0], new_order).out
                env.replace(r, repl)
                lift(repl)
                return
            elif isinstance(in_op, Broadcast):
                # Push the DimShuffle onto each input of the Broadcast.
                repl = Broadcast(in_op.scalar_opclass,
                                 [DimShuffle(input, op.new_order).out
                                  for input in in_op.inputs],
                                 in_op.inplace_pattern).out
                env.replace(r, repl)
                r = repl
                op = r.owner
        for next_r in op.inputs:
            lift(next_r)

    for output in env.outputs:
        lift(output)
# Shared DimShuffleLifter instance used by the test-suite.
lift_dimshuffle = DimShuffleLifter()
def find_cliques(env, through_broadcast = False):
    """Return a list of (inputs, outputs) pairs, one per maximal cluster
    ("clique") of single-output Broadcast ops found in env.

    When through_broadcast is False, a clique is additionally cut at any
    Broadcast whose inputs mix broadcastable and non-broadcastable
    entries in some dimension.
    """
    def seek_from(r):
        # Return the set of results feeding the clique rooted at r, or
        # None when r itself cannot be part of a clique.
        op = r.owner
        if r in env.inputs \
                or r in env.orphans() \
                or op is None \
                or not isinstance(op, Broadcast) \
                or len(op.outputs) > 1:
            # todo: handle multiple-output broadcast ops
            # (needs to update the clique's outputs)
            return None
        ret = set()
        if not through_broadcast:
            # Some dimension is broadcastable for only part of the
            # inputs: stop the clique here.
            if any(any(bc) and not all(bc)
                   for bc in zip(*[input.broadcastable for input in op.inputs])):
                ret.update(op.inputs)
                return ret
        for input in op.inputs:
            res = seek_from(input)
            if res is None:
                ret.add(input)
            else:
                ret.update(res)
        return ret

    cliques = []

    def find_cliques_helper(r):
        if r in env.inputs or r in env.orphans():
            return
        clique_inputs = seek_from(r)
        if clique_inputs is None:
            # r is not a clique output; keep searching below it.
            op = r.owner
            if op is not None:
                for input in op.inputs:
                    find_cliques_helper(input)
        else:
            cliques.append((clique_inputs, [r]))
            for input in clique_inputs:
                find_cliques_helper(input)

    for output in env.outputs:
        find_cliques_helper(output)
    # todo: merge the cliques if possible
    return cliques
class CliqueOptimizer(opt.Optimizer):
    """Optimizes cliques of Broadcast ops through their scalar equivalents.

    Each clique found by find_cliques is mirrored by a graph of scalar
    ops; that scalar graph can be optimized with scalar_optimizer (with
    changes synchronized back to the tensor env) and/or collapsed into a
    single Broadcast over a scalar.composite op when make_composite is
    True.
    """
    def __init__(self, through_broadcast = False, scalar_optimizer = None, make_composite = False):
        self.through_broadcast = through_broadcast
        self.scalar_optimizer = scalar_optimizer
        self.make_composite = make_composite

    def apply(self, env):
        if self.scalar_optimizer is None and not self.make_composite:
            # there's nothing to do with the cliques...
            return
        cliques = find_cliques(env, self.through_broadcast)
        opt = self.scalar_optimizer
        def build_scalar_clique(r, env, equiv):
            # Recursively mirror the tensor result r as a scalar result,
            # filling `equiv` with tensor->scalar correspondences.
            if r in equiv:
                return equiv[r]
            op = r.owner
            if r in env.inputs or r in env.orphans():
                s = scalar.Scalar(dtype = r.dtype)
                _r = r
                # A DimShuffle made only of 'x' entries just wraps a
                # 0-d value in broadcastable dimensions; look through it.
                if isinstance(r.owner, DimShuffle) and all(x == 'x' for x in r.owner.new_order):
                    _r = r.owner.inputs[0]
                if (getattr(r, 'constant', False) or getattr(_r, 'constant', False)) \
                       and _r.broadcastable == ():
                    s.data = _r.data
                    s.constant = True
                equiv[r] = s
                return s
            s_op = op.scalar_opclass(*[build_scalar_clique(input, env, equiv) for input in op.inputs])
            equiv[op] = s_op
            for output, s_output in zip(op.outputs, s_op.outputs):
                equiv[output] = s_output
            return equiv[r]
        for c_in, c_out in cliques:
            equiv = dict()
            g = Env(c_in, c_out)
            for output in c_out:
                build_scalar_clique(output, g, equiv)
            s_g = Env([equiv[r] for r in g.inputs],
                      [equiv[r] for r in g.outputs])
            if opt is not None:
                # Optimize the scalar graph; the Synchronize feature
                # mirrors every rewrite back onto the tensor env.
                equiv2 = dict()
                for k, v in equiv.items():
                    equiv2[v] = k
                def transform(op, equiv):
                    return Broadcast(op.__class__, [equiv[input] for input in op.inputs])
                s_g.add_feature(sync_to(env, equiv2, transform))
                opt.optimize(s_g)
            if self.make_composite:
                def follow_inplace(r):
                    # Trace which clique input (if any) the computation
                    # of r ultimately overwrites, following destroy_map
                    # chains through the Broadcast ops.
                    op = r.owner
                    if op is None or r in g.inputs or r in g.orphans():
                        return None
                    assert isinstance(op, Broadcast)
                    destroyed = op.destroy_map().get(r, None)
                    if destroyed is None:
                        return None
                    else:
                        r2 = destroyed[0]
                        ret = follow_inplace(r2)
                        if ret is None:
                            return r2
                        else:
                            return ret
                inplace_pattern = {}
                for i, output in enumerate(g.outputs):
                    destroyed = follow_inplace(output)
                    if destroyed is not None and destroyed in g.inputs:
                        inplace_pattern[i] = g.inputs.index(destroyed)
                # Collapse the whole clique into a single Broadcast over
                # a composite scalar op, preserving inplace behavior.
                C = scalar.composite(s_g.inputs, s_g.outputs)
                ec = Broadcast(C, g.inputs, inplace_pattern = inplace_pattern)
                env.replace_all(dict((o, eco) for o, eco in zip(c_out, ec.outputs)))
def sync_to(target, equiv, transform):
    """Build a Listener/Constraint class that mirrors changes made to a
    source env onto `target`.

    `equiv` maps source-side ops/results to their target-side
    counterparts; `transform(op, equiv)` builds the target-side
    counterpart of a newly imported source op.
    """
    class Synchronize(gof.Listener, gof.Constraint):
        def __init__(self, source):
            self.source = source
            self.target = target
            self.equiv = equiv
            self.transform = transform
            # Replacement pairs that could not be mirrored on `target`.
            self.inconsistencies = []
        def on_import(self, op1):
            if op1 not in self.equiv:
                op2 = self.transform(op1, self.equiv)
                self.equiv[op1] = op2
                for o1, o2 in zip(op1.outputs, op2.outputs):
                    self.equiv[o1] = o2
        def on_prune(self, op1):
            if op1 in self.equiv:
                op2 = self.equiv[op1]
                del self.equiv[op1]
                for o1, o2 in zip(op1.outputs, op2.outputs):
                    del self.equiv[o1]
        def on_rewire(self, clients1, r1, new_r1):
            if (new_r1, r1) in self.inconsistencies:
                # A previously failed replacement is being undone; just
                # forget the recorded inconsistency.
                self.inconsistencies.remove((new_r1, r1))
                return
            if not self.source.clients(r1):
                try:
                    target.replace(self.equiv[r1], self.equiv[new_r1])
                except:
                    # NOTE(review): bare except — any failure is kept as
                    # a pending inconsistency and reported by validate().
                    self.inconsistencies.append((r1, new_r1))
        def validate(self):
            if self.inconsistencies:
                raise InconsistencyError("Could not synchronize when replacing the following pairs: %s" % self.inconsistencies)
            return True
    return Synchronize
# def on_prune(self, op1):
# if op1 in equiv:
# del equiv[op1]
......
...@@ -5,7 +5,8 @@ import math ...@@ -5,7 +5,8 @@ import math
from copy import copy from copy import copy
import inspect import inspect
from gof import Result, GuardedOp, utils import gof
from gof import Result, GuardedOp, Env, utils
def as_scalar(x, name = None): def as_scalar(x, name = None):
...@@ -20,6 +21,11 @@ def as_scalar(x, name = None): ...@@ -20,6 +21,11 @@ def as_scalar(x, name = None):
if isinstance(x, Scalar): if isinstance(x, Scalar):
return x return x
def constant(x):
res = as_scalar(x)
res.constant = True
return res
class Scalar(Result): class Scalar(Result):
...@@ -29,6 +35,8 @@ class Scalar(Result): ...@@ -29,6 +35,8 @@ class Scalar(Result):
self.dtype_specs() self.dtype_specs()
def __get_constant(self): def __get_constant(self):
if not hasattr(self, '_constant'):
return False
return self._constant return self._constant
def __set_constant(self, value): def __set_constant(self, value):
...@@ -37,7 +45,10 @@ class Scalar(Result): ...@@ -37,7 +45,10 @@ class Scalar(Result):
self._constant = value self._constant = value
constant = property(__get_constant, __set_constant) constant = property(__get_constant, __set_constant)
def desc(self):
return (self.dtype, self.data)
def filter(self, data): def filter(self, data):
py_type = self.dtype_specs()[0] py_type = self.dtype_specs()[0]
return py_type(data) return py_type(data)
...@@ -58,6 +69,11 @@ class Scalar(Result): ...@@ -58,6 +69,11 @@ class Scalar(Result):
except KeyError: except KeyError:
raise TypeError("Unsupported dtype for %s: %s" % (self.__class__.__name__, self.dtype)) raise TypeError("Unsupported dtype for %s: %s" % (self.__class__.__name__, self.dtype))
def c_literal(self):
if 'complex' in self.dtype:
raise NotImplementedError("No literal for complex values.")
return str(self.data)
def c_declare(self, name, sub): def c_declare(self, name, sub):
return """ return """
%(dtype)s %(name)s; %(dtype)s %(name)s;
...@@ -184,7 +200,7 @@ class ScalarMixedOp(GuardedOp): ...@@ -184,7 +200,7 @@ class ScalarMixedOp(GuardedOp):
inputs = [as_scalar(input) for input in inputs] inputs = [as_scalar(input) for input in inputs]
i_dtypes = [getattr(input, 'dtype', None) for input in inputs] i_dtypes = [getattr(input, 'dtype', None) for input in inputs]
o_dtypes = utils.from_return_values(self.propagate_dtypes(*i_dtypes)) o_dtypes = self.propagate_dtypes(*i_dtypes)
self.inputs = inputs self.inputs = inputs
self.outputs = [Scalar(dtype) for dtype in o_dtypes] self.outputs = [Scalar(dtype) for dtype in o_dtypes]
...@@ -217,7 +233,7 @@ class PureScalarOp(ScalarMixedOp): ...@@ -217,7 +233,7 @@ class PureScalarOp(ScalarMixedOp):
for dtype in i_dtypes: for dtype in i_dtypes:
if dtype is None: if dtype is None:
raise TypeError("Expected a Scalar.") raise TypeError("Expected a Scalar.")
return self.cast_method(*i_dtypes) return [self.cast_method(*i_dtypes)] * self.nout
class UnaryScalarOp(PureScalarOp): class UnaryScalarOp(PureScalarOp):
...@@ -383,4 +399,111 @@ modes.make_constructors(globals()) ...@@ -383,4 +399,111 @@ modes.make_constructors(globals())
def composite(inputs, outputs):
    """
    Usage: composite(inputs, outputs)

    Produces an Op class which represents the computations
    between the provided inputs and outputs as a single
    operation.

    The operations between inputs and outputs (as given by
    Env(inputs, outputs).ops()) must all be instances of
    PureScalarOp.

    Examples:
      x, y = Scalar(), Scalar()
      SquareDiff = composite([x, y], [(x - y)**2])
      TimesTen = composite([x], [x * 10.0])
      Neighbors = composite([x], [x - 1, x + 1])
    """
    env = Env(inputs, outputs).clone()
    gof.opt.ConstantFinder().apply(env)
    inputs, outputs = env.inputs, env.outputs
    for op in env.ops():
        if not isinstance(op, PureScalarOp):
            raise ValueError("The input env to composite must be exclusively composed of PureScalarOp instances.")
    # Map every input/output to a %(iN)s / %(oN)s placeholder so the
    # generated C code can later be re-instantiated with caller names.
    subd = dict(zip(inputs,
                    ["%%(i%i)s"%i for i in range(len(inputs))]) +
                zip(outputs,
                    ["%%(o%i)s"%i for i in range(len(outputs))]))
    for orphan in env.orphans():
        if orphan.constant:
            # Constants are inlined as C literals.
            subd[orphan] = orphan.c_literal()
        else:
            raise ValueError("All orphans in the input env to composite must be constant.")
    # Concatenate the C code of every op in topological order, declaring
    # a temporary variable for each intermediate result.
    _c_code = "{\n"
    i = 0
    j = 0
    for op in env.toposort():
        j += 1
        for output in op.outputs:
            if output not in subd:
                i += 1
                name = "V%%(id)s_tmp%i" % i
                subd[output] = name
                # the c code is not robust to any other dtypes than those of the specified inputs
                # a solution would be to require Composite.c_code to fill in the dtypes using
                # a proper upcast
                _c_code += "%s %s;\n" % (output.dtype_specs()[1], name)
        _c_code += op.c_code([subd[input] for input in op.inputs],
                             [subd[output] for output in op.outputs],
                             dict(fail = "%(fail)s",
                                  id = "%%(id)s_%i" % j))
        _c_code += "\n"
    _c_code += "}\n"

    def compose_impl(r):
        # Build a python callable computing r from a list of input values.
        # this is not optimal at all eg in add(*1 -> mul(x, y), *1)
        # it will calculate *1 twice
        # it also doesn't follow env.toposort but that's (presumably)
        # still correct since we only have pure scalar ops
        if r in env.inputs:
            idx = env.inputs.index(r)
            return lambda inputs: inputs[idx]
        elif r in env.orphans():
            return lambda inputs: r.data
        op = r.owner
        producers = [compose_impl(input) for input in op.inputs]
        return lambda inputs: op.impl(*[p(inputs) for p in producers])

    _impls = [compose_impl(r) for r in env.outputs]

    class Composite(PureScalarOp):
        nin = len(inputs)
        nout = len(outputs)
        # todo: propagate_dtypes?

        def perform(self):
            # Evaluate each output's python implementation on the
            # current input data.
            inputs = [input.data for input in self.inputs]
            for output, impl in zip(self.outputs, _impls):
                output.data = impl(inputs)

        def impl(self, *inputs):
            for r, input in zip(self.inputs, inputs):
                r.data = input
            self.perform()
            return utils.to_return_values([output.data for output in self.outputs])

        def grad(self, inputs, output_grads):
            raise NotImplementedError("grad is not implemented for Composite")

        def c_code(self, inames, onames, sub):
            # Substitute the caller-supplied names into the C template
            # assembled above.
            d = dict(zip(["i%i"%i for i in range(len(inames))],
                         inames) +
                     zip(["o%i"%i for i in range(len(onames))],
                         onames),
                     **sub)
            return _c_code % d

    return Composite
from scalar import *
from gof import PatternOptimizer
# 2.0 as a constant Scalar, usable inside optimization patterns.
c2 = constant(2.0)
# x * x  =>  Sqr(x)
opt1 = PatternOptimizer((Mul, 'x', 'x'), (Sqr, 'x'))
# x ** 2.0  =>  Sqr(x)
opt2 = PatternOptimizer((Pow, 'x', c2), (Sqr, 'x'))
The diff is collapsed.
Markdown is supported
0%
You are about to add 0 people to the discussion. Please proceed with caution.
Finish editing this message first!
Register or sign in to comment