injected new elemwise in tensor.py

Parent commit: 761c7f9f
......@@ -22,19 +22,37 @@ def env(inputs, outputs, validate = True, features = []):
class _test_DimShuffle(unittest.TestCase):
def test_straightforward(self):
x, y, z = inputs()
e0 = DimShuffle(x, [1, 'x', 0]).out
f = gof.PerformLinker(env([x], [e0])).make_function(inplace=True)
assert f(numpy.ones((2, 3))).shape == (3, 1, 2)
def with_linker(self, linker):
for xsh, shuffle, zsh in [((2, 3), (1, 'x', 0), (3, 1, 2)),
((1, 2, 3), (1, 2), (2, 3)),
((1, 2, 1, 3), (1, 3), (2, 3)),
((2, 3, 4), (2, 1, 0), (4, 3, 2)),
((2, 3, 4), ('x', 2, 1, 0, 'x'), (1, 4, 3, 2, 1)),
((1, 4, 3, 2, 1), (3, 2, 1), (2, 3, 4)),
((1, 1, 4), (1, 2), (1, 4))]:
x = modes.build(Tensor('float64', [1 * (entry == 1) for entry in xsh], name = 'x'))
e = DimShuffle(x, shuffle).out
# print shuffle, e.owner.grad(e.owner.inputs, e.owner.outputs).owner.new_order
f = linker(env([x], [e])).make_function(inplace=False)
assert f(numpy.ones(xsh)).shape == zsh
def test_perform(self):
self.with_linker(gof.PerformLinker)
# def test_straightforward(self):
# x, y, z = inputs()
# e0 = DimShuffle(x, [1, 'x', 0]).out
# f = gof.PerformLinker(env([x], [e0])).make_function(inplace=True)
# assert f(numpy.ones((2, 3))).shape == (3, 1, 2)
class _test_Broadcast(unittest.TestCase):
def with_linker(self, linker):
for xsh, ysh in [((5, 5), (5, 5)),
((5, 5), (1, 5)),
((5, 5), (5, 1)),
for xsh, ysh in [((3, 5), (3, 5)),
((3, 5), (1, 5)),
((3, 5), (3, 1)),
((1, 5), (5, 1)),
((1, 1), (1, 1)),
((2, 3, 4, 5), (2, 3, 4, 5)),
......@@ -52,7 +70,11 @@ class _test_Broadcast(unittest.TestCase):
xv = numpy.asarray(numpy.random.rand(*xsh))
yv = numpy.asarray(numpy.random.rand(*ysh))
zv = xv + yv
# print "AAAAAAAAAAAAAAAAAA"
# print f(xv, yv)
# print zv
# print "BBBBBBBBBBBBBBBBBB"
self.failUnless((f(xv, yv) == zv).all())
def with_linker_inplace(self, linker):
......@@ -105,7 +127,9 @@ class _test_CAReduce(unittest.TestCase):
for xsh, tosum in [((5, 6), (0, 1)),
((5, 6), (0, )),
((5, 6), (1, )),
((2, 3, 4, 5), (0, 1, 3))]:
((5, 6), ()),
((2, 3, 4, 5), (0, 1, 3)),
((), ())]:
x = modes.build(Tensor('float64', [1 * (entry == 1) for entry in xsh], name = 'x'))
e = CAReduce(Add, [x], dimensions_to_reduce = tosum).out
f = linker(env([x], [e])).make_function(inplace = False)
......@@ -113,7 +137,13 @@ class _test_CAReduce(unittest.TestCase):
zv = xv
for axis in reversed(sorted(tosum)):
zv = numpy.add.reduce(zv, axis)
self.failUnless((f(xv) - zv < 1e-10).all())
# print "AAAAAAAAAAAAAAAAAA"
# print xsh, tosum
# print f(xv)
# print zv
# print f(xv) - zv
# print "BBBBBBBBBBBBBBBBBB"
self.failUnless((numpy.abs(f(xv) - zv) < 1e-10).all())
def test_perform(self):
self.with_linker(gof.PerformLinker)
......@@ -123,27 +153,27 @@ class _test_CAReduce(unittest.TestCase):
if __name__ == '__main__':
# unittest.main()
x = modes.build(Tensor('float64', [0, 0], name = 'x'))
y = modes.build(Tensor('float64', [0, 0], name = 'y'))
e = Broadcast(SquareDiff, (x, y), {0:0}).out
f = gof.CLinker(env([x, y], [e])).make_function(inplace = False)
xv = numpy.random.rand(1000, 1000)
yv = numpy.random.rand(1000, 1000)
zv = numpy.random.rand(1000, 1000)
add = numpy.frompyfunc(lambda x, y: x + y, 2, 1)
t0 = time.time()
for i in xrange(100):
xv -= yv
xv *= xv
# xv += yv
print time.time() - t0
t0 = time.time()
for i in xrange(100):
f(xv, yv)
print time.time() - t0
unittest.main()
# x = modes.build(Tensor('float64', [0, 0], name = 'x'))
# y = modes.build(Tensor('float64', [0, 0], name = 'y'))
# e = Broadcast(SquareDiff, (x, y), {0:0}).out
# f = gof.CLinker(env([x, y], [e])).make_function(inplace = False)
# xv = numpy.random.rand(1000, 1000)
# yv = numpy.random.rand(1000, 1000)
# zv = numpy.random.rand(1000, 1000)
# add = numpy.frompyfunc(lambda x, y: x + y, 2, 1)
# t0 = time.time()
# for i in xrange(100):
# xv -= yv
# xv *= xv
# # xv += yv
# print time.time() - t0
# t0 = time.time()
# for i in xrange(100):
# f(xv, yv)
# print time.time() - t0
......
......@@ -7,7 +7,9 @@ from compile import Function, eval_outputs
import gradient
import gof, gof.graph
from gof.python25 import any
import gof
from elemwise2 import DimShuffle
def _numpy_checker(x, y):
"""
......@@ -58,6 +60,15 @@ def verify_grad(testcase, op_cls, pt, n_tests=1, rng=numpy.random, eps=0.0000001
if not isinstance(analytic_grad, (list, tuple)):
analytic_grad = [analytic_grad]
# if num_grad.max_err(analytic_grad) > 1.0e-4:
# print "aaaaaaaaaa"
# print gof.Env(tensor_pt, [cost])
# print gof.Env(tensor_pt, symbolic_grad)
# print analytic_grad
# print num_grad.gf
# print num_grad.max_err(analytic_grad)
# print "bbbbbbbbbb"
if num_grad.max_err(analytic_grad) > 1.0e-4:
raise Exception(verify_grad.E_grad)
verify_grad.E_grad = 'gradient error exceeded tolerance'
......@@ -361,6 +372,15 @@ class T_add(unittest.TestCase):
f = Function([a,b], [fn(a, b)], linker_cls = gof.CLinker)
self.failUnless(numpy.all(fn(a.data, b.data) == f(a.data, b.data)))
def test_grad_scalar_l(self):
verify_grad(self, Add, [numpy.asarray([3.0]), numpy.random.rand(3)])
def test_grad_scalar_r(self):
verify_grad(self, Add, [numpy.random.rand(3), numpy.asarray([3.0])])
def test_grad_row(self):
verify_grad(self, Add, [numpy.random.rand(3, 5), numpy.random.rand(1, 5)])
def test_grad_col(self):
verify_grad(self, Add, [numpy.random.rand(3, 5), numpy.random.rand(3, 1)])
class T_abs(unittest.TestCase):
def test_impl(self):
......@@ -381,8 +401,8 @@ class T_abs(unittest.TestCase):
class AbsBadGrad(tensor._Elemwise):
def impl(self, x):
return numpy.abs(x)
def grad(self, x, gz):
return scale(gz * sgn(x),0.9)
def grad(self, (x, ), (gz, )):
return mul(gz * sgn(x),0.9),
def c_foreach(self, (x_i, ), (z_i, )):
return "z_i = abs(x_i);"
......@@ -401,7 +421,7 @@ class T_fill(unittest.TestCase):
o = t.owner
self.failUnless(o.inputs[0].broadcastable == (0,))
# self.failUnless(o.inputs[0].dtype[0:3] == 'int')
self.failUnless(o.inputs[1].broadcastable == ())
self.failUnless(o.inputs[1].broadcastable == (1,))
# self.failUnless(o.inputs[1].dtype[0:3] == 'flo')
self.failUnless(o.outputs[0].broadcastable == (0,))
# self.failUnless(o.outputs[0].dtype[0:3] == 'flo')
......@@ -432,47 +452,70 @@ class T_mul(unittest.TestCase):
def test_elemwise(self):
a = astensor(0.0)
b = astensor(0.0)
check_eq2_both(self, [a,b], mul_elemwise(a,b), [3.0, 4.0], 12.0)
check_eq2_both(self, [a,b], mul_elemwise(b,a), [-1.0,2.0], -2.0)
self.failUnless(isinstance(mul(a,b).owner, Scale))
check_eq2_both(self, [a,b], mul(a,b), [3.0, 4.0], 12.0)
check_eq2_both(self, [a,b], mul(b,a), [-1.0,2.0], -2.0)
#self.failUnless(isinstance(mul(a,b).owner, Scale))
a = astensor(numpy.ones(2))
b = astensor(numpy.ones(2))
aa = numpy.asarray([-0.5, 4.0])
bb = numpy.asarray([-0.5, 2.0])
check_eq2_both(self, [a,b], mul_elemwise(a,b), [aa,bb], numpy.asarray([0.25, 8.0]))
check_eq2_both(self, [a,b], mul_elemwise(a,b), [bb,aa], numpy.asarray([0.25, 8.0]))
self.failUnless(isinstance(mul(a,b).owner, MulElemwise))
check_eq2_both(self, [a,b], mul(a,b), [aa,bb], numpy.asarray([0.25, 8.0]))
check_eq2_both(self, [a,b], mul(a,b), [bb,aa], numpy.asarray([0.25, 8.0]))
#self.failUnless(isinstance(mul(a,b).owner, MulElemwise))
def test_scalar(self):
r = numpy.random.rand(2,3)
a = astensor(r)
b = astensor(2.0)
check_eq2_both(self, [a,b], scale(a,b), [r, 2.0], r*2.0)
check_eq2_both(self, [a,b], scale(a,b), [r, 4.0], r*4.0)
check_eq2_both(self, [a,b], mul(a,b), [r, 2.0], r*2.0)
check_eq2_both(self, [a,b], mul(a,b), [r, 4.0], r*4.0)
self.failUnless(b.data == 2.0)
def test_operator(self):
a = astensor([1,1])
aa = astensor([1,1])
b = astensor(4)
self.failUnless(isinstance((a*b).owner, Scale))
self.failUnless(isinstance((b*a).owner, Scale))
self.failUnless(isinstance((a*aa).owner, MulElemwise))
self.failUnless(isinstance((aa*a).owner, MulElemwise))
def test_rowcol(self):
r1 = numpy.random.rand(3,5)
r2 = numpy.random.rand(1,5)
r3 = numpy.random.rand(3,1)
a1, a2, a3 = astensor(r1), astensor(r2), astensor(r3)
check_eq2_both(self, [a1,a2], mul(a1,a2), [r1, r2], r1*r2)
check_eq2_both(self, [a1,a3], mul(a1,a3), [r1, r3], r1*r3)
def test_grad_elemwise(self):
verify_grad(self, Mul, [numpy.random.rand(3,4), numpy.random.rand(3,4)])
def test_grad_scalar_l(self):
verify_grad(self, Mul, [numpy.asarray([3.0]), numpy.random.rand(3)])
def test_grad_scalar_r(self):
verify_grad(self, Mul, [numpy.random.rand(3), numpy.asarray([3.0])])
def test_grad_row(self):
verify_grad(self, Mul, [numpy.random.rand(3, 5), numpy.random.rand(1, 5)])
def test_grad_row2(self):
op = lambda x, y: Mul(x, DimShuffle(y, ['x', 0]).out)
verify_grad(self, op, [numpy.random.rand(3, 5), numpy.random.rand(5)])
def test_grad_col(self):
verify_grad(self, Mul, [numpy.random.rand(3, 5), numpy.random.rand(3, 1)])
# def test_operator(self):
# a = astensor([1,1])
# aa = astensor([1,1])
# b = astensor(4)
# self.failUnless(isinstance((a*b).owner, Scale))
# self.failUnless(isinstance((b*a).owner, Scale))
# self.failUnless(isinstance((a*aa).owner, MulElemwise))
# self.failUnless(isinstance((aa*a).owner, MulElemwise))
def test_wrong_shapes(self):
a = astensor(numpy.ones(3))
b = astensor(numpy.ones(4))
try:
check_eq2(self, [a,b], MulElemwise(a,b).out,
check_eq2(self, [a,b], Mul(a,b).out,
[numpy.ones(3), numpy.ones(4)], 1.0)
self.fail()
except ValueError, e:
self.failUnless(e[0] is tensor._assert_same_shapes.E_shape)
self.failUnless('shape mismatch' in str(e))
try:
check_eq2_c(self, [a,b], MulElemwise(a,b).out,
check_eq2_c(self, [a,b], Mul(a,b).out,
[numpy.ones(3), numpy.ones(4)], 1.0)
self.fail()
except ValueError, e:
......@@ -482,14 +525,14 @@ class T_div(unittest.TestCase):
def setUp(self):
numpy.random.seed(9999)
def test_grad_e(self):
verify_grad(self, DivElemwise, [numpy.ones(()), numpy.ones(())])
verify_grad(self, DivElemwise, [numpy.random.rand(3), numpy.ones(3)])
verify_grad(self, DivElemwise, [numpy.random.rand(3,5), numpy.random.rand(3,5)+0.1])
verify_grad(self, Div, [numpy.random.rand(3), numpy.ones(3)])
verify_grad(self, Div, [numpy.random.rand(3,5), numpy.random.rand(3,5)+0.1])
verify_grad(self, Div, [numpy.ones(()), numpy.ones(())])
def test_grad_sl(self):
verify_grad(self, DivElemwise, [numpy.ones(()), numpy.ones(())])
verify_grad(self, DivElemwise, [numpy.random.rand(3), numpy.ones(3)])
verify_grad(self, DivElemwise, [numpy.random.rand(3,5), numpy.random.rand(3,5)+0.1])
verify_grad(self, Div, [numpy.ones((3, 5)), numpy.ones((1, 1))])
verify_grad(self, Div, [numpy.random.rand(3), numpy.ones((1, ))])
verify_grad(self, Div, [numpy.random.rand(3,5), numpy.random.rand(1,1)])
class T_log2(unittest.TestCase):
def test0(self):
......@@ -509,12 +552,16 @@ class T_pow(unittest.TestCase):
def setUp(self):
numpy.random.seed(9999)
def test_elemwise(self):
verify_grad(self, DivElemwise, [numpy.random.rand(3,4), numpy.random.rand(3,4)+0.1])
verify_grad(self, PowElemwise, [numpy.random.rand(3,4), numpy.random.rand(3,4)])
verify_grad(self, Div, [numpy.random.rand(3,4), numpy.random.rand(3,4)+0.1])
verify_grad(self, Pow, [numpy.random.rand(3,4), numpy.random.rand(3,4)])
def test_scalar_l(self):
verify_grad(self, PowScalarL, [numpy.random.rand(3), numpy.asarray(3.0)])
verify_grad(self, Pow, [numpy.asarray([3.0]), numpy.random.rand(3)])
def test_scalar_r(self):
verify_grad(self, PowScalarR, [numpy.random.rand(3), numpy.asarray(3.0)])
verify_grad(self, Pow, [numpy.random.rand(3), numpy.asarray([3.0])])
def test_row(self):
verify_grad(self, Pow, [numpy.random.rand(3, 5), numpy.random.rand(1, 5)])
def test_col(self):
verify_grad(self, Pow, [numpy.random.rand(3, 5), numpy.random.rand(3, 1)])
class _testCase_matinv(unittest.TestCase):
......
......@@ -94,7 +94,7 @@ class BaseTensor(ResultBase):
'complex128': (complex, 'theano_complex128', 'NPY_COMPLEX128'),
'complex64': (complex, 'theano_complex64', 'NPY_COMPLEX64')}[self.dtype]
except KeyError:
raise TypeError("Unsupported dtype for BaseTensor: %s" % self.dtype)
raise TypeError("Unsupported dtype for %s: %s" % (self.__class__.__name__, self.dtype))
#
# Hash for constant folding
......
......@@ -3,12 +3,16 @@ import elemwise_cgen as cgen
import numpy
from gof import Op, Viewer, Destroyer
from tensor import Tensor
from base_tensor import BaseTensor as Tensor
from scalar import upcast, Scalar
import scalar_ops
import gof
def astensor(data):
    """Identity-style conversion hook: accept only objects that are already
    Tensor instances and return them unchanged.

    NOTE(review): unlike the richer astensor in tensor.py, this one wraps
    nothing; tensor.py later patches it over via ``s2t.astensor = astensor``
    -- confirm the patching happens before any op construction.
    """
    assert isinstance(data, Tensor)
    return data
##################
### DimShuffle ###
......@@ -18,6 +22,8 @@ class DimShuffle(Op, Viewer):
def __init__(self, input, new_order, inplace = True):
input = astensor(input)
ib = input.broadcastable
ob = []
for value in new_order:
......@@ -35,13 +41,23 @@ class DimShuffle(Op, Viewer):
self.outputs = output,
self.inplace = inplace
self.numorder = [x for x in new_order if type(x) == int]
self.is_transposition = sorted(new_order) == range(len(ib))
self.dup_dims = len(set(self.numorder)) != len(self.numorder)
self.all_dims = len(set(self.numorder)) == len(ib)
if self.dup_dims or not self.all_dims:
raise NotImplementedError("You must provide a permutation of *all* the input dimensions with *no duplicates*.")
self.drop = []
self.augment = []
i2j = {}
j = 0
for i, b in enumerate(ib):
if i not in new_order:
if b == 1:
self.drop.append(i)
else:
raise NotImplementedError("You cannot drop a non-broadcastable dimension.")
else:
i2j[i] = j
j += 1
self.shuffle = [i2j[x] for x in new_order if x != 'x']
self.augment = [i for i, x in enumerate(new_order) if x == 'x']
def clone_with_new_inputs(self, *new_inputs):
return DimShuffle(new_inputs[0], self.new_order, self.inplace)
......@@ -53,19 +69,31 @@ class DimShuffle(Op, Viewer):
return {}
def perform(self):
res = self.inputs[0].data.transpose(self.numorder)
res = self.inputs[0].data
shape = list(res.shape)
new_shape = []
for entry in self.new_order:
if entry == 'x':
new_shape.append(1)
else:
new_shape.append(shape.pop(0))
res = res.reshape(new_shape)
for drop in reversed(self.drop):
shape.pop(drop)
res = res.reshape(shape)
res = res.transpose(self.shuffle)
shape = list(res.shape)
for augm in self.augment:
shape.insert(augm, 1)
res = res.reshape(shape)
if not self.inplace:
res = numpy.copy(res)
self.outputs[0].data = res
def grad(self, (x, ), (gz, )):
    # The gradient of a dimshuffle is the inverse dimshuffle applied to gz:
    # each surviving input dimension is mapped back to its original
    # position; positions the forward op could not have filled stay 'x'
    # (broadcastable) in the reverse pattern.
    grad_order = ['x'] * len(self.inputs[0].broadcastable)
    # NOTE(review): the loop variable shadows the parameter `x`; harmless
    # because the input is not used after this point, but worth renaming.
    for i, x in enumerate(self.new_order):
        if x != 'x':
            grad_order[x] = i
    return DimShuffle(gz, grad_order).out,
def __str__(self):
    # e.g. "DimShuffle(x, [1, 'x', 0])"; using self.__class__.__name__ keeps
    # subclasses (e.g. Transpose) printing under their own name.
    return "%s(%s, %s)" % (self.__class__.__name__, str(self.inputs[0]), self.new_order)
......@@ -90,6 +118,9 @@ class Transpose(DimShuffle):
class Broadcast(Op, Destroyer):
def __init__(self, scalar_opclass, inputs, inplace_pattern = {}):
inputs = map(astensor, inputs)
try:
assert len(set([len(input.broadcastable) for input in inputs])) == 1
except (AssertionError, AttributeError):
......@@ -141,15 +172,29 @@ class Broadcast(Op, Destroyer):
if r in scalar_ograds:
return ograds[scalar_ograds.index(r)]
op = r.owner
if op is None:
b = [1] * len(inputs[0].broadcastable)
res = astensor(numpy.asarray(r.data).reshape(b),
broadcastable = b)
return res
op_class = op.__class__
bcasted = Broadcast(op_class, [transform(input) for input in op.inputs], {})
bcasted = Broadcast(op_class, [transform(input) for input in op.inputs], {}).out
return bcasted
ret = []
for scalar_igrad, input in zip(scalar_igrads, inputs):
r = transform(scalar_igrad)
to_sum = [i for i, bcast in enumerate(input.broadcastable) if bcast]
if to_sum:
shuffle = []
j = 0
for bcast in input.broadcastable:
if bcast == 1:
shuffle.append('x')
else:
shuffle.append(j)
j += 1
sr = Sum(r, axis = to_sum).out
sr = DimShuffle(sr, shuffle).out
ret.append(sr)
else:
ret.append(r)
......@@ -269,16 +314,19 @@ def make_broadcast(scalar_opclass, inplace_pattern = {}, name = None):
New.__name__ = "Tensor" + scalar_opclass.__name__
return New
def broadcast(op):
def wrap_broadcast(op):
def instantiate(*inputs):
inputs = map(astensor, inputs)
target_length = max([len(input.broadcastable) for input in inputs])
args = []
for input in inputs:
difference = target_length - len(input.broadcastable)
length = len(input.broadcastable)
difference = target_length - length
if not difference:
args.append(input)
else:
args.append(DimShuffle(input, ['x']*difference + range(length)))
args.append(DimShuffle(input, ['x']*difference + range(length)).out)
return op(*args)
return instantiate
......@@ -319,6 +367,8 @@ class CAReduce(Op):
"""
def __init__(self, scalar_opclass, inputs, dimensions_to_reduce = None):
inputs = map(astensor, inputs)
if scalar_opclass.nin != 2 or scalar_opclass.nout != 1:
raise NotImplementedError("CAReduce only supports binary functions with a single output.")
if len(inputs) != 1:
......@@ -346,9 +396,13 @@ class CAReduce(Op):
def perform(self):
result = self.inputs[0].data
for dimension in reversed(sorted(self.dimensions_to_reduce)):
result = self.ufunc.reduce(result, dimension)
self.outputs[0].data = result
to_reduce = reversed(sorted(self.dimensions_to_reduce))
if to_reduce:
for dimension in to_reduce:
result = self.ufunc.reduce(result, dimension)
self.outputs[0].data = result
else:
self.outputs[0].data = numpy.copy(result)
def _c_all(self, inames, onames, sub):
......@@ -363,6 +417,9 @@ class CAReduce(Op):
tosum = self.dimensions_to_reduce
if tosum == ():
return Broadcast(scalar_ops.Identity, (input, ))._c_all(inames, onames, sub)
order1 = [i for i in xrange(len(input.broadcastable)) if i not in tosum]
order = order1 + list(tosum)
......@@ -459,7 +516,19 @@ def make_reduce(scalar_opclass, name = None):
New.__name__ = "Reduce" + scalar_opclass.__name__
return New
Sum = make_reduce(scalar_ops.Add, name = 'Sum')
class Sum(make_reduce(scalar_ops.Add)):
    # Reduction by addition: subclasses the generated reduce-Add op solely
    # to supply a gradient.
    def grad(self, (x, ), (gz, )):
        # Reducing over no dimensions is the identity, so the gradient
        # passes straight through.
        if self.dimensions_to_reduce == ():
            return gz,
        # Build a shuffle pattern that re-inserts every reduced dimension
        # as a broadcastable 'x' axis, so gz can broadcast back to x's rank.
        new_dims = []
        i = 0
        for j, _ in enumerate(x.broadcastable):
            if j in self.dimensions_to_reduce:
                new_dims.append('x')
            else:
                new_dims.append(i)
                i += 1
        # Broadcasting Second(x, gz') returns gz' expanded to x's shape:
        # every input element receives the gradient of its reduction cell.
        return Broadcast(scalar_ops.Second, (x, DimShuffle(gz, new_dims).out)).out,
def reduce(op):
......
......@@ -832,8 +832,14 @@ class DualLinker(Linker):
op_order_1 = env1.toposort()
op_order_2 = [equiv[op.outputs[0]].owner for op in op_order_1] # we need to have the exact same order so we can compare each step
def c_make_thunk(op):
try:
return CLinker(op).make_thunk(True)[0]
except AbstractFunctionError:
return op.perform
thunks1 = [op.perform for op in op_order_1]
thunks2 = [CLinker(op).make_thunk(True)[0] for op in op_order_2]
thunks2 = [c_make_thunk(op) for op in op_order_2]
def f():
for input1, input2 in zip(env1.inputs, env2.inputs):
......
......@@ -76,14 +76,17 @@ def grad_sources_inputs(sources, graph_inputs):
#if all output gradients are None, continue
if all(map(lambda x:x is None, g_outputs)): continue
output_arg = _unpack_result(g_outputs)
input_arg = _unpack_result(op.inputs)
# output_arg = _unpack_result(g_outputs)
# input_arg = _unpack_result(op.inputs)
output_arg = g_outputs
input_arg = op.inputs
op_grad = op.grad(input_arg, output_arg)
if op_grad is None:
raise ValueError(_msg_retNone, op.__class__)
if isinstance(op_grad, float):
raise TypeError('wtf!!!!!!!!', op)
g_inputs = _pack_result(op_grad)
g_inputs = op_grad #_pack_result(op_grad)
assert isinstance(g_inputs, (list, tuple))
if len(g_inputs) != len(op.inputs):
raise ValueError(_msg_badlen,
......@@ -123,6 +126,10 @@ class numeric_grad:
"""
gf = [numpy.ndarray(x.shape) for x in pt]
f_pt = f(*pt)
if isinstance(f, (list, tuple)):
f_pt = [numpy.copy(x) for x in f_pt]
else:
f_pt = numpy.copy(f_pt)
for idx in xrange(len(gf)):
if len(pt[idx].shape) == 0:
......
......@@ -12,6 +12,10 @@ def as_scalar(x, name = None):
s = Scalar('float64', name = name)
s.data = x
return s
if isinstance(x, int):
s = Scalar('int32', name = name)
s.data = x
return s
if isinstance(x, Scalar):
return x
......@@ -45,7 +49,8 @@ class Scalar(ResultBase):
# and self.data == other.data
def dtype_specs(self):
return {'float64': (float, 'double', 'PyFloat_Check', 'PyFloat_AsDouble', 'PyFloat_FromDouble')}[self.dtype]
return {'float64': (float, 'npy_float64', 'PyFloat_Check', 'PyFloat_AsDouble', 'PyFloat_FromDouble'),
'int32': (int, 'npy_int32', 'PyInt_Check', 'PyInt_AsLong', 'PyInt_FromLong')}[self.dtype]
def c_declare(self, name, sub):
return """
......
......@@ -18,7 +18,7 @@ class Sub(BinaryScalarOp):
def c_code(self, (x, y), (z, ), sub):
return "%(z)s = %(x)s - %(y)s;" % locals()
def grad(self, (x, y), (gz, )):
return gz, -gz
return gz, neg(gz)
class Mul(BinaryScalarOp):
def impl(self, x, y):
......@@ -34,62 +34,119 @@ class Div(BinaryScalarOp):
def c_code(self, (x, y), (z, ), sub):
return "%(z)s = %(x)s / %(y)s;" % locals()
def grad(self, (x, y), (gz, )):
return div(gz, y), -div(mul(x, gz), y*y)
return div(gz, y), neg(div(mul(x, gz), mul(y, y)))
class Pow(BinaryScalarOp):
    # Elementwise power: z = x ** y.
    def impl(self, x, y):
        return x ** y
    def c_code(self, (x, y), (z, ), sub):
        return "%(z)s = pow(%(x)s, %(y)s);" % locals()
    def grad(self, (x, y), (gz, )):
        # d/dx x**y = y * x**(y-1);  d/dy x**y = log(x) * x**y.
        return mul(gz, mul(y, pow(x, sub(y, as_scalar(1))))), mul(gz, mul(log(x), pow(x, y)))
class First(BinaryScalarOp):
    # Returns its first argument unchanged; the second is ignored.
    def impl(self, x, y):
        return x
    def c_code(self, (x, y), (z, ), sub):
        return "%(z)s = %(x)s;" % locals()
    def grad(self, (x, y), (gz, )):
        # y never influences the output, so it gets no gradient (None).
        return gz, None
class Second(BinaryScalarOp):
    # Returns its second argument unchanged; the first is ignored.
    # (Used e.g. by Sum.grad to broadcast a value against a model tensor.)
    def impl(self, x, y):
        return y
    def c_code(self, (x, y), (z, ), sub):
        return "%(z)s = %(y)s;" % locals()
    def grad(self, (x, y), (gz, )):
        # x never influences the output, so it gets no gradient (None).
        return None, gz
class SquareDiff(BinaryScalarOp):
def impl(self, x, y):
diff = (x - y)
return diff * diff
def c_code(self, (x, y), (z, ), sub):
return "%(z)s = %(x)s - %(y)s; %(z)s *= %(z)s;" % locals()
# class SquareDiff(BinaryScalarOp):
# def impl(self, x, y):
# diff = (x - y)
# return diff * diff
# def c_code(self, (x, y), (z, ), sub):
# return "%(z)s = %(x)s - %(y)s; %(z)s *= %(z)s;" % locals()
class Identity(UnaryScalarOp):
def impl(self, x):
return x
def c_code(self, (x, ), (z, ), sub):
return "%(z)s = %(x)s;" % locals()
def grad(self, (x, y), (gz, )):
return gz,
class Neg(UnaryScalarOp):
def impl(self, x):
return -x
def grad(self, (x, ), (gz, )):
return -gz
return neg(gz),
def c_code(self, (x, ), (z, ), sub):
return "%(z)s = -%(x)s;" % locals()
class Abs(UnaryScalarOp):
    # Elementwise absolute value.
    def impl(self, x):
        return numpy.abs(x)
    def grad(self, (x, ), (gz, )):
        # d|x|/dx = sgn(x) (undefined at x == 0; sgn yields NaN there).
        return mul(gz, sgn(x)),
    def c_code(self, (x, ), (z, ), sub):
        # NOTE(review): C abs() is the integer version; fabs() is likely
        # intended for floating-point dtypes -- confirm.
        return "%(z)s = abs(%(x)s);" % locals()
class Sgn(UnaryScalarOp):
    # Elementwise sign, computed as |x| / x (NaN at x == 0).
    def impl(self, x):
        return numpy.abs(x) / x
    def grad(self, (x, ), (gz, )):
        # Piecewise constant, so the gradient is zero almost everywhere;
        # None signals "nothing to propagate" to the caller.
        return None,
    def c_code(self, (x, ), (z, ), sub):
        return "%(z)s = %(x)s/abs(%(x)s);" % locals() # TODO: C use copysign
class Inv(UnaryScalarOp):
def impl(self, x):
return 1 / x
def grad(self, (x, ), (gz, )):
return -gz / (x*x)
return div(neg(gz), mul(x, x)),
def c_code(self, (x, ), (z, ), sub):
return "%(z)s = 1 / %(x)s;" % locals()
class Log(UnaryScalarOp):
    # Natural logarithm.
    def impl(self, x):
        return math.log(x)
    def grad(self, (x, ), (gz, )):
        # d(ln x)/dx = 1/x.
        return div(gz, x),
    def c_code(self, (x, ), (z, ), sub):
        return "%(z)s = log(%(x)s);" % locals()
class Log2(UnaryScalarOp):
    # Base-2 logarithm.
    def impl(self, x):
        return numpy.log2(x)
    def grad(self, (x, ), (gz, )):
        # d(log2 x)/dx = 1 / (x * ln 2).
        return div(gz, mul(x, as_scalar(math.log(2.0)))),
    def c_code(self, (x, ), (z, ), sub):
        return "%(z)s = log2(%(x)s);" % locals()
class Exp(UnaryScalarOp):
    # Exponential.
    def impl(self, x):
        return math.exp(x)
    def grad(self, (x, ), (gz, )):
        # d(e**x)/dx = e**x.
        return mul(gz, exp(x)),
    def c_code(self, (x, ), (z, ), sub):
        return "%(z)s = exp(%(x)s);" % locals()
class Sqr(UnaryScalarOp):
    # Elementwise square.
    def impl(self, x):
        return x*x
    def grad(self, (x, ), (gz, )):
        # d(x^2)/dx = 2x.
        return mul(gz, mul(x, as_scalar(2))),
    def c_code(self, (x, ), (z, ), sub):
        return "%(z)s = %(x)s * %(x)s;" % locals()
class Sqrt(UnaryScalarOp):
    # Elementwise square root.
    def impl(self, x):
        return math.sqrt(x)
    def grad(self, (x, ), (gz, )):
        # d(sqrt x)/dx = 0.5 / sqrt(x).
        return div(mul(gz, as_scalar(0.5)), sqrt(x)),
    def c_code(self, (x, ), (z, ), sub):
        return "%(z)s = sqrt(%(x)s);" % locals()
# class Sigmoid(UnaryComposite):
# def expand_impl(self, x):
......
......@@ -12,6 +12,9 @@ from base_tensor import BaseTensor, BaseTensorOp
from elemwise import Elemwise
import blas # for gemm, dot
import elemwise2 as s2t
import scalar_ops as scal
class Tensor(BaseTensor):
"""
......@@ -65,7 +68,9 @@ class Tensor(BaseTensor):
#SLICING
def __getitem__(self, item): return subtensor(self, item)
def __getslice__(self, *args): return subtensor(self, slice(*args))
s2t.Tensor = Tensor
# alternate Tensor constructor
def astensor(data, broadcastable=None, role=None, name=None):
"""Return a Tensor containing given data"""
......@@ -79,6 +84,7 @@ def astensor(data, broadcastable=None, role=None, name=None):
rval = Tensor(data.dtype, broadcastable, role, name)
rval.data = data # will raise if broadcastable was mis-specified
return rval
s2t.astensor = astensor
############################
......@@ -229,15 +235,23 @@ class TensorScalarOp(_Elemwise):
# Unary Operations
##########################
class Abs(_Elemwise):
def impl(self, x):
return numpy.abs(x)
def grad(self, x, gz):
return gz * Sgn(x).out #TODO: handle the corner case (get it? pun?) (there's a special place in hell for people like you)
def c_foreach(self, (x_i, ), (z_i, )):
return "%(z)s_i = abs(%(x)s_i);"
# class Abs(_Elemwise):
# def impl(self, x):
# return numpy.abs(x)
# def grad(self, x, gz):
# return gz * Sgn(x).out #TODO: handle the corner case (get it? pun?) (there's a special place in hell for people like you)
# def c_foreach(self, (x_i, ), (z_i, )):
# return "%(z)s_i = abs(%(x)s_i);"
# #Constructor not necessary because builtin abs() does this
Abs = s2t.make_broadcast(scal.Abs)
AbsInplace = s2t.make_broadcast(scal.Abs, {0:0})
#Constructor not necessary because builtin abs() does this
abs_inplace = gof.op.constructor(s2t.wrap_broadcast(AbsInplace))
class Argmax(Op):
nin=2 # tensor, axis
nout=2 # max val, max idx
......@@ -269,91 +283,152 @@ def max(x, axis=None):
# but when Argmax.c_impl() is in place, it should be fine.
return argmax(x,axis)[0]
class Exp(_Elemwise):
def impl(self, x): return numpy.exp(x)
def grad(self, x, gz): return gz * exp(x)
def c_foreach(self, (x_i, ), (z_i, )): return "z_i = exp(x_i);"
exp = gof.op.constructor(Exp)
# class Exp(_Elemwise):
# def impl(self, x): return numpy.exp(x)
# def grad(self, x, gz): return gz * exp(x)
# def c_foreach(self, (x_i, ), (z_i, )): return "z_i = exp(x_i);"
# exp = gof.op.constructor(Exp)
Exp = s2t.make_broadcast(scal.Exp)
ExpInplace = s2t.make_broadcast(scal.Exp, {0:0})
exp = gof.op.constructor(s2t.wrap_broadcast(Exp))
exp_inplace = gof.op.constructor(s2t.wrap_broadcast(ExpInplace))
# class Neg(_Elemwise):
# def impl(self, x):
# return -x
# def grad(self, x, gz):
# return -gz
# def c_foreach(self, (x_i, ), (z_i, )):
# return "%(z)s_i = -%(x)s_i;"
# #Constructor not necessary because unary '-' does this
Neg = s2t.make_broadcast(scal.Neg)
NegInplace = s2t.make_broadcast(scal.Neg, {0:0})
neg = gof.op.constructor(s2t.wrap_broadcast(Neg))
neg_inplace = gof.op.constructor(s2t.wrap_broadcast(NegInplace))
# class Log(_Elemwise):
# def impl(self, x): return numpy.log(x)
# def grad(self, x, gz): return gz / x
# def c_foreach(self, (x_i, ), (z_i, )): return "z_i = log(x_i);"
# log = gof.op.constructor(Log)
Log = s2t.make_broadcast(scal.Log)
LogInplace = s2t.make_broadcast(scal.Log, {0:0})
log = gof.op.constructor(s2t.wrap_broadcast(Log))
log_inplace = gof.op.constructor(s2t.wrap_broadcast(LogInplace))
# class Log2(_Elemwise):
# def impl(self, x): return numpy.log2(x)
# def grad(self, x, gz): return gz / (x * numpy.log(2.0))
# def c_foreach(self, (x_i, ), (z_i, )): return "%(z)s_i = log2(%(x)s_i);"
# log2 = gof.op.constructor(Log2)
Log2 = s2t.make_broadcast(scal.Log2)
Log2Inplace = s2t.make_broadcast(scal.Log2, {0:0})
log2 = gof.op.constructor(s2t.wrap_broadcast(Log2))
log2_inplace = gof.op.constructor(s2t.wrap_broadcast(Log2Inplace))
# class Sgn(_Elemwise):
# def impl(self, x):
# return numpy.abs(x) / x
# def grad(self, x, gz):
# return [None]
# def c_foreach(self, (x_i, ), (z_i, )):
# return "%(z)s_i = %(x)s_i/abs(%(x)s_i);" # TODO: C use copysign
# sgn = gof.op.constructor(Sgn)
Sgn = s2t.make_broadcast(scal.Sgn)
SgnInplace = s2t.make_broadcast(scal.Sgn, {0:0})
sgn = gof.op.constructor(s2t.wrap_broadcast(Sgn))
sgn_inplace = gof.op.constructor(s2t.wrap_broadcast(SgnInplace))
# class Sqr(_Elemwise):
# def impl(self, x): return x * x
# def grad(self, x, gz): return 2.0 * x * gz
# def c_foreach(self, (x_i, ), (z_i, )): return "%(z)s_i = %(x)s_i * %(x)s_i;"
# sqr = gof.op.constructor(Sqr)
Sqr = s2t.make_broadcast(scal.Sqr)
SqrInplace = s2t.make_broadcast(scal.Sqr, {0:0})
sqr = gof.op.constructor(s2t.wrap_broadcast(Sqr))
sqr_inplace = gof.op.constructor(s2t.wrap_broadcast(SqrInplace))
# class Sqrt(_Elemwise):
# def impl(self, x): return numpy.sqrt(x)
# def grad(self, x, gz): return 0.5 * gz / sqrt(x)
# def c_foreach(self, (x_i, ), (z_i, )): return "%(z)s_i = sqrt(%(x)s_i);"
# sqrt = gof.op.constructor(Sqrt)
Sqrt = s2t.make_broadcast(scal.Sqrt)
SqrtInplace = s2t.make_broadcast(scal.Sqrt, {0:0})
sqrt = gof.op.constructor(s2t.wrap_broadcast(Sqrt))
sqrt_inplace = gof.op.constructor(s2t.wrap_broadcast(SqrtInplace))
# class Sum(_Elemwise):
# def impl(self, x):
# return numpy.sum(x)
# def grad(self, (x, ), (gz, )):
# return fill(x, gz),
# def propagate_broadcastable(self, *inputs):
# return [()]
# def c_init(self, (x, ), (sum, )):
# return "dtype_%(sum)s* %(sum)sp = ((dtype_%(sum)s*)PyArray_DATA(%(sum)s)); %(sum)sp[0] = 0;"
# def c_foreach(self, (x_i, ), (sum, )):
# return "%(sum)sp[0] += %(x)s_i;"
# sum0 = gof.op.constructor(Sum)
Sum = s2t.Sum
sum = gof.op.constructor(Sum)
# class Fill(_Elemwise):
# def impl(self, model, value):
# return (model * 0) + value #TODO: we can probably do better than this
# def grad(self, (model, value), (gz, )):
# return None, sum(gz)
# def c_init(self, (model, value), (z, )):
# return "dtype_%(value)s %(value)s0 = ((dtype_%(value)s*)PyArray_DATA(%(value)s))[0];"
# def c_foreach(self, (model_i, value), (z_i, )):
# return "%(z)s_i = %(value)s0;"
# fill = gof.op.constructor(Fill)
def broadcast_package(scalar_opclass, name, inplace_versions = True):
    """Build the standard bundle of broadcast ops for a scalar op class.

    Returns (OpClass, constructor), or when inplace_versions is true,
    (OpClass, constructor, InplaceOpClass, inplace_constructor).
    """
    C = s2t.make_broadcast(scalar_opclass, name = name)
    c = gof.op.constructor(s2t.wrap_broadcast(C))
    if inplace_versions:
        # BUG FIX: the inplace class was built without the {0:0} inplace
        # pattern, so it behaved exactly like the out-of-place version
        # (compare the hand-written variants, e.g.
        # ``s2t.make_broadcast(scal.Exp, {0:0})``). Pass the pattern so
        # output 0 actually destroys input 0.
        CInplace = s2t.make_broadcast(scalar_opclass, {0:0}, name = name+"Inplace")
        c_inplace = gof.op.constructor(s2t.wrap_broadcast(CInplace))
        return C, c, CInplace, c_inplace
    else:
        return C, c
class Neg(_Elemwise):
def impl(self, x):
return -x
def grad(self, x, gz):
return -gz
def c_foreach(self, (x_i, ), (z_i, )):
return "%(z)s_i = -%(x)s_i;"
#Constructor not necessary because unary '-' does this
class Log(_Elemwise):
def impl(self, x): return numpy.log(x)
def grad(self, x, gz): return gz / x
def c_foreach(self, (x_i, ), (z_i, )): return "z_i = log(x_i);"
log = gof.op.constructor(Log)
class Log2(_Elemwise):
def impl(self, x): return numpy.log2(x)
def grad(self, x, gz): return gz / (x * numpy.log(2.0))
def c_foreach(self, (x_i, ), (z_i, )): return "%(z)s_i = log2(%(x)s_i);"
log2 = gof.op.constructor(Log2)
class Sgn(_Elemwise):
def impl(self, x):
return numpy.abs(x) / x
def grad(self, x, gz):
return [None]
def c_foreach(self, (x_i, ), (z_i, )):
return "%(z)s_i = %(x)s_i/abs(%(x)s_i);" # TODO: C use copysign
sgn = gof.op.constructor(Sgn)
class Sqr(_Elemwise):
def impl(self, x): return x * x
def grad(self, x, gz): return 2.0 * x * gz
def c_foreach(self, (x_i, ), (z_i, )): return "%(z)s_i = %(x)s_i * %(x)s_i;"
sqr = gof.op.constructor(Sqr)
class Sqrt(_Elemwise):
def impl(self, x): return numpy.sqrt(x)
def grad(self, x, gz): return 0.5 * gz / sqrt(x)
def c_foreach(self, (x_i, ), (z_i, )): return "%(z)s_i = sqrt(%(x)s_i);"
sqrt = gof.op.constructor(Sqrt)
class Sum(_Elemwise):
def impl(self, x):
return numpy.sum(x)
def grad(self, x, gz):
return fill(x, gz)
def propagate_broadcastable(self, *inputs):
return [()]
def c_init(self, (x, ), (sum, )):
return "dtype_%(sum)s* %(sum)sp = ((dtype_%(sum)s*)PyArray_DATA(%(sum)s)); %(sum)sp[0] = 0;"
def c_foreach(self, (x_i, ), (sum, )):
return "%(sum)sp[0] += %(x)s_i;"
sum = gof.op.constructor(Sum)
# Fill = s2t.make_broadcast(scal.Second)
# FillInplace = s2t.make_broadcast(scal.Second, {0:0})
# fill = gof.op.constructor(s2t.wrap_broadcast(Fill))
# fill_inplace = gof.op.constructor(s2t.wrap_broadcast(FillInplace))
Fill, fill, FillInplace, fill_inplace = broadcast_package(scal.Second, 'Fill')
class Fill(_Elemwise):
    """Return a tensor shaped like `model`, with every element set to the
    scalar `value`.

    Note: this definition shadows the broadcast_package-generated Fill above.
    """
    def impl(self, model, value):
        return (model * 0) + value #TODO: we can probably do better than this
    def grad(self, (model, value), gz):
        # No gradient w.r.t. the model (only its shape is used);
        # the scalar value collects the sum of the output gradient.
        return None, sum(gz)
    def c_init(self, (model, value), (z, )):
        # Read the scalar fill value once, before the element loop.
        return "dtype_%(value)s %(value)s0 = ((dtype_%(value)s*)PyArray_DATA(%(value)s))[0];"
    def c_foreach(self, (model_i, value), (z_i, )):
        return "%(z)s_i = %(value)s0;"
fill = gof.op.constructor(Fill)
def ones_like(model):
    """Return a tensor of ones with the same shape as `model`."""
    return fill(model, 1.0)
def zeros_like(model):
    """Return a tensor of zeros with the same shape as `model`."""
    return fill(model, 0.0)
class TensorCopy(_Elemwise):
    """Elementwise identity: z is a fresh copy of x.

    Note: shadowed by the s2t.make_broadcast(scal.Identity) version below.
    """
    def impl(self, x):
        # numpy.array copies its argument by default.
        return numpy.array(x)
    def grad(self, x, gz):
        # Identity op: gradient passes straight through.
        return gz
    def c_foreach(self, (x_i, ), (z_i, )):
        return "%(z)s_i = %(x)s_i;"
# class TensorCopy(_Elemwise):
# def impl(self, x):
# return numpy.array(x)
# def grad(self, x, gz):
# return gz
# def c_foreach(self, (x_i, ), (z_i, )):
# return "%(z)s_i = %(x)s_i;"
# Broadcast-based copy op; this rebinding replaces the hand-written
# TensorCopy class defined above.
TensorCopy = s2t.make_broadcast(scal.Identity)
tensor_copy = gof.op.constructor(TensorCopy)
##########################
......@@ -451,171 +526,198 @@ subtensor = gof.op.constructor(Subtensor)
# Arithmetic : Add
##########################
# Elemwise #
class AddElemwise(_Elemwise):
    """Elementwise addition of two same-shape tensors: z = x + y."""
    def impl(self, x, y):
        try:
            _assert_same_shapes(x, y)
        except Exception:
            # Debugging aid: flag where a shape mismatch originated, then
            # re-raise the original error unchanged.
            print('------ ERROR HERE')
            raise
        return x + y
    def grad(self, inputs, gz):
        # d(x+y)/dx = d(x+y)/dy = 1, so both inputs receive gz.
        return gz, gz
    def c_foreach(self, inputs, outputs):
        return "%(z)s_i = %(x)s_i + %(y)s_i;"
add_elemwise = gof.op.constructor(AddElemwise)
class AddElemwiseInplace(AddElemwise.inplace_version()):
    """In-place variant of AddElemwise: x += y, result stored in x."""
    def impl(self, x, y):
        _assert_same_shapes(x, y)
        # Mutates the first input buffer and returns it.
        x += y
        return x
add_elemwise_inplace = gof.op.constructor(AddElemwiseInplace)
# Scalar #
class AddScalar(TensorScalarOp):
def impl(self, x, a):
_assert_tensor_scalar(x, a)
return x + a
def grad(self, (x, a), gz):
return gz, sum(gz)
c_expr = "x_i + a"
add_scalar = gof.op.constructor(AddScalar)
class AddScalarInplace(AddScalar.inplace_version()):
    """In-place variant of AddScalar: x += a, result stored in x."""
    def impl(self, x, a):
        _assert_tensor_scalar(x, a)
        # Mutates the tensor input buffer and returns it.
        x += a
        return x
add_scalar_inplace = gof.op.constructor(AddScalarInplace)
# Dispatch: tensor+tensor -> add_elemwise; tensor+scalar (either side) -> add_scalar.
# NOTE(review): both names are re-bound to the broadcast-based versions further
# below, so these definitions are effectively dead — confirm before relying on them.
add = _scalar_switch(add_elemwise, add_scalar, add_scalar)
# NOTE(review): only two arguments here (no scalar-on-the-left variant) —
# presumably _scalar_switch has a default for it; verify.
add_inplace = _scalar_switch(add_elemwise_inplace, add_scalar_inplace)
# # Elemwise #
# class AddElemwise(_Elemwise):
# def impl(self, x, y):
# try:
# _assert_same_shapes(x, y)
# except Exception, e:
# print '------ ERROR HERE'
# raise
# return x + y
# def grad(self, (x, y), gz):
# return gz, gz
# def c_foreach(self, (x_i, y_i), (z_i, )):
# return "%(z)s_i = %(x)s_i + %(y)s_i;"
# add_elemwise = gof.op.constructor(AddElemwise)
# class AddElemwiseInplace(AddElemwise.inplace_version()):
# def impl(self, x, y):
# _assert_same_shapes(x, y)
# x += y
# return x
# add_elemwise_inplace = gof.op.constructor(AddElemwiseInplace)
# # Scalar #
# class AddScalar(TensorScalarOp):
# def impl(self, x, a):
# _assert_tensor_scalar(x, a)
# return x + a
# def grad(self, (x, a), gz):
# return gz, sum(gz)
# c_expr = "x_i + a"
# add_scalar = gof.op.constructor(AddScalar)
# class AddScalarInplace(AddScalar.inplace_version()):
# def impl(self, x, a):
# _assert_tensor_scalar(x, a)
# x += a
# return x
# add_scalar_inplace = gof.op.constructor(AddScalarInplace)
# add = _scalar_switch(add_elemwise, add_scalar, add_scalar)
# add_inplace = _scalar_switch(add_elemwise_inplace, add_scalar_inplace)
# Broadcasting addition built from the scalar Add op; these definitions
# supersede the _scalar_switch-based `add`/`add_inplace` bound above.
Add = s2t.make_broadcast(scal.Add)
# {0:0}: output 0 aliases (overwrites) input 0 — the in-place variant.
AddInplace = s2t.make_broadcast(scal.Add, {0:0})
add = gof.op.constructor(s2t.wrap_broadcast(Add))
add_inplace = gof.op.constructor(s2t.wrap_broadcast(AddInplace))
##########################
# Arithmetic : Sub
##########################
# Elemwise #
class SubElemwise(_Elemwise):
    """Elementwise subtraction of two same-shape tensors: z = x - y."""
    def impl(self, x, y):
        _assert_same_shapes(x, y)
        return x - y
    def grad(self, inputs, gz):
        # d(x-y)/dx = 1, d(x-y)/dy = -1
        return gz, -gz
    def c_foreach(self, inputs, outputs):
        return "%(z)s_i = %(x)s_i - %(y)s_i;"
sub_elemwise = gof.op.constructor(SubElemwise)
# # Elemwise #
# class SubElemwise(_Elemwise):
# def impl(self, x, y):
# _assert_same_shapes(x, y)
# return x - y
# def grad(self, (x, y), gz):
# return gz, -gz
# def c_foreach(self, (x_i, y_i), (z_i, )):
# return "%(z)s_i = %(x)s_i - %(y)s_i;"
# sub_elemwise = gof.op.constructor(SubElemwise)
class SubElemwiseInplace(SubElemwise.inplace_version()):
    """In-place variant of SubElemwise: x -= y, result stored in x."""
    def impl(self, x, y):
        _assert_same_shapes(x, y)
        # Mutates the first input buffer and returns it.
        x -= y
        return x
sub_elemwise_inplace = gof.op.constructor(SubElemwiseInplace)
# class SubElemwiseInplace(SubElemwise.inplace_version()):
# def impl(self, x, y):
# _assert_same_shapes(x, y)
# x -= y
# return x
# sub_elemwise_inplace = gof.op.constructor(SubElemwiseInplace)
# # Scalar #
# def sub_scalar_r(x, a):
# return add_scalar(x, -a)
# Scalar #
def sub_scalar_r(x, a):
    """Tensor minus scalar, rewritten as x + (-a)."""
    return add_scalar(x, -a)
# def sub_scalar_l(x, a):
# return add_scalar(-x, a)
def sub_scalar_l(x, a):
    """Scalar minus tensor, rewritten as (-x) + a."""
    return add_scalar(-x, a)
# def sub_scalar_rinplace(x, a):
# return add_scalar_inplace(x, -a)
def sub_scalar_rinplace(x, a):
    """In-place tensor minus scalar, rewritten as x += (-a)."""
    return add_scalar_inplace(x, -a)
# sub = _scalar_switch(sub_elemwise, sub_scalar_r, sub_scalar_l)
# sub_inplace = _scalar_switch(sub_elemwise_inplace, sub_scalar_rinplace)
# Broadcasting subtraction built from the scalar Sub op.
Sub = s2t.make_broadcast(scal.Sub)
# {0:0}: output 0 aliases (overwrites) input 0 — the in-place variant.
SubInplace = s2t.make_broadcast(scal.Sub, {0:0})
sub = gof.op.constructor(s2t.wrap_broadcast(Sub))
sub_inplace = gof.op.constructor(s2t.wrap_broadcast(SubInplace))
# BUG FIX: the two _scalar_switch assignments below were left active after the
# broadcast-based `sub`/`sub_inplace` above were introduced, clobbering them.
# The Add/Mul/Pow sections all define the broadcast versions last; disabled
# here to match that ordering.
# sub = _scalar_switch(sub_elemwise, sub_scalar_r, sub_scalar_l)
# sub_inplace = _scalar_switch(sub_elemwise_inplace, sub_scalar_rinplace)
##########################
# Arithmetic : Mul
##########################
# Elemwise #
class MulElemwise(_Elemwise):
    """Elementwise product of two same-shape tensors: z = x * y."""
    def impl(self, x, y):
        _assert_same_shapes(x, y)
        return x * y
    def grad(self, inputs, gz):
        x, y = inputs
        # Product rule: dz/dx = y, dz/dy = x.
        return mul(y, gz), mul(x, gz)
    def c_foreach(self, inputs, outputs):
        return "%(z)s_i = %(x)s_i * %(y)s_i;"
mul_elemwise = gof.op.constructor(MulElemwise)
class MulElemwiseInplace(MulElemwise.inplace_version()):
    """In-place variant of MulElemwise: x *= y, result stored in x."""
    def impl(self, x, y):
        _assert_same_shapes(x, y)
        # Mutates the first input buffer and returns it.
        x *= y
        return x
mul_elemwise_inplace = gof.op.constructor(MulElemwiseInplace)
# Scalar #
class Scale(TensorScalarOp):
def impl(self, x, a):
_assert_tensor_scalar(x, a)
return x * a
def grad(self, (x, a), gz):
return scale(a, gz), sum(mul_elemwise(x, gz))
c_expr = "%(x)s_i * _%(a)s"
scale = gof.op.constructor(Scale)
class ScaleInplace(Scale.inplace_version()):
    """In-place variant of Scale: x *= a, result stored in x."""
    def impl(self, x, a):
        _assert_tensor_scalar(x, a)
        # Mutates the tensor input buffer and returns it.
        x *= a
        return x
scale_inplace = gof.op.constructor(ScaleInplace)
# Dispatch: tensor*tensor -> mul_elemwise; tensor*scalar (either side) -> scale.
# NOTE(review): both names are re-bound to the broadcast-based versions below.
mul = _scalar_switch(mul_elemwise, scale, scale)
mul_inplace = _scalar_switch(mul_elemwise_inplace, scale_inplace)
# # Elemwise #
# class MulElemwise(_Elemwise):
# def impl(self, x, y):
# _assert_same_shapes(x, y)
# return x * y
# def grad(self, (x, y), gz):
# return mul(y, gz), mul(x, gz)
# def c_foreach(self, (x_i, y_i), (z_i, )):
# return "%(z)s_i = %(x)s_i * %(y)s_i;"
# mul_elemwise = gof.op.constructor(MulElemwise)
# class MulElemwiseInplace(MulElemwise.inplace_version()):
# def impl(self, x, y):
# _assert_same_shapes(x, y)
# x *= y
# return x
# mul_elemwise_inplace = gof.op.constructor(MulElemwiseInplace)
# # Scalar #
# class Scale(TensorScalarOp):
# def impl(self, x, a):
# _assert_tensor_scalar(x, a)
# return x * a
# def grad(self, (x, a), gz):
# return scale(a, gz), sum(mul_elemwise(x, gz))
# c_expr = "%(x)s_i * _%(a)s"
# scale = gof.op.constructor(Scale)
# class ScaleInplace(Scale.inplace_version()):
# def impl(self, x, a):
# _assert_tensor_scalar(x, a)
# x *= a
# return x
# scale_inplace = gof.op.constructor(ScaleInplace)
# mul = _scalar_switch(mul_elemwise, scale, scale)
# mul_inplace = _scalar_switch(mul_elemwise_inplace, scale_inplace)
# Broadcasting multiplication built from the scalar Mul op; these definitions
# supersede the _scalar_switch-based `mul`/`mul_inplace` bound above.
Mul = s2t.make_broadcast(scal.Mul)
# {0:0}: output 0 aliases (overwrites) input 0 — the in-place variant.
MulInplace = s2t.make_broadcast(scal.Mul, {0:0})
mul = gof.op.constructor(s2t.wrap_broadcast(Mul))
mul_inplace = gof.op.constructor(s2t.wrap_broadcast(MulInplace))
##########################
# Arithmetic : Div
##########################
# Elemwise #
class DivElemwise(_Elemwise):
    """Elementwise division of two same-shape tensors: z = x / y."""
    def impl(self, x, y):
        _assert_same_shapes(x, y)
        return x / y
    def grad(self, inputs, gz):
        x, y = inputs
        # Quotient rule: dz/dx = 1/y, dz/dy = -x / y^2.
        return div(gz, y), -div(mul(x, gz), (y*y))
    def c_foreach(self, inputs, outputs):
        return "%(z)s_i = %(x)s_i / %(y)s_i;"
div_elemwise = gof.op.constructor(DivElemwise)
# # Elemwise #
# class DivElemwise(_Elemwise):
# def impl(self, x, y):
# _assert_same_shapes(x, y)
# return x / y
# def grad(self, (x, y), gz):
# return div(gz, y), -div(mul(x, gz), (y*y))
# def c_foreach(self, (x_i, y_i), (z_i, )):
# return "%(z)s_i = %(x)s_i / %(y)s_i;"
# div_elemwise = gof.op.constructor(DivElemwise)
class DivElemwiseInplace(DivElemwise.inplace_version()):
    """In-place variant of DivElemwise: x /= y, result stored in x."""
    def impl(self, x, y):
        _assert_same_shapes(x, y)
        # Mutates the first input buffer and returns it.
        x /= y
        return x
div_elemwise_inplace = gof.op.constructor(DivElemwiseInplace)
# class DivElemwiseInplace(DivElemwise.inplace_version()):
# def impl(self, x, y):
# _assert_same_shapes(x, y)
# x /= y
# return x
# div_elemwise_inplace = gof.op.constructor(DivElemwiseInplace)
class InvElemwise(_Elemwise):
    """Elementwise reciprocal: z = 1 / x."""
    def impl(self, x):
        return 1.0/x
    def grad(self, x, gz):
        # d(1/x)/dx = -1/x^2 = -(1/x)^2
        ix = inv(x)
        return -gz * (ix * ix)
    def c_foreach(self, inputs, outputs):
        return "%(z)s_i = 1.0 / %(x)s_i;" #TODO: cast 1.0 to the dtype of x
inv_elemwise = gof.op.constructor(InvElemwise)
# class InvElemwise(_Elemwise):
# def impl(self, x):
# return 1.0/x
# def grad(self, x, gz):
# ix = inv(x)
# return -gz * (ix * ix)
# def c_foreach(self, (x_i, ), (z_i, )):
# return "%(z)s_i = 1.0 / %(x)s_i;" #TODO: cast 1.0 to the dtype of x
# inv_elemwise = gof.op.constructor(InvElemwise)
# # Scalar #
# def div_scalar_r(x, a):
# return scale(x, inv_elemwise(a))
# Scalar #
def div_scalar_r(x, a):
    """Tensor divided by scalar, rewritten as x * (1/a)."""
    return scale(x, inv_elemwise(a))
# def div_scalar_l(x, a):
# return scale(inv_elemwise(x), a)
def div_scalar_l(x, a):
    """Scalar divided by tensor, rewritten as (1/x) * a."""
    return scale(inv_elemwise(x), a)
# def div_scalar_rinplace(x, a):
# return scale_inplace(x, inv_elemwise(a))
def div_scalar_rinplace(x, a):
    """In-place tensor divided by scalar, rewritten as x *= (1/a)."""
    return scale_inplace(x, inv_elemwise(a))
# div = _scalar_switch(div_elemwise, div_scalar_r, div_scalar_l)
# div_inplace = _scalar_switch(div_elemwise_inplace, div_scalar_rinplace)
# Broadcasting division built from the scalar Div op.
Div = s2t.make_broadcast(scal.Div)
# {0:0}: output 0 aliases (overwrites) input 0 — the in-place variant.
DivInplace = s2t.make_broadcast(scal.Div, {0:0})
div = gof.op.constructor(s2t.wrap_broadcast(Div))
div_inplace = gof.op.constructor(s2t.wrap_broadcast(DivInplace))
# BUG FIX: the two _scalar_switch assignments below were left active after the
# broadcast-based `div`/`div_inplace` above were introduced, clobbering them.
# The Add/Mul/Pow sections all define the broadcast versions last; disabled
# here to match that ordering.
# div = _scalar_switch(div_elemwise, div_scalar_r, div_scalar_l)
# div_inplace = _scalar_switch(div_elemwise_inplace, div_scalar_rinplace)
......@@ -624,59 +726,66 @@ div_inplace = _scalar_switch(div_elemwise_inplace, div_scalar_rinplace)
# Arithmetic : Pow
##########################
# Elemwise #
class PowElemwise(_Elemwise):
    """Elementwise power of two same-shape tensors: z = x ** y."""
    def impl(self, x, y):
        _assert_same_shapes(x, y)
        return x ** y
    def grad(self, inputs, gz):
        x, y = inputs
        # dz/dx = y * x^(y-1); dz/dy = ln(x) * x^y.
        gx = gz * y * (pow_elemwise(x, y-1.0))
        gy = gz * log(x) * pow_elemwise(x, y)
        return gx, gy
    def c_foreach(self, inputs, outputs):
        return "%(z)s_i = pow(%(x)s_i, %(y)s_i);"
pow_elemwise = gof.op.constructor(PowElemwise)
class PowElemwiseInplace(PowElemwise.inplace_version()):
    """In-place variant of PowElemwise: x **= y, result stored in x."""
    def impl(self, x, y):
        _assert_same_shapes(x, y)
        # Mutates the first input buffer and returns it.
        x **= y
        return x
pow_elemwise_inplace = gof.op.constructor(PowElemwiseInplace)
# Scalar #
class PowScalarL(TensorScalarOp):
def impl(self, y, x):
_assert_tensor_scalar(y, x)
return x ** y
def grad(self, (y, x), gz):
gx = sum(gz * y * x ** (y-1.0))
gy = gz * log(x) * x ** y
return gy, gx
c_expr = "pow(%(a)s, %(x)s_i)"
pow_scalar_l = gof.op.constructor(PowScalarL)
class PowScalarR(TensorScalarOp):
    """Tensor raised to a scalar exponent: z = x ** a."""
    def impl(self, x, a):
        _assert_tensor_scalar(x, a)
        return x ** a
    def grad(self, inputs, gz):
        x, s = inputs
        # dz/dx = s * x^(s-1); dz/ds = sum(gz * x^s * ln(x)) since s is scalar.
        gx = scale(mul_elemwise(gz,pow_scalar_r(x, add_scalar(s,-1.0))), s)
        gs = sum(mul_elemwise(mul_elemwise(gz, pow_scalar_r(x,s)), log(x)))
        return gx, gs
    c_expr = "pow(%(x)s_i, _%(a)s)"
pow_scalar_r = gof.op.constructor(PowScalarR)
class PowScalarRInplace(PowScalarR.inplace_version()):
    """In-place variant of PowScalarR: x **= a, result stored in x."""
    def impl(self, x, a):
        _assert_tensor_scalar(x, a)
        # Mutates the tensor input buffer and returns it.
        x **= a
        return x
pow_scalar_r_inplace = gof.op.constructor(PowScalarRInplace)
# Dispatch: tensor**tensor -> pow_elemwise; tensor**scalar -> pow_scalar_r;
# scalar**tensor -> pow_scalar_l.
# NOTE(review): both names are re-bound to the broadcast-based versions below.
pow = _scalar_switch(pow_elemwise, pow_scalar_r, pow_scalar_l)
pow_inplace = _scalar_switch(pow_elemwise_inplace, pow_scalar_r_inplace)
# # Elemwise #
# class PowElemwise(_Elemwise):
# def impl(self, x, y):
# _assert_same_shapes(x, y)
# return x ** y
# def grad(self, (x, y), gz):
# gx = gz * y * (pow_elemwise(x, y-1.0))
# gy = gz * log(x) * pow_elemwise(x, y)
# return gx, gy
# def c_foreach(self, (x_i, y_i), (z_i, )):
# return "%(z)s_i = pow(%(x)s_i, %(y)s_i);"
# pow_elemwise = gof.op.constructor(PowElemwise)
# class PowElemwiseInplace(PowElemwise.inplace_version()):
# def impl(self, x, y):
# _assert_same_shapes(x, y)
# x **= y
# return x
# pow_elemwise_inplace = gof.op.constructor(PowElemwiseInplace)
# # Scalar #
# class PowScalarL(TensorScalarOp):
# def impl(self, y, x):
# _assert_tensor_scalar(y, x)
# return x ** y
# def grad(self, (y, x), gz):
# gx = sum(gz * y * x ** (y-1.0))
# gy = gz * log(x) * x ** y
# return gy, gx
# c_expr = "pow(%(a)s, %(x)s_i)"
# pow_scalar_l = gof.op.constructor(PowScalarL)
# class PowScalarR(TensorScalarOp):
# def impl(self, x, a):
# _assert_tensor_scalar(x, a)
# return x ** a
# def grad(self, (x, s), gz):
# gx = scale(mul_elemwise(gz,pow_scalar_r(x, add_scalar(s,-1.0))), s)
# gs = sum(mul_elemwise(mul_elemwise(gz, pow_scalar_r(x,s)), log(x)))
# return gx, gs
# c_expr = "pow(%(x)s_i, _%(a)s)"
# pow_scalar_r = gof.op.constructor(PowScalarR)
# class PowScalarRInplace(PowScalarR.inplace_version()):
# def impl(self, x, a):
# _assert_tensor_scalar(x, a)
# x **= a
# return x
# pow_scalar_r_inplace = gof.op.constructor(PowScalarRInplace)
# pow = _scalar_switch(pow_elemwise, pow_scalar_r, pow_scalar_l)
# pow_inplace = _scalar_switch(pow_elemwise_inplace, pow_scalar_r_inplace)
# Broadcasting power built from the scalar Pow op; these definitions
# supersede the _scalar_switch-based `pow`/`pow_inplace` bound above.
Pow = s2t.make_broadcast(scal.Pow)
# {0:0}: output 0 aliases (overwrites) input 0 — the in-place variant.
PowInplace = s2t.make_broadcast(scal.Pow, {0:0})
pow = gof.op.constructor(s2t.wrap_broadcast(Pow))
pow_inplace = gof.op.constructor(s2t.wrap_broadcast(PowInplace))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论