finalized a decent version of elemwise

Parent commit: c40eadf8
......@@ -3,6 +3,7 @@ import gof
from gof import current_mode, set_mode, build_mode, eval_mode, build_eval_mode, pop_mode, UNCOMPUTED, UNDEFINED, PythonR
import type_spec
import cutils
import numpy
import weakref
......@@ -21,37 +22,12 @@ def build(f, *args, **kwargs):
pop_mode()
return r
class Proxy(object):
    """Transparent attribute proxy around another object.

    Every attribute get/set/delete is forwarded to the wrapped target,
    except ``__class__`` and the ``_obj`` slot itself, which must be
    resolved on the proxy to avoid infinite recursion.  Note that special
    methods looked up on the type (``__len__``, ``__add__``, ...) are not
    forwarded by ``__getattribute__`` — only plain attribute access is.
    """
    __slots__ = ['_obj']  # no per-instance __dict__; only the wrapped target
    def __init__(self, obj = None):
        self._obj = obj
    def __getattribute__(self, attr):
        # '_obj' (and '__class__') are read from the proxy itself; everything
        # else is delegated to the wrapped object.
        if attr in ['__class__', '_obj']:
            return object.__getattribute__(self, attr)
        else:
            return getattr(object.__getattribute__(self, '_obj'), attr)
    def __setattr__(self, attr, value):
        # Only '_obj' is stored on the proxy; other writes go to the target.
        if attr in ['_obj']:
            object.__setattr__(self, attr, value)
        else:
            setattr(self._obj, attr, value)
    def __delattr__(self, attr):
        # Deletions are always forwarded to the wrapped object.
        delattr(self._obj, attr)
def as_string(*rs):
    """Render the graph(s) that compute the results *rs* as a string.

    For a single result the enclosing brackets produced by
    gof.graph.as_string are stripped; for several results the raw
    rendering is returned unchanged.
    """
    graph_inputs = gof.graph.inputs(rs)
    rendered = gof.graph.as_string(graph_inputs, rs)
    return rendered[1:-1] if len(rs) == 1 else rendered
# return str(gof.Env(gof.graph.inputs([r]), [r]))[1:-1]
def print_graph(*rs):
    # Convenience wrapper: render the graph(s) for *rs* and write the result
    # to stdout (Python 2 print statement).
    print as_string(*rs)
......@@ -77,8 +53,6 @@ def wrap(x):
return x
elif isinstance(x, omega_op):
return x.out
elif isinstance(x, Proxy):
return wrap(x._obj)
else:
return literal(x)
......@@ -126,20 +100,22 @@ def cgetspecs(names, vals, converters):
specs = weave.ext_tools.assign_variable_types(names, d, type_converters = converters) #, auto_downcast = 0)
return d, specs
def cgen(name, behavior, inames, ivals, onames, ovals, converters = None):
def cgen(name, behavior, names, vals, converters = None):
if not converters:
converters = type_spec.default
for converter in converters:
assert isinstance(converter, type_spec.omega_type_converter_extension)
d, specs = cgetspecs(inames + onames, ivals + ovals, converters)
d, specs = cgetspecs(names, vals, converters)
template = {}
template['name'] = name
template['code'] = behavior
template['members'] = "\n".join([spec.struct_members_code() for spec in specs])
template['support'] = "\n".join([spec.struct_support_code() for spec in specs])
template['typedefs'] = "\n".join([spec.struct_typedefs() for spec in specs])
template['members'] = "".join([spec.struct_members_code() for spec in specs])
template['support'] = "".join([spec.struct_support_code() for spec in specs])
template['typedefs'] = "".join([spec.struct_typedefs() for spec in specs])
template['incref'] = "".join(["Py_INCREF(py_%s);\n" % spec.name for spec in specs if spec.use_ref_count])
template['decref'] = "".join(["Py_DECREF(py_%s);\n" % spec.name for spec in specs if spec.use_ref_count])
template['struct_contents'] = """
%(typedefs)s
......@@ -148,23 +124,41 @@ def cgen(name, behavior, inames, ivals, onames, ovals, converters = None):
%(support)s
void execute(void) {
void init(void) {
%(incref)s
}
void cleanup(void) {
%(decref)s
}
int execute(void) {
%(code)s
return 0;
}
""" % template
template['md5'] = md5.md5(template['struct_contents']).hexdigest()
template['struct_name'] = "_omega_%(name)s_%(md5)s" % template
struct = "struct %(struct_name)s { %(struct_contents)s\n};" % template
code = "%(struct_name)s* __STRUCT_P = &%(struct_name)s();\n" % template
code += "\n".join([spec.struct_import_code() for spec in specs])
code += "\n__STRUCT_P->execute();\n"
code += "return_val = 10;"
code += "\n//%(md5)s" % template
static = """
int %(struct_name)s_executor(%(struct_name)s* self) {
return self->execute();
}
return d, code, struct, converters
void %(struct_name)s_destructor(void* executor, void* self) {
((%(struct_name)s*)self)->cleanup();
free(self);
}
""" % template
code = "%(struct_name)s* __STRUCT_P = new %(struct_name)s();\n" % template
code += "".join([spec.struct_import_code() for spec in specs])
code += "__STRUCT_P->init();\n"
code += "return_val = PyCObject_FromVoidPtrAndDesc((void*)(&%(struct_name)s_executor), __STRUCT_P, %(struct_name)s_destructor);\n" % template
return d, code, struct + static, converters
def make_static(cls, fname):
......@@ -205,15 +199,18 @@ class omega_op(gof.PythonOp):
return UNDEFINED
def c_code(self, converters = None):
behavior = self.c_impl(self.inputs, self.outputs)
(inames, onames), _1, _2, _3 = inspect.getargspec(self.c_impl)
return cgen(self.__class__.__name__, behavior, inames, self.inputs, onames, self.outputs, converters)
(inames, onames), behavior = self._c_impl()
return cgen(self.__class__.__name__, behavior, inames + onames, self.inputs + self.outputs, converters)
def _c_alloc(self):
self.c_alloc(self.inputs, self.outputs)
def c_alloc(inputs, outputs):
raise NotImplementedError()
def _c_impl(self):
(inames, onames), _1, _2, _3 = inspect.getargspec(self.c_impl)
return (inames, onames), self.c_impl(self.inputs, self.outputs)
def c_impl(inputs, outputs):
raise NotImplementedError()
......@@ -221,15 +218,15 @@ class omega_op(gof.PythonOp):
def c_thunk(self):
self._c_alloc()
d, code, struct, converters = self.c_code()
def thunk():
weave.inline(code, d.keys(), local_dict = d, global_dict = {}, support_code = struct, type_converters = converters)
thunk = weave.inline(code, d.keys(), local_dict = d, global_dict = {}, support_code = struct, type_converters = converters)
return thunk
def c_perform(self):
self.c_thunk()()
thunk = self.c_thunk()
cutils.run_cthunk(thunk)
def elemwise_wrap(beforeloop, inloop, afterloop, loop_vars, writable_loop_vars):
def elemwise_wrap_old(beforeloop, inloop, afterloop, loop_vars, writable_loop_vars):
return """
%(beforeloop)s
for (int i = 0; i < N_%(v1)s[0]; i++) {
......@@ -249,6 +246,97 @@ def elemwise_wrap(beforeloop, inloop, afterloop, loop_vars, writable_loop_vars):
inloop = inloop,
afterloop = afterloop)
def elemwise_loopcode(loopcode, init_template, next_template, acquire_template, cleanup_template, loop_vars, writable_loop_vars, aliases):
all_loop_vars = loop_vars + writable_loop_vars
template = dict(
init = "".join([init_template % dict(loop_var = loop_var) for loop_var in all_loop_vars]),
next = "".join([next_template % dict(loop_var = loop_var) for loop_var in all_loop_vars]),
cleanup = "".join([cleanup_template % dict(loop_var = loop_var) for loop_var in all_loop_vars]),
idefs = "".join([("_%(loop_var)s_dtype %(loop_var)s = " + acquire_template + ";\n")
% dict(loop_var = loop_var) for loop_var in loop_vars]),
odefs = "".join([("_%(loop_var)s_dtype& %(loop_var)s = " + acquire_template + ";\n")
% dict(loop_var = loop_var) for loop_var in writable_loop_vars]),
aliasdefs = "".join(["_%(v1)s_dtype %(v1)s = %(v2)s;\n" % dict(v1=v1, v2=v2)
for v1, v2 in aliases.items()]),
loopcode = loopcode
)
code = """
%(init)s
while (__elemwise_size--) {
%(idefs)s
%(odefs)s
%(aliasdefs)s
%(loopcode)s
%(next)s
}
%(cleanup)s
""" % template
return code
def elemwise_wrap(beforeloop, inloop, afterloop, loop_vars, writable_loop_vars, aliases):
    """Wrap the per-element C code `inloop` into a complete traversal.

    Emits two loop bodies via elemwise_loopcode — a fast pointer-walk used
    when every operand is all-C-contiguous or all-F-contiguous, and a
    general version driven by numpy PyArrayIterObject iterators — plus a
    runtime contiguity check that selects between them.  `beforeloop` and
    `afterloop` are spliced around the loop verbatim.

    Fixed: removed a leftover debug `print code` statement (it spammed the
    generated C source to stdout on every compilation) and stale
    commented-out error-handling code.
    """
    # General case: arbitrary strides, walked with numpy's array iterators.
    general_init = "PyArrayIterObject* _%(loop_var)s_iter = (PyArrayIterObject*)PyArray_IterNew((PyObject*)_%(loop_var)s_array);\n"
    general_next = "PyArray_ITER_NEXT(_%(loop_var)s_iter);\n"
    general_acquire = "*((_%(loop_var)s_dtype*)_%(loop_var)s_iter->dataptr)"
    general_cleanup = "if (_%(loop_var)s_iter) Py_DECREF(_%(loop_var)s_iter);\n"
    # Fast path: all operands contiguous in the same order -> raw pointers.
    contiguous_init = "_%(loop_var)s_dtype* _%(loop_var)s_iter = (_%(loop_var)s_dtype*)PyArray_DATA(_%(loop_var)s_array);\n"
    contiguous_next = "_%(loop_var)s_iter++;\n"
    contiguous_acquire = "*_%(loop_var)s_iter"
    contiguous_cleanup = ""
    all_loop_vars = loop_vars + writable_loop_vars
    template = dict(
        # Any loop variable works for sizing: all operands share the loop extent.
        v1 = all_loop_vars[0],
        beforeloop = beforeloop,
        general_loop = elemwise_loopcode(
            inloop,
            general_init, general_next, general_acquire, general_cleanup,
            loop_vars, writable_loop_vars, aliases),
        contiguous_loop = elemwise_loopcode(
            inloop,
            contiguous_init, contiguous_next, contiguous_acquire, contiguous_cleanup,
            loop_vars, writable_loop_vars, aliases),
        contiguity_check = "".join(["all_c_contiguous &= PyArray_ISCARRAY(_%(loop_var)s_array);\n" \
                                    "all_f_contiguous &= PyArray_ISFARRAY(_%(loop_var)s_array);\n" \
                                    % dict(loop_var = loop_var)
                                    for loop_var in all_loop_vars]),
        afterloop = afterloop)
    code = """
npy_intp __elemwise_size = PyArray_SIZE(_%(v1)s_array);
%(beforeloop)s
bool all_c_contiguous = 1;
bool all_f_contiguous = 1;
%(contiguity_check)s
if (all_c_contiguous || all_f_contiguous) {
%(contiguous_loop)s
}
else {
%(general_loop)s
}
%(afterloop)s
""" % template
    return code
def upcast(dtype, *dtypes):
    """Return the dtype numpy yields when values of all given dtypes are added.

    Implemented by actually adding 0-d arrays so the answer follows numpy's
    own promotion rules exactly.
    """
    acc = numpy.zeros((), dtype = dtype)
    for other in dtypes:
        acc = acc + numpy.zeros((), dtype = other)
    return acc.dtype
class elemwise(omega_op):
......@@ -274,27 +362,44 @@ class elemwise(omega_op):
raise Exception("cannot infer an allocation policy automatically for variable " \
"%s because it is not part of the elementwise loop - "\
"please override the c_alloc method" % oname[1:])
model = None
shape, dtype = None, None
for iname, input in zip(inames, self.inputs):
if not iname.startswith("_"):
model = input.data
if model is None:
shape = input.data
if shape is None:
raise Exception("cannot infer an allocation policy automatically for output variables " \
"because there is no input variable in the loop from which to get the shape")
dtype = upcast(*[input.data.dtype
for iname, input in zip(inames, self.inputs)
if isinstance(input.data, numpy.ndarray)])
for output in self.outputs:
inplace_inputs = dmap.get(output, [])
if inplace_inputs:
assert len(inplace_inputs) == 1
output.data = inplace_inputs[0].data
else:
output.data = numpy.ndarray(model.shape, model.dtype)
output.data = numpy.ndarray(shape, dtype)
def _c_init(self):
    # Pair c_init's introspected (input-names, output-names) signature with
    # the C code it generates, so callers can declare/mangle the variables.
    (inames, onames), _1, _2, _3 = inspect.getargspec(self.c_init)
    return (inames, onames), self.c_init(self.inputs, self.outputs)
def c_init(inputs, outputs):
    # Hook: C code emitted once before the elementwise loop.  Deliberately
    # has no `self` — the framework introspects this exact signature.
    return ""
def _c_foreach(self):
    # Pair c_foreach's introspected (input-names, output-names) signature
    # with the per-element C code it generates.
    (inames, onames), _1, _2, _3 = inspect.getargspec(self.c_foreach)
    return (inames, onames), self.c_foreach(self.inputs, self.outputs)
def c_foreach(inputs, outputs):
    # Hook: C code executed once per element inside the loop (no `self`;
    # the framework introspects this signature).
    return ""
def _c_finalize(self):
    # Pair c_finalize's introspected (input-names, output-names) signature
    # with the C code it generates for after the loop.
    (inames, onames), _1, _2, _3 = inspect.getargspec(self.c_finalize)
    return (inames, onames), self.c_finalize(self.inputs, self.outputs)
def c_finalize(inputs, outputs):
    # Hook: C code emitted once after the elementwise loop (no `self`;
    # the framework introspects this signature).
    return ""
......@@ -306,37 +411,81 @@ class elemwise(omega_op):
return "_" + name
try:
self.c_impl(self.inputs, self.outputs)
self._c_impl()
raise Exception("c_impl is not used by elemwise ops - define behavior in c_foreach instead")
except NotImplementedError:
pass
before = self.c_init(self.inputs, self.outputs)
during = self.c_foreach(self.inputs, self.outputs)
after = self.c_finalize(self.inputs, self.outputs)
# Get c_init, etc.'s argument names so we can declare them properly in the C code
spec_b = inspect.getargspec(self.c_init)
spec_d = inspect.getargspec(self.c_foreach)
spec_a = inspect.getargspec(self.c_finalize)
spec_b, before = self._c_init()
spec_d, during = self._c_foreach()
spec_a, after = self._c_finalize()
# Sanity check - apart from loop vars, variables are shared in the before/during/after parts
if before and spec_b != spec_d:
raise Exception("The input signature of c_init differs from the input signature of c_foreach.")
if after and spec_a != spec_d:
raise Exception("The input signature of c_finalize differs from the input signature of c_foreach.")
(inames, onames), _1, _2, _3 = spec_d
(inames, onames) = spec_d
aliases = {}
if isinstance(self, inplace):
dmap = self.destroy_map()
for oname, output in zip(onames, self.outputs):
if not oname.startswith("_"):
for input in dmap.get(output, []):
aliases[inames[self.inputs.index(input)]] = oname
behavior = elemwise_wrap(before, during, after,
[iname for iname in inames if not iname.startswith("_")],
[oname for oname in onames if not oname.startswith("_")])
[iname for iname in inames if not iname.startswith("_") and not iname in aliases],
[oname for oname in onames if not oname.startswith("_")],
aliases)
inames = [mangle(name) for name in inames]
onames = [mangle(name) for name in onames]
return cgen(self.__class__.__name__, behavior, inames, self.inputs, onames, self.outputs, converters)
return cgen(self.__class__.__name__, behavior, inames + onames, self.inputs + self.outputs, converters)
@classmethod
def inplace_version(cls, dmap = {0: 0}):
    """Build a subclass of `cls` that overwrites inputs in place.

    `dmap` maps output index -> input index; output i is written into the
    storage of input dmap[i].  (The mutable default is never mutated here.)
    Returns the new class; callers typically also replace its `impl`.

    Fixed: the {0: 0} branch computed `cls.__name__ + "_inplace" % dmap` —
    the `% dmap` was a no-op (no conversion specifiers with a mapping
    argument) and clearly unintended; it is removed.
    """
    # In-place outputs must participate in the elementwise loop
    # (loop variables are the names not prefixed with '_').
    (inames, onames), _1, _2, _3 = inspect.getargspec(cls.c_foreach)
    for i, oname in enumerate(onames):
        if i in dmap:
            assert not oname.startswith("_")
    class C(cls, inplace):
        def destroy_map(self):
            # NOTE(review): cls.destroy_map() is called without an instance,
            # and self.dmap is read while the enclosing scope binds `dmap` —
            # confirm both resolve as intended upstream.
            ret = cls.destroy_map()
            for output, input in self.dmap.items():
                ret[self.outputs.index(output)] = [self.inputs.index(input)]
            return ret
        def _impl(self):
            if self.impl is not cls.impl:
                # If the user sets his own inplace operation, we use it
                return cls._impl(self)
            else:
                res = cls._impl(self)
                if isinstance(res, gof.Result):
                    res = [res]
                else:
                    res = copy(res)
                for output, input in dmap.items():
                    # The default implementation returned a copy, so we just
                    # overwrite the original input with the contents of that
                    # copy.  This is not meant to be efficient, only correct.
                    a = self.inputs[input].data
                    a[:] = res[output]
                    res[output] = a
                if len(res) == 1:
                    return res[0]
                else:
                    return res
    if dmap == {0: 0}:
        C.__name__ = cls.__name__ + "_inplace"
    else:
        C.__name__ = cls.__name__ + "_inplace%s" % dmap
    return C
def scalar_switch(normal_f, scalar_f, scalar_f_reverse = None):
......@@ -419,7 +568,7 @@ def assert_same_shapes(impl):
return ret
# Wrapper to ensure that the last input to impl is a scalar
def tensor_scalar_op(impl):
def tensor_scalar_impl(impl):
def ret(x, a):
if a.shape:
raise ValueError("The second argument to %s must be a scalar." % impl)
......@@ -449,83 +598,101 @@ def tensor_scalar_op(impl):
## Addition ##
class add(omega_op):
class add_elemwise(elemwise):
impl = assert_same_shapes(numpy.ndarray.__add__)
def grad(x, y, gz):
return gz
def alloc(x, y):
return numpy.ndarray(x.shape, dtype = x.dtype)
def c_impl(x, y, z):
return """
for (int i = 0; i < z.ncols; i++) {
for (int j = 0; j < z.nrows; j++) {
z(i, j) = x(i, j) + y(i, j);
}
}
"""
def c_foreach((x, y), (z, )):
return "z = x + y;"
class proto_add_elemwise(omega_op):
def grad(x, y, gz):
return gz
iadd_elemwise = add_elemwise.inplace_version()
iadd_elemwise.impl = assert_same_shapes(numpy.ndarray.__iadd__)
class add_elemwise(proto_add_elemwise):
impl = assert_same_shapes(numpy.ndarray.__add__)
class iadd_elemwise(proto_add_elemwise, inplace):
impl = assert_same_shapes(numpy.ndarray.__iadd__)
# class proto_add_elemwise(omega_op):
# def grad(x, y, gz):
# return gz
# class add_elemwise(proto_add_elemwise):
# impl = assert_same_shapes(numpy.ndarray.__add__)
# class iadd_elemwise(proto_add_elemwise, inplace):
# impl = assert_same_shapes(numpy.ndarray.__iadd__)
class tensor_scalar_op(elemwise):
def c_init((x, _a), (z, )):
return "_a_dtype a = _a[0];"
def _c_foreach(self):
return (('x', '_a'), ('z', )), "z = %s;" % self.c_operation
class proto_add_scalar(omega_op):
class add_scalar(tensor_scalar_op):
impl = tensor_scalar_impl(numpy.ndarray.__add__)
def grad(x, a, gz):
return gz, sum(gz)
c_expr = "x + a"
class add_scalar(proto_add_scalar):
impl = tensor_scalar_op(numpy.ndarray.__add__)
# def c_impl(x, s, z):
# """
# if (*__z == NULL) {
# *__z = new ndarray
# }
# ndarray& z = **__z
# """
# return """
# z.resize_like(x);
# for (int i = 0; i < z.size(); i++) {
# z[i] = x[i] * s;
# }
# return z;
# """
class iadd_scalar(proto_add_scalar, inplace):
impl = tensor_scalar_op(numpy.ndarray.__iadd__)
class proto_twice(omega_op):
iadd_scalar = add_scalar.inplace_version()
iadd_scalar.impl = tensor_scalar_impl(numpy.ndarray.__iadd__)
# class proto_add_scalar(omega_op):
# def grad(x, a, gz):
# return gz, sum(gz)
# class add_scalar(proto_add_scalar):
# impl = tensor_scalar_impl(numpy.ndarray.__add__)
# class iadd_scalar(proto_add_scalar, inplace):
# impl = tensor_scalar_impl(numpy.ndarray.__iadd__)
class twice(elemwise):
def grad(x, gz):
return scale(gz, 2.0)
class twice(proto_twice):
def impl(x):
return x + x
def c_foreach((x, ), (z, )):
"z = x + x;"
class itwice(proto_twice, inplace):
def impl(x):
x += x
return x
itwice = twice.inplace_version()
# class proto_twice(omega_op):
# def grad(x, gz):
# return scale(gz, 2.0)
# class twice(proto_twice):
# def impl(x):
# return x + x
# class itwice(proto_twice, inplace):
# def impl(x):
# x += x
# return x
## Subtraction ##
class proto_sub_elemwise(omega_op):
class sub_elemwise(elemwise):
    """Elementwise subtraction: z = x - y (shapes must match)."""
    impl = assert_same_shapes(numpy.ndarray.__sub__)
    def grad(x, y, gz):
        # d(x-y)/dx = 1, d(x-y)/dy = -1
        return gz, -gz
    # Python 2 tuple-unpacking signature: inputs (x, y), outputs (z,).
    def c_foreach((x, y), (z, )):
        return "z = x - y;"
class sub_elemwise(proto_sub_elemwise):
impl = assert_same_shapes(numpy.ndarray.__sub__)
isub_elemwise = sub_elemwise.inplace_version()
isub_elemwise.impl = assert_same_shapes(numpy.ndarray.__isub__)
# class proto_sub_elemwise(omega_op):
# def grad(x, y, gz):
# return gz, -gz
# class sub_elemwise(proto_sub_elemwise):
# impl = assert_same_shapes(numpy.ndarray.__sub__)
class isub_elemwise(proto_sub_elemwise, inplace):
impl = assert_same_shapes(numpy.ndarray.__isub__)
# class isub_elemwise(proto_sub_elemwise, inplace):
# impl = assert_same_shapes(numpy.ndarray.__isub__)
def sub_scalar_r(x, a):
return add_scalar(x, -a)
......@@ -539,67 +706,127 @@ def isub_scalar_r(x, a):
## Element-wise multiplication ##
class proto_mul_elemwise(omega_op):
class mul_elemwise(elemwise):
    """Elementwise multiplication: z = x * y (shapes must match)."""
    impl = assert_same_shapes(numpy.ndarray.__mul__)
    def grad(x, y, gz):
        # Product rule: d/dx = y * gz, d/dy = x * gz
        return mul(y, gz), mul(x, gz)
    # Python 2 tuple-unpacking signature: inputs (x, y), outputs (z,).
    def c_foreach((x, y), (z, )):
        return "z = x * y;"
class mul_elemwise(proto_mul_elemwise):
impl = assert_same_shapes(numpy.ndarray.__mul__)
imul_elemwise = mul_elemwise.inplace_version()
imul_elemwise.impl = assert_same_shapes(numpy.ndarray.__imul__)
# class proto_mul_elemwise(omega_op):
# def grad(x, y, gz):
# return mul(y, gz), mul(x, gz)
class imul_elemwise(proto_mul_elemwise, inplace):
impl = assert_same_shapes(numpy.ndarray.__imul__)
# class mul_elemwise(proto_mul_elemwise):
# impl = assert_same_shapes(numpy.ndarray.__mul__)
# class imul_elemwise(proto_mul_elemwise, inplace):
# impl = assert_same_shapes(numpy.ndarray.__imul__)
class proto_scale(omega_op):
class scale(tensor_scalar_op):
    """Multiply a tensor by a scalar: z = x * a."""
    impl = tensor_scalar_impl(numpy.ndarray.__mul__)
    def grad(x, a, gz):
        # NOTE(review): scale(a, gz) passes the scalar first, although scale
        # takes (tensor, scalar) — confirm the intended argument order.
        return scale(a, gz), sum(mul_elemwise(x, gz))
    # C expression substituted into tensor_scalar_op's loop body.
    c_expr = "x * a"
iscale = scale.inplace_version()
iscale.impl = tensor_scalar_impl(numpy.ndarray.__imul__)
class scale(proto_scale):
impl = tensor_scalar_op(numpy.ndarray.__mul__)
# class proto_scale(omega_op):
# def grad(x, a, gz):
# return scale(a, gz), sum(mul_elemwise(x, gz))
class iscale(proto_scale, inplace):
impl = tensor_scalar_op(numpy.ndarray.__imul__)
# class scale(proto_scale):
# impl = tensor_scalar_impl(numpy.ndarray.__mul__)
# class iscale(proto_scale, inplace):
# impl = tensor_scalar_impl(numpy.ndarray.__imul__)
class proto_sqr(omega_op):
class sqr(elemwise):
def grad(x, gz):
return scale(mul_elemwise(x, gz), 2.0)
def impl(x):
return x * x
def c_foreach((x, ), (z, )):
"z = x * x;"
class sqr(proto_sqr):
impl = lambda x: numpy.multiply(x, x)
isqr = sqr.inplace_version()
isqr.impl = lambda x: x.__imul__(x)
class isqr(proto_sqr, inplace):
impl = lambda x: x.__imul__(x)
# class proto_sqr(omega_op):
# def grad(x, gz):
# return scale(mul_elemwise(x, gz), 2.0)
class proto_sqrt(omega_op):
# class sqr(proto_sqr):
# impl = lambda x: numpy.multiply(x, x)
# class isqr(proto_sqr, inplace):
# impl = lambda x: x.__imul__(x)
class sqrt(elemwise):
def grad(x, gz):
return scale(div(gz, sqrt(x)), 0.5)
class sqrt(proto_sqrt):
impl = numpy.sqrt
def c_foreach((x, ), (z, )):
"z = pow(x, 0.5);"
class isqrt(proto_sqrt, inplace):
impl = lambda x: x.__ipow__(0.5)
isqrt = sqrt.inplace_version()
isqrt.impl = lambda x: x.__ipow__(0.5)
# class proto_sqrt(omega_op):
# def grad(x, gz):
# return scale(div(gz, sqrt(x)), 0.5)
# class sqrt(proto_sqrt):
# impl = numpy.sqrt
# class isqrt(proto_sqrt, inplace):
# impl = lambda x: x.__ipow__(0.5)
## Exponentiation ##
class exp(omega_op):
class exp(elemwise):
    """Elementwise exponential: z = exp(x).  (No grad defined here.)"""
    impl = numpy.exp
    # Python 2 tuple-unpacking signature: inputs (x,), outputs (z,).
    def c_foreach((x, ), (z, )):
        return "z = exp(x);"
# class exp(omega_op):
# impl = numpy.exp
## Element-wise division ##
class proto_div_elemwise(omega_op):
class div_elemwise(elemwise):
    """Elementwise division: z = x / y (shapes must match)."""
    impl = assert_same_shapes(numpy.ndarray.__div__)
    def grad(x, y, gz):
        # Quotient rule: d/dx = gz / y, d/dy = -x * gz / y**2
        return div(gz, y), -div(mul(x, gz), sqr(y))
    # Python 2 tuple-unpacking signature: inputs (x, y), outputs (z,).
    def c_foreach((x, y), (z, )):
        return "z = x / y;"
class div_elemwise(proto_div_elemwise):
impl = assert_same_shapes(numpy.ndarray.__div__)
idiv_elemwise = div_elemwise.inplace_version()
idiv_elemwise.impl = assert_same_shapes(numpy.ndarray.__idiv__)
# class proto_div_elemwise(omega_op):
# def grad(x, y, gz):
# return div(gz, y), -div(mul(x, gz), sqr(y))
class idiv_elemwise(proto_div_elemwise, inplace):
impl = assert_same_shapes(numpy.ndarray.__idiv__)
# class div_elemwise(proto_div_elemwise):
# impl = assert_same_shapes(numpy.ndarray.__div__)
# class idiv_elemwise(proto_div_elemwise, inplace):
# impl = assert_same_shapes(numpy.ndarray.__idiv__)
def div_scalar_r(x, a):
return scale(x, inv_elemwise(a))
......@@ -614,28 +841,48 @@ def idiv_scalar_r(x, a):
## Scaling ##
class proto_neg(omega_op):
class neg(elemwise):
    """Elementwise negation: z = -x."""
    impl = numpy.ndarray.__neg__
    def grad(x, gz):
        # d(-x)/dx = -1
        return -gz
    # Python 2 tuple-unpacking signature: inputs (x,), outputs (z,).
    def c_foreach((x, ), (z, )):
        return "z = -x;"
class neg(proto_neg):
impl = numpy.ndarray.__neg__
ineg = neg.inplace_version()
ineg.impl = lambda x: x.__imul__(-1)
class ineg(proto_neg, inplace):
impl = lambda x: x.__imul__(-1)
# class proto_neg(omega_op):
# def grad(x, gz):
# return -gz
# class neg(proto_neg):
# impl = numpy.ndarray.__neg__
# class ineg(proto_neg, inplace):
# impl = lambda x: x.__imul__(-1)
class proto_inv_elemwise(omega_op):
def grad(x, gz):
raise NotImplemented
class inv_elemwise(omega_op):
class inv_elemwise(elemwise):
impl = lambda x: 1 / x
def grad(x, gz):
return -gz
def c_foreach((x, ), (z, )):
return "z = 1 / x;"
class iinv_elemwise(omega_op, inplace):
def impl(x):
x[:] = 1 / x
iinv_elemwise = inv_elemwise.inplace_version()
# class proto_inv_elemwise(omega_op):
# def grad(x, gz):
# raise NotImplemented
# class inv_elemwise(omega_op):
# impl = lambda x: 1 / x
# class iinv_elemwise(omega_op, inplace):
# def impl(x):
# x[:] = 1 / x
## Dot product ##
......@@ -666,46 +913,116 @@ class array_copy(omega_op):
## Power ##
class proto_pow(omega_op):
class pow_elemwise(elemwise):
impl = assert_same_shapes(numpy.ndarray.__pow__)
def grad(x, s, gz):
return gz * s * (pow_elemwise(x, s-1.0))
def c_foreach((x, s), (z, )):
return "z = pow(x, s)"
class pow_elemwise(proto_pow):
impl = assert_same_shapes(numpy.ndarray.__pow__)
ipow_elemwise = pow_elemwise.inplace_version()
ipow_elemwise.impl = assert_same_shapes(numpy.ndarray.__ipow__)
# class proto_pow(omega_op):
# def grad(x, s, gz):
# return gz * s * (pow_elemwise(x, s-1.0))
class ipow_elemwise(proto_pow, inplace):
impl = assert_same_shapes(numpy.ndarray.__ipow__)
# class pow_elemwise(proto_pow):
# impl = assert_same_shapes(numpy.ndarray.__pow__)
# class ipow_elemwise(proto_pow, inplace):
# impl = assert_same_shapes(numpy.ndarray.__ipow__)
class pow_scalar_l(omega_op):
impl = tensor_scalar_op(numpy.ndarray.__pow__)
class pow_scalar_l(tensor_scalar_op):
    """Scalar base raised to a tensor exponent: z = a ** x."""
    # Arguments are swapped so the scalar becomes the base.
    impl = tensor_scalar_impl(lambda x, y: numpy.ndarray.__pow__(y, x))
    def grad(x, s, gz):
        # NOTE(review): d(s**x)/dx = s**x * log(s); the expression below is
        # the power rule for x**s instead — confirm which is intended.
        return gz * x * (pow_scalar_l(s,x-1.0))
    # C expression substituted into tensor_scalar_op's loop body.
    c_expr = "pow(a, x)"
class pow_scalar_r(omega_op):
impl = tensor_scalar_op(numpy.ndarray.__pow__)
class pow_scalar_r(tensor_scalar_op):
    """Tensor raised to a scalar power: z = x ** a."""
    impl = tensor_scalar_impl(numpy.ndarray.__pow__)
    def grad(x, s, gz):
        # Power rule: d(x**s)/dx = s * x**(s-1)
        return gz * s * (pow_scalar_r(x,s-1.0))
    # C expression substituted into tensor_scalar_op's loop body.
    c_expr = "pow(x, a)"
class ipow_scalar_r(omega_op, inplace):
impl = tensor_scalar_op(numpy.ndarray.__ipow__)
def grad(x, s, gz):
return gz * s * (pow_scalar_r(x,s-1.0))
ipow_scalar_r = pow_scalar_r.inplace_version()
ipow_scalar_r.impl = tensor_scalar_impl(numpy.ndarray.__ipow__)
# class pow_scalar_l(omega_op):
# impl = tensor_scalar_impl(numpy.ndarray.__pow__)
# def grad(x, s, gz):
# return gz * x * (pow_scalar_l(s,x-1.0))
# class pow_scalar_r(omega_op):
# impl = tensor_scalar_impl(numpy.ndarray.__pow__)
# def grad(x, s, gz):
# return gz * s * (pow_scalar_r(x,s-1.0))
# class ipow_scalar_r(omega_op, inplace):
# impl = tensor_scalar_impl(numpy.ndarray.__ipow__)
# def grad(x, s, gz):
# return gz * s * (pow_scalar_r(x,s-1.0))
## Others ##
class minmax(omega_op):
class minmax(elemwise):
nout = 2
def impl(x):
return x.min, x.max
def c_alloc((x, ), (_min, _max)):
_min.data = numpy.ndarray((), x.dtype)
_max.data = numpy.ndarray((), x.dtype)
def c_init((x, ), (_min, _max)):
return """
_x_dtype min = _x[0];
_x_dtype max = _x[0];
"""
def c_foreach((x, ), (_min, _max)):
return """
if (x < min) min = x;
if (x > max) max = x;
"""
def c_finalize((x, ), (_min, _max)):
return """
_min[0] = min;
_max[0] = max;
"""
# class minmax(omega_op):
# nout = 2
# def impl(x):
# return x.min, x.max
class fill(omega_op):
class fill(elemwise):
    """Produce an array shaped like `model`, filled with the scalar `value`."""
    impl = lambda model, value: (model * 0) + value
    # '_'-prefixed names (_value) stay outside the elementwise loop.
    def c_init((model, _value), (z, )):
        # Read the scalar once, before the loop, cast to the output dtype.
        return "_z_dtype value = _value[0];"
    def c_foreach((model, _value), (z, )):
        return "z = value;"
class sum(omega_op):
impl = numpy.sum
def grad(x, gz):
return fill(x, gz)
ifill = fill.inplace_version()
# class fill(omega_op):
# impl = lambda model, value: (model * 0) + value
class sum(elemwise):
    """Reduce a tensor to a 0-d array holding the sum of its elements.

    NOTE: shadows the builtin `sum` (pre-existing naming in this module).
    No `impl`/`grad` defined here — presumably supplied elsewhere; verify.
    """
    # '_'-prefixed _sum stays outside the loop and acts as the accumulator.
    def c_alloc((x, ), (_sum, )):
        # 0-d accumulator with the input's dtype.
        _sum.data = numpy.ndarray((), dtype = x.data.dtype)
    def c_init((x, ), (_sum, )):
        return "_sum[0] = 0;"
    def c_foreach((x, ), (_sum, )):
        return "_sum[0] += x;"
# class sum(omega_op):
# impl = numpy.sum
# def grad(x, gz):
# return fill(x, gz)
## Array slicing ##
......
try:
    # Use the already-compiled helper extension if it is importable.
    from cutils_ext import *
except ImportError:
    # First run: build a tiny weave extension exposing run_cthunk(cthunk),
    # which unwraps the PyCObject produced by omega_op.c_thunk and invokes
    # the compiled thunk with its stored descriptor pointer.
    from scipy import weave
    single_runner = """
if (!PyCObject_Check(py_cthunk)) {
PyErr_SetString(PyExc_ValueError,
"Argument to run_cthunk must be a PyCObject returned by the c_thunk method of an omega_op.");
return NULL;
}
int (*fn)(void*) = reinterpret_cast<int (*)(void*)>(PyCObject_AsVoidPtr(py_cthunk));
void* it = PyCObject_GetDesc(py_cthunk);
int failure = fn(it);
if (failure) {
return NULL;
}
"""
    # Sample value for weave's argument type inference — presumably any
    # object works since the C code type-checks py_cthunk itself; verify.
    cthunk = object()
    mod = weave.ext_tools.ext_module('cutils_ext')
    mod.add_function(weave.ext_tools.ext_function('run_cthunk', single_runner, ['cthunk']))
    mod.compile()
    from cutils_ext import *
......@@ -238,7 +238,7 @@ class PythonOp(Op):
if input not in exc:
self.check_input(input)
try:
results = self.impl(*[input.data for input in self.inputs])
results = self._impl()
except Exception, e:
print "Error in %s: %s" % (self, e)
raise
......@@ -250,7 +250,7 @@ class PythonOp(Op):
output.set_value(result)
def _perform(self):
results = self.impl(*[input.data for input in self.inputs])
results = self._impl()
if self.nout == 1:
self.out.set_value(results)
else:
......@@ -267,6 +267,9 @@ class PythonOp(Op):
raise Exception("Uncomputed input: %s in %s" % (input, self))
self.perform()
def _impl(self):
    # Apply this op's implementation to the current values of its inputs.
    return self.impl(*[input.data for input in self.inputs])
def impl(*args):
    # Hook: compute output value(s) from raw input values; subclasses override.
    raise NotImplementedError("This op has no implementation.")
......
......@@ -13,28 +13,33 @@ class omega_type_converter_extension:
return [(tvars['c_type'], tvars['name'], tvars['var_convert'])]
def format_provide(self, x):
return '%s %s = %s;' % x
return '%s %s = %s;\n' % x
def declaration_code(self, templatize = 0, inline = 0):
tvars = self.template_vars(inline=inline)
code = '%(py_var)s = %(var_lookup)s;\n' % tvars
code += '\n'.join([self.format_provide(export) for export in self.provides()])
code += ''.join([self.format_provide(export) for export in self.provides()])
return code
def struct_init_code(self):
    # C statement taking a reference on this variable's stored PyObject*.
    return "Py_INCREF(py_%s);" % self.name
def struct_cleanup_code(self):
    # C statement releasing the reference taken by struct_init_code.
    return "Py_DECREF(py_%s);" % self.name
def struct_members_code(self):
return '\n'.join(['%s_type %s;' % (name, name) for c_type, name, init in self.provides()])
res = "PyObject* py_%s;\n" % self.name
return res + ''.join(['%s_type %s;\n' % (name, name) for c_type, name, init in self.provides()])
def struct_import_code(self):
return '\n'.join(['__STRUCT_P->%s = %s;' % (name, name) for c_type, name, init in self.provides()])
res = "__STRUCT_P->py_%s = py_%s;\n" % (self.name, self.name)
return res + ''.join(['__STRUCT_P->%s = %s;\n' % (name, name) for c_type, name, init in self.provides()])
def struct_support_code(self):
    # Base converter contributes no extra support code.
    return ""
def struct_typedefs(self):
return "\n".join(["typedef %s %s_type;" % (c_type, name) for c_type, name, init in self.provides()])
# def struct_template_types(self):
# return [("typename %s_type" % name, ) for c_type, name, init in self.provides()]
return ''.join(["typedef %s %s_type;\n" % (c_type, name) for c_type, name, init in self.provides()])
class int_converter(omega_type_converter_extension, c_spec.int_converter):
......
Markdown format supported
0%
You are about to add 0 people to the discussion. Please proceed with caution.
Finish editing this comment first!
Register or sign in to comment