Rewrite GpuElemwise to be more like the tensor version and much cleaner.

644dbb6c · Arnaud Bergeron · d5bdbb4a · 644dbb6c
--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
+from itertools import izip
+
 import numpy
 from theano import Op, Apply, scalar
+from theano.tensor.elemwise import Elemwise

 try:
+    import pygpu
    from pygpu.tools import ScalarArg, ArrayArg
    from pygpu.elemwise import ElemwiseKernel
 except ImportError:
    pass

-from basic_ops import as_gpuarray_variable
-from type import GpuArrayType
+from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable
+from theano.sandbox.gpuarray.type import GpuArrayType

 from theano.gof.utils import MethodNotDefined

@@ -21,124 +25,140 @@ def make_argument(v, name):
    else:
        return ArrayArg(numpy.dtype(v.type.dtype), name)

-def ensure_out(o, ref):
-    if o is None:
-        return ref._empty_like_me()
-    else:
-        return o
-
-class GpuElemwise(Op):
+def ensure_allocated(storage, shape, dtype):
+    odat = storage[0]
+    if odat is not None:
+        if odat.shape != shape:
+            # It is unsafe to try to resize odat, so drop it
+            # and allocate fresh output storage below.
+            odat = None
+    if odat is None:
+        odat = pygpu.empty(shape, dtype=dtype)
+    storage[0] = odat
+    return odat
+
+def as_C_string_const(s):
+    return '\n'.join('"%s\\n"' % (l.replace('"', '\\"'))
+                     for l in s.split('\n'))
+
+class GpuElemwise(Elemwise):
    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)

-    def __init__(self, scalar_op):
-        self.scalar_op = scalar_op
-        self.destroy_map = {}
-
-    def __getstate__(self):
-        d = copy.copy(self.__dict__)
-        d.pop('__epydoc_asRoutine', None)
-        d.pop('_hashval')
-        return d
-
-    def __setstate__(self, d):
-        self.__dict__.update(d)
-        self._rehash()
-
-    def __eq__(self, other):
-        return (type(self) == type(other) and
-                self.scalar_op == other.scalar_op)
-
-    def __hash__(self):
-        return hash(type(self)) ^ hash(self.scalar_op)
+    def __init__(self, scalar_op, name=None, nfunc_spec=None):
+        # In-place is not supported: claiming it would be misleading,
+        # since the scalar_op code never modifies anything in place.
+        Elemwise.__init__(self, scalar_op, inplace_pattern=None, name=name,
+                          nfunc_spec=nfunc_spec)

    def __str__(self):
-        return "GpuElemwise{%s}(gpuarray)" % (self.scalar_op,)
+        if self.name is not None:
+            return self.name
+        return "GpuElemwise{%s}<gpuarray>" % (self.scalar_op,)

    def make_node(self, *inputs):
-        _inputs = [as_gpuarray_variable(i) for i in inputs]
-        if self.nin > 0 and len(_inputs) != self.nin:
-            raise TypeError("Wrong argument count", (self.nin, len(_inputs)))
-        for i in _inputs[1:]:
-            if i.type.ndim != inputs[0].type.ndim:
-                raise TypeError('mismatched rank amongst inputs')
-
-        broadcastable = []
-        for d in xrange(_inputs[0].type.ndim):
-            bcast_d = True
-            for i in _inputs:
-                if not i.type.broadcastable[d]:
-                    bcast_d = False
-                    break
-            broadcastable.append(bcast_d)
-        assert len(broadcastable) == _inputs[0].type.ndim
-
-        assert self.nout > 0
+        res = Elemwise.make_node(self, *inputs)
+        outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
+                                dtype=o.type.dtype)() for o in res.outputs]
+        inputs = [as_gpuarray_variable(i) for i in inputs]
+        res = Apply(self, inputs, outputs)
+        # Build the kernel now so any SupportCodeError surfaces in make_node
+        k = self.generate_kernel(res, 'test')
+        return res
+
+    def generate_kernel(self, node, nodename):
        inps = [make_argument(i, 'i%d' % (n,)) for n, i in
-                enumerate(inputs)]
-        scal_ins = [scalar.Scalar(i.dtype) for i in inputs]
-                          
-        res = Apply(self, _inputs, 
-                    [GpuArrayType(o.dtype, broadcastable)()
-                     for o in self.scalar_op.output_types(scal_ins)])
+                enumerate(node.inputs)]
+        scal_ins = [scalar.Scalar(i.dtype) for i in node.inputs]

        outs = [make_argument(o, 'o%d' % (n,)) for n, o in
-                enumerate(res.outputs)]
-        scal_out = [scalar.Scalar(o.dtype) for o in res.outputs]
+                enumerate(node.outputs)]
+        scal_out = [scalar.Scalar(o.dtype) for o in node.outputs]

        fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
                          [o() for o in scal_out])

-        kcode = self.scalar_op.c_code(fake_node, 'kcode',
-                                      [i.expr() for i in inps],
-                                      [o.expr() for o in outs],
-                                      sub=dict(fail='return;'))
-        res.tag.kcode = kcode
-
-# Translate types for scalar composite ops (except complex).
-        support_code = """
-#define npy_float64 ga_double
-#define npy_float32 ga_float
-#define npy_uint8 ga_ubyte
-#define npy_int8 ga_byte
-#define npy_uint16 ga_ushort
-#define npy_int16 ga_short
-#define npy_uint32 ga_uint
-#define npy_int32 ga_int
-#define npy_uint64 ga_ulong
-#define npy_int64 ga_long
-"""
        try:
-            code = self.scalar_op.c_support_code_apply(fake_node, 'kcode')
+            code = self.scalar_op.c_support_code_apply(fake_node, nodename)
            if code:
-                raise SupportCodeError()
+                raise SupportCodeError(code)
        except MethodNotDefined:
            pass

        support_code = ""
        try:
-            support_code += self.scalar_op.c_support_code()
+            support_code = self.scalar_op.c_support_code()
        except MethodNotDefined:
            pass

-        if support_code != "#define THEANO_MACRO_MOD(x,y) (x % y)":
-            # Avoid the C++ complex struct
-            raise SupportCodeError()
+        if (support_code != "#define THEANO_MACRO_MOD(x,y) (x % y)" and
+            support_code != ""):
+            # Only the MOD macro is accepted; anything else (C++ struct) is not.
+            raise SupportCodeError(support_code)

-        k = ElemwiseKernel(None, inps+outs, kcode, preamble=support_code)
-        res.tag.kernel = k
-
-        return res
+        kop = self.scalar_op.c_code(fake_node, nodename+'_scalar',
+                                    [i.name+'[i]' for i in inps],
+                                    [o.name+'[i]' for o in outs],
+                                    dict(fail='return;'))

-    def perform(self, node, inps, out):
-        k = node.tag.kernel
-        outs = [ensure_out(o[0], inps[0]) for o in out]
-
-        # the dict call is there to avoid syntax error in python <= 2.5
-        k(*(inps+outs), **dict(broadcast=True))
+        # Translate types for scalar composite ops (except complex).
+        support_code += """
+#define npy_float64 ga_double
+#define npy_float32 ga_float
+#define npy_uint8 ga_ubyte
+#define npy_int8 ga_byte
+#define npy_uint16 ga_ushort
+#define npy_int16 ga_short
+#define npy_uint32 ga_uint
+#define npy_int32 ga_int
+#define npy_uint64 ga_ulong
+#define npy_int64 ga_long
+"""
+        return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
+
+    def c_support_code_apply(self, node, nodename):
+        # This is useless by itself, but will serve an eventual c_code
+        # implementation
+        k = self.generate_kernel(node, nodename)
+
+        nd = node.inputs[0].type.ndim
+        res = []
+        for i in range(1, nd):
+            var = "static const char %s_%s[] = " % (nodename, str(i))
+            res.append(var + as_C_string_const(k.render_basic(i)) + ';')
+            res.append("static const gpukernel *%s_%s_k = NULL;" % (nodename,
+                                                                    str(i)))
+        var = "static const char %s_c[] = " % (nodename,)
+        res.append(var + as_C_string_const(k.contig_src) + ';')
+        res.append("static const gpukernel *%s_c_k = NULL;" % (nodename,))
+        return '\n'.join(res)
+
+    def c_code(self, *args):
+        # do not pick up the Elemwise version
+        raise MethodNotDefined('c_code')
+
+    def perform(self, node, inputs, output_storage):
+        # Try to reuse the kernel from a previous call to hopefully
+        # avoid recompiling
+        if not hasattr(node, '_cache_elemwise_k'):
+            node._cache_elemwise_k = self.generate_kernel(node, "kcode")
+
+        out_shape = []
+        for values in izip(*[input.shape for input in inputs]):
+            if any(v == 0 for v in values):
+                # Every input must have length 0 or 1 (broadcast) here
+                assert max(values) <= 1
+                out_shape.append(0)
+            else:
+                out_shape.append(max(values))
+        out_shape = tuple(out_shape)
+
+        outs = [ensure_allocated(storage, out_shape, output.type.dtype)
+                for output, storage in izip(node.outputs, output_storage)]
+
+        # the dict call is there to avoid a syntax error in python < 2.6
+        node._cache_elemwise_k(*(inputs+outs), **dict(broadcast=True))

-        for o, og in zip(out, outs):
-            o[0] = og

 class SupportCodeError(Exception):
    """