Merge pull request #4323 from abergeron/gpua_newelem

Use the new GpuElemwise from libgpuarray

Merge pull request #4323 from abergeron/gpua_newelem
a536464a · Frédéric Bastien · 57ffd6a0 · 0dbb97c6 · a536464a · a536464a
--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
@@ -42,7 +42,7 @@ register_transfer(transfer)

 def init_dev(dev, name=None):
    v = pygpu.gpuarray.api_version()
-    if v[0] != -10000:
+    if v[0] != -9999:
        raise RuntimeError("Wrong major API version for gpuarray:", v[0],
                           "Make sure Theano and libgpuarray/pygpu "
                           "are in sync.")

--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
 from __future__ import absolute_import, print_function, division

 import os
-import copy

 import numpy
 from six import integer_types
 from six.moves import StringIO

-import theano
 from theano import tensor, gof
 from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
-import theano.tensor.inplace

 try:
    import pygpu
@@ -18,10 +15,9 @@ try:
 except ImportError:
    pass

-from .type import GpuArrayType
+from .type import GpuArrayType, gpu_context_type
 from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
                        infer_context_name)
-from .elemwise import GpuElemwise


 class GpuSubtensor(HideC, Subtensor):
@@ -168,7 +164,7 @@ class GpuSubtensor(HideC, Subtensor):
        return (6,)


-class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
+class GpuIncSubtensor(IncSubtensor):
    """
    Implement IncSubtensor on the gpu.

@@ -181,45 +177,20 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
    :meth:`copy_of_x`, etc. specialize the c_code for this Op.

    """
-
-    @property
-    def _f16_ok(self):
-        return self.iadd_node.op._f16_ok
-
-    def c_headers(self):
-        return self.iadd_node.op.c_headers()
-
-    def c_init_code(self):
-        return self.iadd_node.op.c_init_code()
-
-    def gpu_kernels(self, node, nodename):
-        subname = nodename + "_add_to_zview"
-        return self.iadd_node.op.gpu_kernels(self.iadd_node, subname)
+    _f16_ok = True
+    params_type = gpu_context_type

    def make_node(self, x, y, *inputs):
        ctx_name = infer_context_name(x, y)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
-        op = copy.copy(self)
-        ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
-        op.create_iadd_node(ret)
+        ret = gof.Apply(self, [x, y] + rval.inputs[2:], [x.type()])
        return ret

    def get_params(self, node):
        return node.outputs[0].type.context

-    def create_iadd_node(self, node):
-        # We store a iadd_node in the op that contain the info needed
-        # for the inplace add.
-        cop = theano.tensor.inplace.add_inplace
-        gop = GpuElemwise(cop.scalar_op, copy.copy(cop.inplace_pattern),
-                          "Gpu" + cop.name, cop.nfunc_spec)
-        y = node.inputs[1]
-        xview = y.type()
-        iadd_node = gop(xview, y).owner
-        self.iadd_node = iadd_node
-
    def perform(self, node, inputs, out_, ctx):
        out, = out_
        x, y = inputs[:2]
@@ -261,18 +232,6 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
                x.__setitem__(cdata, y)
        out[0] = x

-    def __setstate__(self, d):
-        self.__dict__.update(d)
-        owner = getattr(self, "owner", None)
-        if owner:
-            self.create_iadd_node(owner)
-
-    def __getstate__(self):
-        d = copy.copy(self.__dict__)
-        if "iadd_node" in d:
-            d.pop('iadd_node')
-        return d
-
    def do_type_checking(self, node):
        """
        Should raise NotImplementedError if c_code does not support
@@ -365,47 +324,52 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
        """
        return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()

+    def c_headers(self):
+        return ['<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>',
+                '<gpuarray/elemwise.h>']
+
    def c_support_code_struct(self, node, nodename):
-        gop = self.iadd_node.op
-        sub_name = nodename + "_add_to_zview"
-        ret = gop.c_support_code_struct(self.iadd_node, sub_name)
-        ret += """
-        PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
-                                                    PyGpuArrayObject* src){
-           PyGpuArrayObject* ret = NULL;
-        """ % locals()
-        inputs = ["dst", "src"]
-        outputs = ["ret"]
-        sub = {"fail": "return NULL;", "params": "dst->context"}
-        ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
-        ret += """
-            return ret;
+        return "\nGpuElemwise *iadd;\n"

+    def c_init_code_struct(self, node, name, sub):
+        return """
+        gpuelemwise_arg args[2] = {{0}};
+        args[0].name = "a";
+        args[0].typecode = %(type1)s;
+        args[0].flags = GE_READ|GE_WRITE;
+        args[1].name = "b";
+        args[1].typecode = %(type2)s;
+        args[1].flags = GE_READ;
+        iadd = GpuElemwise_new(%(ctx)s->ops, %(ctx)s->ctx, "", "a += b",
+                               2, args, %(nd)s, 0);
+        if (iadd == NULL) {
+          PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
+          %(fail)s
        }
-        """
-        return ret
+        """ % dict(ctx=sub['params'], fail=sub['fail'],
+                   type1=node.inputs[0].type.typecode,
+                   type2=node.inputs[1].type.typecode,
+                   nd=node.inputs[1].ndim)

    def add_to_zview(self, nodename, x, fail):
        return """
-        PyGpuArrayObject * add_result = inc_sub_iadd_%(nodename)s(zview, %(x)s);
-
-        if (! add_result )
        {
+          void *args[2];
+          args[0] = &zview->ga;
+          args[1] = &%(x)s->ga;
+          if (GpuElemwise_call(iadd, args, GE_BROADCAST) != GA_NO_ERROR) {
+            PyErr_SetString(PyExc_RuntimeError, "Error doing inplace add");
            Py_DECREF(zview);
-            %(fail)s;
-        }
-        else
-        {
-            Py_DECREF(add_result);
+            %(fail)s
+          }
        }
        """ % locals()

    def c_code_cache_version(self):
        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
-        elemwise_version = self.iadd_node.c_code_cache_version()
-        if not parent_version or not elemwise_version:
+        if not parent_version:
            return
-        return parent_version + elemwise_version + (3,)
+        return parent_version + (5,)


 class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):

--- a/theano/sandbox/gpuarray/tests/test_elemwise.py
+++ b/theano/sandbox/gpuarray/tests/test_elemwise.py
@@ -18,40 +18,18 @@ from pygpu import ndgpuarray as gpuarray

 # This is acutally a test for GpuElemwise
 class test_gpu_Broadcast(test_elemwise.test_Broadcast):
-    op = GpuElemwise
-    type = GpuArrayType
    cop = GpuElemwise
    ctype = GpuArrayType
    # The order is important
    linkers = [gof.PerformLinker, gof.CLinker]

-    def setUp(self):
-        if get_context(test_ctx_name).kind != 'cuda':
-            self.linkers = [gof.PerformLinker]
-
-    def rand_val(self, shp):
-        return rand_gpuarray(*shp, **dict(cls=gpuarray))
-
    def rand_cval(self, shp):
        return rand_gpuarray(*shp, **dict(cls=gpuarray))

-    def test_c(self):
-        if get_context(test_ctx_name).kind != 'cuda':
-            raise SkipTest("Cuda specific tests")
-        super(test_gpu_Broadcast, self).test_c()
-
-    def test_c_inplace(self):
-        if get_context(test_ctx_name).kind != 'cuda':
-            raise SkipTest("Cuda specific tests")
-        super(test_gpu_Broadcast, self).test_c_inplace()
-

 def test_elemwise_pow():
    # Test that GpuElemwise(pow) can compile with any combination of integer
    # or float input dtype.
-    if get_context(test_ctx_name).kind != 'cuda':
-        raise SkipTest("Cuda specific tests")
-
    dtypes = ["uint8", "uint16", "uint32", "uint64",
              "int8", "int16", "int32", "int64",
              "float16", "float32", "float64"]
@@ -65,10 +43,10 @@ def test_elemwise_pow():
            output = base ** exp
            f = theano.function([base, exp], output)

-            # Call the function to make sure the output is valid
            base_val = numpy.random.randint(0, 5, size=10).astype(dtype_base)
            exp_val = numpy.random.randint(0, 3, size=10).astype(dtype_exp)

+            # Call the function to make sure the output is valid
            out = f(base_val, exp_val)
            expected_out = base_val ** exp_val
            assert_allclose(out, expected_out)

--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
@@ -166,10 +166,12 @@ class test_Broadcast(unittest.TestCase):
    linkers = [gof.PerformLinker, gof.CLinker]

    def rand_val(self, shp):
-        return numpy.asarray(numpy.random.rand(*shp))
+        return numpy.asarray(numpy.random.rand(*shp),
+                             dtype=theano.config.floatX)

    def rand_cval(self, shp):
-        return numpy.asarray(numpy.random.rand(*shp))
+        return numpy.asarray(numpy.random.rand(*shp),
+                             dtype=theano.config.floatX)

    def setUp(self):
        unittest_tools.seed_rng()
@@ -189,8 +191,10 @@ class test_Broadcast(unittest.TestCase):
                         ((2, 3, 4, 5), (1, 3, 1, 5)),
                         ((2, 3, 4, 5), (1, 1, 1, 1)),
                         ((), ())]:
-            x = type('float64', [(entry == 1) for entry in xsh])('x')
-            y = type('float64', [(entry == 1) for entry in ysh])('y')
+            x = type(theano.config.floatX,
+                     [(entry == 1) for entry in xsh])('x')
+            y = type(theano.config.floatX,
+                     [(entry == 1) for entry in ysh])('y')
            e = op(scalar.add)(x, y)
            f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function()
            xv = rand_val(xsh)
@@ -202,8 +206,10 @@ class test_Broadcast(unittest.TestCase):
            # test Elemwise.infer_shape
            # the Shape op don't implement c_code!
            if isinstance(linker, gof.PerformLinker):
-                x = type('float64', [(entry == 1) for entry in xsh])('x')
-                y = type('float64', [(entry == 1) for entry in ysh])('y')
+                x = type(theano.config.floatX,
+                         [(entry == 1) for entry in xsh])('x')
+                y = type(theano.config.floatX,
+                         [(entry == 1) for entry in ysh])('y')
                e = op(scalar.add)(x, y)
                f = copy(linker).accept(FunctionGraph(
                    [x, y], [e.shape])).make_function()
@@ -218,8 +224,10 @@ class test_Broadcast(unittest.TestCase):
                         ((2, 3, 4, 5), (1, 3, 1, 5)),
                         ((2, 3, 4, 5), (1, 1, 1, 1)),
                         ((), ())]:
-            x = type('float64', [(entry == 1) for entry in xsh])('x')
-            y = type('float64', [(entry == 1) for entry in ysh])('y')
+            x = type(theano.config.floatX,
+                     [(entry == 1) for entry in xsh])('x')
+            y = type(theano.config.floatX,
+                     [(entry == 1) for entry in ysh])('y')
            e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
            f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function()
            xv = rand_val(xsh)
@@ -232,8 +240,10 @@ class test_Broadcast(unittest.TestCase):
            # test Elemwise.infer_shape
            # the Shape op don't implement c_code!
            if isinstance(linker, gof.PerformLinker):
-                x = type('float64', [(entry == 1) for entry in xsh])('x')
-                y = type('float64', [(entry == 1) for entry in ysh])('y')
+                x = type(theano.config.floatX,
+                         [(entry == 1) for entry in xsh])('x')
+                y = type(theano.config.floatX,
+                         [(entry == 1) for entry in ysh])('y')
                e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
                f = copy(linker).accept(FunctionGraph(
                    [x, y], [e.shape])).make_function()
@@ -267,13 +277,15 @@ class test_Broadcast(unittest.TestCase):
    def test_fill(self):
        if not theano.config.cxx:
            raise SkipTest("G++ not available, so we need to skip this test.")
-        x = self.ctype('float64', [0, 0])('x')
-        y = self.ctype('float64', [1, 1])('y')
-        for linker, op in zip(self.linkers, [self.op, self.cop]):
+        for linker, op, t, rval in zip(self.linkers, [self.op, self.cop],
+                                       [self.type, self.ctype],
+                                       [self.rand_val, self.rand_cval]):
+            x = t(theano.config.floatX, [0, 0])('x')
+            y = t(theano.config.floatX, [1, 1])('y')
            e = op(scalar.Second(scalar.transfer_type(0)), {0: 0})(x, y)
            f = linker().accept(FunctionGraph([x, y], [e])).make_function()
-            xv = self.rand_cval((5, 5))
-            yv = self.rand_cval((1, 1))
+            xv = rval((5, 5))
+            yv = rval((1, 1))
            f(xv, yv)
            assert (xv == yv).all()

@@ -292,24 +304,28 @@ class test_Broadcast(unittest.TestCase):
    def test_weird_strides(self):
        if not theano.config.cxx:
            raise SkipTest("G++ not available, so we need to skip this test.")
-        x = self.ctype('float64', [0, 0, 0, 0, 0])('x')
-        y = self.ctype('float64', [0, 0, 0, 0, 0])('y')
-        for linker, op in zip(self.linkers, [self.op, self.cop]):
+        for linker, op, t, rval in zip(self.linkers, [self.op, self.cop],
+                                       [self.type, self.ctype],
+                                       [self.rand_val, self.rand_cval]):
+            x = t(theano.config.floatX, [0, 0, 0, 0, 0])('x')
+            y = t(theano.config.floatX, [0, 0, 0, 0, 0])('y')
            e = op(scalar.add)(x, y)
            f = linker().accept(FunctionGraph([x, y], [e])).make_function()
-            xv = self.rand_cval((2, 2, 2, 2, 2))
-            yv = self.rand_cval((2, 2, 2, 2, 2)).transpose(4, 0, 3, 1, 2)
+            xv = rval((2, 2, 2, 2, 2))
+            yv = rval((2, 2, 2, 2, 2)).transpose(4, 0, 3, 1, 2)
            zv = xv + yv
            assert (f(xv, yv) == zv).all()

    def test_same_inputs(self):
        if not theano.config.cxx:
            raise SkipTest("G++ not available, so we need to skip this test.")
-        x = self.ctype('float64', [0, 0])('x')
-        for linker, op in zip(self.linkers, [self.op, self.cop]):
+        for linker, op, t, rval in zip(self.linkers, [self.op, self.cop],
+                                       [self.type, self.ctype],
+                                       [self.rand_val, self.rand_cval]):
+            x = t(theano.config.floatX, [0, 0])('x')
            e = op(scalar.add)(x, x)
            f = linker().accept(FunctionGraph([x], [e])).make_function()
-            xv = self.rand_cval((2, 2))
+            xv = rval((2, 2))
            zv = xv + xv
            assert (f(xv) == zv).all()