Make GpuElemwise work with multiple outputs (the new back-end raises an error)

41a8e89b · Frederic · Arnaud Bergeron · 2595fea0 · 41a8e89b · 41a8e89b
--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
@@ -66,7 +66,7 @@ class NaiveAlgo(object):
    def cache_version(self):
        ver = self.scalar_op.c_code_cache_version()
        if ver:
-            return (17, self.verbose, self.sync, ver)
+            return (18, self.verbose, self.sync, ver)
        else:
            return ver
@@ -142,6 +142,8 @@ class NaiveAlgo(object):
        # perform the scalar operation on the input and output references
        # TODO: What if the scalar_op needs support_code??
+        for ipos, i in enumerate(node.outputs):
+            print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
        task_code = self.scalar_op.c_code(
            Apply(self.scalar_op,
                  [scalar.Scalar(dtype=input.type.dtype).make_variable()
@@ -150,9 +152,11 @@ class NaiveAlgo(object):
                   for output in node.outputs]),
            nodename + '_scalar_',
            get_str_list_logical_scalar(node),
-            ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)],
+            ['o%i_i' % ipos for ipos, i in enumerate(node.outputs)],
            sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
        print("       ", task_code, file=sio)
+        for ipos, _ in enumerate(node.outputs):
+            print("o%i_data[i] = o%i_i;" % (ipos, ipos), file=sio)
        print("    }", file=sio)
        #indent = " "*(4*d+7)
@@ -477,6 +481,8 @@ class NaiveAlgo(object):
        print("    for (int i = idx; i < numEls; i += numThreads) {", file=sio)
        # perform the scalar operation on the input and output references
        # TODO: What if the scalar_op needs support_code??
+        for ipos, i in enumerate(node.outputs):
+            print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
        task_code = self.scalar_op.c_code(
                Apply(self.scalar_op,
                    [scalar.Scalar(dtype=input.type.dtype).make_variable()
@@ -486,9 +492,11 @@ class NaiveAlgo(object):
                , nodename + '_scalar_'
                #, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)]
                , get_str_list_logical_scalar(node, data_str='i%i_data[i]')
-                , ['o%i_data[i]'%ipos for ipos, i in enumerate(node.outputs)]
+                , ['o%i_i'%ipos for ipos, i in enumerate(node.outputs)]
                , sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
        print("       ", task_code, file=sio)
+        for ipos, _ in enumerate(node.outputs):
+            print("o%i_data[i] = o%i_i;" % (ipos, ipos), file=sio)
        print("    }", file=sio)
        print("}", file=sio)

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -618,16 +618,40 @@ def test_local_gpu_elemwise_0():
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
+    # Test multiple output
+    a_s = theano.scalar.float32()
+    a = tensor.fmatrix()
+    from theano.scalar.basic import identity
+    out_s = theano.scalar.Composite([a_s, b_s, c_s],
+                                    [identity(a_s), identity(c_s), identity(b_s)])
+    outs_op = tensor.Elemwise(out_s)
+    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
+    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
+    out = f(a_v, b_v, c_v)
+    utt.assert_allclose(out[0], a_v)
+    utt.assert_allclose(out[1], c_v)
+    utt.assert_allclose(out[2], b_v)
    # Test multiple output
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * c_s])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
-    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
+    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
-    utt.assert_allclose(out[1], a_v + c_v)
+    utt.assert_allclose(out[1], a_v * c_v)
+    # Test non-contiguous input
+    c = cuda.shared_constructor(c_v)
+    f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
+                        mode=mode_with_gpu)
+    out = f(a_v, b_v)
+    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
+    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
 def test_elemwise_fusion():

--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
@@ -72,6 +72,8 @@ class GpuElemwise(HideC, Elemwise):
        res = Elemwise.make_node(self, *inputs)
        outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
                                dtype=o.type.dtype)() for o in res.outputs]
+        if len(outputs) > 1:
+            raise NotImplementedError()
        inputs = [as_gpuarray_variable(i) for i in inputs]
        node = Apply(self, inputs, outputs)

--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
@@ -292,6 +292,23 @@ def test_local_gpu_elemwise():
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
+    return  # Not yet implemented
+    # Test multiple output
+    a_s = theano.scalar.float32()
+    a = tensor.fmatrix()
+    from theano.scalar.basic import identity
+    out_s = theano.scalar.Composite([a_s, b_s, c_s],
+                                    [identity(a_s), identity(c_s), identity(b_s)])
+    outs_op = tensor.Elemwise(out_s)
+    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
+    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
+    out = f(a_v, b_v, c_v)
+    utt.assert_allclose(out[0], a_v)
+    utt.assert_allclose(out[1], c_v)
+    utt.assert_allclose(out[2], b_v)
    # Test multiple output
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
    outs_op = tensor.Elemwise(out_s)
@@ -302,4 +319,12 @@ def test_local_gpu_elemwise():
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
-    utt.assert_allclose(out[1], a_v + c_v)
+    utt.assert_allclose(out[1], a_v * c_v)
+    # Test non-contiguous input
+    c = cuda.shared_constructor(numpy.asarray(c_v, dtype='float32'))
+    f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
+                        mode=mode_with_gpu)
+    out = f(a_v, b_v)
+    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
+    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])