提交 a579c1eb authored 作者: Frederic's avatar Frederic 提交者: Arnaud Bergeron

Make opt not crash with multi output CPU elemwise

上级 6319e9dc
...@@ -279,7 +279,8 @@ def local_gpu_elemwise_0(node): ...@@ -279,7 +279,8 @@ def local_gpu_elemwise_0(node):
# TODO: change this when fusion makes Elemwise with # TODO: change this when fusion makes Elemwise with
# multiple outputs # multiple outputs
gpu_elemwise = new_op(*(gpu_from_host(i) gpu_elemwise = new_op(*(gpu_from_host(i)
for i in node.inputs)) for i in node.inputs),
return_list=True)
# case 2 - it is still ok if some inputs were upcast to float32 # case 2 - it is still ok if some inputs were upcast to float32
elif all([i.type.dtype in upcastable elif all([i.type.dtype in upcastable
for i in node.inputs]): for i in node.inputs]):
...@@ -292,18 +293,19 @@ def local_gpu_elemwise_0(node): ...@@ -292,18 +293,19 @@ def local_gpu_elemwise_0(node):
new_inputs = [gpu_from_host(tensor.cast(i, 'float32')) new_inputs = [gpu_from_host(tensor.cast(i, 'float32'))
for i in node.inputs] for i in node.inputs]
gpu_elemwise = new_op(*new_inputs) gpu_elemwise = new_op(*new_inputs, return_list=True)
else: else:
return False return False
else: else:
return False return False
gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner) gpu_elemwise = split_huge_add_or_mul(gpu_elemwise[0].owner)
if not gpu_elemwise: if not gpu_elemwise:
return False return False
if max_inputs_to_GpuElemwise(node) < len(gpu_elemwise.inputs): if (max_inputs_to_GpuElemwise(node) <
len(gpu_elemwise.inputs)):
return False return False
return [host_from_gpu(gpu_elemwise.outputs[0])] return [host_from_gpu(out) for out in gpu_elemwise.outputs]
@register_opt() @register_opt()
......
...@@ -618,6 +618,15 @@ def test_local_gpu_elemwise_0(): ...@@ -618,6 +618,15 @@ def test_local_gpu_elemwise_0():
assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1 assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
f(a_v, b_v, c_v) f(a_v, b_v, c_v)
# Test multiple output
out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
outs_op = tensor.Elemwise(out_s)
f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
f(a_v, b_v, c_v)
def test_elemwise_fusion(): def test_elemwise_fusion():
""" Test the the GpuElemwise fusion work correctly""" """ Test the the GpuElemwise fusion work correctly"""
......
...@@ -255,3 +255,46 @@ def test_local_gpu_subtensor(): ...@@ -255,3 +255,46 @@ def test_local_gpu_subtensor():
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert any([isinstance(node.op, GpuElemwise) for node in topo]) assert any([isinstance(node.op, GpuElemwise) for node in topo])
def test_local_gpu_elemwise():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
    """
    def check_all_on_gpu(fn):
        # Every Elemwise in the compiled graph must have been moved to
        # the GPU: exactly one GpuElemwise, zero plain CPU Elemwise.
        nodes = fn.maker.fgraph.toposort()
        assert sum(isinstance(n.op, GpuElemwise) for n in nodes) == 1
        assert sum(type(n.op) == tensor.Elemwise for n in nodes) == 0

    mat_i8 = tensor.bmatrix()
    mat_f32_a = tensor.fmatrix()
    mat_f32_b = tensor.fmatrix()

    val_i8 = (numpy.random.rand(4, 5) * 10).astype("int8")
    val_f32_a = (numpy.random.rand(4, 5) * 10).astype("float32")
    val_f32_b = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    fn = theano.function([mat_i8, mat_f32_a, mat_f32_b],
                         [mat_i8 + mat_f32_a + mat_f32_b],
                         mode=mode_with_gpu)
    check_all_on_gpu(fn)
    fn(val_i8, val_f32_a, val_f32_b)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    scal_i8 = theano.scalar.int8()
    scal_f32_a = theano.scalar.float32()
    scal_f32_b = theano.scalar.float32()
    composite = theano.scalar.Composite(
        [scal_i8, scal_f32_a, scal_f32_b],
        [scal_i8 + scal_f32_a + scal_f32_b])
    fn = theano.function([mat_i8, mat_f32_a, mat_f32_b],
                         [tensor.Elemwise(composite)(mat_i8, mat_f32_a,
                                                     mat_f32_b)],
                         mode=mode_with_gpu)
    check_all_on_gpu(fn)
    fn(val_i8, val_f32_a, val_f32_b)

    # Test multiple output
    composite = theano.scalar.Composite(
        [scal_i8, scal_f32_a, scal_f32_b],
        [scal_i8 + scal_f32_a, scal_i8 * scal_f32_a])
    fn = theano.function([mat_i8, mat_f32_a, mat_f32_b],
                         tensor.Elemwise(composite)(mat_i8, mat_f32_a,
                                                    mat_f32_b),
                         mode=mode_with_gpu)
    check_all_on_gpu(fn)
    fn(val_i8, val_f32_a, val_f32_b)
...@@ -296,6 +296,10 @@ def inplace_elemwise_optimizer_op(OP): ...@@ -296,6 +296,10 @@ def inplace_elemwise_optimizer_op(OP):
# gpuarray GpuElemwise inherit from Elemwise # gpuarray GpuElemwise inherit from Elemwise
if not type(op) == OP: if not type(op) == OP:
continue continue
# TODO support this case
if len(node.outputs) > 1:
return
baseline = op.inplace_pattern baseline = op.inplace_pattern
protected_inputs = [ protected_inputs = [
f.protected for f in node.fgraph._features if f.protected for f in node.fgraph._features if
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论