Add back the backward flow (an output consumed by GpuFromHost) in the local_gpu_split optimization

6b4c592f · Frederic · 102fb5e1 · 6b4c592f · 6b4c592f
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -304,7 +304,11 @@ def local_gpu_elemwise_1(node):
 def local_gpu_split(node):
    if isinstance(node.op, tensor.Split):
        input = node.inputs[0]
-        if input.owner and isinstance(input.owner.op, HostFromGpu):
+        outs_clients = reduce(list.__add__,
+                              [out.clients for out in node.outputs])
+        if (input.owner and isinstance(input.owner.op, HostFromGpu) or
+            any([c != 'output' and isinstance(c.op, GpuFromHost) for c, idx
+                 in outs_clients])):
            new_op = GpuSplit(node.op.len_splits)
            split_res = new_op(gpu_from_host(input), *node.inputs[1:])
            return [host_from_gpu(o) for o in split_res]

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -289,7 +289,7 @@ def test_local_gpu_subtensor():
    assert any([isinstance(node.op, cuda.GpuElemwise) for node in topo])
-def test_local_split():
+def test_local_gpu_split():
    """ Test that the GpuSplit op is being applied and works """
    # Construct symbolic split
    x = tensor.fvector()
@@ -310,6 +310,17 @@ def test_local_split():
    # Check equality
    assert all([(cpu == gpu).all() for cpu, gpu in zip(cpu_res, gpu_res)])
+    # Test the other path of the optimizer: here it is an output of the
+    # split, rather than its input, that is moved to the GPU.
+    ra = cuda.gpu_from_host(ra)
+    f = theano.function([x, splits], [ra, rb, rc],
+                        mode=mode_with_gpu.excluding("InputToGpuOptimizer"))
+    gpu_res = f([0, 1, 2, 3, 4, 5], [3, 2, 1])
+    l = f.maker.fgraph.toposort()
+    assert any([isinstance(o.op, theano.sandbox.cuda.GpuSplit) for o in l])
+    # Check equality
+    assert all([(cpu == gpu).all() for cpu, gpu in zip(cpu_res, gpu_res)])
 def test_print_op():
    """ Test that print ops don't block gpu optimization"""