Do the same on the new GPU back-end (gpuarray)

1296be25 · Frederic · 4f06e78d · 1296be25 · 1296be25
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -20,7 +20,8 @@ from theano.gof.python25 import all, any
 from theano.tensor.nnet.conv import ConvOp
 from theano.sandbox.gpuarray.type import GpuArrayType
 from theano.sandbox.gpuarray.basic_ops import (
-    host_from_gpu, gpu_from_host, HostFromGpu, GpuSplit,
+    host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost,
+    GpuSplit,
    gpu_alloc, GpuAlloc, GpuReshape, GpuEye, gpu_join, GpuJoin,
 )
 from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
@@ -342,6 +343,21 @@ def local_gpua_split(node):
 @register_opt('fast_compile')
 @op_lifter([tensor.Subtensor])
 def local_gpua_subtensor(node):
+    x = node.inputs[0]
+    if (x.owner and isinstance(x.owner.op, HostFromGpu)):
+        gpu_x = x.owner.inputs[0]
+        if (gpu_x.owner and
+            isinstance(gpu_x.owner.op, GpuFromHost) and
+            # And it is a shared var or an input of the graph.
+            not gpu_x.owner.inputs[0].owner):
+            if len(x.clients) == 1:
+                if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
+                                              for v in n.inputs + n.outputs])
+                        for n,_  in node.outputs[0].clients]):
+                    return
+                else:
+                    return [host_from_gpu(gpu_from_host(node.outputs[0]))]
    return GpuSubtensor(node.op.idx_list)

--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
@@ -10,6 +10,7 @@ from theano.sandbox.gpuarray.basic_ops import (
    GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu)
 from theano.sandbox.gpuarray.elemwise import (
    GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise)
+from theano.sandbox.gpuarray.subtensor import GpuSubtensor
 from theano.sandbox.gpuarray.tests.test_basic_ops import (
    rand_gpuarray, mode_with_gpu, mode_without_gpu
    )
@@ -164,3 +165,44 @@ def test_local_gpu_elemwise_careduce():
    assert len(topo) == 3
    assert topo[1].op.pre_scalar_op == theano.scalar.sqr
    f(numpy.random.rand(3, 4).astype(theano.config.floatX))
+def test_local_gpu_subtensor():
+    # Test shared forced on CPU.
+    t = tensor._shared(numpy.zeros(20, "float32"))
+    f = theano.function([], t[3:4], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert any([type(node.op) is tensor.Subtensor for node in topo])
+    assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
+    # Test graph input.
+    t = tensor.fmatrix()
+    f = theano.function([t], t[3:4], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert any([type(node.op) is tensor.Subtensor for node in topo])
+    assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
+    # Test multiple uses of the input.
+    # We want the subtensor to be on the GPU to prevent multiple transfers.
+    t = tensor.fmatrix()
+    f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert not any([type(node.op) is tensor.Subtensor for node in topo])
+    assert any([isinstance(node.op, GpuSubtensor) for node in topo])
+    # Test multiple uses of the input + input as output.
+    # We want the subtensor to be on the GPU to prevent multiple transfers.
+    t = tensor.fmatrix()
+    f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert not any([type(node.op) is tensor.Subtensor for node in topo])
+    assert any([isinstance(node.op, GpuSubtensor) for node in topo])
+    # Test shared forced on CPU and we do computation on the output of
+    # the subtensor.
+    t = tensor._shared(numpy.zeros(20, "float32"))
+    f = theano.function([], t[3:4]+1, mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert any([type(node.op) is tensor.Subtensor for node in topo])
+    assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
+    assert any([isinstance(node.op, GpuElemwise) for node in topo])