Merge pull request #2329 from nouiz/gpu_subtensor

Do not move the GpuSubtensor to the GPU when this does not make sense

Merge pull request #2329 from nouiz/gpu_subtensor
1bafa2d4 · abergeron · 560fb116 · 1296be25 · 1bafa2d4 · 1bafa2d4
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -44,7 +44,7 @@ from theano.sandbox.cuda.elemwise import SupportCodeError
 from theano.scalar.basic_scipy import Erfinv
 from theano.sandbox.cuda.elemwise import erfinv_gpu
 from theano.sandbox.cuda.var import CudaNdarrayConstant
-from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt
+from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp
 from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano.tensor.blas import _is_real_vector, _is_real_matrix
 from theano.tensor import nlinalg
@@ -832,6 +832,11 @@ def local_gpu_subtensor(node):
           isinstance(host_input.owner.op, tensor.Subtensor):
            subt = host_input.owner.op
            x = host_input.owner.inputs[0]
+            if len(x.clients) == 1:
+                # This means the input of the subtensor is used only by
+                # the subtensor. We do not want to move the subtensor
+                # to the GPU in that case.
+                return
            coords = host_input.owner.inputs[1:]
            return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)]
    if isinstance(node.op, tensor.Subtensor):
@@ -839,6 +844,19 @@ def local_gpu_subtensor(node):
        if (x.owner and
            isinstance(x.owner.op, HostFromGpu) and
            x.dtype == "float32"):
+            gpu_x = x.owner.inputs[0]
+            if (gpu_x.owner and
+                isinstance(gpu_x.owner.op, GpuFromHost) and
+                # And it is a shared var or an input of the graph.
+                not gpu_x.owner.inputs[0].owner):
+                if len(x.clients) == 1:
+                    if any([n == 'output' or isinstance(n.op, GpuOp)
+                            for n,_  in node.outputs[0].clients]):
+                        return
+                    else:
+                        return [host_from_gpu(gpu_from_host(node.outputs[0]))]
+                    return
            gpu_x, = x.owner.inputs
            coords = node.inputs[1:]
            return [host_from_gpu(GpuSubtensor(

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -203,6 +203,48 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
    assert numpy.allclose(numpy.asarray(f()), concat)
+def test_local_gpu_subtensor():
+    # Test shared forced on CPU.
+    t = tensor._shared(numpy.zeros(20, "float32"))
+    f = theano.function([], t[3:4], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert any([type(node.op) is tensor.Subtensor for node in topo])
+    assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
+    # Test graph input.
+    t = tensor.fmatrix()
+    f = theano.function([t], t[3:4], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert any([type(node.op) is tensor.Subtensor for node in topo])
+    assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
+    # Test multiple use of the input
+    # We want the subtensor to be on the GPU to prevent multiple transfers.
+    t = tensor.fmatrix()
+    f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert not any([type(node.op) is tensor.Subtensor for node in topo])
+    assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
+    # Test multiple use of the input + input as output
+    # We want the subtensor to be on the GPU to prevent multiple transfers.
+    t = tensor.fmatrix()
+    f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert not any([type(node.op) is tensor.Subtensor for node in topo])
+    assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
+    # Test shared forced on CPU and we do computation on the output of
+    # the subtensor.
+    t = tensor._shared(numpy.zeros(20, "float32"))
+    f = theano.function([], t[3:4]+1, mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert any([type(node.op) is tensor.Subtensor for node in topo])
+    assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
+    assert any([isinstance(node.op, cuda.GpuElemwise) for node in topo])
 def test_print_op():
    """ Test that print ops don't block gpu optimization"""
    b = tensor.fmatrix()

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -20,7 +20,8 @@ from theano.gof.python25 import all, any
 from theano.tensor.nnet.conv import ConvOp
 from theano.sandbox.gpuarray.type import GpuArrayType
 from theano.sandbox.gpuarray.basic_ops import (
-    host_from_gpu, gpu_from_host, HostFromGpu, GpuSplit,
+    host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost,
+    GpuSplit,
    gpu_alloc, GpuAlloc, GpuReshape, GpuEye, gpu_join, GpuJoin,
 )
 from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
@@ -342,6 +343,21 @@ def local_gpua_split(node):
 @register_opt('fast_compile')
 @op_lifter([tensor.Subtensor])
 def local_gpua_subtensor(node):
+    x = node.inputs[0]
+    if (x.owner and isinstance(x.owner.op, HostFromGpu)):
+        gpu_x = x.owner.inputs[0]
+        if (gpu_x.owner and
+            isinstance(gpu_x.owner.op, GpuFromHost) and
+            # And the host value has no owner: a shared var or a graph input.
+            not gpu_x.owner.inputs[0].owner):
+            if len(x.clients) == 1:
+                if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
+                                              for v in n.inputs + n.outputs])
+                        for n,_  in node.outputs[0].clients]):
+                    return
+                else:
+                    return [host_from_gpu(gpu_from_host(node.outputs[0]))]
    return GpuSubtensor(node.op.idx_list)

--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
@@ -10,6 +10,7 @@ from theano.sandbox.gpuarray.basic_ops import (
    GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu)
 from theano.sandbox.gpuarray.elemwise import (
    GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise)
+from theano.sandbox.gpuarray.subtensor import GpuSubtensor
 from theano.sandbox.gpuarray.tests.test_basic_ops import (
    rand_gpuarray, mode_with_gpu, mode_without_gpu
    )
@@ -164,3 +165,44 @@ def test_local_gpu_elemwise_careduce():
    assert len(topo) == 3
    assert topo[1].op.pre_scalar_op == theano.scalar.sqr
    f(numpy.random.rand(3, 4).astype(theano.config.floatX))
+def test_local_gpu_subtensor():
+    # Test shared forced on CPU.
+    t = tensor._shared(numpy.zeros(20, "float32"))
+    f = theano.function([], t[3:4], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert any([type(node.op) is tensor.Subtensor for node in topo])
+    assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
+    # Test graph input.
+    t = tensor.fmatrix()
+    f = theano.function([t], t[3:4], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert any([type(node.op) is tensor.Subtensor for node in topo])
+    assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
+    # Test multiple use of the input
+    # We want the subtensor to be on the GPU to prevent multiple transfers.
+    t = tensor.fmatrix()
+    f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert not any([type(node.op) is tensor.Subtensor for node in topo])
+    assert any([isinstance(node.op, GpuSubtensor) for node in topo])
+    # Test multiple use of the input + input as output
+    # We want the subtensor to be on the GPU to prevent multiple transfers.
+    t = tensor.fmatrix()
+    f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert not any([type(node.op) is tensor.Subtensor for node in topo])
+    assert any([isinstance(node.op, GpuSubtensor) for node in topo])
+    # Test shared forced on CPU and we do computation on the output of
+    # the subtensor.
+    t = tensor._shared(numpy.zeros(20, "float32"))
+    f = theano.function([], t[3:4]+1, mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert any([type(node.op) is tensor.Subtensor for node in topo])
+    assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
+    assert any([isinstance(node.op, GpuElemwise) for node in topo])