Don't scalar float* elemwise unless the result is needed on the GPU.

05649d91 · Frederic Bastien · 0c53fb52 · 05649d91 · 05649d91
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -702,6 +702,13 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
        name = 'Gpu' + name
    if len(outputs) > 1:
        return
+    # We move float* scalar only if the outputs is used on the GPU.
+    # This will trigger a backward pass when needed, but will prevent
+    # many useless transfer to only compute GpuElemwise on scalar.
+    if outputs[0].ndim == 0 and len([c for c, _ in outputs[0].clients
+                                     if isinstance(c.op, GpuFromHost)]) == 0:
+        return
    have_cuda = False
    have_opencl = False
    if inputs and isinstance(inputs[0].type, GpuArrayType):

--- a/theano/gpuarray/tests/test_opt.py
+++ b/theano/gpuarray/tests/test_opt.py
@@ -493,6 +493,27 @@ def test_many_arg_elemwise():
                utt.assert_allclose(results_gpu, results_cpu)
+def test_not_useless_scalar_gpuelemwise():
+    # We don't want to move elemwise on scalar on the GPU when the
+    # result will be used on the GPU!
+    with theano.configparser.change_flags(warn_float64='ignore'):
+        X = tensor.fmatrix()
+        x = np.random.randn(32, 32).astype(np.float32)
+        m1 = theano.shared(np.random.randn(32, 32).astype(np.float32))
+        loss = (X - tensor.dot(X, m1)).norm(L=2)
+        lr0 = .001
+        grad = tensor.grad(loss, m1)
+        train = theano.function(inputs=[X], updates=[(m1, m1 - lr1 * grad)],
+                                mode=mode_with_gpu)
+        train(x)
+        topo = train.maker.fgraph.toposort()
+        gemms = [app for app in topo if isinstance(app.op, GpuGemm)]
+        assert len(gemms) == 1
+        assert isinstance(gemms[0].inputs[1].owner.op, tensor.Elemwise)
 def test_local_lift_abstractconv_gpu_shape():
    prev = theano.config.on_opt_error
    try: