提交 bd2ec838 authored 作者: Frederic Bastien's avatar Frederic Bastien

Do not move all scalar float on the GPU.

上级 05649d91
......@@ -703,12 +703,6 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
if len(outputs) > 1:
return
# We move float* scalar only if the outputs is used on the GPU.
# This will trigger a backward pass when needed, but will prevent
# many useless transfer to only compute GpuElemwise on scalar.
if outputs[0].ndim == 0 and len([c for c, _ in outputs[0].clients
if isinstance(c.op, GpuFromHost)]) == 0:
return
have_cuda = False
have_opencl = False
if inputs and isinstance(inputs[0].type, GpuArrayType):
......
......@@ -502,10 +502,10 @@ def test_not_useless_scalar_gpuelemwise():
x = np.random.randn(32, 32).astype(np.float32)
m1 = theano.shared(np.random.randn(32, 32).astype(np.float32))
loss = (X - tensor.dot(X, m1)).norm(L=2)
lr0 = .001
lr = theano.shared(np.asarray(.001, dtype=np.float32))
grad = tensor.grad(loss, m1)
train = theano.function(inputs=[X], updates=[(m1, m1 - lr1 * grad)],
train = theano.function(inputs=[X], updates=[(m1, m1 - lr * grad)],
mode=mode_with_gpu)
train(x)
topo = train.maker.fgraph.toposort()
......
......@@ -38,8 +38,8 @@ def move_to_gpu(data):
# We don't support complex on the GPU
if str(data.dtype) in tensor.basic.complex_dtypes:
return False
# We don't want scalar int on the GPU.
if data.ndim == 0 and str(data.dtype) in tensor.basic.discrete_dtypes:
# We don't want scalars on the GPU.
if data.ndim == 0:
return False
return True
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论