提交 aca733c8 authored 作者: Pascal Lamblin 提交者: GitHub

Merge pull request #5774 from nouiz/less_gpuelemwise

Don't move scalar float* elemwise unless the result is needed on the GPU.
......@@ -50,9 +50,8 @@ class GpuGemv(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
with theano.configparser.change_flags(warn_float64='ignore'):
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
assert alpha.ndim == 0
assert beta.ndim == 0
......@@ -60,6 +59,13 @@ class GpuGemv(BlasOp):
assert x.ndim == 1
assert y.ndim == 1
assert A.dtype == x.dtype == y.dtype
# float16 not supported
expected = A.dtype
assert theano.scalar.upcast(alpha.dtype,
beta.dtype, expected) == expected
alpha = alpha.astype(expected)
beta = beta.astype(expected)
return Apply(self, [y, alpha, A, x, beta], [y.type()])
def perform(self, node, inputs, out_storage):
......@@ -163,15 +169,30 @@ class GpuGemm(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C, ctx_name)
with theano.configparser.change_flags(warn_float64='ignore'):
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
if not (A.dtype == B.dtype == C.dtype):
raise TypeError(theano.tensor.blas.Gemm.E_mixed,
(A.dtype, B.dtype, C.dtype,
alpha.dtype, beta.dtype))
if not A.dtype.startswith('float'):
raise TypeError(theano.tensor.blas.Gemm.E_float, (A.dtype))
if A.dtype == 'float16':
expected = 'float32'
else:
expected = A.dtype
assert theano.scalar.upcast(alpha.dtype,
beta.dtype, expected) == expected
alpha = alpha.astype(expected)
beta = beta.astype(expected)
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 2
assert B.ndim == 2
assert C.ndim == 2
assert A.dtype == B.dtype == C.dtype
return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs):
......@@ -244,13 +265,17 @@ class GpuGer(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
with theano.configparser.change_flags(warn_float64='ignore'):
alpha = as_tensor_variable(alpha).astype('float64')
alpha = as_tensor_variable(alpha)
if not(A.dtype == x.dtype == y.dtype):
raise TypeError('ger requires matching dtypes',
(A.dtype, alpha.dtype, x.dtype, y.dtype))
assert theano.scalar.upcast(alpha.dtype, A.dtype) == A.dtype
alpha = alpha.astype(A.dtype)
assert alpha.ndim == 0
assert A.ndim == 2
assert x.ndim == 1
assert y.ndim == 1
assert A.dtype == x.dtype == y.dtype
return Apply(self, [A, alpha, x, y], [A.type()])
def perform(self, node, inp, out):
......@@ -383,15 +408,14 @@ class GpuGemmBatch(BlasOp):
A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C, ctx_name)
with theano.configparser.change_flags(warn_float64='ignore'):
alpha = as_tensor_variable(alpha).astype('float64')
beta = as_tensor_variable(beta).astype('float64')
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 3
assert B.ndim == 3
assert C.ndim == 3
assert A.dtype == B.dtype == C.dtype
assert A.dtype == B.dtype == C.dtype == alpha.dtype == beta.dtype
return Apply(self, [C, alpha, A, B, beta], [C.type()])
def c_headers(self):
......
......@@ -702,6 +702,7 @@ def local_gpua_elemwise(op, context_name, inputs, outputs):
name = 'Gpu' + name
if len(outputs) > 1:
return
have_cuda = False
have_opencl = False
if inputs and isinstance(inputs[0].type, GpuArrayType):
......@@ -1162,6 +1163,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
@register_opt2([tensor.blas.Gemv], 'fast_compile')
def local_gpua_gemv(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float32', 'float64']:
return
if op.inplace:
return gpugemv_inplace
else:
......@@ -1172,6 +1175,8 @@ def local_gpua_gemv(op, context_name, inputs, outputs):
@op_lifter([tensor.blas.Gemm])
@register_opt2([tensor.blas.Gemm], 'fast_compile')
def local_gpua_gemm(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return
if op.inplace:
return gpugemm_inplace
else:
......@@ -1182,9 +1187,12 @@ def local_gpua_gemm(op, context_name, inputs, outputs):
@op_lifter([tensor.blas.BatchedDot])
@register_opt2([tensor.blas.BatchedDot], 'fast_compile')
def local_gpua_gemmbatch(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float32', 'float64']:
return
a, b = inputs
c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
return gpugemmbatch_no_inplace(c, np.asarray(1.0, dtype=a.dtype),
a, b, np.asarray(0.0, dtype=a.dtype))
@register_opt()
......@@ -1215,6 +1223,8 @@ def local_gpua_gemmbatch_output_merge(node, *inputs):
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
@register_opt2([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer], 'fast_compile')
def local_gpua_ger(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float32', 'float64']:
return
return GpuGer(inplace=op.destructive)
......
......@@ -234,7 +234,8 @@ def gpu_alloc_expected(x, *shp):
GpuAllocTester = makeTester(
name="GpuAllocTester",
op=alloc,
# The +1 is there to allow the lift to the GPU.
op=lambda *args: alloc(*args) + 1,
gpu_op=GpuAlloc(test_ctx_name),
cases=dict(
correct01=(rand(), np.int32(7)),
......
......@@ -15,7 +15,8 @@ from .config import mode_with_gpu
from .test_basic_ops import makeTester, rand
from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemmbatch_no_inplace,
gpugemm_inplace, gpugemm_no_inplace,
gpugemmbatch_no_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22)
......@@ -23,16 +24,51 @@ from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
GpuGemvTester = makeTester(
'GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace,
cases=dict(dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
# It doesn't support float16
cases=dict(dot_vv=[rand(1), 1., rand(1, 2), rand(2), 0.],
dot_vm=[rand(3), 1., rand(3, 2), rand(2), 0.],
float32=[rand(3).astype('float32'), np.float32(1),
rand(3, 2).astype('float32'),
rand(2).astype('float32'), np.float32(0)],
float64=[rand(3).astype('float64'), np.float64(1),
rand(3, 2).astype('float64'),
rand(2).astype('float64'), np.float64(0)],
# test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
# test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
# test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
test_stride=[rand(3)[::-1], 1., rand(3, 2)[::-1], rand(2)[::-1], 0.],
)
)
def test_float16():
    """
    Check that GpuGemm and gpu_dot22 run on float16 storage.

    The gemm scalars (alpha, beta) are given as float32; the op is expected
    to accept them alongside float16 operands (it upcasts internally to
    float32 -- presumably, per the make_node logic; confirm in blas.py).
    """
    # gemm
    # NOTE(review): gpuarray_shared_constructor must accept float16 here.
    float16_data = [rand(3, 3).astype('float16'),
                    np.asarray(1, dtype=np.float32),
                    rand(3, 3).astype('float16'),
                    rand(3, 3).astype('float16'),
                    np.asarray(0.5, dtype=np.float32)]
    float16_shared = [gpuarray_shared_constructor(val)
                      for val in float16_data]
    o = gpugemm_no_inplace(*float16_shared)
    f = theano.function([], o)
    # Inputs are ordered (C, alpha, A, B, beta); here named for the
    # y := alpha*dot(A, x) + beta*y formulation of the reference check.
    y, alpha, A, x, beta = float16_data
    out = f()
    # Compare against the NumPy reference computed from the host copies.
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)
    # dot22
    float16_data = [rand(3, 3).astype('float16'),
                    rand(3, 3).astype('float16')]
    float16_shared = [gpuarray_shared_constructor(val)
                      for val in float16_data]
    o = gpu_dot22(*float16_shared)
    f = theano.function([], o)
    x, y = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), np.dot(x, y))
class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
mode = mode_with_gpu
dtype = 'float32'
......@@ -51,6 +87,7 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
GpuGemmTester = makeTester(
'GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace,
# float16 tested in test_float16
cases=dict(test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
......@@ -59,7 +96,12 @@ GpuGemmTester = makeTester(
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
float32=[rand(3, 4).astype('float32'), np.float32(-1.0),
rand(3, 5).astype('float32'),
rand(5, 4).astype('float32'), np.float32(-1.1)],
float64=[rand(3, 4).astype('float64'), np.float64(-1.0),
rand(3, 5).astype('float64'),
rand(5, 4).astype('float64'), np.float64(-1.1)],
# test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
......@@ -68,14 +110,29 @@ GpuGemmTester = makeTester(
)
gemm_batched_tests = dict(
("test_b%im%ik%in%i" % (b, m, k, n),
[rand(b, m, n), rand(), rand(b, m, k), rand(b, k, n), rand()])
for b, m, k, n in itertools.combinations([2, 3, 5, 7, 11, 13], 4))
# float16 not supported
gemm_batched_tests['float32'] = [rand(3, 4, 7).astype('float32'),
rand().astype('float32'),
rand(3, 4, 4).astype('float32'),
rand(3, 4, 7).astype('float32'),
rand().astype('float32')]
gemm_batched_tests['float64'] = [rand(3, 4, 7).astype('float64'),
rand().astype('float64'),
rand(3, 4, 4).astype('float64'),
rand(3, 4, 7).astype('float64'),
rand().astype('float64')]
GpuGemmBatchTester = makeTester(
'GpuGemmBatchTester',
op=lambda z, alpha, x, y, beta: alpha * batched_dot(x, y) + beta * z,
gpu_op=gpugemmbatch_no_inplace,
cases=dict(
("test_b%im%ik%in%i" % (b, m, k, n),
[rand(b, m, n), rand(), rand(b, m, k), rand(b, k, n), rand()])
for b, m, k, n in itertools.combinations([2, 3, 5, 7, 11, 13], 4)))
cases=gemm_batched_tests
)
class TestGpuSger(TestGer):
......
......@@ -493,6 +493,27 @@ def test_many_arg_elemwise():
utt.assert_allclose(results_gpu, results_cpu)
def test_not_useless_scalar_gpuelemwise():
    # We don't want to move elemwise on scalar on the GPU when the
    # result will not be used on the GPU!
    #
    # Trains one SGD-style update so the graph contains a scalar
    # elemwise (lr * grad scaling) feeding a gemm; that scalar should
    # stay on the CPU as a tensor.Elemwise, not become a GpuElemwise.
    with theano.configparser.change_flags(warn_float64='ignore'):
        X = tensor.fmatrix()
        x = np.random.randn(32, 32).astype(np.float32)
        m1 = theano.shared(np.random.randn(32, 32).astype(np.float32))
        # L2 reconstruction-style loss, just to get a gradient graph.
        loss = (X - tensor.dot(X, m1)).norm(L=2)
        lr = theano.shared(np.asarray(.001, dtype=np.float32))
        grad = tensor.grad(loss, m1)
        train = theano.function(inputs=[X], updates=[(m1, m1 - lr * grad)],
                                mode=mode_with_gpu)
        train(x)
        topo = train.maker.fgraph.toposort()
        gemms = [app for app in topo if isinstance(app.op, GpuGemm)]
        assert len(gemms) == 2
        # inputs[1] of a GpuGemm is alpha; it must be produced by a CPU
        # (tensor) Elemwise, i.e. the scalar computation was not lifted
        # to the GPU.  NOTE(review): assumes gemm input order
        # (C, alpha, A, B, beta) -- confirm against GpuGemm.make_node.
        assert isinstance(gemms[1].inputs[1].owner.op, tensor.Elemwise)
def test_local_lift_abstractconv_gpu_shape():
prev = theano.config.on_opt_error
try:
......
......@@ -24,11 +24,25 @@ except ImportError:
_context_reg = {}
def gpu_supported(data):
    """
    Can this data be stored on the GPU at all?

    Only complex dtypes are currently unsupported.

    Parameters
    ----------
    data : numpy.ndarray or TensorVariable
        (it must have dtype and ndim parameter)

    """
    dtype_name = str(data.dtype)
    return dtype_name not in tensor.basic.complex_dtypes
def move_to_gpu(data):
"""
Do we want to move this computation to the GPU?
Currently, we don't move complex and scalar int.
Currently, we don't move complex and scalar.
Parameters
----------
......@@ -36,10 +50,10 @@ def move_to_gpu(data):
(it must have dtype and ndim parameter)
"""
# We don't support complex on the GPU
if str(data.dtype) in tensor.basic.complex_dtypes:
if not gpu_supported(data):
return False
# We don't want scalar int on the GPU.
if data.ndim == 0 and str(data.dtype) in tensor.basic.discrete_dtypes:
# We don't want scalars on the GPU.
if data.ndim == 0:
return False
return True
......@@ -637,7 +651,7 @@ def gpuarray_shared_constructor(value, name=None, strict=False,
if target is notset:
target = None
if not move_to_gpu(value):
if not gpu_supported(value):
raise TypeError('We do not move that data by default to the GPU')
try:
get_context(target)
......
......@@ -317,7 +317,7 @@ class Ger(Op):
y = T.as_tensor_variable(y)
x = T.as_tensor_variable(x)
alpha = T.as_tensor_variable(alpha)
if len(set([A.dtype, alpha.dtype, x.dtype, y.dtype])) != 1:
if not(A.dtype == x.dtype == y.dtype == alpha.dtype):
raise TypeError('ger requires matching dtypes',
(A.dtype, alpha.dtype, x.dtype, y.dtype))
if alpha.ndim != 0:
......@@ -852,9 +852,6 @@ class Gemm(GemmRelated):
(self, len(inputs)))
z, a, x, y, b = inputs
# For the consistency check we don't want z to be a cached constant.
if getattr(z, 'cached', False):
z = copy.copy(z)
zr, xr, yr = [set(view_roots(i)) for i in (z, x, y)]
# We want the gemm to be inplace. When this op is inplace, it
......@@ -867,10 +864,11 @@ class Gemm(GemmRelated):
# think there is another mechanism that would prevent this,
# but I don't what to modify old code and have chance to break
# something.
if zr.intersection(xr):
raise InconsistencyError(Gemm.E_z_uniq, (z, x))
if zr.intersection(yr):
raise InconsistencyError(Gemm.E_z_uniq, (z, y))
if self.inplace:
if zr.intersection(xr):
raise InconsistencyError(Gemm.E_z_uniq, (z, x))
if zr.intersection(yr):
raise InconsistencyError(Gemm.E_z_uniq, (z, y))
if z.ndim != 2:
raise TypeError(Gemm.E_rank, z)
......
......@@ -105,7 +105,7 @@ class t_gemm(TestCase):
def test0a(self):
Gemm.debug = True
try:
g = gemm_inplace([1.], 1., [1.], [1.], 1.)
g = gemm_no_inplace([1.], 1., [1.], [1.], 1.)
except TypeError as e:
if exc_message(e) is Gemm.E_rank:
return
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论