提交 175045d9 — 作者: Frédéric Bastien

Merge pull request #3355 from abergeron/hgemm

Enables float16 gemm on gpuarray when the cuda version supports it
import os.path import os.path
from theano import Op, Apply, config from theano import Apply, config
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.basic import as_tensor_variable
from theano.tensor.blas import Dot22, Gemv, Gemm, Ger from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
from theano.tensor.opt import in2out from theano.tensor.opt import in2out
from .basic_ops import HideC, as_gpuarray_variable from .basic_ops import HideC, as_gpuarray_variable, GpuAllocEmpty
try: try:
import pygpu import pygpu
...@@ -51,7 +52,7 @@ PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out, ...@@ -51,7 +52,7 @@ PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out,
class GpuGemv(BlasOp, Gemv): class GpuGemv(BlasOp, Gemv):
def make_node(self, y, alpha, A, x, beta): def make_node(self, y, alpha, A, x, beta):
res = Gemv.make_node(self, y, alpha, A, x, beta) Gemv.make_node(self, y, alpha, A, x, beta)
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y)
...@@ -112,8 +113,11 @@ gpugemv_inplace = GpuGemv(inplace=True) ...@@ -112,8 +113,11 @@ gpugemv_inplace = GpuGemv(inplace=True)
class GpuGemm(BlasOp, Gemm): class GpuGemm(BlasOp, Gemm):
_f16_ok = True
def make_node(self, C, alpha, A, B, beta): def make_node(self, C, alpha, A, B, beta):
res = Gemm.make_node(self, C, alpha, A, B, beta) alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
B = as_gpuarray_variable(B) B = as_gpuarray_variable(B)
C = as_gpuarray_variable(C) C = as_gpuarray_variable(C)
...@@ -176,7 +180,7 @@ gpugemm_inplace = GpuGemm(inplace=True) ...@@ -176,7 +180,7 @@ gpugemm_inplace = GpuGemm(inplace=True)
class GpuGer(BlasOp, Ger): class GpuGer(BlasOp, Ger):
def make_node(self, A, alpha, x, y): def make_node(self, A, alpha, x, y):
res = Ger.make_node(self, A, alpha, x, y) Ger.make_node(self, A, alpha, x, y)
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y)
...@@ -236,7 +240,7 @@ gpuger_inplace = GpuGer(destructive=True) ...@@ -236,7 +240,7 @@ gpuger_inplace = GpuGer(destructive=True)
class GpuDot22(BlasOp, Dot22): class GpuDot22(BlasOp, Dot22):
def make_node(self, x, y): def make_node(self, x, y):
res = Dot22.make_node(self, x, y) Dot22.make_node(self, x, y)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y)
assert x.dtype == y.dtype assert x.dtype == y.dtype
...@@ -287,6 +291,7 @@ class GpuDot22(BlasOp, Dot22): ...@@ -287,6 +291,7 @@ class GpuDot22(BlasOp, Dot22):
gpu_dot22 = GpuDot22() gpu_dot22 = GpuDot22()
@local_optimizer([gpugemv_no_inplace], inplace=True) @local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node): def local_inplace_gpuagemv(node):
if node.op == gpugemv_no_inplace: if node.op == gpugemv_no_inplace:
...@@ -296,7 +301,12 @@ def local_inplace_gpuagemv(node): ...@@ -296,7 +301,12 @@ def local_inplace_gpuagemv(node):
@local_optimizer([gpugemm_no_inplace], inplace=True)
def local_inplace_gpuagemm(node):
    """Swap a non-destructive GpuGemm for its inplace counterpart.

    If the output buffer comes from a ``GpuAllocEmpty`` that has other
    clients, the allocation is re-issued first so the inplace gemm writes
    into a private buffer instead of clobbering one shared elsewhere.
    """
    if node.op != gpugemm_no_inplace:
        return
    args = list(node.inputs)
    out_buf = args[0]
    shares_alloc = (out_buf.owner and
                    isinstance(out_buf.owner.op, GpuAllocEmpty) and
                    len(out_buf.clients) > 1)
    if shares_alloc:
        # Re-run the empty allocation so this gemm owns its buffer.
        args[0] = out_buf.owner.op(*out_buf.owner.inputs)
    return [gpugemm_inplace(*args)]
@local_optimizer([gpuger_no_inplace], inplace=True) @local_optimizer([gpuger_no_inplace], inplace=True)
...@@ -304,9 +314,11 @@ def local_inplace_gpuager(node): ...@@ -304,9 +314,11 @@ def local_inplace_gpuager(node):
if node.op == gpuger_no_inplace: if node.op == gpuger_no_inplace:
return [gpuger_inplace(*node.inputs)] return [gpuger_inplace(*node.inputs)]
# Bundle the three inplace rewrites into one in->out optimizer and register
# it late in the pipeline (position 70.0) under the usual inplace tags.
gpuablas_opt_inplace = in2out(
    LocalOptGroup(local_inplace_gpuagemv,
                  local_inplace_gpuagemm,
                  local_inplace_gpuager),
    name='gpuablas_opt_inplace')
optdb.register('InplaceGpuaBlasOpt', gpuablas_opt_inplace,
               70.0, 'fast_run', 'inplace', 'gpuarray')
import copy import copy
import theano
import numpy import numpy
import logging
from six.moves import xrange from six.moves import xrange
try: try:
...@@ -8,8 +8,10 @@ try: ...@@ -8,8 +8,10 @@ try:
except ImportError: except ImportError:
pass pass
import theano
from theano import tensor, scalar, gof from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, Optimizer, toolbox) SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB from theano.gof.optdb import LocalGroupDB
...@@ -25,9 +27,10 @@ from .basic_ops import (as_gpuarray_variable, ...@@ -25,9 +27,10 @@ from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host, host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, GpuSplit, GpuContiguous,
gpu_alloc, GpuAlloc, GpuReshape, gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin)
from .blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace)
from .conv import GpuConv from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
...@@ -38,6 +41,9 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -38,6 +41,9 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge
_logger = logging.getLogger("theano.sandbox.gpuarray.opt")
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
...@@ -619,6 +625,37 @@ def local_gpua_gemm(node): ...@@ -619,6 +625,37 @@ def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace) return GpuGemm(inplace=node.op.inplace)
def _nvcc_version_lt(version, minimum):
    # Compare dotted version strings numerically.  A plain string
    # comparison (the previous code) is wrong as soon as the major
    # version has two digits: '10.0' < '7.5' is lexicographically True.
    try:
        def _parts(v):
            return tuple(int(p) for p in str(v).split('.'))
        return _parts(version) < _parts(minimum)
    except (TypeError, ValueError):
        # NOTE(review): missing/unparsable nvcc version -- conservatively
        # treat it as too old, keeping the "do not rewrite" behaviour.
        return True


@register_opt('fast_compile')
@op_lifter([tensor.basic.Dot])
def local_gpua_hgemm(node):
    """Lift a float16 matrix x matrix Dot to a GpuGemm.

    Only fires when both operands are 2d float16 variables and the cuda
    toolkit is at least 7.5 (first version with half-precision gemm).
    Writes into a freshly allocated, uninitialized float16 buffer since
    beta is 0.
    """
    from theano.sandbox.cuda import nvcc_compiler
    if _nvcc_version_lt(nvcc_compiler.nvcc_version, '7.5'):
        _logger.warning("Not performing dot of float16 on the GPU since "
                        "cuda 7.5 is not available. Updating could speed up "
                        "your code.")
        return
    A = node.inputs[0]
    B = node.inputs[1]
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = node.inputs[0].fgraph
        # Output buffer with shape (A.shape[0], B.shape[1]); its contents
        # are irrelevant because beta == 0 below.
        C = GpuAllocEmpty(dtype='float16')(shape_i(A, 0, fgraph),
                                           shape_i(B, 1, fgraph))
        return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
@register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4, nd=2)
def local_gpuagemm_alpha_merge(node, *inputs):
    """Fold a scalar multiply of a GpuGemm output into its alpha/beta."""
    merged = gpugemm_no_inplace(*inputs)
    return [merged]
@register_opt()
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0, nd=2)
def local_gpuagemm_output_merge(node, *inputs):
    """Fold an addition to a GpuGemm output into the gemm itself."""
    merged = gpugemm_no_inplace(*inputs)
    return [merged]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer]) @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node): def local_gpua_ger(node):
......
...@@ -73,7 +73,8 @@ def alpha_merge(cls, alpha_in, beta_in, nd): ...@@ -73,7 +73,8 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
lr = grab_cpu_scalar(node.inputs[0], nd=nd) lr = grab_cpu_scalar(node.inputs[0], nd=nd)
else: else:
lr = grab_cpu_scalar(node.inputs[1], nd=nd) lr = grab_cpu_scalar(node.inputs[1], nd=nd)
if lr is None or targ is None: if (lr is None or targ is None or
lr.dtype != targ.outputs[0].dtype):
return None return None
inputs = list(targ.inputs) inputs = list(targ.inputs)
try: try:
...@@ -110,6 +111,8 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd): ...@@ -110,6 +111,8 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd):
W = node.inputs[0] W = node.inputs[0]
if targ is None: if targ is None:
return None return None
if W.dtype != targ.outputs[0].dtype:
return None
if not is_equal(targ.inputs[beta_in], 0.0): if not is_equal(targ.inputs[beta_in], 0.0):
# other cases are too complex for now # other cases are too complex for now
return None return None
......
from unittest import TestCase from unittest import TestCase
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import numpy
import theano import theano
from theano import tensor from theano import tensor
from theano.tests import unittest_tools from theano.tests import unittest_tools as utt
from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive, from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
_dot22) _dot22)
from theano.tensor.tests.test_blas import TestGer, BaseGemv from theano.tensor.tests.test_blas import TestGer, BaseGemv
...@@ -15,7 +17,7 @@ from .test_basic_ops import (makeTester, rand, ...@@ -15,7 +17,7 @@ from .test_basic_ops import (makeTester, rand,
from ..blas import (gpugemv_inplace, gpugemv_no_inplace, from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace, gpugemm_inplace, gpugemm_no_inplace,
gpuger_inplace, gpuger_no_inplace, gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22) GpuGer, gpu_dot22, GpuGemm)
GpuGemvTester = makeTester('GpuGemvTester', GpuGemvTester = makeTester('GpuGemvTester',
...@@ -31,7 +33,7 @@ GpuGemvTester = makeTester('GpuGemvTester', ...@@ -31,7 +33,7 @@ GpuGemvTester = makeTester('GpuGemvTester',
) )
class TestGpuSgemv(TestCase, BaseGemv, unittest_tools.TestOptimizationMixin): class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
mode = mode_with_gpu mode = mode_with_gpu
dtype = 'float32' dtype = 'float32'
...@@ -92,7 +94,7 @@ class TestGpuSgerNoTransfer(TestGpuSger): ...@@ -92,7 +94,7 @@ class TestGpuSgerNoTransfer(TestGpuSger):
shared = staticmethod(gpuarray_shared_constructor) shared = staticmethod(gpuarray_shared_constructor)
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin): class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
def setUp(self): def setUp(self):
self.ops = [gpuger_no_inplace, gpuger_inplace] self.ops = [gpuger_no_inplace, gpuger_inplace]
...@@ -115,3 +117,50 @@ GpuDot22Tester = makeTester( ...@@ -115,3 +117,50 @@ GpuDot22Tester = makeTester(
# test9=[rand(0, 0), rand(0, 0)], # test9=[rand(0, 0), rand(0, 0)],
) )
) )
def _count_gpugemm(fn):
    # Number of GpuGemm apply nodes in a compiled function's graph.
    return sum(isinstance(node.op, GpuGemm)
               for node in fn.maker.fgraph.apply_nodes)


def test_hgemm_swap():
    """float16 dot is lifted to GpuGemm only for matrix x matrix inputs."""
    from theano.sandbox.cuda import nvcc_compiler
    # Compare the version numerically: a string compare wrongly reports
    # '10.0' < '7.5' and would skip this test on recent toolkits.
    try:
        version = tuple(int(p) for p in
                        str(nvcc_compiler.nvcc_version).split('.'))
    except (TypeError, ValueError):
        version = ()
    if version < (7, 5):
        raise SkipTest("SgemmEx is only available on cuda 7.5+")

    v = tensor.vector(dtype='float16')
    m = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    m32 = tensor.matrix(dtype='float32')

    # vector x matrix must not be replaced by a float16 gemm
    f = theano.function([v, m], tensor.dot(v, m), mode=mode_with_gpu)
    assert _count_gpugemm(f) == 0

    # mixed float32 x float16 must not be replaced either
    f = theano.function([m32, m], tensor.dot(m32, m), mode=mode_with_gpu)
    assert _count_gpugemm(f) == 0

    # float16 matrix x matrix is the one case that should use GpuGemm
    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
    assert _count_gpugemm(f) == 1

    v1 = numpy.random.random((3, 4)).astype('float16')
    v2 = numpy.random.random((4, 2)).astype('float16')
    of = f(v1, v2)
    on = numpy.dot(v1, v2)
    utt.assert_allclose(of, on)
def test_hgemm_alpha_output_merge():
    """alpha/output merging collapses scale + add + dot into one gemm."""
    from theano.sandbox.cuda import nvcc_compiler
    # Numeric version compare; a string compare breaks for cuda >= 10.
    try:
        version = tuple(int(p) for p in
                        str(nvcc_compiler.nvcc_version).split('.'))
    except (TypeError, ValueError):
        version = ()
    if version < (7, 5):
        raise SkipTest("SgemmEx is only available on cuda 7.5+")

    m1 = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    b = tensor.matrix(dtype='float16')
    hgemm = numpy.asarray(0.05, dtype='float16') * (tensor.dot(m1, m2) + b)
    f = theano.function([m1, m2, b], hgemm, mode=mode_with_gpu)
    # Everything should merge into a single gemm:
    # 3 gpu_from_host + 1 hgemm + 1 host_from_gpu = 5 apply nodes.
    assert len(f.maker.fgraph.apply_nodes) == 5
...@@ -36,7 +36,7 @@ class GpuArrayType(Type): ...@@ -36,7 +36,7 @@ class GpuArrayType(Type):
return self.__class__(dtype=dtype, broadcastable=broadcastable, return self.__class__(dtype=dtype, broadcastable=broadcastable,
name=self.name) name=self.name)
def __repr__(self):
    """Return ``GpuArrayType(<dtype>, <broadcastable>)`` for debugging."""
    fields = (self.dtype, self.broadcastable)
    return "GpuArrayType(%s, %s)" % fields
def filter(self, data, strict=False, allow_downcast=None): def filter(self, data, strict=False, allow_downcast=None):
......
...@@ -162,7 +162,6 @@ whitelist_flake8 = [ ...@@ -162,7 +162,6 @@ whitelist_flake8 = [
"sandbox/gpuarray/elemwise.py", "sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py", "sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py", "sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/blas.py",
"sandbox/gpuarray/kernel_codegen.py", "sandbox/gpuarray/kernel_codegen.py",
"sandbox/gpuarray/conv.py", "sandbox/gpuarray/conv.py",
"sandbox/gpuarray/neighbours.py", "sandbox/gpuarray/neighbours.py",
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论