提交 175045d9 — 作者: Frédéric Bastien

Merge pull request #3355 from abergeron/hgemm

Enables float16 gemm on gpuarray when the cuda version supports it
import os.path import os.path
from theano import Op, Apply, config from theano import Apply, config
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.basic import as_tensor_variable
from theano.tensor.blas import Dot22, Gemv, Gemm, Ger from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
from theano.tensor.opt import in2out from theano.tensor.opt import in2out
from .basic_ops import HideC, as_gpuarray_variable from .basic_ops import HideC, as_gpuarray_variable, GpuAllocEmpty
try: try:
import pygpu import pygpu
...@@ -51,7 +52,7 @@ PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out, ...@@ -51,7 +52,7 @@ PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out,
class GpuGemv(BlasOp, Gemv): class GpuGemv(BlasOp, Gemv):
def make_node(self, y, alpha, A, x, beta): def make_node(self, y, alpha, A, x, beta):
res = Gemv.make_node(self, y, alpha, A, x, beta) Gemv.make_node(self, y, alpha, A, x, beta)
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y)
...@@ -112,8 +113,11 @@ gpugemv_inplace = GpuGemv(inplace=True) ...@@ -112,8 +113,11 @@ gpugemv_inplace = GpuGemv(inplace=True)
class GpuGemm(BlasOp, Gemm): class GpuGemm(BlasOp, Gemm):
_f16_ok = True
def make_node(self, C, alpha, A, B, beta): def make_node(self, C, alpha, A, B, beta):
res = Gemm.make_node(self, C, alpha, A, B, beta) alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
B = as_gpuarray_variable(B) B = as_gpuarray_variable(B)
C = as_gpuarray_variable(C) C = as_gpuarray_variable(C)
...@@ -176,7 +180,7 @@ gpugemm_inplace = GpuGemm(inplace=True) ...@@ -176,7 +180,7 @@ gpugemm_inplace = GpuGemm(inplace=True)
class GpuGer(BlasOp, Ger): class GpuGer(BlasOp, Ger):
def make_node(self, A, alpha, x, y): def make_node(self, A, alpha, x, y):
res = Ger.make_node(self, A, alpha, x, y) Ger.make_node(self, A, alpha, x, y)
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y)
...@@ -236,7 +240,7 @@ gpuger_inplace = GpuGer(destructive=True) ...@@ -236,7 +240,7 @@ gpuger_inplace = GpuGer(destructive=True)
class GpuDot22(BlasOp, Dot22): class GpuDot22(BlasOp, Dot22):
def make_node(self, x, y): def make_node(self, x, y):
res = Dot22.make_node(self, x, y) Dot22.make_node(self, x, y)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y)
assert x.dtype == y.dtype assert x.dtype == y.dtype
...@@ -287,6 +291,7 @@ class GpuDot22(BlasOp, Dot22): ...@@ -287,6 +291,7 @@ class GpuDot22(BlasOp, Dot22):
gpu_dot22 = GpuDot22() gpu_dot22 = GpuDot22()
@local_optimizer([gpugemv_no_inplace], inplace=True) @local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node): def local_inplace_gpuagemv(node):
if node.op == gpugemv_no_inplace: if node.op == gpugemv_no_inplace:
...@@ -296,7 +301,12 @@ def local_inplace_gpuagemv(node): ...@@ -296,7 +301,12 @@ def local_inplace_gpuagemv(node):
@local_optimizer([gpugemm_no_inplace], inplace=True)
def local_inplace_gpuagemm(node):
    """Swap a non-destructive GpuGemm for its inplace counterpart.

    If the output buffer comes from a ``GpuAllocEmpty`` that has other
    clients, the allocation is re-issued first so the inplace gemm writes
    into a private buffer instead of clobbering one shared elsewhere.
    """
    if node.op != gpugemm_no_inplace:
        return
    args = list(node.inputs)
    out_buf = args[0]
    shares_alloc = (out_buf.owner and
                    isinstance(out_buf.owner.op, GpuAllocEmpty) and
                    len(out_buf.clients) > 1)
    if shares_alloc:
        # Re-run the empty allocation so this gemm owns its buffer.
        args[0] = out_buf.owner.op(*out_buf.owner.inputs)
    return [gpugemm_inplace(*args)]
@local_optimizer([gpuger_no_inplace], inplace=True) @local_optimizer([gpuger_no_inplace], inplace=True)
...@@ -304,9 +314,11 @@ def local_inplace_gpuager(node): ...@@ -304,9 +314,11 @@ def local_inplace_gpuager(node):
if node.op == gpuger_no_inplace: if node.op == gpuger_no_inplace:
return [gpuger_inplace(*node.inputs)] return [gpuger_inplace(*node.inputs)]
# Bundle the three inplace rewrites into one in->out optimizer and register
# it late in the pipeline (position 70.0) under the usual inplace tags.
gpuablas_opt_inplace = in2out(
    LocalOptGroup(local_inplace_gpuagemv,
                  local_inplace_gpuagemm,
                  local_inplace_gpuager),
    name='gpuablas_opt_inplace')
optdb.register('InplaceGpuaBlasOpt', gpuablas_opt_inplace,
               70.0, 'fast_run', 'inplace', 'gpuarray')
import copy import copy
import theano
import numpy import numpy
import logging
from six.moves import xrange from six.moves import xrange
try: try:
...@@ -8,8 +8,10 @@ try: ...@@ -8,8 +8,10 @@ try:
except ImportError: except ImportError:
pass pass
import theano
from theano import tensor, scalar, gof from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, Optimizer, toolbox) SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB from theano.gof.optdb import LocalGroupDB
...@@ -25,9 +27,10 @@ from .basic_ops import (as_gpuarray_variable, ...@@ -25,9 +27,10 @@ from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host, host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, GpuSplit, GpuContiguous,
gpu_alloc, GpuAlloc, GpuReshape, gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin)
from .blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace)
from .conv import GpuConv from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
...@@ -38,6 +41,9 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -38,6 +41,9 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge
_logger = logging.getLogger("theano.sandbox.gpuarray.opt")
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
...@@ -619,6 +625,37 @@ def local_gpua_gemm(node): ...@@ -619,6 +625,37 @@ def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace) return GpuGemm(inplace=node.op.inplace)
def _nvcc_version_lt(version, minimum):
    # Compare dotted version strings numerically.  A plain string
    # comparison (the previous code) is wrong as soon as the major
    # version has two digits: '10.0' < '7.5' is lexicographically True.
    try:
        def _parts(v):
            return tuple(int(p) for p in str(v).split('.'))
        return _parts(version) < _parts(minimum)
    except (TypeError, ValueError):
        # NOTE(review): missing/unparsable nvcc version -- conservatively
        # treat it as too old, keeping the "do not rewrite" behaviour.
        return True


@register_opt('fast_compile')
@op_lifter([tensor.basic.Dot])
def local_gpua_hgemm(node):
    """Lift a float16 matrix x matrix Dot to a GpuGemm.

    Only fires when both operands are 2d float16 variables and the cuda
    toolkit is at least 7.5 (first version with half-precision gemm).
    Writes into a freshly allocated, uninitialized float16 buffer since
    beta is 0.
    """
    from theano.sandbox.cuda import nvcc_compiler
    if _nvcc_version_lt(nvcc_compiler.nvcc_version, '7.5'):
        _logger.warning("Not performing dot of float16 on the GPU since "
                        "cuda 7.5 is not available. Updating could speed up "
                        "your code.")
        return
    A = node.inputs[0]
    B = node.inputs[1]
    if (A.ndim == 2 and B.ndim == 2 and
            A.dtype == 'float16' and B.dtype == 'float16'):
        fgraph = node.inputs[0].fgraph
        # Output buffer with shape (A.shape[0], B.shape[1]); its contents
        # are irrelevant because beta == 0 below.
        C = GpuAllocEmpty(dtype='float16')(shape_i(A, 0, fgraph),
                                           shape_i(B, 1, fgraph))
        return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
@register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4, nd=2)
def local_gpuagemm_alpha_merge(node, *inputs):
    """Fold a scalar multiply of a GpuGemm output into its alpha/beta."""
    merged = gpugemm_no_inplace(*inputs)
    return [merged]
@register_opt()
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0, nd=2)
def local_gpuagemm_output_merge(node, *inputs):
    """Fold an addition to a GpuGemm output into the gemm itself."""
    merged = gpugemm_no_inplace(*inputs)
    return [merged]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer]) @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node): def local_gpua_ger(node):
......
...@@ -73,7 +73,8 @@ def alpha_merge(cls, alpha_in, beta_in, nd): ...@@ -73,7 +73,8 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
lr = grab_cpu_scalar(node.inputs[0], nd=nd) lr = grab_cpu_scalar(node.inputs[0], nd=nd)
else: else:
lr = grab_cpu_scalar(node.inputs[1], nd=nd) lr = grab_cpu_scalar(node.inputs[1], nd=nd)
if lr is None or targ is None: if (lr is None or targ is None or
lr.dtype != targ.outputs[0].dtype):
return None return None
inputs = list(targ.inputs) inputs = list(targ.inputs)
try: try:
...@@ -110,6 +111,8 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd): ...@@ -110,6 +111,8 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd):
W = node.inputs[0] W = node.inputs[0]
if targ is None: if targ is None:
return None return None
if W.dtype != targ.outputs[0].dtype:
return None
if not is_equal(targ.inputs[beta_in], 0.0): if not is_equal(targ.inputs[beta_in], 0.0):
# other cases are too complex for now # other cases are too complex for now
return None return None
......
from unittest import TestCase from unittest import TestCase
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import numpy
import theano import theano
from theano import tensor from theano import tensor
from theano.tests import unittest_tools from theano.tests import unittest_tools as utt
from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive, from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
_dot22) _dot22)
from theano.tensor.tests.test_blas import TestGer, BaseGemv from theano.tensor.tests.test_blas import TestGer, BaseGemv
...@@ -15,7 +17,7 @@ from .test_basic_ops import (makeTester, rand, ...@@ -15,7 +17,7 @@ from .test_basic_ops import (makeTester, rand,
from ..blas import (gpugemv_inplace, gpugemv_no_inplace, from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace, gpugemm_inplace, gpugemm_no_inplace,
gpuger_inplace, gpuger_no_inplace, gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22) GpuGer, gpu_dot22, GpuGemm)
GpuGemvTester = makeTester('GpuGemvTester', GpuGemvTester = makeTester('GpuGemvTester',
...@@ -31,7 +33,7 @@ GpuGemvTester = makeTester('GpuGemvTester', ...@@ -31,7 +33,7 @@ GpuGemvTester = makeTester('GpuGemvTester',
) )
class TestGpuSgemv(TestCase, BaseGemv, unittest_tools.TestOptimizationMixin): class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
mode = mode_with_gpu mode = mode_with_gpu
dtype = 'float32' dtype = 'float32'
...@@ -92,7 +94,7 @@ class TestGpuSgerNoTransfer(TestGpuSger): ...@@ -92,7 +94,7 @@ class TestGpuSgerNoTransfer(TestGpuSger):
shared = staticmethod(gpuarray_shared_constructor) shared = staticmethod(gpuarray_shared_constructor)
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin): class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
def setUp(self): def setUp(self):
self.ops = [gpuger_no_inplace, gpuger_inplace] self.ops = [gpuger_no_inplace, gpuger_inplace]
...@@ -115,3 +117,50 @@ GpuDot22Tester = makeTester( ...@@ -115,3 +117,50 @@ GpuDot22Tester = makeTester(
# test9=[rand(0, 0), rand(0, 0)], # test9=[rand(0, 0), rand(0, 0)],
) )
) )
def _count_gpugemm(fn):
    # Number of GpuGemm apply nodes in a compiled function's graph.
    return sum(isinstance(node.op, GpuGemm)
               for node in fn.maker.fgraph.apply_nodes)


def test_hgemm_swap():
    """float16 dot is lifted to GpuGemm only for matrix x matrix inputs."""
    from theano.sandbox.cuda import nvcc_compiler
    # Compare the version numerically: a string compare wrongly reports
    # '10.0' < '7.5' and would skip this test on recent toolkits.
    try:
        version = tuple(int(p) for p in
                        str(nvcc_compiler.nvcc_version).split('.'))
    except (TypeError, ValueError):
        version = ()
    if version < (7, 5):
        raise SkipTest("SgemmEx is only available on cuda 7.5+")

    v = tensor.vector(dtype='float16')
    m = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    m32 = tensor.matrix(dtype='float32')

    # vector x matrix must not be replaced by a float16 gemm
    f = theano.function([v, m], tensor.dot(v, m), mode=mode_with_gpu)
    assert _count_gpugemm(f) == 0

    # mixed float32 x float16 must not be replaced either
    f = theano.function([m32, m], tensor.dot(m32, m), mode=mode_with_gpu)
    assert _count_gpugemm(f) == 0

    # float16 matrix x matrix is the one case that should use GpuGemm
    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
    assert _count_gpugemm(f) == 1

    v1 = numpy.random.random((3, 4)).astype('float16')
    v2 = numpy.random.random((4, 2)).astype('float16')
    of = f(v1, v2)
    on = numpy.dot(v1, v2)
    utt.assert_allclose(of, on)
def test_hgemm_alpha_output_merge():
    """alpha/output merging collapses scale + add + dot into one gemm."""
    from theano.sandbox.cuda import nvcc_compiler
    # Numeric version compare; a string compare breaks for cuda >= 10.
    try:
        version = tuple(int(p) for p in
                        str(nvcc_compiler.nvcc_version).split('.'))
    except (TypeError, ValueError):
        version = ()
    if version < (7, 5):
        raise SkipTest("SgemmEx is only available on cuda 7.5+")

    m1 = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    b = tensor.matrix(dtype='float16')
    hgemm = numpy.asarray(0.05, dtype='float16') * (tensor.dot(m1, m2) + b)
    f = theano.function([m1, m2, b], hgemm, mode=mode_with_gpu)
    # Everything should merge into a single gemm:
    # 3 gpu_from_host + 1 hgemm + 1 host_from_gpu = 5 apply nodes.
    assert len(f.maker.fgraph.apply_nodes) == 5
...@@ -36,7 +36,7 @@ class GpuArrayType(Type): ...@@ -36,7 +36,7 @@ class GpuArrayType(Type):
return self.__class__(dtype=dtype, broadcastable=broadcastable, return self.__class__(dtype=dtype, broadcastable=broadcastable,
name=self.name) name=self.name)
def __repr__(self):
    """Return ``GpuArrayType(<dtype>, <broadcastable>)`` for debugging."""
    fields = (self.dtype, self.broadcastable)
    return "GpuArrayType(%s, %s)" % fields
def filter(self, data, strict=False, allow_downcast=None): def filter(self, data, strict=False, allow_downcast=None):
......
...@@ -162,7 +162,6 @@ whitelist_flake8 = [ ...@@ -162,7 +162,6 @@ whitelist_flake8 = [
"sandbox/gpuarray/elemwise.py", "sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py", "sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py", "sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/blas.py",
"sandbox/gpuarray/kernel_codegen.py", "sandbox/gpuarray/kernel_codegen.py",
"sandbox/gpuarray/conv.py", "sandbox/gpuarray/conv.py",
"sandbox/gpuarray/neighbours.py", "sandbox/gpuarray/neighbours.py",
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论