Commit 7e03d1a9 authored by Arnaud Bergeron

Add support for float16 mm product with libgpuarray.

Parent bae54705
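
What the change does in practice: when both operands of a matrix product are float16 and CUDA >= 7.5 is available, a new optimizer rewrites the dot into a GpuGemm on a freshly allocated float16 buffer. A minimal usage sketch, not part of the commit (it assumes a configured libgpuarray GPU device, e.g. THEANO_FLAGS=device=cuda):

    import numpy
    import theano
    from theano import tensor

    a = tensor.matrix(dtype='float16')
    b = tensor.matrix(dtype='float16')
    f = theano.function([a, b], tensor.dot(a, b))  # dot is lifted to GpuGemm on GPU
    out = f(numpy.random.random((3, 4)).astype('float16'),
            numpy.random.random((4, 2)).astype('float16'))
    print(out.dtype)  # float16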
@@ -4,10 +4,11 @@ from theano import Op, Apply, config
 from theano.compile import optdb
 from theano.gof import local_optimizer, LocalOptGroup
+from theano.tensor.basic import as_tensor_variable
 from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
 from theano.tensor.opt import in2out
-from .basic_ops import HideC, as_gpuarray_variable
+from .basic_ops import HideC, as_gpuarray_variable, GpuAllocEmpty
 
 try:
     import pygpu
@@ -112,8 +113,11 @@ gpugemv_inplace = GpuGemv(inplace=True)
 
 class GpuGemm(BlasOp, Gemm):
+    _f16_ok = True
+
     def make_node(self, C, alpha, A, B, beta):
-        res = Gemm.make_node(self, C, alpha, A, B, beta)
+        alpha = as_tensor_variable(alpha)
+        beta = as_tensor_variable(beta)
         A = as_gpuarray_variable(A)
         B = as_gpuarray_variable(B)
         C = as_gpuarray_variable(C)
@@ -296,7 +300,12 @@ def local_inplace_gpuagemv(node):
 
 @local_optimizer([gpugemm_no_inplace], inplace=True)
 def local_inplace_gpuagemm(node):
     if node.op == gpugemm_no_inplace:
-        return [gpugemm_inplace(*node.inputs)]
+        inputs = list(node.inputs)
+        C = inputs[0]
+        if (C.owner and isinstance(C.owner.op, GpuAllocEmpty) and
+                len(C.clients) > 1):
+            inputs[0] = C.owner.op(*C.owner.inputs)
+        return [gpugemm_inplace(*inputs)]
 
 @local_optimizer([gpuger_no_inplace], inplace=True)
...
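
A note on the local_inplace_gpuagemm change above: gpugemm_inplace destroys its C argument. When C comes from GpuAllocEmpty but has more than one client, overwriting it would corrupt the other clients' input, so the optimizer re-applies the same GpuAllocEmpty op to hand the gemm a private buffer. A numpy sketch of the aliasing hazard, mine rather than Theano code:

    import numpy

    def gemm_inplace(C, alpha, A, B, beta):
        # destructive reference semantics: C <- alpha * A.dot(B) + beta * C
        C *= beta
        C += alpha * numpy.dot(A, B)
        return C

    buf = numpy.zeros((2, 2), dtype='float32')  # stands in for a GpuAllocEmpty output
    other_client = buf                          # a second node reading the same buffer
    gemm_inplace(buf, 1.0, numpy.eye(2, dtype='float32'),
                 numpy.eye(2, dtype='float32'), 0.0)
    assert (other_client == buf).all()  # the second client sees the overwrite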
@@ -10,6 +10,7 @@ except ImportError:
 
 from theano import tensor, scalar, gof
 from theano.compile import optdb
+from theano.compile.ops import shape_i
 from theano.gof import (local_optimizer, EquilibriumDB,
                         SequenceDB, Optimizer, toolbox)
 from theano.gof.optdb import LocalGroupDB
@@ -25,9 +26,10 @@ from .basic_ops import (as_gpuarray_variable,
                         host_from_gpu, gpu_from_host,
                         HostFromGpu, GpuFromHost,
                         GpuSplit, GpuContiguous,
-                        gpu_alloc, GpuAlloc, GpuReshape,
+                        gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuReshape,
                         GpuEye, gpu_join, GpuJoin)
-from .blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
+from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
+                   gpugemm_no_inplace)
 from .conv import GpuConv
 from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                    GpuCrossentropySoftmax1HotWithBiasDx,
@@ -37,6 +39,7 @@ from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
 from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                         GpuAdvancedIncSubtensor1,
                         GpuAdvancedIncSubtensor1_dev20)
+from .opt_util import alpha_merge, output_merge
 
 gpu_optimizer = EquilibriumDB()
 gpu_cut_copies = EquilibriumDB()
@@ -603,6 +606,33 @@ def local_gpua_gemm(node):
     return GpuGemm(inplace=node.op.inplace)
 
+@register_opt('fast_compile')
+@op_lifter([tensor.basic.Dot])
+def local_gpua_hgemm(node):
+    from theano.sandbox.cuda import nvcc_compiler
+    if nvcc_compiler.nvcc_version < '7.5':
+        return
+    A = node.inputs[0]
+    B = node.inputs[1]
+    if (A.ndim == 2 and B.ndim == 2 and
+            A.dtype == 'float16' and B.dtype == 'float16'):
+        fgraph = node.inputs[0].fgraph
+        C = GpuAllocEmpty(dtype='float16')(shape_i(A, 0, fgraph),
+                                           shape_i(B, 1, fgraph))
+        return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
+
+@register_opt()
+@alpha_merge(GpuGemm, alpha_in=1, beta_in=2, nd=2)
+def local_gpuagemm_alpha_merge(node, *inputs):
+    return [gpugemm_no_inplace(*inputs)]
+
+@register_opt()
+@output_merge(GpuGemm, alpha_in=1, beta_in=2, out_in=0, nd=2)
+def local_gpuagemm_output_merge(node, *inputs):
+    return [gpugemm_no_inplace(*inputs)]
 
 @register_opt('fast_compile')
 @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
 def local_gpua_ger(node):
...
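
Two points about the new optimizers above. local_gpua_hgemm emits gemm(C, 1.0, A, B, 0.0) with C a fresh GpuAllocEmpty of shape (A.shape[0], B.shape[1]); since beta is 0, the uninitialized contents of C are never read. alpha_merge and output_merge (from the new .opt_util import) then fold neighboring scalar multiplies and additions into the gemm's alpha and C/beta slots. A numpy sketch of the algebraic identities these rewrites rely on, using reference semantics only:

    import numpy

    def gemm(C, alpha, A, B, beta):
        # reference semantics of GpuGemm: alpha * A.dot(B) + beta * C
        return alpha * numpy.dot(A, B) + beta * C

    A = numpy.random.random((3, 4)).astype('float32')
    B = numpy.random.random((4, 2)).astype('float32')
    C = numpy.random.random((3, 2)).astype('float32')
    s = 0.5

    # alpha_merge: a scalar multiply of the gemm output folds into alpha and beta
    assert numpy.allclose(s * gemm(C, 1.0, A, B, 1.0),
                          gemm(C, s * 1.0, A, B, s * 1.0))

    # output_merge: an addition on a beta == 0 gemm folds into the C/beta slot
    # (zeros stand in for the never-read GpuAllocEmpty buffer)
    empty = numpy.zeros_like(C)
    assert numpy.allclose(gemm(empty, 1.0, A, B, 0.0) + C,
                          gemm(C, 1.0, A, B, 1.0))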
 from unittest import TestCase
 
 from nose.plugins.skip import SkipTest
+import numpy
 
 import theano
 from theano import tensor
-from theano.tests import unittest_tools
+from theano.tests import unittest_tools as utt
 from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
                                 _dot22)
 from theano.tensor.tests.test_blas import TestGer, BaseGemv
@@ -15,7 +17,7 @@ from .test_basic_ops import (makeTester, rand,
 from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
                     gpugemm_inplace, gpugemm_no_inplace,
                     gpuger_inplace, gpuger_no_inplace,
-                    GpuGer, gpu_dot22)
+                    GpuGer, gpu_dot22, GpuGemm)
 
 GpuGemvTester = makeTester('GpuGemvTester',
@@ -31,7 +33,7 @@ GpuGemvTester = makeTester('GpuGemvTester',
 )
 
-class TestGpuSgemv(TestCase, BaseGemv, unittest_tools.TestOptimizationMixin):
+class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
     mode = mode_with_gpu
     dtype = 'float32'
@@ -92,7 +94,7 @@ class TestGpuSgerNoTransfer(TestGpuSger):
     shared = staticmethod(gpuarray_shared_constructor)
 
-class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
+class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
     def setUp(self):
         self.ops = [gpuger_no_inplace, gpuger_inplace]
@@ -115,3 +117,47 @@ GpuDot22Tester = makeTester(
 #                test9=[rand(0, 0), rand(0, 0)],
     )
 )
+
+
+def test_hgemm_swap():
+    from theano.sandbox.cuda import nvcc_compiler
+    if nvcc_compiler.nvcc_version < '7.5':
+        raise SkipTest("SgemmEx is only available on cuda 7.5+")
+
+    v = tensor.vector(dtype='float16')
+    m = tensor.matrix(dtype='float16')
+    m2 = tensor.matrix(dtype='float16')
+    m32 = tensor.matrix(dtype='float32')
+
+    # test that we don't try to replace anything but matrix x matrix in float16
+    f = theano.function([v, m], tensor.dot(v, m), mode=mode_with_gpu)
+    assert len([node for node in f.maker.fgraph.apply_nodes
+                if isinstance(node.op, GpuGemm)]) == 0
+
+    f = theano.function([m32, m], tensor.dot(m32, m), mode=mode_with_gpu)
+    assert len([node for node in f.maker.fgraph.apply_nodes
+                if isinstance(node.op, GpuGemm)]) == 0
+
+    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
+    assert len([node for node in f.maker.fgraph.apply_nodes
+                if isinstance(node.op, GpuGemm)]) == 1
+
+
+def test_hgemm_value():
+    from theano.sandbox.cuda import nvcc_compiler
+    if nvcc_compiler.nvcc_version < '7.5':
+        raise SkipTest("SgemmEx is only available on cuda 7.5+")
+
+    m = tensor.matrix(dtype='float16')
+    m2 = tensor.matrix(dtype='float16')
+
+    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
+    assert len([node for node in f.maker.fgraph.apply_nodes
+                if isinstance(node.op, GpuGemm)]) == 1
+
+    v1 = numpy.random.random((3, 4)).astype('float16')
+    v2 = numpy.random.random((4, 2)).astype('float16')
+
+    of = f(v1, v2)
+    on = numpy.dot(v1, v2)
+
+    utt.assert_allclose(of, on)
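
Both tests mirror the nvcc_version guard in local_gpua_hgemm and skip on older toolkits. To run them against the gpuarray backend, something like the following should work; the device flag and test path are my assumptions from the file's relative imports, not stated in the diff:

    THEANO_FLAGS=device=cuda nosetests theano/sandbox/gpuarray/tests/test_blas.py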