Commit 7e03d1a9 authored by Arnaud Bergeron

Add support for float16 mm product with libgpuarray.

Parent bae54705
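With this change, a float16 matrix-matrix dot gets lifted to GpuGemm on the GPU (cuBLAS SgemmEx, CUDA 7.5+). A minimal usage sketch, assuming a working libgpuarray GPU backend is configured (e.g. via the device flag); variable names are illustrative:

    import numpy
    import theano
    from theano import tensor

    A = tensor.matrix(dtype='float16')
    B = tensor.matrix(dtype='float16')
    # With the GPU backend active, dot(A, B) is rewritten into GpuGemm.
    f = theano.function([A, B], tensor.dot(A, B))

    a = numpy.random.random((3, 4)).astype('float16')
    b = numpy.random.random((4, 2)).astype('float16')
    print(f(a, b))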
@@ -4,10 +4,11 @@ from theano import Op, Apply, config
 from theano.compile import optdb
 from theano.gof import local_optimizer, LocalOptGroup
+from theano.tensor.basic import as_tensor_variable
 from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
 from theano.tensor.opt import in2out
-from .basic_ops import HideC, as_gpuarray_variable
+from .basic_ops import HideC, as_gpuarray_variable, GpuAllocEmpty
 try:
     import pygpu
@@ -112,8 +113,11 @@ gpugemv_inplace = GpuGemv(inplace=True)
 class GpuGemm(BlasOp, Gemm):
+    _f16_ok = True
+
     def make_node(self, C, alpha, A, B, beta):
-        res = Gemm.make_node(self, C, alpha, A, B, beta)
+        alpha = as_tensor_variable(alpha)
+        beta = as_tensor_variable(beta)
         A = as_gpuarray_variable(A)
         B = as_gpuarray_variable(B)
         C = as_gpuarray_variable(C)
@@ -296,7 +300,12 @@ def local_inplace_gpuagemv(node):
 @local_optimizer([gpugemm_no_inplace], inplace=True)
 def local_inplace_gpuagemm(node):
     if node.op == gpugemm_no_inplace:
-        return [gpugemm_inplace(*node.inputs)]
+        inputs = list(node.inputs)
+        C = inputs[0]
+        if (C.owner and isinstance(C.owner.op, GpuAllocEmpty) and
+                len(C.clients) > 1):
+            inputs[0] = C.owner.op(*C.owner.inputs)
+        return [gpugemm_inplace(*inputs)]

 @local_optimizer([gpuger_no_inplace], inplace=True)
......
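A note on the guard added above: gpugemm_inplace destroys its first input C. When C is produced by GpuAllocEmpty but is also read by other nodes, the optimizer re-applies the allocation so the inplace gemm gets a private buffer. A sketch of the pattern, reusing the names from the hunk (C is node.inputs[0]):

    # C.clients lists every apply node that consumes C; if the empty
    # buffer is shared, build a fresh one the inplace gemm may clobber.
    if (C.owner and isinstance(C.owner.op, GpuAllocEmpty) and
            len(C.clients) > 1):
        inputs[0] = C.owner.op(*C.owner.inputs)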
@@ -10,6 +10,7 @@ except ImportError:
 from theano import tensor, scalar, gof
 from theano.compile import optdb
+from theano.compile.ops import shape_i
 from theano.gof import (local_optimizer, EquilibriumDB,
                         SequenceDB, Optimizer, toolbox)
 from theano.gof.optdb import LocalGroupDB
@@ -25,9 +26,10 @@ from .basic_ops import (as_gpuarray_variable,
                         host_from_gpu, gpu_from_host,
                         HostFromGpu, GpuFromHost,
                         GpuSplit, GpuContiguous,
-                        gpu_alloc, GpuAlloc, GpuReshape,
+                        gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuReshape,
                         GpuEye, gpu_join, GpuJoin)
-from .blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
+from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
+                   gpugemm_no_inplace)
 from .conv import GpuConv
 from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                    GpuCrossentropySoftmax1HotWithBiasDx,
@@ -37,6 +39,7 @@ from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
 from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                         GpuAdvancedIncSubtensor1,
                         GpuAdvancedIncSubtensor1_dev20)
+from .opt_util import alpha_merge, output_merge

 gpu_optimizer = EquilibriumDB()
 gpu_cut_copies = EquilibriumDB()
@@ -603,6 +606,33 @@ def local_gpua_gemm(node):
     return GpuGemm(inplace=node.op.inplace)


+@register_opt('fast_compile')
+@op_lifter([tensor.basic.Dot])
+def local_gpua_hgemm(node):
+    from theano.sandbox.cuda import nvcc_compiler
+    if nvcc_compiler.nvcc_version < '7.5':
+        return
+    A = node.inputs[0]
+    B = node.inputs[1]
+    if (A.ndim == 2 and B.ndim == 2 and
+            A.dtype == 'float16' and B.dtype == 'float16'):
+        fgraph = node.inputs[0].fgraph
+        C = GpuAllocEmpty(dtype='float16')(shape_i(A, 0, fgraph),
+                                           shape_i(B, 1, fgraph))
+        return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
+
+
+@register_opt()
+@alpha_merge(GpuGemm, alpha_in=1, beta_in=2, nd=2)
+def local_gpuagemm_alpha_merge(node, *inputs):
+    return [gpugemm_no_inplace(*inputs)]
+
+
+@register_opt()
+@output_merge(GpuGemm, alpha_in=1, beta_in=2, out_in=0, nd=2)
+def local_gpuagemm_output_merge(node, *inputs):
+    return [gpugemm_no_inplace(*inputs)]
+
+
 @register_opt('fast_compile')
 @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
 def local_gpua_ger(node):
......
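The net effect of local_gpua_hgemm is to compute a float16 dot as a GEMM with alpha=1 and beta=0 writing into an uninitialized output buffer. Roughly, for A of shape (m, k) and B of shape (k, n), the rewritten graph looks like this sketch (not the optimizer's literal output):

    # Uninitialized float16 output buffer of shape (m, n).
    C = GpuAllocEmpty(dtype='float16')(m, n)
    # C <- 1.0 * dot(A, B) + 0.0 * C; beta=0 means C's garbage content is ignored.
    out = gpugemm_no_inplace(C, 1.0, A, B, 0.0)

The later local_inplace_gpuagemm pass then swaps in the destructive gpugemm_inplace, so the empty buffer is filled in place.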
 from unittest import TestCase
+from nose.plugins.skip import SkipTest
+import numpy
 import theano
 from theano import tensor
-from theano.tests import unittest_tools
+from theano.tests import unittest_tools as utt
 from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
                                 _dot22)
 from theano.tensor.tests.test_blas import TestGer, BaseGemv
@@ -15,7 +17,7 @@ from .test_basic_ops import (makeTester, rand,
 from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
                     gpugemm_inplace, gpugemm_no_inplace,
                     gpuger_inplace, gpuger_no_inplace,
-                    GpuGer, gpu_dot22)
+                    GpuGer, gpu_dot22, GpuGemm)

 GpuGemvTester = makeTester('GpuGemvTester',
@@ -31,7 +33,7 @@ GpuGemvTester = makeTester('GpuGemvTester',
 )

-class TestGpuSgemv(TestCase, BaseGemv, unittest_tools.TestOptimizationMixin):
+class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
     mode = mode_with_gpu
     dtype = 'float32'
@@ -92,7 +94,7 @@ class TestGpuSgerNoTransfer(TestGpuSger):
     shared = staticmethod(gpuarray_shared_constructor)

-class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
+class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
     def setUp(self):
         self.ops = [gpuger_no_inplace, gpuger_inplace]
@@ -115,3 +117,47 @@ GpuDot22Tester = makeTester(
 # test9=[rand(0, 0), rand(0, 0)],
     )
 )
+
+
+def test_hgemm_swap():
+    from theano.sandbox.cuda import nvcc_compiler
+    if nvcc_compiler.nvcc_version < '7.5':
+        raise SkipTest("SgemmEx is only available on cuda 7.5+")
+
+    v = tensor.vector(dtype='float16')
+    m = tensor.matrix(dtype='float16')
+    m2 = tensor.matrix(dtype='float16')
+    m32 = tensor.matrix(dtype='float32')
+
+    # test that we don't try to replace anything but matrix x matrix in float16
+    f = theano.function([v, m], tensor.dot(v, m), mode=mode_with_gpu)
+    assert len([node for node in f.maker.fgraph.apply_nodes
+                if isinstance(node.op, GpuGemm)]) == 0
+
+    f = theano.function([m32, m], tensor.dot(m32, m), mode=mode_with_gpu)
+    assert len([node for node in f.maker.fgraph.apply_nodes
+                if isinstance(node.op, GpuGemm)]) == 0
+
+    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
+    assert len([node for node in f.maker.fgraph.apply_nodes
+                if isinstance(node.op, GpuGemm)]) == 1
+
+
+def test_hgemm_value():
+    from theano.sandbox.cuda import nvcc_compiler
+    if nvcc_compiler.nvcc_version < '7.5':
+        raise SkipTest("SgemmEx is only available on cuda 7.5+")
+
+    m = tensor.matrix(dtype='float16')
+    m2 = tensor.matrix(dtype='float16')
+    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
+    assert len([node for node in f.maker.fgraph.apply_nodes
+                if isinstance(node.op, GpuGemm)]) == 1
+
+    v1 = numpy.random.random((3, 4)).astype('float16')
+    v2 = numpy.random.random((4, 2)).astype('float16')
+    of = f(v1, v2)
+    on = numpy.dot(v1, v2)
+
+    utt.assert_allclose(of, on)