Commit 175045d9 authored by Frédéric Bastien

Merge pull request #3355 from abergeron/hgemm

Enables float16 gemm on gpuarray when the cuda version supports it
import os.path
from theano import Op, Apply, config
from theano import Apply, config
from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.basic import as_tensor_variable
from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
from theano.tensor.opt import in2out
from .basic_ops import HideC, as_gpuarray_variable
from .basic_ops import HideC, as_gpuarray_variable, GpuAllocEmpty
try:
import pygpu
......@@ -51,7 +52,7 @@ PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out,
class GpuGemv(BlasOp, Gemv):
def make_node(self, y, alpha, A, x, beta):
res = Gemv.make_node(self, y, alpha, A, x, beta)
Gemv.make_node(self, y, alpha, A, x, beta)
A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
......@@ -112,8 +113,11 @@ gpugemv_inplace = GpuGemv(inplace=True)
class GpuGemm(BlasOp, Gemm):
_f16_ok = True
def make_node(self, C, alpha, A, B, beta):
res = Gemm.make_node(self, C, alpha, A, B, beta)
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
A = as_gpuarray_variable(A)
B = as_gpuarray_variable(B)
C = as_gpuarray_variable(C)
......@@ -176,7 +180,7 @@ gpugemm_inplace = GpuGemm(inplace=True)
class GpuGer(BlasOp, Ger):
def make_node(self, A, alpha, x, y):
res = Ger.make_node(self, A, alpha, x, y)
Ger.make_node(self, A, alpha, x, y)
A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
......@@ -236,7 +240,7 @@ gpuger_inplace = GpuGer(destructive=True)
class GpuDot22(BlasOp, Dot22):
def make_node(self, x, y):
res = Dot22.make_node(self, x, y)
Dot22.make_node(self, x, y)
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
assert x.dtype == y.dtype
......@@ -287,6 +291,7 @@ class GpuDot22(BlasOp, Dot22):
gpu_dot22 = GpuDot22()
@local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node):
if node.op == gpugemv_no_inplace:
......@@ -296,7 +301,12 @@ def local_inplace_gpuagemv(node):
@local_optimizer([gpugemm_no_inplace], inplace=True)
def local_inplace_gpuagemm(node):
    """Replace a non-inplace GpuGemm with its inplace variant.

    Diff-residue fix: the old unconditional ``return
    [gpugemm_inplace(*node.inputs)]`` left above the new body made the
    replacement logic unreachable; only the new body is kept.
    """
    if node.op == gpugemm_no_inplace:
        inputs = list(node.inputs)
        C = inputs[0]
        # If the output buffer C comes from a GpuAllocEmpty that has other
        # clients, duplicate the allocation so the inplace gemm does not
        # clobber a buffer someone else still reads.
        if (C.owner and isinstance(C.owner.op, GpuAllocEmpty) and
                len(C.clients) > 1):
            inputs[0] = C.owner.op(*C.owner.inputs)
        return [gpugemm_inplace(*inputs)]
@local_optimizer([gpuger_no_inplace], inplace=True)
......@@ -304,9 +314,11 @@ def local_inplace_gpuager(node):
if node.op == gpuger_no_inplace:
return [gpuger_inplace(*node.inputs)]
# Build the inplace-substitution pass for the gpuarray BLAS ops and register
# it late (position 70.0) so it runs after the main transfer optimizations.
# Diff-residue fix: the old, unterminated in2out(...) call that preceded the
# new one was a syntax error and is removed.
gpuablas_opt_inplace = in2out(LocalOptGroup(local_inplace_gpuagemv,
                                            local_inplace_gpuagemm,
                                            local_inplace_gpuager),
                              name='gpuablas_opt_inplace')
optdb.register('InplaceGpuaBlasOpt',
               gpuablas_opt_inplace,
               70.0, 'fast_run', 'inplace', 'gpuarray')
import copy
import theano
import numpy
import logging
from six.moves import xrange
try:
......@@ -8,8 +8,10 @@ try:
except ImportError:
pass
import theano
from theano import tensor, scalar, gof
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB
......@@ -25,9 +27,10 @@ from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host,
HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous,
gpu_alloc, GpuAlloc, GpuReshape,
gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin)
from .blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace)
from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
......@@ -38,6 +41,9 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge
_logger = logging.getLogger("theano.sandbox.gpuarray.opt")
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
......@@ -619,6 +625,37 @@ def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace)
@register_opt('fast_compile')
@op_lifter([tensor.basic.Dot])
def local_gpua_hgemm(node):
    """Lift a float16 matrix-matrix Dot to a GpuGemm on the gpuarray backend.

    Only fires for 2d x 2d float16 inputs and when CUDA >= 7.5 is
    available (required for the half-precision gemm kernel).
    """
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
        # NOTE(review): this is a lexicographic string comparison — it is
        # correct for '7.0'/'7.5' but would misorder e.g. '10.0'; confirm
        # the version format before relying on it for newer CUDA releases.
        _logger.warning("Not performing dot of float16 on the GPU since "
                        "cuda 7.5 is not available. Updating could speed up "
                        "your code.")
        return
    lhs, rhs = node.inputs[0], node.inputs[1]
    # Guard clauses: bail out unless both operands are float16 matrices.
    if lhs.ndim != 2 or rhs.ndim != 2:
        return
    if lhs.dtype != 'float16' or rhs.dtype != 'float16':
        return
    fgraph = lhs.fgraph
    # Allocate an uninitialized float16 output of shape (lhs rows, rhs cols)
    # and compute C = 1.0 * lhs @ rhs + 0.0 * C with the non-inplace gemm.
    out = GpuAllocEmpty(dtype='float16')(shape_i(lhs, 0, fgraph),
                                         shape_i(rhs, 1, fgraph))
    return gpugemm_no_inplace(out, 1.0, lhs, rhs, 0.0)
@register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4, nd=2)
def local_gpuagemm_alpha_merge(node, *inputs):
    """Fold a scalar multiplication of a GpuGemm result into its alpha."""
    merged = gpugemm_no_inplace(*inputs)
    return [merged]
@register_opt()
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0, nd=2)
def local_gpuagemm_output_merge(node, *inputs):
    """Fold an addition to a GpuGemm result into the gemm's output/beta."""
    merged = gpugemm_no_inplace(*inputs)
    return [merged]
@register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node):
......
......@@ -73,7 +73,8 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
lr = grab_cpu_scalar(node.inputs[0], nd=nd)
else:
lr = grab_cpu_scalar(node.inputs[1], nd=nd)
if lr is None or targ is None:
if (lr is None or targ is None or
lr.dtype != targ.outputs[0].dtype):
return None
inputs = list(targ.inputs)
try:
......@@ -110,6 +111,8 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd):
W = node.inputs[0]
if targ is None:
return None
if W.dtype != targ.outputs[0].dtype:
return None
if not is_equal(targ.inputs[beta_in], 0.0):
# other cases are too complex for now
return None
......
from unittest import TestCase
from nose.plugins.skip import SkipTest
import numpy
import theano
from theano import tensor
from theano.tests import unittest_tools
from theano.tests import unittest_tools as utt
from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
_dot22)
from theano.tensor.tests.test_blas import TestGer, BaseGemv
......@@ -15,7 +17,7 @@ from .test_basic_ops import (makeTester, rand,
from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22)
GpuGer, gpu_dot22, GpuGemm)
GpuGemvTester = makeTester('GpuGemvTester',
......@@ -31,7 +33,7 @@ GpuGemvTester = makeTester('GpuGemvTester',
)
class TestGpuSgemv(TestCase, BaseGemv, unittest_tools.TestOptimizationMixin):
class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
mode = mode_with_gpu
dtype = 'float32'
......@@ -92,7 +94,7 @@ class TestGpuSgerNoTransfer(TestGpuSger):
shared = staticmethod(gpuarray_shared_constructor)
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
def setUp(self):
self.ops = [gpuger_no_inplace, gpuger_inplace]
......@@ -115,3 +117,50 @@ GpuDot22Tester = makeTester(
# test9=[rand(0, 0), rand(0, 0)],
)
)
def test_hgemm_swap():
    """Check that dot() is lifted to GpuGemm only for float16 x float16
    matrix-matrix products, and that the result matches numpy.

    Fixes a typo in the SkipTest message ("avaialble") and replaces the
    repeated ``len([...]) == 0`` list-building with a counting helper.
    """
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
        raise SkipTest("SgemmEx is only available on cuda 7.5+")

    v = tensor.vector(dtype='float16')
    m = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    m32 = tensor.matrix(dtype='float32')

    def num_gemm(fn):
        # Number of GpuGemm apply nodes in the compiled graph.
        return sum(isinstance(node.op, GpuGemm)
                   for node in fn.maker.fgraph.apply_nodes)

    # vector x matrix: must not be replaced.
    f = theano.function([v, m], tensor.dot(v, m), mode=mode_with_gpu)
    assert num_gemm(f) == 0
    # mixed float32 x float16: must not be replaced.
    f = theano.function([m32, m], tensor.dot(m32, m), mode=mode_with_gpu)
    assert num_gemm(f) == 0
    # float16 matrix x matrix: exactly one GpuGemm.
    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
    assert num_gemm(f) == 1

    v1 = numpy.random.random((3, 4)).astype('float16')
    v2 = numpy.random.random((4, 2)).astype('float16')
    of = f(v1, v2)
    on = numpy.dot(v1, v2)
    utt.assert_allclose(of, on)
def test_hgemm_alpha_output_merge():
    """Check that alpha_merge/output_merge collapse the scale and the
    addition around a float16 dot into a single GpuGemm node.

    Fixes a typo in the SkipTest message ("avaialble").
    """
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
        raise SkipTest("SgemmEx is only available on cuda 7.5+")

    m1 = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    b = tensor.matrix(dtype='float16')
    hgemm = numpy.asarray(0.05, dtype='float16') * (tensor.dot(m1, m2) + b)
    f = theano.function([m1, m2, b], hgemm, mode=mode_with_gpu)
    # Expected graph: 3 gpu_from_host + 1 hgemm + 1 host_from_gpu = 5 nodes.
    assert len(f.maker.fgraph.apply_nodes) == 5
......@@ -36,7 +36,7 @@ class GpuArrayType(Type):
return self.__class__(dtype=dtype, broadcastable=broadcastable,
name=self.name)
def __repr__(self):
    """Return a readable representation of this type, e.g.
    ``GpuArrayType(float32, (False, False))``.

    Diff-residue fix: the old ``def __str__(self):`` header that was left
    stacked above the renamed ``__repr__`` produced a bodiless/nested def
    and is removed.
    """
    return "GpuArrayType(%s, %s)" % (self.dtype, self.broadcastable)
def filter(self, data, strict=False, allow_downcast=None):
......
......@@ -162,7 +162,6 @@ whitelist_flake8 = [
"sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/blas.py",
"sandbox/gpuarray/kernel_codegen.py",
"sandbox/gpuarray/conv.py",
"sandbox/gpuarray/neighbours.py",
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论