提交 7f7749d2 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Add nervana gemm (for float16).

上级 804f9114
...@@ -29,7 +29,7 @@ AddConfigVar('gpuarray.sync', ...@@ -29,7 +29,7 @@ AddConfigVar('gpuarray.sync',
# This is for documentation not to depend on the availability of pygpu # This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor) GpuArraySharedVariable, gpuarray_shared_constructor)
from . import opt from . import opt, nerv
def init_dev(dev): def init_dev(dev):
......
import numpy
import theano
from theano import Op, Apply, Variable, tensor
from theano.compile import optdb
from theano.gof import local_optimizer
from theano.scalar import as_scalar, constant
from . import opt
from .basic_ops import (as_gpuarray_variable, gpu_alloc, gpu_from_host,
host_from_gpu)
from .opt_util import alpha_merge, output_merge
from .pycuda_helper import ensure_pycuda_context
# nervanagpu is an optional dependency providing the float16 GEMM
# kernels.  When it is missing, BOTH names are set to None so that
# make_node() can raise a clear RuntimeError instead of later code
# failing with a NameError on `nerv`.
try:
    from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
    nerv = NervanaGPU()
except ImportError:
    GPUTensor = None
    nerv = None
def to_gputensor(a):
    """Wrap the contiguous pygpu array *a* in a nervanagpu GPUTensor.

    The returned tensor aliases *a*'s device memory (no copy is made);
    F-contiguous inputs are flagged as transposed.
    """
    assert a.flags.c_contiguous or a.flags.f_contiguous
    transposed = a.flags.f_contiguous
    return GPUTensor(a.shape, dtype=a.dtype, base=a,
                     gpudata=a.gpudata + a.offset,
                     strides=a.strides, is_trans=transposed)
def ensure_float(val, name):
    """Return *val* as a float32 theano scalar Variable.

    Plain python values are wrapped in a constant and 0-d variables are
    lowered to scalars first.  Raises TypeError (mentioning *name*) when
    the result is not a float32 scalar.
    """
    if not isinstance(val, Variable):
        val = constant(val)
    if hasattr(val, 'ndim') and val.ndim == 0:
        val = as_scalar(val)
    if not isinstance(val.type, theano.scalar.Scalar):
        raise TypeError("%s: expected a scalar value" % (name,))
    if val.type.dtype != 'float32':
        raise TypeError("%s: type is not float32" % (name,))
    return val
class Gemm16(Op):
    """float16 matrix multiply backed by the nervanagpu kernels.

    Computes ``C <- alpha * dot(A, B) + beta * C``, optionally writing
    into C in place.  ``relu=True`` is not supported yet.
    """
    __props__ = ('relu', 'inplace')
    _f16_ok = True

    def __init__(self, relu=False, inplace=False):
        self.relu = relu
        # relu = True will require more work in optimizations.
        assert self.relu == False
        self.inplace = inplace
        if self.inplace:
            # Declare that output 0 destroys input 0 (C).
            self.destroy_map = {0: [0]}

    def make_node(self, C, alpha, A, B, beta):
        """Build the Apply node; all matrix inputs must be float16."""
        if GPUTensor is None:
            raise RuntimeError("Can't use Gemm16: nervanagpu not found")
        A = as_gpuarray_variable(A)
        B = as_gpuarray_variable(B)
        C = as_gpuarray_variable(C)
        alpha = ensure_float(alpha, 'alpha')
        beta = ensure_float(beta, 'beta')
        assert C.dtype == A.dtype == B.dtype == 'float16'
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def perform(self, node, inputs, outputs):
        # Needed for its side effect: make sure a pycuda context is live.
        ensure_pycuda_context()
        C, alpha, A, B, beta = inputs
        # The nervana code does not support the case where both inputs
        # are trans (F-contiguous), so copy one of them if that is the
        # case.  We copy the smaller one.
        if A.flags.f_contiguous and B.flags.f_contiguous:
            if A.size < B.size:
                A = A.copy()
            else:
                B = B.copy()
        # Only write into C directly when the op is inplace AND C is
        # contiguous (either order); otherwise operate on a copy.
        if not (self.inplace and C.flags.forc):
            C = C.copy()
        nerv.dot(to_gputensor(A), to_gputensor(B), to_gputensor(C),
                 alpha=alpha, beta=beta, relu=False)
        outputs[0][0] = C
@opt.register_opt()
@local_optimizer([tensor.Dot])
def local_dot_to_gemm16(node):
    """Replace a float16 matrix-matrix tensor.Dot with Gemm16 on GPU."""
    # Exact type check: subclasses of Dot are handled elsewhere.
    if type(node.op) != tensor.Dot:
        return
    x, y = node.inputs
    if x.dtype != 'float16' or y.dtype != 'float16':
        return
    if x.ndim != 2 or y.ndim != 2:
        return
    A = gpu_from_host(x)
    B = gpu_from_host(y)
    # Fresh zero output; with beta=0 Gemm16 overwrites it entirely.
    C = gpu_alloc(numpy.asarray(0, dtype='float16'),
                  A.shape[0], B.shape[1])
    return [host_from_gpu(Gemm16()(C, 1.0, A, B, 0.0))]
@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2)
def local_gemm16_alpha_merge(node, *inputs):
    """Fold a scalar multiply of the output into Gemm16's alpha/beta."""
    new_op = Gemm16(relu=node.op.relu)
    return [new_op(*inputs)]
@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2)
def local_gemm16_output_merge(node, *inputs):
    """Fold an addition on the output into Gemm16's C input."""
    new_op = Gemm16(relu=node.op.relu)
    return [new_op(*inputs)]
@local_optimizer([Gemm16], inplace=True)
def local_gemm16_inplace(node):
    """Swap a non-inplace Gemm16 for its destructive counterpart."""
    op = node.op
    if type(op) != Gemm16 or op.inplace:
        return
    return [Gemm16(relu=op.relu, inplace=True)(*node.inputs)]
# Register the inplace substitution as its own in2out pass at position
# 70.0 (presumably late enough to run after the merge optimizations
# above -- confirm against optdb ordering), enabled under fast_run.
optdb.register('local_gemm16_inplace',
               tensor.opt.in2out(local_gemm16_inplace,
                                 name='local_gemm16_inplace'),
               70.0, 'fast_run', 'inplace', 'gpuarray')
from functools import wraps
import numpy
import theano
from theano import scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, host_from_gpu
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
def grab_cpu_scalar(v, nd):
    """Return the CPU scalar broadcast into the *nd*-dim variable *v*.

    Recognizes a scalar dimshuffled up to *nd* dimensions (CPU or GPU),
    possibly behind a host->GPU transfer, or an all-broadcastable
    Constant.  Returns None when *v* does not match any of these.
    """
    bcast_order = ('x',) * nd
    node = v.owner
    if node is None:
        # Leaf variable: only a fully-broadcastable constant qualifies.
        if isinstance(v, Constant) and v.broadcastable == (True,) * nd:
            return v.dimshuffle(())
        return None
    op = node.op
    # Check GpuDimShuffle before DimShuffle on purpose: the GPU variant
    # needs a transfer back to the host.
    if isinstance(op, GpuDimShuffle) and op.new_order == bcast_order:
        return host_from_gpu(node.inputs[0])
    if isinstance(op, DimShuffle) and op.new_order == bcast_order:
        return node.inputs[0]
    if isinstance(op, GpuFromHost):
        # Look through the transfer and retry on the host-side input.
        return grab_cpu_scalar(node.inputs[0], nd=nd)
    return None
def find_node(v, cls):
    """Return the Apply node of op class *cls* that computes *v*.

    Digs through redundant GPU->host->GPU transfer pairs on the way.
    Returns None when no such node is found.
    """
    node = v.owner
    if node is None:
        return None
    if isinstance(node.op, cls):
        return node
    if isinstance(node.op, GpuFromHost):
        inner = node.inputs[0].owner
        if inner is not None and isinstance(inner.op, HostFromGpu):
            # Redundant round trip: keep searching below it.
            return find_node(inner.inputs[0], cls)
    return None
def is_equal(var, val):
    """Return True iff *var* is constant and always equal to *val*.

    A non-constant *var* yields False (never raises).
    """
    try:
        return get_scalar_constant_value(var) == val
    except NotScalarConstantError:
        return False
def alpha_merge(cls, alpha_in, beta_in, nd):
    """Decorator factory folding ``scalar * <cls output>`` into the op.

    The produced decorator wraps *maker* in a GpuElemwise local
    optimizer: when a broadcasted scalar multiplies the output of a
    *cls* node, the scalar is multiplied into the op's inputs at index
    *alpha_in* and *beta_in*.  This is valid because
    ``lr * (alpha*X + beta*Y) == (lr*alpha)*X + (lr*beta)*Y``.
    *nd* is the rank of the elemwise operands.
    """
    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(node):
            # Only a two-input elemwise multiplication can match.
            if (isinstance(node.op, GpuElemwise) and
                    node.op.scalar_op == scal.mul and
                    node.nin == 2):
                # The cls node and the scalar may be on either side.
                targ = find_node(node.inputs[0], cls)
                if targ is None:
                    targ = find_node(node.inputs[1], cls)
                    lr = grab_cpu_scalar(node.inputs[0], nd=nd)
                else:
                    lr = grab_cpu_scalar(node.inputs[1], nd=nd)
                if lr is None or targ is None:
                    return None
                inputs = list(targ.inputs)
                # Scale both alpha and beta by the merged scalar.
                inputs[alpha_in] = lr * targ.inputs[alpha_in]
                inputs[beta_in] = lr * targ.inputs[beta_in]
                return maker(targ, *inputs)
        return opt
    return wrapper
def output_merge(cls, alpha_in, beta_in, out_in, nd):
    """Decorator factory folding ``W + <cls output>`` into the op.

    The produced decorator wraps *maker* in a GpuElemwise local
    optimizer: for an addition of W to the output of a *cls* node whose
    beta input (index *beta_in*) is currently 0, W becomes the op's
    output input (index *out_in*) and beta is replaced by 1.
    *nd* is the rank of the elemwise operands.
    """
    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(node):
            # Only a two-input elemwise addition can match.
            if (isinstance(node.op, GpuElemwise) and
                    node.op.scalar_op == scal.add and
                    node.nin == 2):
                # The cls node and W may be on either side of the add.
                targ = find_node(node.inputs[0], cls)
                W = node.inputs[1]
                if targ is None:
                    targ = find_node(node.inputs[1], cls)
                    W = node.inputs[0]
                if targ is None:
                    return None
                if not is_equal(targ.inputs[beta_in], 0.0):
                    # other cases are too complex for now
                    return None
                if W.broadcastable != targ.inputs[out_in].broadcastable:
                    # Would need to explicitly tile the output to fill
                    # the full shape here. Disable for now.
                    return None
                inputs = list(targ.inputs)
                inputs[out_in] = W
                inputs[beta_in] = _one.clone()
                return maker(targ, *inputs)
        return opt
    return wrapper
# PyCUDA is optional, and we additionally require a version recent
# enough to provide Context.attach(); older versions are treated the
# same as a missing install.
try:
    from pycuda.driver import Context
    if not hasattr(Context, 'attach'):
        raise ImportError('too old')
except ImportError:
    Context = None

# Module-level state for the lazily-attached pycuda context
# (see ensure_pycuda_context below).
pycuda_initialized = False
pycuda_context = None
def ensure_pycuda_context():
    """Attach pycuda to the current CUDA context (once) and return it.

    Raises RuntimeError when pycuda is missing or lacks Context.attach.
    Subsequent calls return the cached context.
    """
    global pycuda_context, pycuda_initialized
    if pycuda_initialized:
        return pycuda_context
    if Context is None:
        raise RuntimeError("PyCUDA not found or too old.")
    pycuda_context = Context.attach()
    # Detach at interpreter shutdown to balance the attach above.
    import atexit
    atexit.register(pycuda_context.detach)
    pycuda_initialized = True
    return pycuda_context
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论