提交 808f855b authored 作者: notoraptor's avatar notoraptor

Remove module `nerv`.

上级 13ff40a3
...@@ -22,9 +22,6 @@ Blas Op ...@@ -22,9 +22,6 @@ Blas Op
.. automodule:: theano.gpuarray.blas .. automodule:: theano.gpuarray.blas
:members: :members:
.. automodule:: theano.gpuarray.nerv
:members:
Elemwise Op Elemwise Op
=========== ===========
......
...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, ...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor, GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined) reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction, rng_mrg from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg
def transfer(x, target): def transfer(x, target):
......
#section init_code_struct
/*
 * Per-struct initialisation: allocate a one-dimensional GPU buffer of
 * 2048 * 32 GA_UINT elements in the context given by PARAMS and keep it
 * in rand_buf.  FAIL is taken when the allocation returns NULL.
 */
/* Why do we need this? */
/*
 * NOTE(review): gemm16() passes the device pointer of this buffer as the
 * first kernel argument; presumably the kernels use it as random state
 * for rounding -- confirm against the nervanagpu kernel sources.
 */
size_t dim = 2048 * 32;
rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, PARAMS,
Py_None);
if (rand_buf == NULL) {
FAIL;
}
#section support_code_struct
/* Buffer allocated in the init_code_struct section above. */
PyGpuArrayObject *rand_buf;
/*
 * Launch one of the precompiled hgemm kernels on float16 matrices.
 *
 * C      matrix used to initialise the output (aliased when the op is
 *        in place and C is C-contiguous, copied otherwise).
 * alpha  scalar forwarded to the kernel.
 * A, B   input matrices; a Fortran-ordered input is treated as the
 *        transposed operand ('t'), anything else as 'n'.
 * beta   scalar forwarded to the kernel.
 * out    in/out pointer to the result array; on success *out holds a
 *        reference to the array the kernel wrote to.
 * c      context object; not used by this function (the context is read
 *        from A instead).
 *
 * Returns 0 on success and 1 on failure.  Every failure detected directly
 * in this function sets a Python exception; failures reported by
 * pygpu_copy / theano_try_copy are assumed to have set one already --
 * TODO confirm.
 */
int gemm16(PyGpuArrayObject *C, float alpha,
           PyGpuArrayObject *A, PyGpuArrayObject *B,
           float beta, PyGpuArrayObject **out,
           PyGpuContextObject *c) {
  /*
   * All locals are declared (and the ones read in cleanup initialised)
   * up front so that no `goto cleanup` jumps over an initialisation.
   */
  PyGpuArrayObject *_A = NULL;
  PyGpuArrayObject *_B = NULL;
  /* NULL means "no kernel matched"; checked before the launch. */
  GpuKernel *gk = NULL;
  char *prand, *pA, *pB, *pout;
  void *params[13];
  size_t grid[2];
  size_t threads[2];
  int res = 0;
  int flags = 0;
  int lda, ldb, ldc, n, m, k;
  int n128, n64;
  int size = 0;
  int vec = 0;
  /* Cached across calls; 0 means "not queried yet". */
  static unsigned int nprocs = 0;
  char opA, opB;

  if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY) &&
      GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
    /*
     * The nervana kernels do not cover the case where both inputs are
     * trans so we need to copy one of them. We choose the smallest
     * one.
     */
    if (PyGpuArray_DIM(A, 0) * PyGpuArray_DIM(A, 1) <
        PyGpuArray_DIM(B, 0) * PyGpuArray_DIM(B, 1)) {
      _A = pygpu_copy(A, GA_C_ORDER);
      if (_A == NULL) {
        res = 1;
        goto cleanup;
      }
      /*
       * This is not an extra reference on _A so don't add an INCREF.
       * Also, we don't lose the ref on A since our caller will deal
       * with it.
       */
      A = _A;
    } else {
      _B = pygpu_copy(B, GA_C_ORDER);
      if (_B == NULL) {
        res = 1;
        goto cleanup;
      }
      /*
       * This is not an extra reference on _B so don't add an INCREF.
       * Also, we don't lose the ref on B since our caller will deal
       * with it.
       */
      B = _B;
    }
  }

  /* Set up the output: alias C when allowed, otherwise copy it. */
  if (GEMM16_INPLACE && GpuArray_CHKFLAGS(&C->ga, GA_CARRAY)) {
    Py_XDECREF(*out);
    *out = C;
    Py_INCREF(*out);
  } else {
    *out = theano_try_copy(*out, C);
    if (*out == NULL) {
      res = 1;
      goto cleanup;
    }
  }

  /* Pick the operand layout flag and the matching leading stride. */
  if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY)) {
    opA = 't';
    lda = PyGpuArray_STRIDE(A, 1);
  } else {
    opA = 'n';
    lda = PyGpuArray_STRIDE(A, 0);
  }
  if (GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
    opB = 't';
    ldb = PyGpuArray_STRIDE(B, 1);
  } else {
    opB = 'n';
    ldb = PyGpuArray_STRIDE(B, 0);
  }
  ldc = PyGpuArray_STRIDE(*out, 0);

  /* lda and friends are in number of elements, not bytes */
  lda /= 2;
  ldb /= 2;
  ldc /= 2;

  m = PyGpuArray_DIM(*out, 0);
  n = PyGpuArray_DIM(*out, 1);
  k = PyGpuArray_DIM(B, 0);

  /* Tuning code adapted from the python version */
  grid[0] = (m + 127) / 128;

  if (opA == 'n' && opB == 't') {
    size = 128;
  } else {
    if (n < 384 - 16) {
      n128 = n % 128;
      if (n128 < 112) {
        if (48 < n128 && n128 <= 64) {
          n64 = n / 64;
          if (nprocs == 0) {
            /*
             * Query the processor count once.  A failed query, or a
             * query reporting zero processors (which would divide by
             * zero below), is reported as a Python error instead of
             * returning a bare failure code.
             */
            if (gpucontext_property(A->context->ctx,
                                    GA_CTX_PROP_NUMPROCS,
                                    &nprocs) != GA_NO_ERROR ||
                nprocs == 0) {
              nprocs = 0;
              PyErr_SetString(PyExc_RuntimeError,
                              "could not query the number of processors "
                              "of the GPU context");
              res = 1;
              goto cleanup;
            }
          }
          n64 *= (grid[0] / nprocs);
          if (n64 > 1 || (opA == 't' && opB == 'n'))
            size = 64;
          else
            size = 32;
        } else {
          size = 32;
        }
      } else {
        size = 128;
      }
    } else {
      size = 128;
    }
  }

  grid[1] = (n + (size - 1)) / size;
  if (size == 128)
    threads[0] = 256;
  else
    threads[0] = 128;
  threads[1] = 1;

  /* The "vec" kernel variants are only used for these alignments. */
  if ((opA == 't' && opB == 'n' && m % 8 == 0 && n % 8 == 0) ||
      (opA == 'n' && opB == 'n' && k % 16 == 0 && n % 8 == 0) ||
      (opA == 'n' && opB == 't' && k % 16 == 0))
    vec = 1;

  switch (size) {
  case 128:
    if (opA == 'n' && opB == 'n') {
      if (vec)
        gk = &k_nn_vec_128x128;
      else
        gk = &k_nn_128x128;
    } else if (opA == 'n' && opB == 't') {
      if (vec)
        gk = &k_nt_vec_128x128;
      else
        gk = &k_nt_128x128;
    } else if (opA == 't' && opB == 'n') {
      if (vec)
        gk = &k_tn_vec_128x128;
      else
        gk = &k_tn_128x128;
    }
    break;
  case 64:
    if (opA == 'n' && opB == 'n') {
      if (vec)
        gk = &k_nn_vec_128x64;
      else
        gk = &k_nn_128x64;
    } else if (opA == 't' && opB == 'n') {
      if (vec)
        gk = &k_tn_vec_128x64;
      else
        gk = &k_tn_128x64;
    }
    break;
  case 32:
    if (opA == 'n' && opB == 'n') {
      if (vec)
        gk = &k_nn_vec_128x32;
      else
        gk = &k_nn_128x32;
    } else if (opA == 't' && opB == 'n') {
      if (vec)
        gk = &k_tn_vec_128x32;
      else
        gk = &k_tn_128x32;
    }
    break;
  default:
    break;
  }

  /*
   * Covers both an unexpected tile size and a layout combination with
   * no kernel for the chosen size; never launch through an unset pointer.
   */
  if (gk == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "error selecting kernel");
    res = 1;
    goto cleanup;
  }

  /* Resolve the device pointers (base pointer plus byte offset). */
  prand = *((char **)rand_buf->ga.data);
  prand += rand_buf->ga.offset;
  pA = *((char **)A->ga.data);
  pA += A->ga.offset;
  pB = *((char **)B->ga.data);
  pB += B->ga.offset;
  pout = *((char **)(*out)->ga.data);
  pout += (*out)->ga.offset;

  /* Argument order must match the 13-entry type list used at init. */
  params[0] = &prand;
  params[1] = &pA;
  params[2] = &pB;
  params[3] = &pout;
  params[4] = &lda;
  params[5] = &ldb;
  params[6] = &ldc;
  params[7] = &m;
  params[8] = &n;
  params[9] = &k;
  params[10] = &alpha;
  params[11] = &beta;
  params[12] = &flags;

  if (GpuKernel_call(gk, 2, grid, threads, 0, params) != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call");
    res = 1;
  }

cleanup:
  /* Only the temporary C-ordered copies are owned by this function. */
  Py_XDECREF(_A);
  Py_XDECREF(_B);
  return res;
}
from __future__ import absolute_import, print_function, division from nose.plugins.skip import SkipTest
import os.path
import theano
from theano import Apply, Variable, tensor
from theano.compile import optdb raise SkipTest("You are importing theano.gpuarray.nerv. "
from theano.compile.ops import shape_i "This module was removed as it was based on nervanagpu that is now deprecated. "
from theano.gof import local_optimizer, COp "To still get this module, use Theano 0.9. "
from theano.scalar import as_scalar, constant "More info about nervanagpu here: https://github.com/NervanaSystems/nervanagpu "
"(viewed on 2017/07/05).")
from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name)
from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge
# nervanagpu is an optional dependency.  When it cannot be imported both
# module-level handles are set to None so the rest of the module can detect
# its absence.
try:
    from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
    nerv = NervanaGPU()
except ImportError:
    GPUTensor = nerv = None
def to_gputensor(a):
    """
    Wrap the array `a` in a nervanagpu `GPUTensor` that shares its data.

    `a` must be C- or Fortran-contiguous; a Fortran-contiguous array is
    flagged as transposed.
    """
    layout = a.flags
    assert layout.c_contiguous or layout.f_contiguous
    return GPUTensor(
        a.shape,
        dtype=a.dtype,
        base=a,
        gpudata=a.gpudata + a.offset,
        strides=a.strides,
        is_trans=layout.f_contiguous,
    )
def ensure_float(val, name):
    """
    Return `val` as a float32 scalar variable.

    Non-variables are turned into constants and 0-d values are converted
    with `as_scalar`.  `name` is only used in the error messages.

    Raises
    ------
    TypeError
        If the result is not a scalar, or its dtype is not float32.
    """
    if not isinstance(val, Variable):
        val = constant(val)
    if getattr(val, 'ndim', None) == 0:
        val = as_scalar(val)

    val_type = val.type
    if not isinstance(val_type, theano.scalar.Scalar):
        raise TypeError("%s: expected a scalar value" % (name,))
    if val_type.dtype != 'float32':
        raise TypeError("%s: type is not float32" % (name,))
    return val
class Gemm16(COp):
    """
    Gemm for float16 using the nervana kernels.

    The C implementation lives in ``gemm16.c``; this class embeds the
    precompiled ``hgemm_*.cubin`` kernels shipped with nervanagpu into the
    generated module and initialises one ``GpuKernel`` per entry of
    `KERN_NAMES`.

    Parameters
    ----------
    relu
        Must be False; relu = True is not supported yet.
    inplace
        If True, the op is allowed to overwrite its first input.

    """
    __props__ = ('relu', 'inplace')
    _f16_ok = True
    params_type = gpu_context_type
    # Suffixes of the hgemm_<name>.cubin files / k_<name> kernel objects.
    KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
                  'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
                  'tn_128x128', 'tn_128x64', 'tn_128x32',
                  'tn_vec_128x128', 'tn_vec_128x64', 'tn_vec_128x32',
                  'tn_vec_128x16', 'nt_128x128', 'nt_vec_128x128')

    def __init__(self, relu=False, inplace=False):
        COp.__init__(self, ["gemm16.c"], "gemm16")
        self.relu = relu
        # relu = True will require more work in optimizations.
        assert self.relu is False
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, C, alpha, A, B, beta):
        """
        Build the Apply node; the inputs are ordered (C, alpha, A, B, beta).

        Raises RuntimeError when nervanagpu could not be imported.
        """
        if GPUTensor is None:
            raise RuntimeError("Can't use Gemm16: nervanagpu not found")
        ctx_name = infer_context_name(C, A, B)
        A = as_gpuarray_variable(A, ctx_name)
        B = as_gpuarray_variable(B, ctx_name)
        C = as_gpuarray_variable(C, ctx_name)
        alpha = ensure_float(alpha, 'alpha')
        beta = ensure_float(beta, 'beta')
        assert C.dtype == A.dtype == B.dtype == 'float16'
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def get_params(self, node):
        """Return the GPU context of the first input (C)."""
        return node.inputs[0].type.context

    def c_headers(self):
        return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
                'string.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__)]

    def get_op_params(self):
        """Expose `inplace` to the C code as the GEMM16_INPLACE macro."""
        return [('GEMM16_INPLACE', '1' if self.inplace else '0')]

    @staticmethod
    def cubin_to_code(name):
        """
        Return C source defining ``bin_<name>`` with the bytes of the
        ``hgemm_<name>.cubin`` file found in ``nerv.cubin_path``.
        """
        fname = 'hgemm_{0}.cubin'.format(name)
        # A cubin is binary data: read it in binary mode so that no text
        # decoding or newline translation is applied.  bytearray yields
        # integers on both Python 2 and Python 3.
        with open(os.path.join(nerv.cubin_path, fname), 'rb') as f:
            cubin = bytearray(f.read())
        bcode = ','.join(hex(byte) for byte in cubin)
        return "static const char bin_%s[] = { %s };" % (name, bcode)

    @staticmethod
    def init_gpukernel(name, fail):
        """
        Return the C code that initialises kernel ``k_<name>`` from
        ``bin_<name>`` and runs `fail` when the initialisation fails.
        """
        return """
        bcode = bin_%(name)s;
        sz = sizeof(bin_%(name)s);
        if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
                           "hgemm_%(name)s", 13, types, GA_USE_BINARY, NULL)
            != GA_NO_ERROR) {
          PyErr_SetString(PyExc_RuntimeError, "Could not initialize kernel %(name)s");
          %(fail)s;
        }
        """ % dict(name=name, fail=fail)

    def c_support_code(self):
        """Embed every kernel binary as a static C array."""
        return '\n'.join(Gemm16.cubin_to_code(name)
                         for name in self.KERN_NAMES)

    def c_support_code_struct(self, node, nodename):
        """Declare one GpuKernel per kernel, then the code from gemm16.c."""
        codel = ["GpuKernel k_{0};".format(name) for name in self.KERN_NAMES]
        codel.append(super(Gemm16, self).c_support_code_struct(node, nodename))
        return '\n'.join(codel)

    def c_init_code_struct(self, node, nodename, sub):
        """
        Run the init code from gemm16.c, zero all kernel objects, then
        initialise each of them from its embedded binary.
        """
        codel = [super(Gemm16, self).c_init_code_struct(node, nodename, sub)]
        # Zero every kernel first so the cleanup code is handed zeroed
        # structs for kernels whose initialisation was never reached.
        for name in self.KERN_NAMES:
            codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
        codel.append("const char *bcode;")
        codel.append("size_t sz;")
        codel.append("PyGpuContextObject *c = %s;" % (sub['params'],))
        codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
                     "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
                     "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
        for name in self.KERN_NAMES:
            codel.append(self.init_gpukernel(name, sub['fail']))
        return '\n'.join(codel)

    def c_cleanup_code_struct(self, node, nodename):
        """Release every kernel and the buffer allocated by gemm16.c."""
        codel = ["GpuKernel_clear(&k_{0});".format(name)
                 for name in self.KERN_NAMES]
        # rand_buf is allocated in the init_code_struct section of
        # gemm16.c; this override is the only struct cleanup code emitted
        # for the op, so the reference has to be dropped here.
        codel.append("Py_XDECREF(rand_buf);")
        codel.append("rand_buf = NULL;")
        return '\n'.join(codel)
@opt.register_opt('fast_compile')
@opt.op_lifter([tensor.Dot])
@opt.register_opt2([tensor.Dot], 'fast_compile')
def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
    """
    Replace a float16 matrix-by-matrix Dot with Gemm16.

    Returns None (no replacement) when nervanagpu is unavailable or when
    the inputs are not both 2-d float16.
    """
    if nerv is None:
        return None

    A = inputs[0]
    B = inputs[1]
    both_matrices = A.ndim == 2 and B.ndim == 2
    both_float16 = A.dtype == 'float16' and B.dtype == 'float16'
    if not (both_matrices and both_float16):
        return None

    fgraph = getattr(outputs[0], 'fgraph', None)
    n_rows = shape_i(A, 0, fgraph)
    n_cols = shape_i(B, 1, fgraph)
    # Uninitialised output buffer; beta is 0.0 in the call below.
    C = GpuAllocEmpty('float16', ctx_name)(n_rows, n_cols)
    return Gemm16()(C, 1.0, A, B, 0.0)
@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs):
    """Rebuild the Gemm16 node on the inputs supplied by `alpha_merge`."""
    merged_op = Gemm16(relu=node.op.relu)
    return [merged_op(*inputs)]
@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs):
    """Rebuild the Gemm16 node on the inputs supplied by `output_merge`."""
    merged_op = Gemm16(relu=node.op.relu)
    return [merged_op(*inputs)]
@local_optimizer([Gemm16], inplace=True)
def local_gemm16_inplace(node):
    """
    Replace a Gemm16 node by its in-place version.

    Returns None when the node is not exactly a Gemm16 or is already
    in place.
    """
    if type(node.op) is not Gemm16 or node.op.inplace:
        return None

    new_inputs = list(node.inputs)
    C = new_inputs[0]
    producer = C.owner
    # When C comes from a GpuAllocEmpty that has other clients, give this
    # node its own freshly allocated buffer.
    if (producer and
            isinstance(producer.op, GpuAllocEmpty) and
            len(C.clients) > 1):
        new_inputs[0] = producer.op(*producer.inputs)

    inplace_op = Gemm16(relu=node.op.relu, inplace=True)
    return [inplace_op(*new_inputs)]


# Register the in-place rewrite in optdb at position 70.0.
optdb.register(
    'local_gemm16_inplace',
    tensor.opt.in2out(local_gemm16_inplace, name='local_gemm16_inplace'),
    70.0,
    'fast_run', 'inplace', 'gpuarray')
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
import numpy as np
from theano import function
from theano.tests import unittest_tools as utt
from theano.tensor import vector, matrix, dot
from .config import mode_with_gpu
from ..nerv import Gemm16, nerv
def test_gemm16_swap():
    """Gemm16 must only replace float16 matrix x matrix dot products."""
    if nerv is None:
        raise SkipTest("nervanagpu not available")

    v = vector(dtype='float16')
    m = matrix(dtype='float16')
    m2 = matrix(dtype='float16')
    m32 = matrix(dtype='float32')

    def count_gemm16(fn):
        # Number of Gemm16 nodes in the compiled graph of `fn`.
        return sum(1 for node in fn.maker.fgraph.apply_nodes
                   if isinstance(node.op, Gemm16))

    # test that we don't try to replace anything but matrix x matrix in float16
    cases = (
        (v, m, 0),      # vector x matrix
        (m32, m, 0),    # float32 x float16
        (m, m2, 1),     # float16 matrix x float16 matrix
    )
    for lhs, rhs, expected in cases:
        f = function([lhs, rhs], dot(lhs, rhs), mode=mode_with_gpu)
        assert count_gemm16(f) == expected
def test_gemm16_value():
    """The GPU float16 dot must agree with numpy's result."""
    if nerv is None:
        raise SkipTest("nervanagpu not available")

    lhs = matrix(dtype='float16')
    rhs = matrix(dtype='float16')
    gpu_dot = function([lhs, rhs], dot(lhs, rhs), mode=mode_with_gpu)

    lhs_val = np.random.random((3, 4)).astype('float16')
    rhs_val = np.random.random((4, 2)).astype('float16')

    result = gpu_dot(lhs_val, rhs_val)
    expected = np.dot(lhs_val, rhs_val)

    utt.assert_allclose(result, expected)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论