提交 6a93ccc7 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2927 from abergeron/nerv_ops

Nerv ops
...@@ -29,7 +29,7 @@ AddConfigVar('gpuarray.sync', ...@@ -29,7 +29,7 @@ AddConfigVar('gpuarray.sync',
# This is for documentation not to depend on the availability of pygpu # This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor) GpuArraySharedVariable, gpuarray_shared_constructor)
from . import opt from . import opt, nerv
def init_dev(dev): def init_dev(dev):
......
...@@ -586,11 +586,13 @@ class GpuAlloc(HideC, Alloc): ...@@ -586,11 +586,13 @@ class GpuAlloc(HideC, Alloc):
return s return s
def make_node(self, value, *shape): def make_node(self, value, *shape):
res = Alloc.make_node(self, value, *shape)
value = as_gpuarray_variable(value) value = as_gpuarray_variable(value)
otype = GpuArrayType(dtype=res.outputs[0].dtype, sh, bcast = self.validate_shape(shape)
broadcastable=res.outputs[0].broadcastable) if value.ndim > len(sh):
return Apply(self, [value] + res.inputs[1:], [otype()]) TypeError("The GpuAlloc value to use has more dimensions "
"than the specified shape", v.ndim, len(sh))
otype = value.type.clone(broadcastable=bcast)
return Apply(self, [value] + sh, [otype()])
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>'] return ['<numpy_compat.h>']
...@@ -600,7 +602,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -600,7 +602,7 @@ class GpuAlloc(HideC, Alloc):
v = inputs[0] v = inputs[0]
sh = tuple(map(int, inputs[1:])) sh = tuple(map(int, inputs[1:]))
if out[0] is None or out[0].shape != sh: if out[0] is None or out[0].shape != sh:
if v.size == 1 and numpy.asarray(v).flatten().item() == 0: if self.memset_0:
out[0] = gpuarray.zeros(sh, dtype=v.dtype) out[0] = gpuarray.zeros(sh, dtype=v.dtype)
else: else:
out[0] = gpuarray.empty(sh, dtype=v.dtype) out[0] = gpuarray.empty(sh, dtype=v.dtype)
...@@ -712,9 +714,74 @@ class GpuAlloc(HideC, Alloc): ...@@ -712,9 +714,74 @@ class GpuAlloc(HideC, Alloc):
return False return False
return True return True
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
class GpuAllocEmpty(HideC, Alloc):
__props__ = ('dtype',)
_f16_ok = True
def __init__(self, dtype):
self.dtype = dtype
def make_node(self, *shape):
sh, bcast = self.validate_shape(shape)
otype = GpuArrayType(dtype=self.dtype, broadcastable=bcast)
return Apply(self, sh, [otype()])
def perform(self, node, inputs, out_):
out = out_[0]
sh = [int(i) for i in inputs]
if out[0] is None or out[0].shape != sh:
out[0] = pygpu.empty(sh, dtype=self.dtype)
# if out[0] is the right shape, we just return it
def c_headers(self):
return ['<gpuarray_helper.h>']
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def c_code(self, node, name, inp, out, sub):
ndim = len(inp)
zz = out[0]
fail = sub['fail']
code = ["""
int i;
size_t shape[%(ndim)s];
""" % dict(ndim=ndim)]
for i, shp_i in enumerate(inp):
code.append("""
shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
""" % dict(i=i, shp_i=shp_i))
code.append("""
if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
pygpu_default_context())) {
%(fail)s
}
""" % dict(zz=zz, ndim=ndim, type=gpuarray.dtype_to_typecode(self.dtype),
fail=fail))
return ''.join(code)
def c_code_cache_version(self):
return (0,)
def do_constant_folding(self, node):
return False
def infer_shape(self, node, input_shapes):
return [node.inputs]
def grad(self, *args):
# Don't reuse the grad implementation from Alloc
raise NotImplementedError("grad disabled")
class GpuContiguous(Op): class GpuContiguous(Op):
""" """
Always return a c contiguous output. Copy the input only if it is Always return a c contiguous output. Copy the input only if it is
......
#section init_code_struct
/* Why do we need this? */
size_t dim = 2048 * 32;
rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, pygpu_default_context(),
Py_None);
if (rand_buf == NULL) {
FAIL;
}
#section support_code_struct
PyGpuArrayObject *rand_buf;
int gemm16(PyGpuArrayObject *C, float alpha,
PyGpuArrayObject *A, PyGpuArrayObject *B,
float beta, PyGpuArrayObject **out) {
PyGpuArrayObject *_A = NULL;
PyGpuArrayObject *_B = NULL;
GpuKernel *gk;
char *prand, *pA, *pB, *pout;
void *params[13];
size_t grid[2];
size_t threads[2];
int res = 0;
int flags = 0;
int lda, ldb, ldc, n, m, k;
int n128, n64;
int size = 0;
int vec = 0;
static unsigned int nprocs = 0;
char opA, opB;
if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY) &&
GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
/*
* The nervana kernels do not cover the case where both inputs are
* trans so we need to copy one of them. We choose the smallest
* one.
*/
if (PyGpuArray_DIM(A, 0) * PyGpuArray_DIM(A, 1) <
PyGpuArray_DIM(B, 0) * PyGpuArray_DIM(B, 1)) {
_A = pygpu_copy(A, GA_C_ORDER);
if (_A == NULL) {
res = 1;
goto cleanup;
}
/*
* This is not an extra reference on _A so don't add an INCREF.
* Also, we don't lose the ref on A since our caller will deal
* with it.
*/
A = _A;
} else {
_B = pygpu_copy(B, GA_C_ORDER);
if (_B == NULL) {
res = 1;
goto cleanup;
}
/*
* This is not an extra reference on _B so don't add an INCREF
* Also, we don't lose the ref on B since our caller will deal
* with it.
*/
B = _B;
}
}
if (GEMM16_INPLACE && GpuArray_CHKFLAGS(&C->ga, GA_CARRAY)) {
Py_XDECREF(*out);
*out = C;
Py_INCREF(*out);
} else {
*out = theano_try_copy(*out, C);
if (*out == NULL) {
res = 1;
goto cleanup;
}
}
if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY)) {
opA = 't';
lda = PyGpuArray_STRIDE(A, 1);
} else {
opA = 'n';
lda = PyGpuArray_STRIDE(A, 0);
}
if (GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
opB = 't';
ldb = PyGpuArray_STRIDE(B, 1);
} else {
opB = 'n';
ldb = PyGpuArray_STRIDE(B, 0);
}
ldc = PyGpuArray_STRIDE(*out, 0);
/* lda and friend are in number of elements, not bytes */
lda /= 2;
ldb /= 2;
ldc /= 2;
m = PyGpuArray_DIM(*out, 0);
n = PyGpuArray_DIM(*out, 1);
k = PyGpuArray_DIM(B, 0);
/* Tuning code adapted from the python version */
grid[0] = (m + 127) / 128;
if (opA == 'n' && opB == 't')
size = 128;
else {
if (n < 384-16) {
n128 = n % 128;
if (n128 < 112) {
if (48 < n128 && n128 <= 64) {
n64 = n / 64;
if (nprocs == 0)
if (A->ga.ops->property(A->context->ctx, NULL, NULL,
GA_CTX_PROP_NUMPROCS, &nprocs)) {
nprocs = 0;
res = 1;
goto cleanup;
}
n64 *= (grid[0] / nprocs);
if (n64 > 1 || (opA == 't' && opB == 'n'))
size = 64;
else
size = 32;
} else {
size = 32;
}
} else {
size = 128;
}
} else {
size = 128;
}
}
grid[1] = (n + (size-1)) / size;
if (size == 128)
threads[0] = 256;
else
threads[0] = 128;
threads[1] = 1;
if ((opA == 't' && opB == 'n' && m % 8 == 0 && n % 8 == 0) ||
(opA == 'n' && opB == 'n' && k % 16 == 0 && n % 8 == 0) ||
(opA == 'n' && opB == 't' && k % 16 == 0))
vec = 1;
switch (size) {
case 128:
if (opA == 'n' && opB == 'n') {
if (vec)
gk = &k_nn_vec_128x128;
else
gk = &k_nn_128x128;
} else if (opA == 'n' && opB == 't') {
if (vec)
gk = &k_nt_vec_128x128;
else
gk = &k_nt_128x128;
} else if (opA == 't' && opB == 'n') {
if (vec)
gk = &k_tn_vec_128x128;
else
gk = &k_tn_128x128;
}
break;
case 64:
if (opA == 'n' && opB == 'n') {
if (vec)
gk = &k_nn_vec_128x64;
else
gk = &k_nn_128x64;
} else if (opA == 't' && opB == 'n') {
if (vec)
gk = &k_tn_vec_128x64;
else
gk = &k_tn_128x64;
}
break;
case 32:
if (opA == 'n' && opB == 'n') {
if (vec)
gk = &k_nn_vec_128x32;
else
gk = &k_nn_128x32;
} else if (opA == 't' && opB == 'n') {
if (vec)
gk = &k_tn_vec_128x32;
else
gk = &k_tn_128x32;
}
break;
default:
PyErr_SetString(PyExc_RuntimeError, "error selecting kernel");
res = 1;
goto cleanup;
}
prand = *((char **)rand_buf->ga.data);
prand += rand_buf->ga.offset;
pA = *((char **)A->ga.data);
pA += A->ga.offset;
pB = *((char **)B->ga.data);
pB += B->ga.offset;
pout = *((char **)(*out)->ga.data);
pout += (*out)->ga.offset;
params[0] = &prand;
params[1] = &pA;
params[2] = &pB;
params[3] = &pout;
params[4] = &lda;
params[5] = &ldb;
params[6] = &ldc;
params[7] = &m;
params[8] = &n;
params[9] = &k;
params[10] = &alpha;
params[11] = &beta;
params[12] = &flags;
if (GpuKernel_call(gk, 2, threads, grid, 0, params) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call");
res = 1;
}
cleanup:
Py_XDECREF(_A);
Py_XDECREF(_B);
return res;
}
...@@ -24,4 +24,22 @@ static int theano_prep_output(PyGpuArrayObject **out, unsigned int nd, ...@@ -24,4 +24,22 @@ static int theano_prep_output(PyGpuArrayObject **out, unsigned int nd,
return (*out == NULL) ? 1 : 0; return (*out == NULL) ? 1 : 0;
} }
static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out,
PyGpuArrayObject *V) {
if (out &&
GpuArray_CHKFLAGS(&out->ga, GA_CARRAY) &&
theano_size_check(out, PyGpuArray_NDIM(V),
PyGpuArray_DIMS(V),
V->ga.typecode)) {
if (pygpu_move(out, V)) {
Py_XDECREF(out);
return NULL;
}
} else {
Py_XDECREF(out);
out = pygpu_copy(V, GA_C_ORDER);
}
return out;
}
#endif #endif
import os.path
import theano
from theano import Apply, Variable, tensor
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import local_optimizer, COp
from theano.scalar import as_scalar, constant
from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty)
from .opt_util import alpha_merge, output_merge
from .pycuda_helper import ensure_pycuda_context
try:
from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
nerv = NervanaGPU()
except ImportError:
GPUTensor = None
nerv = None
def to_gputensor(a):
assert a.flags.c_contiguous or a.flags.f_contiguous
return GPUTensor(a.shape, dtype=a.dtype, base=a,
gpudata=a.gpudata + a.offset,
strides=a.strides, is_trans=a.flags.f_contiguous)
def ensure_float(val, name):
if not isinstance(val, Variable):
val = constant(val)
if hasattr(val, 'ndim') and val.ndim == 0:
val = as_scalar(val)
if not isinstance(val.type, theano.scalar.Scalar):
raise TypeError("%s: expected a scalar value" % (name,))
if not val.type.dtype == 'float32':
raise TypeError("%s: type is not float32" % (name,))
return val
class Gemm16(COp):
__props__ = ('relu', 'inplace')
_f16_ok = True
KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
'tn_128x128', 'tn_128x64', 'tn_128x32',
'tn_vec_128x128', 'tn_vec_128x64', 'tn_vec_128x32',
'tn_vec_128x16', 'nt_128x128', 'nt_vec_128x128')
def __init__(self, relu=False, inplace=False):
COp.__init__(self, ["gemm16.c"], "gemm16")
self.relu = relu
# relu = True will require more work in optimizations.
assert self.relu is False
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def make_node(self, C, alpha, A, B, beta):
if GPUTensor is None:
raise RuntimeError("Can't use Gemm16: nervanagpu not found")
A = as_gpuarray_variable(A)
B = as_gpuarray_variable(B)
C = as_gpuarray_variable(C)
alpha = ensure_float(alpha, 'alpha')
beta = ensure_float(beta, 'beta')
assert C.dtype == A.dtype == B.dtype == 'float16'
return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs):
ensure_pycuda_context()
C, alpha, A, B, beta = inputs
# The nervana code does not support the case where both inputs
# are trans, so we need to copy one if them if that is the
# case. We copy the smaller one.
if A.flags.f_contiguous and B.flags.f_contiguous:
if A.size < B.size:
A = A.copy()
else:
B = B.copy()
inplace = self.inplace
if inplace and not C.flags.c_contiguous:
inplace = False
if not inplace:
C = C.copy()
At = to_gputensor(A)
Bt = to_gputensor(B)
Ct = to_gputensor(C)
nerv.dot(At, Bt, Ct, alpha=alpha, beta=beta, relu=False)
outputs[0][0] = C
def c_headers(self):
return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
'string.h']
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def get_op_params(self):
return [('GEMM16_INPLACE', '1' if self.inplace else '0')]
@staticmethod
def cubin_to_code(name):
fname = 'hgemm_{0}.cubin'.format(name)
with open(os.path.join(nerv.cubin_path, fname)) as f:
cubin = f.read()
bcode = ','.join(hex(ord(c)) for c in cubin)
return "static const char bin_%s[] = { %s };" % (name, bcode)
@staticmethod
def init_gpukernel(name, fail):
return """
bcode = bin_%(name)s;
sz = sizeof(bin_%(name)s);
if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
"hgemm_%(name)s", 13, types, GA_USE_BINARY, NULL)
!= GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Could not initialize kernel %(name)s");
%(fail)s;
}
""" % dict(name=name, fail=fail)
def c_support_code(self):
codel = []
for name in self.KERN_NAMES:
codel.append(Gemm16.cubin_to_code(name))
return '\n'.join(codel)
def c_support_code_struct(self, node, nodename):
codel = []
for name in self.KERN_NAMES:
codel.append("GpuKernel k_{0};".format(name))
codel.append(super(Gemm16, self).c_support_code_struct(node, nodename))
return '\n'.join(codel)
def c_init_code_struct(self, node, nodename, sub):
codel = [super(Gemm16, self).c_init_code_struct(node, nodename, sub)]
for name in self.KERN_NAMES:
codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
codel.append("const char *bcode;")
codel.append("size_t sz;")
codel.append("PyGpuContextObject *c = pygpu_default_context();")
codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
"GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
"GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
for name in self.KERN_NAMES:
codel.append(self.init_gpukernel(name, sub['fail']))
return '\n'.join(codel)
def c_cleanup_code_struct(self, node, nodename):
codel = []
for name in self.KERN_NAMES:
codel.append("GpuKernel_clear(&k_{0});".format(name))
return '\n'.join(codel)
@opt.register_opt()
@opt.op_lifter([tensor.Dot])
def local_dot_to_gemm16(node):
A = node.inputs[0]
B = node.inputs[1]
if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph
C = GpuAllocEmpty(dtype='float16')(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0)
@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2)
def local_gemm16_alpha_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)]
@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2)
def local_gemm16_output_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)]
@local_optimizer([Gemm16], inplace=True)
def local_gemm16_inplace(node):
if type(node.op) != Gemm16 or node.op.inplace:
return
inputs = list(node.inputs)
C = inputs[0]
if (C.owner and
isinstance(C.owner.op, GpuAllocEmpty) and
len(C.clients) > 1):
inputs[0] = C.owner.op(*C.owner.inputs)
return [Gemm16(relu=node.op.relu, inplace=True)(*inputs)]
optdb.register('local_gemm16_inplace',
tensor.opt.in2out(local_gemm16_inplace,
name='local_gemm16_inplace'),
70.0, 'fast_run', 'inplace', 'gpuarray')
from functools import wraps
import numpy
from theano import scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, host_from_gpu
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
def grab_cpu_scalar(v, nd):
if v.owner is not None:
n = v.owner
if (isinstance(n.op, GpuDimShuffle) and
n.op.new_order == ('x',) * nd):
return host_from_gpu(n.inputs[0])
elif (isinstance(n.op, DimShuffle) and
n.op.new_order == ('x',) * nd):
return n.inputs[0]
elif isinstance(n.op, GpuFromHost):
return grab_cpu_scalar(n.inputs[0], nd=nd)
else:
return None
else:
if (isinstance(v, Constant) and
v.broadcastable == (True,) * nd):
return v.dimshuffle(())
def find_node(v, cls, ignore_clients=False):
# This digs through possibly redundant transfers to for the node
# that has the op class specified. If ignore_clients is False (the
# default) it will only dig through nodes that have a single
# client.
if v.owner is not None and (ignore_clients or v.clients == 1):
if isinstance(v.owner.op, cls):
return v.owner
elif (isinstance(v.owner.op, GpuFromHost) and
v.owner.inputs[0].owner is not None and
isinstance(v.owner.inputs[0].owner.op, HostFromGpu)):
return find_node(v.owner.inputs[0].owner.inputs[0], cls)
else:
return None
def is_equal(var, val):
# Returns True if var is always equal to val (python value), False
# otherwise (including if var is not constant)
try:
v = get_scalar_constant_value(var)
return v == val
except NotScalarConstantError:
return False
def alpha_merge(cls, alpha_in, beta_in, nd):
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
def opt(node):
if (isinstance(node.op, GpuElemwise) and
node.op.scalar_op == scal.mul and
node.nin == 2):
targ = find_node(node.inputs[0], cls)
if targ is None:
targ = find_node(node.inputs[1], cls)
lr = grab_cpu_scalar(node.inputs[0], nd=nd)
else:
lr = grab_cpu_scalar(node.inputs[1], nd=nd)
if lr is None or targ is None:
return None
inputs = list(targ.inputs)
inputs[alpha_in] = lr * targ.inputs[alpha_in]
inputs[beta_in] = lr * targ.inputs[beta_in]
return maker(targ, *inputs)
return opt
return wrapper
def output_merge(cls, alpha_in, beta_in, out_in, nd):
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
def opt(node):
if (isinstance(node.op, GpuElemwise) and
node.op.scalar_op == scal.add and
node.nin == 2):
targ = find_node(node.inputs[0], cls)
W = node.inputs[1]
if targ is None:
targ = find_node(node.inputs[1], cls)
W = node.inputs[0]
if targ is None:
return None
if not is_equal(targ.inputs[beta_in], 0.0):
# other cases are too complex for now
return None
if W.broadcastable != targ.inputs[out_in].broadcastable:
# Would need to explicitly tile the output to fill
# the full shape here. Disable for now.
return None
inputs = list(targ.inputs)
inputs[out_in] = W
inputs[beta_in] = _one.clone()
return maker(targ, *inputs)
return opt
return wrapper
try:
from pycuda.driver import Context
if not hasattr(Context, 'attach'):
raise ImportError('too old')
except ImportError:
Context = None
pycuda_initialized = False
pycuda_context = None
def ensure_pycuda_context():
global pycuda_context, pycuda_initialized
if not pycuda_initialized:
if Context is None:
raise RuntimeError("PyCUDA not found or too old.")
else:
pycuda_context = Context.attach()
import atexit
atexit.register(pycuda_context.detach)
pycuda_initialized = True
return pycuda_context
...@@ -40,6 +40,7 @@ from ..type import (GpuArrayType, ...@@ -40,6 +40,7 @@ from ..type import (GpuArrayType,
from ..basic_ops import ( from ..basic_ops import (
host_from_gpu, gpu_from_host, host_from_gpu, gpu_from_host,
gpu_alloc, GpuAlloc, gpu_alloc, GpuAlloc,
GpuAllocEmpty,
gpu_from_cuda, gpu_from_cuda,
cuda_from_gpu, HostFromGpu, cuda_from_gpu, HostFromGpu,
GpuContiguous, GpuContiguous,
...@@ -309,6 +310,25 @@ class TestAlloc(test_basic.TestAlloc): ...@@ -309,6 +310,25 @@ class TestAlloc(test_basic.TestAlloc):
allocs = [GpuAlloc(), GpuAlloc(), T.Alloc()] allocs = [GpuAlloc(), GpuAlloc(), T.Alloc()]
def test_alloc_empty():
for dt in ['float32', 'int8']:
f = theano.function([], GpuAllocEmpty(dt)(2, 3))
assert len(f.maker.fgraph.apply_nodes) == 1
out = f()
assert out.shape == (2, 3)
assert out.dtype == dt
f = theano.function([], [GpuAllocEmpty('uint64')(3, 2),
GpuAllocEmpty('uint64')(3, 2)])
out = f()
assert out[0].shape == (3, 2)
assert out[0].dtype == 'uint64'
assert out[1].shape == (3, 2)
assert out[1].dtype == 'uint64'
assert len([node for node in f.maker.fgraph.apply_nodes
if isinstance(node.op, GpuAllocEmpty)]) == 1
def test_shape(): def test_shape():
x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])() x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])()
v = gpuarray.zeros((3, 4, 5), dtype='float32') v = gpuarray.zeros((3, 4, 5), dtype='float32')
......
from nose.plugins.skip import SkipTest
import numpy
from theano import function
from theano.tests import unittest_tools as utt
from theano.tensor import vector, matrix, dot
from .test_basic_ops import mode_with_gpu
from ..nerv import Gemm16, nerv
def test_gemm16_swap():
if nerv is None:
raise SkipTest("nervanagpu not available")
v = vector(dtype='float16')
m = matrix(dtype='float16')
m2 = matrix(dtype='float16')
m32 = matrix(dtype='float32')
# test that we don't try to replace anything but matrix x matrix in float16
f = function([v, m], dot(v, m), mode=mode_with_gpu)
assert len([node for node in f.maker.fgraph.apply_nodes
if isinstance(node.op, Gemm16)]) == 0
f = function([m32, m], dot(m32, m), mode=mode_with_gpu)
assert len([node for node in f.maker.fgraph.apply_nodes
if isinstance(node.op, Gemm16)]) == 0
f = function([m, m2], dot(m, m2), mode=mode_with_gpu)
assert len([node for node in f.maker.fgraph.apply_nodes
if isinstance(node.op, Gemm16)]) == 1
def test_gemm16_value():
if nerv is None:
raise SkipTest("nervanagpu not available")
m = matrix(dtype='float16')
m2 = matrix(dtype='float16')
f = function([m, m2], dot(m, m2), mode=mode_with_gpu)
v1 = numpy.random.random((3, 4)).astype('float16')
v2 = numpy.random.random((4, 2)).astype('float16')
of = f(v1, v2)
on = numpy.dot(v1, v2)
utt.assert_allclose(of, on)
...@@ -2401,14 +2401,9 @@ class Alloc(gof.Op): ...@@ -2401,14 +2401,9 @@ class Alloc(gof.Op):
""" """
__props__ = () __props__ = ()
def make_node(self, value, *shape): def validate_shape(self, shape):
v = as_tensor_variable(value)
sh = [as_tensor_variable(s) for s in shape] sh = [as_tensor_variable(s) for s in shape]
bcast = [] bcast = []
if v.ndim > len(sh):
raise TypeError("The Alloc value to use has more dimensions"
" than the specified dimensions",
v.ndim, len(sh))
for i, s in enumerate(sh): for i, s in enumerate(sh):
if s.type.dtype[:3] not in ('int', 'uin'): if s.type.dtype[:3] not in ('int', 'uin'):
if config.exception_verbosity == 'high': if config.exception_verbosity == 'high':
...@@ -2424,8 +2419,17 @@ class Alloc(gof.Op): ...@@ -2424,8 +2419,17 @@ class Alloc(gof.Op):
except NotScalarConstantError: except NotScalarConstantError:
const_shp = None const_shp = None
bcast.append(numpy.all(1 == const_shp)) bcast.append(numpy.all(1 == const_shp))
return sh, bcast
def make_node(self, value, *shape):
v = as_tensor_variable(value)
sh, bcast = self.validate_shape(shape)
if v.ndim > len(sh):
raise TypeError("The Alloc value to use has more dimensions"
" than the specified dimensions",
v.ndim, len(sh))
otype = TensorType(dtype=v.dtype, broadcastable=bcast) otype = TensorType(dtype=v.dtype, broadcastable=bcast)
return gof.Apply(self, ([v] + sh), [otype()]) return gof.Apply(self, [v] + sh, [otype()])
def perform(self, node, inputs, out_): def perform(self, node, inputs, out_):
out, = out_ out, = out_
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论