提交 d3bfaae3 作者: Arnaud Bergeron

WIP adaptation of the GpuElemwise code to the C generator in libgpuarray.

上级 2e793229
...@@ -42,7 +42,7 @@ register_transfer(transfer) ...@@ -42,7 +42,7 @@ register_transfer(transfer)
def init_dev(dev, name=None): def init_dev(dev, name=None):
v = pygpu.gpuarray.api_version() v = pygpu.gpuarray.api_version()
if v[0] != -10000: if v[0] != -9999:
raise RuntimeError("Wrong major API version for gpuarray:", v[0], raise RuntimeError("Wrong major API version for gpuarray:", v[0],
"Make sure Theano and libgpuarray/pygpu " "Make sure Theano and libgpuarray/pygpu "
"are in sync.") "are in sync.")
......
...@@ -22,44 +22,25 @@ except ImportError: ...@@ -22,44 +22,25 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
infer_context_name) infer_context_name)
from .type import GpuArrayType from .type import GpuArrayType, gpu_context_type
from .fp16_help import load_w, write_w from .fp16_help import load_w, write_w
def _is_scalar(v):
False
def make_argument(v, name): def make_argument(v, name):
if _is_scalar(v):
return ScalarArg(numpy.dtype(v.type.dtype), name)
else:
return ArrayArg(numpy.dtype(v.type.dtype), name) return ArrayArg(numpy.dtype(v.type.dtype), name)
def ensure_allocated(storage, shape, dtype, ctx):
odat = storage[0]
if odat is not None:
if odat.shape != shape:
# It is unsafe to try to resize odat,
# we have to allocate output storage.
odat = None
if odat is None:
odat = pygpu.empty(shape, dtype=dtype, context=ctx)
storage[0] = odat
return odat
def as_C_string_const(s): def as_C_string_const(s):
return '\n'.join('"%s\\n"' % (l.replace('"', '\\"')) return '\n'.join('"%s\\n"' % (l.replace('"', '\\"'))
for l in s.split('\n')) for l in s.split('\n'))
class GpuElemwise(GpuKernelBase, HideC, Elemwise): class GpuElemwise(HideC, Elemwise):
""" """
Elemwise on the GPU. Elemwise on the GPU.
""" """
params_type = gpu_context_type
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
_f16_ok = True _f16_ok = True
...@@ -108,20 +89,21 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -108,20 +89,21 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
def get_params(self, node): def get_params(self, node):
return node.inputs[0].type.context return node.inputs[0].type.context
def generate_kernel(self, node, nodename): def _get_vnames(self, node):
inps = [make_argument(i, 'i%d' % (n,)) for n, i in inps = ['i%d' % (n,) for n, _ in enumerate(node.inputs)]
enumerate(node.inputs)] outs = ['o%d' % (n,) for n, _ in enumerate(node.outputs) if n not in self.inplace_pattern]
scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs] return inps, outs
outs = [make_argument(o, 'o%d' % (n,)) for n, o in def _generate_op_string(self, node):
enumerate(node.outputs) if n not in self.inplace_pattern] scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs] scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
inps, outs = self._get_vnames(node)
fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins], fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins],
[o() for o in scal_v_outs]) [o() for o in scal_v_outs])
scal_in = [i.name + '[i]' if i.dtype != 'float16' else scal_in = [i if si.dtype != 'float16' else
'__half2float(' + i.name + '[i])' for i in inps] 'load_half(&' + i + ')' for i, si in zip(inps, scal_v_ins)]
scal_out = [] scal_out = []
oi = 0 oi = 0
...@@ -132,13 +114,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -132,13 +114,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
else: else:
arg = outs[oi] arg = outs[oi]
oi += 1 oi += 1
if arg.dtype == 'float16': if node.outputs[n].dtype == 'float16':
scal_f16.append(('tmpf16%i' % (len(scal_f16),), arg)) scal_f16.append(('tmpf16%i' % (len(scal_f16),), arg))
scal_out.append(scal_f16[-1][0]) scal_out.append(scal_f16[-1][0])
else: else:
scal_out.append(arg.name + '[i]') scal_out.append(arg)
kop = self.scalar_op.c_code(fake_node, nodename + '_scalar', kop = self.scalar_op.c_code(fake_node, 'elem_scalar',
scal_in, scal_out, scal_in, scal_out,
dict(fail='return;')) dict(fail='return;'))
...@@ -153,7 +135,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -153,7 +135,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
# variables inthe middle are float32 # variables inthe middle are float32
code.append(kop.replace('npy_float16', 'ga_float')) code.append(kop.replace('npy_float16', 'ga_float'))
for f in scal_f16: for f in scal_f16:
code.append('%s[i] = __float2half_rn(%s);' % (f[1].name, f[0])) code.append('store_half(&%s, %s);' % (f[1], f[0]))
code.append('}') code.append('}')
kop = '\n'.join(code) kop = '\n'.join(code)
...@@ -177,76 +159,74 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -177,76 +159,74 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
("npy_float64", "ga_double"), ("npy_float64", "ga_double"),
]: ]:
kop = kop.replace(npy, ga) kop = kop.replace(npy, ga)
return ElemwiseKernel(self.get_params(node), inps + outs, kop, return support_code, kop
preamble=support_code)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>',
'<gpuarray/elemwise.h>']
def c_support_code(self):
return self.scalar_op.c_support_code() def c_support_code_struct(self, node, name):
return "\nGpuElemwise *ge;\n";
def _gpu_kernel_code(self, node, nodename):
# This is useless by itself, but will serve an eventual c_code def c_init_code_struct(self, node, name, sub):
# implementation inps, outs = self._get_vnames(node)
k = self.generate_kernel(node, nodename) nargs = len(inps) + len(outs)
nd = node.inputs[0].type.ndim support_code, kop = self._generate_op_string(node)
res = [] res = """
for i in range(0, nd + 1): gpuelemwise_arg args[%(nargs)s] = {{0}};
res.append(k.render_basic(i, name="elem_" + str(i)) + ';') """ % dict(nargs=nargs)
res.append(k.contig_src + ';')
for n, (i, name) in enumerate(zip(node.inputs, inps)):
return '\n'.join(res) res += """
args[%(n)s].name = %(name)s;
args[%(n)s].nd = %(nd)s;
args[%(n)s].typecode = %(typecode)s;
args[%(n)s].flags = GE_READ;
""" % dict(n=n, name='"%s"' % (name,), nd=i.ndim,
typecode=i.type.typecode)
p = 0
for n, o in enumerate(node.outputs):
if n in self.inplace_pattern:
res += "\nargs[%(n)s].flags |= GE_WRITE;\n" % dict(n=self.inplace_pattern[n])
else:
nn = len(inps) + p
name = outs[p]
p += 1
res += """
args[%(n)s].name = %(name)s;
args[%(n)s].nd = %(nd)s;
args[%(n)s].typecode = %(typecode)s;
args[%(n)s].flags = GE_WRITE;
""" % dict(n=nn, name='"%s"' % (name,), nd=o.ndim,
typecode=o.type.typecode)
res += """
ge = GpuElemwise_new(%(ctx)s->ops, %(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, 0);
if (ge == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not initialize elemwise support");
%(fail)s
}
""" % dict(nargs=nargs, ctx=sub['params'], fail=sub['fail'],
support=as_C_string_const(support_code),
kop=as_C_string_const(kop))
def gpu_kernels(self, node, nodename): return res
src = self._gpu_kernel_code(node, nodename)
nd = node.outputs[0].ndim
params = ['uintp']
params.extend('uintp' for _ in range(nd))
num_inputs = len(node.inputs)
num_outputs = len(node.outputs)
for n in range(num_inputs + num_outputs):
if (n - len(node.inputs)) in self.inplace_pattern:
continue
params.extend([gpuarray.GpuArray, 'uintp'])
params.extend('intp' for _ in range(nd))
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
return [Kernel(code=src, name="elem_%d" % nd, params=params,
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype),
objvar='elem_%d_%s' % (nd, nodename))]
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise MethodNotDefined('cuda only')
nd = node.outputs[0].ndim nd = node.outputs[0].ndim
fail = sub["fail"] fail = sub["fail"]
initial_dims = ','.join('1' for i in xrange(nd)) initial_dims = ','.join('1' for i in xrange(nd))
opname = str(self.scalar_op) opname = str(self.scalar_op)
ctx = sub['params'] ctx = sub['params']
nargs = len(node.inputs) + len(node.outputs) - len(self.inplace_pattern)
# check that all inputs have valid dimensions # check that all inputs have valid dimensions
emitted_inames = {} emitted_inames = {}
num_kernel_params = 1 + nd + len(inputs + outputs) * (2 + nd)
code = """ code = """
size_t n_blocks = 0; size_t dims[%(nd)s+1] = {%(initial_dims)s};
size_t threads_per_block = 0; void *rargs[%(nargs)s] = {0};
size_t numEls = 0;
const ssize_t zero = 0;
void *kernel_params[%(num_kernel_params)d] = {0};
int err;
""" % locals()
if nd > 0:
code += """
size_t dims[%(nd)s] = {%(initial_dims)s};
""" % locals() """ % locals()
else:
code += """
size_t *dims = NULL;
"""
for idx, iname in enumerate(inputs): for idx, iname in enumerate(inputs):
if iname in emitted_inames: if iname in emitted_inames:
assert emitted_inames[iname] is node.inputs[idx] assert emitted_inames[iname] is node.inputs[idx]
...@@ -255,19 +235,15 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -255,19 +235,15 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
broadcasts = map(int, node.inputs[idx].broadcastable) broadcasts = map(int, node.inputs[idx].broadcastable)
broadcasts = ', '.join(map(str, broadcasts)) broadcasts = ', '.join(map(str, broadcasts))
nd = node.inputs[idx].ndim nd = node.inputs[idx].ndim
if nd > 0:
code += """ code += """
int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s}; int broadcasts_%(iname)s[%(nd)s+1] = {%(broadcasts)s};
""" % locals()
else:
code += """
int *broadcasts_%(iname)s = NULL;
""" % locals() """ % locals()
emitted_inames[iname] = node.inputs[idx] emitted_inames[iname] = node.inputs[idx]
# check that all inputs have valid dimensions # check that all inputs have valid dimensions
emitted_inames = {} emitted_inames = {}
for idx, iname in enumerate(inputs): for idx, iname in enumerate(inputs):
code += "rargs[%(idx)s] = &%(iname)s->ga;\n" % dict(idx=idx, iname=iname)
if iname in emitted_inames: if iname in emitted_inames:
continue continue
code += """ code += """
...@@ -299,6 +275,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -299,6 +275,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
""" % locals() """ % locals()
emitted_inames[iname] = True emitted_inames[iname] = True
# check that all outputs have valid dimensions # check that all outputs have valid dimensions
p = len(node.inputs)
for idx, oname in enumerate(outputs): for idx, oname in enumerate(outputs):
typecode = dtype_to_typecode(node.outputs[idx].dtype) typecode = dtype_to_typecode(node.outputs[idx].dtype)
if idx not in self.inplace_pattern.keys(): if idx not in self.inplace_pattern.keys():
...@@ -324,7 +301,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -324,7 +301,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
%(fail)s %(fail)s
} }
} }
rargs[%(p)s] = &%(oname)s->ga;
""" % locals() """ % locals()
p += 1
else: else:
input_idx = self.inplace_pattern[idx] input_idx = self.inplace_pattern[idx]
iname = inputs[input_idx] iname = inputs[input_idx]
...@@ -350,92 +329,35 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -350,92 +329,35 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
} }
} }
""" % locals() """ % locals()
z = outputs[0]
code += """numEls = PyGpuArray_SIZE(%(z)s);
//first use at least a full warp
threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE
//next start adding multiprocessors
// UP TO NUMBER OF MULTIPROCESSORS, use 30 for now.
n_blocks = std::min(numEls/threads_per_block +
(numEls %% threads_per_block?1:0),
(size_t)30);
// next start adding more warps per multiprocessor
if (threads_per_block * n_blocks < numEls)
threads_per_block = std::min(numEls/n_blocks, (size_t) 256);
""" % locals()
kname = 'elem_%d_%s' % (nd, name)
param = ["(void *)&numEls"]
for i in range(nd):
param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern:
continue
dtype = dtype_to_ctype(var.dtype)
param.append("(void *)%(name)s->ga.data" % locals())
param.append("(void *)&%(name)s->ga.offset" % locals())
for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? (void *)&zero: (void *)&PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
for n, p in enumerate(param):
code += "kernel_params[%(n)d] = %(p)s;\n" % locals()
code += """ code += """
err = GpuKernel_call(&%(kname)s, 1, &threads_per_block, &n_blocks, 0, kernel_params); if (GpuElemwise_call(ge, rargs, GE_BROADCAST) != GA_NO_ERROR) {
if (err != GA_NO_ERROR) { PyErr_SetString(PyExc_RuntimeError, "Error in the elemwise call");
PyErr_Format(PyExc_RuntimeError, %(fail)s
"gpuarray error: %(kname)s: %%s.",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
} }
""" % dict(kname=kname, fail=fail) """ % dict(fail=sub['fail'])
if config.gpuarray.sync: if config.gpuarray.sync:
z = outputs[0]
code += """ code += """
err = GpuArray_sync(&%(z)s->ga); err = GpuArray_sync(&%(z)s->ga);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %(kname)s: %%s.", "gpuarray error: %%s.",
GpuKernel_error(&%(kname)s, err)); GpuArray_error(&%(z)s->ga, err));
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
return str(code)
def perform(self, node, inputs, output_storage, ctx): return str(code)
# Try to reuse the kernel from a previous call to hopefully
# avoid recompiling
if not hasattr(node, '_cache_elemwise_k'):
node._cache_elemwise_k = self.generate_kernel(node, "kcode")
out_shape = []
for values in izip(*[input.shape for input in inputs]):
if any(v == 0 for v in values):
# All non-broadcasted dimensions should be zero
assert max(values) <= 1
out_shape.append(0)
else:
out_shape.append(max(values))
out_shape = tuple(out_shape)
args = copy.copy(inputs)
for n, (stor, out) in enumerate(izip(output_storage, node.outputs)):
if n in self.inplace_pattern:
stor[0] = inputs[self.inplace_pattern[n]]
else:
args.append(ensure_allocated(stor, out_shape, out.type.dtype, ctx))
node._cache_elemwise_k(*args, broadcast=True) # To disable the superclass perform.
if config.gpuarray.sync: perform = Op.perform
output_storage[0][0].sync()
def c_code_cache_version(self): def c_code_cache_version(self):
ver = self.scalar_op.c_code_cache_version() ver = self.scalar_op.c_code_cache_version()
if ver: if ver:
return (4, ver) return (5, ver)
else: else:
return ver return ver
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论