提交 a8b3b329 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Rework the interface to GpuKernelBase to accept a list of kernel object.

Each item will be precompiled separately and embedded into the c_code of the Op. This allows ops that need multiple kernels or that will choose between alternatives at runtime to use this interface. It also groups all kernel-related parameters under one object. This change also saves the source of the kernel code to re-attempt source compilation in case the binary is rejected for some reason (some implementations do not support reloading from a pre-compiled kernel). There may still be more changes to how stuff works under the hood (most notably a blacklist of bad runtime/drivers that crash when attempting to create a kernel from a binary), but the visible interface should not change anymore, so now is the time to start using it more.
上级 4e350322
......@@ -61,137 +61,141 @@ class HideC(object):
return self.c_code_cache_version()
class GpuKernelBase(object):
def c_kernel_code(self, node):
    """
    Source code of the kernel for `node`.

    The base class does not provide an implementation; concrete ops
    override this.  Raising AttributeError (rather than
    NotImplementedError) presumably lets callers probe for support via
    getattr/hasattr -- verify against the caller.
    """
    raise AttributeError("c_kernel_code", type(self))
def c_kernel_params(self, node):
    """
    Typecodes of the kernel parameters for `node`.

    Entries may be strings (e.g. "GA_BUFFER") or raw integer
    typecodes.  The base class does not provide an implementation;
    the AttributeError signals the missing override.
    """
    raise AttributeError("c_kernel_params", type(self))
class Kernel(object):
    """
    Description of one GPU kernel to precompile and embed in the C
    code of an Op.

    Parameters
    ----------
    code : str
        Source code of the kernel.
    params : list
        Parameter types: `gpuarray.GpuArray` for buffer arguments, or
        dtype-like values accepted by `gpuarray.dtype_to_typecode`.
    name : str
        Name of the kernel function inside `code`.
    flags : dict
        Compilation flags (see `get_flags`).
    codevar, binvar, objvar : str, optional
        Names of the C variables holding the source string, the
        precompiled binary and the GpuKernel object; derived from
        `name` when not given.
    """
    def __init__(self, code, params, name, flags,
                 codevar=None, binvar=None, objvar=None):
        self.code = code
        self.params = params
        self.name = name
        self.flags = flags
        if codevar is None:
            codevar = 'kcode_' + name
        self.codevar = codevar
        if binvar is None:
            binvar = 'kbin_' + name
        self.binvar = binvar
        # BUG FIX: the original assigned self.objvar inside the `if`
        # and then unconditionally overwrote it with the (possibly
        # None) argument.  Mirror the codevar/binvar pattern instead.
        if objvar is None:
            objvar = 'k_' + name
        self.objvar = objvar

    @staticmethod
    def get_flags(*types):
        """
        Build a flags dict from dtype-like values (dtype strings,
        Types or Variables).
        """
        # Python 2/3 compatible string-type tuple (the original
        # referenced the Python 2-only `unicode` unconditionally).
        try:
            string_types = (str, unicode)
        except NameError:
            string_types = (str,)

        def get_dtype(t):
            if isinstance(t, string_types):
                return numpy.dtype(t)
            elif isinstance(t, Type):
                # Wrap in numpy.dtype: the checks below need
                # `.itemsize`/`.kind`, which a plain dtype string (the
                # presumable value of Type.dtype) does not have.
                return numpy.dtype(t.dtype)
            elif isinstance(t, Variable):
                return numpy.dtype(t.type.dtype)
            else:
                # Parenthesized raise: valid on both Python 2 and 3
                # (the original used the Python 2-only statement form).
                raise TypeError("can't get a dtype from %s" % (type(t),))
        dtypes = [get_dtype(t) for t in types]
        flags = dict(cluda=True)
        if any(d == numpy.float64 for d in dtypes):
            flags['have_double'] = True
        if any(d.itemsize < 4 for d in dtypes):
            flags['have_small'] = True
        if any(d.kind == 'c' for d in dtypes):
            flags['have_complex'] = True
        if any(d == numpy.float16 for d in dtypes):
            flags['have_half'] = True
        return flags

    def _get_c_flags(self):
        """Render the flags dict as a C flag expression,
        e.g. "GA_USE_CLUDA|GA_USE_DOUBLE"."""
        res = []
        if self.flags.get('cluda', False):
            res.append('GA_USE_CLUDA')
        if self.flags.get('have_double', False):
            res.append('GA_USE_DOUBLE')
        if self.flags.get('have_small', False):
            res.append('GA_USE_SMALL')
        if self.flags.get('have_complex', False):
            res.append('GA_USE_COMPLEX')
        if self.flags.get('have_half', False):
            # BUG FIX: the original emitted GA_USE_SMALL here, which
            # contradicts FMAP ("GA_USE_HALF" -> 'have_half') defined
            # elsewhere in this file.
            res.append('GA_USE_HALF')
        return '|'.join(res)

    def _get_c_types(self):
        """Render `params` as a comma-separated list of GA_* typecodes."""
        def m(t):
            if t == gpuarray.GpuArray:
                return "GA_BUFFER"
            else:
                return gpuarray.dtype_to_typecode(t)
        return ', '.join(m(t) for t in self.params)
def c_kernel_name(self):
    """
    Name of the kernel function in the source returned by
    c_kernel_code.  Base class provides no value; subclasses
    override, and the AttributeError signals the missing override.
    """
    raise AttributeError("c_kernel_name", type(self))
# NOTE(review): this span is a unified-diff interleave (the +/- markers
# were stripped by the page scrape).  It mixes the REMOVED
# GpuKernelBase.c_kernel_flags method (its docstring is the
# "Return a string..." part) with the ADDED GpuKernelBase class header
# and its gpu_kernels method (the "This is the method to override..."
# part).  It is not linear source and will not parse as-is.
def c_kernel_flags(self, node):
class GpuKernelBase(object):
def gpu_kernels(self, node, name):
"""
Return a string representing the C flags for the kernel.
Example:
"GA_USE_CLUDA|GA_USE_DOUBLE"
self._get_kernel_flags(*dtypes) returns an appropriate string
for the result of this function.
This is the method to override. This should return an
iterable of Kernel objects that describe the kernels this op
will need.
"""
raise AttributeError("c_kernel_flags", type(self))
def c_kernel_codevar(self, name):
    """Name of the C variable holding the kernel source for `name`."""
    return 'kcode_%s' % (name,)
def c_kernel_obj(self, name):
    """Name of the C GpuKernel object variable for `name`."""
    return 'k_%s' % (name,)
def _get_kernel_flags(self, *dtypes):
    """
    C flag expression ("A|B|...") for kernels operating on the given
    dtype-like values.  GA_USE_CLUDA is always included.
    """
    ds = [numpy.dtype(d) for d in dtypes]
    flags = ['GA_USE_CLUDA']
    needs_double = any(d == numpy.float64 for d in ds)
    if needs_double:
        flags.append('GA_USE_DOUBLE')
    needs_small = any(d.itemsize < 4 for d in ds)
    if needs_small:
        flags.append('GA_USE_SMALL')
    return '|'.join(flags)
raise MethodNotDefined, 'gpu_kernels'
# NOTE(review): diff interleave.  The removed c_headers simply returned
# ['gpuarray/types.h']; the added version (the try/except below, sharing
# the same `def` line in the diff) chains to super() and tolerates
# MethodNotDefined.  TMAP is an ADDED class attribute of GpuKernelBase
# that the diff placed in between; it maps GA_* typecode strings to
# python-side types (GpuArray for buffers, dtype strings otherwise) and
# is consumed by _types_to_pytypes.
def c_headers(self):
return ['gpuarray/types.h']
TMAP = {
"GA_BUFFER": gpuarray.GpuArray,
"GA_BOOL": 'bool',
"GA_BYTE": 'int8',
"GA_UBYTE": 'uint8',
"GA_SHORT": 'int16',
"GA_USHORT": 'uint16',
"GA_INT": 'int32',
"GA_UINT": 'uint32',
"GA_LONG": 'int64',
"GA_ULONG": 'uint64',
"GA_FLOAT": 'float32',
"GA_DOUBLE": 'float64',
"GA_CFLOAT": 'complex64',
"GA_CDOUBLE": 'complex128',
}
try:
o = super(GpuKernelBase, self).c_headers()
except MethodNotDefined:
o = []
return o + ['gpuarray/types.h']
def _generate_kernel_bin(self, k):
    """
    Precompile Kernel `k` with pygpu and return a C snippet embedding
    the resulting binary as a static byte array named `k.binvar`.
    """
    # BUG FIX: the original rebound the parameter `k` to the compiled
    # GpuKernel, so the later `k.binvar` read the wrong object.  Use a
    # distinct name for the compiled kernel.
    gk = gpuarray.GpuKernel(k.code, k.name, k.params, **k.flags)
    bin = gk._binary
    # BUG FIX: the original assigned `bocde` (typo) but formatted with
    # `bcode`, which would raise NameError.
    bcode = ','.join(hex(ord(c)) for c in bin)
    return ("""static const char %(bname)s[] = { %(bcode)s };""" %
            dict(bname=k.binvar, bcode=bcode))
def _types_to_pytypes(self, types):
    """
    Translate GA_* typecode strings into python-side types via TMAP,
    falling back to gpuarray.typecode_to_dtype for entries TMAP does
    not know.
    """
    return [self.TMAP[t] if t in self.TMAP
            else gpuarray.typecode_to_dtype(t)
            for t in types]
# Maps GA_USE_* C flag names to the keyword flags understood by the
# pygpu kernel machinery; the inverse direction of Kernel._get_c_flags
# and the lookup table used by _flags_to_pyflags.
FMAP = {
"GA_USE_CLUDA": 'cluda',
"GA_USE_DOUBLE": 'have_double',
"GA_USE_SMALL": 'have_small',
"GA_USE_COMPLEX": 'have_complex',
"GA_USE_HALF": 'have_half',
}
def _generate_kernel_code(self, k):
    """
    Return a C snippet declaring the kernel source of `k` as a string
    constant named `k.codevar`.
    """
    # Escape for embedding in a C string literal: real newlines become
    # the two-character sequence \n, then double quotes are
    # backslash-escaped.
    escaped = k.code.replace('\n', '\\n').replace('"', '\\"')
    return ('static const char *%(cname)s = "%(code)s";' %
            dict(cname=k.codevar, code=escaped))
def _flags_to_pyflags(self, flags):
    """
    Convert a C flag expression such as "GA_USE_CLUDA|GA_USE_DOUBLE"
    into the corresponding {keyword: True} dict using FMAP.
    """
    return dict((self.FMAP[fl], True) for fl in flags.split('|'))
def _generate_kernel_vars(self, k):
    """
    Return the C declaration of the static GpuKernel object for `k`.
    """
    # BUG FIX: the original did `% dict(k.objname)` -- Kernel instances
    # have `objvar`, not `objname`, and the %-format needs a mapping
    # with a 'kname' key; dict(<str>) is a TypeError anyway.
    return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
# NOTE(review): diff interleave sharing one `def` line.  The REMOVED
# body (through the first `return`) compiled the single kernel of the
# old interface and embedded only its binary; the ADDED body (from
# `kernels = ...`) iterates over gpu_kernels(node, name) and emits the
# binary, the source (kept to retry source compilation if the binary is
# rejected) and the GpuKernel variable for each.  Not linear source.
def c_support_code_apply(self, node, name):
kcode = self.c_kernel_code(node)
vname = self.c_kernel_codevar(name)
kname = self.c_kernel_obj(name)
k = gpuarray.GpuKernel(kcode, self.c_kernel_name(),
self._types_to_pytypes(self.c_kernel_params(node)),
**self._flags_to_pyflags(self.c_kernel_flags(node)))
bin = k._binary
bcode = ','.join(hex(ord(c)) for c in bin)
return """static const char %(vname)s[] = { %(bcode)s };
static GpuKernel %(kname)s;""" % dict(vname=vname, kname=kname, bcode=bcode)
kernels = self.gpu_kernels(node, name)
bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
return '\n'.join([bins, codes, vars])
# NOTE(review): diff interleave.  The REMOVED old c_init_code_apply
# (header and its PY3/else branches) is mixed line-by-line with the
# ADDED _generate_kernel_init, whose C template first tries
# GpuKernel_init from the embedded binary (GA_USE_BINARY) and, if that
# fails, retries from the embedded source with the real flags.  The
# span is not linear source; read the commit's new file for the true
# bodies.
def c_init_code_apply(self, node, name):
types = self.c_kernel_params(node)
numargs = len(types)
kname = self.c_kernel_name()
vname = self.c_kernel_codevar(name)
oname = self.c_kernel_obj(name)
flags = self.c_kernel_flags(node)
# TODO: find a way to release the kernel once the module is unloaded
error_out = ""
# ADDED method: renders the init C code for one Kernel `k`, reporting
# failures through the caller-declared error variable `err`.
def _generate_kernel_init(self, k, err):
if PY3:
error_out = "NULL"
return """
int types_%(name)s[%(numargs)u] = {%(types)s};
int err;
const char *kcode = %(vname)s;
size_t sz = sizeof(%(vname)s);
if ((err = GpuKernel_init(&%(oname)s, pygpu_default_context()->ops,
pygpu_default_context()->ctx, 1, &kcode, &sz, "%(kname)s",
%(numargs)s, types_%(name)s, %(flags)s)) != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
err, Gpu_error(pygpu_default_context()->ops,
pygpu_default_context()->ctx, err));
return %(error_out)s;
}
""" % dict(types=','.join(types), numargs=numargs, kname=kname, oname=oname,
vname=vname, flags="GA_USE_BINARY", error_out=error_out, name=name)
else:
error_out = ""
return """{
int types[%(numargs)u] = {%(types)s};
const char *bcode = %(bvar)s;
size_t sz = sizeof(%(bvar)s);
GpuContext *c = pygpu_default_context();
if (GpuKernel_init(%(ovar)s, c->ops, c->ctx, 1, &bcode, &sz, "%(kname)s",
%(numargs)u, types, GA_USE_BINARY) != GA_NO_ERROR) {
if ((%(err)s = GpuKernel_init(%(ovar)s, c->ops, c->ctx, 1, &%(cname)s,
NULL, "%(kname)s", %(numargs)u, types,
%(flags)s)) != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
err, Gpu_error(c->ops, c->ctx, err));
return %(error_out)s;
}
}
}""" % dict(numargs=len(k.params), types=k._get_c_types(), bvar=k.binvar,
ovar=k.objvar, kname=k.name, err=err, cname=k.codevar,
flags=k._get_c_flags(), error_out=error_out)
def c_init_code_apply(self, node, name):
    """
    C initialization code for every kernel of this apply node: one
    shared error variable declaration followed by each kernel's init
    snippet (joined with newlines).
    """
    err = 'err_' + name
    kernels = self.gpu_kernels(node, name)
    pieces = [self._generate_kernel_init(k, err) for k in kernels]
    header = "int %(err)s;\n" % dict(err=err)
    return header + '\n'.join(pieces)
def _GpuKernelBase_version(self):
    """
    Version component for the C code cache.  Includes the backend kind
    and device name so compiled kernels are not reused across devices.
    """
    ctx = gpuarray.get_default_context()
    # The diff showed a dead `return (1, ...)` (the removed old value)
    # immediately before this line; only the current value, 2, is kept.
    return (2, ctx.kind, ctx.devname)
GpuKernelBase_version = property(_GpuKernelBase_version)
......@@ -808,23 +812,20 @@ class GpuEye(GpuKernelBase, Op):
def __hash__(self):
    """Hash derived from the output dtype and the concrete Op class."""
    dtype_part = hash(self.dtype)
    class_part = hash(type(self))
    return dtype_part ^ class_part
# NOTE(review): diff interleave of GpuEye's REMOVED c_kernel_* methods
# and its ADDED gpu_kernels.  Issues visible in the ADDED lines even
# after de-interleaving:
#   * the Kernel(...) call is missing a comma after
#     `flags=Kernel.get_flags(self.dtype)` -- SyntaxError as shown;
#   * it passes `objname=`, but Kernel.__init__ (earlier in this diff)
#     declares the parameter `objvar` -- TypeError if both are as shown;
#     verify against the committed file;
#   * the dict at the end of the code template includes `name=name`,
#     but the template has no %(name)s placeholder.
def c_kernel_code(self, node):
return """
def gpu_kernels(self, node, name):
code = """
KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m;
for (ga_size i = LID_0; i < nb; i += LDIM_0) {
a[i*m + i] = 1;
}
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype))
def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_SIZE", "GA_SIZE"]
def c_kernel_name(self):
return "k"
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.dtype)
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype), name=name)
return [Kernel(
code=code, name="k",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
flags=Kernel.get_flags(self.dtype)
objname='k_eye_'+name,
)]
def c_code(self, node, name, inp, out, sub):
n, m = inp
......@@ -832,7 +833,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
fail = sub['fail']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync)
kname = self.c_kernel_obj(name)
kname = 'k_eye_'+name
s = """
size_t dims[2] = {0, 0};
void *args[3];
......
......@@ -20,7 +20,7 @@ except ImportError:
pass
from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase)
GpuKernelBase, Kernel)
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.gof.utils import MethodNotDefined
......@@ -2373,40 +2373,29 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if any(redux):
return getattr(node, attr)
# NOTE(review): diff interleave of GpuCAReduceCPY's REMOVED
# c_kernel_code / c_kernel_name / c_kernel_params / c_kernel_flags and
# its ADDED gpu_kernels.  The ADDED method builds `src` and `params`
# for either the dummy no-reduction kernel or the pygpu-generated
# reduction kernel, then wraps them in one Kernel.  As in the GpuEye
# hunk, the ADDED call passes `objname=` while Kernel.__init__ declares
# `objvar` -- verify against the committed file.  Not linear source.
def c_kernel_code(self, node):
def gpu_kernels(self, node, name):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels
return "KERNEL void reduk(GLOBAL_MEM float *a) {}"
src = "KERNEL void reduk(GLOBAL_MEM float *a) {}"
params = ['float32']
else:
k = self.get_kernel_cache(node)
_, src, _, _ = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
return src
def c_kernel_name(self):
return "reduk"
def c_kernel_params(self, node):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
return ["GA_FLOAT"]
else:
# Make sure this is synced with the call definition in
# pygpu/reduction.py
nd = node.inputs[0].ndim
res = ["GA_UINT", "GA_BUFFER"]
res.extend("GA_UINT" for _ in range(nd))
res.append("GA_BUFFER")
res.append("GA_UINT")
res.extend("GA_INT" for _ in range(nd))
return res
def c_kernel_flags(self, node):
params = ['uint32', gpuarray.GpuArray]
params.extend('uint32' for _ in range(nd))
params.append(gpuarray.GpuArray)
params.append('uint32')
params.extend('int32' for _ in range(nd))
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
return self._get_kernel_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype)
return [Kernel(code=src, name="reduk", params=params,
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype),
objname='k_reduk_'+name)]
def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
......@@ -2555,14 +2544,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d)
GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync),
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
def c_code_cache_version(self):
    """
    Cache version of the generated C code.  Incorporates
    GpuKernelBase_version (which carries the backend kind and device
    name) so cached code is invalidated when the device changes.
    """
    # The diff showed the removed `return (0,)` immediately before the
    # new return; only the new value is kept here.
    return (0, self.GpuKernelBase_version)
def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add):
......
......@@ -26,7 +26,7 @@ if cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.sandbox.gpuarray.type import GpuArrayType
......@@ -772,9 +772,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return op(rstate, cast(v_size, 'int32'))
def c_headers(self):
    """
    Headers for the generated C code: the GpuKernelBase headers plus
    the numpy compatibility shim.
    """
    # BUG FIX: the added line read `super(GPUA_mrg_uniform, self) +
    # [...]`, adding the super *proxy object* itself to a list
    # (TypeError at runtime); the `.c_headers()` call was dropped.
    return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
def c_kernel_code(self, node):
def gpu_kernels(self, node):
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
......@@ -783,7 +783,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
else:
otype = 'double'
NORM = '4.656612873077392578125e-10'
return """
code = """
KERNEL void mrg_uniform(
%(otype)s *sample_data,
ga_int *state_data,
......@@ -864,14 +864,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals()
def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_BUFFER", "GA_UINT", "GA_UINT"]
def c_kernel_name(self):
return "mrg_uniform"
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.output_type.dtype, 'int32')
return [Kernel(code=code, name="mrg_uniform",
params=[gpuarray.GpuArray, gpuarray.GpuArray,
'uint32', 'uint32'],
flags=Kernel.get_flags(self.output_type.dtype, 'int32'),
objname='k_mrg_uniform')]
def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
......@@ -880,7 +877,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
kname = self.c_kernel_obj(nodename)
kname = 'k_mrg_uniform'
if self.output_type.dtype == 'float32':
otype = 'float'
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论