提交 a8b3b329 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Rework the interface to GpuKernelBase to accept a list of kernel object.

Each item will be precompiled separately and embedded into the c_code of the Op. This allows ops that need multiple kernels or that will choose between alternatives at runtime to use this interface. It also groups all kernel-related parameters under one object. This change also saves the source of the kernel code to re-attempt source compilation in case the binary is rejected for some reason (some implementations do not support reloading from pre-compiled kernel). There may still be more changes to how stuff works under the hood (most notably a blacklist of bad runtimes/drivers that crash when attempting to create a kernel from a binary), but the visible interface should not change anymore, so now is the time to start using it more.
上级 4e350322
...@@ -61,137 +61,141 @@ class HideC(object): ...@@ -61,137 +61,141 @@ class HideC(object):
return self.c_code_cache_version() return self.c_code_cache_version()
class GpuKernelBase(object): class Kernel(object):
def c_kernel_code(self, node): def __init__(self, code, params, name, flags,
""" codevar=None, binvar=None, objvar=None):
Return the source code of the kernel. self.code = code
""" self.params = params
raise AttributeError("c_kernel_code", type(self)) self.name = name
self.flags = flags
def c_kernel_params(self, node): if codevar is None:
""" codevar = 'kcode_' + name
Return the list of typecodes for kernel parameters. self.codevar = codevar
if binvar is None:
The list can contain strings ( "GA_BUFFER" ) or direct int values. binvar = 'kbin_' + name
""" self.binvar = binvar
raise AttributeError("c_kernel_params", type(self)) if objvar is None:
self.objvar = 'k_' + name
self.objvar = objvar
@staticmethod
def get_flags(*types):
def get_dtype(t):
if isinstance(t, (str, unicode)):
return numpy.dtype(t)
elif isinstance(t, Type):
return t.dtype
elif isinstance(t, Variable):
return t.type.dtype
else:
raise TypeError, "can't get a dtype from %s" % (type(t),)
dtypes = [get_dtype(t) for t in types]
flags = dict(cluda=True)
if any(d == numpy.float64 for d in dtypes):
flags['have_double'] = True
if any(d.itemsize < 4 for d in dtypes):
flags['have_small'] = True
if any(d.kind == 'c' for d in dtypes):
flags['have_complex'] = True
if any(d == numpy.float16 for d in dtypes):
flags['have_half'] = True
return flags
def _get_c_flags(self):
res = []
if self.flags.get('cluda', False):
res.append('GA_USE_CLUDA')
if self.flags.get('have_double', False):
res.append('GA_USE_DOUBLE')
if self.flags.get('have_small', False):
res.append('GA_USE_SMALL')
if self.flags.get('have_complex', False):
res.append('GA_USE_COMPLEX')
if self.flags.get('have_half', False):
res.append('GA_USE_SMALL')
return '|'.join(res)
def _get_c_types(self):
def m(t):
if t == gpuarray.GpuArray:
return "GA_BUFFER"
else:
return gpuarray.dtype_to_typecode(t)
return ', '.join(m(t) for t in self.params)
def c_kernel_name(self):
"""
Return the name of the kernel in the source.
"""
raise AttributeError("c_kernel_name", type(self))
def c_kernel_flags(self, node): class GpuKernelBase(object):
def gpu_kernels(self, node, name):
""" """
Return a string representing the C flags for the kernel. This is the method to override. This should return an
iterable of Kernel objects that describe the kernels this op
Example: will need.
"GA_USE_CLUDA|GA_USE_DOUBLE"
self._get_kernel_flags(*dtypes) returns an appropritate string
for the result of this function.
""" """
raise AttributeError("c_kernel_flags", type(self)) raise MethodNotDefined, 'gpu_kernels'
def c_kernel_codevar(self, name):
return 'kcode_' + name
def c_kernel_obj(self, name):
return 'k_' + name
def _get_kernel_flags(self, *dtypes):
dtypes = [numpy.dtype(d) for d in dtypes]
flags = ['GA_USE_CLUDA']
if any(d == numpy.float64 for d in dtypes):
flags.append('GA_USE_DOUBLE')
if any(d.itemsize < 4 for d in dtypes):
flags.append('GA_USE_SMALL')
return '|'.join(flags)
def c_headers(self): def c_headers(self):
return ['gpuarray/types.h'] try:
o = super(GpuKernelBase, self).c_headers()
TMAP = { except MethodNotDefined:
"GA_BUFFER": gpuarray.GpuArray, o = []
"GA_BOOL": 'bool', return o + ['gpuarray/types.h']
"GA_BYTE": 'int8',
"GA_UBYTE": 'uint8', def _generate_kernel_bin(self, k):
"GA_SHORT": 'int16', k = gpuarray.GpuKernel(k.code, k.name, k.params, **k.flags)
"GA_USHORT": 'uint16', bin = k._binary
"GA_INT": 'int32', bocde = ','.join(hex(ord(c)) for c in bin)
"GA_UINT": 'uint32', return ("""static const char %(bname)s[] = { %(bcode)s };""" %
"GA_LONG": 'int64', dict(bname=k.binvar, bcode=bcode))
"GA_ULONG": 'uint64',
"GA_FLOAT": 'float32',
"GA_DOUBLE": 'float64',
"GA_CFLOAT": 'complex64',
"GA_CDOUBLE": 'complex128',
}
def _types_to_pytypes(self, types): def _generate_kernel_code(self, k):
def tmap(t): code = '\\n'.join(l for l in k.code.split('\n'))
if t in self.TMAP: code = code.replace('"', '\\"')
return self.TMAP[t] return ("""static const char *%(cname)s = "%(code)s";""" %
return gpuarray.typecode_to_dtype(t) dict(cname=k.codevar, code=code))
return [tmap(t) for t in types]
FMAP = {
"GA_USE_CLUDA": 'cluda',
"GA_USE_DOUBLE": 'have_double',
"GA_USE_SMALL": 'have_small',
"GA_USE_COMPLEX": 'have_complex',
"GA_USE_HALF": 'have_half',
}
def _flags_to_pyflags(self, flags): def _generate_kernel_vars(self, k):
res = dict() return """static GpuKernel %(kname)s;""" % dict(k.objname)
for fl in flags.split('|'):
res[self.FMAP[fl]] = True
return res
def c_support_code_apply(self, node, name): def c_support_code_apply(self, node, name):
kcode = self.c_kernel_code(node) kernels = self.gpu_kernels(node, name)
vname = self.c_kernel_codevar(name) bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
kname = self.c_kernel_obj(name) codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
k = gpuarray.GpuKernel(kcode, self.c_kernel_name(), vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
self._types_to_pytypes(self.c_kernel_params(node)), return '\n'.join([bins, codes, vars])
**self._flags_to_pyflags(self.c_kernel_flags(node)))
bin = k._binary
bcode = ','.join(hex(ord(c)) for c in bin)
return """static const char %(vname)s[] = { %(bcode)s };
static GpuKernel %(kname)s;""" % dict(vname=vname, kname=kname, bcode=bcode)
def c_init_code_apply(self, node, name): def _generate_kernel_init(self, k, err):
types = self.c_kernel_params(node)
numargs = len(types)
kname = self.c_kernel_name()
vname = self.c_kernel_codevar(name)
oname = self.c_kernel_obj(name)
flags = self.c_kernel_flags(node)
# TODO: find a way to release the kernel once the module is unloaded
error_out = ""
if PY3: if PY3:
error_out = "NULL" error_out = "NULL"
return """ else:
int types_%(name)s[%(numargs)u] = {%(types)s}; error_out = ""
int err; return """{
const char *kcode = %(vname)s; int types[%(numargs)u] = {%(types)s};
size_t sz = sizeof(%(vname)s); const char *bcode = %(bvar)s;
if ((err = GpuKernel_init(&%(oname)s, pygpu_default_context()->ops, size_t sz = sizeof(%(bvar)s);
pygpu_default_context()->ctx, 1, &kcode, &sz, "%(kname)s", GpuContext *c = pygpu_default_context();
%(numargs)s, types_%(name)s, %(flags)s)) != GA_NO_ERROR) { if (GpuKernel_init(%(ovar)s, c->ops, c->ctx, 1, &bcode, &sz, "%(kname)s",
%(numargs)u, types, GA_USE_BINARY) != GA_NO_ERROR) {
if ((%(err)s = GpuKernel_init(%(ovar)s, c->ops, c->ctx, 1, &%(cname)s,
NULL, "%(kname)s", %(numargs)u, types,
%(flags)s)) != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s", PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
err, Gpu_error(pygpu_default_context()->ops, err, Gpu_error(c->ops, c->ctx, err));
pygpu_default_context()->ctx, err));
return %(error_out)s; return %(error_out)s;
} }
""" % dict(types=','.join(types), numargs=numargs, kname=kname, oname=oname, }
vname=vname, flags="GA_USE_BINARY", error_out=error_out, name=name) }""" % dict(numargs=len(k.params), types=k._get_c_types(), bvar=k.binvar,
ovar=k.objvar, kname=k.name, err=err, cname=k.codevar,
flags=k._get_c_flags(), error_out=error_out)
def c_init_code_apply(self, node, name):
err = 'err_' + name
kernels = self.gpu_kernels(node, name)
inits ='\n'.join(self._generate_kernel_init(k, err) for k in kernels)
return ("int %(err)s;\n" % dict(err=err)) + inits
def _GpuKernelBase_version(self): def _GpuKernelBase_version(self):
ctx = gpuarray.get_default_context() ctx = gpuarray.get_default_context()
return (1, ctx.kind, ctx.devname) return (2, ctx.kind, ctx.devname)
GpuKernelBase_version = property(_GpuKernelBase_version) GpuKernelBase_version = property(_GpuKernelBase_version)
...@@ -808,23 +812,20 @@ class GpuEye(GpuKernelBase, Op): ...@@ -808,23 +812,20 @@ class GpuEye(GpuKernelBase, Op):
def __hash__(self): def __hash__(self):
return hash(self.dtype) ^ hash(type(self)) return hash(self.dtype) ^ hash(type(self))
def c_kernel_code(self, node): def gpu_kernels(self, node, name):
return """ code = """
KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m; ga_size nb = n < m ? n : m;
for (ga_size i = LID_0; i < nb; i += LDIM_0) { for (ga_size i = LID_0; i < nb; i += LDIM_0) {
a[i*m + i] = 1; a[i*m + i] = 1;
} }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype)) }""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype), name=name)
return [Kernel(
def c_kernel_params(self, node): code=code, name="k",
return ["GA_BUFFER", "GA_SIZE", "GA_SIZE"] params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
flags=Kernel.get_flags(self.dtype)
def c_kernel_name(self): objname='k_eye_'+name,
return "k" )]
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.dtype)
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
n, m = inp n, m = inp
...@@ -832,7 +833,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { ...@@ -832,7 +833,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
fail = sub['fail'] fail = sub['fail']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype) typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync) sync = bool(config.gpuarray.sync)
kname = self.c_kernel_obj(name) kname = 'k_eye_'+name
s = """ s = """
size_t dims[2] = {0, 0}; size_t dims[2] = {0, 0};
void *args[3]; void *args[3];
......
...@@ -20,7 +20,7 @@ except ImportError: ...@@ -20,7 +20,7 @@ except ImportError:
pass pass
from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC, from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase) GpuKernelBase, Kernel)
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
...@@ -2373,40 +2373,29 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2373,40 +2373,29 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if any(redux): if any(redux):
return getattr(node, attr) return getattr(node, attr)
def c_kernel_code(self, node): def gpu_kernels(self, node, name):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])): if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels # Some OpenCL compilers do not accept no-arguments kernels
return "KERNEL void reduk(GLOBAL_MEM float *a) {}" src = "KERNEL void reduk(GLOBAL_MEM float *a) {}"
params = ['float32']
else: else:
k = self.get_kernel_cache(node) k = self.get_kernel_cache(node)
_, src, _, _ = k._get_basic_kernel(k.init_local_size, _, src, _, _ = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim) node.inputs[0].ndim)
return src
def c_kernel_name(self):
return "reduk"
def c_kernel_params(self, node):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
return ["GA_FLOAT"]
else:
# Make sure this is synced with the call definition in
# pygpu/reduction.py
nd = node.inputs[0].ndim nd = node.inputs[0].ndim
res = ["GA_UINT", "GA_BUFFER"] params = ['uint32', gpuarray.GpuArray]
res.extend("GA_UINT" for _ in range(nd)) params.extend('uint32' for _ in range(nd))
res.append("GA_BUFFER") params.append(gpuarray.GpuArray)
res.append("GA_UINT") params.append('uint32')
res.extend("GA_INT" for _ in range(nd)) params.extend('int32' for _ in range(nd))
return res
def c_kernel_flags(self, node):
acc_dtype = getattr(self, 'acc_dtype', None) acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None: if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype acc_dtype = node.outputs[0].type.dtype
return self._get_kernel_flags(node.inputs[0].type.dtype, return [Kernel(code=src, name="reduk", params=params,
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype, acc_dtype,
node.outputs[0].type.dtype) node.outputs[0].type.dtype),
objname='k_reduk_'+name)]
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])): if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
...@@ -2555,14 +2544,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2555,14 +2544,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d) if (%(sync)d)
GpuArray_sync(&%(output)s->ga); GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync), """ % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input, ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype)) cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (0, self.GpuKernelBase_version)
def generate_kernel(self, node, odtype, redux): def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add): if isinstance(self.scalar_op, scalar.basic.Add):
......
...@@ -26,7 +26,7 @@ if cuda_available: ...@@ -26,7 +26,7 @@ if cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType, from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor) float32_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
...@@ -772,9 +772,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -772,9 +772,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return op(rstate, cast(v_size, 'int32')) return op(rstate, cast(v_size, 'int32'))
def c_headers(self): def c_headers(self):
return GpuKernelBase.c_headers(self) + ['numpy_compat.h'] return super(GPUA_mrg_uniform, self) + ['numpy_compat.h']
def c_kernel_code(self, node): def gpu_kernels(self, node):
if self.output_type.dtype == 'float32': if self.output_type.dtype == 'float32':
otype = 'float' otype = 'float'
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65)) NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
...@@ -783,7 +783,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -783,7 +783,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
else: else:
otype = 'double' otype = 'double'
NORM = '4.656612873077392578125e-10' NORM = '4.656612873077392578125e-10'
return """ code = """
KERNEL void mrg_uniform( KERNEL void mrg_uniform(
%(otype)s *sample_data, %(otype)s *sample_data,
ga_int *state_data, ga_int *state_data,
...@@ -864,14 +864,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -864,14 +864,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals() """ % locals()
def c_kernel_params(self, node): return [Kernel(code=code, name="mrg_uniform",
return ["GA_BUFFER", "GA_BUFFER", "GA_UINT", "GA_UINT"] params=[gpuarray.GpuArray, gpuarray.GpuArray,
'uint32', 'uint32'],
def c_kernel_name(self): flags=Kernel.get_flags(self.output_type.dtype, 'int32'),
return "mrg_uniform" objname='k_mrg_uniform')]
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.output_type.dtype, 'int32')
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp rstate, size = inp
...@@ -880,7 +877,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -880,7 +877,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
ndim = self.output_type.ndim ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail'] fail = sub['fail']
kname = self.c_kernel_obj(nodename) kname = 'k_mrg_uniform'
if self.output_type.dtype == 'float32': if self.output_type.dtype == 'float32':
otype = 'float' otype = 'float'
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论