Commit c587ca18 authored by Frédéric Bastien

Merge pull request #1833 from abergeron/kernel_precomp

Kernel precomp
@@ -61,89 +61,146 @@ class HideC(object):
return self.c_code_cache_version()
class GpuKernelBase(object):
GpuKernelBase_version = 0
def c_kernel_code(self, node):
"""
Return the source code of the kernel.
"""
raise AttributeError("c_kernel_code", type(self))
def c_kernel_params(self, node):
"""
Return the list of typecodes for kernel parameters.
The list can contain strings ("GA_BUFFER") or direct int values.
"""
raise AttributeError("c_kernel_params", type(self))
class Kernel(object):
"""
This class groups together all the attributes of a gpu kernel.
"""
def __init__(self, code, params, name, flags,
codevar=None, binvar=None, objvar=None):
self.code = code
self.params = params
self.name = name
self.flags = flags
if codevar is None:
codevar = 'kcode_' + name
self.codevar = codevar
if binvar is None:
binvar = 'kbin_' + name
self.binvar = binvar
if objvar is None:
objvar = 'k_' + name
self.objvar = objvar
@staticmethod
def get_flags(*types):
def get_dtype(t):
if isinstance(t, (str, unicode)):
return numpy.dtype(t)
elif isinstance(t, Type):
return t.dtype
elif isinstance(t, Variable):
return t.type.dtype
else:
raise TypeError("can't get a dtype from %s" % (type(t),))
dtypes = [get_dtype(t) for t in types]
flags = dict(cluda=True)
if any(d == numpy.float64 for d in dtypes):
flags['have_double'] = True
if any(d.itemsize < 4 for d in dtypes):
flags['have_small'] = True
if any(d.kind == 'c' for d in dtypes):
flags['have_complex'] = True
if any(d == numpy.float16 for d in dtypes):
flags['have_half'] = True
return flags
def _get_c_flags(self):
res = []
if self.flags.get('cluda', False):
res.append('GA_USE_CLUDA')
if self.flags.get('have_double', False):
res.append('GA_USE_DOUBLE')
if self.flags.get('have_small', False):
res.append('GA_USE_SMALL')
if self.flags.get('have_complex', False):
res.append('GA_USE_COMPLEX')
if self.flags.get('have_half', False):
res.append('GA_USE_HALF')
return '|'.join(res)
def _get_c_types(self):
def m(t):
if t == gpuarray.GpuArray:
return "GA_BUFFER"
else:
return str(gpuarray.dtype_to_typecode(t))
return ', '.join(m(t) for t in self.params)
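As a quick usage sketch of the Kernel class above (assuming Theano's gpuarray backend and pygpu import cleanly under this diff's Python 2 era; the kernel source and all names here are illustrative, not part of this change):

```python
from theano.sandbox.gpuarray.basic_ops import Kernel

# float32 alone only requests the CLUDA dialect:
print(Kernel.get_flags('float32'))          # {'cluda': True}
# float64 adds have_double; int8 (itemsize < 4) adds have_small:
print(Kernel.get_flags('float64', 'int8'))
# The generated C identifiers default to kcode_/kbin_/k_ plus the name:
k = Kernel(code="KERNEL void f(GLOBAL_MEM float *a) { a[0] = 1.0f; }",
           name="f", params=['float32'],
           flags=Kernel.get_flags('float32'))
print(k.codevar)  # kcode_f
print(k.binvar)   # kbin_f
print(k.objvar)   # k_f
```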
def c_kernel_name(self):
"""
Return the name of the kernel in the source.
"""
raise AttributeError("c_kernel_name", type(self))
def c_kernel_flags(self, node):
class GpuKernelBase(object):
def gpu_kernels(self, node, name):
"""
Return a string representing the C flags for the kernel.
Example:
"GA_USE_CLUDA|GA_USE_DOUBLE"
self._get_kernel_flags(*dtypes) returns an appropriate string
for the result of this function.
This is the method to override. It should return an
iterable of Kernel objects that describe the kernels this op
will need (a minimal sketch follows this method).
"""
raise AttributeError("c_kernel_flags", type(self))
def c_kernel_codevar(self, name):
return 'kcode_' + name
def c_kernel_obj(self, name):
return 'k_' + name
def _get_kernel_flags(self, *dtypes):
dtypes = [numpy.dtype(d) for d in dtypes]
flags = ['GA_USE_CLUDA']
if any(d == numpy.float64 for d in dtypes):
flags.append('GA_USE_DOUBLE')
if any(d.itemsize < 4 for d in dtypes):
flags.append('GA_USE_SMALL')
return '|'.join(flags)
raise MethodNotDefined('gpu_kernels')
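The updated ops later in this diff (GpuEye, GpuCAReduceCPY, GPUA_mrg_uniform) are the real overrides; as a minimal sketch of the protocol with a hypothetical op (all names illustrative):

```python
from pygpu import gpuarray
from theano.gof import Op
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel

class GpuOnes(GpuKernelBase, Op):  # hypothetical op, for illustration only
    def gpu_kernels(self, node, name):
        # One kernel that fills a float32 buffer with ones.
        code = """KERNEL void ones(GLOBAL_MEM float *a, ga_size n) {
  for (ga_size i = LID_0; i < n; i += LDIM_0) {
    a[i] = 1.0f;
  }
}"""
        return [Kernel(code=code, name="ones",
                       params=[gpuarray.GpuArray, gpuarray.SIZE],
                       flags=Kernel.get_flags('float32'),
                       objvar='k_ones_' + name)]
```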
def c_headers(self):
return ['gpuarray/types.h']
try:
o = super(GpuKernelBase, self).c_headers()
except MethodNotDefined:
o = []
return o + ['gpuarray/types.h']
def _generate_kernel_bin(self, k):
gk = gpuarray.GpuKernel(k.code, k.name, k.params, **k.flags)
bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin)
return ("""static const char %(bname)s[] = { %(bcode)s };""" %
dict(bname=k.binvar, bcode=bcode))
def _generate_kernel_code(self, k):
code = '\\n'.join(l for l in k.code.split('\n'))
code = code.replace('"', '\\"')
return ("""static const char *%(cname)s = "%(code)s";""" %
dict(cname=k.codevar, code=code))
def _generate_kernel_vars(self, k):
return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
def c_support_code_apply(self, node, name):
kcode = self.c_kernel_code(node)
vname = self.c_kernel_codevar(name)
kname = self.c_kernel_obj(name)
code = '\\n'.join(l for l in kcode.split('\n'))
code = code.replace('"', '\\"')
return """static const char *%(vname)s = "%(code)s";
static GpuKernel %(kname)s;""" % dict(vname=vname, kname=kname, code=code)
kernels = self.gpu_kernels(node, name)
bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
return '\n'.join([bins, codes, vars])
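To make the escaping in _generate_kernel_code concrete, here is the same transformation in isolation (plain Python; the kernel source is illustrative): newlines become literal \n escapes and double quotes are escaped, so the whole kernel source fits in a single C string literal.

```python
src = 'KERNEL void k(GLOBAL_MEM float *a) {\n  a[0] = 1.0f; /* "one" */\n}'
escaped = '\\n'.join(l for l in src.split('\n'))  # embed newlines as \n
escaped = escaped.replace('"', '\\"')             # escape quotes for C
print('static const char *kcode_k = "%s";' % escaped)
# -> static const char *kcode_k = "KERNEL void k(...) {\n  ... \"one\" ... }";
```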
def c_init_code_apply(self, node, name):
types = self.c_kernel_params(node)
numargs = len(types)
kname = self.c_kernel_name()
vname = self.c_kernel_codevar(name)
oname = self.c_kernel_obj(name)
flags = self.c_kernel_flags(node)
# TODO: find a way to release the kernel once the module is unloaded
error_out = ""
def _generate_kernel_init(self, k, err):
if PY3:
error_out = "NULL"
return """
int types_%(name)s[%(numargs)u] = {%(types)s};
if (GpuKernel_init(&%(oname)s, pygpu_default_context()->ops,
pygpu_default_context()->ctx, 1, &%(vname)s, NULL,
"%(kname)s", %(numargs)s, types_%(name)s, %(flags)s) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Error initializing kernel");
return %(error_out)s;
}
""" % dict(types=','.join(types), numargs=numargs, kname=kname, oname=oname,
vname=vname, flags=flags, error_out=error_out, name=name)
else:
error_out = ""
return """{
int types[%(numargs)u] = {%(types)s};
const char *bcode = %(bvar)s;
size_t sz = sizeof(%(bvar)s);
PyGpuContextObject *c = pygpu_default_context();
if (GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &bcode, &sz, "%(kname)s",
%(numargs)u, types, GA_USE_BINARY) != GA_NO_ERROR) {
if ((%(err)s = GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &%(cname)s,
NULL, "%(kname)s", %(numargs)u, types,
%(flags)s)) != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
%(err)s, Gpu_error(c->ops, c->ctx, %(err)s));
return %(error_out)s;
}
}
}""" % dict(numargs=len(k.params), types=k._get_c_types(), bvar=k.binvar,
ovar=k.objvar, kname=k.name, err=err, cname=k.codevar,
flags=k._get_c_flags(), error_out=error_out)
def c_init_code_apply(self, node, name):
err = 'err_' + name
kernels = self.gpu_kernels(node, name)
inits = '\n'.join(self._generate_kernel_init(k, err) for k in kernels)
return ("int %(err)s;\n" % dict(err=err)) + inits
def _GpuKernelBase_version(self):
ctx = gpuarray.get_default_context()
return (2, ctx.kind, ctx.devname)
GpuKernelBase_version = property(_GpuKernelBase_version)
class HostFromGpu(Op):
@@ -815,23 +872,20 @@ class GpuEye(GpuKernelBase, Op):
def __hash__(self):
return hash(self.dtype) ^ hash(type(self))
def c_kernel_code(self, node):
return """
def gpu_kernels(self, node, name):
code = """
KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m;
for (ga_size i = LID_0; i < nb; i += LDIM_0) {
a[i*m + i] = 1;
}
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype))
def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_SIZE", "GA_SIZE"]
def c_kernel_name(self):
return "k"
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.dtype)
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype), name=name)
return [Kernel(
code=code, name="k",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
flags=Kernel.get_flags(self.dtype),
objvar='k_eye_'+name,
)]
def c_code(self, node, name, inp, out, sub):
n, m = inp
@@ -839,7 +893,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
fail = sub['fail']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync)
kname = self.c_kernel_obj(name)
kname = self.gpu_kernels(node, name)[0].objvar
s = """
size_t dims[2] = {0, 0};
void *args[3];
@@ -14,6 +14,7 @@ from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try:
import pygpu
from pygpu import gpuarray
from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel
from pygpu.reduction import ReductionKernel
@@ -22,7 +23,7 @@ except ImportError:
pass
from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase)
GpuKernelBase, Kernel)
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.gof.utils import MethodNotDefined
@@ -2406,40 +2407,29 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if any(redux):
return getattr(node, attr)
def c_kernel_code(self, node):
def gpu_kernels(self, node, name):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels
return "KERNEL void reduk(GLOBAL_MEM float *a) {}"
src = "KERNEL void reduk(GLOBAL_MEM float *a) {}"
params = ['float32']
else:
k = self.get_kernel_cache(node)
_, src, _, _ = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
return src
def c_kernel_name(self):
return "reduk"
def c_kernel_params(self, node):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
return ["GA_FLOAT"]
else:
# Make sure this is synced with the call definition in
# pygpu/reduction.py
nd = node.inputs[0].ndim
res = ["GA_UINT", "GA_BUFFER"]
res.extend("GA_UINT" for _ in range(nd))
res.append("GA_BUFFER")
res.append("GA_UINT")
res.extend("GA_INT" for _ in range(nd))
return res
def c_kernel_flags(self, node):
params = ['uint32', gpuarray.GpuArray]
params.extend('uint32' for _ in range(nd))
params.append(gpuarray.GpuArray)
params.append('uint32')
params.extend('int32' for _ in range(nd))
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
return self._get_kernel_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype)
return [Kernel(code=src, name="reduk", params=params,
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype),
objvar='k_reduk_'+name)]
def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
@@ -2458,7 +2448,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
node.inputs[0].ndim)
if self.axis is None:
redux = [True] * node.inputs[0].ndim
else:
@@ -2588,14 +2578,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d)
GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync),
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
def c_code_cache_version(self):
return (0,)
return (0, self.GpuKernelBase_version)
def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add):
@@ -26,7 +26,7 @@ if cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.sandbox.gpuarray.type import GpuArrayType
@@ -772,9 +772,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return op(rstate, cast(v_size, 'int32'))
def c_headers(self):
return GpuKernelBase.c_headers(self) + ['numpy_compat.h']
return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
def c_kernel_code(self, node):
def gpu_kernels(self, node, name):
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
@@ -783,10 +783,10 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
else:
otype = 'double'
NORM = '4.656612873077392578125e-10'
return """
code = """
KERNEL void mrg_uniform(
%(otype)s *sample_data,
ga_int *state_data,
GLOBAL_MEM %(otype)s *sample_data,
GLOBAL_MEM ga_int *state_data,
const ga_uint Nsamples,
const ga_uint Nstreams_used)
{
@@ -809,7 +809,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
const ga_int MASK2 = 65535; //2^16 - 1
const ga_int MULT2 = 21069;
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
const ga_uint idx = GID_0 * LDIM_0 + LID_0;
ga_int y1, y2, x11, x12, x13, x21, x22, x23;
if (idx < Nstreams_used)
@@ -821,7 +821,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
x22 = state_data[idx*6+4];
x23 = state_data[idx*6+5];
for (int i = idx; i < Nsamples; i += Nstreams_used)
for (ga_uint i = idx; i < Nsamples; i += Nstreams_used)
{
y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
@@ -864,14 +864,14 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals()
def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_BUFFER", "GA_UINT", "GA_UINT"]
def c_kernel_name(self):
return "mrg_uniform"
# we shouldn't get to this line if it's about to fail
from pygpu import gpuarray
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.output_type.dtype, 'int32')
return [Kernel(code=code, name="mrg_uniform",
params=[gpuarray.GpuArray, gpuarray.GpuArray,
'uint32', 'uint32'],
flags=Kernel.get_flags(self.output_type.dtype, 'int32'))
]
def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
@@ -880,7 +880,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
kname = self.c_kernel_obj(nodename)
kname = self.gpu_kernels(node, nodename)[0].objvar
if self.output_type.dtype == 'float32':
otype = 'float'