提交 c587ca18 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1833 from abergeron/kernel_precomp

Kernel precomp
...@@ -61,89 +61,146 @@ class HideC(object): ...@@ -61,89 +61,146 @@ class HideC(object):
return self.c_code_cache_version() return self.c_code_cache_version()
class GpuKernelBase(object): class Kernel(object):
GpuKernelBase_version = 0 """
This class groups together all the attributes of a gpu kernel.
def c_kernel_code(self, node): """
""" def __init__(self, code, params, name, flags,
Return the source code of the kernel. codevar=None, binvar=None, objvar=None):
""" self.code = code
raise AttributeError("c_kernel_code", type(self)) self.params = params
self.name = name
def c_kernel_params(self, node): self.flags = flags
""" if codevar is None:
Return the list of typecodes for kernel parameters. codevar = 'kcode_' + name
self.codevar = codevar
The list can contain strings ( "GA_BUFFER" ) or direct int values. if binvar is None:
""" binvar = 'kbin_' + name
raise AttributeError("c_kernel_params", type(self)) self.binvar = binvar
if objvar is None:
self.objvar = 'k_' + name
self.objvar = objvar
@staticmethod
def get_flags(*types):
def get_dtype(t):
if isinstance(t, (str, unicode)):
return numpy.dtype(t)
elif isinstance(t, Type):
return t.dtype
elif isinstance(t, Variable):
return t.type.dtype
else:
raise TypeError, "can't get a dtype from %s" % (type(t),)
dtypes = [get_dtype(t) for t in types]
flags = dict(cluda=True)
if any(d == numpy.float64 for d in dtypes):
flags['have_double'] = True
if any(d.itemsize < 4 for d in dtypes):
flags['have_small'] = True
if any(d.kind == 'c' for d in dtypes):
flags['have_complex'] = True
if any(d == numpy.float16 for d in dtypes):
flags['have_half'] = True
return flags
def _get_c_flags(self):
res = []
if self.flags.get('cluda', False):
res.append('GA_USE_CLUDA')
if self.flags.get('have_double', False):
res.append('GA_USE_DOUBLE')
if self.flags.get('have_small', False):
res.append('GA_USE_SMALL')
if self.flags.get('have_complex', False):
res.append('GA_USE_COMPLEX')
if self.flags.get('have_half', False):
res.append('GA_USE_SMALL')
return '|'.join(res)
def _get_c_types(self):
def m(t):
if t == gpuarray.GpuArray:
return "GA_BUFFER"
else:
return str(gpuarray.dtype_to_typecode(t))
return ', '.join(m(t) for t in self.params)
def c_kernel_name(self):
"""
Return the name of the kernel in the source.
"""
raise AttributeError("c_kernel_name", type(self))
def c_kernel_flags(self, node): class GpuKernelBase(object):
def gpu_kernels(self, node, name):
""" """
Return a string representing the C flags for the kernel. This is the method to override. This should return an
iterable of Kernel objects that describe the kernels this op
Example: will need.
"GA_USE_CLUDA|GA_USE_DOUBLE"
self._get_kernel_flags(*dtypes) returns an appropritate string
for the result of this function.
""" """
raise AttributeError("c_kernel_flags", type(self)) raise MethodNotDefined, 'gpu_kernels'
def c_kernel_codevar(self, name):
return 'kcode_' + name
def c_kernel_obj(self, name):
return 'k_' + name
def _get_kernel_flags(self, *dtypes):
dtypes = [numpy.dtype(d) for d in dtypes]
flags = ['GA_USE_CLUDA']
if any(d == numpy.float64 for d in dtypes):
flags.append('GA_USE_DOUBLE')
if any(d.itemsize < 4 for d in dtypes):
flags.append('GA_USE_SMALL')
return '|'.join(flags)
def c_headers(self): def c_headers(self):
return ['gpuarray/types.h'] try:
o = super(GpuKernelBase, self).c_headers()
except MethodNotDefined:
o = []
return o + ['gpuarray/types.h']
def _generate_kernel_bin(self, k):
gk = gpuarray.GpuKernel(k.code, k.name, k.params, **k.flags)
bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin)
return ("""static const char %(bname)s[] = { %(bcode)s };""" %
dict(bname=k.binvar, bcode=bcode))
def _generate_kernel_code(self, k):
code = '\\n'.join(l for l in k.code.split('\n'))
code = code.replace('"', '\\"')
return ("""static const char *%(cname)s = "%(code)s";""" %
dict(cname=k.codevar, code=code))
def _generate_kernel_vars(self, k):
return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar)
def c_support_code_apply(self, node, name): def c_support_code_apply(self, node, name):
kcode = self.c_kernel_code(node) kernels = self.gpu_kernels(node, name)
vname = self.c_kernel_codevar(name) bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels)
kname = self.c_kernel_obj(name) codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
code = '\\n'.join(l for l in kcode.split('\n')) vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels)
code = code.replace('"', '\\"') return '\n'.join([bins, codes, vars])
return """static const char *%(vname)s = "%(code)s";
static GpuKernel %(kname)s;""" % dict(vname=vname, kname=kname, code=code)
def c_init_code_apply(self, node, name): def _generate_kernel_init(self, k, err):
types = self.c_kernel_params(node)
numargs = len(types)
kname = self.c_kernel_name()
vname = self.c_kernel_codevar(name)
oname = self.c_kernel_obj(name)
flags = self.c_kernel_flags(node)
# TODO: find a way to release the kernel once the module is unloaded
error_out = ""
if PY3: if PY3:
error_out = "NULL" error_out = "NULL"
return """ else:
int types_%(name)s[%(numargs)u] = {%(types)s}; error_out = ""
if (GpuKernel_init(&%(oname)s, pygpu_default_context()->ops, return """{
pygpu_default_context()->ctx, 1, &%(vname)s, NULL, int types[%(numargs)u] = {%(types)s};
"%(kname)s", %(numargs)s, types_%(name)s, %(flags)s) != GA_NO_ERROR) { const char *bcode = %(bvar)s;
PyErr_SetString(PyExc_RuntimeError, "Error initializing kernel"); size_t sz = sizeof(%(bvar)s);
return %(error_out)s; PyGpuContextObject *c = pygpu_default_context();
} if (GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &bcode, &sz, "%(kname)s",
""" % dict(types=','.join(types), numargs=numargs, kname=kname, oname=oname, %(numargs)u, types, GA_USE_BINARY) != GA_NO_ERROR) {
vname=vname, flags=flags, error_out=error_out, name=name) if ((%(err)s = GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &%(cname)s,
NULL, "%(kname)s", %(numargs)u, types,
%(flags)s)) != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
%(err)s, Gpu_error(c->ops, c->ctx, %(err)s));
return %(error_out)s;
}
}
}""" % dict(numargs=len(k.params), types=k._get_c_types(), bvar=k.binvar,
ovar=k.objvar, kname=k.name, err=err, cname=k.codevar,
flags=k._get_c_flags(), error_out=error_out)
def c_init_code_apply(self, node, name):
err = 'err_' + name
kernels = self.gpu_kernels(node, name)
inits ='\n'.join(self._generate_kernel_init(k, err) for k in kernels)
return ("int %(err)s;\n" % dict(err=err)) + inits
def _GpuKernelBase_version(self):
ctx = gpuarray.get_default_context()
return (2, ctx.kind, ctx.devname)
GpuKernelBase_version = property(_GpuKernelBase_version)
class HostFromGpu(Op): class HostFromGpu(Op):
...@@ -815,23 +872,20 @@ class GpuEye(GpuKernelBase, Op): ...@@ -815,23 +872,20 @@ class GpuEye(GpuKernelBase, Op):
def __hash__(self): def __hash__(self):
return hash(self.dtype) ^ hash(type(self)) return hash(self.dtype) ^ hash(type(self))
def c_kernel_code(self, node): def gpu_kernels(self, node, name):
return """ code = """
KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m; ga_size nb = n < m ? n : m;
for (ga_size i = LID_0; i < nb; i += LDIM_0) { for (ga_size i = LID_0; i < nb; i += LDIM_0) {
a[i*m + i] = 1; a[i*m + i] = 1;
} }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype)) }""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype), name=name)
return [Kernel(
def c_kernel_params(self, node): code=code, name="k",
return ["GA_BUFFER", "GA_SIZE", "GA_SIZE"] params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
flags=Kernel.get_flags(self.dtype),
def c_kernel_name(self): objvar='k_eye_'+name,
return "k" )]
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.dtype)
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
n, m = inp n, m = inp
...@@ -839,7 +893,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { ...@@ -839,7 +893,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
fail = sub['fail'] fail = sub['fail']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype) typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync) sync = bool(config.gpuarray.sync)
kname = self.c_kernel_obj(name) kname = self.gpu_kernels()[0].objvar
s = """ s = """
size_t dims[2] = {0, 0}; size_t dims[2] = {0, 0};
void *args[3]; void *args[3];
......
...@@ -14,6 +14,7 @@ from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler ...@@ -14,6 +14,7 @@ from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try: try:
import pygpu import pygpu
from pygpu import gpuarray
from pygpu.tools import ScalarArg, ArrayArg from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel from pygpu.elemwise import ElemwiseKernel
from pygpu.reduction import ReductionKernel from pygpu.reduction import ReductionKernel
...@@ -22,7 +23,7 @@ except ImportError: ...@@ -22,7 +23,7 @@ except ImportError:
pass pass
from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC, from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase) GpuKernelBase, Kernel)
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
...@@ -2406,40 +2407,29 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2406,40 +2407,29 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if any(redux): if any(redux):
return getattr(node, attr) return getattr(node, attr)
def c_kernel_code(self, node): def gpu_kernels(self, node, name):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])): if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels # Some OpenCL compilers do not accept no-arguments kernels
return "KERNEL void reduk(GLOBAL_MEM float *a) {}" src = "KERNEL void reduk(GLOBAL_MEM float *a) {}"
params = ['float32']
else: else:
k = self.get_kernel_cache(node) k = self.get_kernel_cache(node)
_, src, _, _ = k._get_basic_kernel(k.init_local_size, _, src, _, _ = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim) node.inputs[0].ndim)
return src
def c_kernel_name(self):
return "reduk"
def c_kernel_params(self, node):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
return ["GA_FLOAT"]
else:
# Make sure this is synced with the call definition in
# pygpu/reduction.py
nd = node.inputs[0].ndim nd = node.inputs[0].ndim
res = ["GA_UINT", "GA_BUFFER"] params = ['uint32', gpuarray.GpuArray]
res.extend("GA_UINT" for _ in range(nd)) params.extend('uint32' for _ in range(nd))
res.append("GA_BUFFER") params.append(gpuarray.GpuArray)
res.append("GA_UINT") params.append('uint32')
res.extend("GA_INT" for _ in range(nd)) params.extend('int32' for _ in range(nd))
return res
def c_kernel_flags(self, node):
acc_dtype = getattr(self, 'acc_dtype', None) acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None: if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype acc_dtype = node.outputs[0].type.dtype
return self._get_kernel_flags(node.inputs[0].type.dtype, return [Kernel(code=src, name="reduk", params=params,
acc_dtype, flags=Kernel.get_flags(node.inputs[0].type.dtype,
node.outputs[0].type.dtype) acc_dtype,
node.outputs[0].type.dtype),
objvar='k_reduk_'+name)]
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])): if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
...@@ -2458,7 +2448,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2458,7 +2448,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
sync=bool(config.gpuarray.sync)) sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node) k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size, _, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim) node.inputs[0].ndim)
if self.axis is None: if self.axis is None:
redux = [True] * node.inputs[0].ndim redux = [True] * node.inputs[0].ndim
else: else:
...@@ -2588,14 +2578,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2588,14 +2578,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d) if (%(sync)d)
GpuArray_sync(&%(output)s->ga); GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync), """ % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input, ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype)) cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (0, self.GpuKernelBase_version)
def generate_kernel(self, node, odtype, redux): def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add): if isinstance(self.scalar_op, scalar.basic.Add):
......
...@@ -26,7 +26,7 @@ if cuda_available: ...@@ -26,7 +26,7 @@ if cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType, from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor) float32_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
...@@ -772,9 +772,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -772,9 +772,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return op(rstate, cast(v_size, 'int32')) return op(rstate, cast(v_size, 'int32'))
def c_headers(self): def c_headers(self):
return GpuKernelBase.c_headers(self) + ['numpy_compat.h'] return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
def c_kernel_code(self, node): def gpu_kernels(self, node, name):
if self.output_type.dtype == 'float32': if self.output_type.dtype == 'float32':
otype = 'float' otype = 'float'
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65)) NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
...@@ -783,10 +783,10 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -783,10 +783,10 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
else: else:
otype = 'double' otype = 'double'
NORM = '4.656612873077392578125e-10' NORM = '4.656612873077392578125e-10'
return """ code = """
KERNEL void mrg_uniform( KERNEL void mrg_uniform(
%(otype)s *sample_data, GLOBAL_MEM %(otype)s *sample_data,
ga_int *state_data, GLOBAL_MEM ga_int *state_data,
const ga_uint Nsamples, const ga_uint Nsamples,
const ga_uint Nstreams_used) const ga_uint Nstreams_used)
{ {
...@@ -809,7 +809,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -809,7 +809,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
const ga_int MASK2 = 65535; //2^16 - 1 const ga_int MASK2 = 65535; //2^16 - 1
const ga_int MULT2 = 21069; const ga_int MULT2 = 21069;
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; const ga_uint idx = GID_0 * LDIM_0 + LID_0;
ga_int y1, y2, x11, x12, x13, x21, x22, x23; ga_int y1, y2, x11, x12, x13, x21, x22, x23;
if (idx < Nstreams_used) if (idx < Nstreams_used)
...@@ -821,7 +821,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -821,7 +821,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
x22 = state_data[idx*6+4]; x22 = state_data[idx*6+4];
x23 = state_data[idx*6+5]; x23 = state_data[idx*6+5];
for (int i = idx; i < Nsamples; i += Nstreams_used) for (ga_uint i = idx; i < Nsamples; i += Nstreams_used)
{ {
y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24); y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0; y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
...@@ -864,14 +864,14 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -864,14 +864,14 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals() """ % locals()
def c_kernel_params(self, node): # we shouldn't get to this line if it's about to fail
return ["GA_BUFFER", "GA_BUFFER", "GA_UINT", "GA_UINT"] from pygpu import gpuarray
def c_kernel_name(self):
return "mrg_uniform"
def c_kernel_flags(self, node): return [Kernel(code=code, name="mrg_uniform",
return self._get_kernel_flags(self.output_type.dtype, 'int32') params=[gpuarray.GpuArray, gpuarray.GpuArray,
'uint32', 'uint32'],
flags=Kernel.get_flags(self.output_type.dtype, 'int32'))
]
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp rstate, size = inp
...@@ -880,7 +880,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -880,7 +880,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
ndim = self.output_type.ndim ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail'] fail = sub['fail']
kname = self.c_kernel_obj(nodename) kname = self.gpu_kernels(node, nodename)[0].objvar
if self.output_type.dtype == 'float32': if self.output_type.dtype == 'float32':
otype = 'float' otype = 'float'
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论