Commit f85085d0, authored by Frédéric Bastien

Merge pull request #1646 from abergeron/c_red

c_code() for GpuCAReduce (in gpuarray)
......@@ -62,6 +62,13 @@ There are less methods to define for an Op than for a Type:
Allows you to specify code that will be executed once when the
module is initialized, before anything else is executed.
.. method:: c_init_code_apply(self, node, name)
Allows you to specify code that will be executed once when the
module is initialized, before anything else is executed and is
specialized for a particular apply of an :ref:`op`. Use
`c_init_code` if the code is the same for each apply of an op.
.. method:: c_support_code()
Allows you to specify helper functions/structs that the
......
......@@ -484,6 +484,7 @@ class CLinker(link.Linker):
self.consts = []
c_support_code_apply = []
c_init_code_apply = []
symbol = {}
......@@ -630,7 +631,15 @@ class CLinker(link.Linker):
# The following will be executed if the "try" block succeeds
assert isinstance(c_support_code_apply[-1], basestring), (
str(node.op) +
" didn't returned a string for c_support_code_apply")
" didn't return a string for c_support_code_apply")
try:
c_init_code_apply.append(op.c_init_code_apply(node, name))
except utils.MethodNotDefined:
pass
else:
assert isinstance(c_init_code_apply[-1], basestring), (
str(node.op) +
" didn't return a string for c_init_code_apply")
# emit c_code
try:
......@@ -638,7 +647,7 @@ class CLinker(link.Linker):
except utils.MethodNotDefined:
raise NotImplementedError("%s cannot produce C code" % op)
assert isinstance(behavior, basestring), (
str(node.op) + " didn't returned a string for c_code")
str(node.op) + " didn't return a string for c_code")
try:
cleanup = op.c_code_cleanup(node, name, isyms, osyms, sub)
......@@ -677,6 +686,7 @@ class CLinker(link.Linker):
self.tasks = tasks
all_info = self.inputs + self.outputs + self.orphans
self.c_support_code_apply = c_support_code_apply
self.c_init_code_apply = c_init_code_apply
if (self.init_tasks, self.tasks) != self.get_init_tasks():
print >> sys.stderr, "init_tasks\n", self.init_tasks
......@@ -1292,7 +1302,7 @@ class CLinker(link.Linker):
mod.add_function(instantiate)
for header in self.headers():
mod.add_include(header)
for init_code_block in self.init_code():
for init_code_block in self.init_code() + self.c_init_code_apply:
mod.add_init_code(init_code_block)
return mod
......
......@@ -187,6 +187,18 @@ class CLinkerObject(object):
self.__class__.__name__)
def c_init_code_apply(self, node, name):
"""
Optional: return a list of code snippets specific to the apply
to be inserted in module initialization.
:Exceptions:
- `MethodNotDefined`: the subclass does not override this method
"""
raise utils.MethodNotDefined("c_init_code_apply", type(self),
self.__class__.__name__)
class CLinkerOp(CLinkerObject):
"""
Interface definition for `Op` subclasses compiled by `CLinker`.
......
......@@ -52,6 +52,7 @@ class HideC(object):
c_compile_args = __hide
c_no_compile_args = __hide
c_init_code = __hide
c_init_code_apply = __hide
def c_code_cache_version(self):
return ()
......@@ -63,13 +64,13 @@ class HideC(object):
class GpuKernelBase(object):
GpuKernelBase_version = 0
def c_kernel_code(self):
def c_kernel_code(self, node):
"""
Return the source code of the kernel.
"""
raise AttributeError("c_kernel_code", type(self))
def c_kernel_params(self):
def c_kernel_params(self, node):
"""
Return the list of typecodes for kernel parameters.
......@@ -83,7 +84,7 @@ class GpuKernelBase(object):
"""
raise AttributeError("c_kernel_name", type(self))
def c_kernel_flags(self):
def c_kernel_flags(self, node):
"""
Return a string representing the C flags for the kernel.
......@@ -95,11 +96,11 @@ class GpuKernelBase(object):
"""
raise AttributeError("c_kernel_flags", type(self))
def c_kernel_codevar(self):
return 'kcode_' + type(self).__name__ + '_' + hex(hash(self))[2:]
def c_kernel_codevar(self, name):
return 'kcode_' + name
def c_kernel_obj(self):
return 'k_' + type(self).__name__ + '_' + hex(hash(self))[2:]
def c_kernel_obj(self, name):
return 'k_' + name
def _get_kernel_flags(self, *dtypes):
dtypes = [numpy.dtype(d) for d in dtypes]
......@@ -113,35 +114,36 @@ class GpuKernelBase(object):
def c_headers(self):
return ['compyte/types.h']
def c_support_code(self):
kcode = self.c_kernel_code()
vname = self.c_kernel_codevar()
kname = self.c_kernel_obj()
def c_support_code_apply(self, node, name):
kcode = self.c_kernel_code(node)
vname = self.c_kernel_codevar(name)
kname = self.c_kernel_obj(name)
code = '\\n'.join(l for l in kcode.split('\n'))
code = code.replace('"', '\\"')
return """static const char *%(vname)s = "%(code)s";
static GpuKernel %(kname)s;""" % dict(vname=vname, kname=kname,code=code)
static GpuKernel %(kname)s;""" % dict(vname=vname, kname=kname, code=code)
def c_init_code(self):
types = self.c_kernel_params()
def c_init_code_apply(self, node, name):
types = self.c_kernel_params(node)
numargs = len(types)
name = self.c_kernel_name()
vname = self.c_kernel_codevar()
kname = self.c_kernel_obj()
flags = self.c_kernel_flags()
kname = self.c_kernel_name()
vname = self.c_kernel_codevar(name)
oname = self.c_kernel_obj(name)
flags = self.c_kernel_flags(node)
# TODO: find a way to release the kernel once the module is unloaded
error_out = ""
if PY3:
error_out = "NULL"
return ["""
int types[%(numargs)u] = {%(types)s};
if (GpuKernel_init(&%(kname)s, pygpu_default_context()->ops,
return """
int types_%(name)s[%(numargs)u] = {%(types)s};
if (GpuKernel_init(&%(oname)s, pygpu_default_context()->ops,
pygpu_default_context()->ctx, 1, &%(vname)s, NULL,
"%(name)s", %(numargs)s, types, %(flags)s) != GA_NO_ERROR) {
"%(kname)s", %(numargs)s, types_%(name)s, %(flags)s) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Error initializing kernel");
return %(error_out)s;
}
""" % dict(types=','.join(types), numargs=numargs, kname=kname, name=name,
vname=vname, flags=flags, error_out=error_out)]
""" % dict(types=','.join(types), numargs=numargs, kname=kname, oname=oname,
vname=vname, flags=flags, error_out=error_out, name=name)
class HostFromGpu(Op):
......@@ -726,7 +728,7 @@ class GpuEye(GpuKernelBase, Op):
def __hash__(self):
return hash(self.dtype) ^ hash(type(self))
def c_kernel_code(self):
def c_kernel_code(self, node):
return """
KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m;
......@@ -735,13 +737,13 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
}
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype))
def c_kernel_params(self):
def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_SIZE", "GA_SIZE"]
def c_kernel_name(self):
return "k"
def c_kernel_flags(self):
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.dtype)
def c_code(self, node, name, inp, out, sub):
......@@ -750,7 +752,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
fail = sub['fail']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync)
kname = self.c_kernel_obj()
kname = self.c_kernel_obj(name)
s = """
size_t dims[2] = {0, 0};
void *args[3];
......
......@@ -11,11 +11,12 @@ try:
from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel
from pygpu.reduction import ReductionKernel
from pygpu.gpuarray import dtype_to_typecode
from pygpu.gpuarray import dtype_to_typecode, dtype_to_ctype
except ImportError:
pass
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase)
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.gof.utils import MethodNotDefined
......@@ -480,7 +481,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return (3,)
class GpuCAReduce(HideC, CAReduceDtype):
class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
if not hasattr(scalar_op, 'identity'):
raise ValueError("No identity on scalar op")
......@@ -510,18 +511,218 @@ class GpuCAReduce(HideC, CAReduceDtype):
return Apply(res.op, [input], [otype()])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
# cache the kernel object
self.get_kernel_cache(node)
return super(GpuCAReduce, self).make_thunk(node, storage_map,
compute_map, no_recycling)
def get_kernel_cache(self, node):
attr = '@cache_reduction_k'
if self.axis is None:
redux = [True] * node.inputs[0].ndim
else:
redux = self.redux
if not hasattr(node, attr):
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
if any(redux):
setattr(node, attr, self.generate_kernel(node, acc_dtype,
redux))
if any(redux):
return getattr(node, attr)
def c_kernel_code(self, node):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels
return "KERNEL void reduk(GLOBAL_MEM float *a) {}"
else:
k = self.get_kernel_cache(node)
_, src, _, _ = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
return src
def c_kernel_name(self):
return "reduk"
def c_kernel_params(self, node):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
return ["GA_FLOAT"]
else:
# Make sure this is synced with the call definition in
# pygpu/reduction.py
nd = node.inputs[0].ndim
res = ["GA_UINT", "GA_BUFFER"]
res.extend("GA_UINT" for _ in range(nd))
res.append("GA_BUFFER")
res.append("GA_UINT")
res.extend("GA_INT" for _ in range(nd))
return res
def c_kernel_flags(self, node):
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
if any(redux):
node._cache_reduction_k = self.generate_kernel(node, acc_dtype,
redux)
return super(GpuCAReduce, self).make_thunk(node, storage_map,
compute_map, no_recycling)
return self._get_kernel_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype)
def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# We special case the no-reduction case since the gpu
# kernel has trouble handling it.
return """
Py_XDECREF(%(out)s);
%(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
if (!%(out)s) {
%(fail)s
}
if (%(sync)d)
GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
if self.axis is None:
redux = [True] * node.inputs[0].ndim
else:
redux = self.redux
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
input = inp[0]
output = out[0]
nd_out = node.outputs[0].ndim
code = """
size_t gs = 1;
unsigned int n = 1;
unsigned int proxy_dim[%(nd_in)s];
unsigned int proxy_off;
int proxy_str[%(nd_in)s];
void *args[%(n_args)s];
PyGpuArrayObject *tmp;
int err;
""" % dict(n_args=4 + (node.inputs[0].ndim * 2), nd_in=node.inputs[0].ndim)
if nd_out != 0:
code += """
size_t out_dims[%(nd_out)s];
int need_out = %(output)s == NULL || %(output)s->ga.nd != %(nd_out)s;
""" % dict(nd_out=nd_out, output=output)
j = 0
for i in range(node.inputs[0].ndim):
if not self.redux[i]:
code += """
out_dims[%(j)s] = %(input)s->ga.dimensions[%(i)s];
if (!need_out)
need_out |= %(output)s->ga.dimensions[%(j)s] != out_dims[%(j)s];
""" % dict(j=j, i=i, input=input, output=output)
j += 1
code += """
if (need_out) {
%(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, pygpu_default_context(), Py_None);
if (!%(output)s) {
%(fail)s
}
}
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
else:
code += """
if (%(output)s == NULL || %(output)s->ga.nd != 0) {
Py_XDECREF(%(output)s);
%(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(output)s) {
%(fail)s
}
}
""" % dict(output=output, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
if acc_dtype != node.outputs[0].type.dtype:
code += """
tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
%(acc_type)s, GA_C_ORDER, pygpu_default_context(),
Py_None);
if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], acc_type=dtype_to_typecode(acc_dtype))
else:
code += """
tmp = %(output)s;
Py_INCREF(tmp);
""" % dict(output=output)
# We need the proxies since we are passing a pointer to the
# data into the call and therefore we need a real copy of the
# data in the proper type.
code += """
args[0] = &n;
args[1] = &tmp->ga;
""" % dict(output=output)
p = 2
for i in range(node.inputs[0].ndim):
code += """
proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
args[%(p)s] = &proxy_dim[%(i)s];
n *= %(input)s->ga.dimensions[%(i)s];
""" % dict(i=i, p=p, input=input)
p += 1
if not redux[i]:
code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)
code += """
args[%(p)s] = &%(input)s->ga;
proxy_off = %(input)s->ga.offset;
args[%(p)s+1] = &proxy_off;
""" % dict(p=p, input=input)
p += 2
for i in range(node.inputs[0].ndim):
code += """
proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
args[%(p)s] = &proxy_str[%(i)s];
""" % dict(p=p, i=i, input=input)
p += 1
code += """
if (gs == 0) gs = 1;
n /= gs;
err = GpuKernel_call(&%(k_var)s, 0, %(ls)s, gs, args);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"compyte error: GpuCAReduce: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s
}
if (%(cast_out)d) {
err = GpuArray_move(&%(output)s->ga, &tmp->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"compyte error: GpuCAReduce [cast]: %%s.",
GpuArray_error(&tmp->ga, err));
%(fail)s
}
} else {
Py_XDECREF(%(output)s);
%(output)s = tmp;
}
if (%(sync)d)
GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
def c_code_cache_version(self):
return (0,)
def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add):
......@@ -533,8 +734,7 @@ class GpuCAReduce(HideC, CAReduceDtype):
return ReductionKernel(pygpu.get_default_context(), odtype,
self.scalar_op.identity, reduce_expr, redux,
arguments=[make_argument(node.inputs[0], 'a')],
init_nd=node.inputs[0].ndim
)
init_nd=node.inputs[0].ndim)
def perform(self, node, inp, out):
input, = inp
......@@ -546,7 +746,7 @@ class GpuCAReduce(HideC, CAReduceDtype):
redux = self.redux
if any(redux):
output[0] = node._cache_reduction_k(input).astype(copy=False,
output[0] = self.get_kernel_cache(node)(input).astype(copy=False,
dtype=node.outputs[0].type.dtype)
else:
output[0] = pygpu.gpuarray.array(input, copy=True,
......
......@@ -55,7 +55,12 @@ class test_GpuCAReduce(test_CAReduce):
test_nan=True)
def test_c(self):
raise SkipTest("no C code")
for dtype in self.dtypes + self.bin_dtypes:
for op in self.reds:
self.with_linker(gof.CLinker(), op, dtype=dtype)
def test_c_nan(self):
raise SkipTest("no C code")
for dtype in self.dtypes:
for op in self.reds:
self.with_linker(gof.CLinker(), op, dtype=dtype,
test_nan=True)
Markdown formatting is supported.
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment.