提交 f85085d0 作者: Frédéric Bastien

Merge pull request #1646 from abergeron/c_red

c_code() for GpuCAReduce (in gpuarray)
...@@ -62,6 +62,13 @@ There are less methods to define for an Op than for a Type: ...@@ -62,6 +62,13 @@ There are less methods to define for an Op than for a Type:
Allows you to specify code that will be executed once when the Allows you to specify code that will be executed once when the
module is initialized, before anything else is executed. module is initialized, before anything else is executed.
.. method:: c_init_code_apply(self, node, name)
Allows you to specify code that will be executed once when the
module is initialized, before anything else is executed, and that
is specialized for a particular apply node of an :ref:`op`. The
method must return a string. Use `c_init_code` instead if the code
is the same for every apply of an op.
.. method:: c_support_code() .. method:: c_support_code()
Allows you to specify helper functions/structs that the Allows you to specify helper functions/structs that the
......
...@@ -484,6 +484,7 @@ class CLinker(link.Linker): ...@@ -484,6 +484,7 @@ class CLinker(link.Linker):
self.consts = [] self.consts = []
c_support_code_apply = [] c_support_code_apply = []
c_init_code_apply = []
symbol = {} symbol = {}
...@@ -630,7 +631,15 @@ class CLinker(link.Linker): ...@@ -630,7 +631,15 @@ class CLinker(link.Linker):
# The following will be executed if the "try" block succeeds # The following will be executed if the "try" block succeeds
assert isinstance(c_support_code_apply[-1], basestring), ( assert isinstance(c_support_code_apply[-1], basestring), (
str(node.op) + str(node.op) +
" didn't returned a string for c_support_code_apply") " didn't return a string for c_support_code_apply")
try:
c_init_code_apply.append(op.c_init_code_apply(node, name))
except utils.MethodNotDefined:
pass
else:
assert isinstance(c_init_code_apply[-1], basestring), (
str(node.op) +
" didn't return a string for c_init_code_apply")
# emit c_code # emit c_code
try: try:
...@@ -638,7 +647,7 @@ class CLinker(link.Linker): ...@@ -638,7 +647,7 @@ class CLinker(link.Linker):
except utils.MethodNotDefined: except utils.MethodNotDefined:
raise NotImplementedError("%s cannot produce C code" % op) raise NotImplementedError("%s cannot produce C code" % op)
assert isinstance(behavior, basestring), ( assert isinstance(behavior, basestring), (
str(node.op) + " didn't returned a string for c_code") str(node.op) + " didn't return a string for c_code")
try: try:
cleanup = op.c_code_cleanup(node, name, isyms, osyms, sub) cleanup = op.c_code_cleanup(node, name, isyms, osyms, sub)
...@@ -677,6 +686,7 @@ class CLinker(link.Linker): ...@@ -677,6 +686,7 @@ class CLinker(link.Linker):
self.tasks = tasks self.tasks = tasks
all_info = self.inputs + self.outputs + self.orphans all_info = self.inputs + self.outputs + self.orphans
self.c_support_code_apply = c_support_code_apply self.c_support_code_apply = c_support_code_apply
self.c_init_code_apply = c_init_code_apply
if (self.init_tasks, self.tasks) != self.get_init_tasks(): if (self.init_tasks, self.tasks) != self.get_init_tasks():
print >> sys.stderr, "init_tasks\n", self.init_tasks print >> sys.stderr, "init_tasks\n", self.init_tasks
...@@ -1292,7 +1302,7 @@ class CLinker(link.Linker): ...@@ -1292,7 +1302,7 @@ class CLinker(link.Linker):
mod.add_function(instantiate) mod.add_function(instantiate)
for header in self.headers(): for header in self.headers():
mod.add_include(header) mod.add_include(header)
for init_code_block in self.init_code(): for init_code_block in self.init_code() + self.c_init_code_apply:
mod.add_init_code(init_code_block) mod.add_init_code(init_code_block)
return mod return mod
......
...@@ -187,6 +187,18 @@ class CLinkerObject(object): ...@@ -187,6 +187,18 @@ class CLinkerObject(object):
self.__class__.__name__) self.__class__.__name__)
def c_init_code_apply(self, node, name):
"""
Optional: return a code string specific to the apply node, to be
inserted in the module initialization code.

The return value must be a single string, not a list of snippets:
the linker asserts that the value it collects from this method is
a string.

:param node: the apply node the initialization code is generated for
:param name: name associated with this apply; implementations in
    this change use it to build per-apply C identifiers
    (NOTE(review): exact guarantees on uniqueness are not visible
    here -- confirm against the linker)

:Exceptions:
- `MethodNotDefined`: the subclass does not override this method
"""
raise utils.MethodNotDefined("c_init_code_apply", type(self),
self.__class__.__name__)
class CLinkerOp(CLinkerObject): class CLinkerOp(CLinkerObject):
""" """
Interface definition for `Op` subclasses compiled by `CLinker`. Interface definition for `Op` subclasses compiled by `CLinker`.
......
...@@ -52,6 +52,7 @@ class HideC(object): ...@@ -52,6 +52,7 @@ class HideC(object):
c_compile_args = __hide c_compile_args = __hide
c_no_compile_args = __hide c_no_compile_args = __hide
c_init_code = __hide c_init_code = __hide
c_init_code_apply = __hide
def c_code_cache_version(self): def c_code_cache_version(self):
return () return ()
...@@ -63,13 +64,13 @@ class HideC(object): ...@@ -63,13 +64,13 @@ class HideC(object):
class GpuKernelBase(object): class GpuKernelBase(object):
GpuKernelBase_version = 0 GpuKernelBase_version = 0
def c_kernel_code(self): def c_kernel_code(self, node):
""" """
Return the source code of the kernel. Return the source code of the kernel.
""" """
raise AttributeError("c_kernel_code", type(self)) raise AttributeError("c_kernel_code", type(self))
def c_kernel_params(self): def c_kernel_params(self, node):
""" """
Return the list of typecodes for kernel parameters. Return the list of typecodes for kernel parameters.
...@@ -83,7 +84,7 @@ class GpuKernelBase(object): ...@@ -83,7 +84,7 @@ class GpuKernelBase(object):
""" """
raise AttributeError("c_kernel_name", type(self)) raise AttributeError("c_kernel_name", type(self))
def c_kernel_flags(self): def c_kernel_flags(self, node):
""" """
Return a string representing the C flags for the kernel. Return a string representing the C flags for the kernel.
...@@ -95,11 +96,11 @@ class GpuKernelBase(object): ...@@ -95,11 +96,11 @@ class GpuKernelBase(object):
""" """
raise AttributeError("c_kernel_flags", type(self)) raise AttributeError("c_kernel_flags", type(self))
def c_kernel_codevar(self): def c_kernel_codevar(self, name):
return 'kcode_' + type(self).__name__ + '_' + hex(hash(self))[2:] return 'kcode_' + name
def c_kernel_obj(self): def c_kernel_obj(self, name):
return 'k_' + type(self).__name__ + '_' + hex(hash(self))[2:] return 'k_' + name
def _get_kernel_flags(self, *dtypes): def _get_kernel_flags(self, *dtypes):
dtypes = [numpy.dtype(d) for d in dtypes] dtypes = [numpy.dtype(d) for d in dtypes]
...@@ -113,35 +114,36 @@ class GpuKernelBase(object): ...@@ -113,35 +114,36 @@ class GpuKernelBase(object):
def c_headers(self): def c_headers(self):
return ['compyte/types.h'] return ['compyte/types.h']
def c_support_code(self): def c_support_code_apply(self, node, name):
kcode = self.c_kernel_code() kcode = self.c_kernel_code(node)
vname = self.c_kernel_codevar() vname = self.c_kernel_codevar(name)
kname = self.c_kernel_obj() kname = self.c_kernel_obj(name)
code = '\\n'.join(l for l in kcode.split('\n')) code = '\\n'.join(l for l in kcode.split('\n'))
code = code.replace('"', '\\"')
return """static const char *%(vname)s = "%(code)s"; return """static const char *%(vname)s = "%(code)s";
static GpuKernel %(kname)s;""" % dict(vname=vname, kname=kname,code=code) static GpuKernel %(kname)s;""" % dict(vname=vname, kname=kname, code=code)
def c_init_code(self): def c_init_code_apply(self, node, name):
types = self.c_kernel_params() types = self.c_kernel_params(node)
numargs = len(types) numargs = len(types)
name = self.c_kernel_name() kname = self.c_kernel_name()
vname = self.c_kernel_codevar() vname = self.c_kernel_codevar(name)
kname = self.c_kernel_obj() oname = self.c_kernel_obj(name)
flags = self.c_kernel_flags() flags = self.c_kernel_flags(node)
# TODO: find a way to release the kernel once the module is unloaded # TODO: find a way to release the kernel once the module is unloaded
error_out = "" error_out = ""
if PY3: if PY3:
error_out = "NULL" error_out = "NULL"
return [""" return """
int types[%(numargs)u] = {%(types)s}; int types_%(name)s[%(numargs)u] = {%(types)s};
if (GpuKernel_init(&%(kname)s, pygpu_default_context()->ops, if (GpuKernel_init(&%(oname)s, pygpu_default_context()->ops,
pygpu_default_context()->ctx, 1, &%(vname)s, NULL, pygpu_default_context()->ctx, 1, &%(vname)s, NULL,
"%(name)s", %(numargs)s, types, %(flags)s) != GA_NO_ERROR) { "%(kname)s", %(numargs)s, types_%(name)s, %(flags)s) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Error initializing kernel"); PyErr_SetString(PyExc_RuntimeError, "Error initializing kernel");
return %(error_out)s; return %(error_out)s;
} }
""" % dict(types=','.join(types), numargs=numargs, kname=kname, name=name, """ % dict(types=','.join(types), numargs=numargs, kname=kname, oname=oname,
vname=vname, flags=flags, error_out=error_out)] vname=vname, flags=flags, error_out=error_out, name=name)
class HostFromGpu(Op): class HostFromGpu(Op):
...@@ -726,7 +728,7 @@ class GpuEye(GpuKernelBase, Op): ...@@ -726,7 +728,7 @@ class GpuEye(GpuKernelBase, Op):
def __hash__(self): def __hash__(self):
return hash(self.dtype) ^ hash(type(self)) return hash(self.dtype) ^ hash(type(self))
def c_kernel_code(self): def c_kernel_code(self, node):
return """ return """
KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
ga_size nb = n < m ? n : m; ga_size nb = n < m ? n : m;
...@@ -735,13 +737,13 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { ...@@ -735,13 +737,13 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
} }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype)) }""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype))
def c_kernel_params(self): def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_SIZE", "GA_SIZE"] return ["GA_BUFFER", "GA_SIZE", "GA_SIZE"]
def c_kernel_name(self): def c_kernel_name(self):
return "k" return "k"
def c_kernel_flags(self): def c_kernel_flags(self, node):
return self._get_kernel_flags(self.dtype) return self._get_kernel_flags(self.dtype)
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
...@@ -750,7 +752,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { ...@@ -750,7 +752,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
fail = sub['fail'] fail = sub['fail']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype) typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync) sync = bool(config.gpuarray.sync)
kname = self.c_kernel_obj() kname = self.c_kernel_obj(name)
s = """ s = """
size_t dims[2] = {0, 0}; size_t dims[2] = {0, 0};
void *args[3]; void *args[3];
......
...@@ -11,11 +11,12 @@ try: ...@@ -11,11 +11,12 @@ try:
from pygpu.tools import ScalarArg, ArrayArg from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel from pygpu.elemwise import ElemwiseKernel
from pygpu.reduction import ReductionKernel from pygpu.reduction import ReductionKernel
from pygpu.gpuarray import dtype_to_typecode from pygpu.gpuarray import dtype_to_typecode, dtype_to_ctype
except ImportError: except ImportError:
pass pass
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase)
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
...@@ -480,7 +481,7 @@ class GpuDimShuffle(HideC, DimShuffle): ...@@ -480,7 +481,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return (3,) return (3,)
class GpuCAReduce(HideC, CAReduceDtype): class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None): def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
if not hasattr(scalar_op, 'identity'): if not hasattr(scalar_op, 'identity'):
raise ValueError("No identity on scalar op") raise ValueError("No identity on scalar op")
...@@ -510,18 +511,218 @@ class GpuCAReduce(HideC, CAReduceDtype): ...@@ -510,18 +511,218 @@ class GpuCAReduce(HideC, CAReduceDtype):
return Apply(res.op, [input], [otype()]) return Apply(res.op, [input], [otype()])
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling):
# cache the kernel object
self.get_kernel_cache(node)
return super(GpuCAReduce, self).make_thunk(node, storage_map,
compute_map, no_recycling)
def get_kernel_cache(self, node):
attr = '@cache_reduction_k'
if self.axis is None: if self.axis is None:
redux = [True] * node.inputs[0].ndim redux = [True] * node.inputs[0].ndim
else: else:
redux = self.redux redux = self.redux
if not hasattr(node, attr):
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
if any(redux):
setattr(node, attr, self.generate_kernel(node, acc_dtype,
redux))
if any(redux):
return getattr(node, attr)
def c_kernel_code(self, node):
"""
Return the source code of the kernel used for `node`.

When no axis is reduced a placeholder kernel is returned; otherwise
the source comes from the kernel object cached by `get_kernel_cache`.
"""
# `redux` may not be set on the op; in that case fall back to a
# one-element mask that is True only when the input has at least
# one dimension.
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Nothing to reduce: emit an empty kernel. It still takes one
# dummy argument because:
# Some OpenCL compilers do not accept no-arguments kernels
return "KERNEL void reduk(GLOBAL_MEM float *a) {}"
else:
k = self.get_kernel_cache(node)
# Only the second item of the returned 4-tuple is needed here
# (presumably the kernel source text -- confirm in pygpu/reduction.py).
_, src, _, _ = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
return src
def c_kernel_name(self):
"""
Return the name of the kernel function.

This is the same name as the placeholder kernel declared in
`c_kernel_code` for the no-reduction case.
"""
return "reduk"
def c_kernel_params(self, node):
"""
Return the list of typecode names for the kernel parameters.

With `nd = node.inputs[0].ndim`, the reduction case yields, in
order: one GA_UINT, one GA_BUFFER, `nd` GA_UINT, one GA_BUFFER,
one GA_UINT and `nd` GA_INT. This is the same order in which
`c_code` fills its `args` array (element count, output buffer,
input dimensions, input buffer, input offset, input strides).
"""
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Matches the single dummy `float *` argument of the placeholder
# kernel returned by c_kernel_code when nothing is reduced.
return ["GA_FLOAT"]
else:
# Make sure this is synced with the call definition in
# pygpu/reduction.py
nd = node.inputs[0].ndim
res = ["GA_UINT", "GA_BUFFER"]
res.extend("GA_UINT" for _ in range(nd))
res.append("GA_BUFFER")
res.append("GA_UINT")
res.extend("GA_INT" for _ in range(nd))
return res
def c_kernel_flags(self, node):
acc_dtype = getattr(self, 'acc_dtype', None) acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None: if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype acc_dtype = node.outputs[0].type.dtype
if any(redux): return self._get_kernel_flags(node.inputs[0].type.dtype,
node._cache_reduction_k = self.generate_kernel(node, acc_dtype, acc_dtype,
redux) node.outputs[0].type.dtype)
return super(GpuCAReduce, self).make_thunk(node, storage_map,
compute_map, no_recycling) def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# We special case the no-reduction case since the gpu
# kernel has trouble handling it.
return """
Py_XDECREF(%(out)s);
%(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
if (!%(out)s) {
%(fail)s
}
if (%(sync)d)
GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
if self.axis is None:
redux = [True] * node.inputs[0].ndim
else:
redux = self.redux
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
input = inp[0]
output = out[0]
nd_out = node.outputs[0].ndim
code = """
size_t gs = 1;
unsigned int n = 1;
unsigned int proxy_dim[%(nd_in)s];
unsigned int proxy_off;
int proxy_str[%(nd_in)s];
void *args[%(n_args)s];
PyGpuArrayObject *tmp;
int err;
""" % dict(n_args=4 + (node.inputs[0].ndim * 2), nd_in=node.inputs[0].ndim)
if nd_out != 0:
code += """
size_t out_dims[%(nd_out)s];
int need_out = %(output)s == NULL || %(output)s->ga.nd != %(nd_out)s;
""" % dict(nd_out=nd_out, output=output)
j = 0
for i in range(node.inputs[0].ndim):
if not self.redux[i]:
code += """
out_dims[%(j)s] = %(input)s->ga.dimensions[%(i)s];
if (!need_out)
need_out |= %(output)s->ga.dimensions[%(j)s] != out_dims[%(j)s];
""" % dict(j=j, i=i, input=input, output=output)
j += 1
code += """
if (need_out) {
%(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, pygpu_default_context(), Py_None);
if (!%(output)s) {
%(fail)s
}
}
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
else:
code += """
if (%(output)s == NULL || %(output)s->ga.nd != 0) {
Py_XDECREF(%(output)s);
%(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(output)s) {
%(fail)s
}
}
""" % dict(output=output, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
if acc_dtype != node.outputs[0].type.dtype:
code += """
tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
%(acc_type)s, GA_C_ORDER, pygpu_default_context(),
Py_None);
if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], acc_type=dtype_to_typecode(acc_dtype))
else:
code += """
tmp = %(output)s;
Py_INCREF(tmp);
""" % dict(output=output)
# We need the proxies since we are passing a pointer to the
# data into the call and therefore we need a real copy of the
# data in the proper type.
code += """
args[0] = &n;
args[1] = &tmp->ga;
""" % dict(output=output)
p = 2
for i in range(node.inputs[0].ndim):
code += """
proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
args[%(p)s] = &proxy_dim[%(i)s];
n *= %(input)s->ga.dimensions[%(i)s];
""" % dict(i=i, p=p, input=input)
p += 1
if not redux[i]:
code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)
code += """
args[%(p)s] = &%(input)s->ga;
proxy_off = %(input)s->ga.offset;
args[%(p)s+1] = &proxy_off;
""" % dict(p=p, input=input)
p += 2
for i in range(node.inputs[0].ndim):
code += """
proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
args[%(p)s] = &proxy_str[%(i)s];
""" % dict(p=p, i=i, input=input)
p += 1
code += """
if (gs == 0) gs = 1;
n /= gs;
err = GpuKernel_call(&%(k_var)s, 0, %(ls)s, gs, args);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"compyte error: GpuCAReduce: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s
}
if (%(cast_out)d) {
err = GpuArray_move(&%(output)s->ga, &tmp->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"compyte error: GpuCAReduce [cast]: %%s.",
GpuArray_error(&tmp->ga, err));
%(fail)s
}
} else {
Py_XDECREF(%(output)s);
%(output)s = tmp;
}
if (%(sync)d)
GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
def c_code_cache_version(self):
"""
Return the version tuple of the C code generated by this op.

NOTE(review): presumably this must be bumped whenever the generated
C code changes so stale cached modules are not reused -- confirm
against the CLinker cache documentation.
"""
return (0,)
def generate_kernel(self, node, odtype, redux): def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add): if isinstance(self.scalar_op, scalar.basic.Add):
...@@ -533,8 +734,7 @@ class GpuCAReduce(HideC, CAReduceDtype): ...@@ -533,8 +734,7 @@ class GpuCAReduce(HideC, CAReduceDtype):
return ReductionKernel(pygpu.get_default_context(), odtype, return ReductionKernel(pygpu.get_default_context(), odtype,
self.scalar_op.identity, reduce_expr, redux, self.scalar_op.identity, reduce_expr, redux,
arguments=[make_argument(node.inputs[0], 'a')], arguments=[make_argument(node.inputs[0], 'a')],
init_nd=node.inputs[0].ndim init_nd=node.inputs[0].ndim)
)
def perform(self, node, inp, out): def perform(self, node, inp, out):
input, = inp input, = inp
...@@ -546,7 +746,7 @@ class GpuCAReduce(HideC, CAReduceDtype): ...@@ -546,7 +746,7 @@ class GpuCAReduce(HideC, CAReduceDtype):
redux = self.redux redux = self.redux
if any(redux): if any(redux):
output[0] = node._cache_reduction_k(input).astype(copy=False, output[0] = self.get_kernel_cache(node)(input).astype(copy=False,
dtype=node.outputs[0].type.dtype) dtype=node.outputs[0].type.dtype)
else: else:
output[0] = pygpu.gpuarray.array(input, copy=True, output[0] = pygpu.gpuarray.array(input, copy=True,
......
...@@ -55,7 +55,12 @@ class test_GpuCAReduce(test_CAReduce): ...@@ -55,7 +55,12 @@ class test_GpuCAReduce(test_CAReduce):
test_nan=True) test_nan=True)
def test_c(self): def test_c(self):
raise SkipTest("no C code") for dtype in self.dtypes + self.bin_dtypes:
for op in self.reds:
self.with_linker(gof.CLinker(), op, dtype=dtype)
def test_c_nan(self): def test_c_nan(self):
raise SkipTest("no C code") for dtype in self.dtypes:
for op in self.reds:
self.with_linker(gof.CLinker(), op, dtype=dtype,
test_nan=True)
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论