提交 a8b3b329 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Rework the interface to GpuKernelBase to accept a list of kernel object.

Each item will be precompiled separately and embedded into the c_code of the Op. This allows ops that need multiple kernels, or that will choose between alternatives at runtime, to use this interface. It also groups all kernel-related parameters under one object. This change also saves the source of the kernel code so that source compilation can be re-attempted in case the binary is rejected for some reason (some implementations do not support reloading from a pre-compiled kernel). There may still be more changes to how stuff works under the hood (most notably a blacklist of bad runtimes/drivers that crash when attempting to create a kernel from a binary), but the visible interface should not change anymore, so now is the time to start using it more.
上级 4e350322
......@@ -20,7 +20,7 @@ except ImportError:
pass
from theano.sandbox.gpuarray.basic_ops import (as_gpuarray_variable, HideC,
GpuKernelBase)
GpuKernelBase, Kernel)
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.gof.utils import MethodNotDefined
......@@ -2373,40 +2373,29 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if any(redux):
return getattr(node, attr)
def c_kernel_code(self, node):
def gpu_kernels(self, node, name):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
# Some OpenCL compilers do not accept no-arguments kernels
return "KERNEL void reduk(GLOBAL_MEM float *a) {}"
src = "KERNEL void reduk(GLOBAL_MEM float *a) {}"
params = ['float32']
else:
k = self.get_kernel_cache(node)
_, src, _, _ = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
return src
def c_kernel_name(self):
return "reduk"
def c_kernel_params(self, node):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
return ["GA_FLOAT"]
else:
# Make sure this is synced with the call definition in
# pygpu/reduction.py
nd = node.inputs[0].ndim
res = ["GA_UINT", "GA_BUFFER"]
res.extend("GA_UINT" for _ in range(nd))
res.append("GA_BUFFER")
res.append("GA_UINT")
res.extend("GA_INT" for _ in range(nd))
return res
def c_kernel_flags(self, node):
params = ['uint32', gpuarray.GpuArray]
params.extend('uint32' for _ in range(nd))
params.append(gpuarray.GpuArray)
params.append('uint32')
params.extend('int32' for _ in range(nd))
acc_dtype = getattr(self, 'acc_dtype', None)
if acc_dtype is None:
acc_dtype = node.outputs[0].type.dtype
return self._get_kernel_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype)
return [Kernel(code=src, name="reduk", params=params,
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype),
objname='k_reduk_'+name)]
def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
......@@ -2555,14 +2544,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d)
GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync),
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
def c_code_cache_version(self):
return (0,)
return (0, self.GpuKernelBase_version)
def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add):
......
......@@ -26,7 +26,7 @@ if cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.sandbox.gpuarray.type import GpuArrayType
......@@ -772,9 +772,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return op(rstate, cast(v_size, 'int32'))
def c_headers(self):
return GpuKernelBase.c_headers(self) + ['numpy_compat.h']
return super(GPUA_mrg_uniform, self) + ['numpy_compat.h']
def c_kernel_code(self, node):
def gpu_kernels(self, node):
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
......@@ -783,7 +783,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
else:
otype = 'double'
NORM = '4.656612873077392578125e-10'
return """
code = """
KERNEL void mrg_uniform(
%(otype)s *sample_data,
ga_int *state_data,
......@@ -864,14 +864,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals()
def c_kernel_params(self, node):
return ["GA_BUFFER", "GA_BUFFER", "GA_UINT", "GA_UINT"]
def c_kernel_name(self):
return "mrg_uniform"
def c_kernel_flags(self, node):
return self._get_kernel_flags(self.output_type.dtype, 'int32')
return [Kernel(code=code, name="mrg_uniform",
params=[gpuarray.GpuArray, gpuarray.GpuArray,
'uint32', 'uint32'],
flags=Kernel.get_flags(self.output_type.dtype, 'int32'),
objname='k_mrg_uniform')]
def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
......@@ -880,7 +877,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
kname = self.c_kernel_obj(nodename)
kname = 'k_mrg_uniform'
if self.output_type.dtype == 'float32':
otype = 'float'
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论