提交 d5181aee 作者: Arnaud Bergeron

Fix the last remaining problems with the existing clients and make the kernel code for MRG more OpenCL-friendly.

Fix the last remaining problems with the existing clients and make the kernel code for MRG more OpenCL-friendly.
上级 ec5283b2
......@@ -12,6 +12,7 @@ from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try:
import pygpu
from pygpu import gpuarray
from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel
from pygpu.reduction import ReductionKernel
......@@ -2414,7 +2415,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
node.inputs[0].ndim)
if self.axis is None:
redux = [True] * node.inputs[0].ndim
else:
......
......@@ -772,9 +772,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return op(rstate, cast(v_size, 'int32'))
def c_headers(self):
return super(GPUA_mrg_uniform, self) + ['numpy_compat.h']
return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']
def gpu_kernels(self, node):
def gpu_kernels(self, node, name):
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
......@@ -785,8 +785,8 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
NORM = '4.656612873077392578125e-10'
code = """
KERNEL void mrg_uniform(
%(otype)s *sample_data,
ga_int *state_data,
GLOBAL_MEM %(otype)s *sample_data,
GLOBAL_MEM ga_int *state_data,
const ga_uint Nsamples,
const ga_uint Nstreams_used)
{
......@@ -809,7 +809,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
const ga_int MASK2 = 65535; //2^16 - 1
const ga_int MULT2 = 21069;
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
const ga_uint idx = GID_0 * LDIM_0 + LID_0;
ga_int y1, y2, x11, x12, x13, x21, x22, x23;
if (idx < Nstreams_used)
......@@ -821,7 +821,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
x22 = state_data[idx*6+4];
x23 = state_data[idx*6+5];
for (int i = idx; i < Nsamples; i += Nstreams_used)
for (ga_uint i = idx; i < Nsamples; i += Nstreams_used)
{
y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
......@@ -864,6 +864,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals()
# we shouldn't get to this line if it's about to fail
from pygpu import gpuarray
return [Kernel(code=code, name="mrg_uniform",
params=[gpuarray.GpuArray, gpuarray.GpuArray,
'uint32', 'uint32'],
......@@ -877,7 +880,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
ndim = self.output_type.ndim
o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
fail = sub['fail']
kname = self.gpu_kernels()[0].objvar
kname = self.gpu_kernels(node, nodename)[0].objvar
if self.output_type.dtype == 'float32':
otype = 'float'
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论