Commit daf7fc95 authored by Frédéric Bastien

Merge pull request #1845 from abergeron/gpua_rng_opt

Gpua rng opt
......@@ -78,7 +78,7 @@ class Kernel(object):
         binvar = 'kbin_' + name
         self.binvar = binvar
         if objvar is None:
-            self.objvar = 'k_' + name
+            objvar = 'k_' + name
         self.objvar = objvar

     @staticmethod
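The hunk above fixes a real bug: when `objvar` was `None`, the old code assigned the generated name to `self.objvar`, and the unconditional `self.objvar = objvar` on the next line then overwrote it with the still-`None` local. The fix binds the local `objvar` instead, so the later assignment picks up the generated name. A minimal sketch of the fixed behavior, using a hypothetical stripped-down `Kernel` (not the full Theano class):

```python
class Kernel(object):
    def __init__(self, name, objvar=None):
        if objvar is None:
            objvar = 'k_' + name  # fix: bind the local so the next line works
        self.objvar = objvar      # old code reached here with objvar still None


k = Kernel('dot')
assert k.objvar == 'k_dot'  # the old code left self.objvar == None here
```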
......@@ -893,7 +893,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
         fail = sub['fail']
         typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
         sync = bool(config.gpuarray.sync)
-        kname = self.gpu_kernels()[0].objvar
+        kname = self.gpu_kernels(node, name)[0].objvar
         s = """
         size_t dims[2] = {0, 0};
         void *args[3];
......
......@@ -952,17 +952,25 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
         }
         }
-        if (PyGpuArray_NDIM(%(o_rstate)s) != 1)
+        if (PyGpuArray_NDIM(%(o_rstate)s) != 2)
         {
-            PyErr_SetString(PyExc_ValueError, "rstate must be vector");
-            %(fail)s;
+            PyErr_SetString(PyExc_ValueError, "rstate must be a matrix");
+            %(fail)s
         }
-        if (PyGpuArray_DIMS(%(o_rstate)s)[0] %% 6)
+        if (PyGpuArray_DIMS(%(o_rstate)s)[1] != 6)
         {
-            PyErr_Format(PyExc_ValueError, "rstate len must be multiple of 6");
-            %(fail)s;
+            PyErr_Format(PyExc_ValueError, "rstate must have 6 columns");
+            %(fail)s
         }
+        if (%(o_rstate)s->ga.typecode != GA_INT) {
+            PyErr_Format(PyExc_ValueError, "rstate must be int32");
+            %(fail)s
+        }
+        if (!GpuArray_CHKFLAGS(&%(o_rstate)s->ga, GA_C_CONTIGUOUS)) {
+            PyErr_Format(PyExc_ValueError, "rstate must be C contiguous");
+            %(fail)s
+        }
-        n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0]/6;
+        n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0];
         if (n_streams > n_elements)
             n_streams = n_elements;
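This hunk changes the `rstate` contract from a flat integer vector whose length is a multiple of 6 to a C-contiguous int32 matrix with one 6-word generator state per row, so `n_streams` is now read directly off axis 0. A hedged sketch of a buffer that satisfies the new checks (the seed words below are placeholder values, not a validated MRG31k3p state):

```python
import numpy

n_streams = 4
seed = numpy.asarray([12, 34, 56, 78, 90, 123], dtype='int32')  # placeholder words
rstate = numpy.tile(seed, (n_streams, 1))  # shape (n_streams, 6), one state per row

assert rstate.ndim == 2 and rstate.shape[1] == 6
assert rstate.dtype == numpy.int32
assert rstate.flags['C_CONTIGUOUS']
# n_streams is now PyGpuArray_DIMS(rstate)[0], not len(rstate) // 6.
```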
......@@ -984,7 +992,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % locals()
def c_code_cache_version(self):
return (2, self.GpuKernelBase_version)
return (3, self.GpuKernelBase_version)
def guess_n_streams(size, warn=True):
......@@ -1341,11 +1349,26 @@ class MRG_RandomStreams(object):
         assert final_samples.dtype == dtype
         return final_samples

+from theano.sandbox.gpuarray.opt import (register_opt as register_gpua,
+                                         host_from_gpu as host_from_gpua)
+
+
+@register_gpua()
 @local_optimizer([mrg_uniform])
+def local_gpua_mrg(node):
+    if (type(node.op) == mrg_uniform and
+            isinstance(node.inputs[0].type, GpuArrayType)):
+        outs = GPUA_mrg_uniform.new(node.inputs[0],
+                                    node.op.output_type.ndim,
+                                    node.op.output_type.dtype,
+                                    node.inputs[1])
+        return [outs[0], host_from_gpua(outs[1])]
+
+MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
+
+
+@local_optimizer(MRG_RNGs)
 def mrg_random_make_inplace(node):
     op = node.op
-    if isinstance(op, mrg_uniform) and not op.inplace:
+    if isinstance(op, MRG_RNGs) and not op.inplace:
+        # op might be gpu version
         new_op = op.__class__(op.output_type, inplace=True)
         return new_op.make_node(*node.inputs).outputs
......
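The new `local_gpua_mrg` optimizer lifts a host `mrg_uniform` whose state lives on the gpuarray backend into `GPUA_mrg_uniform`, transferring only the sample back to the host. `MRG_RNGs` is a tuple of op classes, which `isinstance` accepts directly, so the single in-place optimizer now covers the CPU op and both GPU variants. A hedged usage sketch, assuming a Theano build with the sandbox gpuarray backend configured (on a CPU-only build the graph simply keeps the host op):

```python
import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(seed=1234)
u = srng.uniform(size=(3, 3))  # builds an mrg_uniform apply node
f = theano.function([], u)     # optimization runs here: the GPU lift and the
                               # in-place rewrite above apply where they match
print(f())
```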
......@@ -325,7 +325,8 @@ def test_consistency_GPUA_serial():
     for i in range(n_streams):
         stream_rstate = curr_rstate.copy()
         for j in range(n_substreams):
-            substream_rstate = numpy.array(stream_rstate.copy(), dtype='int32')
+            substream_rstate = numpy.array([stream_rstate.copy()],
+                                           dtype='int32')
             # Transfer to device
             rstate = gpuarray_shared_constructor(substream_rstate)
......@@ -380,7 +381,7 @@ def test_consistency_GPUA_parallel():
     rstate = [curr_rstate.copy()]
     for j in range(1, n_substreams):
         rstate.append(rng_mrg.ff_2p72(rstate[-1]))
-    rstate = numpy.asarray(rstate).flatten()
+    rstate = numpy.asarray(rstate)
     rstate = gpuarray_shared_constructor(rstate)
     new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate, ndim=None,
......
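Both test changes track the new 2-D `rstate` contract: the serial test wraps each substream state in a list so it is shipped to the device as a `(1, 6)` int32 matrix, and the parallel test stops flattening the stacked states so they stay `(n_substreams, 6)`. A small self-contained sketch of the shapes involved (the state words are stand-ins):

```python
import numpy

stream_rstate = numpy.arange(6, dtype='int32')  # stand-in for real state words

# Serial test: one substream state per device buffer, now 2-D.
substream_rstate = numpy.array([stream_rstate.copy()], dtype='int32')
assert substream_rstate.shape == (1, 6)

# Parallel test: stacked substream states stay 2-D (no .flatten()).
rstate = numpy.asarray([stream_rstate] * 7)
assert rstate.shape == (7, 6)
```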