Commit 3c4ac6d0 authored by Arnaud Bergeron

Move the gpuarray ops and tests under the gpuarray directory.

Parent ae9139ac
"""
GPU implementation of MRG31k3p random number generator for Theano.
Generator code in SSJ package (L'Ecuyer & Simard).
http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
"""
from __future__ import absolute_import, print_function, division
import numpy
from theano.gof import local_optimizer
from theano.sandbox.rng_mrg import mrg_uniform_base, mrg_uniform
from theano.tensor import as_tensor_variable, get_vector_length
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name
from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua,
register_opt2,
host_from_gpu as host_from_gpua)
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
    """GpuArray (libgpuarray) implementation of the MRG31k3p uniform sampler.

    Each row of the 2-D int32 state input holds the six words of one
    MRG31k3p stream; the device kernel advances all streams in parallel,
    each stream striding over the flat output buffer.
    """
    # GpuArray version
    _f16_ok = True

    def get_params(self, node):
        # The op params is the gpuarray context of the state input;
        # the kernel is compiled and launched in that context.
        return node.inputs[0].type.context

    @classmethod
    def new(cls, rstate, ndim, dtype, size):
        """Build an op instance for `dtype`/`ndim` and apply it.

        If `ndim` is None it is inferred from the length of the `size`
        vector.  Returns the applied node's outputs:
        (new_rstate, sample).
        """
        v_size = as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
        op = cls(GpuArrayType(dtype, (False,) * ndim))
        return op(rstate, v_size)

    def c_headers(self):
        # numpy_compat.h provides the npy_* integer typedefs used in c_code.
        return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']

    def gpu_kernels(self, node, name):
        """Return the CLUDA kernel generating the uniform samples.

        The kernel source is specialized per output dtype: `otype` is the
        device element type, `NORM` rescales the 31-bit integer draw into
        [0, 1), and `mask` (float16 only) truncates the state so the
        scaled product still stays below 1.0 in half precision.
        """
        write = write_w(self.output_type.dtype)
        if self.output_type.dtype == 'float16':
            otype = 'ga_half'
            # limit the values of the state that we use.
            mask = '& 0x7fff'
            NORM = '3.0518e-05f'  # numpy.float16(1.0/(2**15+8))
            # this was determined by finding the biggest number such that
            # numpy.float16(number * (M1 & 0x7fff)) < 1.0
        elif self.output_type.dtype == 'float32':
            otype = 'float'
            mask = ''
            NORM = '4.6566126e-10f'  # numpy.float32(1.0/(2**31+65))
            # this was determined by finding the biggest number such that
            # numpy.float32(number * M1) < 1.0
        elif self.output_type.dtype == 'float64':
            otype = 'double'
            mask = ''
            NORM = '4.656612873077392578125e-10'
        else:
            raise ValueError('Unsupported data type for output',
                             self.output_type.dtype)
        code = """
        KERNEL void mrg_uniform(
            GLOBAL_MEM %(otype)s *sample_data,
            GLOBAL_MEM ga_int *state_data,
            const ga_uint Nsamples,
            const ga_uint Nstreams_used)
        {
            /*
             * The cluda backend makes sure that ga_int corresponds to
             * a 32 bit signed type on the target device. It is not a
             * variable width type.
             */
            const ga_int i7 = 7;
            const ga_int i9 = 9;
            const ga_int i15 = 15;
            const ga_int i16 = 16;
            const ga_int i22 = 22;
            const ga_int i24 = 24;
            const ga_int M1 = 2147483647; //2^31 - 1
            const ga_int M2 = 2147462579; //2^31 - 21069
            const ga_int MASK12 = 511; //2^9 - 1
            const ga_int MASK13 = 16777215; //2^24 - 1
            const ga_int MASK2 = 65535; //2^16 - 1
            const ga_int MULT2 = 21069;
            const ga_uint idx = GID_0 * LDIM_0 + LID_0;
            ga_int y1, y2, x11, x12, x13, x21, x22, x23;
            if (idx < Nstreams_used)
            {
                x11 = state_data[idx*6+0];
                x12 = state_data[idx*6+1];
                x13 = state_data[idx*6+2];
                x21 = state_data[idx*6+3];
                x22 = state_data[idx*6+4];
                x23 = state_data[idx*6+5];
                for (ga_uint i = idx; i < Nsamples; i += Nstreams_used)
                {
                    y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
                    y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
                    y1 += x13;
                    y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
                    x13 = x12;
                    x12 = x11;
                    x11 = y1;
                    y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
                    y1 -= (y1 < 0 || y1 >= M2) ? M2 : 0;
                    y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    y2 += x23;
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    y2 += y1;
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    x23 = x22;
                    x22 = x21;
                    x21 = y2;
                    if (x11 <= x21) {
                        sample_data[i] = %(write)s(((x11 - x21 + M1) %(mask)s) * %(NORM)s);
                    }
                    else
                    {
                        sample_data[i] = %(write)s(((x11 - x21) %(mask)s) * %(NORM)s);
                    }
                }
                state_data[idx*6+0]= x11;
                state_data[idx*6+1]= x12;
                state_data[idx*6+2]= x13;
                state_data[idx*6+3]= x21;
                state_data[idx*6+4]= x22;
                state_data[idx*6+5]= x23;
            }
        }
        """ % locals()
        # we shouldn't get to this line if it's about to fail
        from pygpu import gpuarray
        return [Kernel(code=code, name="mrg_uniform",
                       params=[gpuarray.GpuArray, gpuarray.GpuArray,
                               'uint32', 'uint32'],
                       flags=Kernel.get_flags(self.output_type.dtype, 'int32'))
                ]

    def c_code(self, node, nodename, inp, out, sub):
        """Emit host-side C: validate inputs, allocate/copy outputs,
        then schedule and launch the mrg_uniform kernel."""
        rstate, size = inp
        o_rstate, o_sample = out
        inplace = int(self.inplace)
        ndim = self.output_type.ndim
        # NOTE(review): o_type_num does not appear in the template below;
        # it looks unused — confirm before removing.
        o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
        fail = sub['fail']
        ctx = sub['params']
        kname = self.gpu_kernels(node, nodename)[0].objvar
        otypecode = str(self.output_type.typecode)
        return """
        npy_int64 M1 = 2147483647; //2^31 - 1
        // The +1 is to avoid odims[0] which fails on windows
        size_t odims[%(ndim)s+1];
        size_t n_elements = 1;
        unsigned int n_streams;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
                || !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
                || (PyGpuArray_NDIM(%(o_sample)s) != %(ndim)s));
        if (PyArray_NDIM(%(size)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "size must be vector");
            %(fail)s
        }
        if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
        {
            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
                %(ndim)s, PyArray_DIMS(%(size)s)[0]);
            %(fail)s
        }
        for (int i = 0; i < %(ndim)s; ++i)
        {
            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                    || PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
        }
        if (n_elements > M1)
        {
            PyErr_SetString(
                PyExc_ValueError,
                "rng_mrg gpu implementation does not support more than (2**31 -1) samples");
            %(fail)s
        }
        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
            %(o_sample)s = pygpu_empty(%(ndim)s, odims, %(otypecode)s, GA_C_ORDER,
                                       %(ctx)s, Py_None);
            if(!%(o_sample)s)
            {
                %(fail)s;
            }
        }
        if (!pygpu_GpuArray_Check((PyObject*)%(rstate)s))
        {
            PyErr_Format(PyExc_ValueError, "rstate must be gpuarray");
            %(fail)s;
        }
        Py_XDECREF(%(o_rstate)s);
        if (%(inplace)s)
        {
            Py_INCREF(%(rstate)s);
            %(o_rstate)s = %(rstate)s;
        }
        else
        {
            %(o_rstate)s = pygpu_copy(%(rstate)s, GA_ANY_ORDER);
            if (!%(o_rstate)s) {
                %(fail)s
            }
        }
        if (PyGpuArray_NDIM(%(o_rstate)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rstate must be a matrix");
            %(fail)s
        }
        if (PyGpuArray_DIMS(%(o_rstate)s)[1] != 6)
        {
            PyErr_Format(PyExc_ValueError, "rstate must have 6 columns");
            %(fail)s
        }
        if (%(o_rstate)s->ga.typecode != GA_INT) {
            PyErr_Format(PyExc_ValueError, "rstate must be int32");
            %(fail)s
        }
        if (!GpuArray_CHKFLAGS(&%(o_rstate)s->ga, GA_C_CONTIGUOUS)) {
            PyErr_Format(PyExc_ValueError, "rstate must be C contiguous");
            %(fail)s
        }
        n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0];
        if (n_streams > n_elements)
            n_streams = n_elements;
        {
            size_t ls = 0, gs = 0;
            int err = GpuKernel_sched(&%(kname)s, n_streams, &ls, &gs);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, "GpuKernel_sched: %%s\\n",
                             GpuKernel_error(&%(kname)s, err));
                %(fail)s
            }
            // Make sure we run as many blocks as we need to cover the whole n_streams
            gs = (n_streams + ls - 1)/ls;
            err = mrg_uniform_call(1, &ls, &gs, 0, %(o_sample)s->ga.data, %(o_rstate)s->ga.data, n_elements, n_streams);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, "mrg_uniform_call: %%s\\n",
                             GpuKernel_error(&%(kname)s, err));
                %(fail)s
            }
        }
        """ % locals()

    def c_code_cache_version(self):
        # Bump when the generated C/kernel code changes.
        return (12,)
@register_opt2([mrg_uniform], 'fast_compile')
def local_gpua_mrg_graph(op, context_name, inputs, outputs):
    """Replace a CPU mrg_uniform whose state lives on the GPU with the
    GpuArray implementation, transferring the sample back to the host."""
    if type(op) != mrg_uniform:
        return
    if not isinstance(inputs[0].type, GpuArrayType):
        return
    new_rstate, new_sample = GPUA_mrg_uniform.new(inputs[0],
                                                  op.output_type.ndim,
                                                  op.output_type.dtype,
                                                  inputs[1])
    return [new_rstate, host_from_gpua(new_sample)]
@register_gpua('fast_compile')
@local_optimizer([mrg_uniform])
def local_gpua_mrg(node):
    """Node-level wrapper around local_gpua_mrg_graph for the optimizer DB."""
    ctx = infer_context_name(*node.inputs)
    return local_gpua_mrg_graph(node.op, ctx, node.inputs, node.outputs)
from __future__ import absolute_import, print_function, division
from .config import test_ctx_name, mode_with_gpu from .config import test_ctx_name, mode_with_gpu
from ..type import (get_context, GpuArrayType, GpuArraySharedVariable, from ..type import (get_context, GpuArrayType, GpuArraySharedVariable,
...@@ -11,6 +12,7 @@ from theano.misc.pkl_utils import dump, load ...@@ -11,6 +12,7 @@ from theano.misc.pkl_utils import dump, load
from theano.tensor.tests.test_opt import test_fusion as t_fusion from theano.tensor.tests.test_opt import test_fusion as t_fusion
class test_fusion(t_fusion): class test_fusion(t_fusion):
mode = mode_with_gpu mode = mode_with_gpu
shared = gpuarray_shared_constructor shared = gpuarray_shared_constructor
......
from __future__ import absolute_import, print_function, division
import functools
import numpy
import theano
from theano import tensor
from theano.sandbox import rng_mrg
from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.sandbox.tests.test_rng_mrg import java_samples, rng_mrg_overflow
from theano.tests import unittest_tools as utt
from theano.gpuarray.tests.config import mode_with_gpu as mode
from theano.gpuarray.type import gpuarray_shared_constructor
# Seed the unittest RNG so the tests below are reproducible.
utt.seed_rng()
def test_consistency_GPUA_serial():
    """Check that GPUA_mrg_uniform, drawing one sample at a time, matches
    the reference (Java) implementation by L'Ecuyer et al."""
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    all_samples = []
    stream_state = numpy.array([seed] * 6, dtype='int32')
    for _stream in range(n_streams):
        substream_state = stream_state.copy()
        for _substream in range(n_substreams):
            # Transfer the single-stream state to the device.
            device_state = gpuarray_shared_constructor(
                numpy.array([substream_state.copy()], dtype='int32'))
            next_state, sample = rng_mrg.GPUA_mrg_uniform.new(device_state,
                                                              ndim=None,
                                                              dtype='float32',
                                                              size=(1,))
            device_state.default_update = next_state
            # Mimic rng_mrg.MRG_RandomStreams' bookkeeping (not strictly
            # necessary for this test).
            sample.rstate = device_state
            sample.update = (device_state, next_state)
            # Bring the sample back to host memory for comparison.
            host_sample = tensor.as_tensor_variable(sample)
            f = theano.function([], host_sample, mode=mode)
            all_samples.extend(f() for _ in range(n_samples))
            # Advance to the next substream.
            substream_state = rng_mrg.ff_2p72(substream_state)
        # Advance to the next stream.
        stream_state = rng_mrg.ff_2p134(stream_state)

    flat = numpy.array(all_samples).flatten()
    assert(numpy.allclose(flat, java_samples))
def test_consistency_GPUA_parallel():
    """Check that GPUA_mrg_uniform, drawing from several substreams in
    parallel, matches the reference (Java) implementation by
    L'Ecuyer et al."""
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel

    all_samples = []
    stream_state = numpy.array([seed] * 6, dtype='int32')
    for _stream in range(n_streams):
        # Build the n_substreams states for this stream.
        substates = [stream_state.copy()]
        while len(substates) < n_substreams:
            substates.append(rng_mrg.ff_2p72(substates[-1]))
        shared_state = gpuarray_shared_constructor(numpy.asarray(substates))
        next_state, sample = rng_mrg.GPUA_mrg_uniform.new(shared_state,
                                                          ndim=None,
                                                          dtype='float32',
                                                          size=(n_substreams,))
        shared_state.default_update = next_state
        # Mimic rng_mrg.MRG_RandomStreams' bookkeeping (not strictly
        # necessary for this test).
        sample.rstate = shared_state
        sample.update = (shared_state, next_state)
        # Bring the samples back to host memory for comparison.
        host_sample = tensor.as_tensor_variable(sample)
        f = theano.function([], host_sample, mode=mode)
        draws = [f() for _ in range(n_samples)]
        all_samples.append(numpy.array(draws).T.flatten())
        # Advance to the next stream.
        stream_state = rng_mrg.ff_2p134(stream_state)

    flat = numpy.array(all_samples).flatten()
    assert(numpy.allclose(flat, java_samples))
def test_GPUA_full_fill():
    """Make sure the whole sample buffer is filled, and that large samples
    are consistent with the CPU implementation.

    The redundant local imports of ``theano.gpuarray.tests.config`` and
    ``gpuarray_shared_constructor`` were removed: both are already brought
    in by this module's top-level imports.
    """
    # This needs to be large to trigger the problem on GPU
    size = (10, 1000)
    R = MRG_RandomStreams(234)
    uni = R.uniform(size, nstreams=60 * 256)
    f_cpu = theano.function([], uni)

    # Reuse the CPU stream state so both functions draw the same numbers.
    rstate_gpu = gpuarray_shared_constructor(R.state_updates[-1][0].get_value())
    new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate_gpu, ndim=None,
                                                      dtype='float32',
                                                      size=size)
    rstate_gpu.default_update = new_rstate
    f_gpu = theano.function([], sample)

    utt.assert_allclose(f_cpu(), f_gpu())
def test_overflow_gpu_new_backend():
    """Check that GPUA_mrg_uniform rejects sizes whose product overflows
    2**31 - 1 and accepts reasonable (including int32) sizes."""
    from theano.gpuarray.tests.test_basic_ops import \
        mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor
    seed = 12345
    n_substreams = 7
    base_state = numpy.array([seed] * 6, dtype='int32')
    states = [base_state.copy()]
    while len(states) < n_substreams:
        states.append(rng_mrg.ff_2p72(states[-1]))
    shared_states = gpuarray_shared_constructor(numpy.asarray(states))
    fct = functools.partial(rng_mrg.GPUA_mrg_uniform.new, shared_states,
                            ndim=None, dtype='float32')
    # Sizes whose element count overflows 2**31 - 1 must raise.
    overflowing = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
    rng_mrg_overflow(overflowing, fct, mode, should_raise_error=True)
    # Comfortably small sizes must not raise.
    small = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
    rng_mrg_overflow(small, fct, mode, should_raise_error=False)
    # int32 entries in size must be supported.
    int32_sizes = [(numpy.int32(2**10), ),
                   (numpy.int32(2), numpy.int32(2**10), numpy.int32(2**10))]
    rng_mrg_overflow(int32_sizes, fct, mode, should_raise_error=False)
def test_validate_input_types_gpuarray_backend():
    """With compute_test_value='raise', building the op validates its
    inputs eagerly; this must accept a gpuarray state."""
    from theano.sandbox.rng_mrg import mrg_uniform
    from theano.gpuarray.type import gpuarray_shared_constructor
    from theano.configparser import change_flags
    with change_flags(compute_test_value="raise"):
        shared_state = gpuarray_shared_constructor(
            numpy.zeros((7, 6), dtype="int32"))
        mrg_uniform.new(shared_state, ndim=None, dtype="float32", size=(3,))
...@@ -23,12 +23,6 @@ from theano.compile import optdb ...@@ -23,12 +23,6 @@ from theano.compile import optdb
from theano.gof import local_optimizer from theano.gof import local_optimizer
from . import multinomial from . import multinomial
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name, as_gpuarray_variable
from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua,
register_opt2)
def matVecModM(A, s, m): def matVecModM(A, s, m):
# TODO : need description for method, parameter and return # TODO : need description for method, parameter and return
...@@ -557,274 +551,6 @@ class mrg_uniform(mrg_uniform_base): ...@@ -557,274 +551,6 @@ class mrg_uniform(mrg_uniform_base):
return (8, ) return (8, )
class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
    """GpuArray (libgpuarray) implementation of the MRG31k3p uniform sampler.

    NOTE(review): this is the removed-side copy from theano/sandbox/rng_mrg.py
    in this diff; the op now lives under theano/gpuarray.
    """
    # GpuArray version
    _f16_ok = True

    def make_node(self, rstate, size):
        # error checking slightly redundant here, since
        # this op should not be called directly.
        #
        # call through MRG_RandomStreams instead.
        broad = []
        for i in range(self.output_type.ndim):
            broad.append(tensor.extract_constant(size[i]) == 1)
        output_type = self.output_type.clone(broadcastable=broad)()
        # Move the state to the appropriate gpuarray context.
        rstate = as_gpuarray_variable(rstate, infer_context_name(rstate))
        return Apply(self,
                     [rstate, size],
                     [rstate.type(), output_type])

    def get_params(self, node):
        # The op params is the gpuarray context of the state input.
        return node.inputs[0].type.context

    @classmethod
    def new(cls, rstate, ndim, dtype, size):
        """Build an op instance for `dtype`/`ndim` and apply it to
        (rstate, size); `ndim` defaults to the length of `size`."""
        v_size = as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
        op = cls(GpuArrayType(dtype, (False,) * ndim))
        return op(rstate, v_size)

    def c_headers(self):
        # numpy_compat.h provides the npy_* integer typedefs used in c_code.
        return super(GPUA_mrg_uniform, self).c_headers() + ['numpy_compat.h']

    def gpu_kernels(self, node, name):
        """Return the CLUDA kernel, specialized per output dtype
        (element type, normalization constant, optional float16 mask)."""
        write = write_w(self.output_type.dtype)
        if self.output_type.dtype == 'float16':
            otype = 'ga_half'
            # limit the values of the state that we use.
            mask = '& 0x7fff'
            NORM = '3.0518e-05f'  # np.float16(1.0/(2**15+8))
            # this was determined by finding the biggest number such that
            # np.float16(number * (M1 & 0x7fff)) < 1.0
        elif self.output_type.dtype == 'float32':
            otype = 'float'
            mask = ''
            NORM = '4.6566126e-10f'  # np.float32(1.0/(2**31+65))
            # this was determined by finding the biggest number such that
            # np.float32(number * M1) < 1.0
        elif self.output_type.dtype == 'float64':
            otype = 'double'
            mask = ''
            NORM = '4.656612873077392578125e-10'
        else:
            raise ValueError('Unsupported data type for output',
                             self.output_type.dtype)
        code = """
        KERNEL void mrg_uniform(
            GLOBAL_MEM %(otype)s *sample_data,
            GLOBAL_MEM ga_int *state_data,
            const ga_uint Nsamples,
            const ga_uint Nstreams_used)
        {
            /*
             * The cluda backend makes sure that ga_int corresponds to
             * a 32 bit signed type on the target device. It is not a
             * variable width type.
             */
            const ga_int i7 = 7;
            const ga_int i9 = 9;
            const ga_int i15 = 15;
            const ga_int i16 = 16;
            const ga_int i22 = 22;
            const ga_int i24 = 24;
            const ga_int M1 = 2147483647; //2^31 - 1
            const ga_int M2 = 2147462579; //2^31 - 21069
            const ga_int MASK12 = 511; //2^9 - 1
            const ga_int MASK13 = 16777215; //2^24 - 1
            const ga_int MASK2 = 65535; //2^16 - 1
            const ga_int MULT2 = 21069;
            const ga_uint idx = GID_0 * LDIM_0 + LID_0;
            ga_int y1, y2, x11, x12, x13, x21, x22, x23;
            if (idx < Nstreams_used)
            {
                x11 = state_data[idx*6+0];
                x12 = state_data[idx*6+1];
                x13 = state_data[idx*6+2];
                x21 = state_data[idx*6+3];
                x22 = state_data[idx*6+4];
                x23 = state_data[idx*6+5];
                for (ga_uint i = idx; i < Nsamples; i += Nstreams_used)
                {
                    y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
                    y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
                    y1 += x13;
                    y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0;
                    x13 = x12;
                    x12 = x11;
                    x11 = y1;
                    y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
                    y1 -= (y1 < 0 || y1 >= M2) ? M2 : 0;
                    y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    y2 += x23;
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    y2 += y1;
                    y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0;
                    x23 = x22;
                    x22 = x21;
                    x21 = y2;
                    if (x11 <= x21) {
                        sample_data[i] = %(write)s(((x11 - x21 + M1) %(mask)s) * %(NORM)s);
                    }
                    else
                    {
                        sample_data[i] = %(write)s(((x11 - x21) %(mask)s) * %(NORM)s);
                    }
                }
                state_data[idx*6+0]= x11;
                state_data[idx*6+1]= x12;
                state_data[idx*6+2]= x13;
                state_data[idx*6+3]= x21;
                state_data[idx*6+4]= x22;
                state_data[idx*6+5]= x23;
            }
        }
        """ % locals()
        # we shouldn't get to this line if it's about to fail
        from pygpu import gpuarray
        return [Kernel(code=code, name="mrg_uniform",
                       params=[gpuarray.GpuArray, gpuarray.GpuArray,
                               'uint32', 'uint32'],
                       flags=Kernel.get_flags(self.output_type.dtype, 'int32'))
                ]

    def c_code(self, node, nodename, inp, out, sub):
        """Emit host-side C: validate inputs, allocate/copy outputs,
        then schedule and launch the mrg_uniform kernel."""
        rstate, size = inp
        o_rstate, o_sample = out
        inplace = int(self.inplace)
        ndim = self.output_type.ndim
        # NOTE(review): o_type_num does not appear in the template below;
        # it looks unused — confirm before removing.
        o_type_num = np.asarray(0, dtype=self.output_type.dtype).dtype.num
        fail = sub['fail']
        ctx = sub['params']
        kname = self.gpu_kernels(node, nodename)[0].objvar
        otypecode = str(self.output_type.typecode)
        return """
        npy_int64 M1 = 2147483647; //2^31 - 1
        // The +1 is to avoid odims[0] which fails on windows
        size_t odims[%(ndim)s+1];
        size_t n_elements = 1;
        unsigned int n_streams;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                || !pygpu_GpuArray_Check((PyObject*)%(o_sample)s)
                || !(%(o_sample)s->ga.flags & GA_C_CONTIGUOUS)
                || (PyGpuArray_NDIM(%(o_sample)s) != %(ndim)s));
        if (PyArray_NDIM(%(size)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "size must be vector");
            %(fail)s
        }
        if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
        {
            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%li)",
                %(ndim)s, PyArray_DIMS(%(size)s)[0]);
            %(fail)s
        }
        for (int i = 0; i < %(ndim)s; ++i)
        {
            odims[i] = *(dtype_%(size)s *)PyArray_GETPTR1(%(size)s, i);
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                    || PyGpuArray_DIMS(%(o_sample)s)[i] != odims[i]);
        }
        if (n_elements > M1)
        {
            PyErr_SetString(
                PyExc_ValueError,
                "rng_mrg gpu implementation does not support more than (2**31 -1) samples");
            %(fail)s
        }
        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
            %(o_sample)s = pygpu_empty(%(ndim)s, odims, %(otypecode)s, GA_C_ORDER,
                                       %(ctx)s, Py_None);
            if(!%(o_sample)s)
            {
                %(fail)s;
            }
        }
        if (!pygpu_GpuArray_Check((PyObject*)%(rstate)s))
        {
            PyErr_Format(PyExc_ValueError, "rstate must be gpuarray");
            %(fail)s;
        }
        Py_XDECREF(%(o_rstate)s);
        if (%(inplace)s)
        {
            Py_INCREF(%(rstate)s);
            %(o_rstate)s = %(rstate)s;
        }
        else
        {
            %(o_rstate)s = pygpu_copy(%(rstate)s, GA_ANY_ORDER);
            if (!%(o_rstate)s) {
                %(fail)s
            }
        }
        if (PyGpuArray_NDIM(%(o_rstate)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rstate must be a matrix");
            %(fail)s
        }
        if (PyGpuArray_DIMS(%(o_rstate)s)[1] != 6)
        {
            PyErr_Format(PyExc_ValueError, "rstate must have 6 columns");
            %(fail)s
        }
        if (%(o_rstate)s->ga.typecode != GA_INT) {
            PyErr_Format(PyExc_ValueError, "rstate must be int32");
            %(fail)s
        }
        if (!GpuArray_CHKFLAGS(&%(o_rstate)s->ga, GA_C_CONTIGUOUS)) {
            PyErr_Format(PyExc_ValueError, "rstate must be C contiguous");
            %(fail)s
        }
        n_streams = PyGpuArray_DIMS(%(o_rstate)s)[0];
        if (n_streams > n_elements)
            n_streams = n_elements;
        {
            size_t ls = 0, gs = 0;
            int err = GpuKernel_sched(&%(kname)s, n_streams, &ls, &gs);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, "GpuKernel_sched: %%s\\n",
                             GpuKernel_error(&%(kname)s, err));
                %(fail)s
            }
            // Make sure we run as many blocks as we need to cover the whole n_streams
            gs = (n_streams + ls - 1)/ls;
            err = mrg_uniform_call(1, &ls, &gs, 0, %(o_sample)s->ga.data, %(o_rstate)s->ga.data, n_elements, n_streams);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, "mrg_uniform_call: %%s\\n",
                             GpuKernel_error(&%(kname)s, err));
                %(fail)s
            }
        }
        """ % locals()

    def c_code_cache_version(self):
        # Bump when the generated C/kernel code changes.
        return (12,)
def guess_n_streams(size, warn=False): def guess_n_streams(size, warn=False):
# TODO : need description for parameter 'size' # TODO : need description for parameter 'size'
""" """
...@@ -1317,36 +1043,16 @@ class MRG_RandomStreams(object): ...@@ -1317,36 +1043,16 @@ class MRG_RandomStreams(object):
return final_samples return final_samples
@register_opt2([mrg_uniform], 'fast_compile') @local_optimizer((mrg_uniform_base,))
def local_gpua_mrg_graph(op, context_name, inputs, outputs):
    # Removed-side copy: replace a CPU mrg_uniform whose state is on the
    # GPU with GPUA_mrg_uniform, transferring the sample back to the host.
    # (Its decorator line is mangled in this diff view.)
    if (type(op) == mrg_uniform and
            isinstance(inputs[0].type, GpuArrayType)):
        outs = GPUA_mrg_uniform.new(inputs[0],
                                    op.output_type.ndim,
                                    op.output_type.dtype,
                                    inputs[1])
        return [outs[0], outs[1].transfer('cpu')]
@register_gpua('fast_compile')
@local_optimizer([mrg_uniform])
def local_gpua_mrg(node):
    # Node-level wrapper around local_gpua_mrg_graph for the optimizer DB.
    context_name = infer_context_name(*node.inputs)
    return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs)
MRG_RNGs = (mrg_uniform, GPUA_mrg_uniform)
@local_optimizer(MRG_RNGs)
def mrg_random_make_inplace(node): def mrg_random_make_inplace(node):
op = node.op op = node.op
if isinstance(op, MRG_RNGs) and not op.inplace: if isinstance(op, mrg_uniform_base) and not op.inplace:
# op might be gpu version # op might be gpu version
new_op = op.__class__(op.output_type, inplace=True) new_op = op.__class__(op.output_type, inplace=True)
return new_op.make_node(*node.inputs).outputs return new_op.make_node(*node.inputs).outputs
return False return False
optdb.register('random_make_inplace_mrg', optdb.register('random_make_inplace_mrg',
opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), opt.in2out(mrg_random_make_inplace, ignore_newtrees=True),
99, 'fast_run', 'inplace') 99, 'fast_run', 'inplace')
...@@ -17,7 +17,6 @@ from theano.sandbox import rng_mrg ...@@ -17,7 +17,6 @@ from theano.sandbox import rng_mrg
from theano.sandbox.rng_mrg import MRG_RandomStreams from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import attr from theano.tests.unittest_tools import attr
import theano.gpuarray.tests.config
# TODO: test MRG_RandomStreams # TODO: test MRG_RandomStreams
# Partly done in test_consistency_randomstreams # Partly done in test_consistency_randomstreams
...@@ -186,129 +185,6 @@ def test_consistency_cpu_parallel(): ...@@ -186,129 +185,6 @@ def test_consistency_cpu_parallel():
assert(np.allclose(samples, java_samples)) assert(np.allclose(samples, java_samples))
def test_consistency_GPUA_serial():
    # Verify that the random numbers generated by GPUA_mrg_uniform, serially,
    # are the same as the reference (Java) implementation by L'Ecuyer et al.
    # (Removed-side copy: this test moved under theano/gpuarray/tests.)
    from theano.gpuarray.tests.config import mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    samples = []
    curr_rstate = np.array([seed] * 6, dtype='int32')

    for i in range(n_streams):
        stream_rstate = curr_rstate.copy()
        for j in range(n_substreams):
            substream_rstate = np.array([stream_rstate.copy()],
                                        dtype='int32')
            # Transfer to device
            rstate = gpuarray_shared_constructor(substream_rstate)

            new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate,
                                                              ndim=None,
                                                              dtype='float32',
                                                              size=(1,))
            rstate.default_update = new_rstate

            # Not really necessary, just mimicking
            # rng_mrg.MRG_RandomStreams' behavior
            sample.rstate = rstate
            sample.update = (rstate, new_rstate)

            # We need the sample back in the main memory
            cpu_sample = tensor.as_tensor_variable(sample)
            f = theano.function([], cpu_sample, mode=mode)
            for k in range(n_samples):
                s = f()
                samples.append(s)

            # next substream
            stream_rstate = rng_mrg.ff_2p72(stream_rstate)

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    assert(np.allclose(samples, java_samples))
def test_consistency_GPUA_parallel():
    # Verify that the random numbers generated by GPUA_mrg_uniform, in
    # parallel, are the same as the reference (Java) implementation by
    # L'Ecuyer et al.
    # (Removed-side copy: this test moved under theano/gpuarray/tests.)
    from theano.gpuarray.tests.config import mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel

    samples = []
    curr_rstate = np.array([seed] * 6, dtype='int32')

    for i in range(n_streams):
        stream_samples = []
        rstate = [curr_rstate.copy()]
        for j in range(1, n_substreams):
            rstate.append(rng_mrg.ff_2p72(rstate[-1]))
        rstate = np.asarray(rstate)
        rstate = gpuarray_shared_constructor(rstate)

        new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate, ndim=None,
                                                          dtype='float32',
                                                          size=(n_substreams,))
        rstate.default_update = new_rstate

        # Not really necessary, just mimicking
        # rng_mrg.MRG_RandomStreams' behavior
        sample.rstate = rstate
        sample.update = (rstate, new_rstate)

        # We need the sample back in the main memory
        cpu_sample = tensor.as_tensor_variable(sample)
        f = theano.function([], cpu_sample, mode=mode)

        for k in range(n_samples):
            s = f()
            stream_samples.append(s)

        samples.append(np.array(stream_samples).T.flatten())

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = np.array(samples).flatten()
    assert(np.allclose(samples, java_samples))
def test_GPUA_full_fill():
    # Make sure the whole sample buffer is filled. Also make sure
    # large samples are consistent with CPU results.
    # (Removed-side copy: this test moved under theano/gpuarray/tests.)
    import theano.gpuarray.tests.config
    from theano.gpuarray.type import gpuarray_shared_constructor

    # This needs to be large to trigger the problem on GPU
    size = (10, 1000)
    R = MRG_RandomStreams(234)
    uni = R.uniform(size, nstreams=60 * 256)
    f_cpu = theano.function([], uni)

    # Reuse the CPU stream state so both functions draw the same numbers.
    rstate_gpu = gpuarray_shared_constructor(R.state_updates[-1][0].get_value())
    new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate_gpu, ndim=None,
                                                      dtype='float32',
                                                      size=size)
    rstate_gpu.default_update = new_rstate
    f_gpu = theano.function([], sample)

    utt.assert_allclose(f_cpu(), f_gpu())
def basictest(f, steps, sample_size, prefix="", allow_01=False, inputs=None, def basictest(f, steps, sample_size, prefix="", allow_01=False, inputs=None,
target_avg=0.5, target_std=None, mean_rtol=0.01, std_tol=0.01): target_avg=0.5, target_std=None, mean_rtol=0.01, std_tol=0.01):
if inputs is None: if inputs is None:
...@@ -842,42 +718,6 @@ def test_overflow_cpu(): ...@@ -842,42 +718,6 @@ def test_overflow_cpu():
rng_mrg_overflow(sizes, fct, config.mode, should_raise_error=False) rng_mrg_overflow(sizes, fct, config.mode, should_raise_error=False)
def test_overflow_gpu_new_backend():
    # Check that GPUA_mrg_uniform rejects sizes whose product overflows
    # 2**31 - 1 and accepts reasonable (including int32) sizes.
    # (Removed-side copy: this test moved under theano/gpuarray/tests.)
    from theano.gpuarray.tests.test_basic_ops import \
        mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor
    seed = 12345
    n_substreams = 7
    curr_rstate = np.array([seed] * 6, dtype='int32')
    rstate = [curr_rstate.copy()]
    for j in range(1, n_substreams):
        rstate.append(rng_mrg.ff_2p72(rstate[-1]))
    rstate = np.asarray(rstate)
    rstate = gpuarray_shared_constructor(rstate)
    fct = functools.partial(rng_mrg.GPUA_mrg_uniform.new, rstate,
                            ndim=None, dtype='float32')
    # should raise error as the size overflows
    sizes = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=True)
    # should not raise error
    sizes = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
    # should support int32 sizes
    sizes = [(np.int32(2**10), ),
             (np.int32(2), np.int32(2**10), np.int32(2**10))]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
def test_validate_input_types_gpuarray_backend():
    # With compute_test_value="raise", building the op validates its
    # inputs eagerly; this must accept a gpuarray state.
    # (Removed-side copy: this test moved under theano/gpuarray/tests.)
    from theano.sandbox.rng_mrg import mrg_uniform
    from theano.gpuarray.type import gpuarray_shared_constructor
    from theano.configparser import change_flags
    with change_flags(compute_test_value="raise"):
        rstate = np.zeros((7, 6), dtype="int32")
        rstate = gpuarray_shared_constructor(rstate)
        mrg_uniform.new(rstate, ndim=None, dtype="float32", size=(3,))
if __name__ == "__main__": if __name__ == "__main__":
rng = MRG_RandomStreams(np.random.randint(2147462579)) rng = MRG_RandomStreams(np.random.randint(2147462579))
print(theano.__file__) print(theano.__file__)
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment