提交 25dfa312 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6401 from mrTsjolder/master

Implement truncated normal with box-muller
...@@ -39,11 +39,11 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op): ...@@ -39,11 +39,11 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
return [gpuarray_helper_inc_dir()] return [gpuarray_helper_inc_dir()]
def make_node(self, pvals, unis): def make_node(self, pvals, unis):
assert unis.dtype == pvals.dtype
assert pvals.dtype in ['float32', 'float16', 'float64']
ctx_name = infer_context_name(pvals, unis) ctx_name = infer_context_name(pvals, unis)
pvals = as_gpuarray_variable(pvals, ctx_name) pvals = as_gpuarray_variable(pvals, ctx_name)
unis = as_gpuarray_variable(unis, ctx_name) unis = as_gpuarray_variable(unis, ctx_name)
assert pvals.dtype in ['float32', 'float16', 'float64']
assert unis.dtype in ['float32', 'float16', 'float64']
if pvals.ndim != 2: if pvals.ndim != 2:
raise NotImplementedError('pvals ndim should be 2', pvals.ndim) raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
...@@ -62,7 +62,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op): ...@@ -62,7 +62,8 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
out_ctype = pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype) out_ctype = pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype)
in_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[0].dtype) pvals_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[0].dtype)
unis_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[1].dtype)
work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype)) work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype))
write_out_ctype = write_w(node.outputs[0].dtype) write_out_ctype = write_w(node.outputs[0].dtype)
load_in_ctype = load_w(node.inputs[0].dtype) load_in_ctype = load_w(node.inputs[0].dtype)
...@@ -71,11 +72,11 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op): ...@@ -71,11 +72,11 @@ class GPUAMultinomialFromUniform(GpuKernelBase, Op):
KERNEL void k_multi_warp_multinomial( KERNEL void k_multi_warp_multinomial(
const ga_size nb_multi, const ga_size nb_multi,
const ga_size nb_outcomes, const ga_size nb_outcomes,
GLOBAL_MEM %(in_ctype)s *global_pvals, GLOBAL_MEM %(pvals_ctype)s *global_pvals,
const ga_size global_pvals_offset, const ga_size global_pvals_offset,
const ga_ssize pvals_row_stride, const ga_ssize pvals_row_stride,
const ga_ssize pvals_col_stride, const ga_ssize pvals_col_stride,
GLOBAL_MEM %(in_ctype)s *global_unis, GLOBAL_MEM %(unis_ctype)s *global_unis,
const ga_size global_unis_offset, const ga_size global_unis_offset,
const ga_ssize unis_stride, const ga_ssize unis_stride,
GLOBAL_MEM %(out_ctype)s *global_outs, GLOBAL_MEM %(out_ctype)s *global_outs,
...@@ -84,8 +85,8 @@ KERNEL void k_multi_warp_multinomial( ...@@ -84,8 +85,8 @@ KERNEL void k_multi_warp_multinomial(
const ga_ssize outs_col_stride const ga_ssize outs_col_stride
) )
{ {
global_pvals = (GLOBAL_MEM %(in_ctype)s *)(((GLOBAL_MEM char *)global_pvals) + global_pvals_offset); global_pvals = (GLOBAL_MEM %(pvals_ctype)s *)(((GLOBAL_MEM char *)global_pvals) + global_pvals_offset);
global_unis = (GLOBAL_MEM %(in_ctype)s *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset); global_unis = (GLOBAL_MEM %(unis_ctype)s *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset);
global_outs = (GLOBAL_MEM %(out_ctype)s *)(((GLOBAL_MEM char *)global_outs) + global_outs_offset); global_outs = (GLOBAL_MEM %(out_ctype)s *)(((GLOBAL_MEM char *)global_outs) + global_outs_offset);
// each thread takes care of one multinomial draw // each thread takes care of one multinomial draw
int n = LDIM_0*GID_0 + LID_0; int n = LDIM_0*GID_0 + LID_0;
...@@ -113,7 +114,8 @@ KERNEL void k_multi_warp_multinomial( ...@@ -113,7 +114,8 @@ KERNEL void k_multi_warp_multinomial(
} }
} }
""" % dict(out_ctype=out_ctype, write_out_ctype=write_out_ctype, """ % dict(out_ctype=out_ctype, write_out_ctype=write_out_ctype,
work_ctype=work_ctype, in_ctype=in_ctype, load_in_ctype=load_in_ctype) work_ctype=work_ctype, pvals_ctype=pvals_ctype,
unis_ctype=unis_ctype, load_in_ctype=load_in_ctype)
return [Kernel( return [Kernel(
code=code, name="k_multi_warp_multinomial", code=code, name="k_multi_warp_multinomial",
params=[pygpu.gpuarray.SIZE, params=[pygpu.gpuarray.SIZE,
...@@ -139,7 +141,8 @@ KERNEL void k_multi_warp_multinomial( ...@@ -139,7 +141,8 @@ KERNEL void k_multi_warp_multinomial(
ctx = sub['params'] ctx = sub['params']
kname = self.gpu_kernels(node, name)[0].objvar kname = self.gpu_kernels(node, name)[0].objvar
out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
in_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype) pvals_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
unis_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
s = """ s = """
PyGpuArrayObject * pvals = %(pvals)s; PyGpuArrayObject * pvals = %(pvals)s;
PyGpuArrayObject * unis = %(unis)s; PyGpuArrayObject * unis = %(unis)s;
...@@ -201,7 +204,15 @@ KERNEL void k_multi_warp_multinomial( ...@@ -201,7 +204,15 @@ KERNEL void k_multi_warp_multinomial(
assert(nb_blocks*nb_threads >= nb_multi); assert(nb_blocks*nb_threads >= nb_multi);
int err = k_multi_warp_multinomial_call(1, &nb_blocks, &nb_threads, 0, PyGpuArray_DIMS(out)[1], PyGpuArray_DIMS(out)[0], pvals->ga.data, pvals->ga.offset, PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(in_typecode)s), PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(in_typecode)s), unis->ga.data, unis->ga.offset, PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(in_typecode)s), out->ga.data, out->ga.offset, PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s), PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s)); int err = k_multi_warp_multinomial_call(
1, &nb_blocks, &nb_threads, 0,
PyGpuArray_DIMS(out)[1], PyGpuArray_DIMS(out)[0], pvals->ga.data, pvals->ga.offset,
PyGpuArray_STRIDES(pvals)[0]/gpuarray_get_elsize(%(pvals_typecode)s),
PyGpuArray_STRIDES(pvals)[1]/gpuarray_get_elsize(%(pvals_typecode)s),
unis->ga.data, unis->ga.offset,
PyGpuArray_STRIDES(unis)[0]/gpuarray_get_elsize(%(unis_typecode)s), out->ga.data,
out->ga.offset, PyGpuArray_STRIDES(out)[0]/gpuarray_get_elsize(%(out_typecode)s),
PyGpuArray_STRIDES(out)[1]/gpuarray_get_elsize(%(out_typecode)s));
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format( PyErr_Format(
...@@ -218,7 +229,7 @@ KERNEL void k_multi_warp_multinomial( ...@@ -218,7 +229,7 @@ KERNEL void k_multi_warp_multinomial(
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (7,)
class GPUAChoiceFromUniform(GpuKernelBase, Op): class GPUAChoiceFromUniform(GpuKernelBase, Op):
......
...@@ -271,7 +271,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -271,7 +271,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
if (n_streams > n_elements) if (n_streams > n_elements)
n_streams = n_elements; n_streams = n_elements;
{ if (n_elements > 0){
size_t ls = 0, gs = 0; size_t ls = 0, gs = 0;
int err = GpuKernel_sched(&%(kname)s, n_streams, &ls, &gs); int err = GpuKernel_sched(&%(kname)s, n_streams, &ls, &gs);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
...@@ -303,7 +303,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -303,7 +303,7 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
""" % dict(fail=sub['fail'])) """ % dict(fail=sub['fail']))
def c_code_cache_version(self): def c_code_cache_version(self):
return (16,) return (17,)
@register_opt2([mrg_uniform], 'fast_compile') @register_opt2([mrg_uniform], 'fast_compile')
......
差异被折叠。
...@@ -298,7 +298,8 @@ def test_broadcastable(): ...@@ -298,7 +298,8 @@ def test_broadcastable():
pvals_2 = R.uniform(size=size2) pvals_2 = R.uniform(size=size2)
pvals_2 = pvals_2 / tensor.sum(pvals_2) pvals_2 = pvals_2 / tensor.sum(pvals_2)
for distribution in [R.uniform, R.binomial, R.multinomial, R.multinomial_wo_replacement, R.normal]: for distribution in [R.uniform, R.normal, R.truncated_normal,
R.binomial, R.multinomial, R.multinomial_wo_replacement]:
# multinomial or multinomial_wo_replacement does not support "size" argument, # multinomial or multinomial_wo_replacement does not support "size" argument,
# the sizes of them are implicitly defined with "pvals" argument. # the sizes of them are implicitly defined with "pvals" argument.
if distribution in [R.multinomial, R.multinomial_wo_replacement]: if distribution in [R.multinomial, R.multinomial_wo_replacement]:
...@@ -378,7 +379,6 @@ def t_binomial(mean, size, const_size, var_input, input, steps, rtol): ...@@ -378,7 +379,6 @@ def t_binomial(mean, size, const_size, var_input, input, steps, rtol):
@attr('slow') @attr('slow')
def test_normal0(): def test_normal0():
steps = 50 steps = 50
std = 2. std = 2.
if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
...@@ -391,7 +391,7 @@ def test_normal0(): ...@@ -391,7 +391,7 @@ def test_normal0():
sample_size_odd = (sample_size[0], sample_size[1] - 1) sample_size_odd = (sample_size[0], sample_size[1] - 1)
x = tensor.matrix() x = tensor.matrix()
for size, const_size, var_input, input, avg, rtol, std_tol in [ test_cases = [
(sample_size, sample_size, [], [], -5., default_rtol, default_rtol), (sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
(x.shape, sample_size, [x], (x.shape, sample_size, [x],
[np.zeros(sample_size, dtype=config.floatX)], [np.zeros(sample_size, dtype=config.floatX)],
...@@ -409,8 +409,9 @@ def test_normal0(): ...@@ -409,8 +409,9 @@ def test_normal0():
# test with few samples at the same time # test with few samples at the same time
((1,), (1,), [], [], -5., default_rtol, 0.02), ((1,), (1,), [], [], -5., default_rtol, 0.02),
((3,), (3,), [], [], -5., default_rtol, 0.02), ((3,), (3,), [], [], -5., default_rtol, 0.02),
]: ]
for size, const_size, var_input, input, avg, rtol, std_tol in test_cases:
R = MRG_RandomStreams(234) R = MRG_RandomStreams(234)
# Note: we specify `nstreams` to avoid a warning. # Note: we specify `nstreams` to avoid a warning.
n = R.normal(size=size, avg=avg, std=std, n = R.normal(size=size, avg=avg, std=std,
...@@ -438,6 +439,126 @@ def test_normal0(): ...@@ -438,6 +439,126 @@ def test_normal0():
prefix='numpy ', allow_01=True, inputs=input, mean_rtol=rtol) prefix='numpy ', allow_01=True, inputs=input, mean_rtol=rtol)
@attr('slow')
def test_normal_truncation():
# just a copy of test_normal0 with extra bound check
steps = 50
std = 2.
# standard deviation is slightly less than for a regular Gaussian
# constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
target_std = .87962566103423978 * std
if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
config.mode == 'Mode' and config.linker in ['py']):
sample_size = (25, 30)
default_rtol = .02
else:
sample_size = (999, 50)
default_rtol = .01
sample_size_odd = (sample_size[0], sample_size[1] - 1)
x = tensor.matrix()
test_cases = [
(sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
(x.shape, sample_size, [x],
[np.zeros(sample_size, dtype=config.floatX)],
-5., default_rtol, default_rtol),
# test odd value
(x.shape, sample_size_odd, [x],
[np.zeros(sample_size_odd, dtype=config.floatX)],
-5., default_rtol, default_rtol),
(sample_size, sample_size, [], [],
np.arange(np.prod(sample_size),
dtype='float32').reshape(sample_size),
10. * std / np.sqrt(steps), default_rtol),
# test empty size (scalar)
((), (), [], [], -5., default_rtol, 0.02),
# test with few samples at the same time
((1,), (1,), [], [], -5., default_rtol, 0.02),
((3,), (3,), [], [], -5., default_rtol, 0.02),
]
for size, const_size, var_input, input, avg, rtol, std_tol in test_cases:
R = MRG_RandomStreams(234)
# Note: we specify `nstreams` to avoid a warning.
n = R.normal(size=size, avg=avg, std=std, truncate=True,
nstreams=rng_mrg.guess_n_streams(size, warn=False))
f = theano.function(var_input, n)
# check if truncated at 2*std
samples = f(*input)
assert np.all(avg + 2 * std - samples >= 0), \
("bad upper bound? %s %s" % (samples, avg + 2 * std))
assert np.all(samples - (avg - 2 * std) >= 0), \
("bad lower bound? %s %s" % (samples, avg - 2 * std))
# Increase the number of steps if size implies only a few samples
if np.prod(const_size) < 10:
steps_ = steps * 50
else:
steps_ = steps
basictest(f, steps_, const_size, target_avg=avg, target_std=target_std,
prefix='mrg ', allow_01=True, inputs=input,
mean_rtol=rtol, std_tol=std_tol)
sys.stdout.flush()
@attr('slow')
def test_truncated_normal():
# just a copy of test_normal0 for truncated normal
steps = 50
std = 2.
if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE'] or
config.mode == 'Mode' and config.linker in ['py']):
sample_size = (25, 30)
default_rtol = .02
else:
sample_size = (999, 50)
default_rtol = .01
sample_size_odd = (sample_size[0], sample_size[1] - 1)
x = tensor.matrix()
test_cases = [
(sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
(x.shape, sample_size, [x],
[np.zeros(sample_size, dtype=config.floatX)],
-5., default_rtol, default_rtol),
# test odd value
(x.shape, sample_size_odd, [x],
[np.zeros(sample_size_odd, dtype=config.floatX)],
-5., default_rtol, default_rtol),
(sample_size, sample_size, [], [],
np.arange(np.prod(sample_size),
dtype='float32').reshape(sample_size),
10. * std / np.sqrt(steps), default_rtol),
# test empty size (scalar)
((), (), [], [], -5., default_rtol, 0.02),
# test with few samples at the same time
((1,), (1,), [], [], -5., default_rtol, 0.02),
((3,), (3,), [], [], -5., default_rtol, 0.02),
]
for size, const_size, var_input, input, avg, rtol, std_tol in test_cases:
R = MRG_RandomStreams(234)
# Note: we specify `nstreams` to avoid a warning.
n = R.truncated_normal(size=size, avg=avg, std=std,
nstreams=rng_mrg.guess_n_streams(size, warn=False))
f = theano.function(var_input, n)
# Increase the number of steps if size implies only a few samples
if np.prod(const_size) < 10:
steps_ = steps * 60
else:
steps_ = steps
basictest(f, steps_, const_size, target_avg=avg, target_std=std,
prefix='mrg ', allow_01=True, inputs=input,
mean_rtol=rtol, std_tol=std_tol)
sys.stdout.flush()
def basic_multinomialtest(f, steps, sample_size, target_pvals, n_samples, def basic_multinomialtest(f, steps, sample_size, target_pvals, n_samples,
prefix="", mean_rtol=0.04): prefix="", mean_rtol=0.04):
...@@ -519,6 +640,7 @@ class T_MRG(unittest.TestCase): ...@@ -519,6 +640,7 @@ class T_MRG(unittest.TestCase):
self.assertRaises(ValueError, R.binomial, size) self.assertRaises(ValueError, R.binomial, size)
self.assertRaises(ValueError, R.multinomial, size, 1, []) self.assertRaises(ValueError, R.multinomial, size, 1, [])
self.assertRaises(ValueError, R.normal, size) self.assertRaises(ValueError, R.normal, size)
self.assertRaises(ValueError, R.truncated_normal, size)
def test_multiple_rng_aliasing(): def test_multiple_rng_aliasing():
...@@ -734,6 +856,19 @@ def test_undefined_grad(): ...@@ -734,6 +856,19 @@ def test_undefined_grad():
assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
(avg, std)) (avg, std))
# checking truncated normal distribution
avg = tensor.scalar()
out = srng.truncated_normal((), avg=avg)
assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, avg)
std = tensor.scalar()
out = srng.truncated_normal((), avg=0, std=std)
assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, std)
out = srng.truncated_normal((), avg=avg, std=std)
assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
(avg, std))
def test_f16_nonzero(mode=None, op_to_check=rng_mrg.mrg_uniform): def test_f16_nonzero(mode=None, op_to_check=rng_mrg.mrg_uniform):
srng = MRG_RandomStreams(seed=utt.fetch_seed()) srng = MRG_RandomStreams(seed=utt.fetch_seed())
...@@ -755,6 +890,8 @@ def test_target_parameter(): ...@@ -755,6 +890,8 @@ def test_target_parameter():
assert isinstance(f(), np.ndarray) assert isinstance(f(), np.ndarray)
basic_target_parameter_test(srng.uniform((3, 2), target='cpu')) basic_target_parameter_test(srng.uniform((3, 2), target='cpu'))
basic_target_parameter_test(srng.normal((3, 2), target='cpu'))
basic_target_parameter_test(srng.truncated_normal((3, 2), target='cpu'))
basic_target_parameter_test(srng.binomial((3, 2), target='cpu')) basic_target_parameter_test(srng.binomial((3, 2), target='cpu'))
basic_target_parameter_test(srng.multinomial(pvals=pvals.astype('float32'), target='cpu')) basic_target_parameter_test(srng.multinomial(pvals=pvals.astype('float32'), target='cpu'))
basic_target_parameter_test(srng.choice(p=pvals.astype('float32'), replace=False, target='cpu')) basic_target_parameter_test(srng.choice(p=pvals.astype('float32'), replace=False, target='cpu'))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论