提交 931dfec1 authored 作者: carriepl's avatar carriepl

Merge pull request #3795 from aalmah/rand_weighted_select

Sample multinomial with replacement
......@@ -6,6 +6,7 @@ import theano.tensor as T
from theano.gof import local_optimizer
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano.scalar import as_scalar
import copy
from theano.sandbox.cuda import cuda_available, GpuOp
if cuda_available:
......@@ -193,6 +194,72 @@ class MultinomialFromUniform(Op):
break
class MultinomialWOReplacementFromUniform(MultinomialFromUniform):
"""
Converts samples from a uniform into sample (without replacement) from a multinomial.
"""
def make_node(self, pvals, unis, n=1):
pvals = T.as_tensor_variable(pvals)
unis = T.as_tensor_variable(unis)
if pvals.ndim != 2:
raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
if unis.ndim != 1:
raise NotImplementedError('unis ndim should be 1', unis.ndim)
if self.odtype == 'auto':
odtype = 'int64'
else:
odtype = self.odtype
out = T.tensor(dtype=odtype, broadcastable=pvals.type.broadcastable)
return Apply(self, [pvals, unis, as_scalar(n)], [out])
def perform(self, node, ins, outs):
(pvals, unis, n_samples) = ins
# make a copy so we do not overwrite the input
pvals = copy.copy(pvals)
(z,) = outs
if n_samples > pvals.shape[1]:
raise ValueError("Cannot sample without replacement n samples bigger "
"than the size of the distribution.")
if unis.shape[0] != pvals.shape[0] * n_samples:
raise ValueError("unis.shape[0] != pvals.shape[0] * n_samples",
unis.shape[0], pvals.shape[0], n_samples)
if self.odtype == 'auto':
odtype = 'int64'
else:
odtype = self.odtype
if z[0] is None or not numpy.all(z[0].shape == [pvals.shape[0], n_samples]):
z[0] = -1 * numpy.ones((pvals.shape[0], n_samples), dtype=odtype)
nb_multi = pvals.shape[0]
nb_outcomes = pvals.shape[1]
# For each multinomial, loop over each possible outcome,
# and set selected pval to 0 after being selected
for c in range(n_samples):
for n in range(nb_multi):
cummul = 0
unis_n = unis[c * nb_multi + n]
for m in range(nb_outcomes):
cummul += pvals[n, m]
if (cummul > unis_n):
z[0][n, c] = m
# set to zero and re-normalize so that it's not selected again
pvals[n, m] = 0.
pvals[n] /= pvals[n].sum()
break
def c_code_cache_version(self):
return None
def c_code(self, node, name, ins, outs, sub):
raise NotImplementedError('no C implementation yet!')
class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
"""
The output is transposed compared to MultinomialFromUniform.
......
......@@ -1363,6 +1363,54 @@ class MRG_RandomStreams(object):
raise NotImplementedError(("MRG_RandomStreams.multinomial only"
" implemented for pvals.ndim = 2"))
def multinomial_wo_replacement(self, size=None, n=1, pvals=None, ndim=None, dtype='int64',
nstreams=None):
"""
Sample `n` times *WITHOUT replacement* from a multinomial distribution
defined by probabilities pvals, and returns the indices of the sampled
elements.
`n` needs to be in [1, m], where m is the number of elements to select
from, i.e. m == pvals.shape[1]. By default n = 1.
Example : pvals = [[.98, .01, .01], [.01, .49, .50]] and n=1 will
probably result in [[0],[2]]. When setting n=2, this
will probably result in [[0,1],[2,1]].
Notes
-----
-`size` and `ndim` are only there keep the same signature as other
uniform, binomial, normal, etc.
TODO : adapt multinomial to take that into account
-Does not do any value checking on pvals, i.e. there is no
check that the elements are non-negative, less than 1, or
sum to 1. passing pvals = [[-2., 2.]] will result in
sampling [[0, 0]]
"""
if pvals is None:
raise TypeError("You have to specify pvals")
pvals = as_tensor_variable(pvals)
if size is not None:
raise ValueError("Provided a size argument to "
"MRG_RandomStreams.multinomial_wo_replacement, which does not use "
"the size argument.")
if ndim is not None:
raise ValueError("Provided an ndim argument to "
"MRG_RandomStreams.multinomial_wo_replacement, which does not use "
"the ndim argument.")
if pvals.ndim == 2:
# size = [pvals.shape[0], as_tensor_variable(n)]
size = pvals[:,0].shape * n
unis = self.uniform(size=size, ndim=1, nstreams=nstreams)
op = multinomial.MultinomialWOReplacementFromUniform(dtype)
n_samples = as_tensor_variable(n)
return op(pvals, unis, n_samples)
else:
raise NotImplementedError(("MRG_RandomStreams.multinomial_wo_replacement only"
" implemented for pvals.ndim = 2"))
def normal(self, size, avg=0.0, std=1.0, ndim=None,
dtype=None, nstreams=None):
"""
......
import numpy
from theano import config, function, tensor
from theano.sandbox import multinomial
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import unittest
class test_OP(unittest.TestCase):
def test_select_distinct(self):
"""
Tests that MultinomialWOReplacementFromUniform always selects distinct elements
"""
p = tensor.fmatrix()
u = tensor.fvector()
n = tensor.iscalar()
m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
f = function([p, u, n], m, allow_input_downcast=True)
n_elements = 1000
all_indices = range(n_elements)
numpy.random.seed(12345)
for i in [5, 10, 50, 100, 500, n_elements]:
uni = numpy.random.rand(i).astype(config.floatX)
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
res = f(pvals, uni, i)
res = numpy.squeeze(res)
assert len(res) == i
assert numpy.all(numpy.in1d(numpy.unique(res), all_indices)), res
def test_fail_select_alot(self):
"""
Tests that MultinomialWOReplacementFromUniform fails when asked to sample more
elements than the actual number of elements
"""
p = tensor.fmatrix()
u = tensor.fvector()
n = tensor.iscalar()
m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
f = function([p, u, n], m, allow_input_downcast=True)
n_elements = 100
n_selected = 200
numpy.random.seed(12345)
uni = numpy.random.rand(n_selected).astype(config.floatX)
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
self.assertRaises(ValueError, f, pvals, uni, n_selected)
def test_select_proportional_to_weight(self):
"""
Tests that MultinomialWOReplacementFromUniform selects elements, on average,
proportional to the their probabilities
"""
p = tensor.fmatrix()
u = tensor.fvector()
n = tensor.iscalar()
m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
f = function([p, u, n], m, allow_input_downcast=True)
n_elements = 100
n_selected = 10
mean_rtol = 0.0005
numpy.random.seed(12345)
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
avg_pvals = numpy.zeros((n_elements,), dtype=config.floatX)
for rep in range(10000):
uni = numpy.random.rand(n_selected).astype(config.floatX)
res = f(pvals, uni, n_selected)
res = numpy.squeeze(res)
avg_pvals[res] += 1
avg_pvals /= avg_pvals.sum()
avg_diff = numpy.mean(abs(avg_pvals - pvals))
assert avg_diff < mean_rtol, avg_diff
class test_function(unittest.TestCase):
def test_select_distinct(self):
"""
Tests that multinomial_wo_replacement always selects distinct elements
"""
th_rng = RandomStreams(12345)
p = tensor.fmatrix()
n = tensor.iscalar()
m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
f = function([p, n], m, allow_input_downcast=True)
n_elements = 1000
all_indices = range(n_elements)
numpy.random.seed(12345)
for i in [5, 10, 50, 100, 500, n_elements]:
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
res = f(pvals, i)
res = numpy.squeeze(res)
assert len(res) == i
assert numpy.all(numpy.in1d(numpy.unique(res), all_indices)), res
def test_fail_select_alot(self):
"""
Tests that multinomial_wo_replacement fails when asked to sample more
elements than the actual number of elements
"""
th_rng = RandomStreams(12345)
p = tensor.fmatrix()
n = tensor.iscalar()
m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
f = function([p, n], m, allow_input_downcast=True)
n_elements = 100
n_selected = 200
numpy.random.seed(12345)
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
self.assertRaises(ValueError, f, pvals, n_selected)
def test_select_proportional_to_weight(self):
"""
Tests that multinomial_wo_replacement selects elements, on average,
proportional to the their probabilities
"""
th_rng = RandomStreams(12345)
p = tensor.fmatrix()
n = tensor.iscalar()
m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
f = function([p, n], m, allow_input_downcast=True)
n_elements = 100
n_selected = 10
mean_rtol = 0.0005
numpy.random.seed(12345)
pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
pvals /= pvals.sum(1)
avg_pvals = numpy.zeros((n_elements,), dtype=config.floatX)
for rep in range(10000):
res = f(pvals, n_selected)
res = numpy.squeeze(res)
avg_pvals[res] += 1
avg_pvals /= avg_pvals.sum()
avg_diff = numpy.mean(abs(avg_pvals - pvals))
assert avg_diff < mean_rtol
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论