Merge pull request #3795 from aalmah/rand_weighted_select

Sample multinomial with replacement

Merge pull request #3795 from aalmah/rand_weighted_select
931dfec1 · carriepl · b2362664 · 837e3abf · 931dfec1 · 931dfec1
--- a/theano/sandbox/multinomial.py
+++ b/theano/sandbox/multinomial.py
@@ -6,6 +6,7 @@ import theano.tensor as T
 from theano.gof import local_optimizer
 from theano.tensor import NotScalarConstantError, get_scalar_constant_value
 from theano.scalar import as_scalar
+import copy

 from theano.sandbox.cuda import cuda_available, GpuOp
 if cuda_available:
@@ -193,6 +194,72 @@ class MultinomialFromUniform(Op):
                            break


+class MultinomialWOReplacementFromUniform(MultinomialFromUniform):
+    """
+    Converts samples from a uniform into sample (without replacement) from a multinomial.
+
+    """
+
+    def make_node(self, pvals, unis, n=1):
+        pvals = T.as_tensor_variable(pvals)
+        unis = T.as_tensor_variable(unis)
+        if pvals.ndim != 2:
+            raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
+        if unis.ndim != 1:
+            raise NotImplementedError('unis ndim should be 1', unis.ndim)
+        if self.odtype == 'auto':
+            odtype = 'int64'
+        else:
+            odtype = self.odtype
+        out = T.tensor(dtype=odtype, broadcastable=pvals.type.broadcastable)
+        return Apply(self, [pvals, unis, as_scalar(n)], [out])
+
+    def perform(self, node, ins, outs):
+        (pvals, unis, n_samples) = ins
+        # make a copy so we do not overwrite the input
+        pvals = copy.copy(pvals)
+        (z,) = outs
+
+        if n_samples > pvals.shape[1]:
+            raise ValueError("Cannot sample without replacement n samples bigger "
+                             "than the size of the distribution.")
+
+        if unis.shape[0] != pvals.shape[0] * n_samples:
+            raise ValueError("unis.shape[0] != pvals.shape[0] * n_samples",
+                             unis.shape[0], pvals.shape[0], n_samples)
+
+        if self.odtype == 'auto':
+            odtype = 'int64'
+        else:
+            odtype = self.odtype
+        if z[0] is None or not numpy.all(z[0].shape == [pvals.shape[0], n_samples]):
+            z[0] = -1 * numpy.ones((pvals.shape[0], n_samples), dtype=odtype)
+
+        nb_multi = pvals.shape[0]
+        nb_outcomes = pvals.shape[1]
+
+        # For each multinomial, loop over each possible outcome,
+        # and set selected pval to 0 after being selected
+        for c in range(n_samples):
+            for n in range(nb_multi):
+                cummul = 0
+                unis_n = unis[c * nb_multi + n]
+                for m in range(nb_outcomes):
+                    cummul += pvals[n, m]
+                    if (cummul > unis_n):
+                        z[0][n, c] = m
+                        # set to zero and re-normalize so that it's not selected again
+                        pvals[n, m] = 0.
+                        pvals[n] /= pvals[n].sum()
+                        break
+
+    def c_code_cache_version(self):
+        return None
+
+    def c_code(self, node, name, ins, outs, sub):
+        raise NotImplementedError('no C implementation yet!')
+
+
 class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
    """
    The output is transposed compared to MultinomialFromUniform.

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -1363,6 +1363,54 @@ class MRG_RandomStreams(object):
            raise NotImplementedError(("MRG_RandomStreams.multinomial only"
                                       " implemented for pvals.ndim = 2"))

+    def multinomial_wo_replacement(self, size=None, n=1, pvals=None, ndim=None, dtype='int64',
+                                   nstreams=None):
+        """
+        Sample `n` times *WITHOUT replacement* from a multinomial distribution
+        defined by probabilities pvals, and returns the indices of the sampled
+        elements.
+        `n` needs to be in [1, m], where m is the number of elements to select
+        from, i.e. m == pvals.shape[1]. By default n = 1.
+        
+        Example : pvals = [[.98, .01, .01], [.01, .49, .50]] and n=1 will
+        probably result in [[0],[2]]. When setting n=2, this
+        will probably result in [[0,1],[2,1]].
+        
+        Notes
+        -----
+        -`size` and `ndim` are only there keep the same signature as other
+        uniform, binomial, normal, etc.
+        TODO : adapt multinomial to take that into account
+
+        -Does not do any value checking on pvals, i.e. there is no
+        check that the elements are non-negative, less than 1, or
+        sum to 1. passing pvals = [[-2., 2.]] will result in
+        sampling [[0, 0]]
+
+        """
+        if pvals is None:
+            raise TypeError("You have to specify pvals")
+        pvals = as_tensor_variable(pvals)
+
+        if size is not None:
+            raise ValueError("Provided a size argument to "
+                             "MRG_RandomStreams.multinomial_wo_replacement, which does not use "
+                             "the size argument.")
+        if ndim is not None:
+            raise ValueError("Provided an ndim argument to "
+                             "MRG_RandomStreams.multinomial_wo_replacement, which does not use "
+                             "the ndim argument.")
+        if pvals.ndim == 2:
+            # size = [pvals.shape[0], as_tensor_variable(n)]
+            size = pvals[:,0].shape * n
+            unis = self.uniform(size=size, ndim=1, nstreams=nstreams)
+            op = multinomial.MultinomialWOReplacementFromUniform(dtype)
+            n_samples = as_tensor_variable(n)
+            return op(pvals, unis, n_samples)
+        else:
+            raise NotImplementedError(("MRG_RandomStreams.multinomial_wo_replacement only"
+                                       " implemented for pvals.ndim = 2"))
+
    def normal(self, size, avg=0.0, std=1.0, ndim=None,
               dtype=None, nstreams=None):
        """

--- a/theano/sandbox/tests/test_multinomial_wo_replacement.py
+++ b/theano/sandbox/tests/test_multinomial_wo_replacement.py
+import numpy
+from theano import config, function, tensor
+from theano.sandbox import multinomial
+from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
+import unittest
+
+
+class test_OP(unittest.TestCase):
+
+    def test_select_distinct(self):
+        """
+        Tests that MultinomialWOReplacementFromUniform always selects distinct elements
+        """
+        p = tensor.fmatrix()
+        u = tensor.fvector()
+        n = tensor.iscalar()
+        m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
+
+        f = function([p, u, n], m, allow_input_downcast=True)
+
+        n_elements = 1000
+        all_indices = range(n_elements)
+        numpy.random.seed(12345)
+        for i in [5, 10, 50, 100, 500, n_elements]:
+            uni = numpy.random.rand(i).astype(config.floatX)
+            pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+            pvals /= pvals.sum(1)
+            res = f(pvals, uni, i)
+            res = numpy.squeeze(res)
+            assert len(res) == i
+            assert numpy.all(numpy.in1d(numpy.unique(res), all_indices)), res
+
+    def test_fail_select_alot(self):
+        """
+        Tests that MultinomialWOReplacementFromUniform fails when asked to sample more
+        elements than the actual number of elements
+        """
+        p = tensor.fmatrix()
+        u = tensor.fvector()
+        n = tensor.iscalar()
+        m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
+
+        f = function([p, u, n], m, allow_input_downcast=True)
+
+        n_elements = 100
+        n_selected = 200
+        numpy.random.seed(12345)
+        uni = numpy.random.rand(n_selected).astype(config.floatX)
+        pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+        pvals /= pvals.sum(1)
+        self.assertRaises(ValueError, f, pvals, uni, n_selected)
+
+    def test_select_proportional_to_weight(self):
+        """
+        Tests that MultinomialWOReplacementFromUniform selects elements, on average,
+        proportional to the their probabilities
+        """
+        p = tensor.fmatrix()
+        u = tensor.fvector()
+        n = tensor.iscalar()
+        m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
+
+        f = function([p, u, n], m, allow_input_downcast=True)
+
+        n_elements = 100
+        n_selected = 10
+        mean_rtol = 0.0005
+        numpy.random.seed(12345)
+        pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+        pvals /= pvals.sum(1)
+        avg_pvals = numpy.zeros((n_elements,), dtype=config.floatX)
+
+        for rep in range(10000):
+            uni = numpy.random.rand(n_selected).astype(config.floatX)
+            res = f(pvals, uni, n_selected)
+            res = numpy.squeeze(res)
+            avg_pvals[res] += 1
+        avg_pvals /= avg_pvals.sum()
+        avg_diff = numpy.mean(abs(avg_pvals - pvals))
+        assert avg_diff < mean_rtol, avg_diff
+
+
+class test_function(unittest.TestCase):
+
+    def test_select_distinct(self):
+        """
+        Tests that multinomial_wo_replacement always selects distinct elements
+        """
+        th_rng = RandomStreams(12345)
+
+        p = tensor.fmatrix()
+        n = tensor.iscalar()
+        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
+
+        f = function([p, n], m, allow_input_downcast=True)
+
+        n_elements = 1000
+        all_indices = range(n_elements)
+        numpy.random.seed(12345)
+        for i in [5, 10, 50, 100, 500, n_elements]:
+            pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+            pvals /= pvals.sum(1)
+            res = f(pvals, i)
+            res = numpy.squeeze(res)
+            assert len(res) == i
+            assert numpy.all(numpy.in1d(numpy.unique(res), all_indices)), res
+
+    def test_fail_select_alot(self):
+        """
+        Tests that multinomial_wo_replacement fails when asked to sample more
+        elements than the actual number of elements
+        """
+        th_rng = RandomStreams(12345)
+
+        p = tensor.fmatrix()
+        n = tensor.iscalar()
+        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
+
+        f = function([p, n], m, allow_input_downcast=True)
+
+        n_elements = 100
+        n_selected = 200
+        numpy.random.seed(12345)
+        pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+        pvals /= pvals.sum(1)
+        self.assertRaises(ValueError, f, pvals, n_selected)
+
+    def test_select_proportional_to_weight(self):
+        """
+        Tests that multinomial_wo_replacement selects elements, on average,
+        proportional to the their probabilities
+        """
+        th_rng = RandomStreams(12345)
+
+        p = tensor.fmatrix()
+        n = tensor.iscalar()
+        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
+
+        f = function([p, n], m, allow_input_downcast=True)
+
+        n_elements = 100
+        n_selected = 10
+        mean_rtol = 0.0005
+        numpy.random.seed(12345)
+        pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+        pvals /= pvals.sum(1)
+        avg_pvals = numpy.zeros((n_elements,), dtype=config.floatX)
+
+        for rep in range(10000):
+            res = f(pvals, n_selected)
+            res = numpy.squeeze(res)
+            avg_pvals[res] += 1
+        avg_pvals /= avg_pvals.sum()
+        avg_diff = numpy.mean(abs(avg_pvals - pvals))
+        assert avg_diff < mean_rtol