提交 45c5780f authored 作者: Simon Lemieux's avatar Simon Lemieux

added multinomial for MRG

上级 0016c4d6
import theano
from theano import Op, Apply
import theano.tensor as T
from theano.tensor.opt import register_specialize
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
class Multinomial(Op):
    """Draw one-hot multinomial samples from precomputed uniform samples (CPU).

    Inputs (see ``make_node``):
      pvals: 2-d tensor of probabilities laid out as (nb_outcomes, nb_multi),
             i.e. one multinomial distribution per *column* (callers such as
             ``MRG_RandomStreams.multinomial`` pass the transpose of the usual
             (batch, outcomes) matrix).  Each column is assumed to sum to 1.
      unis:  1-d tensor of nb_multi uniform [0, 1) samples, one per column.

    Output: a tensor of the same type/shape as ``pvals``; in each column the
    first outcome whose cumulative probability exceeds that column's uniform
    sample is set to 1 and every other entry is 0.
    """

    def __eq__(self, other):
        # The Op is stateless: any two instances are interchangeable, so
        # equality (and the matching hash below) is by type only.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def make_node(self, pvals, unis):
        pvals = T.as_tensor_variable(pvals)
        unis = T.as_tensor_variable(unis)
        # NOTE(review): dtypes are deliberately not enforced here; the C code
        # uses the per-input dtype_* macros, so any float dtype should work.
        return Apply(self, [pvals, unis], [pvals.type()])

    def grad(self, inp, grads):
        # Sampling is not differentiable with respect to either input.
        # (Plain parameters instead of py2-only tuple-unpacking parameters
        # -- removed by PEP 3113 -- so this module also parses on Python 3.)
        return [None, None]

    def c_code_cache_version(self):
        # An empty tuple disables the C-code compilation cache for this Op;
        # useful while the implementation is still in flux.
        return ()

    def c_code(self, node, name, inp, out, sub):
        # Unpack explicitly rather than in the signature (PEP 3113).
        (pvals, unis) = inp
        (z,) = out
        fail = sub['fail']
        return """
        if (%(pvals)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s;
        }
        if (%(unis)s->nd != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s;
        }

        if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
            %(fail)s;
        }

        if ((NULL == %(z)s)
            || ((%(z)s->dimensions)[0] != (%(pvals)s->dimensions)[0])
            || ((%(z)s->dimensions)[1] != (%(pvals)s->dimensions)[1])
        )
        {
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = (%(pvals)s->dimensions)[0];
            dims[1] = (%(pvals)s->dimensions)[1];
            %(z)s = (PyArrayObject*) PyArray_ZEROS(2,
                dims,
                type_num_%(pvals)s,
                0);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                %(fail)s;
            }
        }

        { // NESTED SCOPE

        const int nb_outcomes = %(pvals)s->dimensions[0];
        const int nb_multi = %(pvals)s->dimensions[1];

        //
        // For each multinomials, loop over each possible outcome
        //
        for (int n = 0; n < nb_multi; ++n)
        {
            dtype_%(pvals)s cummul = 0.;
            const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);

            for (int m = 0; m < nb_outcomes; ++m)
            {
                dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n);
                const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n);
                cummul += *pvals_nm;
                if (*unis_n < cummul)
                {
                    *z_nm = 1.;
                    break;
                }
            }
        }
        } // END NESTED SCOPE
        """ % locals()


# Module-level singleton used by graph builders and the GPU optimizer below.
multinomial = Multinomial()
class GpuMultinomial(Multinomial):
    """GPU counterpart of ``Multinomial``, operating on CudaNdarray inputs.

    Same contract as the CPU Op: ``pvals`` is (nb_outcomes, nb_multi) float32,
    ``unis`` is a contiguous float32 vector of nb_multi uniform samples, and
    the output is a one-hot float32 matrix of the same shape as ``pvals``.
    One CUDA thread handles one multinomial (one column of ``pvals``).
    """

    def make_node(self, pvals, unis):
        # The kernel is written for float32 CudaNdarrays only.
        assert pvals.dtype == 'float32'
        assert unis.dtype == 'float32'
        if not isinstance(pvals.type, CudaNdarrayType):
            raise TypeError('pvals must be cudandarray', pvals)
        if not isinstance(unis.type, CudaNdarrayType):
            raise TypeError('unis must be cudandarray', unis)
        return Apply(self, [pvals, unis], [pvals.type()])

    def c_code_cache_version(self):
        # Unlike the CPU Op, caching is enabled here (version 1).
        return (1,)

    def c_support_code_apply(self, node, nodename):
        # One thread per multinomial: thread n walks the outcomes of column n,
        # accumulating probabilities until it passes the uniform sample, and
        # writes a full one-hot column into the (assumed c-contiguous) output.
        return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_multi,
            const int nb_outcomes,
            const int pvals_row_strides,
            const int pvals_col_strides,
            float * global_pvals,
            float * global_unis,
            float * global_outs
        )
        {
            // each thread takes care of one multinomial
            int n = 32*blockIdx.x + threadIdx.x;
            if (n < nb_multi)
            {
                float cummul = 0.;
                bool done = false;
                for (int m = 0; m < nb_outcomes; ++m)
                {
                    cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
                    float current_out = 0.;
                    if (!done && global_unis[n] < cummul)
                    {
                        current_out = 1.;
                        done = true;
                    }
                    global_outs[n + m * nb_multi] = current_out;
                }
            }
        }
        """ % locals()

    def c_code(self, node, name, inp, out, sub):
        # Unpack explicitly rather than in the signature (PEP 3113), so the
        # module also parses on Python 3.
        (pvals, unis) = inp
        (z,) = out
        fail = sub['fail']
        return """
        if (%(pvals)s->nd != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s;
        }
        if (%(unis)s->nd != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s;
        }

        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
            %(fail)s;
        }

        if (!CudaNdarray_is_c_contiguous(%(unis)s))
        {
            PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous");
            %(fail)s;
        }
        // Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
        // since we are working on pvals.T here

        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1]))
        {
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
            %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
            if (!%(z)s)
            {
                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
                %(fail)s;
            }
        }

        { // NESTED SCOPE

        int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
        int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];

        // One warp-sized block row of 32 threads per block; round the
        // block count up so every multinomial gets a thread.
        int nb_block;
        if (nb_multi %% 32 == 0)
            nb_block = nb_multi/32;
        else
            nb_block = (int)((float)nb_multi/32. + 1.);
        dim3 n_blocks(nb_block,1,1);
        dim3 n_threads(32,1,1);
        int n_shared = 0;

        k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
            CudaNdarray_HOST_DIMS(%(z)s)[1],
            CudaNdarray_HOST_DIMS(%(z)s)[0],
            CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
            CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
            CudaNdarray_DEV_DATA(%(pvals)s),
            CudaNdarray_DEV_DATA(%(unis)s),
            CudaNdarray_DEV_DATA(%(z)s)
        );
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
                "k_multi_warp_%(name)s",
                cudaGetErrorString(sts),
                n_blocks.x,
                n_blocks.y,
                n_threads.x,
                n_threads.y,
                n_threads.z,
                n_shared);
            %(fail)s;
        }

        } // END NESTED SCOPE
        """ % locals()


# Module-level singleton used by the specialization optimizer below.
gpu_multinomial = GpuMultinomial()
@local_optimizer()
def use_gpu_multinomial(node):
    """Graph optimization: replace a CPU ``multinomial`` node by the GPU Op.

    Inputs are transferred to the GPU with ``gpu_from_host`` and the result
    is brought back with ``host_from_gpu``; returns ``None`` implicitly for
    nodes it does not handle.
    """
    if node.op == multinomial:
        return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))]

# Only register the substitution when Theano is configured for the GPU.
if theano.config.device.startswith('gpu'):
    register_specialize(use_gpu_multinomial)
\ No newline at end of file
...@@ -14,6 +14,8 @@ from theano.tensor import zeros_like, sqrt, log, sin, cos, join ...@@ -14,6 +14,8 @@ from theano.tensor import zeros_like, sqrt, log, sin, cos, join
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer from theano.gof import local_optimizer
from multinomial import multinomial
from theano.sandbox.cuda import cuda_available, cuda_enabled from theano.sandbox.cuda import cuda_available, cuda_enabled
if cuda_available: if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor
...@@ -672,6 +674,27 @@ class MRG_RandomStreams(object): ...@@ -672,6 +674,27 @@ class MRG_RandomStreams(object):
return cast(self.uniform(size=size) < p, dtype) return cast(self.uniform(size=size) < p, dtype)
else: else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1") raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64'):
    """
    Sample `n` (currently `n` needs to be 1) times from a multinomial
    distribution defined by probabilities `pvals`.

    Example : pvals = [[.98, .01, .01], [.01, .98, .01]] will probably
    result in [[1,0,0],[0,1,0]].

    :param pvals: 2-d array-like of probabilities, one distribution per
        row; defaults to [[.5, .5]] (a single fair coin flip).
    :param dtype: dtype of the returned one-hot samples.

    .. note::
        `size` and `ndim` are only there keep the same signature as other
        uniform, binomial, normal, etc.
        todo : adapt multinomial to take that into account

    :raises NotImplementedError: if n != 1 or pvals is not 2-d.
    """
    # Avoid a mutable default argument: build the default inside the call.
    if pvals is None:
        pvals = [[.5, .5]]
    pvals = as_tensor_variable(pvals)
    if n == 1 and pvals.ndim == 2:
        # One uniform sample per row of pvals; the Multinomial Op works on
        # the transposed (nb_outcomes, nb_multi) layout, hence the .T pair.
        unis = self.uniform(size=pvals.shape[0:1], ndim=1)
        return cast(multinomial(pvals.T, unis).T, dtype)
    else:
        raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2")
def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX): def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX):
# We need an even number of ]0,1[ samples. Then we split them # We need an even number of ]0,1[ samples. Then we split them
......
...@@ -475,3 +475,59 @@ def test_normal0(): ...@@ -475,3 +475,59 @@ def test_normal0():
ff = theano.function([], nn) ff = theano.function([], nn)
basictest(ff, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True) basictest(ff, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True)
def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04):
    """Call the compiled sampler `f` `steps` times, average the one-hot
    samples, and print the empirical mean next to `target_pvals` along with
    timing/throughput figures.

    # NOTE(review): despite the `mean_rtol` parameter, no assertion is made;
    # the tolerance check is commented out below and only the mean absolute
    # deviation is printed -- confirm whether this is intentional.
    """
    dt = 0.0
    # Running sum of the one-hot samples; divided by `steps` at the end.
    avg_pvals = numpy.zeros(target_pvals.shape, dtype=config.floatX)
    for i in xrange(steps):
        t0 = time.time()
        ival = f()
        dt += time.time() - t0  # accumulate only the sampling time
        #ival = numpy.asarray(ival)
        avg_pvals += ival
    avg_pvals/= steps
    print 'random?[:10]\n', f()[:10]
    print prefix, 'mean', avg_pvals
    # Disabled tolerance check kept in the trailing comment:
    print numpy.mean(abs(avg_pvals - target_pvals))# < mean_rtol, 'bad mean? %s %s' % (str(avg_pvals), str(target_pvals))
    print prefix, 'time', dt
    print prefix, 'elements', steps*numpy.prod(target_pvals.shape)
    print prefix, 'samples/sec', steps*numpy.prod(target_pvals.shape) / dt
def test_multinomial():
    """Smoke-test MRG_RandomStreams.multinomial on CPU and (when enabled)
    on GPU, printing empirical means via basic_multinomialtest."""
    steps = 100
    # Smaller problem under slow modes so the test stays fast.
    if mode in ['DEBUG_MODE','FAST_COMPILE']:
        sample_size = (49,5)
    else:
        sample_size = (450,6)
    print ''
    print 'ON CPU:'

    # Random probability matrix with rows normalized to sum to 1.
    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    pvals = numpy.apply_along_axis(lambda row : row/numpy.sum(row), 1, pvals)
    R = MRG_RandomStreams(234, use_cuda=False)
    # Note: non-symbolic pvals here; sampler always draws from the same matrix.
    m = R.multinomial(pvals=pvals, dtype=config.floatX)
    f = theano.function([], m, mode=mode)
    theano.printing.debugprint(f)
    basic_multinomialtest(f, steps, pvals, prefix='mrg ')
    sys.stdout.flush()

    if mode!='FAST_COMPILE' and cuda_enabled:
        print ''
        print 'ON GPU:'
        R = MRG_RandomStreams(234, use_cuda=True)
        n = R.multinomial(pvals=pvals, dtype='float32')
        # float32 is required for the GPU path to actually be exercised.
        assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
        # Keep the result on the GPU (borrow=True) to measure device speed.
        f = theano.function([], theano.Out(
                theano.sandbox.cuda.basic_ops.gpu_from_host(n),
                borrow=True), mode=mode_with_gpu)
        theano.printing.debugprint(f)
        sys.stdout.flush()
        basic_multinomialtest(f, steps, pvals, prefix='gpu mrg ')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论