Merge pull request #4757 from abergeron/multi_gpu

Multinomial Without Replacement

Merge pull request #4757 from abergeron/multi_gpu
cb6c5b9c · Frédéric Bastien · GitHub · 884ed6be · f7c01d53 · cb6c5b9c
--- a/theano/gpuarray/multinomial.py
+++ b/theano/gpuarray/multinomial.py
@@ -11,11 +11,13 @@ import theano
 import theano.sandbox.multinomial
 from theano import Apply, config
 from theano.gof import Op
-from theano.tensor import NotScalarConstantError, get_scalar_constant_value
 from theano import gpuarray
+from theano.tensor import NotScalarConstantError, get_scalar_constant_value
 from .basic_ops import as_gpuarray_variable, infer_context_name
 from .opt import register_opt, op_lifter, register_opt2
 from .type import GpuArrayType
+from theano.scalar import as_scalar
 class GPUAMultinomialFromUniform(gpuarray.basic_ops.GpuKernelBase, Op):
@@ -227,6 +229,239 @@ KERNEL void k_multi_warp_multinomial(
        return (1,)
+class GPUAMultinomialWOReplacementFromUniform(gpuarray.basic_ops.GpuKernelBase, Op):
+    """
+    The output is transposed compared to MultinomialWOReplacementFromUniform.
+    We must insert a Transpose op after it.
+    The optimization that moves it to the gpu does it.
+    """
+    __props__ = ("odtype",)
+    def __init__(self, odtype):
+        Op.__init__(self)
+        self.odtype = odtype
+    def get_params(self, node):
+        return node.outputs[0].type.context
+    def c_headers(self):
+        return ['<numpy_compat.h>', 'gpuarray_helper.h']
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
+    def make_node(self, pvals, unis, n):
+        assert pvals.dtype == 'float32'
+        assert unis.dtype == 'float32'
+        ctx_name = infer_context_name(pvals, unis)
+        pvals = as_gpuarray_variable(pvals, ctx_name)
+        unis = as_gpuarray_variable(unis, ctx_name)
+        if pvals.ndim != 2:
+            raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
+        if unis.ndim != 1:
+            raise NotImplementedError('unis ndim should be 1', unis.ndim)
+        if self.odtype == 'auto':
+            odtype = 'int64'
+        else:
+            odtype = self.odtype
+        assert odtype == 'int64', odtype
+        br = (pvals.broadcastable[1], pvals.broadcastable[0])
+        out = GpuArrayType(broadcastable=br,
+                           dtype=odtype,
+                           context_name=ctx_name)()
+        return Apply(self, [pvals, unis, as_scalar(n)], [out])
+    def gpu_kernels(self, node, name):
+        code = """
+KERNEL void k_multi_warp_multinomial_wor(
+    const ga_size nb_multi,
+    const ga_size nb_outcomes,
+    const ga_size n_samples,
+    GLOBAL_MEM float * global_pvals_copy,
+    const ga_ssize pvals_row_stride,
+    const ga_ssize pvals_col_stride,
+    GLOBAL_MEM float * global_unis,
+    const ga_ssize unis_stride,
+    GLOBAL_MEM ga_long * global_outs,
+    const ga_ssize outs_row_stride,
+    const ga_ssize outs_col_stride
+)
+{
+    // each thread takes care of one multinomial-wor n_samples-draw
+    int n = LDIM_0*GID_0 + LID_0;
+    if (n < nb_multi)
+    {
+        for (int c = 0; c < n_samples; ++c)
+        {
+            float cummul = 0.;
+            bool done = false;
+            const float unis_n = global_unis[(c * nb_multi + n)*unis_stride];
+            for (ga_size m = 0; m < nb_outcomes; ++m)
+            {
+                float pvals_nm = global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride];
+                cummul += pvals_nm;
+                if (!done && unis_n < cummul)
+                {
+                    //write out transposed for speed.
+                    global_outs[n * outs_col_stride +
+                                c * outs_row_stride] = m;
+                    global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride] = 0.0;
+                    cummul -= pvals_nm;
+                    done = true;
+                }
+            }
+            // renormalize the multinomial
+            for (ga_int k = 0; k < nb_outcomes; ++k)
+            {
+                global_pvals_copy[k * pvals_col_stride + n * pvals_row_stride] /= cummul;
+            }
+        }
+    }
+}
+"""
+        return [gpuarray.basic_ops.Kernel(
+            code=code, name="k_multi_warp_multinomial_wor",
+            params=[pygpu.gpuarray.SIZE,
+                    pygpu.gpuarray.SIZE,
+                    pygpu.gpuarray.SIZE,
+                    pygpu.gpuarray.GpuArray,
+                    pygpu.gpuarray.SSIZE,
+                    pygpu.gpuarray.SSIZE,
+                    pygpu.gpuarray.GpuArray,
+                    pygpu.gpuarray.SSIZE,
+                    pygpu.gpuarray.GpuArray,
+                    pygpu.gpuarray.SSIZE,
+                    pygpu.gpuarray.SSIZE
+                    ],
+            flags=gpuarray.basic_ops.Kernel.get_flags(node.outputs[0].dtype),
+            objvar='k_multi_warp_multinomial_wor_' + name)]
+    def c_code(self, node, name, inp, outputs, sub):
+        pvals, unis, n = inp
+        out, = outputs
+        fail = sub['fail']
+        ctx = sub['params']
+        sync = bool(config.gpuarray.sync)
+        kname = self.gpu_kernels(node, name)[0].objvar
+        s = """
+    PyGpuArrayObject * pvals = %(pvals)s;
+    PyGpuArrayObject * unis = %(unis)s;
+    const size_t n_samples = %(n)s;
+    PyGpuArrayObject * out = %(out)s;
+    // create a copy of pvals matrix
+    PyGpuArrayObject * pvals_copy = NULL;
+    size_t dims[2];
+    if (PyGpuArray_NDIM(pvals) != 2)
+    {
+        PyErr_Format(PyExc_TypeError, "pvals wrong rank");
+        %(fail)s
+    }
+    if (PyGpuArray_NDIM(unis) != 1)
+    {
+        PyErr_Format(PyExc_TypeError, "unis wrong rank");
+        %(fail)s
+    }
+    if ( n_samples > (PyGpuArray_DIMS(pvals)[1]) )
+    {
+        PyErr_Format(PyExc_ValueError, "Cannot sample without replacement n samples bigger than the size of the distribution.");
+        %(fail)s;
+    }
+    if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0] * n_samples)
+    {
+        PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0] * n");
+        %(fail)s
+    }
+    pvals_copy = pygpu_copy(pvals, GA_C_ORDER);
+    dims[0] = n_samples;
+    dims[1] = PyGpuArray_DIMS(pvals)[0];
+    if (theano_prep_output(&out, 2, dims, GA_LONG,
+                           GA_C_ORDER, %(ctx)s) != 0){
+      %(fail)s
+    }
+    %(out)s = out;
+    { // NESTED SCOPE
+        int nb_multi = PyGpuArray_DIMS(pvals)[0];
+        int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
+        //TODO : change this for a beautiful constant
+        int max_nb_blocks = 2<<15 - 1;
+        size_t nb_blocks = max_nb_blocks + 1;
+        size_t nb_threads=16; // so it really starts at 32, because of the *2
+        do
+        {
+            nb_threads*=2;
+            if (nb_multi %% nb_threads == 0)
+                nb_blocks = nb_multi/nb_threads;
+            else
+                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
+        } while (nb_blocks > max_nb_blocks);
+        // TODO : next line is a bit hardcoded...
+        if (nb_threads > 512)
+        {
+            PyErr_Format(
+                PyExc_ValueError,
+                "Multinomial is not implemented for so many rows in the matrix (%%i)",
+                nb_multi);
+            %(fail)s
+        }
+        assert(nb_blocks*nb_threads >= nb_multi);
+        void *args[11];
+        ssize_t strides[5] = {
+            PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
+            PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
+            PyGpuArray_STRIDES(unis)[0]/sizeof(float),
+            PyGpuArray_STRIDES(out)[0]/8,
+            PyGpuArray_STRIDES(out)[1]/8
+        };
+        int err;
+        args[0] = (void*)&PyGpuArray_DIMS(pvals)[0];
+        args[1] = (void*)&PyGpuArray_DIMS(pvals)[1];
+        args[2] = (void*)&n_samples;
+        args[3] = pvals_copy->ga.data; //PyGpuArray_DEV_DATA(pvals);
+        args[4] = (void*)&strides[0];
+        args[5] = (void*)&strides[1];
+        args[6] = unis->ga.data; //PyGpuArray_DEV_DATA(unis);
+        args[7] = (void*)&strides[2];
+        args[8] = out->ga.data; //PyGpuArray_DEV_DATA(out);
+        args[9] = (void*)&strides[3];
+        args[10] = (void*)&strides[4];
+        err = GpuKernel_call(&%(kname)s, 1, &nb_threads, &nb_blocks, 0, args);
+        if (err != GA_NO_ERROR) {
+           PyErr_Format(
+                PyExc_RuntimeError,
+                "gpuarray error: %%s: %%s.\\n",
+                "k_multi_warp_%(name)s",
+                GpuKernel_error(&%(kname)s, err));
+            %(fail)s;
+        }
+        if(%(sync)d)
+            GpuArray_sync(&(out->ga));
+    } // END NESTED SCOPE
+        """ % locals()
+        return s
+    def c_code_cache_version(self):
+        return (1,)
 @register_opt('fast_compile')
 @op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
 @register_opt2([theano.sandbox.multinomial.MultinomialFromUniform], 'fast_compile')
@@ -248,3 +483,16 @@ def local_gpua_multinomial(op, context_name, inputs, outputs):
        gpu_op = GPUAMultinomialFromUniform(op.odtype)
        return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
            gpu_op(p, u))
+@register_opt('fast_compile')
+@op_lifter([theano.sandbox.multinomial.MultinomialWOReplacementFromUniform])
+@register_opt2([theano.sandbox.multinomial.MultinomialWOReplacementFromUniform], 'fast_compile')
+def local_gpua_multinomial_wor(op, context_name, inputs, outputs):
+    # TODO : need description for function
+    p, u, n = inputs
+    m, = outputs
+    if ((p.dtype == u.dtype == 'float32') and (m.dtype == 'int64')):
+        gpu_op = GPUAMultinomialWOReplacementFromUniform(op.odtype)
+        return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
+            gpu_op(p, u, n))
--- a/theano/gpuarray/tests/test_multinomial.py
+++ b/theano/gpuarray/tests/test_multinomial.py
@@ -2,23 +2,18 @@ from __future__ import absolute_import, print_function, division
 import numpy
+import unittest
 import theano
 from theano import config, function, tensor
-from ..multinomial import GPUAMultinomialFromUniform
+from theano.sandbox import multinomial
-import theano.tests.unittest_tools as utt
+from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
-from .config import mode_with_gpu, mode_without_gpu
-def get_mode(gpu):
-    mode = mode_without_gpu
-    if gpu:
-        mode = mode_with_gpu
-    return mode
+import theano.tests.unittest_tools as utt
-def run_with_c(f, gpu=False):
+from .config import mode_with_gpu
-    mode = get_mode(gpu)
+from ..multinomial import (GPUAMultinomialFromUniform,
-    f(mode, gpu)
+                           GPUAMultinomialWOReplacementFromUniform)
 def test_multinomial_0():
@@ -30,68 +25,59 @@ def test_multinomial_0():
    m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
-    def body(mode, gpu):
+    # the m*2 allows the multinomial to reuse output
-        # the m*2 allows the multinomial to reuse output
+    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
-        f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
-        if gpu:
+    assert any([type(node.op) is GPUAMultinomialFromUniform
-            assert any([type(node.op) is GPUAMultinomialFromUniform
+                for node in f.maker.fgraph.toposort()])
-                        for node in f.maker.fgraph.toposort()])
-        # test that both first and second samples can be drawn
-        utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
-                            [[2, 0], [0, 2]])
-        # test that both second labels can be drawn
+    # test that both first and second samples can be drawn
-        r = f([[.2, .8], [.3, .7]], [.31, .31])
+    utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
-        utt.assert_allclose(r, [[0, 2], [0, 2]])
+                        [[2, 0], [0, 2]])
-        # test that both first labels can be drawn
+    # test that both second labels can be drawn
-        r = f([[.2, .8], [.3, .7]], [.21, .21])
+    r = f([[.2, .8], [.3, .7]], [.31, .31])
-        utt.assert_allclose(r, [[0, 2], [2, 0]])
+    utt.assert_allclose(r, [[0, 2], [0, 2]])
-        # change the size to make sure output gets reallocated ok
+    # test that both first labels can be drawn
-        # and also make sure that the GPU version doesn't screw up the
+    r = f([[.2, .8], [.3, .7]], [.21, .21])
-        # transposed-ness
+    utt.assert_allclose(r, [[0, 2], [2, 0]])
-        r = f([[.2, .8]], [.25])
-        utt.assert_allclose(r, [[0, 2]])
-    run_with_c(body)
+    # change the size to make sure output gets reallocated ok
-    run_with_c(body, True)
+    # and also make sure that the GPU version doesn't screw up the
+    # transposed-ness
+    r = f([[.2, .8]], [.25])
+    utt.assert_allclose(r, [[0, 2]])
 # TODO: check a bigger example (make sure blocking on GPU is handled correctly)
 def test_multinomial_large():
    # DEBUG_MODE will test this on GPU
-    def body(mode, gpu):
+    p = tensor.fmatrix()
-        p = tensor.fmatrix()
+    u = tensor.fvector()
-        u = tensor.fvector()
+    m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
-        m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
+    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
-        f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
+    assert any([type(node.op) is GPUAMultinomialFromUniform
-        if gpu:
+                for node in f.maker.fgraph.toposort()])
-            assert any([type(node.op) is GPUAMultinomialFromUniform
-                        for node in f.maker.fgraph.toposort()])
+    pval = numpy.arange(10000 * 4,
+                        dtype='float32').reshape((10000, 4)) + 0.1
-        pval = numpy.arange(10000 * 4,
+    pval = pval / pval.sum(axis=1)[:, None]
-                            dtype='float32').reshape((10000, 4)) + 0.1
+    uval = numpy.ones_like(pval[:, 0]) * 0.5
-        pval = pval / pval.sum(axis=1)[:, None]
+    mval = f(pval, uval)
-        uval = numpy.ones_like(pval[:, 0]) * 0.5
-        mval = f(pval, uval)
+    assert mval.shape == pval.shape
+    if config.cast_policy == 'custom':
-        assert mval.shape == pval.shape
+        assert mval.dtype == pval.dtype
-        if config.cast_policy == 'custom':
+    elif config.cast_policy == 'numpy+floatX':
-            assert mval.dtype == pval.dtype
+        assert mval.dtype == config.floatX
-        elif config.cast_policy == 'numpy+floatX':
+    elif config.cast_policy == 'numpy':
-            assert mval.dtype == config.floatX
+        assert mval.dtype == 'float64'
-        elif config.cast_policy == 'numpy':
+    else:
-            assert mval.dtype == 'float64'
+        raise NotImplementedError(config.cast_policy)
-        else:
+    utt.assert_allclose(mval.sum(axis=1), 2)
-            raise NotImplementedError(config.cast_policy)
+    asdf = numpy.asarray([0, 0, 2, 0]) + 0 * pval
-        utt.assert_allclose(mval.sum(axis=1), 2)
+    utt.assert_allclose(mval, asdf)  # broadcast over all rows
-        asdf = numpy.asarray([0, 0, 2, 0]) + 0 * pval
-        utt.assert_allclose(mval, asdf)  # broadcast over all rows
-    run_with_c(body)
-    run_with_c(body, True)
 def test_gpu_opt():
@@ -104,7 +90,7 @@ def test_gpu_opt():
    m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
    assert m.dtype == 'float32', m.dtype
-    f = function([p, u], m, allow_input_downcast=True, mode=get_mode(True))
+    f = function([p, u], m, allow_input_downcast=True, mode=mode_with_gpu)
    assert any([type(node.op) is GPUAMultinomialFromUniform
                for node in f.maker.fgraph.toposort()])
    pval = numpy.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
@@ -117,10 +103,192 @@ def test_gpu_opt():
    m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(r, u)
    assert m.dtype == 'float32', m.dtype
-    f = function([r, u], m, allow_input_downcast=True, mode=get_mode(True))
+    f = function([r, u], m, allow_input_downcast=True, mode=mode_with_gpu)
    assert any([type(node.op) is GPUAMultinomialFromUniform
                for node in f.maker.fgraph.toposort()])
    pval = numpy.arange(1 * 4, dtype='float32').reshape((1, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = numpy.ones_like(pval[:, 0]) * 0.5
    f(pval, uval)
+class test_OP_wor(unittest.TestCase):
+    def test_select_distinct(self):
+        """
+        Tests that MultinomialWOReplacementFromUniform always selects distinct elements
+        """
+        p = tensor.fmatrix()
+        u = tensor.fvector()
+        n = tensor.iscalar()
+        m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
+        f = function([p, u, n], m, allow_input_downcast=True)
+        n_elements = 1000
+        all_indices = range(n_elements)
+        numpy.random.seed(12345)
+        for i in [5, 10, 50, 100, 500, n_elements]:
+            uni = numpy.random.rand(i).astype(config.floatX)
+            pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+            pvals /= pvals.sum(1)
+            res = f(pvals, uni, i)
+            res = numpy.squeeze(res)
+            assert len(res) == i, res
+            assert numpy.all(numpy.in1d(numpy.unique(res), all_indices)), res
+    def test_fail_select_alot(self):
+        """
+        Tests that MultinomialWOReplacementFromUniform fails when asked to sample more
+        elements than the actual number of elements
+        """
+        p = tensor.fmatrix()
+        u = tensor.fvector()
+        n = tensor.iscalar()
+        m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
+        f = function([p, u, n], m, allow_input_downcast=True)
+        n_elements = 100
+        n_selected = 200
+        numpy.random.seed(12345)
+        uni = numpy.random.rand(n_selected).astype(config.floatX)
+        pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+        pvals /= pvals.sum(1)
+        self.assertRaises(ValueError, f, pvals, uni, n_selected)
+    def test_select_proportional_to_weight(self):
+        """
+        Tests that MultinomialWOReplacementFromUniform selects elements, on average,
+        proportional to the their probabilities
+        """
+        p = tensor.fmatrix()
+        u = tensor.fvector()
+        n = tensor.iscalar()
+        m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
+        f = function([p, u, n], m, allow_input_downcast=True)
+        n_elements = 100
+        n_selected = 10
+        mean_rtol = 0.0005
+        numpy.random.seed(12345)
+        pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+        pvals /= pvals.sum(1)
+        avg_pvals = numpy.zeros((n_elements,), dtype=config.floatX)
+        for rep in range(10000):
+            uni = numpy.random.rand(n_selected).astype(config.floatX)
+            res = f(pvals, uni, n_selected)
+            res = numpy.squeeze(res)
+            avg_pvals[res] += 1
+        avg_pvals /= avg_pvals.sum()
+        avg_diff = numpy.mean(abs(avg_pvals - pvals))
+        assert avg_diff < mean_rtol, avg_diff
+class test_function_wor(unittest.TestCase):
+    def test_select_distinct(self):
+        """
+        Tests that multinomial_wo_replacement always selects distinct elements
+        """
+        th_rng = RandomStreams(12345)
+        p = tensor.fmatrix()
+        n = tensor.iscalar()
+        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
+        f = function([p, n], m, allow_input_downcast=True)
+        n_elements = 1000
+        all_indices = range(n_elements)
+        numpy.random.seed(12345)
+        for i in [5, 10, 50, 100, 500, n_elements]:
+            pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+            pvals /= pvals.sum(1)
+            res = f(pvals, i)
+            res = numpy.squeeze(res)
+            assert len(res) == i
+            assert numpy.all(numpy.in1d(numpy.unique(res), all_indices)), res
+    def test_fail_select_alot(self):
+        """
+        Tests that multinomial_wo_replacement fails when asked to sample more
+        elements than the actual number of elements
+        """
+        th_rng = RandomStreams(12345)
+        p = tensor.fmatrix()
+        n = tensor.iscalar()
+        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
+        f = function([p, n], m, allow_input_downcast=True)
+        n_elements = 100
+        n_selected = 200
+        numpy.random.seed(12345)
+        pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+        pvals /= pvals.sum(1)
+        self.assertRaises(ValueError, f, pvals, n_selected)
+    def test_select_proportional_to_weight(self):
+        """
+        Tests that multinomial_wo_replacement selects elements, on average,
+        proportional to the their probabilities
+        """
+        th_rng = RandomStreams(12345)
+        p = tensor.fmatrix()
+        n = tensor.iscalar()
+        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
+        f = function([p, n], m, allow_input_downcast=True)
+        n_elements = 100
+        n_selected = 10
+        mean_rtol = 0.0005
+        numpy.random.seed(12345)
+        pvals = numpy.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
+        pvals /= pvals.sum(1)
+        avg_pvals = numpy.zeros((n_elements,), dtype=config.floatX)
+        for rep in range(10000):
+            res = f(pvals, n_selected)
+            res = numpy.squeeze(res)
+            avg_pvals[res] += 1
+        avg_pvals /= avg_pvals.sum()
+        avg_diff = numpy.mean(abs(avg_pvals - pvals))
+        assert avg_diff < mean_rtol
+def test_gpu_opt_wor():
+    # We test the case where we put the op on the gpu when the output
+    # is moved to the gpu.
+    p = tensor.fmatrix()
+    u = tensor.fvector()
+    n = tensor.iscalar()
+    m = multinomial.MultinomialWOReplacementFromUniform('auto')(p, u, n)
+    assert m.dtype == 'int64', m.dtype
+    f = function([p, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
+    assert any([type(node.op) is GPUAMultinomialWOReplacementFromUniform
+                for node in f.maker.fgraph.toposort()])
+    n_samples = 3
+    pval = numpy.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
+    pval = pval / pval.sum(axis=1)[:, None]
+    uval = numpy.ones(pval.shape[0] * n_samples) * 0.5
+    f(pval, uval, n_samples)
+    # Test with a row, it was failing in the past.
+    r = tensor.frow()
+    m = multinomial.MultinomialWOReplacementFromUniform('auto')(r, u, n)
+    assert m.dtype == 'int64', m.dtype
+    f = function([r, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
+    assert any([type(node.op) is GPUAMultinomialWOReplacementFromUniform
+                for node in f.maker.fgraph.toposort()])
+    pval = numpy.arange(1 * 4, dtype='float32').reshape((1, 4)) + 0.1
+    pval = pval / pval.sum(axis=1)[:, None]
+    uval = numpy.ones_like(pval[:, 0]) * 0.5
+    f(pval, uval, 1)