Commit c708e3a5 authored by Frederic Bastien

Advance MultinomialFromUniform for the new back-end

Parent: fa978938
#section support_code_apply
static __global__ void k_multi_warp_APPLYSPECIFIC(multinomial)(
static __global__ void k_multi_warp_multinomial(
const int nb_multi,
const int nb_outcomes,
float * global_pvals,
......@@ -66,6 +66,7 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
if (theano_prep_output(out, 2, dims, unis->ga.typecode,
GA_C_ORDER, c) != 0)
return 1;
GpuArray_memset(&((*out)->ga), 0);
{ // NESTED SCOPE
int nb_multi = PyGpuArray_DIMS(pvals)[0];
......@@ -97,19 +98,23 @@ int APPLY_SPECIFIC(multinomial)(PyGpuArrayObject *pvals,
assert(nb_blocks*nb_threads >= nb_multi);
k_multi_warp_APPLYSPECIFIC(multinomial)<<<n_blocks, n_threads, n_shared>>>(
k_multi_warp_multinomial<<<n_blocks, n_threads, n_shared>>>(
PyGpuArray_DIMS(*out)[1],
PyGpuArray_DIMS(*out)[0],
PyGpuArray_DEV_DATA(%(pvals)s),
PyGpuArray_STRIDES(%(pvals)s)[0],
PyGpuArray_STRIDES(%(pvals)s)[1],
PyGpuArray_DEV_DATA(%(unis)s),
PyGpuArray_STRIDES(%(unis)s)[0],
PyGpuArray_DEV_DATA(*out),
(float*)PyGpuArray_DEV_DATA(pvals),
PyGpuArray_STRIDES(pvals)[0],
PyGpuArray_STRIDES(pvals)[1],
(float*)PyGpuArray_DEV_DATA(unis),
PyGpuArray_STRIDES(unis)[0],
(float*)PyGpuArray_DEV_DATA(*out),
PyGpuArray_STRIDES(*out)[0],
PyGpuArray_STRIDES(*out)[1]
);
CNDA_THREAD_SYNC;
//TODO
//if(false)//SYNC)
// GpuArray_sync((*out)->ga);
// SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
......
......@@ -2,19 +2,32 @@ import os
import pygpu
import theano
import theano.sandbox.multinomial
from theano import Apply
from theano.gof import COp
from theano.gof import COp, local_optimizer
from .basic_ops import as_gpuarray_variable, infer_context_name
from .type import gpu_context_type, GpuArrayType
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox import gpuarray
from theano.sandbox.gpuarray.opt import register_opt, op_lifter
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
class GPUAMultinomialFromUniform(COp):
__props__ = ("odtype",)
params_type = gpu_context_type
def __init__(self, odtype):
COp.__init__(self, ['multinomial.c'], 'APPLY_SPECIFIC(multinomial)')
self.odtype = odtype
def get_params(self, node):
return node.outputs[0].type.context
def __init__(self):
COp.__init__(self, ['multinomial.c'], 'APPLY_SPECIFIC(multinomial)')
def c_compiler(self):
# TODO: get rid of this
return NVCC_compiler
def c_headers(self):
return ['<numpy_compat.h>', 'gpuarray_helper.h']
......@@ -30,10 +43,44 @@ class GPUAMultinomialFromUniform(COp):
pvals = as_gpuarray_variable(pvals, ctx_name)
unis = as_gpuarray_variable(unis, ctx_name)
if pvals.ndim != 2:
raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
if unis.ndim != 1:
raise NotImplementedError('unis ndim should be 1', unis.ndim)
if self.odtype == 'auto':
odtype = pvals.dtype
else:
odtype = self.odtype
if odtype != pvals.dtype:
raise NotImplementedError(
'GpuMultinomialFromUniform works only if '
'self.odtype == pvals.dtype', odtype, pvals.dtype)
br = (pvals.broadcastable[1], pvals.broadcastable[0])
out = GpuArrayType(broadcastable=br, dtype="float32")()
out = GpuArrayType(broadcastable=br, dtype=odtype)()
return Apply(self, [pvals, unis], [out])
def c_code_cache_version(self):
return (8,)
@register_opt()
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
def local_gpua_multinomial(node, context_name):
    """Lift a CPU MultinomialFromUniform node to the GPU implementation.

    Only the single-sample case (``n_samples`` constant and equal to 1)
    with all-float32 inputs/output is handled; for any other node the
    optimizer declines by returning ``None``.
    """
    inputs = node.inputs
    if len(inputs) == 2:
        p, u = inputs
        n_samples = 1
    else:
        p, u, n_samples = inputs
    # Decline unless n_samples is provably the constant 1.
    try:
        if get_scalar_constant_value(n_samples) != 1:
            return None
    except NotScalarConstantError:
        return None
    m, = node.outputs
    if not (p.dtype == u.dtype == m.dtype == 'float32'):
        return None
    gpu_out = GPUAMultinomialFromUniform(node.op.odtype)(p, u)
    # The GPU op emits the transposed layout; shuffle axes back so the
    # replacement matches the CPU op's output orientation.
    return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(gpu_out)
from __future__ import absolute_import, print_function, division
import copy
import numpy
import theano
from theano import tensor
from theano.sandbox.gpuarray.multinomial import GPUAMultinomialFromUniform
from theano import config, function, tensor
from ..multinomial import GPUAMultinomialFromUniform
from .config import mode_with_gpu
from theano.compile.mode import get_default_mode, predefined_linkers
import theano.tests.unittest_tools as utt
from .. import pygpu_activated
def get_mode(gpu):
    """Return a copy of the default compilation mode, tuned for testing.

    When ``gpu`` is true, the gpuarray optimizations are enabled.  In
    every case the linker is adjusted so C thunks are compiled, ensuring
    the C implementations under test actually run.
    """
    # Copy so the process-wide default mode object is never mutated.
    configured = copy.copy(get_default_mode())
    if gpu:
        configured = configured.including(
            'gpuarray', 'gpu_local_optimizations', 'local_cut_gpu_host_gpu')
    # A pure-Python linker would skip the C code paths being exercised.
    if isinstance(configured.linker, theano.gof.PerformLinker):
        configured.linker = predefined_linkers['c|py']
    if hasattr(configured.linker, 'c_thunks'):
        configured.linker.c_thunks = True
    return configured
def run_with_c(f, gpu=False):
    """Build the compilation mode for ``gpu`` and invoke ``f(mode, gpu)``."""
    f(get_mode(gpu), gpu)
def test_multinomial0():
# This tests the MultinomialFromUniform Op directly, not going through the
......@@ -12,8 +38,51 @@ def test_multinomial0():
p = tensor.fmatrix()
u = tensor.fvector()
m = GPUAMultinomialFromUniform()(p, u)
m = GPUAMultinomialFromUniform('auto')(p, u)
f = theano.function([p, u], m, mode=mode_with_gpu)
theano.printing.debugprint(f)
ret = f(numpy.array([[0.1, 0.2, 0.3, 0.4],
[0.1, 0.2, 0.3, 0.4]], dtype='float32'),
numpy.array([0.05, 0.05], dtype='float32'))
print(numpy.asarray(ret))
def test_multinomial_0():
# This tests the MultinomialFromUniform Op directly, not going through the
# multinomial() call in GPU random generation.
p = tensor.fmatrix()
u = tensor.fvector()
m = theano.sandbox.multinomial.MultinomialFromUniform('auto')(p, u)
def body(mode, gpu):
# the m*2 allows the multinomial to reuse output
f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
if gpu:
assert any([type(node.op) is GPUAMultinomialFromUniform
for node in f.maker.fgraph.toposort()])
# test that both first and second samples can be drawn
utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
[[2, 0], [0, 2]])
# test that both second labels can be drawn
r = f([[.2, .8], [.3, .7]], [.31, .31])
utt.assert_allclose(r, [[0, 2], [0, 2]])
# test that both first labels can be drawn
r = f([[.2, .8], [.3, .7]], [.21, .21])
utt.assert_allclose(r, [[0, 2], [2, 0]])
# change the size to make sure output gets reallocated ok
# and also make sure that the GPU version doesn't screw up the
# transposed-ness
r = f([[.2, .8]], [.25])
utt.assert_allclose(r, [[0, 2]])
assert f(numpy.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]]), numpy.array([0.05, 0.05]))
\ No newline at end of file
run_with_c(body)
if pygpu_activated:
run_with_c(body, True)
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment