Commit e9e07abd authored by Amjad Almahairi, committed by Arnaud Bergeron

first trial - unfinished

first trial - midwork
start local memory debugging
debugging
initial test
fixed bugs
modified test
moved test file
debugging
first trial - finished
Parent d5944c96
@@ -11,11 +11,13 @@ import theano
import theano.sandbox.multinomial
from theano import Apply, config
from theano.gof import Op
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano import gpuarray
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from .basic_ops import as_gpuarray_variable, infer_context_name
from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType
from theano.scalar import as_scalar
class GPUAMultinomialFromUniform(gpuarray.basic_ops.GpuKernelBase, Op):
@@ -227,6 +229,239 @@ KERNEL void k_multi_warp_multinomial(
        return (1,)
class GPUAMultinomialWOReplacementFromUniform(gpuarray.basic_ops.GpuKernelBase, Op):
"""
The output is transposed compared to MultinomialWOReplacementFromUniform.
We must insert a Transpose op after it.
The optimization that moves it to the gpu does it.
"""
__props__ = ("odtype",)
def __init__(self, odtype):
Op.__init__(self)
self.odtype = odtype
def get_params(self, node):
return node.outputs[0].type.context
def c_headers(self):
return ['<numpy_compat.h>', 'gpuarray_helper.h']
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def make_node(self, pvals, unis, n):
assert pvals.dtype == 'float32'
assert unis.dtype == 'float32'
ctx_name = infer_context_name(pvals, unis)
pvals = as_gpuarray_variable(pvals, ctx_name)
unis = as_gpuarray_variable(unis, ctx_name)
if pvals.ndim != 2:
raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
if unis.ndim != 1:
raise NotImplementedError('unis ndim should be 1', unis.ndim)
if self.odtype == 'auto':
odtype = 'int64'
else:
odtype = self.odtype
assert odtype == 'int64', odtype
br = (pvals.broadcastable[1], pvals.broadcastable[0])
out = GpuArrayType(broadcastable=br,
dtype=odtype,
context_name=ctx_name)()
return Apply(self, [pvals, unis, as_scalar(n)], [out])
def gpu_kernels(self, node, name):
code = """
KERNEL void k_multi_warp_multinomial_wor(
const ga_size nb_multi,
const ga_size nb_outcomes,
const ga_size n_samples,
GLOBAL_MEM float * global_pvals_copy,
const ga_ssize pvals_row_stride,
const ga_ssize pvals_col_stride,
GLOBAL_MEM float * global_unis,
const ga_ssize unis_stride,
GLOBAL_MEM ga_long * global_outs,
const ga_ssize outs_row_stride,
const ga_ssize outs_col_stride
)
{
// each thread takes care of one multinomial-wor n_samples-draw
int n = LDIM_0*GID_0 + LID_0;
if (n < nb_multi)
{
for (int c = 0; c < n_samples; ++c)
{
float cummul = 0.;
bool done = false;
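                // one uniform value per (draw, row) pair: unis holds n_samples
                // consecutive blocks of nb_multi values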
const float unis_n = global_unis[(c * nb_multi + n)*unis_stride];
for (ga_size m = 0; m < nb_outcomes; ++m)
{
float pvals_nm = global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride];
cummul += pvals_nm;
if (!done && unis_n < cummul)
{
//write out transposed for speed.
global_outs[n * outs_col_stride +
c * outs_row_stride] = m;
global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride] = 0.0;
cummul -= pvals_nm;
done = true;
}
}
                // renormalize the remaining probabilities so they sum to 1 for the next draw
for (ga_int k = 0; k < nb_outcomes; ++k)
{
global_pvals_copy[k * pvals_col_stride + n * pvals_row_stride] /= cummul;
}
}
}
}
"""
return [gpuarray.basic_ops.Kernel(
code=code, name="k_multi_warp_multinomial_wor",
params=[pygpu.gpuarray.SIZE,
pygpu.gpuarray.SIZE,
pygpu.gpuarray.SIZE,
pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SSIZE,
pygpu.gpuarray.SSIZE,
pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SSIZE,
pygpu.gpuarray.GpuArray,
pygpu.gpuarray.SSIZE,
pygpu.gpuarray.SSIZE
],
flags=gpuarray.basic_ops.Kernel.get_flags(node.outputs[0].dtype),
objvar='k_multi_warp_multinomial_wor_' + name)]
def c_code(self, node, name, inp, outputs, sub):
pvals, unis, n = inp
out, = outputs
fail = sub['fail']
ctx = sub['params']
sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar
s = """
PyGpuArrayObject * pvals = %(pvals)s;
PyGpuArrayObject * unis = %(unis)s;
const size_t n_samples = %(n)s;
PyGpuArrayObject * out = %(out)s;
// create a copy of pvals matrix
PyGpuArrayObject * pvals_copy = NULL;
size_t dims[2];
if (PyGpuArray_NDIM(pvals) != 2)
{
PyErr_Format(PyExc_TypeError, "pvals wrong rank");
%(fail)s
}
if (PyGpuArray_NDIM(unis) != 1)
{
PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s
}
if ( n_samples > (PyGpuArray_DIMS(pvals)[1]) )
{
PyErr_Format(PyExc_ValueError, "Cannot sample without replacement n samples bigger than the size of the distribution.");
%(fail)s;
}
if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0] * n_samples)
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0] * n");
%(fail)s
}
        pvals_copy = pygpu_copy(pvals, GA_C_ORDER);
        if (pvals_copy == NULL)
        {
            %(fail)s
        }
dims[0] = n_samples;
dims[1] = PyGpuArray_DIMS(pvals)[0];
if (theano_prep_output(&out, 2, dims, GA_LONG,
GA_C_ORDER, %(ctx)s) != 0){
%(fail)s
}
%(out)s = out;
{ // NESTED SCOPE
int nb_multi = PyGpuArray_DIMS(pvals)[0];
int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
            // TODO : replace this with a proper device constant
            int max_nb_blocks = (2 << 15) - 1;
            size_t nb_blocks = max_nb_blocks + 1;
            size_t nb_threads = 16; // doubled before first use, so the search starts at 32
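            // grow nb_threads by powers of two until the required grid size
            // (nb_blocks) fits under max_nb_blocks; each thread handles one row of pvals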
do
{
nb_threads*=2;
if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
// TODO : next line is a bit hardcoded...
if (nb_threads > 512)
{
PyErr_Format(
PyExc_ValueError,
"Multinomial is not implemented for so many rows in the matrix (%%i)",
nb_multi);
%(fail)s
}
assert(nb_blocks*nb_threads >= nb_multi);
void *args[11];
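            // strides are in elements, not bytes: pvals and unis hold float32,
            // out holds int64 (hence the division by 8)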
ssize_t strides[5] = {
PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
PyGpuArray_STRIDES(unis)[0]/sizeof(float),
PyGpuArray_STRIDES(out)[0]/8,
PyGpuArray_STRIDES(out)[1]/8
};
int err;
args[0] = (void*)&PyGpuArray_DIMS(pvals)[0];
args[1] = (void*)&PyGpuArray_DIMS(pvals)[1];
args[2] = (void*)&n_samples;
args[3] = pvals_copy->ga.data; //PyGpuArray_DEV_DATA(pvals);
args[4] = (void*)&strides[0];
args[5] = (void*)&strides[1];
args[6] = unis->ga.data; //PyGpuArray_DEV_DATA(unis);
args[7] = (void*)&strides[2];
args[8] = out->ga.data; //PyGpuArray_DEV_DATA(out);
args[9] = (void*)&strides[3];
args[10] = (void*)&strides[4];
            // GpuKernel_call takes the grid size (blocks) before the block size (threads)
            err = GpuKernel_call(&%(kname)s, 1, &nb_blocks, &nb_threads, 0, args);
if (err != GA_NO_ERROR) {
PyErr_Format(
PyExc_RuntimeError,
"gpuarray error: %%s: %%s.\\n",
"k_multi_warp_%(name)s",
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
            if(%(sync)d)
                GpuArray_sync(&(out->ga));
            // release the temporary copy of pvals
            Py_DECREF(pvals_copy);
        } // END NESTED SCOPE
""" % locals()
return s
def c_code_cache_version(self):
return (1,)
@register_opt('fast_compile')
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
@register_opt2([theano.sandbox.multinomial.MultinomialFromUniform], 'fast_compile')
@@ -248,3 +483,20 @@ def local_gpua_multinomial(op, context_name, inputs, outputs):
        gpu_op = GPUAMultinomialFromUniform(op.odtype)
        return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
            gpu_op(p, u))
@register_opt()
@op_lifter([theano.sandbox.multinomial.MultinomialWOReplacementFromUniform])
def local_gpua_multinomial_wor(op, context_name, inputs, outputs):
    # TODO : need description for function
    p, u, n = inputs
    # try:
    #     if get_scalar_constant_value(n) != 1:
    #         return None
    # except NotScalarConstantError:
    #     return None
    m, = outputs
    if ((p.dtype == u.dtype == 'float32') and (m.dtype == 'int64')):
        gpu_op = GPUAMultinomialWOReplacementFromUniform(op.odtype)
        return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
            gpu_op(p, u, n))
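For context (not part of the commit), a minimal usage sketch of how this op is expected to be reached: the graph is built with the CPU op theano.sandbox.multinomial.MultinomialWOReplacementFromUniform, and when Theano runs with a gpuarray device configured the lifter registered above should swap in GPUAMultinomialWOReplacementFromUniform followed by the GpuDimShuffle. The odtype argument, input shapes, and toy data below are illustrative assumptions, not taken from this commit.

# Hedged usage sketch; assumes Theano with the gpuarray backend (run with
# device=cuda* for the optimization above to move the op to the GPU).
import numpy
import theano
import theano.tensor as T
import theano.sandbox.multinomial

pvals = T.fmatrix('pvals')   # one probability distribution per row (float32, rows sum to 1)
unis = T.fvector('unis')     # pvals.shape[0] * n uniform(0, 1) draws (float32)
n = T.iscalar('n')           # number of samples to draw without replacement per row

wor_op = theano.sandbox.multinomial.MultinomialWOReplacementFromUniform('auto')
samples = wor_op(pvals, unis, n)   # int64 indices of the sampled outcomes

f = theano.function([pvals, unis, n], samples)

rng = numpy.random.RandomState(0)
p = rng.dirichlet(numpy.ones(8), size=4).astype('float32')   # 4 distributions over 8 outcomes
u = rng.uniform(size=4 * 2).astype('float32')                # 2 draws per distribution
print(f(p, u, 2))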