提交 aa483425 authored 作者: Frederic Bastien's avatar Frederic Bastien

Make GPUAMultinomialFromUniform use GpuKernelBase.

上级 d88c1a84
# TODO test dtype != float32
import os import os
import pygpu import pygpu
import theano import theano
import theano.sandbox.multinomial import theano.sandbox.multinomial
from theano import Apply from theano import Apply, config
from theano.gof import COp, local_optimizer from theano.gof import Op, local_optimizer
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano.sandbox import gpuarray
from .basic_ops import as_gpuarray_variable, infer_context_name from .basic_ops import as_gpuarray_variable, infer_context_name
from .fp16_help import write_w
from .opt import register_opt, op_lifter
from .type import gpu_context_type, GpuArrayType from .type import gpu_context_type, GpuArrayType
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox import gpuarray
from theano.sandbox.gpuarray.opt import register_opt, op_lifter
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
class GPUAMultinomialFromUniform(gpuarray.basic_ops.GpuKernelBase, Op):
    """Draw one multinomial sample per row of `pvals` on the GPU.

    Each row of `pvals` is a probability distribution over
    ``nb_outcomes`` outcomes; `unis` supplies one uniform [0, 1)
    sample per row.  The output is written transposed
    (``outcomes x rows``) for speed -- see the kernel in
    :meth:`gpu_kernels`.

    Parameters
    ----------
    odtype
        Requested dtype of the output (NOTE(review): only float32 is
        exercised here -- see the file-level TODO about other dtypes).
    """

    __props__ = ("odtype",)
    # get_params() hands the gpu context to the generated C code.
    params_type = gpu_context_type

    def __init__(self, odtype):
        Op.__init__(self)
        self.odtype = odtype

    def get_params(self, node):
        # The kernel must be launched on the context that owns the
        # output buffer.
        return node.outputs[0].type.context

    def c_headers(self):
        return ['<numpy_compat.h>', 'gpuarray_helper.h']
...@@ -60,9 +57,180 @@ class GPUAMultinomialFromUniform(COp): ...@@ -60,9 +57,180 @@ class GPUAMultinomialFromUniform(COp):
return Apply(self, [pvals, unis], [out]) return Apply(self, [pvals, unis], [out])
def gpu_kernels(self, node, name):
    """Return the GPU kernel used by this op.

    One thread handles one multinomial draw (one row of ``pvals``):
    it walks the row accumulating probabilities until the cumulative
    sum exceeds that row's uniform sample, writes 1 at that outcome
    and 0 elsewhere.  The output is written transposed for speed.

    Parameters
    ----------
    node : Apply
        The node this kernel is generated for; only the output dtype
        is consulted (for compile flags).
    name : str
        Unique suffix used to build the kernel's ``objvar``.
    """
    dtype = node.outputs[0].dtype
    # NOTE(review): the kernel body hard-codes float buffers; dtype is
    # only used for compile flags -- confirm when enabling other dtypes.
    code = """
KERNEL void k_multi_warp_multinomial(
        const ga_size nb_multi,
        const ga_size nb_outcomes,
        GLOBAL_MEM float * global_pvals,
        const ga_ssize pvals_row_stride,
        const ga_ssize pvals_col_stride,
        GLOBAL_MEM float * global_unis,
        const ga_ssize unis_stride,
        GLOBAL_MEM float * global_outs,
        const ga_ssize outs_row_stride,
        const ga_ssize outs_col_stride
)
{
    // each thread takes care of one multinomial draw
    int n = blockDim.x*blockIdx.x + threadIdx.x;
    if (n < nb_multi)
    {
        float cummul = 0.;
        bool done = false;
        const float unis_n = global_unis[n*unis_stride];
        for (ga_size m = 0; m < nb_outcomes; ++m)
        {
            float current_out = 0.;
            if (!done)
            {
                cummul += global_pvals[m * pvals_col_stride +
                                       n * pvals_row_stride];
                if (unis_n < cummul)
                {
                    current_out = 1.;
                    done = true;
                }
            }
            //write out transposed for speed.
            global_outs[n * outs_col_stride +
                        m * outs_row_stride] = current_out;
        }
    }
}
"""
    # Param types must match the args[] layout built in c_code():
    # 2 sizes, then (buffer, stride, stride), (buffer, stride),
    # (buffer, stride, stride).
    return [gpuarray.basic_ops.Kernel(
        code=code,
        name="k_multi_warp_multinomial",
        params=[pygpu.gpuarray.SIZE,
                pygpu.gpuarray.SIZE,
                pygpu.gpuarray.GpuArray,
                pygpu.gpuarray.SSIZE,
                pygpu.gpuarray.SSIZE,
                pygpu.gpuarray.GpuArray,
                pygpu.gpuarray.SSIZE,
                pygpu.gpuarray.GpuArray,
                pygpu.gpuarray.SSIZE,
                pygpu.gpuarray.SSIZE],
        flags=gpuarray.basic_ops.Kernel.get_flags(dtype),
        objvar='k_multi_warp_multinomial_' + name)]
def c_code(self, node, name, inp, outputs, sub):
    """Generate the host-side C code that launches the kernel.

    Validates ranks and shapes of ``pvals`` (2d) and ``unis`` (1d),
    allocates the (transposed) output, picks a launch configuration,
    and calls the kernel built by :meth:`gpu_kernels`.

    Note: inside the template, ``%%`` is the escape for a literal
    ``%`` after Python ``%``-formatting.
    """
    pvals, unis = inp
    out, = outputs
    fail = sub['fail']
    ctx = sub['params']
    sync = bool(config.gpuarray.sync)
    # objvar of the (single) kernel registered by gpu_kernels().
    kname = self.gpu_kernels(node, name)[0].objvar
    s = """
    PyGpuArrayObject * pvals = %(pvals)s;
    PyGpuArrayObject * unis = %(unis)s;
    PyGpuArrayObject * out = %(out)s;
    size_t dims[2];
    if (PyGpuArray_NDIM(pvals) != 2)
    {
        PyErr_Format(PyExc_TypeError, "pvals wrong rank");
        %(fail)s
    }
    if (PyGpuArray_NDIM(unis) != 1)
    {
        PyErr_Format(PyExc_TypeError, "unis wrong rank");
        %(fail)s
    }
    if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0])
    {
        PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
        %(fail)s
    }
    // Output is transposed: (nb_outcomes, nb_multi).
    dims[0] = PyGpuArray_DIMS(pvals)[1];
    dims[1] = PyGpuArray_DIMS(pvals)[0];
    if (theano_prep_output(&out, 2, dims, unis->ga.typecode,
                           GA_C_ORDER, %(ctx)s) != 0){
        %(fail)s
    }
    %(out)s = out;
    GpuArray_memset(&(out->ga), 0);
    { // NESTED SCOPE
        int nb_multi = PyGpuArray_DIMS(pvals)[0];
        int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
        //TODO : change this for a beautiful constant
        // NOTE(review): due to C precedence this is 2<<(15-1) == 32768,
        // not 65535 -- conservative but within the grid limit; kept as-is.
        int max_nb_blocks = 2<<15 - 1;
        size_t nb_blocks = max_nb_blocks + 1;
        size_t nb_threads=16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads*=2;
            if (nb_multi %% nb_threads == 0)
                nb_blocks = nb_multi/nb_threads;
            else
                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
        } while (nb_blocks > max_nb_blocks);
        //printf("\\nN=%%i b=%%i t=%%i t*b=%%i",
        //    nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
        // TODO : next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(
                PyExc_ValueError,
                "Multinomial is not implemented for so many rows in the matrix (%%i)",
                nb_multi);
            %(fail)s
        }
        assert(nb_blocks*nb_threads >= nb_multi);
        void *args[10];
        // Strides are converted from bytes to float elements for the kernel.
        ssize_t strides[5] = {
            PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
            PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
            PyGpuArray_STRIDES(unis)[0]/sizeof(float),
            PyGpuArray_STRIDES(out)[0]/sizeof(float),
            PyGpuArray_STRIDES(out)[1]/sizeof(float)
        };
        int err;
        args[0] = (void*)&PyGpuArray_DIMS(out)[1];
        args[1] = (void*)&PyGpuArray_DIMS(out)[0];
        args[2] = pvals->ga.data; //PyGpuArray_DEV_DATA(pvals);
        args[3] = (void*)&strides[0];
        args[4] = (void*)&strides[1];
        args[5] = unis->ga.data; //PyGpuArray_DEV_DATA(unis);
        args[6] = (void*)&strides[2];
        args[7] = out->ga.data; //PyGpuArray_DEV_DATA(out);
        args[8] = (void*)&strides[3];
        args[9] = (void*)&strides[4];
        err = GpuKernel_call(&%(kname)s, 1, &nb_threads, &nb_blocks, 0, args);
        if (err != GA_NO_ERROR) {
            PyErr_Format(
                PyExc_RuntimeError,
                "gpuarray error: %%s: %%s.\\n",
                "k_multi_warp_%(name)s",
                GpuKernel_error(&%(kname)s, err));
            %(fail)s;
        }
        if(%(sync)d)
            GpuArray_sync(&(out->ga));
    } // END NESTED SCOPE
    """ % locals()
    return s
def c_code_cache_version(self):
    """Version tag for the C code cache; bump when c_code changes."""
    version = (1,)
    return version
@register_opt() @register_opt()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论