Commit bb724b38 — authored by James Bergstra

Commit message: "I HATE U hg rebase Y U GO DELETE EVERYTHING"

Parent commit: c3c73138
...@@ -10,30 +10,44 @@ if cuda_available: ...@@ -10,30 +10,44 @@ if cuda_available:
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
class Multinomial(Op): class Multinomial(Op):
def __init__(self, odtype):
self.odtype=odtype
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other) and self.odtype==other.odtype
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash((type(self), self.odtype))
def __str__(self): def __str__(self):
return self.__class__.__name__ return '%s{%s}'%(self.__class__.__name__, self.odtype)
def __setstate__(self, dct):
self.__dict__.update(dct)
try:
self.odtype
except:
self.odtype='auto'
def make_node(self, pvals, unis): def make_node(self, pvals, unis):
pvals = T.as_tensor_variable(pvals) pvals = T.as_tensor_variable(pvals)
unis = T.as_tensor_variable(unis) unis = T.as_tensor_variable(unis)
#assert pvals.dtype == 'float32' if pvals.ndim != 2:
#assert unis.dtype == 'float32' raise NotImplementedError('pvals ndim', pvals.ndim)
return Apply(self, [pvals, unis], [pvals.type()]) if unis.ndim != 1:
raise NotImplementedError('unis ndim', unis.ndim)
def grad(self, inp, grads): if self.odtype=='auto':
pvals, unis = inp odtype = pvals.dtype
gz, = grads else:
odtype = self.odtype
return Apply(self, [pvals, unis], [T.matrix(dtype=odtype)])
def grad(self, ins, outs):
pvals, unis = ins
(gz,) = outs
return [None, None] return [None, None]
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (5,)
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, ins, outs, sub):
pvals, unis = inp (pvals, unis) = ins
z, = out (z,) = outs
fail = sub['fail'] fail = sub['fail']
return """ return """
...@@ -48,9 +62,9 @@ class Multinomial(Op): ...@@ -48,9 +62,9 @@ class Multinomial(Op):
%(fail)s; %(fail)s;
} }
if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1]) if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[0])
{ {
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]"); PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
%(fail)s; %(fail)s;
} }
...@@ -60,15 +74,10 @@ class Multinomial(Op): ...@@ -60,15 +74,10 @@ class Multinomial(Op):
) )
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = (%(pvals)s->dimensions)[0];
dims[1] = (%(pvals)s->dimensions)[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(2, %(z)s = (PyArrayObject*) PyArray_ZEROS(2,
dims, %(pvals)s->dimensions,
type_num_%(pvals)s, type_num_%(z)s,
0); 0);
if (!%(z)s) if (!%(z)s)
{ {
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output"); PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
...@@ -78,32 +87,37 @@ class Multinomial(Op): ...@@ -78,32 +87,37 @@ class Multinomial(Op):
{ // NESTED SCOPE { // NESTED SCOPE
const int nb_outcomes = %(pvals)s->dimensions[0]; const int nb_multi = %(pvals)s->dimensions[0];
const int nb_multi = %(pvals)s->dimensions[1]; const int nb_outcomes = %(pvals)s->dimensions[1];
// //
// For each multinomials, loop over each possible outcome // For each multinomials, loop over each possible outcome
// //
for (int n = 0; n < nb_multi; ++n) for (int n = 0; n < nb_multi; ++n)
{ {
int waiting = 1;
dtype_%(pvals)s cummul = 0.; dtype_%(pvals)s cummul = 0.;
const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n); const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);
for (int m = 0; m < nb_outcomes; ++m) for (int m = 0; m < nb_outcomes; ++m)
{ {
dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n); dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, n,m);
const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n); const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, n,m);
cummul += *pvals_nm; cummul += *pvals_nm;
if (*unis_n < cummul) if (waiting && (cummul > *unis_n))
{ {
*z_nm = 1.; *z_nm = 1.;
break; waiting = 0;
}
else
{
// if we re-used old z pointer, we have to clear it out.
*z_nm = 0.;
} }
} }
} }
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % locals()
multinomial = Multinomial() #multinomial = Multinomial()
class GpuMultinomial(Multinomial): class GpuMultinomial(Multinomial):
...@@ -115,11 +129,16 @@ class GpuMultinomial(Multinomial): ...@@ -115,11 +129,16 @@ class GpuMultinomial(Multinomial):
raise TypeError('pvals must be cudandarray', pvals) raise TypeError('pvals must be cudandarray', pvals)
if not isinstance(unis.type, CudaNdarrayType): if not isinstance(unis.type, CudaNdarrayType):
raise TypeError('unis must be cudandarray', unis) raise TypeError('unis must be cudandarray', unis)
if self.odtype=='auto':
odtype = pvals.dtype
else:
odtype = self.odtype
if odtype != pvals.dtype:
raise NotImplementedError()
return Apply(self, [pvals, unis], [pvals.type()]) return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
#return () return (6,)
return (super(GpuMultinomial,self).c_code_cache_version(),2)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return """ return """
...@@ -128,28 +147,32 @@ class GpuMultinomial(Multinomial): ...@@ -128,28 +147,32 @@ class GpuMultinomial(Multinomial):
const int nb_outcomes, const int nb_outcomes,
const int pvals_row_strides, const int pvals_row_strides,
const int pvals_col_strides, const int pvals_col_strides,
const int unis_stride,
float * global_pvals, float * global_pvals,
float * global_unis, float * global_unis,
float * global_outs float * global_outs
) )
{ {
// each thread takes care of one multinomial draw
int n = blockDim.x*blockIdx.x + threadIdx.x; int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi) if (n < nb_multi)
{ {
float cummul = 0.; float cummul = 0.;
bool done = false; bool done = false;
const float unis_n = global_unis[n*unis_stride];
for (int m = 0; m < nb_outcomes; ++m) for (int m = 0; m < nb_outcomes; ++m)
{ {
cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
float current_out = 0.; float current_out = 0.;
if (!done)
if (!done && global_unis[n] < cummul)
{ {
current_out = 1.; cummul += global_pvals[m * pvals_col_strides + n * pvals_row_strides];
done = true; if (unis_n < cummul)
{
current_out = 1.;
done = true;
}
} }
//write out transposed for speed.
global_outs[n + m * nb_multi] = current_out; global_outs[n + m * nb_multi] = current_out;
} }
} }
...@@ -158,12 +181,12 @@ class GpuMultinomial(Multinomial): ...@@ -158,12 +181,12 @@ class GpuMultinomial(Multinomial):
""" % locals() """ % locals()
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, ins, outs, sub):
pvals, unis = inp (pvals, unis) = ins
z, = out (z,) = outs
fail = sub['fail'] fail = sub['fail']
return """ return """
if (%(pvals)s->nd != 2) if (%(pvals)s->nd != 2)
{ {
PyErr_Format(PyExc_TypeError, "pvals wrong rank"); PyErr_Format(PyExc_TypeError, "pvals wrong rank");
...@@ -174,28 +197,21 @@ class GpuMultinomial(Multinomial): ...@@ -174,28 +197,21 @@ class GpuMultinomial(Multinomial):
PyErr_Format(PyExc_TypeError, "unis wrong rank"); PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s; %(fail)s;
} }
if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
%(fail)s;
}
if (!CudaNdarray_is_c_contiguous(%(unis)s))
{ {
PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous"); PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
%(fail)s; %(fail)s;
} }
// Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
// since we are working on pvals.T here
//N.B. that the output is TRANSPOSED compared with pvals
if ((NULL == %(z)s) if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]) || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])) || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]))
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
npy_intp dims[2]; npy_intp dims[2];
dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]); dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]); dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
%(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims); %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
if (!%(z)s) if (!%(z)s)
{ {
...@@ -205,9 +221,8 @@ class GpuMultinomial(Multinomial): ...@@ -205,9 +221,8 @@ class GpuMultinomial(Multinomial):
} }
{ // NESTED SCOPE { // NESTED SCOPE
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0]; int nb_multi = CudaNdarray_HOST_DIMS(%(pvals)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1]; int nb_outcomes = CudaNdarray_HOST_DIMS(%(pvals)s)[1];
//TODO : change this for a beautiful constant //TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1; int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1; int nb_blocks = max_nb_blocks + 1;
...@@ -226,20 +241,21 @@ class GpuMultinomial(Multinomial): ...@@ -226,20 +241,21 @@ class GpuMultinomial(Multinomial):
// TODO : next line is a bit hardcoded... // TODO : next line is a bit hardcoded...
if (nb_threads > 512) if (nb_threads > 512)
{ {
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi); PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%%i)", nb_multi);
%(fail)s; %(fail)s;
} }
dim3 n_blocks(nb_blocks,1,1); dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1); dim3 n_threads(nb_threads,1,1);
int n_shared = 0; int n_shared = 0;
assert(nb_blocks*nb_threads >= nb_multi);
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>( k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(z)s)[1], CudaNdarray_HOST_DIMS(%(z)s)[1],
CudaNdarray_HOST_DIMS(%(z)s)[0], CudaNdarray_HOST_DIMS(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[0], CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[1], CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
CudaNdarray_HOST_STRIDES(%(unis)s)[0],
CudaNdarray_DEV_DATA(%(pvals)s), CudaNdarray_DEV_DATA(%(pvals)s),
CudaNdarray_DEV_DATA(%(unis)s), CudaNdarray_DEV_DATA(%(unis)s),
CudaNdarray_DEV_DATA(%(z)s) CudaNdarray_DEV_DATA(%(z)s)
...@@ -262,12 +278,271 @@ class GpuMultinomial(Multinomial): ...@@ -262,12 +278,271 @@ class GpuMultinomial(Multinomial):
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % locals()
gpu_multinomial = GpuMultinomial()
@local_optimizer() @local_optimizer()
def use_gpu_multinomial(node): def use_gpu_multinomial(node):
if node.op == multinomial: if node.op == multinomial:
return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))] p, u = node.inputs
m, = node.outputs
if p.dtype == u.dtype == m.dtype == 'float32':
gpu_op = GpuMultinomial(op.odtype)
return [host_from_gpu(gpu_op(*[gpu_from_host(i) for i in node.inputs]))]
if cuda_enabled:#theano.config.device.startswith('gpu'): if cuda_enabled:#theano.config.device.startswith('gpu'):
register_specialize(use_gpu_multinomial) register_specialize(use_gpu_multinomial)
# NOTE(review): everything below is dead code -- guarded by `if 0:` -- an older
# copy of the Multinomial/GpuMultinomial Ops kept around after an `hg rebase`
# mishap (see the author's inline comment). It never executes.
# NOTE(review): leading indentation appears to have been stripped when this
# text was extracted; these definitions were presumably nested under `if 0:`
# and under their `class`/`def` statements -- TODO confirm against the
# original file.
if 0: # I hate you hg rebase, I hate you so very, very much.
class Multinomial(Op):
# CPU Op: given a probability matrix `pvals` and a vector of uniform samples
# `unis`, generate C code that writes one one-hot multinomial draw per
# distribution into an output matrix z of the same shape as pvals.
# In this old version the per-multinomial axis is pvals' *second* dimension:
# the generated C uses nb_multi = pvals->dimensions[1] and indexes with
# PyArray_GETPTR2(..., m, n) (outcome index first), i.e. one multinomial per
# column.
# The Op carries no parameters, so instances compare equal and hash by type
# alone -- this lets Theano merge identical apply nodes.
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return self.__class__.__name__
def make_node(self, pvals, unis):
# Wrap raw inputs as Theano tensor variables; the single output shares
# pvals' type (same dtype and broadcastable pattern).
pvals = T.as_tensor_variable(pvals)
unis = T.as_tensor_variable(unis)
#assert pvals.dtype == 'float32'
#assert unis.dtype == 'float32'
return Apply(self, [pvals, unis], [pvals.type()])
def grad(self, inp, grads):
# Sampling is not differentiable with respect to either input.
pvals, unis = inp
gz, = grads
return [None, None]
def c_code_cache_version(self):
# Bump this whenever the generated C below changes so cached compiled
# modules are invalidated.
return (3,)
def c_code(self, node, name, inp, out, sub):
# Emit C that: validates ranks (pvals 2-d, unis 1-d) and the shape match
# unis.shape[0] == pvals.shape[1]; (re)allocates z as a zeroed array of
# pvals' shape and dtype; then for each multinomial n accumulates pvals
# into `cummul` and sets z[m,n] = 1 at the first outcome m where
# unis[n] < cummul (inverse-CDF sampling), breaking out of the inner loop.
# No comments are added inside the string below: it is a runtime template
# filled in via `% locals()`.
pvals, unis = inp
z, = out
fail = sub['fail']
return """
if (%(pvals)s->nd != 2)
{
PyErr_Format(PyExc_TypeError, "pvals wrong rank");
%(fail)s;
}
if (%(unis)s->nd != 1)
{
PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s;
}
if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1])
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
%(fail)s;
}
if ((NULL == %(z)s)
|| ((%(z)s->dimensions)[0] != (%(pvals)s->dimensions)[0])
|| ((%(z)s->dimensions)[1] != (%(pvals)s->dimensions)[1])
)
{
Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = (%(pvals)s->dimensions)[0];
dims[1] = (%(pvals)s->dimensions)[1];
%(z)s = (PyArrayObject*) PyArray_ZEROS(2,
dims,
type_num_%(pvals)s,
0);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
%(fail)s;
}
}
{ // NESTED SCOPE
const int nb_outcomes = %(pvals)s->dimensions[0];
const int nb_multi = %(pvals)s->dimensions[1];
//
// For each multinomials, loop over each possible outcome
//
for (int n = 0; n < nb_multi; ++n)
{
dtype_%(pvals)s cummul = 0.;
const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);
for (int m = 0; m < nb_outcomes; ++m)
{
dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n);
const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n);
cummul += *pvals_nm;
if (*unis_n < cummul)
{
*z_nm = 1.;
break;
}
}
}
} // END NESTED SCOPE
""" % locals()
# Module-level singleton instance (dead code: it is inside the `if 0:` guard,
# and the live part of the file appears to construct Multinomial(odtype)
# instead -- see the diff above).
multinomial = Multinomial()
class GpuMultinomial(Multinomial):
# GPU version of Multinomial (dead code: inside the `if 0:` guard above).
# Inherits __eq__/__hash__/__str__/grad from Multinomial, but requires
# float32 CudaNdarray inputs and generates a CUDA kernel plus host-side
# launch glue instead of plain CPU C.
def make_node(self, pvals, unis):
# Both inputs must already live on the GPU as float32; the single output
# shares pvals' type.
assert pvals.dtype == 'float32'
assert unis.dtype == 'float32'
if not isinstance(pvals.type, CudaNdarrayType):
raise TypeError('pvals must be cudandarray', pvals)
if not isinstance(unis.type, CudaNdarrayType):
raise TypeError('unis must be cudandarray', unis)
return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self):
# Extend the parent's version tuple so a change to either class's
# generated C invalidates the cached compiled module.
#return ()
return (super(GpuMultinomial,self).c_code_cache_version(),2)
def c_support_code_apply(self, node, nodename):
# CUDA kernel: one thread per multinomial n (n = blockDim.x*blockIdx.x +
# threadIdx.x, guarded by n < nb_multi). Each thread scans the outcomes,
# accumulating pvals until the uniform sample is exceeded, and writes a
# one-hot result with layout global_outs[n + m * nb_multi].
# NOTE(review): global_unis is indexed as global_unis[n] with no stride --
# this relies on the contiguity check for `unis` in c_code below.
# No comments are added inside the string: it is a runtime template.
return """
static __global__ void k_multi_warp_%(nodename)s(
const int nb_multi,
const int nb_outcomes,
const int pvals_row_strides,
const int pvals_col_strides,
float * global_pvals,
float * global_unis,
float * global_outs
)
{
int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi)
{
float cummul = 0.;
bool done = false;
for (int m = 0; m < nb_outcomes; ++m)
{
cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
float current_out = 0.;
if (!done && global_unis[n] < cummul)
{
current_out = 1.;
done = true;
}
global_outs[n + m * nb_multi] = current_out;
}
}
}
""" % locals()
def c_code(self, node, name, inp, out, sub):
# Host-side glue: validate ranks and the shape match unis.shape[0] ==
# pvals.shape[1], require unis to be C-contiguous, (re)allocate z to
# pvals' shape, compute a launch configuration (grow nb_threads from 32
# upward until the block count fits), launch the kernel, then check the
# CUDA error status.
# NOTE(review): `2<<15 - 1` parses as 2 << (15 - 1) == 32768 because `-`
# binds tighter than `<<`; presumably (2<<15)-1 == 65535 (the classic
# gridDim.x limit) was intended. Flagged, not fixed: this branch is dead.
# NOTE(review): the kernel call passes HOST_DIMS(z)[1] as nb_multi and
# HOST_DIMS(z)[0] as nb_outcomes -- consistent with the CPU version's
# one-multinomial-per-column layout; verify against the live code above,
# which transposed this convention.
pvals, unis = inp
z, = out
fail = sub['fail']
return """
if (%(pvals)s->nd != 2)
{
PyErr_Format(PyExc_TypeError, "pvals wrong rank");
%(fail)s;
}
if (%(unis)s->nd != 1)
{
PyErr_Format(PyExc_TypeError, "unis wrong rank");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
{
PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
%(fail)s;
}
if (!CudaNdarray_is_c_contiguous(%(unis)s))
{
PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous");
%(fail)s;
}
// Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
// since we are working on pvals.T here
if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1]))
{
Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
%(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
%(fail)s;
}
}
{ // NESTED SCOPE
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
//TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1;
int nb_threads=16; // so it really starts at 32, because of the *2
do
{
nb_threads*=2;
if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
//printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
// TODO : next line is a bit hardcoded...
if (nb_threads > 512)
{
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
%(fail)s;
}
dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1);
int n_shared = 0;
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(z)s)[1],
CudaNdarray_HOST_DIMS(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
CudaNdarray_DEV_DATA(%(pvals)s),
CudaNdarray_DEV_DATA(%(unis)s),
CudaNdarray_DEV_DATA(%(z)s)
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
"k_multi_warp_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
n_shared);
%(fail)s;
}
} // END NESTED SCOPE
""" % locals()
# Module-level singleton instance (dead code: inside the `if 0:` guard).
gpu_multinomial = GpuMultinomial()
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment