testgroup / pytensor · Commits · c84ef1d8

Commit c84ef1d8, authored Mar 16, 2011 by Olivier Breuleux

    merge

Parents: 8ca1bf09, bb724b38

Showing 9 changed files with 621 additions and 101 deletions (+621 −101).
theano/compile/debugmode.py              +3    −1
theano/sandbox/cuda/__init__.py          +5    −1
theano/sandbox/cuda/cuda_ndarray.cu      +1    −8
theano/sandbox/cuda/var.py               +2    −2
theano/sandbox/multinomial.py            +341  −66
theano/sandbox/rng_mrg.py                +15   −7
theano/sandbox/test_multinomial.py       +74   −0
theano/tensor/raw_random.py              +134  −16
theano/tensor/tests/test_raw_random.py   +46   −0
theano/compile/debugmode.py (+3 −1)

```diff
@@ -381,6 +381,7 @@ class InvalidValueError(DebugModeError):
         client_node = self.client_node
         hint = self.hint
         specific_hint = self.specific_hint
+        context = debugprint(r, prefix='  ', depth=12, file=StringIO()).getvalue()
         return """InvalidValueError
         type(variable) = %(type_r)s
         variable       = %(r)s
@@ -393,7 +394,8 @@ class InvalidValueError(DebugModeError):
         isfinite       = %(v_isfinite)s
         client_node    = %(client_node)s
         hint           = %(hint)s
-        specific_hint  = %(specific_hint)s
+        specific_hint  = %(specific_hint)s
+        context        = ...\n%(context)s
         """ % locals()

 ########################
```
theano/sandbox/cuda/__init__.py (+5 −1)

```diff
@@ -156,7 +156,11 @@ def use(device, force=False, default_to_move_computation_to_gpu = True,
             raise EnvironmentError("You forced use of device %s, but CUDA initialization failed "
                                    "with error:\n%s" % (device, cuda_initialization_error_message))
         if not cuda_available:
-            warning('CUDA is installed, but device %s is not available' % device)
+            if cuda_initialization_error_message:
+                error_addendum = " (error: %s)" % cuda_initialization_error_message
+            else:
+                error_addendum = ""
+            warning('CUDA is installed, but device %s is not available%s' % (device, error_addendum))
             return
     if device == 'gpu':
```
theano/sandbox/cuda/cuda_ndarray.cu (+1 −8)

```diff
@@ -2427,6 +2427,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
     if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other) && size == size_source)
     {
         cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);
+        CNDA_THREAD_SYNC;
         if (CUBLAS_STATUS_SUCCESS != cublasGetError())
         {
             PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
@@ -2442,14 +2443,6 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
         {
             // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS
             assert(0);
-            assert (size==1);
-            cublasScopy(1, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);
-            CNDA_THREAD_SYNC;
-            if (CUBLAS_STATUS_SUCCESS != cublasGetError())
-            {
-                PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
-                return -1;
-            }
         }; break;
         case 1: // vector
         {
```
theano/sandbox/cuda/var.py (+2 −2)

```diff
@@ -132,9 +132,9 @@ class CudaNdarraySharedVariable(SharedVariable, _operators):
             return other._as_CudaNdarrayVariable()
         if not isinstance(other.type, tensor.TensorType):
-            raise TypeError('Incompatible type', other.type)
+            raise TypeError('Incompatible type', (self, (self.type, other.type)))
         if (other.type.dtype != self.dtype):
-            raise TypeError('Incompatible dtype', (self.dtype, other.type.dtype))
+            raise TypeError('Incompatible dtype', (self, (self.dtype, other.type.dtype)))
         if (other.type.broadcastable != self.broadcastable):
             raise TypeError('Incompatible broadcastable', (self, (self.broadcastable, other.type.broadcastable)))
```
theano/sandbox/multinomial.py (+341 −66)

```diff
@@ -10,30 +10,44 @@ if cuda_available:
     from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
 
 class Multinomial(Op):
+    def __init__(self, odtype):
+        self.odtype = odtype
     def __eq__(self, other):
-        return type(self) == type(other)
+        return type(self) == type(other) and self.odtype == other.odtype
     def __hash__(self):
-        return hash(type(self))
+        return hash((type(self), self.odtype))
     def __str__(self):
-        return self.__class__.__name__
+        return '%s{%s}' % (self.__class__.__name__, self.odtype)
+    def __setstate__(self, dct):
+        self.__dict__.update(dct)
+        try:
+            self.odtype
+        except:
+            self.odtype = 'auto'
     def make_node(self, pvals, unis):
         pvals = T.as_tensor_variable(pvals)
         unis = T.as_tensor_variable(unis)
         #assert pvals.dtype == 'float32'
         #assert unis.dtype == 'float32'
-        return Apply(self, [pvals, unis], [pvals.type()])
-    def grad(self, inp, grads):
-        pvals, unis = inp
-        gz, = grads
+        if pvals.ndim != 2:
+            raise NotImplementedError('pvals ndim', pvals.ndim)
+        if unis.ndim != 1:
+            raise NotImplementedError('unis ndim', unis.ndim)
+        if self.odtype == 'auto':
+            odtype = pvals.dtype
+        else:
+            odtype = self.odtype
+        return Apply(self, [pvals, unis], [T.matrix(dtype=odtype)])
+    def grad(self, ins, outs):
+        pvals, unis = ins
+        (gz,) = outs
         return [None, None]
     def c_code_cache_version(self):
-        return (3,)
-    def c_code(self, node, name, inp, out, sub):
-        pvals, unis = inp
-        z, = out
+        return (5,)
+    def c_code(self, node, name, ins, outs, sub):
+        (pvals, unis) = ins
+        (z,) = outs
         fail = sub['fail']
         return """
@@ -48,9 +62,9 @@ class Multinomial(Op):
             %(fail)s;
         }
-        if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1])
+        if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[0])
         {
-            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
+            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
             %(fail)s;
         }
@@ -60,15 +74,10 @@ class Multinomial(Op):
             )
         {
             Py_XDECREF(%(z)s);
-            npy_intp dims[2];
-            dims[0] = (%(pvals)s->dimensions)[0];
-            dims[1] = (%(pvals)s->dimensions)[1];
             %(z)s = (PyArrayObject*) PyArray_ZEROS(2,
-                dims,
-                type_num_%(pvals)s,
+                %(pvals)s->dimensions,
+                type_num_%(z)s,
                 0);
             if (!%(z)s)
             {
                 PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
@@ -78,32 +87,37 @@ class Multinomial(Op):
         { // NESTED SCOPE
-        const int nb_outcomes = %(pvals)s->dimensions[0];
-        const int nb_multi = %(pvals)s->dimensions[1];
+        const int nb_multi = %(pvals)s->dimensions[0];
+        const int nb_outcomes = %(pvals)s->dimensions[1];
 
         //
         // For each multinomials, loop over each possible outcome
         //
         for (int n = 0; n < nb_multi; ++n)
         {
+            int waiting = 1;
             dtype_%(pvals)s cummul = 0.;
             const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);
             for (int m = 0; m < nb_outcomes; ++m)
             {
-                dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n);
-                const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n);
+                dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, n,m);
+                const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, n,m);
                 cummul += *pvals_nm;
-                if (*unis_n < cummul)
+                if (waiting && (cummul > *unis_n))
                 {
                     *z_nm = 1.;
-                    break;
+                    waiting = 0;
                 }
+                else
+                {
+                    // if we re-used old z pointer, we have to clear it out.
+                    *z_nm = 0.;
+                }
             }
         }
         } // END NESTED SCOPE
         """ % locals()
 
-multinomial = Multinomial()
+#multinomial = Multinomial()
 
 class GpuMultinomial(Multinomial):
@@ -115,11 +129,16 @@ class GpuMultinomial(Multinomial):
             raise TypeError('pvals must be cudandarray', pvals)
         if not isinstance(unis.type, CudaNdarrayType):
             raise TypeError('unis must be cudandarray', unis)
+        if self.odtype == 'auto':
+            odtype = pvals.dtype
+        else:
+            odtype = self.odtype
+        if odtype != pvals.dtype:
+            raise NotImplementedError()
         return Apply(self, [pvals, unis], [pvals.type()])
     def c_code_cache_version(self):
         #return ()
-        return (super(GpuMultinomial, self).c_code_cache_version(), 2)
+        return (6,)
     def c_support_code_apply(self, node, nodename):
         return """
@@ -128,28 +147,32 @@ class GpuMultinomial(Multinomial):
         const int nb_outcomes,
         const int pvals_row_strides,
         const int pvals_col_strides,
+        const int unis_stride,
         float * global_pvals,
         float * global_unis,
         float * global_outs
     )
     {
+        // each thread takes care of one multinomial draw
         int n = blockDim.x*blockIdx.x + threadIdx.x;
         if (n < nb_multi)
         {
             float cummul = 0.;
             bool done = false;
+            const float unis_n = global_unis[n*unis_stride];
             for (int m = 0; m < nb_outcomes; ++m)
             {
-                cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
                 float current_out = 0.;
-                if (!done && global_unis[n] < cummul)
+                if (!done)
                 {
-                    current_out = 1.;
-                    done = true;
+                    cummul += global_pvals[m * pvals_col_strides + n * pvals_row_strides];
+                    if (unis_n < cummul)
+                    {
+                        current_out = 1.;
+                        done = true;
+                    }
                 }
+                //write out transposed for speed.
                 global_outs[n + m * nb_multi] = current_out;
             }
         }
@@ -158,12 +181,12 @@ class GpuMultinomial(Multinomial):
     """ % locals()
 
-    def c_code(self, node, name, inp, out, sub):
-        pvals, unis = inp
-        z, = out
+    def c_code(self, node, name, ins, outs, sub):
+        (pvals, unis) = ins
+        (z,) = outs
         fail = sub['fail']
         return """
         if (%(pvals)s->nd != 2)
         {
             PyErr_Format(PyExc_TypeError, "pvals wrong rank");
@@ -174,28 +197,21 @@ class GpuMultinomial(Multinomial):
             PyErr_Format(PyExc_TypeError, "unis wrong rank");
             %(fail)s;
         }
-        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
-        {
-            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
-            %(fail)s;
-        }
-        if (!CudaNdarray_is_c_contiguous(%(unis)s))
+        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
         {
-            PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous");
+            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
             %(fail)s;
         }
-        // Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
-        // since we are working on pvals.T here
+        //N.B. that the output is TRANSPOSED compared with pvals
         if ((NULL == %(z)s)
-            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
-            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1]))
+            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
+            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[0]))
         {
             Py_XDECREF(%(z)s);
             npy_intp dims[2];
-            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
-            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
+            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
+            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
             %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
             if (!%(z)s)
             {
@@ -205,9 +221,8 @@ class GpuMultinomial(Multinomial):
         }
         { // NESTED SCOPE
-            int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
-            int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
+            int nb_multi = CudaNdarray_HOST_DIMS(%(pvals)s)[0];
+            int nb_outcomes = CudaNdarray_HOST_DIMS(%(pvals)s)[1];
             //TODO : change this for a beautiful constant
             int max_nb_blocks = 2<<15 - 1;
             int nb_blocks = max_nb_blocks + 1;
@@ -226,20 +241,21 @@ class GpuMultinomial(Multinomial):
             // TODO : next line is a bit hardcoded...
             if (nb_threads > 512)
             {
-                PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
+                PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for so many rows in the matrix (%%i)", nb_multi);
                 %(fail)s;
             }
             dim3 n_blocks(nb_blocks,1,1);
             dim3 n_threads(nb_threads,1,1);
             int n_shared = 0;
+            assert(nb_blocks*nb_threads >= nb_multi);
             k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                 CudaNdarray_HOST_DIMS(%(z)s)[1],
                 CudaNdarray_HOST_DIMS(%(z)s)[0],
                 CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
                 CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
+                CudaNdarray_HOST_STRIDES(%(unis)s)[0],
                 CudaNdarray_DEV_DATA(%(pvals)s),
                 CudaNdarray_DEV_DATA(%(unis)s),
                 CudaNdarray_DEV_DATA(%(z)s)
@@ -262,12 +278,271 @@ class GpuMultinomial(Multinomial):
         } // END NESTED SCOPE
         """ % locals()
 
 gpu_multinomial = GpuMultinomial()
 
 @local_optimizer()
 def use_gpu_multinomial(node):
     if node.op == multinomial:
-        return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))]
+        p, u = node.inputs
+        m, = node.outputs
+        if p.dtype == u.dtype == m.dtype == 'float32':
+            gpu_op = GpuMultinomial(op.odtype)
+            return [host_from_gpu(gpu_op(*[gpu_from_host(i) for i in node.inputs]))]
 if cuda_enabled: #theano.config.device.startswith('gpu'):
     register_specialize(use_gpu_multinomial)
+
+if 0:
+    # I hate you hg rebase, I hate you so very, very much.
+    class Multinomial(Op):
+        def __eq__(self, other):
+            return type(self) == type(other)
+        def __hash__(self):
+            return hash(type(self))
+        def __str__(self):
+            return self.__class__.__name__
+        def make_node(self, pvals, unis):
+            pvals = T.as_tensor_variable(pvals)
+            unis = T.as_tensor_variable(unis)
+            #assert pvals.dtype == 'float32'
+            #assert unis.dtype == 'float32'
+            return Apply(self, [pvals, unis], [pvals.type()])
+        def grad(self, inp, grads):
+            pvals, unis = inp
+            gz, = grads
+            return [None, None]
+        def c_code_cache_version(self):
+            return (3,)
+        def c_code(self, node, name, inp, out, sub):
+            pvals, unis = inp
+            z, = out
+            fail = sub['fail']
+            return """
+        if (%(pvals)s->nd != 2)
+        {
+            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
+            %(fail)s;
+        }
+        if (%(unis)s->nd != 1)
+        {
+            PyErr_Format(PyExc_TypeError, "unis wrong rank");
+            %(fail)s;
+        }
+
+        if (%(unis)s->dimensions[0] != %(pvals)s->dimensions[1])
+        {
+            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
+            %(fail)s;
+        }
+
+        if ((NULL == %(z)s)
+            || ((%(z)s->dimensions)[0] != (%(pvals)s->dimensions)[0])
+            || ((%(z)s->dimensions)[1] != (%(pvals)s->dimensions)[1])
+        )
+        {
+            Py_XDECREF(%(z)s);
+            npy_intp dims[2];
+            dims[0] = (%(pvals)s->dimensions)[0];
+            dims[1] = (%(pvals)s->dimensions)[1];
+            %(z)s = (PyArrayObject*) PyArray_ZEROS(2,
+                dims,
+                type_num_%(pvals)s,
+                0);
+            if (!%(z)s)
+            {
+                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
+                %(fail)s;
+            }
+        }
+
+        { // NESTED SCOPE
+        const int nb_outcomes = %(pvals)s->dimensions[0];
+        const int nb_multi = %(pvals)s->dimensions[1];
+        //
+        // For each multinomials, loop over each possible outcome
+        //
+        for (int n = 0; n < nb_multi; ++n)
+        {
+            dtype_%(pvals)s cummul = 0.;
+            const dtype_%(unis)s* unis_n = (dtype_%(unis)s*)PyArray_GETPTR1(%(unis)s, n);
+            for (int m = 0; m < nb_outcomes; ++m)
+            {
+                dtype_%(z)s* z_nm = (dtype_%(z)s*)PyArray_GETPTR2(%(z)s, m,n);
+                const dtype_%(pvals)s* pvals_nm = (dtype_%(pvals)s*)PyArray_GETPTR2(%(pvals)s, m,n);
+                cummul += *pvals_nm;
+                if (*unis_n < cummul)
+                {
+                    *z_nm = 1.;
+                    break;
+                }
+            }
+        }
+        } // END NESTED SCOPE
+        """ % locals()
+
+    multinomial = Multinomial()
+
+    class GpuMultinomial(Multinomial):
+        def make_node(self, pvals, unis):
+            assert pvals.dtype == 'float32'
+            assert unis.dtype == 'float32'
+            if not isinstance(pvals.type, CudaNdarrayType):
+                raise TypeError('pvals must be cudandarray', pvals)
+            if not isinstance(unis.type, CudaNdarrayType):
+                raise TypeError('unis must be cudandarray', unis)
+            return Apply(self, [pvals, unis], [pvals.type()])
+        def c_code_cache_version(self):
+            #return ()
+            return (super(GpuMultinomial, self).c_code_cache_version(), 2)
+        def c_support_code_apply(self, node, nodename):
+            return """
+        static __global__ void k_multi_warp_%(nodename)s(
+            const int nb_multi,
+            const int nb_outcomes,
+            const int pvals_row_strides,
+            const int pvals_col_strides,
+            float * global_pvals,
+            float * global_unis,
+            float * global_outs
+        )
+        {
+            int n = blockDim.x*blockIdx.x + threadIdx.x;
+            if (n < nb_multi)
+            {
+                float cummul = 0.;
+                bool done = false;
+                for (int m = 0; m < nb_outcomes; ++m)
+                {
+                    cummul += global_pvals[n * pvals_col_strides + m * pvals_row_strides];
+                    float current_out = 0.;
+                    if (!done && global_unis[n] < cummul)
+                    {
+                        current_out = 1.;
+                        done = true;
+                    }
+                    global_outs[n + m * nb_multi] = current_out;
+                }
+            }
+        }
+        """ % locals()
+        def c_code(self, node, name, inp, out, sub):
+            pvals, unis = inp
+            z, = out
+            fail = sub['fail']
+            return """
+        if (%(pvals)s->nd != 2)
+        {
+            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
+            %(fail)s;
+        }
+        if (%(unis)s->nd != 1)
+        {
+            PyErr_Format(PyExc_TypeError, "unis wrong rank");
+            %(fail)s;
+        }
+        if (CudaNdarray_HOST_DIMS(%(unis)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[1])
+        {
+            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[1]");
+            %(fail)s;
+        }
+        if (!CudaNdarray_is_c_contiguous(%(unis)s))
+        {
+            PyErr_Format(PyExc_NotImplementedError, "require unis to be contiguous");
+            %(fail)s;
+        }
+        // Would be more efficient if pvals were also contiguous but practically I think it is not often the cas,
+        // since we are working on pvals.T here
+        if ((NULL == %(z)s)
+            || (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(pvals)s)[0])
+            || (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(pvals)s)[1]))
+        {
+            Py_XDECREF(%(z)s);
+            npy_intp dims[2];
+            dims[0] = (CudaNdarray_HOST_DIMS(%(pvals)s)[0]);
+            dims[1] = (CudaNdarray_HOST_DIMS(%(pvals)s)[1]);
+            %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
+            if (!%(z)s)
+            {
+                PyErr_SetString(PyExc_MemoryError, "failed to alloc z output");
+                %(fail)s;
+            }
+        }
+        { // NESTED SCOPE
+            int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
+            int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
+            //TODO : change this for a beautiful constant
+            int max_nb_blocks = 2<<15 - 1;
+            int nb_blocks = max_nb_blocks + 1;
+            int nb_threads=16; // so it really starts at 32, because of the *2
+            do
+            {
+                nb_threads*=2;
+                if (nb_multi %% nb_threads == 0)
+                    nb_blocks = nb_multi/nb_threads;
+                else
+                    nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
+            } while (nb_blocks > max_nb_blocks);
+            //printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
+            // TODO : next line is a bit hardcoded...
+            if (nb_threads > 512)
+            {
+                PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
+                %(fail)s;
+            }
+            dim3 n_blocks(nb_blocks,1,1);
+            dim3 n_threads(nb_threads,1,1);
+            int n_shared = 0;
+            k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
+                CudaNdarray_HOST_DIMS(%(z)s)[1],
+                CudaNdarray_HOST_DIMS(%(z)s)[0],
+                CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
+                CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
+                CudaNdarray_DEV_DATA(%(pvals)s),
+                CudaNdarray_DEV_DATA(%(unis)s),
+                CudaNdarray_DEV_DATA(%(z)s)
+            );
+            CNDA_THREAD_SYNC;
+            cudaError_t sts = cudaGetLastError();
+            if (cudaSuccess != sts)
+            {
+                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i; shared: %%i)\\n",
+                    "k_multi_warp_%(name)s",
+                    cudaGetErrorString(sts),
+                    n_blocks.x,
+                    n_blocks.y,
+                    n_threads.x,
+                    n_threads.y,
+                    n_threads.z,
+                    n_shared);
+                %(fail)s;
+            }
+        } // END NESTED SCOPE
+        """ % locals()
+    gpu_multinomial = GpuMultinomial()
```
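Among the changes above, the rewritten CPU kernel (hunk @@ -78,32 +87,37 @@) flips `pvals` to a row-per-draw layout (`PyArray_GETPTR2(..., n, m)` instead of `m, n`) and replaces the early `break` with a `waiting` flag so that every cell of a possibly reused output buffer is written. A minimal NumPy sketch of the same inverse-CDF draw (the function name and standalone setting are illustrative, not part of the commit):

```python
import numpy as np

def multinomial_onehot(pvals, unis):
    # pvals: (nb_multi, nb_outcomes), each row summing to 1
    # unis:  (nb_multi,) uniform samples in [0, 1)
    nb_multi, nb_outcomes = pvals.shape
    z = np.empty_like(pvals)
    for n in range(nb_multi):
        cummul = 0.0
        waiting = True
        for m in range(nb_outcomes):
            cummul += pvals[n, m]
            if waiting and cummul > unis[n]:
                z[n, m] = 1.0   # first outcome whose cumulative mass passes unis[n]
                waiting = False
            else:
                z[n, m] = 0.0   # cleared explicitly: the buffer may be reused
    return z

# Reproduces a deterministic case from theano/sandbox/test_multinomial.py:
print(multinomial_onehot(np.array([[.2, .8], [.3, .7]]),
                         np.array([.31, .31])))
# [[0. 1.]
#  [0. 1.]]
```

Keeping the loop running after the draw is what lets the Op reuse a previously allocated `z` without a separate zero-fill pass.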
theano/sandbox/rng_mrg.py (+15 −7)

```diff
@@ -9,13 +9,14 @@ import sys
 import numpy
 
 from theano import Op, Apply, shared, config, Variable
-from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast, opt
+from theano.tensor import (raw_random, TensorType, as_tensor_variable,
+        get_vector_length, cast, opt)
 from theano.tensor import zeros_like, sqrt, log, sin, cos, join, prod
 from theano.compile import optdb
 from theano.gof import local_optimizer
 from theano.gof.python25 import all
-from multinomial import multinomial
+import multinomial
 
 from theano.sandbox.cuda import cuda_available, cuda_enabled
 if cuda_available:
@@ -83,10 +84,12 @@ def mrg_next_value(rstate, new_rstate):
     x11, x12, x13, x21, x22, x23 = rstate
     assert type(x11) == numpy.int32
 
-    i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
+    i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i)
+            for i in (0, 7, 9, 15, 16, 22, 24)]
+
     #first component
-    y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
+    y1 = (((x12 & MASK12) << i22) + (x12 >> i9)
+            + ((x13 & MASK13) << i7) + (x13 >> i24))
 
     assert type(y1) == numpy.int32
     if (y1 < 0 or y1 >= M1):     #must also check overflow
```
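The second hunk only rewraps the first component of the MRG31k3p recurrence in parentheses, which keeps the whole expression in `numpy.int32` so the overflow check that follows it stays meaningful. A worked sketch of that component; the mask and modulus constants are defined elsewhere in rng_mrg.py, so the values here are assumptions for illustration:

```python
import numpy

# Assumed module-level constants (not shown in this hunk).
M1 = numpy.int32(2147483647)      # 2**31 - 1
MASK12 = numpy.int32(511)         # 2**9 - 1
MASK13 = numpy.int32(16777215)    # 2**24 - 1

x12, x13 = numpy.int32(12345), numpy.int32(67890)
i7, i9, i22, i24 = [numpy.int32(i) for i in (7, 9, 22, 24)]

# First component, parenthesized as in the new code; int32 in, int32 out.
y1 = (((x12 & MASK12) << i22) + (x12 >> i9)
        + ((x13 & MASK13) << i7) + (x13 >> i24))
assert type(y1) == numpy.int32
if y1 < 0 or y1 >= M1:    # must also check overflow
    y1 -= M1              # int32 wrap-around makes this a mod-M1 reduction
```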
```diff
@@ -741,10 +744,15 @@ class MRG_RandomStreams(object):
             raise TypeError("You have to specify pvals")
         pvals = as_tensor_variable(pvals)
         if n == 1 and pvals.ndim == 2:
-            unis = self.uniform(size=pvals.shape[0:1], ndim=1)
-            return cast(multinomial(pvals.T, unis).T, dtype)
+            ndim, size, bcast = raw_random._infer_ndim_bcast(ndim, size,
+                    n, pvals[:,0])
+            bcast = bcast + (pvals.type.broadcastable[-1],)
+            unis = self.uniform(size=size, ndim=1)
+            op = multinomial.Multinomial(dtype)
+            return op(pvals, unis)
         else:
-            raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2")
+            raise NotImplementedError(("MRG_RandomStreams.multinomial only"
+                " implemented with n == 1 and pvals.ndim = 2"))
 
     def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX):
         """
```
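With `import multinomial` replacing the old `from multinomial import multinomial`, the `n == 1` branch above now instantiates `multinomial.Multinomial(dtype)` on `pvals` directly instead of transposing twice through `cast(...)`. A hypothetical usage sketch of that branch; the seeding interface and defaults are assumed from the surrounding class, not shown in this diff:

```python
import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(12345)        # constructor seed: assumed interface
pvals = T.matrix('pvals')              # (batch, L), rows summing to 1
draw = srng.multinomial(pvals=pvals)   # n=1: one one-hot row per batch row
f = theano.function([pvals], draw)

p = numpy.asarray([[.2, .8], [.9, .1]], dtype=theano.config.floatX)
print(f(p))   # e.g. [[0 1], [1 0]]; each row sums to 1
```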
theano/sandbox/test_multinomial.py (new file, mode 100644, +74 −0)

```python
import numpy

from theano import tensor, shared, function
import multinomial


def test_multimomial_0():
    # This tests the multinomial Op directly, not going through the
    # multinomial() call in GPU random generation.
    p = tensor.matrix()
    u = tensor.vector()

    m = multinomial.Multinomial('auto')(p, u)

    #the m*2 allows the multinomial to reuse output
    f = function([p, u], m*2, allow_input_downcast=True)

    # test that both first and second samples can be drawn
    assert numpy.allclose(f([[1, 0], [0, 1]], [.1, .1]), [[2, 0], [0, 2]])

    # test that both second labels can be drawn
    r = f([[.2, .8], [.3, .7]], [.31, .31])
    assert numpy.allclose(r, [[0, 2], [0, 2]]), r

    # test that both first labels can be drawn
    r = f([[.2, .8], [.3, .7]], [.21, .21])
    assert numpy.allclose(r, [[0, 2], [2, 0]]), r

    #change the size to make sure output gets reallocated ok
    # and also make sure that the GPU version doesn't screw up the
    # transposed-ness
    r = f([[.2, .8]], [.25])
    assert numpy.allclose(r, [[0, 2]]), r

#TODO: check a bigger example (make sure blocking on GPU is handled correctly)


def test_multinomial_large():
    # DEBUG_MODE will test this on GPU
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.Multinomial('auto')(p, u)
    f = function([p, u], m*2, allow_input_downcast=True)

    pval = numpy.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = numpy.ones_like(pval[:, 0]) * 0.5
    mval = f(pval, uval)

    assert mval.shape == pval.shape
    assert mval.dtype == pval.dtype
    assert numpy.allclose(mval.sum(axis=1), 2)
    asdf = numpy.asarray([0, 0, 2, 0]) + 0*pval
    assert numpy.allclose(mval, asdf)  #broadcast over all rows


def test_multinomial_dtypes():
    p = tensor.dmatrix()
    u = tensor.dvector()
    m = multinomial.Multinomial('auto')(p, u)
    assert m.dtype == 'float64', m.dtype

    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.Multinomial('auto')(p, u)
    assert m.dtype == 'float32', m.dtype

    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.Multinomial('float64')(p, u)
    assert m.dtype == 'float64', m.dtype
```
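The deterministic expectations in `test_multimomial_0` follow from the inverse-CDF rule the Op implements: the drawn label in each row is the first index where the cumulative probability exceeds that row's uniform sample. A NumPy check of the expected labels:

```python
import numpy as np

p = np.array([[.2, .8], [.3, .7]])
for u in (.31, .21):
    # first index per row where the cumulative mass exceeds u
    labels = (np.cumsum(p, axis=1) > u).argmax(axis=1)
    print(u, labels)
# 0.31 -> [1 1]: one-hot [[0,1],[0,1]], so m*2 == [[0,2],[0,2]]
# 0.21 -> [1 0]: one-hot [[0,1],[1,0]], so m*2 == [[0,2],[2,0]]
```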
theano/tensor/raw_random.py (+134 −16)

```diff
@@ -236,6 +236,10 @@ class RandomFunction(gof.Op):
 
 def _infer_ndim(ndim, shape, *args):
+    ndim, ivec, bcast = _infer_ndim_bcast(ndim, shape, *args)
+    return ndim, ivec
+
+def _infer_ndim_bcast(ndim, shape, *args):
     """
     Infer the number of dimensions from the shape or the other arguments.
@@ -255,7 +259,11 @@ def _infer_ndim(ndim, shape, *args):
     else:
         args_ndim = 0
 
-    if isinstance(shape, (tuple, list)):
+    # there is a convention that -1 means the corresponding shape of a
+    # potentially-broadcasted symbolic arg
+    if (isinstance(shape, (tuple, list))
+            and numpy.all(numpy.asarray(shape) >= 0)):
+        bcast = [(s == 1) for s in shape]
         v_shape = tensor.TensorConstant(type=tensor.lvector, data=theano._asarray(shape, dtype='int64'))
         shape_ndim = len(shape)
         if ndim is None:
@@ -265,6 +273,53 @@ def _infer_ndim(ndim, shape, *args):
             raise ValueError('ndim should be equal to len(shape), but\n',
                         'ndim = %s, len(shape) = %s, shape = %s'
                         % (ndim, shape_ndim, shape))
+    elif isinstance(shape, (tuple, list)):
+        # there is a convention that -1 means the corresponding shape of a
+        # potentially-broadcasted symbolic arg
+        #
+        # This case combines together symbolic and non-symbolic shape
+        # information
+        if ndim is None:
+            ndim = args_ndim
+        else:
+            ndim = max(args_ndim, ndim)
+        ndim = max(args_ndim, len(shape))
+        shape = [-1] * (ndim - len(shape)) + list(shape)
+        bcast = []
+        pre_v_shape = []
+        for i, s in enumerate(shape):
+            if hasattr(s, 'type'):
+                # s is symbolic
+                bcast.append(False)  # todo - introspect further
+                pre_v_shape.append(s)
+            else:
+                if s >= 0:
+                    pre_v_shape.append(tensor.as_tensor_variable(s))
+                    bcast.append((s == 1))
+                elif s == -1:
+                    n_a_i = 0
+                    for a in args:
+                        # ndim: _   _   _   _   _   _
+                        # ashp:         s0  s1  s2  s3
+                        #           i
+                        if i >= ndim - a.ndim:
+                            n_a_i += 1
+                            a_i = i + a.ndim - ndim
+                            if not a.broadcastable[a_i]:
+                                pre_v_shape.append(a.shape[a_i])
+                                bcast.append(False)
+                                break
+                    else:
+                        if n_a_i == 0:
+                            raise ValueError(('Auto-shape of -1 must overlap'
+                                'with the shape of one of the broadcastable'
+                                'inputs'))
+                        else:
+                            pre_v_shape.append(tensor.as_tensor_variable(1))
+                            bcast.append(True)
+                else:
+                    ValueError('negative shape', s)
+        # post-condition: shape may still contain both symbolic and non-symbolic things
+        v_shape = tensor.stack(*pre_v_shape)
+
     elif shape is None:
         # The number of drawn samples will be determined automatically,
@@ -272,20 +327,23 @@ def _infer_ndim(ndim, shape, *args):
         v_shape = tensor.constant([], dtype='int64')
         if ndim is None:
             ndim = args_ndim
+        bcast = [False] * ndim
+        #TODO: retrieve broadcasting patterns of arguments
     else:
         v_shape = tensor.as_tensor_variable(shape)
         if ndim is None:
             ndim = tensor.get_vector_length(v_shape)
+        bcast = [False] * ndim
 
     if not (v_shape.dtype.startswith('int') or v_shape.dtype.startswith('uint')):
-        raise TypeError('shape must be an integer vector or list')
+        raise TypeError('shape must be an integer vector or list', v_shape.dtype)
 
     if args_ndim > ndim:
         raise ValueError('ndim should be at least as big as required by args value',
                     (ndim, args_ndim), args)
 
-    return ndim, v_shape
+    assert ndim == len(bcast)
+    return ndim, tensor.cast(v_shape, 'int32'), tuple(bcast)
 
 def _generate_broadcasting_indices(out_shape, *shapes):
     '''
@@ -549,29 +607,89 @@ def multinomial_helper(random_state, n, pvals, size):
     out = numpy.ndarray(out_size)
     broadcast_ind = _generate_broadcasting_indices(size, n.shape, pvals.shape[:-1])
     # Iterate over these indices, drawing from one multinomial at a time from numpy
+    assert pvals.min() >= 0
     for mi, ni, pi in zip(*broadcast_ind):
-        out[mi] = random_state.multinomial(n=n[ni], pvals=pvals[pi])
+        pvi = pvals[pi]
+
+        # This might someday be fixed upstream
+        # Currently numpy raises an exception in this method if the sum
+        # of probabilities meets or exceeds 1.0.
+        # In perfect arithmetic this would be correct, but in float32 or
+        # float64 it is too strict.
+        pisum = numpy.sum(pvi)
+        if 1.0 < pisum < 1.0 + 1e-5:
+            #correct if we went a little over
+            # because mtrand.pyx has a ValueError that will trigger if
+            # sum(pvals[:-1]) > 1.0
+            pvi = pvi * (1.0 - 5e-5)
+            pisum = numpy.sum(pvi)
+        elif pvi[-1] < 5e-5:
+            #will this even work?
+            pvi = pvi * (1.0 - 5e-5)
+            pisum = numpy.sum(pvi)
+        assert pisum <= 1.0, pisum
+        out[mi] = random_state.multinomial(n=n[ni], pvals=pvi.astype('float64'))
     return out
 
-def multinomial(random_state, size=None, n=1, pvals=[0.5, 0.5], ndim=None, dtype='int64'):
+def multinomial(random_state, size=None, n=1, pvals=[0.5, 0.5], ndim=None,
+        dtype='int64'):
     """
-    Sample n times from a multinomial distribution defined by
-    probabilities pvals, as many times as required by size. For
-    instance, if size=(p,q), p*q samples will be drawn, and the output
-    shape will be (p,q,len(pvals)).
+    Sample from one or more multinomial distributions defined by
+    one-dimensional slices in pvals.
+
+    Theano tries to infer the number of dimensions from the length of
+    the size argument and the shapes of n and pvals, but you may always
+    specify it with the `ndim` parameter.
+
+    :param pvals: a tensor of shape "nmulti+(L,)" describing each multinomial
+        distribution.  This tensor must have the property that
+        numpy.allclose(pvals.sum(axis=-1), 1) is true.
+
+    :param size: a vector of shape information for the output; this can also
+        specify the "nmulti" part of pvals' shape.  A -1 in the k'th position
+        from the right means to borrow the k'th position from the
+        right in nmulti. (See examples below.)
+        Default ``None`` means size=nmulti.
+
+    :param n: the number of experiments to simulate for each multinomial. This
+        can be a scalar, or tensor, it will be broadcasted to have shape "nmulti".
+
+    :param dtype: the dtype of the return value (which will represent counts)
+
    :returns: tensor of len(size)+1 dimensions, and shape[-1]==L, with the
        specified ``dtype``, with the experiment counts.  See examples to
        understand the shape of the return value, which is derived from both
        size and pvals.shape.  In return value rval,
        "numpy.allclose(rval.sum(axis=-1), n)" will be true.
+
+    For example, to simulate n experiments from each multinomial in a batch of
+    size B:
+
+        size=None, pvals.shape=(B,L) --> rval.shape=[B,L]
+
+        rval[i,j] is the count of possibility j in the i'th distribution (row)
+        in pvals.
+
+    Using size:
+
+        size=(1,-1), pvals.shape=(A,B,L)
+        --> rval.shape=[1,B,L], and requires that A==1.
+
+        rval[k,i,j] is the count of possibility j in the distribution specified
+        by pvals[k,i].
+
+    Using size for broadcasting of pvals:
+
+        size=(10, 1, -1), pvals.shape=(A,B,L)
+        --> rval.shape=[10,1,B,L], and requires that A==1.
+
+        rval[l,k,i,j] is the count of possibility j in the distribution
+        specified by pvals[k,i], in the l'th of 10 draws.
 
     .. note::
         Note that the output will then be of dimension ndim+1.
     """
     n = tensor.as_tensor_variable(n)
     pvals = tensor.as_tensor_variable(pvals)
-    ndim, size = _infer_ndim(ndim, size, n, pvals[0])
+    # until ellipsis is implemented (argh)
+    tmp = pvals.T[0].T
+    ndim, size, bcast = _infer_ndim_bcast(ndim, size, n, tmp)
+    bcast = bcast + (pvals.type.broadcastable[-1],)
     op = RandomFunction(multinomial_helper,
-            tensor.TensorType(dtype='int64', broadcastable=(False,)*(ndim+1)),
+            tensor.TensorType(dtype=dtype, broadcastable=bcast),
             ndim_added=1)
     return op(random_state, size, n, pvals)
```
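The `multinomial_helper` change above works around a strictness issue in NumPy: `RandomState.multinomial` raises a `ValueError` whenever `sum(pvals[:-1]) > 1.0`, which float32 rounding can trigger even for rows that are validly normalized. A standalone sketch of the guard; the function name is illustrative, since the in-tree code inlines this inside the broadcasting loop:

```python
import numpy

def draw_counts(random_state, n, pvi):
    pisum = numpy.sum(pvi)
    if 1.0 < pisum < 1.0 + 1e-5:
        # Went a little over 1.0: scale down slightly so that mtrand's
        # sum(pvals[:-1]) > 1.0 check cannot fire.
        pvi = pvi * (1.0 - 5e-5)
        pisum = numpy.sum(pvi)
    elif pvi[-1] < 5e-5:
        # Tiny last entry: scale down as well, mirroring the in-tree branch.
        pvi = pvi * (1.0 - 5e-5)
        pisum = numpy.sum(pvi)
    assert pisum <= 1.0, pisum
    return random_state.multinomial(n=n, pvals=pvi.astype('float64'))

rng = numpy.random.RandomState(0)
row = numpy.asarray([0.3, 0.3, 0.4], dtype='float32')
print(draw_counts(rng, 9, row))   # three counts summing to 9
```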
theano/tensor/tests/test_raw_random.py (+46 −0)

```diff
@@ -732,6 +732,52 @@ class T_random_function(unittest.TestCase):
         assert numpy.all(val2 == numpy_val2)
 
         self.assertRaises(ValueError, g, rng2, n_val[:-1], pvals_val[:-1])
 
+    def test_multinomial_tensor3_a(self):
+        # Test the examples given in the multinomial documentation regarding
+        # tensor3 objects
+        rng_R = random_state_type()
+        n = 9
+        pvals = tensor.dtensor3()
+        post_r, out = multinomial(rng_R, n=n, pvals=pvals, size=(1, -1))
+
+        assert out.ndim == 3
+        assert out.broadcastable == (True, False, False)
+
+        f = compile.function([rng_R, pvals], [post_r, out], accept_inplace=True)
+
+        rng = numpy.random.RandomState(utt.fetch_seed())
+        numpy_rng = numpy.random.RandomState(utt.fetch_seed())
+        pvals_val = numpy.asarray([[[.1, .9], [.2, .8], [.3, .7]]])
+        assert pvals_val.shape == (1, 3, 2)
+
+        new_rng, draw = f(rng, pvals_val)
+        assert draw.shape == (1, 3, 2)
+        assert numpy.allclose(draw.sum(axis=2), 9)
+
+    def test_multinomial_tensor3_b(self):
+        # Test the examples given in the multinomial documentation regarding
+        # tensor3 objects
+        rng_R = random_state_type()
+        n = 9
+        pvals = tensor.dtensor3()
+        post_r, out = multinomial(rng_R, n=n, pvals=pvals, size=(10, 1, -1))
+
+        assert out.ndim == 4
+        assert out.broadcastable == (False, True, False, False)
+
+        f = compile.function([rng_R, pvals], [post_r, out], accept_inplace=True)
+
+        rng = numpy.random.RandomState(utt.fetch_seed())
+        numpy_rng = numpy.random.RandomState(utt.fetch_seed())
+        pvals_val = numpy.asarray([[[.1, .9], [.2, .8], [.3, .7]]])
+        assert pvals_val.shape == (1, 3, 2)
+
+        out_rng, draw = f(rng, pvals_val)
+        assert draw.shape == (10, 1, 3, 2)
+        assert numpy.allclose(draw.sum(axis=3), 9)
+
     def test_dtype(self):
         rng_R = random_state_type()
         low = tensor.lscalar()
```
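The two tensor3 tests exercise the `size` examples added to the `multinomial` docstring. A NumPy mimic of the second one shows the intended shape semantics: `size=(10, 1, -1)` with `pvals.shape == (1, 3, 2)` means ten independent draws from each of the three distributions, the `-1` borrowing that axis from `pvals`:

```python
import numpy

rng = numpy.random.RandomState(42)
pvals = numpy.asarray([[[.1, .9], [.2, .8], [.3, .7]]])      # (1, 3, 2)
n = 9
draw = numpy.array([[[rng.multinomial(n, p) for p in block]  # B=3 distributions
                     for block in pvals]                     # the A=1 leading axis
                    for _ in range(10)])                     # l'th of 10 draws
assert draw.shape == (10, 1, 3, 2)
assert numpy.all(draw.sum(axis=3) == n)
```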