merge

7461eaab · Simon Lemieux · 29bea05f · 6bd132c5 · 7461eaab · 7461eaab
--- a/theano/sandbox/multinomial.py
+++ b/theano/sandbox/multinomial.py
@@ -4,7 +4,7 @@ import theano.tensor as T
 from theano.tensor.opt import register_specialize
 from theano.gof import local_optimizer

-from theano.sandbox.cuda import cuda_available
+from theano.sandbox.cuda import cuda_available, cuda_enabled
 if cuda_available:
    from theano.sandbox.cuda import CudaNdarrayType
    from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
@@ -109,12 +109,11 @@ class GpuMultinomial(Multinomial):
            raise TypeError('pvals must be cudandarray', pvals)
        if not isinstance(unis.type, CudaNdarrayType):
            raise TypeError('unis must be cudandarray', unis)
-
        return Apply(self, [pvals, unis], [pvals.type()])

    def c_code_cache_version(self):
-        #return ()
-        return (super(GpuMultinomial,self).c_code_cache_version(),1)
+        return ()
+        # NOTE(review): empty tuple presumably opts out of C-code caching (confirm against Theano docs); previous value: (super(GpuMultinomial,self).c_code_cache_version(),1)

    def c_support_code_apply(self, node, nodename):
        return """
@@ -128,7 +127,7 @@ class GpuMultinomial(Multinomial):
            float * global_outs
        )
        {            
-            int n = 32*blockIdx.x + threadIdx.x;
+            int n = blockDim.x*blockIdx.x + threadIdx.x;
            if (n < nb_multi)
            {    
            
@@ -201,14 +200,31 @@ class GpuMultinomial(Multinomial):
            int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
            int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
            
-            int nb_block;
-            if (nb_multi %% 32 == 0)
-                nb_block = nb_multi/32;
-            else
-                nb_block = (int)((float)nb_multi/32. + 1.); 
+            // Upper bound on the number of blocks (65535). TODO: replace with a named constant.
+            int max_nb_blocks = (2<<15) - 1; // parentheses required: `-` binds tighter than `<<`
+            int nb_blocks = max_nb_blocks + 1;
+            int nb_threads=16; // doubled before first use, so the search really starts at 32
+            do
+            {
+                nb_threads*=2;
+                if (nb_multi %% nb_threads == 0)
+                    nb_blocks = nb_multi/nb_threads;
+                else
+                    nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
+            } while (nb_blocks > max_nb_blocks);
+
+            //printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
+
+            // TODO: 512 is hard-coded (presumably the per-block thread limit); confirm and query the device instead.
+            if (nb_threads > 512)
+            {
+                PyErr_Format(PyExc_ValueError, "Multinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
+                %(fail)s;
+            }
+
                
-            dim3 n_blocks(nb_block,1,1);
-            dim3 n_threads(32,1,1);
+            dim3 n_blocks(nb_blocks,1,1);
+            dim3 n_threads(nb_threads,1,1);
            int n_shared = 0;

            k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
@@ -244,6 +260,6 @@ gpu_multinomial = GpuMultinomial()
 def use_gpu_multinomial(node):
    if node.op == multinomial:
        return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))]
-if theano.config.device.startswith('gpu'):
+if cuda_enabled:#theano.config.device.startswith('gpu'):
    register_specialize(use_gpu_multinomial)
    
--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -685,7 +685,7 @@ class MRG_RandomStreams(object):
        else:
            raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
            
-    def multinomial(self, size=None, n=1, pvals=[[.5,.5]], ndim=None, dtype='int64'):
+    def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64'):
        """
        Sample `n` (currently `n` needs to be 1) times from a multinomial distribution defined by
        probabilities pvals.
@@ -696,13 +696,12 @@ class MRG_RandomStreams(object):
            `size` and `ndim` are only there keep the same signature as other uniform, binomial, normal, etc.
            todo : adapt multinomial to take that into account
        """
+        if pvals is None:
+            raise TypeError("You have to specify pvals")
        pvals = as_tensor_variable(pvals)
        if n == 1 and pvals.ndim == 2:
-            pvals = as_tensor_variable(pvals)
            unis = self.uniform(size=pvals.shape[0:1], ndim=1)
-            
            return cast(multinomial(pvals.T, unis).T, dtype)
-
        else:
            raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2")


--- a/theano/sandbox/test_rng_mrg.py
+++ b/theano/sandbox/test_rng_mrg.py
@@ -528,6 +528,7 @@ def test_multinomial():
        print ''
        print 'ON GPU:'
        R = MRG_RandomStreams(234, use_cuda=True)
+        pvals = numpy.asarray(pvals, dtype='float32')
        n = R.multinomial(pvals=pvals, dtype='float32')
        assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
        f = theano.function([], theano.Out(