提交 85447abe authored 作者: James Bergstra's avatar James Bergstra

merge

......@@ -52,6 +52,10 @@ Community
* Register and post to `theano-dev`_ if you want to talk to the developers.
* Register and post to `theano-announce`_ if you want to be kept informed of important changes to Theano (low volume).
* Register and post to `theano-buildbot`_ if you want to receive our daily buildbot email.
* We try to stay organized with `Theano's Trac <http://trac-hg.assembla.com/theano/report/1>`__
* Come visit us in Montreal! Most of the developers are students in the LISA_ group at the `University of Montreal`_.
......@@ -77,6 +81,8 @@ Community
.. _theano-dev: http://groups.google.com/group/theano-dev
.. _theano-users: http://groups.google.com/group/theano-users
.. _theano-announce: http://groups.google.com/group/theano-announce
.. _theano-buildbot: http://groups.google.com/group/theano-buildbot
.. _tickets: http://pylearn.org/theano/trac/query?status=accepted&status=assigned&status=new&status=reopened&group=milestone&max=200&col=id&col=summary&col=status&col=owner&col=type&col=priority&col=component&col=time&report=9&order=priority
.. _LISA: http://www.iro.umontreal.ca/~lisa
......
......@@ -188,7 +188,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
def make_node(self, dy, sm, y_idx):
return Apply(self, [dy, sm, y_idx],[sm.type()])
def c_code_cache_version(self):
return (2,)
return (3,)
#return ()
def c_code(self, node, nodename, (dnll, sm, y_idx), (dx,), sub):
fail = sub['fail']
......@@ -229,7 +229,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<
CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1]
std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256)
>>>(
CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1],
......@@ -303,7 +303,7 @@ class GpuSoftmax (Op):
return shape
def c_code_cache_version(self):
#return ()
return (1,) + inline_softmax.code_version
return (2,) + inline_softmax.code_version
def c_code(self, node, nodename, (x,), (z,), sub):
fail = sub['fail']
return """
......@@ -330,7 +330,7 @@ class GpuSoftmax (Op):
kSoftmax_%(nodename)s
<<<
// todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024),
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>(
......@@ -362,11 +362,14 @@ class GpuSoftmax (Op):
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
"}",
])
......@@ -386,7 +389,7 @@ class GpuSoftmaxWithBias (Op):
return [shape[0]]
def c_code_cache_version(self):
#return ()
return (1,) + inline_softmax.code_version
return (2,) + inline_softmax.code_version
def c_code(self, node, nodename, (x,b), (z,), sub):
fail = sub['fail']
......@@ -425,7 +428,7 @@ class GpuSoftmaxWithBias (Op):
kSoftmaxWithBias_%(nodename)s
<<<
// todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024),
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>(
......@@ -461,10 +464,14 @@ class GpuSoftmaxWithBias (Op):
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]",
"buf[threadIdx.x] += b[threadIdx.x * sb0]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]",
"buf[threadIdx.x] += b[threadIdx.x * sb0]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
"}",
])
#for (int i = blockIdx.x; i < N; i += gridDim.x)
差异被折叠。
......@@ -254,7 +254,9 @@ class CudaNdarrayType(Type):
return ret
def c_libraries(self):
return ['cudart']
# returning cublas because the cuda_ndarray.cuh header includes calls to SetVector and
# cublasGetError
return ['cudart', 'cublas']
def c_support_code(cls):
return ""
......
......@@ -4,7 +4,7 @@ import theano.tensor as T
from theano.tensor.opt import register_specialize
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available
from theano.sandbox.cuda import cuda_available, cuda_enabled
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
......@@ -109,12 +109,11 @@ class GpuMultinomial(Multinomial):
raise TypeError('pvals must be cudandarray', pvals)
if not isinstance(unis.type, CudaNdarrayType):
raise TypeError('unis must be cudandarray', unis)
return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self):
#return ()
return (super(GpuMultinomial,self).c_code_cache_version(),1)
return ()
#return (super(GpuMultinomial,self).c_code_cache_version(),1)
def c_support_code_apply(self, node, nodename):
return """
......@@ -128,7 +127,7 @@ class GpuMultinomial(Multinomial):
float * global_outs
)
{
int n = 32*blockIdx.x + threadIdx.x;
int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi)
{
......@@ -201,14 +200,31 @@ class GpuMultinomial(Multinomial):
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
int nb_block;
if (nb_multi %% 32 == 0)
nb_block = nb_multi/32;
else
nb_block = (int)((float)nb_multi/32. + 1.);
//TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1;
int nb_threads=16; // so it really starts at 32, because of the *2
do
{
nb_threads*=2;
if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
//printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
// TODO : next line is a bit hardcoded...
if (nb_threads > 512)
{
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
%(fail)s;
}
dim3 n_blocks(nb_block,1,1);
dim3 n_threads(32,1,1);
dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1);
int n_shared = 0;
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
......@@ -244,6 +260,6 @@ gpu_multinomial = GpuMultinomial()
def use_gpu_multinomial(node):
if node.op == multinomial:
return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))]
if theano.config.device.startswith('gpu'):
if cuda_enabled:#theano.config.device.startswith('gpu'):
register_specialize(use_gpu_multinomial)
......@@ -685,7 +685,7 @@ class MRG_RandomStreams(object):
else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
def multinomial(self, size=None, n=1, pvals=[[.5,.5]], ndim=None, dtype='int64'):
def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64'):
"""
Sample `n` (currently `n` needs to be 1) times from a multinomial distribution defined by
probabilities pvals.
......@@ -696,13 +696,12 @@ class MRG_RandomStreams(object):
`size` and `ndim` are only there keep the same signature as other uniform, binomial, normal, etc.
todo : adapt multinomial to take that into account
"""
if pvals is None:
raise TypeError("You have to specify pvals")
pvals = as_tensor_variable(pvals)
if n == 1 and pvals.ndim == 2:
pvals = as_tensor_variable(pvals)
unis = self.uniform(size=pvals.shape[0:1], ndim=1)
return cast(multinomial(pvals.T, unis).T, dtype)
else:
raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2")
......
......@@ -345,7 +345,7 @@ def test_uniform():
#print 'random?[-1,-10:]\n', out[-1,-10:]
basictest(f, steps, sample_size, prefix='mrg cpu', inputs=input)
if mode!='FAST_COMPILE':
if mode!='FAST_COMPILE' and cuda_available:
print ''
print 'ON GPU with size=(%s):'%str(size)
R = MRG_RandomStreams(234, use_cuda=True)
......@@ -403,7 +403,7 @@ def test_binomial():
print 'random?[-1,-10:]\n', out[-1,-10:]
basictest(f, steps, sample_size, prefix='mrg cpu', inputs=input, allow_01=True, target_avg = mean)
if mode!='FAST_COMPILE':
if mode!='FAST_COMPILE' and cuda_available:
print ''
print 'ON GPU with size=(%s) and mean(%d):'%(str(size),mean)
R = MRG_RandomStreams(234, use_cuda=True)
......@@ -450,7 +450,7 @@ def test_normal0():
# now with odd number of samples
sample_size = (sample_size[0],sample_size[1]-1)
if mode!='FAST_COMPILE':
if mode!='FAST_COMPILE' and cuda_available:
print ''
print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
......@@ -465,7 +465,7 @@ def test_normal0():
print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
print '----'
sys.stdout.flush()
basictest(f, steps, sample_size_odd, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
basictest(f, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
print ''
......@@ -528,6 +528,7 @@ def test_multinomial():
print ''
print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
pvals = numpy.asarray(pvals, dtype='float32')
n = R.multinomial(pvals=pvals, dtype='float32')
assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function([], theano.Out(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论