提交 7ae6897c authored 作者: Frederic Bastien's avatar Frederic Bastien

make GpuSoftmax and GpuSoftmaxWithBias loop when there are too many blocks. Add a test for this.

上级 51a4b704
......@@ -303,7 +303,7 @@ class GpuSoftmax (Op):
return shape
def c_code_cache_version(self):
#return ()
return (1,) + inline_softmax.code_version
return (2,) + inline_softmax.code_version
def c_code(self, node, nodename, (x,), (z,), sub):
fail = sub['fail']
return """
......@@ -330,7 +330,7 @@ class GpuSoftmax (Op):
kSoftmax_%(nodename)s
<<<
// todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024),
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>(
......@@ -362,11 +362,14 @@ class GpuSoftmax (Op):
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]",
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
"sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
"}",
])
......@@ -386,7 +389,7 @@ class GpuSoftmaxWithBias (Op):
return [shape[0]]
def c_code_cache_version(self):
#return ()
return (1,) + inline_softmax.code_version
return (2,) + inline_softmax.code_version
def c_code(self, node, nodename, (x,b), (z,), sub):
fail = sub['fail']
......@@ -425,7 +428,7 @@ class GpuSoftmaxWithBias (Op):
kSoftmaxWithBias_%(nodename)s
<<<
// todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024),
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>(
......@@ -461,10 +464,14 @@ class GpuSoftmaxWithBias (Op):
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]",
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]",
"buf[threadIdx.x] += b[threadIdx.x * sb0]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
"sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
"}",
])
#for (int i = blockIdx.x; i < N; i += gridDim.x)
......@@ -17,6 +17,10 @@ else:
def test_GpuCrossentropySoftmax1HotWithBiasDx():
"""
This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias and GpuCrossentropySoftmax1HotWithBiasDx
We check that we loop when there are too many threads.
TODO: check that we loop when there are too many blocks (>32*1024).
"""
n_in = 1000
......@@ -61,3 +65,53 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
assert numpy.allclose(out[1],gout[1])
assert numpy.allclose(out[2],gout[2],atol=2e-6)
def test_softmax_with_bias():
    """
    Basic test for GpuSoftmaxWithBias.

    We check that the GPU kernel loops when there are too many blocks.
    TODO: check that we loop when there are too many threads. (THIS IS
    NOT IMPLEMENTED)
    """
    x = T.fmatrix('x')
    # The row count must exceed 32*1024 so the kernel's block loop is
    # actually exercised.
    rows, cols = 2 << 15, 5
    data = numpy.arange(rows * cols, dtype='float32').reshape(rows, cols)

    z = T.nnet.softmax_with_bias(x, T.zeros_like(x[0, :]))
    f = theano.function([x], z, mode=mode_without_gpu)
    f_gpu = theano.function([x], z, mode=mode_with_gpu)

    # Make sure each function compiled to the intended op.
    assert f.maker.env.toposort()[-1].op == T.nnet.softmax_with_bias
    assert isinstance(f_gpu.maker.env.toposort()[-2].op,
                      cuda.nnet.GpuSoftmaxWithBias)

    cpu_out = f(data)
    gpu_out = f_gpu(data)
    assert numpy.allclose(cpu_out, gpu_out), numpy.absolute(cpu_out - gpu_out)
def test_softmax():
    """
    Basic test for GpuSoftmax.

    We check that the GPU kernel loops when there are too many blocks.
    TODO: check that we loop when there are too many threads. (THIS IS
    NOT IMPLEMENTED)
    """
    x = T.fmatrix('x')
    # The row count must exceed 32*1024 so the kernel's block loop is
    # actually exercised.
    rows, cols = 2 << 15, 5
    data = numpy.arange(rows * cols, dtype='float32').reshape(rows, cols)

    z = T.nnet.softmax(x)
    f = theano.function([x], z, mode=mode_without_gpu)
    f_gpu = theano.function([x], z, mode=mode_with_gpu)

    # Make sure each function compiled to the intended op.
    assert f.maker.env.toposort()[-1].op == T.nnet.softmax
    assert isinstance(f_gpu.maker.env.toposort()[-2].op,
                      cuda.nnet.GpuSoftmax)

    cpu_out = f(data)
    gpu_out = f_gpu(data)
    assert numpy.allclose(cpu_out, gpu_out), numpy.absolute(cpu_out - gpu_out)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论