Make GpuSoftmax work with bigger row.

f3fc1f0f · Frederic · c2c34032 · f3fc1f0f · f3fc1f0f · f3fc1f0f
--- a/theano/sandbox/cuda/kernel_codegen.py
+++ b/theano/sandbox/cuda/kernel_codegen.py
@@ -70,7 +70,7 @@ def inline_reduce(N, buf, pos, count, manner_fn):
    return """
    {
-        // This function trashes buf[1..N], leaving the reduction result in buf[0].
+        // This function trashes buf[1..warpSize], leaving the reduction result in buf[0].
        if (%(pos)s < warpSize)
        {
@@ -158,3 +158,118 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
            '}',
            '__syncthreads()',
            ]
+@code_version((1,))
+def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
+                               manner_fn, manner_init):
+    """Return C++ code for a function that reduces a contiguous buffer.
+    :param N: length of the buffer
+    :param buf: buffer pointer of size warpSize * sizeof(float)
+    :param pos: index of executing thread
+    :param count: number of executing threads
+    :param manner_fn: a function that accepts strings of arguments a
+        and b, and returns c code for their reduction. (Example:
+        return "%(a)s + %(b)s" for a sum reduction).
+    :param manner_init: a function that accepts strings of arguments a
+        and return c code for its initialization
+    :postcondition:
+    This function leaves the answer in position 0 of the buffer.  The
+    rest of the buffer is trashed by this function.
+    :note: buf should be in gpu shared memory, we access it many times.
+    """
+    init = manner_init("%(x)s[tx * %(stride_x)s]" % locals())
+    loop_line = manner_fn("%s[%s]" % (buf, pos),
+                          manner_init("%s[i * %s]" % (x, stride_x)))
+    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
+    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
+    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
+    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
+    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
+    return """
+    {
+        // This function trashes buf[1..warpSize], leaving the reduction result in buf[0].
+        for (int tx = %(pos)s; tx<warpSize; tx += %(count)s){
+            %(buf)s[tx] = %(init)s;
+        }
+        __syncthreads();
+        if (%(pos)s < warpSize)
+        {
+            for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
+            {
+                %(buf)s[%(pos)s] = %(loop_line)s;
+            }
+            if (%(pos)s < 16)
+            {
+                //reduce so that %(pos)s 0 has the sum of everything
+                if(%(pos)s + 16 < %(N)s)
+                    %(buf)s[%(pos)s] = %(r_16)s;
+                if(%(pos)s + 8 < %(N)s)
+                    %(buf)s[%(pos)s] = %(r_8)s;
+                if(%(pos)s + 4 < %(N)s)
+                    %(buf)s[%(pos)s] = %(r_4)s;
+                if(%(pos)s + 2 < %(N)s)
+                    %(buf)s[%(pos)s] = %(r_2)s;
+                if(%(pos)s + 1 < %(N)s)
+                    %(buf)s[%(pos)s] = %(r_1)s;
+            }
+        }
+    }
+    """ % locals()
+@code_version(inline_reduce_fixed_shared.code_version)
+def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count):
+    return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
+                                      lambda a, b: "max(%s, %s)" % (a, b),
+                                      lambda a: a)
+@code_version((1,) + inline_reduce_max.code_version +
+              inline_reduce_sum.code_version)
+def inline_softmax_fixed_shared(N, buf, x, stride_x,
+                                sm, sm_stride,
+                                threadPos, threadCount):
+    """
+    :param N: length of the buffer, atleast waprSize(32).
+    :param buf: a shared memory buffer of size warpSize * sizeof(float)
+    :param x: a ptr to the gpu memory where the row is stored
+    :param stride_x: the stride between each element in x
+    :param sm: a ptr to the gpu memory to store the result
+    :param sm_stride: the stride between eash sm element
+    :param threadPos: index of executing thread
+    :param threadCount: number of executing threads
+    :Precondition: buf is empty
+    :Postcondition: buf[0] contains the softmax, buf2 contains un-normalized softmax
+    :note: buf and buf2 should be in gpu shared memory, we access it many times.
+    :note2: We use __i as an int variable in a loop
+    """
+    return [
+        #get max of buf (trashing all but buf[0])
+        inline_reduce_fixed_shared_max(N, buf, x, stride_x, threadPos, threadCount),
+        '__syncthreads()',
+        'float row_max = '+buf+'[0]',
+        '__syncthreads()',
+        inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
+                                   lambda a, b: "%s + %s" % (a, b),
+                                   lambda a: "exp(%s - row_max)" % a),
+        '__syncthreads()',
+        'float row_sum = '+buf+'[0]',
+        '__syncthreads()',
+        "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
+            # This set all value correctly
+            "%(sm)s[tx * %(sm_stride)s] = exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals(),
+        "}",
+        '__syncthreads()',
+    ]
--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -7,7 +7,8 @@ from theano.sandbox.cuda import GpuOp
 from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel, inline_reduce_max,
                                                inline_reduce_sum,
-                                                inline_softmax)
+                                                inline_softmax,
+                                                inline_softmax_fixed_shared)
 class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
@@ -350,8 +351,7 @@ class GpuSoftmax (GpuOp):
        return shape
    def c_code_cache_version(self):
-        #return ()
+        return (8,) + inline_softmax.code_version
-        return (7,) + inline_softmax.code_version
    def c_code(self, node, nodename, inp, out, sub):
        x, = inp
@@ -386,6 +386,7 @@ class GpuSoftmax (GpuOp):
            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
            {
+              if(n_shared_bytes < (32 * 1024 - 500)){
                kSoftmax_%(nodename)s
                    <<<
                        n_blocks,
@@ -403,6 +404,25 @@ class GpuSoftmax (GpuOp):
                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
                    );
+              }else{
+                kSoftmax_fixed_shared%(nodename)s
+                    <<<
+                        n_blocks,
+                        n_threads,
+                        32 * sizeof(float)
+                    >>>(
+                            CudaNdarray_HOST_DIMS(%(x)s)[0],
+                            CudaNdarray_HOST_DIMS(%(x)s)[1],
+                            CudaNdarray_DEV_DATA(%(x)s),
+                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
+                            CudaNdarray_HOST_STRIDES(%(x)s)[1],
+                            CudaNdarray_DEV_DATA(%(z)s),
+                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
+                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
+                    );
+              }
              CNDA_THREAD_SYNC;
              cudaError_t err = cudaGetLastError();
              if( cudaSuccess != err)
@@ -410,7 +430,7 @@ class GpuSoftmax (GpuOp):
                  PyErr_Format(PyExc_RuntimeError,
                               "Cuda error: %%s: %%s.\\n Used %%d blocks,"
                               " %%d threads %%d bytes of shared memory",
-                                 "kSoftmax_%(nodename)s", cudaGetErrorString(err),
+                               "kSoftmax[_fixed_shared]%(nodename)s", cudaGetErrorString(err),
                                n_blocks, n_threads, n_shared_bytes);
                  %(fail)s;
              }
@@ -420,7 +440,7 @@ class GpuSoftmax (GpuOp):
        """ % locals()
    def c_support_code_apply(self, node, nodename):
-        return nvcc_kernel("kSoftmax_%s" % nodename,
+        ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
                params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
@@ -442,6 +462,22 @@ class GpuSoftmax (GpuOp):
                      "__syncthreads()",
                    "}",
                ])
+        ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
+                params=['int M', 'int N',
+                    'const float * x', 'const int sx0', 'const int sx1',
+                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
+                body=[
+                    "extern __shared__ float buf[]",
+                    "for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
+                      "const float *x_ptr = &x[blockIDX * sx0]",
+                      "float *sm_ptr = &sm[blockIDX * sm_s0]",
+                      inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
+                                                  'sm_ptr', 'sm_s1',
+                                                  'threadIdx.x', 'blockDim.x'),
+                      "__syncthreads()",
+                    "}",
+                    ])
+        return ret1 + "\n" + ret2
 gpu_softmax = GpuSoftmax()

--- a/theano/sandbox/cuda/tests/test_nnet.py
+++ b/theano/sandbox/cuda/tests/test_nnet.py
@@ -219,9 +219,7 @@ def test_softmax():
    This is basic test for GpuSoftmax
    We check that we loop when their is too much block
+    We use slower code when there isn't enough shared memory
-    TODO: check that we loop when their is too much thread.(THIS IS
-    NOT IMPLEMENTED)
    """
    x = T.fmatrix('x')
@@ -232,25 +230,17 @@ def test_softmax():
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
                      cuda.nnet.GpuSoftmax)
-    def cmp(n, m, catch=False):
+    def cmp(n, m):
        """Some old card won't accept the configuration arguments of
        this implementation. For those cases set catch=True to skip
        those errors.
        """
-        try:
        #print "test_softmax",n,m
        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
-        except RuntimeError, e:
-            if not catch:
-                raise
-            # Different CUDA driver have different error message
-            assert (e.args[0].startswith(
-              'Cuda error: kSoftmax_node_0: invalid configuration argument.\n') or
-            e.args[0].startswith('Cuda error: kSoftmax_node_0: invalid argument.\n'))
    #we need to test n>32*1024 to check that we make the block loop.
    cmp(2, 5)
@@ -262,5 +252,9 @@ def test_softmax():
    cmp(4, 1024)
    cmp(4, 2000)
    cmp(4, 2024)
-    #GTX285 don't have enough shared mem for this case.
+    # The GTX285 don't have enough shared memory.
-    cmp(4, 4074, True)
+    cmp(4, 4074)
+    # The GTX580, 680 and kepler don't have enough shared memory.
+    cmp(2, 10000)
+    cmp(128, 16 * 1024)
+    cmp(128, 64 * 1024)