Commit ec3a90d4 authored by James Bergstra

Merge pull request #1303 from nouiz/gpusoftmax

Fix GpuSoftmax[WithBias] Ops to work for large rows
@@ -505,5 +505,6 @@ Other tools that can help you
* `line_profiler <http://pypi.python.org/pypi/line_profiler/>`_: Line-by-line profiler.
* `memory_profiler <http://fseoane.net/blog/2012/line-by-line-report-of-memory-usage/>`_: Line-by-line memory profiler.
* `runsnake <http://www.vrplumber.com/programming/runsnakerun/>`_: GUI for cProfile (time profiler) and Meliae (memory profiler).
* `Guppy <https://pypi.python.org/pypi/guppy/>`_: Supports object and heap memory sizing, profiling and debugging.
* `hub <https://github.com/defunkt/hub>`_: A tool that adds github commands to the git command line.
* `git pull-requests <http://www.splitbrain.org/blog/2011-06/19-automate_github_pull_requests>`_: Another tool for git/github command line.
@@ -198,6 +198,8 @@ if __name__ == "__main__":
cuda version 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu
K20m/ECC 0.07s
K20/NOECC 0.07s
M2070 0.25s 0.27s 0.32s
M2050(Amazon) 0.25s
C2075 0.25s
@@ -215,7 +217,7 @@ if __name__ == "__main__":
GTX 285 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
GTX 550 Ti 0.57s
GT 520 2.68s 3.06s
520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
GT 220 3.80s
GT 210 6.35s
8500 GT 10.68s
......
""" Helper routines for generating gpu kernels for nvcc.
"""
def nvcc_kernel(name, params, body):
"""Return the c code of a kernel function.
:param params: the parameters to the function as one or more strings
:param body: the [nested] list of statements for the body of the
function. These will be separated by ';' characters.
"""
paramstr = ', '.join(params)
def flatbody():
for b in body:
if isinstance(b, (list, tuple)):
@@ -21,12 +25,14 @@ def nvcc_kernel(name, params, body):
{
%(bodystr)s;
}
""" %locals()
""" % locals()
def code_version(version):
"""decorator to support version-based cache mechanism"""
if not isinstance(version, tuple):
raise TypeError('version must be tuple', version)
def deco(f):
f.code_version = version
return f
@@ -34,35 +40,38 @@ def code_version(version):
UNVERSIONED = ()
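# Usage sketch (hypothetical helper): the version tuple rides along on
# the decorated function object, so an Op can fold it into its cache key
# the way GpuSoftmax below does with `(9,) + inline_softmax.code_version`.
@code_version((1, 0))
def _emit_add_example(a, b):
    return "%s + %s" % (a, b)
assert _emit_add_example.code_version == (1, 0)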
@code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn):
"""
Return C++ code for a function that reduces a contiguous buffer.
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer
:param pos: index of executing thread
:param count: number of executing threads
:param manner_fn: a function that accepts strings of arguments a
and b, and returns C code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction).
:postcondition:
This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times.
"""
loop_line = manner_fn("%s[%s]"%(buf,pos), "%s[i]" %(buf))
r_16 = manner_fn("%s[%s]" %(buf, pos), "%s[%s+16]" %(buf, pos))
r_8 = manner_fn("%s[%s]" %(buf, pos), "%s[%s+8]" %(buf, pos))
r_4 = manner_fn("%s[%s]" %(buf, pos), "%s[%s+4]" %(buf, pos))
r_2 = manner_fn("%s[%s]" %(buf, pos), "%s[%s+2]" %(buf, pos))
r_1 = manner_fn("%s[%s]" %(buf, pos), "%s[%s+1]" %(buf, pos))
loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
return """
{
// This function trashes buf[1..warpSize],
// leaving the reduction result in buf[0].
if (%(pos)s < warpSize)
{
@@ -88,24 +97,33 @@ def inline_reduce(N, buf, pos, count, manner_fn):
}
""" % locals()
@code_version(inline_reduce.code_version)
def inline_reduce_max(N, buf, pos, count):
return inline_reduce(N, buf, pos, count,
lambda a, b: "max(%s, %s)" % (a, b))
@code_version(inline_reduce.code_version)
def inline_reduce_sum(N, buf, pos, count):
return inline_reduce(N, buf, pos, count, lambda a, b: "%s + %s"%(a,b))
return inline_reduce(N, buf, pos, count,
lambda a, b: "%s + %s" % (a, b))
@code_version(inline_reduce.code_version)
def inline_reduce_min(N, buf, pos, count):
return inline_reduce(N, buf, pos, count, lambda a, b: "min(%s, %s)"%(a,b))
return inline_reduce(N, buf, pos, count,
lambda a, b: "min(%s, %s)" % (a, b))
@code_version(inline_reduce.code_version)
def inline_reduce_prod(N, buf, pos, count):
return inline_reduce(N, buf, pos, count, lambda a, b: "%s * %s"%(a,b))
return inline_reduce(N, buf, pos, count,
lambda a, b: "%s * %s" % (a, b))
@code_version((2,) + inline_reduce_max.code_version +
inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount):
"""
@@ -113,10 +131,12 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
:param threadPos: index of executing thread
:param threadCount: number of executing threads
:Precondition: buf and buf2 contain two identical copies of the input
to softmax
:Postcondition: buf contains the softmax, buf2 contains un-normalized
softmax
:note: buf and buf2 should be in gpu shared memory, we access them many times.
:note2: We use __i as an int variable in a loop
"""
@@ -124,20 +144,173 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
#get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
'float row_max = ' + buf + '[0]',
'__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]',
'}',
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()',
'float row_sum = ' + buf + '[0]',
'__syncthreads()',
# divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum',
'}',
'__syncthreads()',
]
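# Reference semantics of the statement list above, in NumPy (illustrative
# only; the kernel computes this cooperatively, one row per thread block):
import numpy
def _softmax_row_ref(row):
    e = numpy.exp(row - row.max())  # buf[__i] = exp(buf2[__i] - row_max)
    return e / e.sum()              # buf[__i] = buf2[__i] / row_sum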
@code_version((1,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
manner_fn, manner_init,
b='', stride_b=''):
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(float)
:param pos: index of executing thread
:param count: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param manner_fn: a function that accepts strings of arguments a
and b, and returns C code for their reduction. (Example:
return "%(a)s + %(b)s" for a sum reduction).
:param manner_init: a function that accepts a string argument a
and returns C code for its initialization
:postcondition:
This function leaves the answer in position 0 of the buffer. The
rest of the buffer is trashed by this function.
:note: buf should be in gpu shared memory, we access it many times.
"""
if b:
init = manner_init("%(x)s[%(pos)s * %(stride_x)s] +"
" %(b)s[%(pos)s * %(stride_b)s]" % locals())
loop_line = manner_fn("red",
manner_init("%(x)s[i * %(stride_x)s] + "
"%(b)s[i * %(stride_b)s]" %
locals()))
else:
init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
return """
{
// This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0].
float red = %(init)s;
#pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
red = %(loop_line)s;
}
%(buf)s[%(pos)s] = red;
__syncthreads();
if (%(pos)s < warpSize)
{
for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
{
%(buf)s[%(pos)s] = %(loop_line2)s;
}
if (%(pos)s < 16)
{
//reduce so that position 0 has the reduction of everything
if(%(pos)s + 16 < %(N)s)
%(buf)s[%(pos)s] = %(r_16)s;
if(%(pos)s + 8 < %(N)s)
%(buf)s[%(pos)s] = %(r_8)s;
if(%(pos)s + 4 < %(N)s)
%(buf)s[%(pos)s] = %(r_4)s;
if(%(pos)s + 2 < %(N)s)
%(buf)s[%(pos)s] = %(r_2)s;
if(%(pos)s + 1 < %(N)s)
%(buf)s[%(pos)s] = %(r_1)s;
}
}
}
""" % locals()
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
b='', stride_b=''):
return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
lambda a, b: "max(%s, %s)" % (a, b),
lambda a: a,
b, stride_b)
@code_version((1,) + inline_reduce_max.code_version +
inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x,
sm, sm_stride,
threadPos, threadCount,
b='', stride_b=''):
"""
:param N: length of the buffer, at least warpSize (32).
:param buf: a shared memory buffer of size warpSize * sizeof(float)
:param x: a ptr to the gpu memory where the row is stored
:param stride_x: the stride between each element in x
:param sm: a ptr to the gpu memory to store the result
:param sm_stride: the stride between each sm element
:param threadPos: index of executing thread
:param threadCount: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:Precondition: buf is empty
:Postcondition: sm contains the softmax of the row (buf is used
as scratch)
:note: buf should be in gpu shared memory, we access it many times.
:note2: We use tx as an int variable in a loop
"""
ret = [
#get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x,
threadPos, threadCount, b, stride_b),
'__syncthreads()',
'float row_max = ' + buf + '[0]',
'__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
lambda a, b: "%s + %s" % (a, b),
lambda a: "exp(%s - row_max)" % a,
b, stride_b),
'__syncthreads()',
'float row_sum = ' + buf + '[0]',
'__syncthreads()',
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
]
# This set all value correctly
if b:
ret += [
"%(sm)s[tx * %(sm_stride)s] = "
" exp(%(x)s[tx * %(stride_x)s] +"
" %(b)s[tx * %(stride_b)s] - row_max)"
" / row_sum" % locals()]
else:
ret += [
"%(sm)s[tx * %(sm_stride)s] = "
"exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals()]
ret += [
"}",
'__syncthreads()',
]
return ret
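# Reference (illustrative) for the three strided passes over global
# memory that the statement list above makes, instead of staging the
# whole row in shared memory first:
import numpy
def _softmax_fixed_shared_ref(x_row, b_row=None):
    v = x_row + b_row if b_row is not None else x_row
    row_max = v.max()                        # pass 1: reduction into buf[0]
    row_sum = numpy.exp(v - row_max).sum()   # pass 2: sum of exp
    return numpy.exp(v - row_max) / row_sum  # pass 3: written to sm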
from theano import Op, Apply
import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
@@ -111,7 +109,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(%(x)s)[0] !=
CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
{
PyErr_SetString(PyExc_ValueError,
"dimension mismatch in x,y_idx arguments");
@@ -124,56 +123,73 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
%(fail)s;
}
if ((NULL == %(nll)s) //initial condition
|| (CudaNdarray_HOST_DIMS(%(nll)s)[0] !=
CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
{
Py_XDECREF(%(nll)s);
%(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1,
CudaNdarray_HOST_DIMS(%(y_idx)s));
if(!%(nll)s)
{
%(fail)s;
}
}
if ((NULL == %(sm)s)
|| (CudaNdarray_HOST_DIMS(%(sm)s)[0] !=
CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(%(sm)s)[1] !=
CudaNdarray_HOST_DIMS(%(x)s)[1]))
{
Py_XDECREF(%(sm)s);
%(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2,
CudaNdarray_HOST_DIMS(%(x)s));
if(!%(sm)s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc sm output");
// no need to decref cnda_nll, the cleanup code will do it
%(fail)s;
}
}
if ((NULL == %(am)s)
|| (CudaNdarray_HOST_DIMS(%(am)s)[0] !=
CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
{
Py_XDECREF(%(am)s);
%(am)s = (CudaNdarray*) CudaNdarray_NewDims(1,
CudaNdarray_HOST_DIMS(%(y_idx)s));
if(!%(am)s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc am output");
// no need to decref nll and sm,
// the cleanup code will do it
%(fail)s;
}
}
{
int n_blocks = CudaNdarray_HOST_DIMS(%(sm)s)[0];
//TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1;
int n_shared_bytes = 0; //n_threads * sizeof(float);
k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(b)s),
CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(%(y_idx)s),
CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
CudaNdarray_DEV_DATA(%(nll)s),
CudaNdarray_HOST_STRIDES(%(nll)s)[0],
CudaNdarray_DEV_DATA(%(sm)s),
CudaNdarray_HOST_STRIDES(%(sm)s)[0],
CudaNdarray_HOST_STRIDES(%(sm)s)[1],
CudaNdarray_DEV_DATA(%(am)s),
CudaNdarray_HOST_STRIDES(%(am)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
@@ -181,7 +197,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %(classname)s %(nodename)s: %%s.\\n",
cudaGetErrorString(err));
// no need to decref output vars, the cleanup code will do it
%(fail)s;
}
}
@@ -203,7 +219,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
nout = 1
"""Gradient wrt x of the CrossentropySoftmax1Hot Op"""
def __init__(self, **kwargs):
Op.__init__(self, **kwargs)
def __eq__(self, other):
return type(self) == type(other)
@@ -233,26 +249,33 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(%(dnll)s)[0] !=
CudaNdarray_HOST_DIMS(%(sm)s)[0])
{
PyErr_Format(PyExc_ValueError, "dnll.shape[0] == %%i, but sm.shape[0] == %%i",
CudaNdarray_HOST_DIMS(%(dnll)s)[0],CudaNdarray_HOST_DIMS(%(sm)s)[0]);
PyErr_Format(PyExc_ValueError,
"dnll.shape[0] == %%i, but sm.shape[0] == %%i",
CudaNdarray_HOST_DIMS(%(dnll)s)[0],
CudaNdarray_HOST_DIMS(%(sm)s)[0]);
%(fail)s;
}
if (CudaNdarray_HOST_DIMS(%(dnll)s)[0] !=
CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
{
PyErr_SetString(PyExc_ValueError,
"dnll.shape[0] != y_idx.shape[0]");
%(fail)s;
}
if ((NULL == %(dx)s)
|| (CudaNdarray_HOST_DIMS(%(dx)s)[0] !=
CudaNdarray_HOST_DIMS(%(sm)s)[0])
|| (CudaNdarray_HOST_DIMS(%(dx)s)[1] !=
CudaNdarray_HOST_DIMS(%(sm)s)[1]))
{
Py_XDECREF(%(dx)s);
%(dx)s = (CudaNdarray*)CudaNdarray_New();
if ((NULL == %(dx)s)
|| CudaNdarray_alloc_contiguous(%(dx)s, 2,
CudaNdarray_HOST_DIMS(%(sm)s)))
{
Py_XDECREF(%(dx)s);
%(dx)s = NULL;
@@ -314,13 +337,16 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
{
if (y_i == j)
{
dx[i * dx_s0 + j * dx_s1] =
dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0);
}
else
{
dx[i * dx_s0 + j * dx_s1] =
dnll_i * sm[i * sm_s0 + j * sm_s1];
}
//dx[i * dx_s0 + j * dx_s1] =
// dnll_i * sm[i * sm_s0 + j * sm_s1];
//dx[i*dx_s0+j*dx_s1] = 0;
}
}
@@ -350,8 +376,7 @@ class GpuSoftmax (GpuOp):
return shape
def c_code_cache_version(self):
#return ()
return (9,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub):
x, = inp
@@ -364,8 +389,10 @@ class GpuSoftmax (GpuOp):
%(fail)s;
}
if ((NULL == %(z)s) ||
(CudaNdarray_HOST_DIMS(%(z)s)[0] !=
CudaNdarray_HOST_DIMS(%(x)s)[0]) ||
(CudaNdarray_HOST_DIMS(%(z)s)[1] !=
CudaNdarray_HOST_DIMS(%(x)s)[1]))
{
Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*)CudaNdarray_New();
@@ -379,13 +406,19 @@ class GpuSoftmax (GpuOp):
}
}
{
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
32 * 1024);
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] *
2 * sizeof(float);
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
{
//These numbers are based on older GPUs,
//so we stay compatible with more of them.
//TODO: read the information from the card.
if(n_shared_bytes < (32 * 1024 - 500)){
kSoftmax_%(nodename)s
<<<
n_blocks,
@@ -403,31 +436,52 @@ class GpuSoftmax (GpuOp):
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
}else{
kSoftmax_fixed_shared%(nodename)s
<<<
n_blocks,
n_threads,
n_threads * sizeof(float)
>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
}
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n Used %%d blocks,"
" %%d threads %%d bytes of shared memory",
"kSoftmax[_fixed_shared]%(nodename)s",
cudaGetErrorString(err),
n_blocks, n_threads, n_shared_bytes);
%(fail)s;
}
}
}
assert(%(z)s);
""" % locals()
def c_support_code_apply(self, node, nodename):
return nvcc_kernel("kSoftmax_%s" % nodename,
ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]",
"buf2[tx] = buf[tx]",
@@ -441,7 +495,24 @@ class GpuSoftmax (GpuOp):
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const float *x_ptr = &x[blockIDX * sx0]",
"float *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
'sm_ptr', 'sm_s1',
'threadIdx.x', 'blockDim.x'),
"__syncthreads()",
"}",
])
return ret1 + "\n" + ret2
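# A model of the blockIDX loop both kernels share: at most 32*1024 blocks
# are launched, and each block strides over rows blockIdx.x,
# blockIdx.x + gridDim.x, ... so larger matrices are still fully covered.
def _rows_for_block(block_idx, n_rows, n_blocks):
    return list(range(block_idx, n_rows, n_blocks))
assert _rows_for_block(0, 70000, 32 * 1024) == [0, 32768, 65536]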
gpu_softmax = GpuSoftmax()
@@ -470,7 +541,7 @@ class GpuSoftmaxWithBias (GpuOp):
def c_code_cache_version(self):
#return ()
return (8,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub):
x, b = inp
@@ -487,20 +558,27 @@ class GpuSoftmaxWithBias (GpuOp):
PyErr_SetString(PyExc_ValueError, "rank error for the bias");
%(fail)s;
}
if ((CudaNdarray_HOST_DIMS(%(x)s)[1] !=
CudaNdarray_HOST_DIMS(%(b)s)[0]))
{
PyErr_Format(PyExc_ValueError, "number of columns in x (%%ld) does not match length of b (%%ld)",
(long int)CudaNdarray_HOST_DIMS(%(x)s)[1], (long int)CudaNdarray_HOST_DIMS(%(b)s)[0]);
PyErr_Format(PyExc_ValueError,
"number of columns in x (%%ld)"
" does not match length of b (%%ld)",
(long int)CudaNdarray_HOST_DIMS(%(x)s)[1],
(long int)CudaNdarray_HOST_DIMS(%(b)s)[0]);
%(fail)s;
}
if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
CudaNdarray_HOST_DIMS(%(x)s)[1]))
{
Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*)CudaNdarray_New();
if ((NULL == %(z)s)
|| CudaNdarray_alloc_contiguous(%(z)s, 2,
CudaNdarray_HOST_DIMS(%(x)s)))
{
Py_XDECREF(%(z)s);
%(z)s = NULL;
@@ -511,13 +589,13 @@ class GpuSoftmaxWithBias (GpuOp):
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024);
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] *
2 * sizeof(float);
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
{
if(n_shared_bytes < (32 * 1024 - 500)){
kSoftmaxWithBias_%(nodename)s
<<<
n_blocks,
n_threads,
n_shared_bytes
@@ -536,6 +614,28 @@ class GpuSoftmaxWithBias (GpuOp):
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
}else{
kSoftmaxWithBias_fixed_shared%(nodename)s
<<<
n_blocks,
n_threads,
n_threads * sizeof(float)
>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(b)s),
CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
}
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
@@ -552,27 +652,52 @@ class GpuSoftmaxWithBias (GpuOp):
""" % locals()
def c_support_code_apply(self, node, nodename):
return nvcc_kernel("kSoftmaxWithBias_%s"%nodename,
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]",
"buf[tx] += b[tx * sb0]",
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * ssm0 + tx * ssm1] = buf[tx]",
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const float * x',
'const int sx0', 'const int sx1',
'const float * b', 'const int sb0',
'float * sm',
'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const float *x_ptr = &x[blockIDX * sx0]",
"float *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf',
'x_ptr', 'sx1',
'sm_ptr', 'sm_s1',
'threadIdx.x',
'blockDim.x',
'b', 'sb0'),
"__syncthreads()",
"}",
])
return ret1 + "\n" + ret2
gpu_softmax_with_bias = GpuSoftmaxWithBias()
@@ -172,8 +172,8 @@ def test_softmax_with_bias():
x = T.fmatrix('x')
# We can't use zeros_like(x[0,::]) as this doesn't allow testing with
# a 0 shape.
z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
dtype='float32')[::2])
f = theano.function([x], z, mode=mode_without_gpu)
f_gpu = theano.function([x], z, mode=mode_with_gpu)
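# Presumably the point of the stride-2 slice: it gives the bias a
# non-unit stride, so the kernels' bias stride argument (sb0) is
# exercised, which a freshly allocated vector of zeros never did.
b_host = numpy.arange(10 * 2, dtype='float32')[::2]
assert b_host.strides == (8,)  # two float32s apart, non-contiguous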
@@ -181,24 +181,12 @@ def test_softmax_with_bias():
assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
cuda.nnet.GpuSoftmaxWithBias)
def cmp(n, m):
#print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data)
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
cmp(2, 5)
#we need to test n>32*1024 to check that we make the block loop.
@@ -211,7 +199,11 @@ def test_softmax_with_bias():
cmp(4, 2000)
cmp(4, 2024)
#The GTX285 doesn't have enough shared mem for this case.
cmp(4, 4074)
# The GTX 580, 680 and Kepler cards don't have enough shared memory.
cmp(2, 10000)
cmp(128, 16 * 1024)
cmp(128, 64 * 1024)
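# Rough shared-memory math for the new large-row cases (float32,
# double-buffered): every size below needs more than the ~32KB budget,
# so each one exercises the kSoftmaxWithBias_fixed_shared kernel.
for _m in (4074, 10000, 16 * 1024, 64 * 1024):
    assert _m * 2 * 4 >= 32 * 1024 - 500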
def test_softmax():
@@ -219,9 +211,7 @@ def test_softmax():
This is a basic test for GpuSoftmax.
We check that we loop when there are too many blocks.
We use the slower code path when there isn't enough shared memory.
"""
x = T.fmatrix('x')
@@ -232,25 +222,12 @@ def test_softmax():
assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
cuda.nnet.GpuSoftmax)
def cmp(n, m):
#print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data)
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
#we need to test n>32*1024 to check that we make the block loop.
cmp(2, 5)
@@ -262,5 +239,9 @@ def test_softmax():
cmp(4, 1024)
cmp(4, 2000)
cmp(4, 2024)
# The GTX285 doesn't have enough shared memory.
cmp(4, 4074)
# The GTX 580, 680 and Kepler cards don't have enough shared memory.
cmp(2, 10000)
cmp(128, 16 * 1024)
cmp(128, 64 * 1024)