Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
ff0abb5f
提交
ff0abb5f
authored
2月 14, 2014
作者:
abergeron
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1726 from carriepl/master
Conversion of GpuSoftmax and GpuSoftmaxWithBias to the new backend
上级
db4352dc
eaab9d97
显示空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
878 行增加
和
3 行删除
+878
-3
kernel_codegen.py
theano/sandbox/gpuarray/kernel_codegen.py
+320
-0
nnet.py
theano/sandbox/gpuarray/nnet.py
+416
-2
opt.py
theano/sandbox/gpuarray/opt.py
+12
-1
test_nnet.py
theano/sandbox/gpuarray/tests/test_nnet.py
+130
-0
没有找到文件。
theano/sandbox/gpuarray/kernel_codegen.py
0 → 100644
浏览文件 @
ff0abb5f
""" Helper routines for generating gpu kernels for nvcc.
"""
def nvcc_kernel(name, params, body):
    """Return the C code of a kernel function.

    :param name: name given to the generated ``__global__`` function
    :param params: the parameters to the function as one or more strings
    :param body: the [nested] list of statements for the body of the
                 function. These will be separated by ';' characters.
    """
    paramstr = ', '.join(params)
    # Flatten one level of nesting: each item of `body` is either a plain
    # statement string or a list/tuple of statement strings.
    statements = []
    for item in body:
        if isinstance(item, (list, tuple)):
            statements.extend(item)
        else:
            statements.append(item)
    bodystr = ';\n'.join(statements)
    return """__global__ void %(name)s (%(paramstr)s)
    {
        %(bodystr)s;
    }
    """ % locals()
def code_version(version):
    """Decorator tagging a code-generating function with a cache version.

    The tuple is stored on the function as ``f.code_version`` so that
    callers (e.g. ``c_code_cache_version`` implementations) can combine
    the versions of the helpers they rely on.

    :param version: a tuple identifying the generated-code version
    :raises TypeError: if `version` is not a tuple
    """
    if not isinstance(version, tuple):
        raise TypeError('version must be tuple', version)

    def attach(fn):
        fn.code_version = version
        return fn
    return attach


# Sentinel version for helpers whose generated code is not versioned.
UNVERSIONED = ()
@code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn):
    """Return C++ code for a function that reduces a contiguous buffer.

    :param N: length of the buffer
    :param buf: buffer pointer
    :param pos: index of executing thread
    :param count: number of executing threads
    :param manner_fn: a function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).

    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.

    :note: buf should be in gpu shared memory, we access it many times.
    """
    cur = "%s[%s]" % (buf, pos)

    def step(other):
        # Combine this thread's current slot with `other` via manner_fn.
        return manner_fn(cur, other)

    loop_line = step("%s[i]" % buf)
    r_16 = step("%s[%s+16]" % (buf, pos))
    r_8 = step("%s[%s+8]" % (buf, pos))
    r_4 = step("%s[%s+4]" % (buf, pos))
    r_2 = step("%s[%s+2]" % (buf, pos))
    r_1 = step("%s[%s+1]" % (buf, pos))
    return """
    {
        // This function trashes buf[1..warpSize],
        // leaving the reduction result in buf[0].
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the sum of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce.code_version)
def inline_reduce_max(N, buf, pos, count):
    """Reduce a shared-memory buffer with the C ``max`` function.

    See :func:`inline_reduce` for the meaning of the parameters.
    """
    def combine(a, b):
        return "max(%s, %s)" % (a, b)
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_sum(N, buf, pos, count):
    """Reduce a shared-memory buffer with the ``+`` operator.

    See :func:`inline_reduce` for the meaning of the parameters.
    """
    def combine(a, b):
        return "%s + %s" % (a, b)
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_min(N, buf, pos, count):
    """Reduce a shared-memory buffer with the C ``min`` function.

    See :func:`inline_reduce` for the meaning of the parameters.
    """
    def combine(a, b):
        return "min(%s, %s)" % (a, b)
    return inline_reduce(N, buf, pos, count, combine)
@code_version(inline_reduce.code_version)
def inline_reduce_prod(N, buf, pos, count):
    """Reduce a shared-memory buffer with the ``*`` operator.

    See :func:`inline_reduce` for the meaning of the parameters.
    """
    def combine(a, b):
        return "%s * %s" % (a, b)
    return inline_reduce(N, buf, pos, count, combine)
@code_version((2,) + inline_reduce_max.code_version +
              inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
    """Return the list of C statements of an in-shared-memory softmax.

    :param N: length of the buffer
    :param threadPos: index of executing thread
    :param threadCount: number of executing threads
    :param dtype: dtype of the softmax's output

    :Precondition: buf and buf2 contain two identical copies of the input
        to softmax
    :Postcondition: buf contains the softmax, buf2 contains un-normalized
        softmax

    :note: buf and buf2 should be in gpu shared memory, we access it many
        times
    :note2: We use __i as an int variable in a loop
    """
    # Strided loop header shared by the exp() pass and the divide pass.
    inner_loop = 'for(int __i=%s; __i<%s; __i+=%s){' % (threadPos, N,
                                                        threadCount)
    return [
        # get max of buf (trashing all but buf[0])
        inline_reduce_max(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'npy_%s row_max = %s[0]' % (dtype, buf),
        '__syncthreads()',
        inner_loop,
        '%s[__i] = exp(%s[__i] - row_max)' % (buf, buf2),
        '%s[__i] = %s[__i]' % (buf2, buf),
        '}',
        '__syncthreads()',
        inline_reduce_sum(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'npy_%s row_sum = %s[0]' % (dtype, buf),
        '__syncthreads()',
        # divide each exp() result by the sum to complete the job.
        inner_loop,
        '%s[__i] = %s[__i] / row_sum' % (buf, buf2),
        '}',
        '__syncthreads()',
    ]
@code_version((1,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b='', dtype='float32'):
    """Return C++ code for a function that reduces a strided row of `x`.

    :param N: length of the buffer
    :param buf: buffer pointer of size warpSize * sizeof(dtype)
    :param pos: index of executing thread
    :param count: number of executing threads
    :param b: Optional, pointer to the bias
    :param stride_b: Optional, the stride of b if b is provided
    :param dtype: Optional, the dtype of the output
    :param manner_fn: a function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).
    :param manner_init: a function that accepts strings of arguments a
        and return c code for its initialization

    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.

    :note: buf should be in gpu shared memory, we access it many times.
    """
    if b:
        # With a bias, each loaded element is x[...] + b[...].
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s] +"
                           " %(b)s[%(pos)s * %(stride_b)s]" % locals())
        loop_line = manner_fn("red",
                              manner_init("%(x)s[i * %(stride_x)s] + "
                                          "%(b)s[i * %(stride_b)s]" %
                                          locals()))
    else:
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
        loop_line = manner_fn("red",
                              manner_init("%(x)s[i * %(stride_x)s]" %
                                          locals()))

    cur = "%s[%s]" % (buf, pos)

    def fold(other):
        # Merge `other` into this thread's slot of the shared buffer.
        return manner_fn(cur, other)

    loop_line2 = fold("%s[i]" % buf)
    r_16 = fold("%s[%s+16]" % (buf, pos))
    r_8 = fold("%s[%s+8]" % (buf, pos))
    r_4 = fold("%s[%s+4]" % (buf, pos))
    r_2 = fold("%s[%s+2]" % (buf, pos))
    r_1 = fold("%s[%s+1]" % (buf, pos))
    return """
    {
        // This function trashes buf[1..n_threads],
        // leaving the reduction result in buf[0].
        npy_%(dtype)s red = %(init)s;
        #pragma unroll 16
        for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
            red = %(loop_line)s;
        }
        buf[%(pos)s] = red;
        __syncthreads();
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line2)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the reduction of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
                                   b='', stride_b='', dtype='float32'):
    """Max-reduction of a strided row of `x`, optionally biased by `b`.

    See :func:`inline_reduce_fixed_shared` for the parameters.
    """
    def combine(a, b):
        return "max(%s, %s)" % (a, b)
    return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                                      combine, lambda a: a,
                                      b, stride_b, dtype)
# FIX: the cache version previously tracked inline_reduce_max and
# inline_reduce_sum, but this function never inlines those helpers -- it
# inlines the *_fixed_shared reducers below.  Track the helpers actually
# used so bumping their versions invalidates kernels generated from here.
# The resulting tuple is (1, 1, 1) today, same as before, so cached
# compilations stay valid.
@code_version((1,) +
              inline_reduce_fixed_shared_max.code_version +
              inline_reduce_fixed_shared.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x,
                                sm, sm_stride,
                                threadPos, threadCount,
                                b='', stride_b='',
                                dtype="float32"):
    """Return the list of C statements of a softmax over a strided row.

    :param N: length of the buffer, at least warpSize (32).
    :param buf: a shared memory buffer of size warpSize * sizeof(dtype)
    :param x: a ptr to the gpu memory where the row is stored
    :param stride_x: the stride between each element in x
    :param sm: a ptr to the gpu memory to store the result
    :param sm_stride: the stride between each sm element
    :param threadPos: index of executing thread
    :param threadCount: number of executing threads
    :param b: Optional, pointer to the bias
    :param stride_b: Optional, the stride of b if b is provided
    :param dtype: Optional, the dtype of the softmax's output if not float32

    :Precondition: buf is empty
    :Postcondition: buf[0] contains the softmax,
        buf2 contains un-normalized softmax

    :note: buf should be in gpu shared memory, we access it many times.
    :note2: We use tx as an int variable in a loop
    """
    ret = [
        # get max of buf (trashing all but buf[0])
        inline_reduce_fixed_shared_max(N, buf, x, stride_x,
                                       threadPos, threadCount,
                                       b, stride_b, dtype),
        '__syncthreads()',
        ('npy_%s row_max = ' + buf + '[0]') % dtype,
        '__syncthreads()',
        # sum of exp(x - row_max), reading x (and b) again from gpu memory
        inline_reduce_fixed_shared(N, buf, x, stride_x,
                                   threadPos, threadCount,
                                   lambda a, b: "%s + %s" % (a, b),
                                   lambda a: "exp(%s - row_max)" % a,
                                   b, stride_b, dtype),
        '__syncthreads()',
        ('npy_%s row_sum = ' + buf + '[0]') % dtype,
        '__syncthreads()',
        "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
    ]
    # This set all value correctly
    if b:
        ret += ["%(sm)s[tx * %(sm_stride)s] = "
                " exp(%(x)s[tx * %(stride_x)s] +"
                "%(b)s[tx * %(stride_b)s] - row_max)"
                " / row_sum" % locals()]
    else:
        ret += ["%(sm)s[tx * %(sm_stride)s] = "
                "exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" %
                locals()]
    ret += [
        "}",
        '__syncthreads()',
    ]
    return ret
theano/sandbox/gpuarray/nnet.py
浏览文件 @
ff0abb5f
import
numpy
import
numpy
from
theano
import
Op
,
Apply
from
theano
import
Op
,
Apply
,
config
from
theano.compat.six
import
StringIO
from
theano.compat.six
import
StringIO
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
...
@@ -14,6 +13,10 @@ except ImportError:
...
@@ -14,6 +13,10 @@ except ImportError:
from
theano.sandbox.gpuarray.basic_ops
import
as_gpuarray_variable
from
theano.sandbox.gpuarray.basic_ops
import
as_gpuarray_variable
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.kernel_codegen
import
(
nvcc_kernel
,
inline_softmax
,
inline_softmax_fixed_shared
)
class
GpuCrossentropySoftmaxArgmax1HotWithBias
(
Op
):
class
GpuCrossentropySoftmaxArgmax1HotWithBias
(
Op
):
...
@@ -440,3 +443,413 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
...
@@ -440,3 +443,413 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
return
[
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'
]
return
[
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'
]
gpu_crossentropy_softmax_1hot_with_bias_dx
=
GpuCrossentropySoftmax1HotWithBiasDx
()
gpu_crossentropy_softmax_1hot_with_bias_dx
=
GpuCrossentropySoftmax1HotWithBiasDx
()
class GpuSoftmax(Op):
    """
    Implement Softmax on the gpu.
    """
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x):
        x = as_gpuarray_variable(x)
        return Apply(self, [x], [x.type()])

    def infer_shape(self, node, shape):
        # Softmax normalizes each row in place: output shape == input shape.
        return shape

    def c_code_cache_version(self):
        # Combine this Op's version with the inlined helper's version so
        # changes to the generated kernel invalidate the cache.
        return (12,) + inline_softmax.code_version

    def c_headers(self):
        return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>',
                '<compyte/ext_cuda.h>']

    def c_compiler(self):
        return NVCC_compiler

    def c_init_code(self):
        return ['setup_ext_cuda();']

    def c_code(self, node, nodename, inp, out, sub):
        """Return C code allocating z (if needed) and launching the kernel."""
        dtype_x = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        itemsize_x = numpy.dtype(dtype_x).itemsize
        itemsize_z = numpy.dtype(dtype_z).itemsize
        typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        x, = inp
        z, = out
        fail = sub['fail']
        if config.gpuarray.sync:
            # BUG FIX: this previously read ``dict(zz=zz)``; ``zz`` is not
            # defined in this scope (the output variable is ``z``), so
            # enabling config.gpuarray.sync raised a NameError.
            cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=z)
        else:
            cnda_thread_sync = ""
        return """
        if (PyGpuArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if ((NULL == %(z)s) ||
            (PyGpuArray_DIMS(%(z)s)[0] !=
             PyGpuArray_DIMS(%(x)s)[0]) ||
            (PyGpuArray_DIMS(%(z)s)[1] !=
             PyGpuArray_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
                                %(typecode)s,
                                GA_C_ORDER,
                                pygpu_default_context(), Py_None);
            if (!%(z)s) {
                %(fail)s
            }
        }
        {
            int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0],
                                    (size_t)(32 * 1024));
            //TODO, detect the maximum number of thread per block.
            int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512);
            int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
                                 2 * sizeof(npy_%(dtype_x)s);
            if (PyGpuArray_DIMS(%(x)s)[0] > 0)
            {
                //Those numbers are based on not too recent GPU
                //to make them compatible with more GPU.
                //TODO: read the information from the card.
                if(n_shared_bytes < (32 * 1024 - 500)){
                    kSoftmax_%(nodename)s
                        <<<
                            n_blocks,
                            n_threads,
                            n_shared_bytes
                        >>>(
                            PyGpuArray_DIMS(%(x)s)[0],
                            PyGpuArray_DIMS(%(x)s)[1],
                            (npy_%(dtype_x)s*)(
                                ((char *)cuda_get_ptr(%(x)s->ga.data)) +
                                %(x)s->ga.offset),
                            PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
                            PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
                            (npy_%(dtype_z)s*)(
                                ((char *)cuda_get_ptr(%(z)s->ga.data)) +
                                %(z)s->ga.offset),
                            PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
                            PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
                        );
                }else{
                    kSoftmax_fixed_shared%(nodename)s
                        <<<
                            n_blocks,
                            n_threads,
                            n_threads * sizeof(npy_%(dtype_x)s)
                        >>>(
                            PyGpuArray_DIMS(%(x)s)[0],
                            PyGpuArray_DIMS(%(x)s)[1],
                            (npy_%(dtype_x)s*)(
                                ((char *)cuda_get_ptr(%(x)s->ga.data)) +
                                %(x)s->ga.offset),
                            PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
                            PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
                            (npy_%(dtype_z)s*)(
                                ((char *)cuda_get_ptr(%(z)s->ga.data)) +
                                %(z)s->ga.offset),
                            PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
                            PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
                        );
                }
                %(cnda_thread_sync)s
                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError,
                                 "Cuda error: %%s: %%s.\\n Used %%d blocks,"
                                 " %%d threads %%d bytes of shared memory",
                                 "kSoftmax[_fixed_shared]%(nodename)s",
                                 cudaGetErrorString(err),
                                 n_blocks, n_threads, n_shared_bytes);
                    %(fail)s;
                }
            }
        }
        assert(%(z)s);
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        """Generate the two CUDA kernels launched by c_code.

        ``kSoftmax_`` stages the whole row (twice) in shared memory;
        ``kSoftmax_fixed_shared`` only needs warpSize shared elements and
        is used when the row is too large for shared memory.
        """
        dtype_x = node.inputs[0].dtype
        dtype_sm = node.outputs[0].dtype
        ret1 = nvcc_kernel(
            "kSoftmax_%s" % nodename,
            params=['int M', 'int N',
                    'const npy_%(dtype_x)s * x',
                    'const int sx0', 'const int sx1',
                    'npy_%(dtype_sm)s * sm',
                    'const int sm_s0', 'const int sm_s1'],
            body=[
                "extern __shared__ npy_%(dtype_sm)s buf[]",
                "npy_%(dtype_sm)s * buf2 = buf + N",
                "for (int blockIDX = blockIdx.x; blockIDX < M;"
                "     blockIDX += gridDim.x){",
                "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                "buf2[tx] = buf[tx]",
                "}",
                "__syncthreads()",
                inline_softmax('N', 'buf', 'buf2',
                               'threadIdx.x', 'blockDim.x', dtype_sm),
                "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                # This set all value correctly
                "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
                "}",
                "__syncthreads()",
                "}",
            ])
        ret2 = nvcc_kernel(
            "kSoftmax_fixed_shared%s" % nodename,
            params=['int M', 'int N',
                    'const npy_%(dtype_x)s * x',
                    'const int sx0', 'const int sx1',
                    'npy_%(dtype_sm)s * sm',
                    'const int sm_s0', 'const int sm_s1'],
            body=[
                "extern __shared__ npy_%(dtype_sm)s buf[]",
                "for (int blockIDX = blockIdx.x; blockIDX < M;"
                "     blockIDX += gridDim.x){",
                "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
                "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
                inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
                                            'sm_ptr', 'sm_s1',
                                            'threadIdx.x', 'blockDim.x',
                                            dtype=dtype_sm),
                "__syncthreads()",
                "}",
            ])
        return (ret1 + "\n" + ret2) % locals()


gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias(Op):
    """
    Implement SoftmaxWithBias on the gpu.
    """
    # number of inputs (x, b) and outputs (sm)
    nin = 2
    nout = 1

    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x, b):
        x = as_gpuarray_variable(x)
        b = as_gpuarray_variable(b)
        return Apply(self, [x, b], [x.type()])

    def infer_shape(self, node, shape):
        # Output has the shape of x; the bias shape does not matter here.
        return [shape[0]]

    def c_code_cache_version(self):
        # Combine this Op's version with the inlined helper's version so
        # changes to the generated kernel invalidate the cache.
        return (11,) + inline_softmax.code_version

    def c_headers(self):
        return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>',
                '<compyte/ext_cuda.h>']

    def c_compiler(self):
        return NVCC_compiler

    def c_init_code(self):
        return ['setup_ext_cuda();']

    def c_code(self, node, nodename, inp, out, sub):
        """Return C code validating x/b, allocating z, launching the kernel."""
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_z = node.outputs[0].dtype
        itemsize_x = numpy.dtype(dtype_x).itemsize
        itemsize_b = numpy.dtype(dtype_b).itemsize
        itemsize_z = numpy.dtype(dtype_z).itemsize
        typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        x, b = inp
        z, = out
        fail = sub['fail']
        if config.gpuarray.sync:
            # BUG FIX: this previously read ``dict(zz=zz)``; ``zz`` is not
            # defined in this scope (the output variable is ``z``), so
            # enabling config.gpuarray.sync raised a NameError.
            cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=z)
        else:
            cnda_thread_sync = ""
        return """
        if (PyGpuArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error input");
            %(fail)s;
        }
        if (PyGpuArray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "rank error for the bias");
            %(fail)s;
        }
        if ((PyGpuArray_DIMS(%(x)s)[1] !=
             PyGpuArray_DIMS(%(b)s)[0]))
        {
            PyErr_Format(PyExc_ValueError,
                         "number of columns in x (%%ld)"
                         " does not match length of b (%%ld)",
                         (long int)PyGpuArray_DIMS(%(x)s)[1],
                         (long int)PyGpuArray_DIMS(%(b)s)[0]);
            %(fail)s;
        }
        if ((NULL == %(z)s)
            || (PyGpuArray_DIMS(%(z)s)[0] !=
                PyGpuArray_DIMS(%(x)s)[0])
            || (PyGpuArray_DIMS(%(z)s)[1] !=
                PyGpuArray_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
                                %(typecode)s,
                                GA_C_ORDER,
                                pygpu_default_context(), Py_None);
            if (!%(z)s) {
                %(fail)s
            }
        }
        {
            int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)(32*1024));
            //TODO, detect the maximum number of thread per block.
            int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512);
            int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
                                 2 * sizeof(npy_%(dtype_x)s);
            if (PyGpuArray_DIMS(%(x)s)[0] > 0)
            {
                if(n_shared_bytes < (32 * 1024 - 500)){
                    kSoftmaxWithBias_%(nodename)s
                        <<<
                            n_blocks,
                            n_threads,
                            n_shared_bytes
                        >>>(
                            PyGpuArray_DIMS(%(x)s)[0],
                            PyGpuArray_DIMS(%(x)s)[1],
                            (npy_%(dtype_x)s*)(
                                ((char *)cuda_get_ptr(%(x)s->ga.data)) +
                                %(x)s->ga.offset),
                            PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
                            PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
                            (npy_%(dtype_b)s*)(
                                ((char *)cuda_get_ptr(%(b)s->ga.data)) +
                                %(b)s->ga.offset),
                            PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
                            (npy_%(dtype_z)s*)(
                                ((char *)cuda_get_ptr(%(z)s->ga.data)) +
                                %(z)s->ga.offset),
                            PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
                            PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
                        );
                }else{
                    kSoftmaxWithBias_fixed_shared%(nodename)s
                        <<<
                            n_blocks,
                            n_threads,
                            n_threads * sizeof(npy_%(dtype_x)s)
                        >>>(
                            PyGpuArray_DIMS(%(x)s)[0],
                            PyGpuArray_DIMS(%(x)s)[1],
                            (npy_%(dtype_x)s*)(
                                ((char *)cuda_get_ptr(%(x)s->ga.data)) +
                                %(x)s->ga.offset),
                            PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
                            PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
                            (npy_%(dtype_b)s*)(
                                ((char *)cuda_get_ptr(%(b)s->ga.data)) +
                                %(b)s->ga.offset),
                            PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
                            (npy_%(dtype_z)s*)(
                                ((char *)cuda_get_ptr(%(z)s->ga.data)) +
                                %(z)s->ga.offset),
                            PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
                            PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
                        );
                }
                %(cnda_thread_sync)s
                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError,
                                 "Cuda error: %%s: %%s.\\n",
                                 "kSoftmaxWithBias_%(nodename)s",
                                 cudaGetErrorString(err));
                    %(fail)s;
                }
            }
        }
        assert(%(z)s);
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        """Generate the two CUDA kernels launched by c_code.

        Same structure as GpuSoftmax's kernels, with the bias added to
        each staged row element.
        """
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_sm = node.outputs[0].dtype
        ret1 = nvcc_kernel(
            "kSoftmaxWithBias_%s" % nodename,
            params=['int M', 'int N',
                    'const npy_%(dtype_x)s * x',
                    'const int sx0', 'const int sx1',
                    'const npy_%(dtype_b)s * b', 'const int sb0',
                    'npy_%(dtype_sm)s * sm',
                    'const int sm_s0', 'const int sm_s1'],
            body=[
                "extern __shared__ npy_%(dtype_sm)s buf[]",
                "npy_%(dtype_sm)s * buf2 = buf + N",
                "for (int blockIDX = blockIdx.x; blockIDX < M;"
                "     blockIDX += gridDim.x){",
                "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                "buf[tx] += b[tx * sb0]",
                "buf2[tx] = buf[tx]",
                "}",
                "__syncthreads()",
                inline_softmax('N', 'buf', 'buf2',
                               'threadIdx.x', 'blockDim.x', dtype_sm),
                "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
                "}",
                "__syncthreads()",
                "}",
            ])
        ret2 = nvcc_kernel(
            "kSoftmaxWithBias_fixed_shared%s" % nodename,
            params=['int M', 'int N',
                    'const npy_%(dtype_x)s * x',
                    'const int sx0', 'const int sx1',
                    'const npy_%(dtype_b)s * b', 'const int sb0',
                    'npy_%(dtype_sm)s * sm',
                    'const int sm_s0', 'const int sm_s1'],
            body=[
                "extern __shared__ npy_%(dtype_sm)s buf[]",
                "for (int blockIDX = blockIdx.x; blockIDX < M;"
                "     blockIDX += gridDim.x){",
                "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
                "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
                inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
                                            'sm_ptr', 'sm_s1',
                                            'threadIdx.x', 'blockDim.x',
                                            'b', 'sb0',
                                            dtype_sm),
                "__syncthreads()",
                "}",
            ])
        return (ret1 + "\n" + ret2) % locals()


gpu_softmax_with_bias = GpuSoftmaxWithBias()
\ No newline at end of file
theano/sandbox/gpuarray/opt.py
浏览文件 @
ff0abb5f
...
@@ -20,7 +20,9 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
...
@@ -20,7 +20,9 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
from
theano.sandbox.gpuarray.blas
import
gpu_dot22
,
GpuGemv
,
GpuGemm
from
theano.sandbox.gpuarray.blas
import
gpu_dot22
,
GpuGemv
,
GpuGemm
from
theano.sandbox.gpuarray.conv
import
GpuConv
from
theano.sandbox.gpuarray.conv
import
GpuConv
from
theano.sandbox.gpuarray.nnet
import
(
GpuCrossentropySoftmaxArgmax1HotWithBias
,
from
theano.sandbox.gpuarray.nnet
import
(
GpuCrossentropySoftmaxArgmax1HotWithBias
,
GpuCrossentropySoftmax1HotWithBiasDx
)
GpuCrossentropySoftmax1HotWithBiasDx
,
GpuSoftmaxWithBias
,
GpuSoftmax
)
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
_is_scalar
,
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
_is_scalar
,
GpuDimShuffle
,
GpuCAReduceCuda
)
GpuDimShuffle
,
GpuCAReduceCuda
)
from
theano.sandbox.gpuarray.subtensor
import
GpuIncSubtensor
,
GpuSubtensor
from
theano.sandbox.gpuarray.subtensor
import
GpuIncSubtensor
,
GpuSubtensor
...
@@ -341,6 +343,15 @@ def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
...
@@ -341,6 +343,15 @@ def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
):
def
local_gpua_crossentropysoftmax1hotwithbiasdx
(
node
):
return
GpuCrossentropySoftmax1HotWithBiasDx
()
return
GpuCrossentropySoftmax1HotWithBiasDx
()
@register_opt()
@op_lifter([tensor.nnet.Softmax])
def local_gpua_softmax(node):
    """Lift a CPU Softmax node to its GPU implementation."""
    return GpuSoftmax()
@register_opt()
@op_lifter([tensor.nnet.SoftmaxWithBias])
def local_gpua_softmaxwithbias(node):
    """Lift a CPU SoftmaxWithBias node to its GPU implementation."""
    return GpuSoftmaxWithBias()
@register_opt
()
@register_opt
()
@op_lifter
([
gpu_from_host
,
ConvOp
])
@op_lifter
([
gpu_from_host
,
ConvOp
])
...
...
theano/sandbox/gpuarray/tests/test_nnet.py
浏览文件 @
ff0abb5f
...
@@ -157,3 +157,132 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
...
@@ -157,3 +157,132 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
assert
False
,
"numpy.allclose(cpu_out, gpu_out, rtol=
%
s, atol=
%
s)"
%
(
assert
False
,
"numpy.allclose(cpu_out, gpu_out, rtol=
%
s, atol=
%
s)"
%
(
rtol
,
atol
)
rtol
,
atol
)
def test_softmax_with_bias_float32():
    """GpuSoftmaxWithBias with float32 input and float32 bias."""
    softmax_with_bias_unittest_template(dtypeBias='float32',
                                        dtypeInput='float32')
def test_softmax_with_bias_float64():
    """GpuSoftmaxWithBias for every dtype combination involving float64."""
    for dt_in, dt_bias in [('float32', 'float64'),
                           ('float64', 'float32'),
                           ('float64', 'float64')]:
        softmax_with_bias_unittest_template(dtypeInput=dt_in,
                                            dtypeBias=dt_bias)
def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
    """Basic test of GpuSoftmaxWithBias for the given input/bias dtypes.

    We check that we loop when there is too much block.
    TODO: check that we loop when there is too much thread. (THIS IS
    NOT IMPLEMENTED)

    :param dtypeInput: dtype of the input matrix ('float32' or 'float64')
    :param dtypeBias: dtype of the bias vector ('float32' or 'float64')
    """
    assert dtypeInput in ['float32', 'float64']
    assert dtypeBias in ['float32', 'float64']

    if dtypeInput == 'float32':
        x = T.fmatrix('x')
    elif dtypeInput == 'float64':
        x = T.dmatrix('x')

    # We can't use zeros_like(x[0,::]) as this don't allow to test with
    # 0 shape.  dtypeBias was validated above, so it is passed straight to
    # arange instead of duplicating this call in one branch per dtype.
    z = T.nnet.softmax_with_bias(
        x, T.arange(x.shape[1] * 2, dtype=dtypeBias)[::2])

    f = theano.function([x], z, mode=mode_without_gpu)
    f_gpu = theano.function([x], z, mode=mode_with_gpu)
    assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax_with_bias
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
                      theano.sandbox.gpuarray.nnet.GpuSoftmaxWithBias)

    def cmp(n, m):
        # Compare CPU and GPU results on an n x m arange input.
        data = numpy.arange(n * m, dtype=dtypeInput).reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        assert numpy.allclose(out, gout), numpy.absolute(out - gout)

    cmp(2, 5)
    # we need to test n>32*1024 to check that we make the block loop.
    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(0, 10)
    cmp(784, 784)
    cmp(4, 1000)
    cmp(4, 1024)
    cmp(4, 2000)
    cmp(4, 2024)
    # GTX285 don't have enough shared mem for this case.
    cmp(4, 4074)
    # The GTX580, 680 and kepler don't have enough shared memory.
    cmp(2, 10000)
    cmp(128, 16 * 1024)
    cmp(128, 64 * 1024)
def test_softmax_float32():
    """GpuSoftmax with float32 input."""
    softmax_unittest_template(dtypeInput='float32')
def test_softmax_float64():
    """GpuSoftmax with float64 input."""
    softmax_unittest_template(dtypeInput='float64')
def softmax_unittest_template(dtypeInput):
    """Basic test of GpuSoftmax for the given input dtype.

    We check that we loop when there is too much block.
    We use slower code when there isn't enough shared memory.

    :param dtypeInput: dtype of the input matrix ('float32' or 'float64')
    """
    assert dtypeInput in ['float32', 'float64']

    if dtypeInput == 'float32':
        x = T.fmatrix('x')
    elif dtypeInput == 'float64':
        x = T.dmatrix('x')
    z = T.nnet.softmax(x)

    f = theano.function([x], z, mode=mode_without_gpu)
    f_gpu = theano.function([x], z, mode=mode_with_gpu)
    assert f.maker.fgraph.toposort()[-1].op == T.nnet.softmax
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op,
                      theano.sandbox.gpuarray.nnet.GpuSoftmax)

    def cmp(n, m):
        # Compare CPU and GPU results on an n x m arange input.
        # dtypeInput was validated above, so it is used directly instead of
        # duplicating this call in one branch per dtype.
        data = numpy.arange(n * m, dtype=dtypeInput).reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        assert numpy.allclose(out, gout), numpy.absolute(out - gout)

    cmp(2, 5)
    # we need to test n>32*1024 to check that we make the block loop.
    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(0, 10)
    cmp(784, 784)
    cmp(4, 1000)
    cmp(4, 1024)
    cmp(4, 2000)
    cmp(4, 2024)
    # The GTX285 don't have enough shared memory.
    cmp(4, 4074)
    # The GTX580, 680 and kepler don't have enough shared memory.
    cmp(2, 10000)
    cmp(128, 16 * 1024)
    cmp(128, 64 * 1024)
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论