提交 ae8ce3c6 authored 作者: Pierre Luc Carrier's avatar Pierre Luc Carrier

Correct various bugs with ops GpuSoftmax and GpuSoftmaxWithBias. Current version…

Correct various bugs with ops GpuSoftmax and GpuSoftmaxWithBias. Current version passes tests with float32
上级 6ef5d264
""" Helper routines for generating gpu kernels for nvcc.
"""
def nvcc_kernel(name, params, body):
    """Return the C code of a ``__global__`` kernel function.

    :param name: name of the generated kernel function
    :param params: the parameters to the function as one or more strings
    :param body: the [nested] list of statements for the body of the
        function. These will be separated by ';' characters.

    :return: a string containing the full kernel definition.
    """
    paramstr = ', '.join(params)

    def flatbody(stmts):
        # Recursively flatten arbitrarily nested lists/tuples of
        # statements.  (The docstring promised nested lists, but the
        # previous implementation only flattened a single level.)
        for stmt in stmts:
            if isinstance(stmt, (list, tuple)):
                for sub in flatbody(stmt):
                    yield sub
            else:
                yield stmt

    bodystr = ';\n'.join(flatbody(body))
    return """__global__ void %(name)s (%(paramstr)s)
    {
        %(bodystr)s;
    }
    """ % locals()
def code_version(version):
    """Decorator attaching a cache-version tuple to a code generator.

    The tuple is stored on the decorated function as ``code_version`` so
    that callers can invalidate compiled-code caches when the generated
    code changes.
    """
    if not isinstance(version, tuple):
        raise TypeError('version must be tuple', version)

    def attach(fn):
        fn.code_version = version
        return fn

    return attach
# Empty version tuple for generators with no cacheable version --
# presumably used to opt a generator out of the version-based cache
# enabled by ``code_version``; confirm against callers.
UNVERSIONED = ()
@code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn):
    """Return C++ code for a function that reduces a contiguous buffer.

    All arguments except ``manner_fn`` are Python strings that are
    spliced verbatim into the generated CUDA code.

    :param N: length of the buffer
    :param buf: buffer pointer
    :param pos: index of executing thread
    :param count: number of executing threads (currently unused by the
        generated code, which relies on ``warpSize`` instead)
    :param manner_fn: a function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).

    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.

    :note: buf should be in gpu shared memory, we access it many times.
    """
    # Pairwise combination used in the strided accumulation loop:
    # buf[pos] <- manner_fn(buf[pos], buf[i]).
    loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
    # Combination steps for the final intra-warp tree reduction
    # (offsets 16, 8, 4, 2, 1).
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
    # NOTE(review): the generated tree reduction has no __syncthreads()
    # between steps; it appears to assume the participating threads
    # belong to a single warp executing in lock-step -- confirm this
    # holds on the targeted GPU architectures.
    return """
    {
        // This function trashes buf[1..warpSize],
        // leaving the reduction result in buf[0].
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the sum of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce.code_version)
def inline_reduce_max(N, buf, pos, count):
    """C++ code reducing ``buf`` with the ``max`` operator (see inline_reduce)."""
    def take_max(a, b):
        return "max(%s, %s)" % (a, b)

    return inline_reduce(N, buf, pos, count, take_max)
@code_version(inline_reduce.code_version)
def inline_reduce_sum(N, buf, pos, count):
    """C++ code reducing ``buf`` with addition (see inline_reduce)."""
    def add(a, b):
        return "%s + %s" % (a, b)

    return inline_reduce(N, buf, pos, count, add)
@code_version(inline_reduce.code_version)
def inline_reduce_min(N, buf, pos, count):
    """C++ code reducing ``buf`` with the ``min`` operator (see inline_reduce)."""
    def take_min(a, b):
        return "min(%s, %s)" % (a, b)

    return inline_reduce(N, buf, pos, count, take_min)
@code_version(inline_reduce.code_version)
def inline_reduce_prod(N, buf, pos, count):
    """C++ code reducing ``buf`` with multiplication (see inline_reduce)."""
    def multiply(a, b):
        return "%s * %s" % (a, b)

    return inline_reduce(N, buf, pos, count, multiply)
@code_version((2,) + inline_reduce_max.code_version +
              inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount):
    """Generate the list of C statements computing a softmax over a row.

    :param N: length of the buffer
    :param buf: first shared-memory buffer (receives the softmax)
    :param buf2: second shared-memory buffer (receives the un-normalized
        softmax)
    :param threadPos: index of executing thread
    :param threadCount: number of executing threads

    :Precondition: buf and buf2 contain two identical copies of the input
        to softmax
    :Postcondition: buf contains the softmax, buf2 contains un-normalized
        softmax

    :note: buf and buf2 should be in gpu shared memory, we access it many
        times
    :note2: We use __i as an int variable in a loop
    """
    sync = '__syncthreads()'
    loop_head = 'for(int __i=%s; __i<%s; __i+=%s){' % (threadPos, N,
                                                       threadCount)
    stmts = [
        # Reduce buf to the row maximum (trashes every slot but buf[0]).
        inline_reduce_max(N, buf, threadPos, threadCount),
        sync,
        'float row_max = %s[0]' % buf,
        sync,
        # Exponentiate, shifted by the row max, into both buffers.
        loop_head,
        '%s[__i] = exp(%s[__i] - row_max)' % (buf, buf2),
        '%s[__i] = %s[__i]' % (buf2, buf),
        '}',
        sync,
        # Reduce buf to the sum of the exponentials.
        inline_reduce_sum(N, buf, threadPos, threadCount),
        sync,
        'float row_sum = %s[0]' % buf,
        sync,
        # Normalize: divide each exp() result by the row sum.
        loop_head,
        '%s[__i] = %s[__i] / row_sum' % (buf, buf2),
        '}',
        sync,
    ]
    return stmts
@code_version((2,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b=''):
    """Return C++ code for a function that reduces a strided row.

    Each of the ``count`` threads first accumulates every ``count``-th
    element of ``x`` (optionally adding the bias ``b``) into a private
    register, the partial results are written to ``buf``, and a final
    intra-warp tree reduction leaves the answer in ``buf[0]``.

    All arguments except ``manner_fn``/``manner_init`` are Python strings
    spliced verbatim into the generated CUDA code.

    :param N: length of the row
    :param buf: buffer pointer of size warpSize * sizeof(float)
    :param x: pointer to the row to reduce
    :param stride_x: stride between consecutive elements of x
    :param pos: index of executing thread
    :param count: number of executing threads
    :param manner_fn: a function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).
    :param manner_init: a function that accepts strings of arguments a
        and return c code for its initialization
    :param b: Optional, pointer to the bias
    :param stride_b: Optional, the stride of b if b is provided

    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.

    :note: buf should be in gpu shared memory, we access it many times.
    """
    # Per-thread initialization and accumulation expressions, with or
    # without the optional bias term.
    if b:
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s] +"
                           " %(b)s[%(pos)s * %(stride_b)s]" % locals())
        loop_line = manner_fn("red",
                              manner_init("%(x)s[i * %(stride_x)s] + "
                                          "%(b)s[i * %(stride_b)s]" %
                                          locals()))
    else:
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
        loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
                                                 locals()))
    # Combination of two shared-memory slots during the warp reduction.
    loop_line2 = manner_fn("%s[%s]" % (buf, pos),
                           "%s[i]" % buf)
    # Combination steps for the final intra-warp tree reduction.
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
    # Bug fix: the partial result was stored through a hard-coded "buf"
    # name instead of %(buf)s, which generated broken code for any
    # caller passing a differently named buffer.
    # NOTE(review): the tree-reduction guards compare against %(N)s even
    # though only min(count, warpSize) partials are live in buf, and the
    # init expression reads x[pos * stride_x] even when pos >= N --
    # presumably callers guarantee count <= N; confirm.
    return """
    {
        // This function trashes buf[1..n_threads],
        // leaving the reduction result in buf[0].
        float red = %(init)s;
        #pragma unroll 16
        for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
            red = %(loop_line)s;
        }
        %(buf)s[%(pos)s] = red;
        __syncthreads();
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line2)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the reduction of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
                                   b='', stride_b=''):
    """Strided max-reduction of row ``x`` into ``buf`` (see
    inline_reduce_fixed_shared)."""
    def combine(a, b):
        return "max(%s, %s)" % (a, b)

    def seed(a):
        return a

    return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                                      combine, seed, b, stride_b)
@code_version((2,) + inline_reduce_fixed_shared.code_version +
              inline_reduce_fixed_shared_max.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x,
                                sm, sm_stride,
                                threadPos, threadCount,
                                b='', stride_b=''):
    """Generate the list of C statements computing a softmax over a row,
    using only a warpSize-sized shared buffer.

    :param N: length of the buffer, at least warpSize (32).
    :param buf: a shared memory buffer of size warpSize * sizeof(float)
    :param x: a ptr to the gpu memory where the row is stored
    :param stride_x: the stride between each element in x
    :param sm: a ptr to the gpu memory to store the result
    :param sm_stride: the stride between each sm element
    :param threadPos: index of executing thread
    :param threadCount: number of executing threads
    :param b: Optional, pointer to the bias
    :param stride_b: Optional, the stride of b if b is provided
    :Precondition: buf is empty
    :Postcondition: buf[0] contains the row sum of exponentials
        (the softmax denominator); sm contains the softmax of the row.
    :note: buf should be in gpu shared memory, we access it many times.
    :note2: We use tx as an int variable in a loop
    """
    ret = [
        # Get the row max (trashing all of buf but buf[0]).
        inline_reduce_fixed_shared_max(N, buf, x, stride_x,
                                       threadPos, threadCount, b, stride_b),
        '__syncthreads()',
        'float row_max = ' + buf + '[0]',
        '__syncthreads()',
        # Sum exp(x - row_max) over the row (again leaving buf[0]).
        inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
                                   lambda a, b: "%s + %s" % (a, b),
                                   lambda a: "exp(%s - row_max)" % a,
                                   b, stride_b),
        '__syncthreads()',
        'float row_sum = ' + buf + '[0]',
        '__syncthreads()',
        # Bug fix: use the threadPos/N/threadCount parameters instead of
        # hard-coded threadIdx.x/N/blockDim.x, so callers passing other
        # expressions get correct code.
        "for (int tx = %s; tx < %s; tx += %s){" % (threadPos, N,
                                                   threadCount),
    ]
    # Write the normalized softmax values to global memory.
    if b:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            " exp(%(x)s[tx * %(stride_x)s] +"
            " %(b)s[tx * %(stride_b)s] - row_max)"
            " / row_sum" % locals()]
    else:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals()]
    ret += [
        "}",
        '__syncthreads()',
    ]
    return ret
import numpy import numpy
from theano import Op, Apply from theano import Op, Apply, config
from theano.compat.six import StringIO from theano.compat.six import StringIO
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
...@@ -14,6 +13,10 @@ except ImportError: ...@@ -14,6 +13,10 @@ except ImportError:
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
...@@ -456,16 +459,17 @@ class GpuSoftmax (Op): ...@@ -456,16 +459,17 @@ class GpuSoftmax (Op):
return self.__class__.__name__ return self.__class__.__name__
def make_node(self, x): def make_node(self, x):
x = as_gpuarray_variable(x)
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return shape return shape
def c_code_cache_version(self): def c_code_cache_version(self):
return (9,) + inline_softmax.code_version return (10,) + inline_softmax.code_version
def c_headers(self): def c_headers(self):
return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>'] return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>']
def c_compiler(self): def c_compiler(self):
return NVCC_compiler return NVCC_compiler
...@@ -474,11 +478,16 @@ class GpuSoftmax (Op): ...@@ -474,11 +478,16 @@ class GpuSoftmax (Op):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'] return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dtype = self.dtype dtype_x = node.inputs[0].dtype
typecode = pygpu.gpuarray.dtype_to_typecode(dtype) dtype_z = node.outputs[0].dtype
typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
x, = inp x, = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
else:
cnda_thread_sync = ""
return """ return """
if (PyGpuArray_NDIM(%(x)s) != 2) if (PyGpuArray_NDIM(%(x)s) != 2)
{ {
...@@ -496,17 +505,17 @@ class GpuSoftmax (Op): ...@@ -496,17 +505,17 @@ class GpuSoftmax (Op):
%(typecode)s, %(typecode)s,
GA_C_ORDER, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if (!%(nll)s) { if (!%(z)s) {
%(fail)s %(fail)s
} }
} }
{ {
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], int n_blocks = std::min((int)PyGpuArray_DIMS(%(x)s)[0],
32 * 1024); 32 * 1024);
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], 512); int n_threads = std::min((int)PyGpuArray_DIMS(%(x)s)[1], 512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] * int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(dtype)s); 2 * sizeof(npy_%(dtype_x)s);
if (PyGpuArray_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
...@@ -523,38 +532,42 @@ class GpuSoftmax (Op): ...@@ -523,38 +532,42 @@ class GpuSoftmax (Op):
PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
(dtype_%(x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0], PyGpuArray_STRIDES(%(x)s)[0] / 4,
PyGpuArray_STRIDES(%(x)s)[1], PyGpuArray_STRIDES(%(x)s)[1] / 4,
(dtype_%(z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) + (npy_%(dtype_z)s*)(
%(z)s->ga.offset); ((char *)cuda_get_ptr(%(z)s->ga.data)) +
PyGpuArray_STRIDES(%(z)s)[0], %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[1] PyGpuArray_STRIDES(%(z)s)[0] / 4,
PyGpuArray_STRIDES(%(z)s)[1] / 4
); );
}else{ }else{
kSoftmax_fixed_shared%(nodename)s kSoftmax_fixed_shared%(nodename)s
<<< <<<
n_blocks, n_blocks,
n_threads, n_threads,
n_threads * sizeof(npy_%(dtype)s) n_threads * sizeof(npy_%(dtype_x)s)
>>>( >>>(
PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
(dtype_%(x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0], PyGpuArray_STRIDES(%(x)s)[0] / 4,
PyGpuArray_STRIDES(%(x)s)[1], PyGpuArray_STRIDES(%(x)s)[1] / 4,
(dtype_%(z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) + (npy_%(dtype_z)s*)(
%(z)s->ga.offset); ((char *)cuda_get_ptr(%(z)s->ga.data)) +
PyGpuArray_STRIDES(%(z)s)[0], %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[1] PyGpuArray_STRIDES(%(z)s)[0] / 4,
PyGpuArray_STRIDES(%(z)s)[1] / 4
); );
} }
CNDA_THREAD_SYNC; %(cnda_thread_sync)s
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
...@@ -572,6 +585,7 @@ class GpuSoftmax (Op): ...@@ -572,6 +585,7 @@ class GpuSoftmax (Op):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
dtype = node.inputs[0].dtype
ret1 = nvcc_kernel("kSoftmax_%s" % nodename, ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1',
...@@ -612,12 +626,12 @@ class GpuSoftmax (Op): ...@@ -612,12 +626,12 @@ class GpuSoftmax (Op):
"}", "}",
]) ])
ret3 = "CUdeviceptr (*cuda_get_ptr)(gpudata *g);" ret3 = "CUdeviceptr (*cuda_get_ptr)(gpudata *g);"
return ret1 + "\n" + ret2 + "\n"= ret3 return (ret1 + "\n" + ret2 + "\n" + ret3) % locals()
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (GpuOp): class GpuSoftmaxWithBias (Op):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
""" """
...@@ -634,17 +648,18 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -634,17 +648,18 @@ class GpuSoftmaxWithBias (GpuOp):
return self.__class__.__name__ return self.__class__.__name__
def make_node(self, x, b): def make_node(self, x, b):
x = as_gpuarray_variable(x)
b = as_gpuarray_variable(b)
return Apply(self, [x, b], [x.type()]) return Apply(self, [x, b], [x.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
#return () return (9,) + inline_softmax.code_version
return (8,) + inline_softmax.code_version
def c_headers(self): def c_headers(self):
return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>'] return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>']
def c_compiler(self): def c_compiler(self):
return NVCC_compiler return NVCC_compiler
...@@ -653,11 +668,17 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -653,11 +668,17 @@ class GpuSoftmaxWithBias (GpuOp):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'] return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dtype = self.dtype dtype_x = node.inputs[0].dtype
typecode = pygpu.gpuarray.dtype_to_typecode(dtype) dtype_b = node.inputs[1].dtype
dtype_z = node.outputs[0].dtype
typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
x, b = inp x, b = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
if config.gpuarray.sync:
cnda_thread_sync = "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
else:
cnda_thread_sync = ""
return """ return """
if (PyGpuArray_NDIM(%(x)s) != 2) if (PyGpuArray_NDIM(%(x)s) != 2)
{ {
...@@ -690,16 +711,16 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -690,16 +711,16 @@ class GpuSoftmaxWithBias (GpuOp):
%(typecode)s, %(typecode)s,
GA_C_ORDER, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if (!%(nll)s) { if (!%(z)s) {
%(fail)s %(fail)s
} }
} }
{ {
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0],32*1024); int n_blocks = std::min((int)PyGpuArray_DIMS(%(x)s)[0], 32*1024);
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], 512); int n_threads = std::min((int)PyGpuArray_DIMS(%(x)s)[1], 512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] * int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(dtype)s); 2 * sizeof(npy_%(dtype_x)s);
if (PyGpuArray_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
if(n_shared_bytes < (32 * 1024 - 500)){ if(n_shared_bytes < (32 * 1024 - 500)){
...@@ -712,46 +733,50 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -712,46 +733,50 @@ class GpuSoftmaxWithBias (GpuOp):
PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
(dtype_%(x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0], PyGpuArray_STRIDES(%(x)s)[0] / 4,
PyGpuArray_STRIDES(%(x)s)[1], PyGpuArray_STRIDES(%(x)s)[1] / 4,
(dtype_%(b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) + (npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset), %(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0], PyGpuArray_STRIDES(%(b)s)[0] / 4,
(dtype_%(z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) + (npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset), %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0], PyGpuArray_STRIDES(%(z)s)[0] / 4,
PyGpuArray_STRIDES(%(z)s)[1] PyGpuArray_STRIDES(%(z)s)[1] / 4
); );
}else{ }else{
kSoftmaxWithBias_fixed_shared%(nodename)s kSoftmaxWithBias_fixed_shared%(nodename)s
<<< <<<
n_blocks, n_blocks,
n_threads, n_threads,
n_threads * sizeof(npy_%(dtype)s) n_threads * sizeof(npy_%(dtype_x)s)
>>>( >>>(
PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
(dtype_%(x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0], PyGpuArray_STRIDES(%(x)s)[0] / 4,
PyGpuArray_STRIDES(%(x)s)[1], PyGpuArray_STRIDES(%(x)s)[1] / 4,
(dtype_%(b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) + (npy_%(dtype_b)s*)(
((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset), %(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0], PyGpuArray_STRIDES(%(b)s)[0] / 4,
(dtype_%(z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) + (npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset), %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0], PyGpuArray_STRIDES(%(z)s)[0] / 4,
PyGpuArray_STRIDES(%(z)s)[1] PyGpuArray_STRIDES(%(z)s)[1] / 4
); );
} }
CNDA_THREAD_SYNC; %(cnda_thread_sync)s
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
...@@ -767,7 +792,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -767,7 +792,7 @@ class GpuSoftmaxWithBias (GpuOp):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
dtype = self.dtype dtype = node.inputs[0].dtype
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename, ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1',
...@@ -815,6 +840,6 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -815,6 +840,6 @@ class GpuSoftmaxWithBias (GpuOp):
"}", "}",
]) ])
ret3 = "CUdeviceptr (*cuda_get_ptr)(gpudata *g);" ret3 = "CUdeviceptr (*cuda_get_ptr)(gpudata *g);"
return ret1 + "\n" + ret2 + "\n"= ret3 return (ret1 + "\n" + ret2 + "\n" + ret3) % locals()
gpu_softmax_with_bias = GpuSoftmaxWithBias() gpu_softmax_with_bias = GpuSoftmaxWithBias()
\ No newline at end of file
...@@ -344,12 +344,12 @@ def local_gpua_crossentropysoftmax1hotwithbiasdx(node): ...@@ -344,12 +344,12 @@ def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
return GpuCrossentropySoftmax1HotWithBiasDx() return GpuCrossentropySoftmax1HotWithBiasDx()
@register_opt() @register_opt()
@op_lifter([tensor.nnet.GpuSoftmax]) @op_lifter([tensor.nnet.Softmax])
def local_gpua_softmax(node): def local_gpua_softmax(node):
return GpuSoftmax() return GpuSoftmax()
@register_opt() @register_opt()
@op_lifter([tensor.nnet.GpuSoftmaxWithBias]) @op_lifter([tensor.nnet.SoftmaxWithBias])
def local_gpua_softmaxwithbias(node): def local_gpua_softmaxwithbias(node):
return GpuSoftmaxWithBias() return GpuSoftmaxWithBias()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论