Make GpuSoftmax and GpuSoftmaxWithBias compatible with float64 and adjust unit tests to test this

25823ba2 · Pierre Luc Carrier · d58444e1 · 25823ba2 · 25823ba2 · 25823ba2
--- a/theano/sandbox/gpuarray/kernel_codegen.py
+++ b/theano/sandbox/gpuarray/kernel_codegen.py
@@ -124,12 +124,13 @@ def inline_reduce_prod(N, buf, pos, count):
 @code_version((2,) + inline_reduce_max.code_version +
              inline_reduce_sum.code_version)
-def inline_softmax(N, buf, buf2, threadPos, threadCount):
+def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
    """
    :param N: length of the buffer
    :param threadPos: index of executing thread
    :param threadCount: number of executing threads
+    :param dtype: dtype of the softmax's output
    :Precondition: buf and buf2 contain two identical copies of the input
        to softmax
@@ -144,7 +145,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
            #get max of buf (trashing all but buf[0])
            inline_reduce_max(N, buf, threadPos, threadCount),
            '__syncthreads()',
-            'float row_max = ' + buf + '[0]',
+            ('npy_%s row_max = ' + buf + '[0]') % dtype,
            '__syncthreads()',
            'for(int __i=' + threadPos + '; __i<' + N +
                  '; __i+=' + threadCount + '){',
@@ -154,7 +155,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
            '__syncthreads()',
            inline_reduce_sum(N, buf, threadPos, threadCount),
            '__syncthreads()',
-            'float row_sum = ' + buf + '[0]',
+            ('npy_%s row_sum = ' + buf + '[0]') % dtype,
            '__syncthreads()',
            # divide each exp() result by the sum to complete the job.
            'for(int __i=' + threadPos + '; __i<' + N +
@@ -168,15 +169,16 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
 @code_version((1,))
 def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                               manner_fn, manner_init,
-                               b='', stride_b=''):
+                               b='', stride_b='', dtype='float32'):
    """Return C++ code for a function that reduces a contiguous buffer.
    :param N: length of the buffer
-    :param buf: buffer pointer of size warpSize * sizeof(float)
+    :param buf: buffer pointer of size warpSize * sizeof(dtype)
    :param pos: index of executing thread
    :param count: number of executing threads
    :param b: Optional, pointer to the bias
    :param stride_b: Optional, the stride of b if b is provided
+    :param dtype: Optional, the dtype of the output
    :param manner_fn: a function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
@@ -214,7 +216,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
    {
        // This function trashes buf[1..n_threads],
        // leaving the reduction result in buf[0].
-        float red = %(init)s;
+        npy_%(dtype)s red = %(init)s;
        #pragma unroll 16
        for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
          red = %(loop_line)s;
@@ -248,11 +250,11 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
 @code_version(inline_reduce_fixed_shared.code_version)
 def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
-                                   b='', stride_b=''):
+                                   b='', stride_b='', dtype='float32'):
    return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                                      lambda a, b: "max(%s, %s)" % (a, b),
                                      lambda a: a,
-                                      b, stride_b)
+                                      b, stride_b, dtype)
 @code_version((1,) + inline_reduce_max.code_version +
@@ -260,11 +262,11 @@ def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
 def inline_softmax_fixed_shared(N, buf, x, stride_x,
                                sm, sm_stride,
                                threadPos, threadCount,
-                                b='', stride_b=''):
+                                b='', stride_b='', dtype="float32"):
    """
    :param N: length of the buffer, atleast waprSize(32).
-    :param buf: a shared memory buffer of size warpSize * sizeof(float)
+    :param buf: a shared memory buffer of size warpSize * sizeof(dtype)
    :param x: a ptr to the gpu memory where the row is stored
    :param stride_x: the stride between each element in x
    :param sm: a ptr to the gpu memory to store the result
@@ -273,6 +275,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
    :param threadCount: number of executing threads
    :param b: Optional, pointer to the bias
    :param stride_b: Optional, the stride of b if b is provided
+    :param dtype: Optional, the dtype of the softmax's output if not float32
    :Precondition: buf is empty
    :Postcondition: buf[0] contains the softmax,
@@ -285,16 +288,17 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
    ret = [
        #get max of buf (trashing all but buf[0])
        inline_reduce_fixed_shared_max(N, buf, x, stride_x,
-                                       threadPos, threadCount, b, stride_b),
+                                       threadPos, threadCount, b, stride_b,
+                                       dtype),
        '__syncthreads()',
-        'float row_max = ' + buf + '[0]',
+        ('npy_%s row_max = ' + buf + '[0]') % dtype,
        '__syncthreads()',
        inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
                                   lambda a, b: "%s + %s" % (a, b),
                                   lambda a: "exp(%s - row_max)" % a,
-                                   b, stride_b),
+                                   b, stride_b, dtype),
        '__syncthreads()',
-        'float row_sum = ' + buf + '[0]',
+        ('npy_%s row_sum = ' + buf + '[0]') % dtype,
        '__syncthreads()',
        "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
        ]

--- a/theano/sandbox/gpuarray/nnet.py
+++ b/theano/sandbox/gpuarray/nnet.py
@@ -466,7 +466,7 @@ class GpuSoftmax (Op):
        return shape
    def c_code_cache_version(self):
-        return (11,) + inline_softmax.code_version
+        return (12,) + inline_softmax.code_version
    def c_headers(self):
        return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>',
@@ -481,6 +481,8 @@ class GpuSoftmax (Op):
    def c_code(self, node, nodename, inp, out, sub):
        dtype_x = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
+        itemsize_x = numpy.dtype(dtype_x).itemsize
+        itemsize_z = numpy.dtype(dtype_z).itemsize
        typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        x, = inp
        z, = out
@@ -536,14 +538,14 @@ class GpuSoftmax (Op):
                            (npy_%(dtype_x)s*)(
                                    ((char *)cuda_get_ptr(%(x)s->ga.data)) +
                                    %(x)s->ga.offset),
-                            PyGpuArray_STRIDES(%(x)s)[0] / 4,
+                            PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
-                            PyGpuArray_STRIDES(%(x)s)[1] / 4,
+                            PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
                            (npy_%(dtype_z)s*)(
                                    ((char *)cuda_get_ptr(%(z)s->ga.data)) +
                                    %(z)s->ga.offset),
-                            PyGpuArray_STRIDES(%(z)s)[0] / 4,
+                            PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
-                            PyGpuArray_STRIDES(%(z)s)[1] / 4
+                            PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
                    );
              }else{
                kSoftmax_fixed_shared%(nodename)s
@@ -558,14 +560,14 @@ class GpuSoftmax (Op):
                            (npy_%(dtype_x)s*)(
                                    ((char *)cuda_get_ptr(%(x)s->ga.data)) +
                                    %(x)s->ga.offset),
-                            PyGpuArray_STRIDES(%(x)s)[0] / 4,
+                            PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
-                            PyGpuArray_STRIDES(%(x)s)[1] / 4,
+                            PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
                            (npy_%(dtype_z)s*)(
                                    ((char *)cuda_get_ptr(%(z)s->ga.data)) +
                                    %(z)s->ga.offset),
-                            PyGpuArray_STRIDES(%(z)s)[0] / 4,
+                            PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
-                            PyGpuArray_STRIDES(%(z)s)[1] / 4
+                            PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
                    );
              }
              %(cnda_thread_sync)s
@@ -586,14 +588,15 @@ class GpuSoftmax (Op):
        """ % locals()
    def c_support_code_apply(self, node, nodename):
-        dtype = node.inputs[0].dtype
+        dtype_x = node.inputs[0].dtype
+        dtype_sm = node.outputs[0].dtype
        ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
                params=['int M', 'int N',
-                    'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1',
+                    'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
-                    'npy_%(dtype)s * sm', 'const int sm_s0', 'const int sm_s1'],
+                    'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
                body=[
-                    "extern __shared__ npy_%(dtype)s buf[]",
+                    "extern __shared__ npy_%(dtype_sm)s buf[]",
-                    "npy_%(dtype)s * buf2 = buf + N",
+                    "npy_%(dtype_sm)s * buf2 = buf + N",
                    "for (int blockIDX = blockIdx.x; blockIDX < M;"
                    "     blockIDX += gridDim.x){",
                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
@@ -602,7 +605,7 @@ class GpuSoftmax (Op):
                      "}",
                      "__syncthreads()",
                      inline_softmax('N', 'buf', 'buf2',
-                                     'threadIdx.x', 'blockDim.x'),
+                                     'threadIdx.x', 'blockDim.x', dtype_sm),
                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                        # This set all value correctly
                        "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
@@ -612,17 +615,18 @@ class GpuSoftmax (Op):
                ])
        ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
                params=['int M', 'int N',
-                    'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1',
+                    'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
-                    'npy_%(dtype)s * sm', 'const int sm_s0', 'const int sm_s1'],
+                    'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
                body=[
-                    "extern __shared__ npy_%(dtype)s buf[]",
+                    "extern __shared__ npy_%(dtype_sm)s buf[]",
                    "for (int blockIDX = blockIdx.x; blockIDX < M;"
                    "     blockIDX += gridDim.x){",
-                      "const npy_%(dtype)s *x_ptr = &x[blockIDX * sx0]",
+                      "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
-                      "npy_%(dtype)s *sm_ptr = &sm[blockIDX * sm_s0]",
+                      "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
                      inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
                                                  'sm_ptr', 'sm_s1',
-                                                  'threadIdx.x', 'blockDim.x'),
+                                                  'threadIdx.x', 'blockDim.x',
+                                                  dtype=dtype_sm),
                      "__syncthreads()",
                    "}",
                    ])
@@ -656,7 +660,7 @@ class GpuSoftmaxWithBias (Op):
        return  [shape[0]]
    def c_code_cache_version(self):
-        return (10,) + inline_softmax.code_version
+        return (11,) + inline_softmax.code_version
    def c_headers(self):
        return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>',
@@ -672,6 +676,9 @@ class GpuSoftmaxWithBias (Op):
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_z = node.outputs[0].dtype
+        itemsize_x = numpy.dtype(dtype_x).itemsize
+        itemsize_b = numpy.dtype(dtype_b).itemsize
+        itemsize_z = numpy.dtype(dtype_z).itemsize
        typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        x, b = inp
        z, = out
@@ -737,17 +744,17 @@ class GpuSoftmaxWithBias (Op):
                        (npy_%(dtype_x)s*)(
                                    ((char *)cuda_get_ptr(%(x)s->ga.data)) +
                                    %(x)s->ga.offset),
-                        PyGpuArray_STRIDES(%(x)s)[0] / 4,
+                        PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
-                        PyGpuArray_STRIDES(%(x)s)[1] / 4,
+                        PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
                        (npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
                                           %(b)s->ga.offset),
-                        PyGpuArray_STRIDES(%(b)s)[0] / 4,
+                        PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
                        (npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
                                           %(z)s->ga.offset),
-                        PyGpuArray_STRIDES(%(z)s)[0] / 4,
+                        PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
-                        PyGpuArray_STRIDES(%(z)s)[1] / 4
+                        PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
                    );
              }else{
                kSoftmaxWithBias_fixed_shared%(nodename)s
@@ -762,19 +769,19 @@ class GpuSoftmaxWithBias (Op):
                        (npy_%(dtype_x)s*)(
                                    ((char *)cuda_get_ptr(%(x)s->ga.data)) +
                                    %(x)s->ga.offset),
-                        PyGpuArray_STRIDES(%(x)s)[0] / 4,
+                        PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
-                        PyGpuArray_STRIDES(%(x)s)[1] / 4,
+                        PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
                        (npy_%(dtype_b)s*)(
                                    ((char *)cuda_get_ptr(%(b)s->ga.data)) +
                                    %(b)s->ga.offset),
-                        PyGpuArray_STRIDES(%(b)s)[0] / 4,
+                        PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
                        (npy_%(dtype_z)s*)(
                                    ((char *)cuda_get_ptr(%(z)s->ga.data)) +
                                    %(z)s->ga.offset),
-                        PyGpuArray_STRIDES(%(z)s)[0] / 4,
+                        PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
-                        PyGpuArray_STRIDES(%(z)s)[1] / 4
+                        PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
                    );
              }
                %(cnda_thread_sync)s
@@ -793,15 +800,17 @@ class GpuSoftmaxWithBias (Op):
        """ % locals()
    def c_support_code_apply(self, node, nodename):
-        dtype = node.inputs[0].dtype
+        dtype_x = node.inputs[0].dtype
+        dtype_b = node.inputs[1].dtype
+        dtype_sm = node.outputs[0].dtype
        ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
                params=['int M', 'int N',
-                        'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1',
+                        'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
-                        'const npy_%(dtype)s * b', 'const int sb0',
+                        'const npy_%(dtype_b)s * b', 'const int sb0',
-                        'npy_%(dtype)s * sm', 'const int sm_s0', 'const int sm_s1'],
+                        'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
                body=[
-                    "extern __shared__ npy_%(dtype)s buf[]",
+                    "extern __shared__ npy_%(dtype_sm)s buf[]",
-                    "npy_%(dtype)s * buf2 = buf + N",
+                    "npy_%(dtype_sm)s * buf2 = buf + N",
                    "for (int blockIDX = blockIdx.x; blockIDX < M;"
                    "     blockIDX += gridDim.x){",
                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
@@ -811,7 +820,7 @@ class GpuSoftmaxWithBias (Op):
                      "}",
                       "__syncthreads()",
                       inline_softmax('N', 'buf', 'buf2',
-                                      'threadIdx.x', 'blockDim.x'),
+                                      'threadIdx.x', 'blockDim.x', dtype_sm),
                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                         "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
                      "}",
@@ -820,23 +829,24 @@ class GpuSoftmaxWithBias (Op):
                    ])
        ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                           params=['int M', 'int N',
-                                   'const npy_%(dtype)s * x',
+                                   'const npy_%(dtype_x)s * x',
                                   'const int sx0', 'const int sx1',
-                                   'const npy_%(dtype)s * b', 'const int sb0',
+                                   'const npy_%(dtype_b)s * b', 'const int sb0',
-                                   'npy_%(dtype)s * sm',
+                                   'npy_%(dtype_sm)s * sm',
                                   'const int sm_s0', 'const int sm_s1'],
                           body=[
-                               "extern __shared__ npy_%(dtype)s buf[]",
+                               "extern __shared__ npy_%(dtype_sm)s buf[]",
                               "for (int blockIDX = blockIdx.x; blockIDX < M;"
                               "     blockIDX += gridDim.x){",
-                               "const npy_%(dtype)s *x_ptr = &x[blockIDX * sx0]",
+                               "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
-                               "npy_%(dtype)s *sm_ptr = &sm[blockIDX * sm_s0]",
+                               "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
                               inline_softmax_fixed_shared('N', 'buf',
                                                           'x_ptr', 'sx1',
                                                           'sm_ptr', 'sm_s1',
                                                           'threadIdx.x',
                                                           'blockDim.x',
-                                                           'b', 'sb0'),
+                                                           'b', 'sb0',
+                                                           dtype_sm),
                               "__syncthreads()",
                               "}",
                           ])

--- a/theano/sandbox/gpuarray/tests/test_nnet.py
+++ b/theano/sandbox/gpuarray/tests/test_nnet.py
@@ -159,20 +159,44 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
            rtol, atol)
-def test_softmax_with_bias():
+def test_softmax_with_bias_float32():
+    softmax_with_bias_unittest_template(dtypeInput='float32',
+                                        dtypeBias='float32')
+def test_softmax_with_bias_float64():
+    softmax_with_bias_unittest_template(dtypeInput='float32',
+                                        dtypeBias='float64')
+    softmax_with_bias_unittest_template(dtypeInput='float64',
+                                        dtypeBias='float32')
+    softmax_with_bias_unittest_template(dtypeInput='float64',
+                                        dtypeBias='float64')
+def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
    """
-    This is basic test for GpuSoftmaxWithBias
+    This is basic test for GpuSoftmaxWithBias with float64 variables
    We check that we loop when their is too much block
    TODO: check that we loop when their is too much thread.(THIS IS
    NOT IMPLEMENTED)
    """
-    x = T.fmatrix('x')
+    assert dtypeInput in ['float32', 'float64']
+    assert dtypeBias in ['float32', 'float64']
+    if dtypeInput == 'float32':
+        x = T.fmatrix('x')
+    elif dtypeInput == 'float64':
+        x = T.dmatrix('x')
    # We can't use zeros_like(x[0,::]) as this don't allow to test with
-    # 0 shape.
+    # 0 shape
-    z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
+    if dtypeBias == 'float32':
-                                             dtype='float32')[::2])
+        z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
+                                                 dtype='float32')[::2])
+    elif dtypeBias == 'float64':
+        z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
+                                                 dtype='float64')[::2])
    f = theano.function([x], z, mode=mode_without_gpu)
    f_gpu = theano.function([x], z, mode=mode_with_gpu)
@@ -182,7 +206,11 @@ def test_softmax_with_bias():
    def cmp(n, m):
        #print "test_softmax",n,m
-        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
+        if dtypeInput == 'float32':
+            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
+        elif dtypeInput == 'float64':
+            data = numpy.arange(n * m, dtype='float64').reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        assert numpy.allclose(out, gout), numpy.absolute(out - gout)
@@ -205,14 +233,25 @@ def test_softmax_with_bias():
    cmp(128, 64 * 1024)
-def test_softmax():
+def test_softmax_float32():
+    softmax_unittest_template('float32')
+def test_softmax_float64():
+    softmax_unittest_template('float32')
+def softmax_unittest_template(dtypeInput):
    """
-    This is basic test for GpuSoftmax
+    This is basic test for GpuSoftmax with float64 variables
    We check that we loop when their is too much block
    We use slower code when there isn't enough shared memory
    """
-    x = T.fmatrix('x')
+    assert dtypeInput in ['float32', 'float64']
+    if dtypeInput == 'float32':
+        x = T.fmatrix('x')
+    elif dtypeInput == 'float64':
+        x = T.dmatrix('x')
    z = T.nnet.softmax(x)
    f = theano.function([x], z, mode=mode_without_gpu)
@@ -222,8 +261,11 @@ def test_softmax():
                      theano.sandbox.gpuarray.nnet.GpuSoftmax)
    def cmp(n, m):
-        #print "test_softmax",n,m
+        if dtypeInput == 'float32':
-        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
+            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
+        elif dtypeInput == 'float64':
+            data = numpy.arange(n * m, dtype='float64').reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        assert numpy.allclose(out, gout), numpy.absolute(out - gout)