提交 25823ba2 authored 作者: Pierre Luc Carrier's avatar Pierre Luc Carrier

Make GpuSoftmax and GpuSoftmaxWithBias compatible with float64 and adjust unit tests to test this

上级 d58444e1
...@@ -124,12 +124,13 @@ def inline_reduce_prod(N, buf, pos, count): ...@@ -124,12 +124,13 @@ def inline_reduce_prod(N, buf, pos, count):
@code_version((2,) + inline_reduce_max.code_version + @code_version((2,) + inline_reduce_max.code_version +
inline_reduce_sum.code_version) inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount): def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
""" """
:param N: length of the buffer :param N: length of the buffer
:param threadPos: index of executing thread :param threadPos: index of executing thread
:param threadCount: number of executing threads :param threadCount: number of executing threads
:param dtype: dtype of the softmax's output
:Precondition: buf and buf2 contain two identical copies of the input :Precondition: buf and buf2 contain two identical copies of the input
to softmax to softmax
...@@ -144,7 +145,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount): ...@@ -144,7 +145,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
#get max of buf (trashing all but buf[0]) #get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount), inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
'float row_max = ' + buf + '[0]', ('npy_%s row_max = ' + buf + '[0]') % dtype,
'__syncthreads()', '__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){', '; __i+=' + threadCount + '){',
...@@ -154,7 +155,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount): ...@@ -154,7 +155,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
'__syncthreads()', '__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount), inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
'float row_sum = ' + buf + '[0]', ('npy_%s row_sum = ' + buf + '[0]') % dtype,
'__syncthreads()', '__syncthreads()',
# divide each exp() result by the sum to complete the job. # divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
...@@ -168,15 +169,16 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount): ...@@ -168,15 +169,16 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
@code_version((1,)) @code_version((1,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
manner_fn, manner_init, manner_fn, manner_init,
b='', stride_b=''): b='', stride_b='', dtype='float32'):
"""Return C++ code for a function that reduces a contiguous buffer. """Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer :param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(float) :param buf: buffer pointer of size warpSize * sizeof(dtype)
:param pos: index of executing thread :param pos: index of executing thread
:param count: number of executing threads :param count: number of executing threads
:param b: Optional, pointer to the bias :param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided :param stride_b: Optional, the stride of b if b is provided
:param dtype: Optional, the dtype of the output
:param manner_fn: a function that accepts strings of arguments a :param manner_fn: a function that accepts strings of arguments a
and b, and returns c code for their reduction. (Example: and b, and returns c code for their reduction. (Example:
...@@ -214,7 +216,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, ...@@ -214,7 +216,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
{ {
// This function trashes buf[1..n_threads], // This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0]. // leaving the reduction result in buf[0].
float red = %(init)s; npy_%(dtype)s red = %(init)s;
#pragma unroll 16 #pragma unroll 16
for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){ for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
red = %(loop_line)s; red = %(loop_line)s;
...@@ -248,11 +250,11 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, ...@@ -248,11 +250,11 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
@code_version(inline_reduce_fixed_shared.code_version) @code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count, def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
b='', stride_b=''): b='', stride_b='', dtype='float32'):
return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
lambda a, b: "max(%s, %s)" % (a, b), lambda a, b: "max(%s, %s)" % (a, b),
lambda a: a, lambda a: a,
b, stride_b) b, stride_b, dtype)
@code_version((1,) + inline_reduce_max.code_version + @code_version((1,) + inline_reduce_max.code_version +
...@@ -260,11 +262,11 @@ def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count, ...@@ -260,11 +262,11 @@ def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
def inline_softmax_fixed_shared(N, buf, x, stride_x, def inline_softmax_fixed_shared(N, buf, x, stride_x,
sm, sm_stride, sm, sm_stride,
threadPos, threadCount, threadPos, threadCount,
b='', stride_b=''): b='', stride_b='', dtype="float32"):
""" """
:param N: length of the buffer, atleast waprSize(32). :param N: length of the buffer, atleast waprSize(32).
:param buf: a shared memory buffer of size warpSize * sizeof(float) :param buf: a shared memory buffer of size warpSize * sizeof(dtype)
:param x: a ptr to the gpu memory where the row is stored :param x: a ptr to the gpu memory where the row is stored
:param stride_x: the stride between each element in x :param stride_x: the stride between each element in x
:param sm: a ptr to the gpu memory to store the result :param sm: a ptr to the gpu memory to store the result
...@@ -273,6 +275,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, ...@@ -273,6 +275,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
:param threadCount: number of executing threads :param threadCount: number of executing threads
:param b: Optional, pointer to the bias :param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided :param stride_b: Optional, the stride of b if b is provided
:param dtype: Optional, the dtype of the softmax's output if not float32
:Precondition: buf is empty :Precondition: buf is empty
:Postcondition: buf[0] contains the softmax, :Postcondition: buf[0] contains the softmax,
...@@ -285,16 +288,17 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, ...@@ -285,16 +288,17 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
ret = [ ret = [
#get max of buf (trashing all but buf[0]) #get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, inline_reduce_fixed_shared_max(N, buf, x, stride_x,
threadPos, threadCount, b, stride_b), threadPos, threadCount, b, stride_b,
dtype),
'__syncthreads()', '__syncthreads()',
'float row_max = ' + buf + '[0]', ('npy_%s row_max = ' + buf + '[0]') % dtype,
'__syncthreads()', '__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount, inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
lambda a, b: "%s + %s" % (a, b), lambda a, b: "%s + %s" % (a, b),
lambda a: "exp(%s - row_max)" % a, lambda a: "exp(%s - row_max)" % a,
b, stride_b), b, stride_b, dtype),
'__syncthreads()', '__syncthreads()',
'float row_sum = ' + buf + '[0]', ('npy_%s row_sum = ' + buf + '[0]') % dtype,
'__syncthreads()', '__syncthreads()',
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
] ]
......
...@@ -466,7 +466,7 @@ class GpuSoftmax (Op): ...@@ -466,7 +466,7 @@ class GpuSoftmax (Op):
return shape return shape
def c_code_cache_version(self): def c_code_cache_version(self):
return (11,) + inline_softmax.code_version return (12,) + inline_softmax.code_version
def c_headers(self): def c_headers(self):
return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>',
...@@ -481,6 +481,8 @@ class GpuSoftmax (Op): ...@@ -481,6 +481,8 @@ class GpuSoftmax (Op):
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_z = node.outputs[0].dtype dtype_z = node.outputs[0].dtype
itemsize_x = numpy.dtype(dtype_x).itemsize
itemsize_z = numpy.dtype(dtype_z).itemsize
typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
x, = inp x, = inp
z, = out z, = out
...@@ -536,14 +538,14 @@ class GpuSoftmax (Op): ...@@ -536,14 +538,14 @@ class GpuSoftmax (Op):
(npy_%(dtype_x)s*)( (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) + ((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / 4, PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / 4, PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)( (npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) + ((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset), %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / 4, PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / 4 PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
); );
}else{ }else{
kSoftmax_fixed_shared%(nodename)s kSoftmax_fixed_shared%(nodename)s
...@@ -558,14 +560,14 @@ class GpuSoftmax (Op): ...@@ -558,14 +560,14 @@ class GpuSoftmax (Op):
(npy_%(dtype_x)s*)( (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) + ((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / 4, PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / 4, PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_z)s*)( (npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) + ((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset), %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / 4, PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / 4 PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
); );
} }
%(cnda_thread_sync)s %(cnda_thread_sync)s
...@@ -586,14 +588,15 @@ class GpuSoftmax (Op): ...@@ -586,14 +588,15 @@ class GpuSoftmax (Op):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
dtype = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_sm = node.outputs[0].dtype
ret1 = nvcc_kernel("kSoftmax_%s" % nodename, ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'npy_%(dtype)s * sm', 'const int sm_s0', 'const int sm_s1'], 'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=[
"extern __shared__ npy_%(dtype)s buf[]", "extern __shared__ npy_%(dtype_sm)s buf[]",
"npy_%(dtype)s * buf2 = buf + N", "npy_%(dtype_sm)s * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
...@@ -602,7 +605,7 @@ class GpuSoftmax (Op): ...@@ -602,7 +605,7 @@ class GpuSoftmax (Op):
"}", "}",
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x'), 'threadIdx.x', 'blockDim.x', dtype_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly # This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
...@@ -612,17 +615,18 @@ class GpuSoftmax (Op): ...@@ -612,17 +615,18 @@ class GpuSoftmax (Op):
]) ])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename, ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'npy_%(dtype)s * sm', 'const int sm_s0', 'const int sm_s1'], 'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=[
"extern __shared__ npy_%(dtype)s buf[]", "extern __shared__ npy_%(dtype_sm)s buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"const npy_%(dtype)s *x_ptr = &x[blockIDX * sx0]", "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
"npy_%(dtype)s *sm_ptr = &sm[blockIDX * sm_s0]", "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1', inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
'sm_ptr', 'sm_s1', 'sm_ptr', 'sm_s1',
'threadIdx.x', 'blockDim.x'), 'threadIdx.x', 'blockDim.x',
dtype=dtype_sm),
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
...@@ -656,7 +660,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -656,7 +660,7 @@ class GpuSoftmaxWithBias (Op):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
return (10,) + inline_softmax.code_version return (11,) + inline_softmax.code_version
def c_headers(self): def c_headers(self):
return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>',
...@@ -672,6 +676,9 @@ class GpuSoftmaxWithBias (Op): ...@@ -672,6 +676,9 @@ class GpuSoftmaxWithBias (Op):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype dtype_b = node.inputs[1].dtype
dtype_z = node.outputs[0].dtype dtype_z = node.outputs[0].dtype
itemsize_x = numpy.dtype(dtype_x).itemsize
itemsize_b = numpy.dtype(dtype_b).itemsize
itemsize_z = numpy.dtype(dtype_z).itemsize
typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
x, b = inp x, b = inp
z, = out z, = out
...@@ -737,17 +744,17 @@ class GpuSoftmaxWithBias (Op): ...@@ -737,17 +744,17 @@ class GpuSoftmaxWithBias (Op):
(npy_%(dtype_x)s*)( (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) + ((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / 4, PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / 4, PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) + (npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset), %(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / 4, PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) + (npy_%(dtype_z)s*)(((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset), %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / 4, PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / 4 PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
); );
}else{ }else{
kSoftmaxWithBias_fixed_shared%(nodename)s kSoftmaxWithBias_fixed_shared%(nodename)s
...@@ -762,19 +769,19 @@ class GpuSoftmaxWithBias (Op): ...@@ -762,19 +769,19 @@ class GpuSoftmaxWithBias (Op):
(npy_%(dtype_x)s*)( (npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(%(x)s->ga.data)) + ((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset), %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / 4, PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / 4, PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)( (npy_%(dtype_b)s*)(
((char *)cuda_get_ptr(%(b)s->ga.data)) + ((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset), %(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / 4, PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_z)s*)( (npy_%(dtype_z)s*)(
((char *)cuda_get_ptr(%(z)s->ga.data)) + ((char *)cuda_get_ptr(%(z)s->ga.data)) +
%(z)s->ga.offset), %(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0] / 4, PyGpuArray_STRIDES(%(z)s)[0] / %(itemsize_z)s,
PyGpuArray_STRIDES(%(z)s)[1] / 4 PyGpuArray_STRIDES(%(z)s)[1] / %(itemsize_z)s
); );
} }
%(cnda_thread_sync)s %(cnda_thread_sync)s
...@@ -793,15 +800,17 @@ class GpuSoftmaxWithBias (Op): ...@@ -793,15 +800,17 @@ class GpuSoftmaxWithBias (Op):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
dtype = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_sm = node.outputs[0].dtype
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename, ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const npy_%(dtype)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'const npy_%(dtype)s * b', 'const int sb0', 'const npy_%(dtype_b)s * b', 'const int sb0',
'npy_%(dtype)s * sm', 'const int sm_s0', 'const int sm_s1'], 'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=[
"extern __shared__ npy_%(dtype)s buf[]", "extern __shared__ npy_%(dtype_sm)s buf[]",
"npy_%(dtype)s * buf2 = buf + N", "npy_%(dtype_sm)s * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
...@@ -811,7 +820,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -811,7 +820,7 @@ class GpuSoftmaxWithBias (Op):
"}", "}",
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x'), 'threadIdx.x', 'blockDim.x', dtype_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
"}", "}",
...@@ -820,23 +829,24 @@ class GpuSoftmaxWithBias (Op): ...@@ -820,23 +829,24 @@ class GpuSoftmaxWithBias (Op):
]) ])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename, ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const npy_%(dtype)s * x', 'const npy_%(dtype_x)s * x',
'const int sx0', 'const int sx1', 'const int sx0', 'const int sx1',
'const npy_%(dtype)s * b', 'const int sb0', 'const npy_%(dtype_b)s * b', 'const int sb0',
'npy_%(dtype)s * sm', 'npy_%(dtype_sm)s * sm',
'const int sm_s0', 'const int sm_s1'], 'const int sm_s0', 'const int sm_s1'],
body=[ body=[
"extern __shared__ npy_%(dtype)s buf[]", "extern __shared__ npy_%(dtype_sm)s buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"const npy_%(dtype)s *x_ptr = &x[blockIDX * sx0]", "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
"npy_%(dtype)s *sm_ptr = &sm[blockIDX * sm_s0]", "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', inline_softmax_fixed_shared('N', 'buf',
'x_ptr', 'sx1', 'x_ptr', 'sx1',
'sm_ptr', 'sm_s1', 'sm_ptr', 'sm_s1',
'threadIdx.x', 'threadIdx.x',
'blockDim.x', 'blockDim.x',
'b', 'sb0'), 'b', 'sb0',
dtype_sm),
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
......
...@@ -159,20 +159,44 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx(): ...@@ -159,20 +159,44 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
rtol, atol) rtol, atol)
def test_softmax_with_bias(): def test_softmax_with_bias_float32():
softmax_with_bias_unittest_template(dtypeInput='float32',
dtypeBias='float32')
def test_softmax_with_bias_float64():
softmax_with_bias_unittest_template(dtypeInput='float32',
dtypeBias='float64')
softmax_with_bias_unittest_template(dtypeInput='float64',
dtypeBias='float32')
softmax_with_bias_unittest_template(dtypeInput='float64',
dtypeBias='float64')
def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
""" """
This is basic test for GpuSoftmaxWithBias This is basic test for GpuSoftmaxWithBias with float64 variables
We check that we loop when their is too much block We check that we loop when their is too much block
TODO: check that we loop when their is too much thread.(THIS IS TODO: check that we loop when their is too much thread.(THIS IS
NOT IMPLEMENTED) NOT IMPLEMENTED)
""" """
x = T.fmatrix('x') assert dtypeInput in ['float32', 'float64']
assert dtypeBias in ['float32', 'float64']
if dtypeInput == 'float32':
x = T.fmatrix('x')
elif dtypeInput == 'float64':
x = T.dmatrix('x')
# We can't use zeros_like(x[0,::]) as this don't allow to test with # We can't use zeros_like(x[0,::]) as this don't allow to test with
# 0 shape. # 0 shape
z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2, if dtypeBias == 'float32':
dtype='float32')[::2]) z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
dtype='float32')[::2])
elif dtypeBias == 'float64':
z = T.nnet.softmax_with_bias(x, T.arange(x.shape[1] * 2,
dtype='float64')[::2])
f = theano.function([x], z, mode=mode_without_gpu) f = theano.function([x], z, mode=mode_without_gpu)
f_gpu = theano.function([x], z, mode=mode_with_gpu) f_gpu = theano.function([x], z, mode=mode_with_gpu)
...@@ -182,7 +206,11 @@ def test_softmax_with_bias(): ...@@ -182,7 +206,11 @@ def test_softmax_with_bias():
def cmp(n, m): def cmp(n, m):
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) if dtypeInput == 'float32':
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
elif dtypeInput == 'float64':
data = numpy.arange(n * m, dtype='float64').reshape(n, m)
out = f(data) out = f(data)
gout = f_gpu(data) gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout) assert numpy.allclose(out, gout), numpy.absolute(out - gout)
...@@ -205,14 +233,25 @@ def test_softmax_with_bias(): ...@@ -205,14 +233,25 @@ def test_softmax_with_bias():
cmp(128, 64 * 1024) cmp(128, 64 * 1024)
def test_softmax(): def test_softmax_float32():
softmax_unittest_template('float32')
def test_softmax_float64():
softmax_unittest_template('float32')
def softmax_unittest_template(dtypeInput):
""" """
This is basic test for GpuSoftmax This is basic test for GpuSoftmax with float64 variables
We check that we loop when their is too much block We check that we loop when their is too much block
We use slower code when there isn't enough shared memory We use slower code when there isn't enough shared memory
""" """
x = T.fmatrix('x') assert dtypeInput in ['float32', 'float64']
if dtypeInput == 'float32':
x = T.fmatrix('x')
elif dtypeInput == 'float64':
x = T.dmatrix('x')
z = T.nnet.softmax(x) z = T.nnet.softmax(x)
f = theano.function([x], z, mode=mode_without_gpu) f = theano.function([x], z, mode=mode_without_gpu)
...@@ -222,8 +261,11 @@ def test_softmax(): ...@@ -222,8 +261,11 @@ def test_softmax():
theano.sandbox.gpuarray.nnet.GpuSoftmax) theano.sandbox.gpuarray.nnet.GpuSoftmax)
def cmp(n, m): def cmp(n, m):
#print "test_softmax",n,m if dtypeInput == 'float32':
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
elif dtypeInput == 'float64':
data = numpy.arange(n * m, dtype='float64').reshape(n, m)
out = f(data) out = f(data)
gout = f_gpu(data) gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout) assert numpy.allclose(out, gout), numpy.absolute(out - gout)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论