提交 26496654 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Make GpuSoftmax and GpuSoftmaxWithBias work with f16

上级 4a2e513e
...@@ -121,7 +121,7 @@ def inline_reduce_prod(N, buf, pos, count): ...@@ -121,7 +121,7 @@ def inline_reduce_prod(N, buf, pos, count):
lambda a, b: "%s * %s" % (a, b)) lambda a, b: "%s * %s" % (a, b))
@code_version((2,) + inline_reduce_max.code_version + @code_version((3,) + inline_reduce_max.code_version +
inline_reduce_sum.code_version) inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
""" """
...@@ -165,10 +165,10 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -165,10 +165,10 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
] ]
@code_version((1,)) @code_version((2,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
manner_fn, manner_init, manner_fn, manner_init,
b='', stride_b='', dtype='float32'): b='', stride_b='', load_b='', dtype='float32'):
"""Return C++ code for a function that reduces a contiguous buffer. """Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer :param N: length of the buffer
...@@ -193,15 +193,15 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, ...@@ -193,15 +193,15 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
""" """
if b: if b:
init = manner_init("%(x)s[%(pos)s * %(stride_x)s] +" init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +"
" %(b)s[%(pos)s * %(stride_b)s]" % locals()) " %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" % locals())
loop_line = manner_fn("red", loop_line = manner_fn("red",
manner_init("%(x)s[i * %(stride_x)s] + " manner_init("%(load_x)s(%(x)s[i * %(stride_x)s]) + "
"%(b)s[i * %(stride_b)s]" % "%(load_b)s(%(b)s[i * %(stride_b)s])" %
locals())) locals()))
else: else:
init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals()) init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s])" % locals())
loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" % loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
locals())) locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos), loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf) "%s[i]" % buf)
...@@ -248,20 +248,22 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, ...@@ -248,20 +248,22 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
@code_version(inline_reduce_fixed_shared.code_version) @code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count, def inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x, pos, count,
b='', stride_b='', dtype='float32'): b='', stride_b='', load_b='',
return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, dtype='float32'):
return inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
lambda a, b: "max(%s, %s)" % (a, b), lambda a, b: "max(%s, %s)" % (a, b),
lambda a: a, lambda a: a,
b, stride_b, dtype) b, stride_b, load_b, dtype)
@code_version((1,) + inline_reduce_max.code_version + @code_version((2,) + inline_reduce_max.code_version +
inline_reduce_sum.code_version) inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x, def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
sm, sm_stride, sm, sm_stride, write_sm,
threadPos, threadCount, threadPos, threadCount,
b='', stride_b='', dtype="float32"): b='', stride_b='', load_b='',
dtype="float32"):
""" """
:param N: length of the buffer, atleast waprSize(32). :param N: length of the buffer, atleast waprSize(32).
...@@ -286,16 +288,18 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, ...@@ -286,16 +288,18 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
""" """
ret = [ ret = [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max(N, buf, x, stride_x, inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
threadPos, threadCount, b, stride_b, threadPos, threadCount,
b, stride_b, load_b,
dtype), dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_max = ' + buf + '[0]') % dtype, ('npy_%s row_max = ' + buf + '[0]') % dtype,
'__syncthreads()', '__syncthreads()',
inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount, inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
threadPos, threadCount,
lambda a, b: "%s + %s" % (a, b), lambda a, b: "%s + %s" % (a, b),
lambda a: "exp(%s - row_max)" % a, lambda a: "exp(%s - row_max)" % a,
b, stride_b, dtype), b, stride_b, load_b, dtype),
'__syncthreads()', '__syncthreads()',
('npy_%s row_sum = ' + buf + '[0]') % dtype, ('npy_%s row_sum = ' + buf + '[0]') % dtype,
'__syncthreads()', '__syncthreads()',
...@@ -305,13 +309,14 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, ...@@ -305,13 +309,14 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
if b: if b:
ret += [ ret += [
"%(sm)s[tx * %(sm_stride)s] = " "%(sm)s[tx * %(sm_stride)s] = "
" exp(%(x)s[tx * %(stride_x)s] +" " %(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) +"
" %(b)s[tx * %(stride_b)s] - row_max)" " %(load_b)s(%(b)s[tx * %(stride_b)s]) - row_max)"
" / row_sum" % locals()] " / row_sum)" % locals()]
else: else:
ret += [ ret += [
"%(sm)s[tx * %(sm_stride)s] = " "%(sm)s[tx * %(sm_stride)s] = "
"exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals()] "%(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) - row_max)"
" / row_sum)" % locals()]
ret += [ ret += [
"}", "}",
'__syncthreads()', '__syncthreads()',
......
...@@ -464,6 +464,7 @@ class GpuSoftmax (Op): ...@@ -464,6 +464,7 @@ class GpuSoftmax (Op):
Implement Softmax on the gpu. Implement Softmax on the gpu.
""" """
__props__ = () __props__ = ()
_f16_ok = True
def make_node(self, x): def make_node(self, x):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
...@@ -473,7 +474,7 @@ class GpuSoftmax (Op): ...@@ -473,7 +474,7 @@ class GpuSoftmax (Op):
return shape return shape
def c_code_cache_version(self): def c_code_cache_version(self):
return (12,) + inline_softmax.code_version return (13,) + inline_softmax.code_version
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
...@@ -487,6 +488,7 @@ class GpuSoftmax (Op): ...@@ -487,6 +488,7 @@ class GpuSoftmax (Op):
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
work_x = work_dtype(dtype_x)
dtype_z = node.outputs[0].dtype dtype_z = node.outputs[0].dtype
itemsize_x = numpy.dtype(dtype_x).itemsize itemsize_x = numpy.dtype(dtype_x).itemsize
itemsize_z = numpy.dtype(dtype_z).itemsize itemsize_z = numpy.dtype(dtype_z).itemsize
...@@ -525,7 +527,7 @@ class GpuSoftmax (Op): ...@@ -525,7 +527,7 @@ class GpuSoftmax (Op):
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512); int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] * int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(dtype_x)s); 2 * sizeof(npy_%(work_x)s);
if (PyGpuArray_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
...@@ -559,7 +561,7 @@ class GpuSoftmax (Op): ...@@ -559,7 +561,7 @@ class GpuSoftmax (Op):
<<< <<<
n_blocks, n_blocks,
n_threads, n_threads,
n_threads * sizeof(npy_%(dtype_x)s) n_threads * sizeof(npy_%(work_x)s)
>>>( >>>(
PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
...@@ -597,25 +599,28 @@ class GpuSoftmax (Op): ...@@ -597,25 +599,28 @@ class GpuSoftmax (Op):
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_sm = node.outputs[0].dtype dtype_sm = node.outputs[0].dtype
load_x = load_w(node.inputs[0].dtype)
write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype)
ret1 = nvcc_kernel("kSoftmax_%s" % nodename, ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'], 'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=[
"extern __shared__ npy_%(dtype_sm)s buf[]", "extern __shared__ npy_%(work_sm)s buf[]",
"npy_%(dtype_sm)s * buf2 = buf + N", "npy_%(work_sm)s * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]", "buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])",
"buf2[tx] = buf[tx]", "buf2[tx] = buf[tx]",
"}", "}",
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', dtype_sm), 'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly # This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])",
"}", "}",
"__syncthreads()", "__syncthreads()",
"}", "}",
...@@ -625,15 +630,16 @@ class GpuSoftmax (Op): ...@@ -625,15 +630,16 @@ class GpuSoftmax (Op):
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'], 'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=[
"extern __shared__ npy_%(dtype_sm)s buf[]", "extern __shared__ npy_%(work_sm)s buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]", "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]", "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1', inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
'sm_ptr', 'sm_s1', load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x', 'threadIdx.x', 'blockDim.x',
dtype=dtype_sm), dtype=work_sm),
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
...@@ -649,6 +655,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -649,6 +655,7 @@ class GpuSoftmaxWithBias (Op):
nin = 2 nin = 2
nout = 1 nout = 1
__props__ = () __props__ = ()
_f16_ok = True
def make_node(self, x, b): def make_node(self, x, b):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
...@@ -659,7 +666,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -659,7 +666,7 @@ class GpuSoftmaxWithBias (Op):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
return (11,) + inline_softmax.code_version return (12,) + inline_softmax.code_version
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
...@@ -675,6 +682,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -675,6 +682,7 @@ class GpuSoftmaxWithBias (Op):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype dtype_b = node.inputs[1].dtype
dtype_z = node.outputs[0].dtype dtype_z = node.outputs[0].dtype
work_x = work_dtype(dtype_x)
itemsize_x = numpy.dtype(dtype_x).itemsize itemsize_x = numpy.dtype(dtype_x).itemsize
itemsize_b = numpy.dtype(dtype_b).itemsize itemsize_b = numpy.dtype(dtype_b).itemsize
itemsize_z = numpy.dtype(dtype_z).itemsize itemsize_z = numpy.dtype(dtype_z).itemsize
...@@ -727,7 +735,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -727,7 +735,7 @@ class GpuSoftmaxWithBias (Op):
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512); int n_threads = std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)512);
int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] * int n_shared_bytes = PyGpuArray_DIMS(%(x)s)[1] *
2 * sizeof(npy_%(dtype_x)s); 2 * sizeof(npy_%(work_x)s);
if (PyGpuArray_DIMS(%(x)s)[0] > 0) if (PyGpuArray_DIMS(%(x)s)[0] > 0)
{ {
if(n_shared_bytes < (32 * 1024 - 500)){ if(n_shared_bytes < (32 * 1024 - 500)){
...@@ -760,7 +768,7 @@ class GpuSoftmaxWithBias (Op): ...@@ -760,7 +768,7 @@ class GpuSoftmaxWithBias (Op):
<<< <<<
n_blocks, n_blocks,
n_threads, n_threads,
n_threads * sizeof(npy_%(dtype_x)s) n_threads * sizeof(npy_%(work_x)s)
>>>( >>>(
PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
...@@ -802,26 +810,30 @@ class GpuSoftmaxWithBias (Op): ...@@ -802,26 +810,30 @@ class GpuSoftmaxWithBias (Op):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype dtype_b = node.inputs[1].dtype
dtype_sm = node.outputs[0].dtype dtype_sm = node.outputs[0].dtype
load_x = load_w(node.inputs[0].dtype)
load_b = load_w(node.inputs[1].dtype)
write_sm = write_w(node.outputs[0].dtype)
work_sm = work_dtype(node.outputs[0].dtype)
ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename, ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1', 'const npy_%(dtype_x)s * x', 'const int sx0', 'const int sx1',
'const npy_%(dtype_b)s * b', 'const int sb0', 'const npy_%(dtype_b)s * b', 'const int sb0',
'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'], 'npy_%(dtype_sm)s * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=[
"extern __shared__ npy_%(dtype_sm)s buf[]", "extern __shared__ npy_%(work_sm)s buf[]",
"npy_%(dtype_sm)s * buf2 = buf + N", "npy_%(work_sm)s * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]", "buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1])",
"buf[tx] += b[tx * sb0]", "buf[tx] += %(load_b)s(b[tx * sb0])",
"buf2[tx] = buf[tx]", "buf2[tx] = buf[tx]",
"}", "}",
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', dtype_sm), 'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx])",
"}", "}",
"__syncthreads()", "__syncthreads()",
"}", "}",
...@@ -834,18 +846,20 @@ class GpuSoftmaxWithBias (Op): ...@@ -834,18 +846,20 @@ class GpuSoftmaxWithBias (Op):
'npy_%(dtype_sm)s * sm', 'npy_%(dtype_sm)s * sm',
'const int sm_s0', 'const int sm_s1'], 'const int sm_s0', 'const int sm_s1'],
body=[ body=[
"extern __shared__ npy_%(dtype_sm)s buf[]", "extern __shared__ npy_%(work_sm)s buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]", "const npy_%(dtype_x)s *x_ptr = &x[blockIDX * sx0]",
"npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]", "npy_%(dtype_sm)s *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', inline_softmax_fixed_shared('N', 'buf',
'x_ptr', 'sx1', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', 'sm_ptr', 'sm_s1',
write_sm,
'threadIdx.x', 'threadIdx.x',
'blockDim.x', 'blockDim.x',
'b', 'sb0', 'b', 'sb0', load_b,
dtype_sm), work_sm),
"__syncthreads()", "__syncthreads()",
"}", "}",
]) ])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论