提交 f83f03af authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Fix GpuSoftmax and GpuSoftmaxWithBias for non-float32 operation.

上级 a5814d54
...@@ -524,7 +524,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -524,7 +524,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx() gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax (GpuKernelBase, Op): class GpuSoftmax(GpuKernelBase, Op):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
...@@ -541,7 +541,7 @@ class GpuSoftmax (GpuKernelBase, Op): ...@@ -541,7 +541,7 @@ class GpuSoftmax (GpuKernelBase, Op):
return shape return shape
def c_code_cache_version(self): def c_code_cache_version(self):
return (13,) + inline_softmax.code_version return (14,) + inline_softmax.code_version
def c_header_dirs(self): def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
...@@ -656,7 +656,8 @@ class GpuSoftmax (GpuKernelBase, Op): ...@@ -656,7 +656,8 @@ class GpuSoftmax (GpuKernelBase, Op):
work_sm = work_dtype(dtype_sm) work_sm = work_dtype(dtype_sm)
flags = Kernel.get_flags(dtype_x, dtype_sm) flags = Kernel.get_flags(dtype_x, dtype_sm)
type_x = gpuarray.dtype_to_ctype(dtype_x) type_x = gpuarray.dtype_to_ctype(dtype_x)
type_sm = gpuarray.dtype_to_ctype(work_sm) type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_acc = gpuarray.dtype_to_ctype(work_sm)
params = [ params = [
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', 'intp',
...@@ -672,8 +673,8 @@ class GpuSoftmax (GpuKernelBase, Op): ...@@ -672,8 +673,8 @@ class GpuSoftmax (GpuKernelBase, Op):
'%s * sm' % type_sm, 'const ga_size offset_sm', '%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'], 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=[
"extern __shared__ %s buf[]" % type_sm, "extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_sm, "%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x, "x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm, "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
...@@ -683,8 +684,8 @@ class GpuSoftmax (GpuKernelBase, Op): ...@@ -683,8 +684,8 @@ class GpuSoftmax (GpuKernelBase, Op):
"buf2[tx] = buf[tx]", "buf2[tx] = buf[tx]",
"}", "}",
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', inline_softmax('N', 'buf', 'buf2', 'threadIdx.x',
'threadIdx.x', 'blockDim.x', work_sm), 'blockDim.x', dtype=work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly # This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm, "sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
...@@ -703,7 +704,7 @@ class GpuSoftmax (GpuKernelBase, Op): ...@@ -703,7 +704,7 @@ class GpuSoftmax (GpuKernelBase, Op):
'%s * sm' % type_sm, 'const ga_size offset_sm', '%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'], 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=[
"extern __shared__ %s buf[]" % type_sm, "extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x, "x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm, "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
...@@ -745,7 +746,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -745,7 +746,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
return (12,) + inline_softmax.code_version return (13,) + inline_softmax.code_version
def c_header_dirs(self): def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
...@@ -880,7 +881,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -880,7 +881,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm)
type_x = gpuarray.dtype_to_ctype(dtype_x) type_x = gpuarray.dtype_to_ctype(dtype_x)
type_b = gpuarray.dtype_to_ctype(dtype_b) type_b = gpuarray.dtype_to_ctype(dtype_b)
type_sm = gpuarray.dtype_to_ctype(work_sm) type_sm = gpuarray.dtype_to_ctype(dtype_sm)
type_acc = gpuarray.dtype_to_ctype(work_sm)
params = [ params = [
'uintp', 'uintp', 'uintp', 'uintp',
gpuarray.GpuArray, 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', 'intp',
...@@ -899,8 +901,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -899,8 +901,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
'%s * sm' % type_sm, 'const ga_size offset_sm', '%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'], 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=[
"extern __shared__ %s buf[]" % type_sm, "extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_sm, "%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x, "x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b, "b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm, "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
...@@ -933,7 +935,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -933,7 +935,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
'%s * sm' % type_sm, 'const ga_size offset_sm', '%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'], 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=[
"extern __shared__ %s buf[]" % type_sm, "extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x, "x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b, "b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm, "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论