提交 2c6d7e6e 作者: xiaoqie

remove redundant local_barrier()

上级 e82a538e
...@@ -639,7 +639,6 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -639,7 +639,6 @@ class GpuSoftmax(GpuKernelBase, Op):
local_barrier(); local_barrier();
} }
} }
local_barrier();
%(ctype)s row_max = buf[0]; %(ctype)s row_max = buf[0];
local_barrier(); local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){ for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
...@@ -666,9 +665,7 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -666,9 +665,7 @@ class GpuSoftmax(GpuKernelBase, Op):
local_barrier(); local_barrier();
} }
} }
local_barrier();
%(ctype)s row_sum = buf[0]; %(ctype)s row_sum = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0) { for(ga_int __i=LID_0; __i<N; __i+=LDIM_0) {
buf[__i] = buf2[__i] / row_sum; buf[__i] = buf2[__i] / row_sum;
} }
...@@ -720,7 +717,6 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -720,7 +717,6 @@ class GpuSoftmax(GpuKernelBase, Op):
local_barrier(); local_barrier();
} }
} }
local_barrier();
%(ctype)s row_max = buf[0]; %(ctype)s row_max = buf[0];
local_barrier(); local_barrier();
...@@ -748,14 +744,11 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -748,14 +744,11 @@ class GpuSoftmax(GpuKernelBase, Op):
} }
} }
local_barrier();
%(ctype)s row_sum = buf[0]; %(ctype)s row_sum = buf[0];
local_barrier();
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){ for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) - row_max) / row_sum); sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) - row_max) / row_sum);
} }
local_barrier(); local_barrier();
local_barrier();
} }
} }
""" % locals() """ % locals()
...@@ -962,7 +955,6 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -962,7 +955,6 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
} }
} }
local_barrier();
%(ctype)s row_max = buf[0]; %(ctype)s row_max = buf[0];
local_barrier(); local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){; for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){;
...@@ -990,9 +982,7 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -990,9 +982,7 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
} }
} }
local_barrier();
%(ctype)s row_sum = buf[0]; %(ctype)s row_sum = buf[0];
local_barrier();
for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){ for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){
buf[__i] = buf2[__i] / row_sum; buf[__i] = buf2[__i] / row_sum;
} }
...@@ -1047,10 +1037,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -1047,10 +1037,8 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
} }
} }
local_barrier();
%(ctype)s row_max = buf[0]; %(ctype)s row_max = buf[0];
local_barrier(); local_barrier();
{ {
// This function trashes buf[1..n_threads], // This function trashes buf[1..n_threads],
// leaving the reduction result in buf[0]. // leaving the reduction result in buf[0].
...@@ -1075,14 +1063,11 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -1075,14 +1063,11 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
} }
} }
local_barrier();
%(ctype)s row_sum = buf[0]; %(ctype)s row_sum = buf[0];
local_barrier();
for (ga_int tx = LID_0; tx< N; tx += LDIM_0){ for (ga_int tx = LID_0; tx< N; tx += LDIM_0){
sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum); sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum);
} }
local_barrier(); local_barrier();
local_barrier();
} }
} }
""" % locals() """ % locals()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论