提交 4e1e0f8a 作者: Frederic Bastien

small optimization to GpuSum

上级 5c4a7c72
...@@ -636,7 +636,20 @@ class GpuSum(Op): ...@@ -636,7 +636,20 @@ class GpuSum(Op):
} }
buf[threadNum] = mysum; buf[threadNum] = mysum;
// no sync because only one warp is running // no sync because only one warp is running
if (threadNum < 16) if(threadCount >32)
{
buf[threadNum] += buf[threadNum+16];
buf[threadNum] += buf[threadNum+8];
buf[threadNum] += buf[threadNum+4];
buf[threadNum] += buf[threadNum+2];
buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
%(z_pos)s = buf[0];
}
}
else if (threadNum < 16)
{ {
//reduce so that threadNum 0 has the sum of everything //reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16]; if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
...@@ -979,7 +992,7 @@ class GpuSum(Op): ...@@ -979,7 +992,7 @@ class GpuSum(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (9,) return (10,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
......
...@@ -36,7 +36,7 @@ def test_sum(): ...@@ -36,7 +36,7 @@ def test_sum():
""" """
for shape, pattern in [((5,),[0]), for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]), ((5,4),[0,1]),((33,31),[0,1]),((5,4),[0]),#need something bigger than 32 for some opt test.
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]), ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]: ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
a = tensor.TensorType('float32',(False,)*len(shape))() a = tensor.TensorType('float32',(False,)*len(shape))()
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论