提交 4e1e0f8a authored 作者: Frederic Bastien's avatar Frederic Bastien

small optimization to GpuSum

上级 5c4a7c72
......@@ -636,7 +636,20 @@ class GpuSum(Op):
}
buf[threadNum] = mysum;
// no sync because only one warp is running
if (threadNum < 16)
if(threadCount >32)
{
buf[threadNum] += buf[threadNum+16];
buf[threadNum] += buf[threadNum+8];
buf[threadNum] += buf[threadNum+4];
buf[threadNum] += buf[threadNum+2];
buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
%(z_pos)s = buf[0];
}
}
else if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
......@@ -979,7 +992,7 @@ class GpuSum(Op):
def c_code_cache_version(self):
#return ()
return (9,)
return (10,)
def c_support_code_apply(self, node, nodename):
......
......@@ -36,7 +36,7 @@ def test_sum():
"""
for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4),[0,1]),((33,31),[0,1]),((5,4),[0]),#need something bigger than 32 for some opt test.
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
a = tensor.TensorType('float32',(False,)*len(shape))()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论