提交 b6c266e6 作者: Frederic Bastien

Fix a bug on the new Fermi GPUs.

上级 06c15dc1
...@@ -646,6 +646,8 @@ class GpuSum(Op): ...@@ -646,6 +646,8 @@ class GpuSum(Op):
mysum += buf[i]; mysum += buf[i];
} }
buf[threadNum] = mysum; buf[threadNum] = mysum;
/*Comment out this optimization as it doesn't work on Fermi GPUs.
TODO: find out why it doesn't work, or put the GPU compute capability into the version
// no sync because only one warp is running // no sync because only one warp is running
if(threadCount >32) if(threadCount >32)
{ {
...@@ -660,7 +662,8 @@ class GpuSum(Op): ...@@ -660,7 +662,8 @@ class GpuSum(Op):
} }
} }
else if (threadNum < 16) else */
if (threadNum < 16)
{ {
//reduce so that threadNum 0 has the sum of everything //reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16]; if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
...@@ -1057,8 +1060,7 @@ class GpuSum(Op): ...@@ -1057,8 +1060,7 @@ class GpuSum(Op):
""" %locals() """ %locals()
def c_code_cache_version(self): def c_code_cache_version(self):
#return () return (11,)
return (10,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论