提交 bab14bae authored 作者: Frederic Bastien's avatar Frederic Bastien

bump version number following bugfix and remove code duplication.

上级 a59d9a54
......@@ -924,7 +924,7 @@ class GpuSum(Op):
def c_code_cache_version(self):
#return ()
return (7,)
return (8,)
def c_support_code_apply(self, node, nodename):
......@@ -932,6 +932,7 @@ class GpuSum(Op):
if self.reduce_mask == (1,):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
print >> sio, """
static __global__ void kernel_reduce_sum_1_%(nodename)s(
const unsigned int d0,
......@@ -953,31 +954,7 @@ class GpuSum(Op):
float Ai = A[i0 * sA0];
mysum += Ai;
}
buf[threadNum] = mysum;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[0] = buf[0];
}
}
}
%(reducebuf)s
}
""" %locals()
if self.reduce_mask == (1,1):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论