提交 bab14bae authored 作者: Frederic Bastien's avatar Frederic Bastien

bump version number following bugfix and remove code duplication.

上级 a59d9a54
...@@ -924,7 +924,7 @@ class GpuSum(Op): ...@@ -924,7 +924,7 @@ class GpuSum(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (7,) return (8,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
...@@ -932,6 +932,7 @@ class GpuSum(Op): ...@@ -932,6 +932,7 @@ class GpuSum(Op):
if self.reduce_mask == (1,): if self.reduce_mask == (1,):
#this kernel is ok for up to a few thousand elements, but #this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_1_%(nodename)s( static __global__ void kernel_reduce_sum_1_%(nodename)s(
const unsigned int d0, const unsigned int d0,
...@@ -953,31 +954,7 @@ class GpuSum(Op): ...@@ -953,31 +954,7 @@ class GpuSum(Op):
float Ai = A[i0 * sA0]; float Ai = A[i0 * sA0];
mysum += Ai; mysum += Ai;
} }
buf[threadNum] = mysum; %(reducebuf)s
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[0] = buf[0];
}
}
}
} }
""" %locals() """ %locals()
if self.reduce_mask == (1,1): if self.reduce_mask == (1,1):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论