提交 1d7a79a5 作者: James Bergstra

Merge pull request #217 from nouiz/fix_sum

Small fix to GpuSum: it can happen that we start two reductions at the same time.
...@@ -664,6 +664,7 @@ class GpuSum(Op): ...@@ -664,6 +664,7 @@ class GpuSum(Op):
def _k_reduce_buf(self, z_pos): def _k_reduce_buf(self, z_pos):
return """ return """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = mysum; buf[threadNum] = mysum;
__syncthreads(); __syncthreads();
...@@ -713,6 +714,7 @@ class GpuSum(Op): ...@@ -713,6 +714,7 @@ class GpuSum(Op):
#nb_reduce<=warpSize #nb_reduce<=warpSize
def _k_reduce_buf_multiple(self, z_pos, nb_reduce): def _k_reduce_buf_multiple(self, z_pos, nb_reduce):
return """ return """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = mysum; buf[threadNum] = mysum;
__syncthreads(); __syncthreads();
...@@ -1214,7 +1216,7 @@ class GpuSum(Op): ...@@ -1214,7 +1216,7 @@ class GpuSum(Op):
""" %locals() """ %locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (21,) return (22,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
sio = StringIO.StringIO() sio = StringIO.StringIO()
......
...@@ -57,9 +57,16 @@ def test_sum(): ...@@ -57,9 +57,16 @@ def test_sum():
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension #test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10 ((4100,3),[0]),((3,4101),[0]),#10
((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01 ((4100,3),[1]),((3,4101),[1]),#01
((1024,33),[1]),((33,1024),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11 ((4100,3),[0,1]),((3,4101),[0,1]),#11
((1024,33),[0,1]),((33,1024),[0,1]),#11
((1025,33),[0,1]),((33,1025),[0,1]),#11
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100 ((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010 ((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论