提交 609593b4 authored 作者: Frederic Bastien's avatar Frederic Bastien

Fix GpuSum patterns 01, 011 and 0111 when the outer dimension is bigger than 4096.

上级 791b8bcf
...@@ -1062,7 +1062,7 @@ class GpuSum(Op): ...@@ -1062,7 +1062,7 @@ class GpuSum(Op):
""" %locals() """ %locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (15,) return (16,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
...@@ -1174,7 +1174,7 @@ class GpuSum(Op): ...@@ -1174,7 +1174,7 @@ class GpuSum(Op):
for_i2 = "for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)" for_i2 = "for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)" for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]') reducebuf = self._k_reduce_buf('Z[i0 * sZ0]')
param_dim = ",".join(["const int d%(i)s"%locals() for i in range(nd_in)]) param_dim = ",".join(["const int d%(i)s"%locals() for i in range(nd_in)])
param_strides = ",".join(["const int sA%(i)s"%locals() for i in range(nd_in)]) param_strides = ",".join(["const int sA%(i)s"%locals() for i in range(nd_in)])
decl = self._k_decl(node,nodename) decl = self._k_decl(node,nodename)
...@@ -1182,15 +1182,18 @@ class GpuSum(Op): ...@@ -1182,15 +1182,18 @@ class GpuSum(Op):
print >> sio, """ print >> sio, """
%(decl)s{ %(decl)s{
%(init)s %(init)s
%(for_i1)s{ for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
%(for_i2)s{ mysum = 0;
%(for_i3)s{ %(for_i1)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + blockIdx.x * sA0]; %(for_i2)s{
mysum += Ai; %(for_i3)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
mysum += Ai;
}
} }
} }
%(reducebuf)s
} }
%(reducebuf)s
} }
""" %locals() """ %locals()
if self.reduce_mask == (1,0): if self.reduce_mask == (1,0):
......
...@@ -38,7 +38,27 @@ def test_sum(): ...@@ -38,7 +38,27 @@ def test_sum():
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test. ((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]), ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]), ((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3])]: ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
#((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010 ##not implemented
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
]:
a = tensor.TensorType('float32',(False,)*len(shape))() a = tensor.TensorType('float32',(False,)*len(shape))()
b = T.Sum(pattern)(a) b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论