提交 215a3ae6 authored 作者: Frederic Bastien's avatar Frederic Bastien

Added GpuSum support for reduction pattern 0011 (sum over the last two axes of a 4d tensor)

上级 63329761
...@@ -533,6 +533,8 @@ class GpuSum(Op): ...@@ -533,6 +533,8 @@ class GpuSum(Op):
print >> sio, """ print >> sio, """
if (verbose) printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n"); if (verbose) printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z; int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1) printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d, n_blocks.x=%%d, n_blocks.y=%%d n_shared=%%d\\n",
n_threads.x,n_threads.y,n_threads.z,n_blocks.x,n_blocks.y,n_shared);
kernel_reduce_sum_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" %locals() """ %locals()
for i in xrange(ndim): for i in xrange(ndim):
...@@ -972,6 +974,35 @@ class GpuSum(Op): ...@@ -972,6 +974,35 @@ class GpuSum(Op):
} }
""" % locals() """ % locals()
def c_code_reduce_0011(self, sio, node, name, x, z, fail):
    """Emit host-side C code that launches the reduction kernel for
    reduce_mask (0, 0, 1, 1): summing a 4d tensor over its last two axes.

    :param sio: file-like buffer the generated C code is printed into
    :param node: the Apply node being compiled (forwarded to _makecall)
    :param name: unique suffix used to name the kernel for this node
    :param x: C variable name of the input CudaNdarray
    :param z: C variable name of the output CudaNdarray
    :param fail: C code to execute on error (forwarded to _makecall)
    """
    # _makecall builds the kernel-invocation snippet (defined elsewhere
    # in this class); it expands to the <<<n_blocks, n_threads>>> launch.
    makecall = self._makecall(node, name, x, z, fail)
    print >> sio, """
    {
        int verbose = 0;
        // One grid (x, y) coordinate per output element: block.x walks
        // input dim 0, block.y walks input dim 1 (both kept, mask 0s).
        dim3 n_blocks(
                std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
                         NUM_VECTOR_OP_BLOCKS));
        // Grow the grid's y extent while the total block count stays
        // within NUM_VECTOR_OP_BLOCKS and dim 1 is not yet covered.
        while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS && n_blocks.y < CudaNdarray_HOST_DIMS(%(x)s)[1])
        {
            n_blocks.y += 1;
        }
        // Threads cooperate over the reduced dims: thread.x walks dim 3,
        // thread.y walks dim 2 (both summed, mask 1s).
        dim3 n_threads(
                std::min(CudaNdarray_HOST_DIMS(%(x)s)[3],
                         NUM_VECTOR_OP_THREADS_PER_BLOCK));
        // Grow thread.y while staying within the per-block thread limit
        // and within (15*1024-200) bytes of float buffer per block --
        // presumably a shared-memory budget with headroom; TODO confirm
        // against the kernel's n_shared computation.
        while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK
               && n_threads.y < CudaNdarray_HOST_DIMS(%(x)s)[2]
               && n_threads.x * n_threads.y * sizeof(float) <= (15*1024-200))
        {
            n_threads.y += 1;
        }
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_1111(self, sio, node, name, x, z, fail): def c_code_reduce_1111(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
print >> sio, """ print >> sio, """
...@@ -1060,7 +1091,7 @@ class GpuSum(Op): ...@@ -1060,7 +1091,7 @@ class GpuSum(Op):
""" %locals() """ %locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (11,) return (12,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
...@@ -1343,6 +1374,34 @@ class GpuSum(Op): ...@@ -1343,6 +1374,34 @@ class GpuSum(Op):
} }
} }
""" %locals() """ %locals()
# CUDA kernel source for reduce_mask (0, 0, 1, 1): sum a 4d input over
# its last two axes into a 2d output Z[i0, i1].
if self.reduce_mask == (0,0,1,1):
    # this kernel uses one block for each row,
    # threads per block for each element per row.
    # (NOTE(review): the launcher actually uses a 2d grid -- one block
    # per (i0, i1) output element -- with the grid-stride loops below
    # covering any remainder.)
    #
    # _k_reduce_buf presumably emits the block-wide reduction of
    # `mysum` into the given Z element (helper defined elsewhere);
    # TODO confirm.
    reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
    decl = self._k_decl(node, nodename)   # kernel signature boilerplate
    init = self._k_init(node, nodename)   # kernel preamble boilerplate
    print >> sio, """
    %(decl)s
    {
        %(init)s
        // Blocks stride over the two kept axes (dims 0 and 1).
        for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
        {
            for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
            {
                float mysum = 0.0f;
                // Threads stride over the two reduced axes (dims 2, 3);
                // each thread accumulates a partial sum in mysum.
                for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
                {
                    for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
                    {
                        mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
                    }
                }
                // Combine the per-thread partials into Z[i0, i1].
                %(reducebuf)s
            }
        }
    }
    """ %locals()
if self.reduce_mask == (1,1,1,1): if self.reduce_mask == (1,1,1,1):
reducebuf = self._k_reduce_buf('Z[0]') reducebuf = self._k_reduce_buf('Z[0]')
decl = self._k_decl(node, nodename) decl = self._k_decl(node, nodename)
......
...@@ -31,14 +31,14 @@ def tes_use(): ...@@ -31,14 +31,14 @@ def tes_use():
def test_sum(): def test_sum():
""" """
test sum pattern 1, 11, 10, 01, 100, 110, 011, 001, 111, 0111, 1011, 1111 test sum pattern 1, 11, 10, 01, 100, 110, 011, 001, 111, 0011, 0111, 1011, 1111
TODO: test with broadcast TODO: test with broadcast
""" """
for shape, pattern in [((5,),[0]), for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test. ((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]), ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3])]: ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3])]:
a = tensor.TensorType('float32',(False,)*len(shape))() a = tensor.TensorType('float32',(False,)*len(shape))()
b = T.Sum(pattern)(a) b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论