提交 46a0de64 authored 作者: James Bergstra's avatar James Bergstra

GpuSum - extending case 010 to work for large grids

上级 7c4ef142
...@@ -902,7 +902,11 @@ class GpuSum(Op): ...@@ -902,7 +902,11 @@ class GpuSum(Op):
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[2]); dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_BLOCKS));
n_blocks.y = std::min(
CudaNdarray_HOST_DIMS(%(x)s)[2],
NUM_VECTOR_OP_BLOCKS / n_blocks.x
);
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n"); if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
...@@ -1334,7 +1338,7 @@ class GpuSum(Op): ...@@ -1334,7 +1338,7 @@ class GpuSum(Op):
#TODO: This kernel is pretty inefficient in terms of reading, because if A is #TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous # c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column). # memory (a segment of a column).
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0 + blockIdx.y*sZ1]') reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_010_%(nodename)s( static __global__ void kernel_reduce_sum_010_%(nodename)s(
const int d0, const int d0,
...@@ -1353,11 +1357,19 @@ class GpuSum(Op): ...@@ -1353,11 +1357,19 @@ class GpuSum(Op):
return; //TODO: set error code return; //TODO: set error code
} }
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{ {
mysum += A[blockIdx.x * sA0 + i1 * sA1 + blockIdx.y * sA2]; for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
%(reducebuf)s
}
} }
%(reducebuf)s
} }
""" %locals() """ %locals()
if self.reduce_mask == (1,1,0): if self.reduce_mask == (1,1,0):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论