Commit 05ee56d6, authored by Frederic

Speed up reduction 100 on the GPU

Parent 5d1915c3
@@ -1557,15 +1557,65 @@ class GpuCAReduce(GpuOp):
print("""
{
int verbose = 0;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], NUM_VECTOR_OP_BLOCKS));
while (n_blocks.x * (n_blocks.y+1) <= NUM_VECTOR_OP_BLOCKS && n_blocks.y <= CudaNdarray_HOST_DIMS(%(x)s)[2])
{
n_blocks.y += 1;
if (CudaNdarray_HOST_STRIDES(%(x)s)[2] != 1){
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
NUM_VECTOR_OP_BLOCKS));
while (n_blocks.x * (n_blocks.y+1) <= NUM_VECTOR_OP_BLOCKS &&
n_blocks.y <= CudaNdarray_HOST_DIMS(%(x)s)[2])
{
n_blocks.y += 1;
}
%(makecall)s
}
else
{ // reuse 010_AD kernel, we transpose the 2 first dim
// See the reduction for the real 010_AD kernel for
// explanation. We do this to get coalesced read.
dim3 n_threads(32,1,1);
int A = CudaNdarray_HOST_DIMS(%(x)s)[1];
int B = CudaNdarray_HOST_DIMS(%(x)s)[0];
int C = CudaNdarray_HOST_DIMS(%(x)s)[2];
int D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
dim3 n_blocks(A,D);
if (n_blocks.x > NUM_VECTOR_OP_BLOCKS)
n_blocks.x = NUM_VECTOR_OP_BLOCKS;
if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS)
n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x;
int n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
%(makecall)s
}
""" % locals(), file=sio)
@@ -1736,7 +1786,7 @@ class GpuCAReduce(GpuOp):
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [12] # the version corresponding to the c code in this Op
version = [13] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
@@ -1970,7 +2020,7 @@ class GpuCAReduce(GpuOp):
         }
         """ % locals(), file=sio)

-        if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
+        if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0) or self.reduce_mask == (1, 0, 0):
             reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                              "X[a * sX0 + b * sX1 + c * sX2]",
                                              {}, True)
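For context, reduce_mask (1, 0, 0) selects a reduction of a 3d tensor over its first axis only, which reuses the same access pattern X[a * sX0 + b * sX1 + c * sX2] already generated for (0, 1, 0), with the dimensions relabeled. A plain CPU reference of what the extended kernel computes, assuming a sum reduction over a C-contiguous input (the function name and layout are ours, not Theano's):

    // Illustrative reference for reduce_mask (1, 0, 0):
    // Z[i1][i2] = sum over i0 of X[i0][i1][i2].
    #include <cstddef>
    #include <vector>

    std::vector<float> reduce_100_reference(const std::vector<float> &X,
                                            std::size_t d0, std::size_t d1,
                                            std::size_t d2)
    {
        std::vector<float> Z(d1 * d2, 0.0f);  // output shape (d1, d2)
        for (std::size_t i0 = 0; i0 < d0; ++i0)
            for (std::size_t i1 = 0; i1 < d1; ++i1)
                for (std::size_t i2 = 0; i2 < d2; ++i2)
                    Z[i1 * d2 + i2] += X[(i0 * d1 + i1) * d2 + i2];
        return Z;
    }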