提交 8644ac71 authored 作者: kelvinxu's avatar kelvinxu 提交者: Kelvin Xu

initial copy over of blockCumSum

上级 ce7d9c1a
......@@ -97,6 +97,195 @@ class GpuCumsum(CumsumOp, Op):
k_var = "k_finalCumSum_" + nodename
params =
code = """
// Per-block inclusive cumsum kernel (one pass of a multi-block scan).
// Each block loads 2*blockDim.x consecutive elements of its cumsum into
// dynamic shared memory (each thread owns two elements), scans them with a
// work-efficient up-sweep / down-sweep, and writes the partial result back.
// Requires 2*blockDim.x*sizeof(float) bytes of dynamic shared memory.
// If `blockSum` is non-NULL, the last thread of each block stores the block
// total there so the host can later propagate carries across blocks.
void k_blockCumSum_%(nodename)s(float* input, float* output, int nbElementsPerCumsum, dim3 inputStrides, dim3 outputStrides, int offsetY, int offsetZ, float* blockSum) {
    // Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis.
    // The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case.
    int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
    // Check if current thread has data to process.
    // Each thread handles two elements, hence the ceil(n/2) bound.
    if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
        return;
    }
    extern __shared__ float partialCumSum[];
    // Load data in shared memory
    // (k_fetchData / k_pushData / the two phase helpers are emitted elsewhere
    // in this generated file for the same node.)
    k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, inputStrides, offsetY, offsetZ);
    // Use a dichotomy approach to compute the cumsum (i.e. balanced binary tree).
    // The tree is sweeped from the leaves to the root and from the root to the leaves.
    // Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
    k_reductionPhase_%(nodename)s(partialCumSum);
    k_reversePhase_%(nodename)s(partialCumSum);
    // Write the final output to global memory
    k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, outputStrides, offsetY, offsetZ);
    if (blockSum != NULL){
        if (threadIdx.x == blockDim.x - 1) {
            // The last element of this block's shared buffer is the block total.
            // NOTE(review): this flattened index uses the *local* gridDim of the
            // current launch, while blockSum is shaped from the full
            // dimGridY*dimGridZ on the host side; verify the two agree when the
            // host tiles the grid with offsetY/offsetZ.
            blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumSum[threadIdx.x*2 + 1];
        }
    }
}
// Host driver: inclusive cumsum of `input` into `output` along `axis`
// for 1D/2D/3D float CudaNdarrays.
//
// Strategy: each CUDA block scans a chunk of 2*dimBlockX elements of the axis
// in shared memory (k_blockCumSum). Per-block totals are collected in
// `deviceBlockSum`, scanned recursively by this same function, then added back
// to the subsequent blocks (k_finalCumSum). The Y/Z grid dimensions enumerate
// the independent cumsums and are tiled by maxGridY/maxGridZ so a launch never
// exceeds the hardware grid limits.
//
// Returns 0 on success, -1 on failure (unsupported ndim, allocation failure,
// copy failure, or a failing recursive call).
int cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int axis, int maxThreads, int maxGridY, int maxGridZ) {
    int shape[3] = { 1, 1, 1 };
    dim3 inputStrides(0, 0, 0);
    dim3 outputStrides(0, 0, 0);

    // Gather shape and strides (in number of elements) of both arrays.
    switch (CudaNdarray_NDIM(input))
    {
        case 1:
            shape[0] = CudaNdarray_HOST_DIMS(input)[0];
            inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
            outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0];
            break;
        case 2:
            shape[0] = CudaNdarray_HOST_DIMS(input)[0];
            shape[1] = CudaNdarray_HOST_DIMS(input)[1];
            inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
            inputStrides.y = CudaNdarray_HOST_STRIDES(input)[1];
            outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0];
            outputStrides.y = CudaNdarray_HOST_STRIDES(output)[1];
            break;
        case 3:
            shape[0] = CudaNdarray_HOST_DIMS(input)[0];
            shape[1] = CudaNdarray_HOST_DIMS(input)[1];
            shape[2] = CudaNdarray_HOST_DIMS(input)[2];
            inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
            inputStrides.y = CudaNdarray_HOST_STRIDES(input)[1];
            inputStrides.z = CudaNdarray_HOST_STRIDES(input)[2];
            outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0];
            outputStrides.y = CudaNdarray_HOST_STRIDES(output)[1];
            outputStrides.z = CudaNdarray_HOST_STRIDES(output)[2];
            break;
        default:
            return -1;  // Only 1D/2D/3D arrays are supported.
    }

    // Trivial case: a cumsum over 0 or 1 element is a plain copy.
    if (shape[axis] <= 1) {
        // Bug fix: the copy's error return was previously ignored.
        if (CudaNdarray_CopyFromCudaNdarray(output, input) != 0) {
            return -1;
        }
        return 0;
    }

    // Perform cumsum on array of even size; an odd trailing element is
    // handled separately by k_cumadd below.
    int nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);

    // Determine how many elements can be processed in one block
    // (each thread of a block handles two elements).
    int dimBlockX = ceil( min(nbElementsPerCumsum, 2*maxThreads) / 2.0);

    // Determine how many blocks are needed in total.
    int dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX)); // Nb. of blocks needed per cumsum.
    int dimGridY; // Nb. of independent cumsums (width).
    int dimGridZ; // Nb. of independent cumsums (height).
    int tmp;
    // Remap strides so the scanned axis is always X from the kernel's
    // point of view; Y/Z then enumerate the independent cumsums.
    switch (axis)
    {
        case 0:
            dimGridY = shape[1];
            dimGridZ = shape[2];
            break;
        case 1:
            dimGridY = shape[0];
            dimGridZ = shape[2];
            tmp = inputStrides.x;
            inputStrides.x = inputStrides.y;
            inputStrides.y = tmp;
            tmp = outputStrides.x;
            outputStrides.x = outputStrides.y;
            outputStrides.y = tmp;
            break;
        case 2:
            dimGridY = shape[1];
            dimGridZ = shape[0];
            tmp = inputStrides.x;
            inputStrides.x = inputStrides.z;
            inputStrides.z = tmp;
            tmp = outputStrides.x;
            outputStrides.x = outputStrides.z;
            outputStrides.z = tmp;
            break;
        default:
            return -1;  // axis must be 0, 1 or 2.
    }

    // Temporary array holding one partial sum per block per independent cumsum.
    const int shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
    CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum);
    // Bug fix: the allocation was previously not checked; a NULL result would
    // have been dereferenced by CudaNdarray_DEV_DATA below.
    if (deviceBlockSum == NULL) {
        return -1;
    }

    // Perform `maxGridY`*`maxGridZ` cumsums in parallel; the hardware grid
    // limits force tiling of the Y/Z dimensions via offsetY/offsetZ.
    for (int offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
        int localDimGridY = min(dimGridY - offsetY, maxGridY);
        for (int offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
            int localDimGridZ = min(dimGridZ - offsetZ, maxGridZ);
            dim3 dimGrid(dimGridX, localDimGridY, localDimGridZ);
            dim3 dimBlock(dimBlockX, 1, 1); // One cumsum per block.
            // Two floats of shared memory per thread (each thread scans two elements).
            int sharedBytes = (2*dimBlockX) * sizeof(float);

            k_blockCumSum_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
            (
                CudaNdarray_DEV_DATA(input),
                CudaNdarray_DEV_DATA(output),
                nbElementsPerCumsum,
                inputStrides,
                outputStrides,
                offsetY,
                offsetZ,
                CudaNdarray_DEV_DATA(deviceBlockSum)
            );

            if (dimGridX > 1) {
                // Do a cumsum over the blockSum (recursive).
                if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
                    Py_DECREF(deviceBlockSum);
                    return -1;
                }

                // Since there are more than one block (i.e. `dimGridX > 1`)
                // report partial cumsums of previous blocks to subsequents ones.
                dim3 dimGrid(dimGridX, localDimGridY, localDimGridZ);
                dim3 dimBlock(dimBlockX, 1, 1);
                k_finalCumSum_%(nodename)s<<<dimGrid, dimBlock>>>
                (
                    CudaNdarray_DEV_DATA(output),
                    CudaNdarray_DEV_DATA(deviceBlockSum),
                    nbElementsPerCumsum,
                    outputStrides,
                    offsetY,
                    offsetZ
                );
            }

            // If shape[axis] is odd, the last element is computed manually
            // by adding it to the cumsum of the second-to-last element.
            if (shape[axis] != nbElementsPerCumsum){
                dim3 dimGrid(1, localDimGridY, localDimGridZ);
                dim3 dimBlock(1, 1, 1);
                k_cumadd_%(nodename)s<<<dimGrid, dimBlock>>>
                (
                    CudaNdarray_DEV_DATA(input),
                    CudaNdarray_DEV_DATA(output),
                    inputStrides,
                    outputStrides,
                    offsetY,
                    offsetZ,
                    shape[axis]-2,
                    shape[axis]-1
                );
            }
        }
    }
    Py_DECREF(deviceBlockSum);
    CNDA_THREAD_SYNC;
    return 0;
}
"""
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论