提交 9b1c1a5d authored 作者: kelvinxu's avatar kelvinxu 提交者: Kelvin Xu

make adjustments

上级 8644ac71
...@@ -95,9 +95,9 @@ class GpuCumsum(CumsumOp, Op): ...@@ -95,9 +95,9 @@ class GpuCumsum(CumsumOp, Op):
# finalCumSum # finalCumSum
k_name = "k_finalCumSum" k_name = "k_finalCumSum"
k_var = "k_finalCumSum_" + nodename k_var = "k_finalCumSum_" + nodename
params = # params =
code = """ code = """
void k_blockCumSum_%(nodename)s(float* input, float* output, int nbElementsPerCumsum, dim3 inputStrides, dim3 outputStrides, int offsetY, int offsetZ, float* blockSum) { void k_blockCumSum_%(nodename)s(float* input, float* output, int nbElementsPerCumsum, size_t inputStrides[3], size_t outputStrides[3], int offsetY, int offsetZ, float* blockSum) {
// Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis. // Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case. // The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case.
...@@ -131,34 +131,34 @@ class GpuCumsum(CumsumOp, Op): ...@@ -131,34 +131,34 @@ class GpuCumsum(CumsumOp, Op):
int cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int axis, int maxThreads, int maxGridY, int maxGridZ) { int cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int axis, int maxThreads, int maxGridY, int maxGridZ) {
int shape[3] = { 1, 1, 1 }; int shape[3] = { 1, 1, 1 };
dim3 inputStrides(0, 0, 0); size_t inputStrides[3] = {0, 0, 0};
dim3 outputStrides(0, 0, 0); size_t outputStrides[3] = {0, 0, 0};
switch (CudaNdarray_NDIM(input)) switch (PyGpuArray_NDIM(input))
{ {
case 1: case 1:
shape[0] = CudaNdarray_HOST_DIMS(input)[0]; shape[0] = PyGpuArray_DIMS(input)[0];
inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0]; inputStrides[0] = PyGpuArray_STRIDES(input)[0];
outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0]; outputStrides[0] = PyGpuArray_STRIDES(output)[0];
break; break;
case 2: case 2:
shape[0] = CudaNdarray_HOST_DIMS(input)[0]; shape[0] = PyGpuArray_DIMS(input)[0];
shape[1] = CudaNdarray_HOST_DIMS(input)[1]; shape[1] = PyGpuArray_DIMS(input)[1];
inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0]; inputStrides[0] = PyGpuArray_STRIDES(input)[0];
inputStrides.y = CudaNdarray_HOST_STRIDES(input)[1]; inputStrides[1] = PyGpuArray_STRIDES(input)[1];
outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0]; outputStrides[0] = PyGpuArray_STRIDES(output)[0];
outputStrides.y = CudaNdarray_HOST_STRIDES(output)[1]; outputStrides[1] = PyGpuArray_STRIDES(output)[1];
break; break;
case 3: case 3:
shape[0] = CudaNdarray_HOST_DIMS(input)[0]; shape[0] = PyGpuArray_DIMS(input)[0];
shape[1] = CudaNdarray_HOST_DIMS(input)[1]; shape[1] = PyGpuArray_DIMS(input)[1];
shape[2] = CudaNdarray_HOST_DIMS(input)[2]; shape[2] = PyGpuArray_DIMS(input)[2];
inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0]; inputStrides[0] = PyGpuArray_STRIDES(input)[0];
inputStrides.y = CudaNdarray_HOST_STRIDES(input)[1]; inputStrides[1] = PyGpuArray_STRIDES(input)[1];
inputStrides.z = CudaNdarray_HOST_STRIDES(input)[2]; inputStrides[2] = PyGpuArray_STRIDES(input)[2];
outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0]; outputStrides[0] = PyGpuArray_STRIDES(output)[0];
outputStrides.y = CudaNdarray_HOST_STRIDES(output)[1]; outputStrides[1] = PyGpuArray_STRIDES(output)[1];
outputStrides.z = CudaNdarray_HOST_STRIDES(output)[2]; outputStrides[2] = PyGpuArray_STRIDES(output)[2];
break; break;
default: default:
return -1; return -1;
...@@ -191,25 +191,25 @@ class GpuCumsum(CumsumOp, Op): ...@@ -191,25 +191,25 @@ class GpuCumsum(CumsumOp, Op):
dimGridY = shape[0]; dimGridY = shape[0];
dimGridZ = shape[2]; dimGridZ = shape[2];
tmp = inputStrides.x; tmp = inputStrides[0];
inputStrides.x = inputStrides.y; inputStrides[0] = inputStrides[1];
inputStrides.y = tmp; inputStrides[1] = tmp;
tmp = outputStrides.x; tmp = outputStrides[0];
outputStrides.x = outputStrides.y; outputStrides[0] = outputStrides[1];
outputStrides.y = tmp; outputStrides[1] = tmp;
break; break;
case 2: case 2:
dimGridY = shape[1]; dimGridY = shape[1];
dimGridZ = shape[0]; dimGridZ = shape[0];
tmp = inputStrides.x; tmp = inputStrides[0];
inputStrides.x = inputStrides.z; inputStrides[0] = inputStrides[2];
inputStrides.z = tmp; inputStrides[2] = tmp;
tmp = outputStrides.x; tmp = outputStrides[0];
outputStrides.x = outputStrides.z; outputStrides[0] = outputStrides[2];
outputStrides.z = tmp; outputStrides[2] = tmp;
break; break;
default: default:
return -1; return -1;
...@@ -225,8 +225,8 @@ class GpuCumsum(CumsumOp, Op): ...@@ -225,8 +225,8 @@ class GpuCumsum(CumsumOp, Op):
for (int offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){ for (int offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
int localDimGridZ = min(dimGridZ - offsetZ, maxGridZ); int localDimGridZ = min(dimGridZ - offsetZ, maxGridZ);
dim3 dimGrid(dimGridX, localDimGridY, localDimGridZ); size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
dim3 dimBlock(dimBlockX, 1, 1); // One cumsum per block. size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cumsum per block.
int sharedBytes = (2*dimBlockX) * sizeof(float); int sharedBytes = (2*dimBlockX) * sizeof(float);
k_blockCumSum_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>> k_blockCumSum_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
...@@ -250,8 +250,8 @@ class GpuCumsum(CumsumOp, Op): ...@@ -250,8 +250,8 @@ class GpuCumsum(CumsumOp, Op):
// Since there are more than one block (i.e. `dimGridX > 1`) // Since there are more than one block (i.e. `dimGridX > 1`)
// report partial cumsums of previous blocks to subsequents ones. // report partial cumsums of previous blocks to subsequents ones.
dim3 dimGrid(dimGridX, localDimGridY, localDimGridZ); size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
dim3 dimBlock(dimBlockX, 1, 1); size_t dimBlock[3] = {dimBlockX, 1, 1};
k_finalCumSum_%(nodename)s<<<dimGrid, dimBlock>>> k_finalCumSum_%(nodename)s<<<dimGrid, dimBlock>>>
( (
CudaNdarray_DEV_DATA(output), CudaNdarray_DEV_DATA(output),
...@@ -265,8 +265,8 @@ class GpuCumsum(CumsumOp, Op): ...@@ -265,8 +265,8 @@ class GpuCumsum(CumsumOp, Op):
// If shape[axis] is odd, the last element is compute manually // If shape[axis] is odd, the last element is compute manually
if (shape[axis] != nbElementsPerCumsum){ if (shape[axis] != nbElementsPerCumsum){
dim3 dimGrid(1, localDimGridY, localDimGridZ); size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
dim3 dimBlock(1, 1, 1); size_t dimBlock[3] = {1, 1, 1};
k_cumadd_%(nodename)s<<<dimGrid, dimBlock>>> k_cumadd_%(nodename)s<<<dimGrid, dimBlock>>>
( (
CudaNdarray_DEV_DATA(input), CudaNdarray_DEV_DATA(input),
...@@ -291,6 +291,4 @@ class GpuCumsum(CumsumOp, Op): ...@@ -291,6 +291,4 @@ class GpuCumsum(CumsumOp, Op):
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
return kernels return kernels
def c_code(self, node, name, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError("cuda only")
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论