@@ -98,7 +98,7 @@ class GpuCumsum(CumsumOp, GpuOp):
...
@@ -98,7 +98,7 @@ class GpuCumsum(CumsumOp, GpuOp):
}
}
__device__
__device__
void k_fetchData_%(nodename)s(float* partialCumSum, float* input, int globalThreadID, dim3 dataStrides, int offsetY, int offsetZ, int nbElementsPerCumsum) {
void k_fetchData_%(nodename)s(float* partialCumSum, float* input, int globalThreadID, dim3 dataStrides, int offsetY, int offsetZ) {
// blockIdx.y and blockIdx.z represent the current independent cumsum
// blockIdx.y and blockIdx.z represent the current independent cumsum
int idY = blockIdx.y + offsetY;
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
int idZ = blockIdx.z + offsetZ;
...
@@ -110,7 +110,7 @@ class GpuCumsum(CumsumOp, GpuOp):
...
@@ -110,7 +110,7 @@ class GpuCumsum(CumsumOp, GpuOp):
}
}
__device__
__device__
void k_pushData_%(nodename)s(float* partialCumSum, float* output, int globalThreadID, dim3 dataStrides, int offsetY, int offsetZ, int nbElementsPerCumsum) {
void k_pushData_%(nodename)s(float* partialCumSum, float* output, int globalThreadID, dim3 dataStrides, int offsetY, int offsetZ) {
__syncthreads();
__syncthreads();
// blockIdx.y and blockIdx.z represent the current independent cumsum
// blockIdx.y and blockIdx.z represent the current independent cumsum
int idY = blockIdx.y + offsetY;
int idY = blockIdx.y + offsetY;
...
@@ -173,7 +173,7 @@ class GpuCumsum(CumsumOp, GpuOp):
...
@@ -173,7 +173,7 @@ class GpuCumsum(CumsumOp, GpuOp):