提交 b4661fbd，作者：abergeron，提交者：GitHub

Merge pull request #5988 from lamblin/fix_gpucumop_offset

Correctly handle offset in GpuCumOp
......@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self):
    """Return the version tuple used to key Theano's C-code cache.

    Returns
    -------
    tuple of int
        The cache version. Bumped from ``(4,)`` to ``(5,)`` because the
        kernel signatures changed (buffer offsets are now passed as
        separate ``ga_size`` parameters — see the MR title "Correctly
        handle offset in GpuCumOp"), so previously cached compiled code
        must be invalidated.
    """
    # The diff scrape left both the removed ``return (4,)`` and the
    # added ``return (5,)`` in place; the first return made the new
    # version unreachable. The merged code returns only the new version.
    return (5,)
def c_headers(self):
    """List the C header files that the generated C code depends on."""
    required_headers = [
        '<numpy_compat.h>',
        '<gpuarray/types.h>',
        '<gpuarray_helper.h>',
    ]
    return required_headers
......@@ -67,13 +67,16 @@ class GpuCumOp(GpuKernelBase, Op):
dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x)
code = """
KERNEL void %(kname)s(float* input, float* output,
KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset,
ga_ssize inputStrides_x,
ga_ssize inputStrides_y,
ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
......@@ -85,8 +88,10 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
}
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
'intc', 'intc',
'intc', 'intc',
......@@ -96,10 +101,11 @@ class GpuCumOp(GpuKernelBase, Op):
# blockCumOp
kname = "k_blockCumOp"
k_var = "k_blockCumOp_" + nodename
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray, ]
'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
code = """
// helper functions
WITHIN_KERNEL
......@@ -154,12 +160,17 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] = partialCumOp[threadIdx.x*2 + 1];
}
KERNEL void k_blockCumOp(float* input, float* output,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum) {
KERNEL void k_blockCumOp(float* input, ga_size input_offset,
float* output, ga_size output_offset,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum, ga_size blockSum_offset) {
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
// Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.
......@@ -197,9 +208,14 @@ class GpuCumOp(GpuKernelBase, Op):
kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename
code = """
KERNEL void k_finalCumOp(float* output, float* blockSum, size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
KERNEL void k_finalCumOp(float* output, ga_size output_offset,
float* blockSum, ga_size blockSum_offset,
size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process.
......@@ -218,7 +234,8 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] %(op)s= currentBlockSum;
}
""" % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', ]
kernels.append(Kernel(code=code, name=kname, params=params,
......@@ -380,8 +397,10 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) ((char *)(input->ga.data) + input->ga.offset),
(void*) ((char *)(output->ga.data) + output->ga.offset),
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &inputStrides_x,
(void*) &inputStrides_y,
......@@ -391,7 +410,8 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ,
(void*) ((char*)(deviceBlockSum->ga.data) + deviceBlockSum->ga.offset)
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset)
};
int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){
......@@ -409,8 +429,10 @@ class GpuCumOp(GpuKernelBase, Op):
// report partial cum ops of previous blocks to subsequents ones.
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) ((char *)(output->ga.data) + output->ga.offset),
(void*) ((char *)(deviceBlockSum->ga.data) + deviceBlockSum->ga.offset),
void* kernel_params[] = {(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset),
(void*) &nbElementsPerCumOp,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
......@@ -430,8 +452,10 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimBlock[3] = {1, 1, 1};
size_t tmp0 = shape[axis]-2;
size_t tmp1 = shape[axis]-1;
void* kernel_params[] = {(void*) ((char *)(input->ga.data) + input->ga.offset),
(void*) ((char *)(output->ga.data) + output->ga.offset),
void* kernel_params[] = {(void*) input->ga.data,
(void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &inputStrides_x,
(void*) &inputStrides_y,
(void*) &inputStrides_z,
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论