Commit 226c3f9c authored by Pascal Lamblin

Actual fix that does not segfault.

Parent 361721d8
...@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -35,7 +35,7 @@ class GpuCumOp(GpuKernelBase, Op):
return hash(self.axis) ^ hash(self.mode) return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (5,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
...@@ -67,13 +67,16 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -67,13 +67,16 @@ class GpuCumOp(GpuKernelBase, Op):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x) flags = Kernel.get_flags(dtype_x)
code = """ code = """
KERNEL void %(kname)s(float* input, float* output, KERNEL void %(kname)s(float* input, ga_size input_offset,
float* output, ga_size output_offset,
ga_ssize inputStrides_x, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_y,
ga_ssize inputStrides_z, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, const int offsetY, const int offsetZ, ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){ const int beforeLastElementIdx, const int lastElementIdx){
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
int idY = blockIdx.y + offsetY; int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ; int idZ = blockIdx.z + offsetZ;
...@@ -85,8 +88,10 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -85,8 +88,10 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast]; output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
} }
""" % locals() """ % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE, params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'intc', 'intc', 'intc', 'intc',
'intc', 'intc', 'intc', 'intc',
...@@ -96,10 +101,11 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -96,10 +101,11 @@ class GpuCumOp(GpuKernelBase, Op):
# blockCumOp # blockCumOp
kname = "k_blockCumOp" kname = "k_blockCumOp"
k_var = "k_blockCumOp_" + nodename k_var = "k_blockCumOp_" + nodename
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE, params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray, ] 'int32', 'int32', gpuarray.GpuArray, gpuarray.SIZE]
code = """ code = """
// helper functions // helper functions
WITHIN_KERNEL WITHIN_KERNEL
...@@ -154,12 +160,17 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -154,12 +160,17 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] = partialCumOp[threadIdx.x*2 + 1]; output[idx_odd] = partialCumOp[threadIdx.x*2 + 1];
} }
KERNEL void k_blockCumOp(float* input, float* output, KERNEL void k_blockCumOp(float* input, ga_size input_offset,
size_t nbElementsPerCumOp, ga_ssize inputStrides_x, float* output, ga_size output_offset,
ga_ssize inputStrides_y, ga_ssize inputStrides_z, size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_z, int offsetY, ga_ssize outputStrides_x, ga_ssize outputStrides_y,
int offsetZ, float* blockSum) { ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum, ga_size blockSum_offset) {
input = (float *)(((char *)input) + input_offset);
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
// Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis. // Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case. // The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.
...@@ -197,9 +208,14 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -197,9 +208,14 @@ class GpuCumOp(GpuKernelBase, Op):
kname = "k_finalCumOp" kname = "k_finalCumOp"
k_var = "k_finalCumOp_" + nodename k_var = "k_finalCumOp_" + nodename
code = """ code = """
KERNEL void k_finalCumOp(float* output, float* blockSum, size_t nbElementsPerCumOp, KERNEL void k_finalCumOp(float* output, ga_size output_offset,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, float* blockSum, ga_size blockSum_offset,
int offsetY, int offsetZ) { size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
output = (float *)(((char *)output) + output_offset);
blockSum = (float *)(((char *)blockSum) + blockSum_offset);
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x; int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process. // Check if current has data to process.
...@@ -218,7 +234,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -218,7 +234,8 @@ class GpuCumOp(GpuKernelBase, Op):
output[idx_odd] %(op)s= currentBlockSum; output[idx_odd] %(op)s= currentBlockSum;
} }
""" % locals() """ % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE, params = [gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', ] 'int32', 'int32', ]
kernels.append(Kernel(code=code, name=kname, params=params, kernels.append(Kernel(code=code, name=kname, params=params,
...@@ -380,8 +397,10 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -380,8 +397,10 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block. size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float); size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) ((char *)(input->ga.data) + input->ga.offset), void* kernel_params[] = {(void*) input->ga.data,
(void*) ((char *)(output->ga.data) + output->ga.offset), (void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &nbElementsPerCumOp, (void*) &nbElementsPerCumOp,
(void*) &inputStrides_x, (void*) &inputStrides_x,
(void*) &inputStrides_y, (void*) &inputStrides_y,
...@@ -391,7 +410,8 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -391,7 +410,8 @@ class GpuCumOp(GpuKernelBase, Op):
(void*) &outputStrides_z, (void*) &outputStrides_z,
(void*) &offsetY, (void*) &offsetY,
(void*) &offsetZ, (void*) &offsetZ,
(void*) ((char*)(deviceBlockSum->ga.data) + deviceBlockSum->ga.offset) (void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset)
}; };
int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params); int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimGrid, dimBlock, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
...@@ -409,8 +429,10 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -409,8 +429,10 @@ class GpuCumOp(GpuKernelBase, Op):
// report partial cum ops of previous blocks to subsequents ones. // report partial cum ops of previous blocks to subsequents ones.
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) ((char *)(output->ga.data) + output->ga.offset), void* kernel_params[] = {(void*) output->ga.data,
(void*) ((char *)(deviceBlockSum->ga.data) + deviceBlockSum->ga.offset), (void*) &(output->ga.offset),
(void*) deviceBlockSum->ga.data,
(void*) &(deviceBlockSum->ga.offset),
(void*) &nbElementsPerCumOp, (void*) &nbElementsPerCumOp,
(void*) &outputStrides_x, (void*) &outputStrides_x,
(void*) &outputStrides_y, (void*) &outputStrides_y,
...@@ -430,8 +452,10 @@ class GpuCumOp(GpuKernelBase, Op): ...@@ -430,8 +452,10 @@ class GpuCumOp(GpuKernelBase, Op):
size_t dimBlock[3] = {1, 1, 1}; size_t dimBlock[3] = {1, 1, 1};
size_t tmp0 = shape[axis]-2; size_t tmp0 = shape[axis]-2;
size_t tmp1 = shape[axis]-1; size_t tmp1 = shape[axis]-1;
void* kernel_params[] = {(void*) ((char *)(input->ga.data) + input->ga.offset), void* kernel_params[] = {(void*) input->ga.data,
(void*) ((char *)(output->ga.data) + output->ga.offset), (void*) &(input->ga.offset),
(void*) output->ga.data,
(void*) &(output->ga.offset),
(void*) &inputStrides_x, (void*) &inputStrides_x,
(void*) &inputStrides_y, (void*) &inputStrides_y,
(void*) &inputStrides_z, (void*) &inputStrides_z,
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment