提交 7cff935e authored 作者: khaotik's avatar khaotik

cumsum -> cumop for both cumsum and cumprod

上级 88648d8d
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import os import os
from theano import Apply, Op from theano import Apply, Op
from theano.tensor.extra_ops import CumsumOp from theano.tensor.extra_ops import CumsumOp, CumprodOp
from .basic_ops import infer_context_name from .basic_ops import infer_context_name
try: try:
from pygpu import gpuarray from pygpu import gpuarray
...@@ -12,7 +12,7 @@ from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape) ...@@ -12,7 +12,7 @@ from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape)
from .opt import register_opt, op_lifter, register_opt2 from .opt import register_opt, op_lifter, register_opt2
class GpuCumsum(GpuKernelBase, Op): class GpuCumOp(GpuKernelBase, Op):
""" """
Parameters Parameters
---------- ----------
...@@ -20,10 +20,19 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -20,10 +20,19 @@ class GpuCumsum(GpuKernelBase, Op):
Can not be None. If you want the array flattened, do it before. Can not be None. If you want the array flattened, do it before.
""" """
SUPPORTED_NDIMS = 3 SUPPORTED_NDIMS = 3
__props__ = ('axis',) __props__ = ('axis', 'mode')
def __init__(self, axis): def __init__(self, axis, mode='add'):
self.axis = axis self.axis = axis
self.mode = mode
def __eq__(self, other):
if type(other) != type(self):
return False
return self.axis == other.axis and self.mode == other.mode
def __hash__(self):
return hash(self.axis) ^ hash(self.mode)
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (3,)
...@@ -38,14 +47,14 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -38,14 +47,14 @@ class GpuCumsum(GpuKernelBase, Op):
return node.inputs[0].type.context return node.inputs[0].type.context
def make_node(self, x): def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum" assert x.type.dtype == 'float32', "Only float32 supported for GpuCumOp"
context_name = infer_context_name(x) context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name) x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumsum.SUPPORTED_NDIMS: if x.ndim > GpuCumOp.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and\ raise NotImplementedError('Only cum op on 1D, 2D and\
3D arrays are supported right now!') 3D arrays are supported right now!')
if self.axis >= x.ndim or self.axis < -x.ndim: if self.axis >= x.ndim or self.axis < -x.ndim:
...@@ -56,6 +65,7 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -56,6 +65,7 @@ class GpuCumsum(GpuKernelBase, Op):
kernels = [] kernels = []
# cumadd # cumadd
kname = "k_cumadd" kname = "k_cumadd"
op = {'mul': '*', 'add': '+'}[self.mode]
k_var = "k_cumadd_" + nodename k_var = "k_cumadd_" + nodename
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x) flags = Kernel.get_flags(dtype_x)
...@@ -75,7 +85,7 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -75,7 +85,7 @@ class GpuCumsum(GpuKernelBase, Op):
int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input; int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input;
int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output; int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output;
int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output; int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output;
output[idx_last_output] = input[idx_last_input] + output[idx_beforelast]; output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
} }
""" % locals() """ % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE, params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE,
...@@ -86,9 +96,9 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -86,9 +96,9 @@ class GpuCumsum(GpuKernelBase, Op):
] ]
kernels.append(Kernel(code=code, name=kname, params=params, kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
# blockCumSum # blockCumOp
kname = "k_blockCumSum" kname = "k_blockCumOp"
k_var = "k_blockCumSum_" + nodename k_var = "k_blockCumOp_" + nodename
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE, params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
...@@ -96,109 +106,108 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -96,109 +106,108 @@ class GpuCumsum(GpuKernelBase, Op):
code = """ code = """
// helper functions // helper functions
WITHIN_KERNEL WITHIN_KERNEL
void k_reductionPhase(float* partialCumSum) { void k_reductionPhase(float* partialCumOp) {
// Traverse down from leaves to root building partial sums at internal nodes in the tree. // Traverse down from leaves to root building partial sums at internal nodes in the tree.
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) { for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
local_barrier(); local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1; unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index < blockDim.x*2) { if(index < blockDim.x*2) {
partialCumSum[index] += partialCumSum[index - stride]; partialCumOp[index] %(op)s= partialCumOp[index - stride];
} }
} }
} }
WITHIN_KERNEL WITHIN_KERNEL
void k_fetchData(float* partialCumSum, float* input, int globalThreadID, void k_fetchData(float* partialCumOp, float* input, int globalThreadID,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) { int offsetY, int offsetZ) {
// blockIdx.y and blockIdx.z represents the current independent cumsum // blockIdx.y and blockIdx.z represents the current independent cum op
int idY = blockIdx.y + offsetY; int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z; int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset; int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset; int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
partialCumSum[threadIdx.x*2] = input[idx_even]; partialCumOp[threadIdx.x*2] = input[idx_even];
partialCumSum[threadIdx.x*2 + 1] = input[idx_odd]; partialCumOp[threadIdx.x*2 + 1] = input[idx_odd];
} }
WITHIN_KERNEL WITHIN_KERNEL
void k_reversePhase(float* partialCumSum) { void k_reversePhase(float* partialCumOp) {
// Traverse back up the tree building the scan from the partial sums // Traverse back up the tree building the scan from the partial sums
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) { for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
local_barrier(); local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1; unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index + stride < blockDim.x*2) { if(index + stride < blockDim.x*2) {
partialCumSum[index + stride] += partialCumSum[index]; partialCumOp[index + stride] %(op)s= partialCumOp[index];
} }
} }
} }
WITHIN_KERNEL WITHIN_KERNEL
void k_pushData(float* partialCumSum, float* output, int globalThreadID, void k_pushData(float* partialCumOp, float* output, int globalThreadID,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) { int offsetY, int offsetZ) {
local_barrier(); local_barrier();
// blockIdx.y and blockIdx.z represents the current independent cumsum // blockIdx.y and blockIdx.z represents the current independent cum op
int idY = blockIdx.y + offsetY; int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ; int idZ = blockIdx.z + offsetZ;
int offset = idY * dataStrides_y + idZ * dataStrides_z; int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset; int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset; int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
output[idx_even] = partialCumSum[threadIdx.x*2]; output[idx_even] = partialCumOp[threadIdx.x*2];
output[idx_odd] = partialCumSum[threadIdx.x*2 + 1]; output[idx_odd] = partialCumOp[threadIdx.x*2 + 1];
} }
KERNEL void k_blockCumSum(float* input, float* output, KERNEL void k_blockCumOp(float* input, float* output,
size_t nbElementsPerCumsum, ga_ssize inputStrides_x, size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
ga_ssize inputStrides_y, ga_ssize inputStrides_z, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, int offsetY, ga_ssize outputStrides_z, int offsetY,
int offsetZ, float* blockSum) { int offsetZ, float* blockSum) {
// Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis. // Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case. // The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.
int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x; int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;
// Check if current thread has data to process. // Check if current thread has data to process.
if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) { if (globalThreadID >= (nbElementsPerCumOp+1)/2) {
return; return;
} }
extern __shared__ float partialCumSum[]; extern __shared__ float partialCumOp[];
// Load data in shared memory // Load data in shared memory
k_fetchData(partialCumSum, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ); k_fetchData(partialCumOp, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);
// Use a dichotomy approach to compute the cumsum (i.e. balanced binary tree). // Use a dichotomy approach to compute the cum op (i.e. balanced binary tree).
// The tree is sweeped from the leaves to the root and from the root to the leaves. // The tree is sweeped from the leaves to the root and from the root to the leaves.
// Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf // Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
k_reductionPhase(partialCumSum); k_reductionPhase(partialCumOp);
k_reversePhase(partialCumSum); k_reversePhase(partialCumOp);
// Write the final output to global memory // Write the final output to global memory
k_pushData(partialCumSum, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ); k_pushData(partialCumOp, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
if (blockSum != NULL){ if (blockSum != NULL){
if (threadIdx.x == blockDim.x - 1) { if (threadIdx.x == blockDim.x - 1) {
blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumSum[threadIdx.x*2 + 1]; blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumOp[threadIdx.x*2 + 1];
} }
} }
} }
""" """ % locals()
kernels.append(Kernel(code=code, name=kname, params=params, kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
# k_finalCumSum # k_finalCumOp
kname = "k_finalCumSum" kname = "k_finalCumOp"
k_var = "k_finalCumSum_" + nodename k_var = "k_finalCumOp_" + nodename
code = """ code = """
KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum, KERNEL void k_finalCumOp(float* output, float* blockSum, size_t nbElementsPerCumOp,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z, ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) { int offsetY, int offsetZ) {
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x; int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
// Check if current has data to process. // Check if current has data to process.
if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) { if (globalThreadID >= (nbElementsPerCumOp+1)/2)
return; return;
}
int idY = blockIdx.y + offsetY; int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ; int idZ = blockIdx.z + offsetZ;
...@@ -208,10 +217,10 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -208,10 +217,10 @@ class GpuCumsum(GpuKernelBase, Op):
int offset = idY * dataStrides_y + idZ * dataStrides_z; int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset; int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset; int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
output[idx_even] += currentBlockSum; output[idx_even] %(op)s= currentBlockSum;
output[idx_odd] += currentBlockSum; output[idx_odd] %(op)s= currentBlockSum;
} }
""" """ % locals()
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE, params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', ] 'int32', 'int32', ]
...@@ -263,7 +272,7 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -263,7 +272,7 @@ class GpuCumsum(GpuKernelBase, Op):
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2"); PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
%(fail)s; %(fail)s;
} }
if (cumSum_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){ if (cumOp_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){
%(fail)s; %(fail)s;
} }
} }
...@@ -274,7 +283,7 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -274,7 +283,7 @@ class GpuCumsum(GpuKernelBase, Op):
def c_support_code_struct(self, node, nodename): def c_support_code_struct(self, node, nodename):
code = """ code = """
int cumSum_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) { int cumOp_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
size_t shape[3] = { 1, 1, 1 }; size_t shape[3] = { 1, 1, 1 };
ssize_t inputStrides_x; ssize_t inputStrides_x;
ssize_t inputStrides_y; ssize_t inputStrides_y;
...@@ -316,14 +325,14 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -316,14 +325,14 @@ class GpuCumsum(GpuKernelBase, Op):
int err = pygpu_move(output, input); int err = pygpu_move(output, input);
return err; return err;
} }
// Perform cumsum on array of even size. // Perform cum op on array of even size.
size_t nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2); size_t nbElementsPerCumOp = shape[axis] - (shape[axis] %% 2);
// Determine how many elements can be processed in one block. // Determine how many elements can be processed in one block.
size_t dimBlockX = ceil((nbElementsPerCumsum > 2*maxThreads ? 2*maxThreads : nbElementsPerCumsum) / 2.0); size_t dimBlockX = ((nbElementsPerCumOp > 2*maxThreads ? 2*maxThreads : nbElementsPerCumOp)+1)/2;
// Determine how many blocks are needed in total. // Determine how many blocks are needed in total.
size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX)); // Nb. of blocks needed per cumsum. size_t dimGridX = (nbElementsPerCumOp+2*dimBlockX-1) / (2*dimBlockX); // Nb. of blocks needed per cum op.
size_t dimGridY; // Nb. of independent cumsums (width). size_t dimGridY; // Nb. of independent cum ops (width).
size_t dimGridZ; // Nb. of independent cumsums (height). size_t dimGridZ; // Nb. of independent cum ops (height).
ssize_t tmp; ssize_t tmp;
switch (axis) switch (axis)
{ {
...@@ -365,18 +374,18 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -365,18 +374,18 @@ class GpuCumsum(GpuKernelBase, Op):
if (deviceBlockSum == NULL){ if (deviceBlockSum == NULL){
return -1; return -1;
} }
// Perform `maxGridY`*`maxGridZ` cumsums in parallel. // Perform `maxGridY`*`maxGridZ` cum ops in parallel.
for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){ for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY); size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY);
for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){ for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ); size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ);
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cumsum per block. size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cum op per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float); size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) input->ga.data, void* kernel_params[] = {(void*) input->ga.data,
(void*) output->ga.data, (void*) output->ga.data,
(void*) &nbElementsPerCumsum, (void*) &nbElementsPerCumOp,
(void*) &inputStrides_x, (void*) &inputStrides_x,
(void*) &inputStrides_y, (void*) &inputStrides_y,
(void*) &inputStrides_z, (void*) &inputStrides_z,
...@@ -387,39 +396,39 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -387,39 +396,39 @@ class GpuCumsum(GpuKernelBase, Op):
(void*) &offsetZ, (void*) &offsetZ,
(void*) deviceBlockSum->ga.data (void*) deviceBlockSum->ga.data
}; };
int err = GpuKernel_call(&k_blockCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params); int err = GpuKernel_call(&k_blockCumOp_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "blockCumSum call failed"); PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
return -1; return -1;
} }
if (dimGridX > 1) { if (dimGridX > 1) {
// Do a cumsum over the blockSum (recursive). // Do a cum op over the blockSum (recursive).
if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){ if (cumOp_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
Py_DECREF(deviceBlockSum); Py_DECREF(deviceBlockSum);
return -1; return -1;
} }
// Since there are more than one block (i.e. `dimGridX > 1`) // Since there are more than one block (i.e. `dimGridX > 1`)
// report partial cumsums of previous blocks to subsequents ones. // report partial cum ops of previous blocks to subsequents ones.
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ}; size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) output->ga.data, void* kernel_params[] = {(void*) output->ga.data,
(void*) deviceBlockSum->ga.data, (void*) deviceBlockSum->ga.data,
(void*) &nbElementsPerCumsum, (void*) &nbElementsPerCumOp,
(void*) &outputStrides_x, (void*) &outputStrides_x,
(void*) &outputStrides_y, (void*) &outputStrides_y,
(void*) &outputStrides_z, (void*) &outputStrides_z,
(void*) &offsetY, (void*) &offsetY,
(void*) &offsetZ (void*) &offsetZ
}; };
int err = GpuKernel_call(&k_finalCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params); int err = GpuKernel_call(&k_finalCumOp_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
if (err != GA_NO_ERROR){ if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "finalCumSum call failed"); PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
return -1; return -1;
} }
} }
// If shape[axis] is odd, the last element is compute manually // If shape[axis] is odd, the last element is compute manually
if (shape[axis] != nbElementsPerCumsum){ if (shape[axis] != nbElementsPerCumOp){
size_t dimGrid[3] = {1, localDimGridY, localDimGridZ}; size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {1, 1, 1}; size_t dimBlock[3] = {1, 1, 1};
size_t tmp0 = shape[axis]-2; size_t tmp0 = shape[axis]-2;
...@@ -450,26 +459,33 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -450,26 +459,33 @@ class GpuCumsum(GpuKernelBase, Op):
return 0; return 0;
} }
""" % locals() """ % locals()
return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code return super(GpuCumOp, self).c_support_code_struct(node, nodename) + code
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([CumsumOp]) @op_lifter([CumsumOp, CumprodOp])
@register_opt2([CumsumOp], 'fast_compile') @register_opt2([CumsumOp, CumprodOp], 'fast_compile')
def local_gpua_cumsumop(op, ctx_name, inputs, outputs): def local_gpua_cumop(op, ctx_name, inputs, outputs):
if inputs[0].dtype == 'float32': if inputs[0].dtype != 'float32':
axis = op.axis return False
x = inputs[0] if isinstance(op, CumsumOp):
if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS: mode = 'add'
return None elif isinstance(op, CumprodOp):
mode = 'mul'
x = as_gpuarray_variable(x, ctx_name) else:
return False
if axis is None and x.ndim > 1: axis = op.axis
x = GpuReshape(1)(x, (-1,)) x = inputs[0]
if axis is not None and x.ndim > GpuCumOp.SUPPORTED_NDIMS:
# ``gpu_cumsum`` assume array has been flattened if needed. return False
if axis is None:
axis = 0 x = as_gpuarray_variable(x, ctx_name)
return GpuCumsum(axis)(x) if axis is None and x.ndim > 1:
x = GpuReshape(1)(x, (-1,))
# ``gpu_cumop`` assume array has been flattened if needed.
if axis is None:
axis = 0
return GpuCumOp(axis, mode)(x)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论