提交 ba81f75f authored 作者: kelvinxu's avatar kelvinxu 提交者: Kelvin Xu

Bugs and adding tests

上级 5c373d90
......@@ -28,7 +28,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable
from . import dnn, opt, nerv
from . import dnn, opt, nerv, extra_ops
def transfer(x, target):
try:
......
......@@ -16,49 +16,40 @@ from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType
class GpuCumsum(CumsumOp, Op):
class GpuCumsum(CumsumOp, GpuKernelBase):
"""
Parameters
----------
axis
Can not be None. If you want the array flatten, do it before.
Can not be None. If you want the array flattened, do it before.
"""
SUPPORTED_NDIMS = 3
__props__ = ('axis',)
def __init__(self, axis):
self.axis = axis
# not sure if this should be here
self.max_threads_dim0 = None
self.max_grid_size1 = None
self.max_gride_size2 = None
def __str__(self):
return "%s{%s}" % (self.__class__.__name__, self.axis)
def c_code_cache_version(self):
return (1,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
x = as_gpuarray_variable(x, infer_context_name(x))
if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and\
3D arrays are supported right now!')
if self.axis >= x.ndim or self.axis < -x.ndim:
raise ValueError('axis(={1}) out of bounds'.format(self.axis))
x_ = as_gpuarray_variable(x, infer_context_name(x_))
return Apply(self, [x_], [x_.type()])
# ask Arnaud about this
def make_thunk(self, node, storage_map, comput_map, no_recycling):
pass
return Apply(self, [x], [x.type()])
# copied from neighbour.py
......@@ -70,34 +61,102 @@ class GpuCumsum(CumsumOp, Op):
def gpu_kernels(self, node, nodename):
kernels = []
# cumadd
k_name = "k_cumadd"
kname = "k_cumadd"
k_var = "k_cumadd_" + nodename
params =
dtype_x = node.inputs[0].dtype
flags = Kernel.get_flags(dtype_x)
code = """
KERNEL void %(kname)s(float* input, float* output, size_t inputStrides[3],
size_t[3] outputStrides, int offsetY, int offsetZ,
int beforeLastElementIdx, int lastElementIdx){
KERNEL void %(kname)s(float* input, float* output, ssize_t inputStrides_x,
ssize_t inputStrides_y, ssize_t inputStrides_z,
ssize_t outputStrides_x, ssize_t outputStrides_y,
ssize_t outputStrides_z, const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
int dataOffsetY_input = idY * inputStrides[1] + idZ * inputStrides.[3];
int dataOffsetY_output = idY * outputStrides[1] + idZ * outputStrides[2];
int idx_last_input = lastElementIdx*inputStrides[0] + dataOffsetY_input;
int idx_last_output = lastElementIdx*outputStrides[0] + dataOffsetY_output;
int idx_beforelast = beforeLastElementIdx*outputStrides[0] + dataOffsetY_output;
int dataOffsetY_input = idY * inputStrides_y + idZ * inputStrides_z;
int dataOffsetY_output = idY * outputStrides_y + idZ * outputStrides_z;
int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input;
int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output;
int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output;
output[idx_last_output] = input[idx_last_input] + output[idx_beforelast];
}
""" % locals()
kernels.append(Kernel(code=code, name="k_cumadd", params=params,
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE,
'intc', 'intc',
'intc', 'intc',
]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
# finalCumSum
k_name = "k_finalCumSum"
k_var = "k_finalCumSum_" + nodename
# params =
code = """
void k_blockCumSum_%(nodename)s(float* input, float* output, int nbElementsPerCumsum, size_t inputStrides[3], size_t outputStrides[3], int offsetY, int offsetZ, float* blockSum) {
# blockCumSum
kname = "k_blockCumSum"
k_var = "k_blockCumSum_" + nodename
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32', gpuarray.GpuArray,]
code="""
// helper functions
WITHIN_KERNEL
void k_reductionPhase_%(nodename)s(float* partialCumSum) {
// Traverse down from leaves to root building partial sums at internal nodes in the tree.
for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index < blockDim.x*2) {
partialCumSum[index] += partialCumSum[index - stride];
}
}
}
WITHIN_KERNEL
void k_fetchData_%(nodename)s(float* partialCumSum, float* input, int globalThreadID,
ssize_t dataStrides_x, ssize_t dataStrides_y, ssize_t dataStrides_z,
int offsetY, int offsetZ) {
// blockIdx.y and blockIdx.z represents the current independent cumsum
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
partialCumSum[threadIdx.x*2] = input[idx_even];
partialCumSum[threadIdx.x*2 + 1] = input[idx_odd];
}
WITHIN_KERNEL
void k_reversePhase_%(nodename)s(float* partialCumSum) {
// Traverse back up the tree building the scan from the partial sums
for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
local_barrier();
unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
if(index + stride < blockDim.x*2) {
partialCumSum[index + stride] += partialCumSum[index];
}
}
}
WITHIN_KERNEL
void k_pushData_%(nodename)s(float* partialCumSum, float* output, int globalThreadID,
ssize_t dataStrides_x, ssize_t dataStrides_y, ssize_t dataStrides_z,
int offsetY, int offsetZ) {
local_barrier();
// blockIdx.y and blockIdx.z represents the current independent cumsum
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
output[idx_even] = partialCumSum[threadIdx.x*2];
output[idx_odd] = partialCumSum[threadIdx.x*2 + 1];
}
KERNEL void k_blockCumSum(float* input, float* output,
size_t nbElementsPerCumsum, ssize_t inputStrides_x,
ssize_t inputStrides_y, ssize_t inputStrides_z,
ssize_t outputStrides_x, ssize_t outputStrides_y,
ssize_t outputStrides_z, int offsetY,
int offsetZ, float* blockSum) {
// Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis.
// The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case.
......@@ -111,7 +170,7 @@ class GpuCumsum(CumsumOp, Op):
extern __shared__ float partialCumSum[];
// Load data in shared memory
k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, inputStrides, offsetY, offsetZ);
k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);
// Use a dichotomy approach to compute the cumsum (i.e. balanced binary tree).
// The tree is sweeped from the leaves to the root and from the root to the leaves.
......@@ -120,7 +179,7 @@ class GpuCumsum(CumsumOp, Op):
k_reversePhase_%(nodename)s(partialCumSum);
// Write the final output to global memory
k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, outputStrides, offsetY, offsetZ);
k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_x,, offsetY, offsetZ);
if (blockSum != NULL){
if (threadIdx.x == blockDim.x - 1) {
......@@ -128,165 +187,38 @@ class GpuCumsum(CumsumOp, Op):
}
}
}
"""
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
# k_finalCumSum
kname = "k_finalCumSum"
k_var = "k_finalCumSum_" + nodename
code = """
KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum,
ssize_t dataStrides_x, ssize_t dataStrides_y, ssize_t dataStrides_z,
int offsetY, int offsetZ) {
int globalThreadID = (blockIdx_x + 1) * blockDim_x + threadIdx_x;
int cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int axis, int maxThreads, int maxGridY, int maxGridZ) {
int shape[3] = { 1, 1, 1 };
size_t inputStrides[3] = {0, 0, 0};
size_t outputStrides[3] = {0, 0, 0};
switch (PyGpuArray_NDIM(input))
{
case 1:
shape[0] = PyGpuArray_DIMS(input)[0];
inputStrides[0] = PyGpuArray_STRIDES(input)[0];
outputStrides[0] = PyGpuArray_STRIDES(output)[0];
break;
case 2:
shape[0] = PyGpuArray_DIMS(input)[0];
shape[1] = PyGpuArray_DIMS(input)[1];
inputStrides[0] = PyGpuArray_STRIDES(input)[0];
inputStrides[1] = PyGpuArray_STRIDES(input)[1];
outputStrides[0] = PyGpuArray_STRIDES(output)[0];
outputStrides[1] = PyGpuArray_STRIDES(output)[1];
break;
case 3:
shape[0] = PyGpuArray_DIMS(input)[0];
shape[1] = PyGpuArray_DIMS(input)[1];
shape[2] = PyGpuArray_DIMS(input)[2];
inputStrides[0] = PyGpuArray_STRIDES(input)[0];
inputStrides[1] = PyGpuArray_STRIDES(input)[1];
inputStrides[2] = PyGpuArray_STRIDES(input)[2];
outputStrides[0] = PyGpuArray_STRIDES(output)[0];
outputStrides[1] = PyGpuArray_STRIDES(output)[1];
outputStrides[2] = PyGpuArray_STRIDES(output)[2];
break;
default:
return -1;
}
if (shape[axis] <= 1) {
CudaNdarray_CopyFromCudaNdarray(output, input);
return 0;
}
// Perform cumsum on array of even size.
int nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);
// Determine how many elements can be processed in one block.
int dimBlockX = ceil( min(nbElementsPerCumsum, 2*maxThreads) / 2.0);
// Determine how many blocks are needed in total.
int dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX)); // Nb. of blocks needed per cumsum.
int dimGridY; // Nb. of independent cumsums (width).
int dimGridZ; // Nb. of independent cumsums (height).
int tmp;
switch (axis)
{
case 0:
dimGridY = shape[1];
dimGridZ = shape[2];
break;
case 1:
dimGridY = shape[0];
dimGridZ = shape[2];
tmp = inputStrides[0];
inputStrides[0] = inputStrides[1];
inputStrides[1] = tmp;
tmp = outputStrides[0];
outputStrides[0] = outputStrides[1];
outputStrides[1] = tmp;
break;
case 2:
dimGridY = shape[1];
dimGridZ = shape[0];
tmp = inputStrides[0];
inputStrides[0] = inputStrides[2];
inputStrides[2] = tmp;
tmp = outputStrides[0];
outputStrides[0] = outputStrides[2];
outputStrides[2] = tmp;
break;
default:
return -1;
// Check if current has data to process.
if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
return;
}
const int shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum);
// Perform `maxGridY`*`maxGridZ` cumsums in parallel.
for (int offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
int localDimGridY = min(dimGridY - offsetY, maxGridY);
for (int offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
int localDimGridZ = min(dimGridZ - offsetZ, maxGridZ);
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cumsum per block.
int sharedBytes = (2*dimBlockX) * sizeof(float);
k_blockCumSum_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
(
CudaNdarray_DEV_DATA(input),
CudaNdarray_DEV_DATA(output),
nbElementsPerCumsum,
inputStrides,
outputStrides,
offsetY,
offsetZ,
CudaNdarray_DEV_DATA(deviceBlockSum)
);
if (dimGridX > 1) {
// Do a cumsum over the blockSum (recursive).
if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
Py_DECREF(deviceBlockSum);
return -1;
}
int idY = blockIdx_y + offsetY;
int idZ = blockIdx_z + offsetZ;
// Since there are more than one block (i.e. `dimGridX > 1`)
// report partial cumsums of previous blocks to subsequents ones.
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1};
k_finalCumSum_%(nodename)s<<<dimGrid, dimBlock>>>
(
CudaNdarray_DEV_DATA(output),
CudaNdarray_DEV_DATA(deviceBlockSum),
nbElementsPerCumsum,
outputStrides,
offsetY,
offsetZ
);
}
const float currentBlockSum = blockSum[blockIdx_x*(gridDim_y*gridDim_z) + idY*gridDim.z + idZ];
// If shape[axis] is odd, the last element is compute manually
if (shape[axis] != nbElementsPerCumsum){
size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {1, 1, 1};
k_cumadd_%(nodename)s<<<dimGrid, dimBlock>>>
(
CudaNdarray_DEV_DATA(input),
CudaNdarray_DEV_DATA(output),
inputStrides,
outputStrides,
offsetY,
offsetZ,
shape[axis]-2,
shape[axis]-1
);
}
}
}
Py_DECREF(deviceBlockSum);
CNDA_THREAD_SYNC;
return 0;
int offset = idY * dataStrides_y + idZ * dataStrides_z;
int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
int idx_odd = (globalThreadID*2 + 1) * dataStrides_x + offset;
output[idx_even] += currentBlockSum;
output[idx_odd] += currentBlockSum;
}
"""
"""
params = [gpuarray.GpuArray, gpuarray.GpuArray, gpuarray.SIZE,
gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.SSIZE,
'int32', 'int32',]
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
......@@ -300,17 +232,8 @@ class GpuCumsum(CumsumOp, Op):
axis = self.axis if self.axis is not None else 0
fail = sub['fail']
max_threads_dim0 = self.max_threads_dim0
max_grid_size1 = self.max_grid_size1
max_grid_size2 = self.max_grid_size2
if max_threads_dim0 is None or max_grid_size1 is None or max_grid_size2 is None:
raise NotImplementedError("GpuCumsum.c_code should not be called "
"directly. It should be called by "
"make_thunk() that add some information "
"related to the selected GPU.")
code = """
const int* shape = PyGpuArray_DIMS(%(x)s);
const size_t* shape = PyGpuArray_DIMS(%(x)s);
bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);
int axis = %(axis)s;
......@@ -319,36 +242,31 @@ class GpuCumsum(CumsumOp, Op):
axis += PyGpuArray_NDIM(%(x)s);
}
// If output is already allocated, check if its shape matches the input's one.
if (!needAllocation) {
for (int i= 0; i < PyGpuArray_NDIM(%(x)s); ++i) {
if (PyGpuArray_DIMS(%(x)s)[i] != PyGpuArray_DIMS(%(z)s)[i]) {
needAllocation = true;
}
}
}
if (needAllocation){
Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*) CudaNdarray_NewDims(PyGpuArray_NDIM(%(x)s), shape);
}
if (!%(z)s) {
if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(type)s, GA_C_ORDER, %(ctx)s) == 0){
%(fail)s;
}
{ // Namespace for kernel calls //
if (cumSum_%(nodename)s(%(x)s, %(z)s, axis, %(max_threads_dim0)s, %(max_grid_size1)s, %(max_grid_size2)s) == -1){
size_t max_threads_dim0;
size_t max_grid_size1;
size_t max_grid_size2;
int err;
err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
%(fail)s;
}
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"cumSum_%(nodename)s",
cudaGetErrorString(sts));
err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
%(fail)s;
}
err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
if (err != GA_NO_ERROR){
PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
%(fail)s;
}
if (cumSum_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){
%(fail)s;
}
}
......@@ -356,8 +274,166 @@ class GpuCumsum(CumsumOp, Op):
return code
def c_support_code_apply(self, node, nodename):
code = """int cumSum_%(nodename)s(float* input, float* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
size_t shape[3] = { 1, 1, 1 };
ssize_t inputStrides_x;
ssize_t inputStrides_y;
ssize_t inputStrides_z;
ssize_t outputStrides_x;
ssize_t outputStrides_y;
ssize_t outputStrides_z;
switch (PYArray_NDIM(input))
{
case 1:
shape[0] = PyArray_DIMS(input)[0];
inputStrides_x = PyGpuArray_STRIDES(input)[0];
outputStrides_x = PyGpuArray_STRIDES(output)[0];
break;
case 2:
shape[0] = PyArray_DIMS(input)[0];
shape[1] = PyArray_DIMS(input)[1];
inputStrides_x = PyGpuArray_STRIDES(input)[0];
inputStrides_y = PyGpuArray_STRIDES(input)[1];
outputStrides_x = PyGpuArray_STRIDES(output)[0];
outputStrides_y = PyGpuArray_STRIDES(output)[1];
break;
case 3:
shape[0] = PyArray_DIMS(input)[0];
shape[1] = PyArray_DIMS(input)[1];
shape[2] = PyArray_DIMS(input)[2];
inputStrides_x = PyGpuArray_STRIDES(input)[0];
inputStrides_y = PyGpuArray_STRIDES(input)[1];
inputStrides_z = PyGpuArray_STRIDES(input)[2];
outputStrides_x = PyGpuArray_STRIDES(output)[0];
outputStrides_y = PyGpuArray_STRIDES(output)[1];
outputStrides_z = PyGpuArray_STRIDES(output)[2];
break;
default:
return -1;
}
if (shape[axis] <= 1) {
output = pygpu_copy(input, GA_ANY_ORDER);
return 0;
}
// Perform cumsum on array of even size.
size_t nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);
// Determine how many elements can be processed in one block.
size_t dimBlockX = ceil( min(nbElementsPerCumsum, 2*maxThreads) / 2.0);
// Determine how many blocks are needed in total.
size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX)); // Nb. of blocks needed per cumsum.
size_t dimGridY; // Nb. of independent cumsums (width).
size_t dimGridZ; // Nb. of independent cumsums (height).
ssize_t tmp;
switch (axis)
{
case 0:
dimGridY = shape[1];
dimGridZ = shape[2];
break;
case 1:
dimGridY = shape[0];
dimGridZ = shape[2];
tmp = inputStrides_x;
inputStrides_x = inputStrides_y;
inputStrides_y = tmp;
tmp = outputStrides_x;
outputStrides_x = outputStrides_y;
outputStrides_y = tmp;
break;
case 2:
dimGridY = shape[1];
dimGridZ = shape[0];
tmp = inputStrides_x;
inputStrides_x = inputStrides_z;
inputStrides_z = tmp;
tmp = outputStrides_x;
outputStrides_x = outputStrides_z;
outputStrides_z = tmp;
break;
default:
return -1;
}
const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->typecode,
GA_C_ORDER, input->context->ctx, Py_None);
if (deviceBlockSum == NULL){
return -1;
}
// Perform `maxGridY`*`maxGridZ` cumsums in parallel.
for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
size_t localDimGridY = min(dimGridY - offsetY, maxGridY);
for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
size_t localDimGridZ = min(dimGridZ - offsetZ, maxGridZ);
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1}; // One cumsum per block.
size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) input->ga.data,
(void*) output->ga.data,
(void*) &nbElementsPerCumsum,
(void*) &inputStrides_x,
(void*) &inputStrides_y,
(void*) &inputStrides_z,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ,
(void*) deviceBlockSum->ga.data;
};
int err = GpuKernel_call(k_blockCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
if (dimGridX > 1) {
// Do a cumsum over the blockSum (recursive).
if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
Py_DECREF(deviceBlockSum);
return -1;
}
// Since there are more than one block (i.e. `dimGridX > 1`)
// report partial cumsums of previous blocks to subsequents ones.
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) output->ga.data,
(void*) deviceBlockSum->ga.data,
(void*) &nbElementsPerCumsum,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ
};
int err = GpuKernel_call(k_finalCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
}
// If shape[axis] is odd, the last element is compute manually
if (shape[axis] != nbElementsPerCumsum){
size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {1, 1, 1};
void* kernel_params[] = {(void*) input->ga.data,
(void*) output->ga.data,
(void*) &inputStrides_x,
(void*) &inputStrides_y,
(void*) &inputStrides_z,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
(void*) &outputStrides_z,
(void*) &offsetY,
(void*) &offsetZ,
(void*) &(shape[axis]-2),
(void*) &(shape[axis]-1)
};
int err = GpuKernel_call(k_cumadd_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
}
}
}
Py_XDECREF(deviceBlockSum);
return 0;
}
"""
return "\n".join(super(GpuKernelBase, self).c_support_code_apply(node, name), code)
@op_lifter([CumsumOp])
def use_gpu_cumsumop(node, axis):
return GpuCumsum(axis)
def use_gpu_cumsumop(node, ctx_name):
return GpuCumsum(node.op.axis)
register_gpu_opt()(use_gpu_cumsumop)
# Skip test if cuda_ndarray is not available.
from __future__ import absolute_import, print_function, division
import itertools
import numpy as np
from six.moves import xrange
from theano import tensor as T
import theano
import theano.tensor.tests.test_extra_ops
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests import unittest_tools as utt
from .config import mode_with_gpu, test_ctx_name, test_ctx
from ..extra_ops import GpuCumsum
from ..type import get_context
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
mode = mode_with_gpu
def setUp(self):
super(TestGpuCumsum, self).setUp()
if get_context(test_ctx_name).kind != 'cuda':
raise SkipTest("Cuda specific tests")
self.max_threads_dim0 = test_ctx.maxlsize0
self.max_grid_size1 = test_ctx.maxgsize1
def test_Strides1D(self):
x = T.fvector('x')
for axis in [0, None, -1]:
a = np.random.random((42,)).astype("float32")
cumsum_function = theano.function([x], cumsum(x, axis=axis),
mode=self.mode)
slicings = [slice(None, None, None), # Normal strides
slice(None, None, 2), # Stepped strides
slice(None, None, -1), # Negative strides
]
# Cartesian product of all slicings to test.
for slicing in itertools.product(slicings, repeat=x.ndim):
f = theano.function([x], cumsum(x[slicing], axis=axis),
mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
cumsum_function(a[slicing]))
def test_Strides2D(self):
x = T.fmatrix('x')
for axis in [0, 1, None, -1, -2]:
a = np.random.random((42, 30)).astype("float32")
cumsum_function = theano.function([x], cumsum(x, axis=axis),
mode=self.mode)
slicings = [slice(None, None, None), # Normal strides
slice(None, None, 2), # Stepped strides
slice(None, None, -1), # Negative strides
]
# Cartesian product of all slicings to test.
for slicing in itertools.product(slicings, repeat=x.ndim):
f = theano.function([x], cumsum(x[slicing], axis=axis),
mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
cumsum_function(a[slicing]))
def test_Strides3D(self):
x = T.ftensor3('x')
for axis in [0, 1, 2, None, -1, -2, -3]:
a = np.random.random((42, 30, 25)).astype("float32")
cumsum_function = theano.function([x], cumsum(x, axis=axis),
mode=self.mode)
slicings = [slice(None, None, None), # Normal strides
slice(None, None, 2), # Stepped strides
slice(None, None, -1), # Negative strides
]
# Cartesian product of all slicings to test.
for slicing in itertools.product(slicings, repeat=x.ndim):
f = theano.function([x], cumsum(x[slicing], axis=axis),
mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
utt.assert_allclose(np.cumsum(a[slicing], axis=axis), f(a))
utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
cumsum_function(a[slicing]))
def test_GpuCumsum1D(self):
block_max_size = self.max_threads_dim0 * 2
x = T.fvector('x')
f = theano.function([x], cumsum(x), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
# Extensive testing for the first 1025 sizes
a = np.random.random(1025).astype("float32")
for i in xrange(a.shape[0]):
utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
# Use multiple GPU threadblocks
a = np.random.random((block_max_size+2,)).astype("float32")
utt.assert_allclose(np.cumsum(a), f(a))
# Use recursive cumsum
a = np.ones((block_max_size*(block_max_size+1)+2,),
dtype="float32")
utt.assert_allclose(np.cumsum(a), f(a))
def test_GpuCumsum2D(self):
block_max_size = self.max_threads_dim0 * 2
x = T.fmatrix('x')
for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]):
f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
# Extensive testing for the first 1025 sizes
a_shape = [5, 5]
a_shape[shape_axis] = 1025
a = np.random.random(a_shape).astype("float32")
slices = [slice(None), slice(None)]
for i in xrange(a.shape[shape_axis]):
slices[shape_axis] = slice(i)
fa = f(a[slices])
npa = np.cumsum(a[slices], axis=axis)
utt.assert_allclose(npa, fa)
# Use multiple GPU threadblocks
a_shape = [5, 5]
a_shape[shape_axis] = block_max_size+2
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks
a_shape = [4, 4]
a_shape[1-shape_axis] = self.max_grid_size1+1
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
# Use recursive cumsum
a_shape = [3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum3D(self):
block_max_size = self.max_threads_dim0 * 2
x = T.ftensor3('x')
for shape_axis, axis in zip([0, 1, 2, 0, 2, 1, 0], [0, 1, 2, None, -1, -2, -3]):
f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, GpuCumsum)]
# Extensive testing for the first 1025 sizes
a_shape = [5, 5, 5]
a_shape[shape_axis] = 1025
a = np.random.rand(*a_shape).astype("float32")
slices = [slice(None), slice(None), slice(None)]
for i in xrange(a.shape[shape_axis]):
slices[shape_axis] = slice(i)
fa = f(a[slices])
npa = np.cumsum(a[slices], axis=axis)
utt.assert_allclose(npa, fa)
# Use multiple GPU threadblocks (along accumulation axis)
a_shape = [2, 2, 2]
a_shape[shape_axis] = block_max_size+2
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks (not along accumulation axis)
a_shape = [5, 5, 5]
a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1
a = np.random.random(a_shape).astype("float32")
if axis is None:
# Avoid floating point error
a = np.sign(a-0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
a_shape = [5, 5, 5]
a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1
a = np.random.random(a_shape).astype("float32")
if axis is None:
# Avoid floating point error
a = np.sign(a-0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use recursive cumsum (along accumulation axis)
a_shape = [3, 3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum4D(self):
# Should not use the GPU version.
x = T.ftensor4('x')
f = theano.function([x], cumsum(x, axis=1), mode=self.mode)
assert [n for n in f.maker.fgraph.toposort()
if isinstance(n.op, CumsumOp)]
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论