Commit 99cffe57 authored by Kelvin Xu

passing tests

Parent ba81f75f
@@ -43,7 +43,7 @@ class GpuCumsum(CumsumOp, GpuOp):
         if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
             raise NotImplementedError('Only cumsum on 1D, 2D and 3D array are supported right now!')
+        print(self.axis)
         if self.axis >= x.ndim or self.axis < -x.ndim:
             raise ValueError('axis(={1}) out of bounds'.format(self.axis))
@@ -53,6 +53,7 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
         a = np.random.random((42,)).astype("float32")
         cumsum_function = theano.function([x], cumsum(x, axis=axis),
                                           mode=self.mode)
+        theano.printing.debugprint(cumsum_function)

         slicings = [slice(None, None, None),  # Normal strides
                     slice(None, None, 2),     # Stepped strides
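The `slicings` list above feeds non-contiguous views into the op, which is why the kernels below carry explicit inputStrides_*/outputStrides_* arguments. A minimal NumPy sketch of the stride patterns these slices produce (illustrative only, not part of the commit):

    import numpy as np

    a = np.random.random((42,)).astype("float32")
    print(a.strides)          # (4,) -- normal strides: slice(None, None, None)
    print(a[::2].strides)     # (8,) -- stepped strides: slice(None, None, 2)
    print(np.cumsum(a[::2]))  # reference result the GPU op must reproduce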
 from __future__ import absolute_import, print_function, division
 import theano
 import numpy
+import os

 from theano import Op, Apply, config
 from theano.tensor.extra_ops import CumsumOp
@@ -11,12 +12,12 @@ except ImportError:
     pass

 from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
-                        infer_context_name)
+                        infer_context_name, GpuFromHost, HideC)
 from .opt import register_opt as register_gpu_opt, op_lifter
 from .type import GpuArrayType


-class GpuCumsum(CumsumOp, GpuKernelBase):
+class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
     """
     Parameters
     ----------
@@ -32,12 +33,17 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
     def __str__(self):
         return "%s{%s}" % (self.__class__.__name__, self.axis)

-    def c_code_cache_version(self):
-        return (1,)
+    def c_code_cache_version_apply(self, node):
+        return None

     def c_headers(self):
-        return ['<numpy_compat.h>', '<gpuarray/types.h>']
+        return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
+
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
+
+    def get_params(self, node):
+        return node.inputs[0].type.context

     def make_node(self, x):
         assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
@@ -48,7 +54,7 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
                                       3D arrays are supported right now!')

         if self.axis >= x.ndim or self.axis < -x.ndim:
-            raise ValueError('axis(={1}) out of bounds'.format(self.axis))
+            raise ValueError('axis(={0}) out of bounds'.format(self.axis))

         return Apply(self, [x], [x.type()])
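The `{1}` to `{0}` fix above matters beyond style: `str.format` indexes its positional arguments from zero, so the old message raised an IndexError before the intended ValueError could surface (illustrative):

    'axis(={0}) out of bounds'.format(2)  # 'axis(=2) out of bounds'
    'axis(={1}) out of bounds'.format(2)  # IndexError: tuple index out of range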
@@ -66,10 +72,12 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         dtype_x = node.inputs[0].dtype
         flags = Kernel.get_flags(dtype_x)
         code = """
-        KERNEL void %(kname)s(float* input, float* output, ssize_t inputStrides_x,
-                              ssize_t inputStrides_y, ssize_t inputStrides_z,
-                              ssize_t outputStrides_x, ssize_t outputStrides_y,
-                              ssize_t outputStrides_z, const int offsetY, const int offsetZ,
+        KERNEL void %(kname)s(float* input, float* output,
+                              ga_ssize inputStrides_x,
+                              ga_ssize inputStrides_y,
+                              ga_ssize inputStrides_z,
+                              ga_ssize outputStrides_x, ga_ssize outputStrides_y,
+                              ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
                               const int beforeLastElementIdx, const int lastElementIdx){
             int idY = blockIdx.y + offsetY;
             int idZ = blockIdx.z + offsetZ;
@@ -100,7 +108,7 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         code="""
         // helper functions
         WITHIN_KERNEL
-        void k_reductionPhase_%(nodename)s(float* partialCumSum) {
+        void k_reductionPhase(float* partialCumSum) {
             // Traverse down from leaves to root building partial sums at internal nodes in the tree.
             for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
                 local_barrier();
@@ -112,8 +120,8 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         }

         WITHIN_KERNEL
-        void k_fetchData_%(nodename)s(float* partialCumSum, float* input, int globalThreadID,
-                                      ssize_t dataStrides_x, ssize_t dataStrides_y, ssize_t dataStrides_z,
+        void k_fetchData(float* partialCumSum, float* input, int globalThreadID,
+                         ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                          int offsetY, int offsetZ) {
             // blockIdx.y and blockIdx.z represents the current independent cumsum
             int idY = blockIdx.y + offsetY;
@@ -125,7 +133,7 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         }

         WITHIN_KERNEL
-        void k_reversePhase_%(nodename)s(float* partialCumSum) {
+        void k_reversePhase(float* partialCumSum) {
             // Traverse back up the tree building the scan from the partial sums
             for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
                 local_barrier();
@@ -137,8 +145,8 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         }

         WITHIN_KERNEL
-        void k_pushData_%(nodename)s(float* partialCumSum, float* output, int globalThreadID,
-                                     ssize_t dataStrides_x, ssize_t dataStrides_y, ssize_t dataStrides_z,
+        void k_pushData(float* partialCumSum, float* output, int globalThreadID,
+                        ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                          int offsetY, int offsetZ) {
             local_barrier();
             // blockIdx.y and blockIdx.z represents the current independent cumsum
@@ -152,10 +160,10 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         }

         KERNEL void k_blockCumSum(float* input, float* output,
-                                  size_t nbElementsPerCumsum, ssize_t inputStrides_x,
-                                  ssize_t inputStrides_y, ssize_t inputStrides_z,
-                                  ssize_t outputStrides_x, ssize_t outputStrides_y,
-                                  ssize_t outputStrides_z, int offsetY,
+                                  size_t nbElementsPerCumsum, ga_ssize inputStrides_x,
+                                  ga_ssize inputStrides_y, ga_ssize inputStrides_z,
+                                  ga_ssize outputStrides_x, ga_ssize outputStrides_y,
+                                  ga_ssize outputStrides_z, int offsetY,
                                   int offsetZ, float* blockSum) {
             // Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis.
             // The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case.
@@ -170,16 +178,16 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
             extern __shared__ float partialCumSum[];

             // Load data in shared memory
-            k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);
+            k_fetchData(partialCumSum, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);

             // Use a dichotomy approach to compute the cumsum (i.e. balanced binary tree).
             // The tree is sweeped from the leaves to the root and from the root to the leaves.
             // Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
-            k_reductionPhase_%(nodename)s(partialCumSum);
-            k_reversePhase_%(nodename)s(partialCumSum);
+            k_reductionPhase(partialCumSum);
+            k_reversePhase(partialCumSum);

             // Write the final output to global memory
-            k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_x,, offsetY, offsetZ);
+            k_pushData(partialCumSum, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);

             if (blockSum != NULL){
                 if (threadIdx.x == blockDim.x - 1) {
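The `k_reductionPhase`/`k_reversePhase` pair above implements the balanced-tree scan from the ScanTalk slides linked in the kernel comments. A serial NumPy analogue for a single block, assuming a power-of-two length (illustrative, not the commit's code):

    import numpy as np

    def block_cumsum(a):
        a = a.copy()
        n = len(a)  # power of two for simplicity
        # Reduction phase (up-sweep): build partial sums at internal tree nodes.
        stride = 1
        while stride < n:
            for i in range(2 * stride - 1, n, 2 * stride):
                a[i] += a[i - stride]
            stride *= 2
        # Reverse phase (down-sweep): propagate partial sums back to the leaves.
        stride //= 2
        while stride > 0:
            for i in range(2 * stride - 1, n - stride, 2 * stride):
                a[i + stride] += a[i]
            stride //= 2
        return a

    v = np.arange(1, 9, dtype='float32')
    assert np.array_equal(block_cumsum(v), np.cumsum(v))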
@@ -195,19 +203,19 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         k_var = "k_finalCumSum_" + nodename
         code = """
         KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum,
-                                  ssize_t dataStrides_x, ssize_t dataStrides_y, ssize_t dataStrides_z,
+                                  ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                                   int offsetY, int offsetZ) {
-            int globalThreadID = (blockIdx_x + 1) * blockDim_x + threadIdx_x;
+            int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;

             // Check if current has data to process.
             if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
                 return;
             }

-            int idY = blockIdx_y + offsetY;
-            int idZ = blockIdx_z + offsetZ;
+            int idY = blockIdx.y + offsetY;
+            int idZ = blockIdx.z + offsetZ;

-            const float currentBlockSum = blockSum[blockIdx_x*(gridDim_y*gridDim_z) + idY*gridDim.z + idZ];
+            const float currentBlockSum = blockSum[blockIdx.x*(gridDim.y*gridDim.z) + idY*gridDim.z + idZ];
             int offset = idY * dataStrides_y + idZ * dataStrides_z;

             int idx_even = (globalThreadID*2 ) * dataStrides_x + offset;
@@ -224,15 +232,17 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         return kernels

-    def c_code(self, node, name, inp, out, sub):
+    def c_code(self, node, nodename, inp, out, sub):
         if node.inputs[0].type.context.kind != 'cuda':
             raise NotImplementedError("cuda only")

         x, = inp
         z, = out
         axis = self.axis if self.axis is not None else 0
         fail = sub['fail']
+        ctx = sub['params']

         code = """
             const size_t* shape = PyGpuArray_DIMS(%(x)s);
             bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);
@@ -242,7 +252,7 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
                 axis += PyGpuArray_NDIM(%(x)s);
             }

-            if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(type)s, GA_C_ORDER, %(ctx)s) == 0){
+            if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s) != 0){
                 %(fail)s;
             }
@@ -274,8 +284,10 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         return code

-    def c_support_code_apply(self, node, nodename):
-        code = """int cumSum_%(nodename)s(float* input, float* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
+    def c_support_code_struct(self, node, nodename):
+        code = """
+        int cumSum_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
             size_t shape[3] = { 1, 1, 1 };
             ssize_t inputStrides_x;
             ssize_t inputStrides_y;
@@ -283,43 +295,44 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
             ssize_t outputStrides_x;
             ssize_t outputStrides_y;
             ssize_t outputStrides_z;

-            switch (PYArray_NDIM(input))
+            switch (PyGpuArray_NDIM(input))
             {
                 case 1:
-                    shape[0] = PyArray_DIMS(input)[0];
-                    inputStrides_x = PyGpuArray_STRIDES(input)[0];
-                    outputStrides_x = PyGpuArray_STRIDES(output)[0];
+                    shape[0] = PyGpuArray_DIMS(input)[0];
+                    inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
+                    outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
                     break;
                 case 2:
-                    shape[0] = PyArray_DIMS(input)[0];
-                    shape[1] = PyArray_DIMS(input)[1];
-                    inputStrides_x = PyGpuArray_STRIDES(input)[0];
-                    inputStrides_y = PyGpuArray_STRIDES(input)[1];
-                    outputStrides_x = PyGpuArray_STRIDES(output)[0];
-                    outputStrides_y = PyGpuArray_STRIDES(output)[1];
+                    shape[0] = PyGpuArray_DIMS(input)[0];
+                    shape[1] = PyGpuArray_DIMS(input)[1];
+                    inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
+                    inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
+                    outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
+                    outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
                     break;
                 case 3:
-                    shape[0] = PyArray_DIMS(input)[0];
-                    shape[1] = PyArray_DIMS(input)[1];
-                    shape[2] = PyArray_DIMS(input)[2];
-                    inputStrides_x = PyGpuArray_STRIDES(input)[0];
-                    inputStrides_y = PyGpuArray_STRIDES(input)[1];
-                    inputStrides_z = PyGpuArray_STRIDES(input)[2];
-                    outputStrides_x = PyGpuArray_STRIDES(output)[0];
-                    outputStrides_y = PyGpuArray_STRIDES(output)[1];
-                    outputStrides_z = PyGpuArray_STRIDES(output)[2];
+                    shape[0] = PyGpuArray_DIMS(input)[0];
+                    shape[1] = PyGpuArray_DIMS(input)[1];
+                    shape[2] = PyGpuArray_DIMS(input)[2];
+                    inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
+                    inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
+                    inputStrides_z = PyGpuArray_STRIDES(input)[2] / sizeof(float);
+                    outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
+                    outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
+                    outputStrides_z = PyGpuArray_STRIDES(output)[2] / sizeof(float);
                     break;
                 default:
+                    PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
                     return -1;
             }

             if (shape[axis] <= 1) {
-                output = pygpu_copy(input, GA_ANY_ORDER);
-                return 0;
+                int err = pygpu_move(output, input);
+                return err;
             }

             // Perform cumsum on array of even size.
             size_t nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);

             // Determine how many elements can be processed in one block.
-            size_t dimBlockX = ceil( min(nbElementsPerCumsum, 2*maxThreads) / 2.0);
+            size_t dimBlockX = ceil((nbElementsPerCumsum > 2*maxThreads ? 2*maxThreads : nbElementsPerCumsum) / 2.0);

             // Determine how many blocks are needed in total.
             size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX));  // Nb. of blocks needed per cumsum.
             size_t dimGridY;  // Nb. of independent cumsums (width).
@@ -344,27 +357,33 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
                 case 2:
                     dimGridY = shape[1];
                     dimGridZ = shape[0];

                     tmp = inputStrides_x;
                     inputStrides_x = inputStrides_z;
                     inputStrides_z = tmp;

                     tmp = outputStrides_x;
                     outputStrides_x = outputStrides_z;
                     outputStrides_z = tmp;
                     break;
                 default:
+                    PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
                     return -1;
             }

             const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
-            PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->typecode,
-                                                           GA_C_ORDER, input->context->ctx, Py_None);
+            PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->ga.typecode,
+                                                           GA_C_ORDER, input->context, Py_None);
             if (deviceBlockSum == NULL){
                 return -1;
             }

             // Perform `maxGridY`*`maxGridZ` cumsums in parallel.
             for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
-                size_t localDimGridY = min(dimGridY - offsetY, maxGridY);
+                size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY);
                 for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
-                    size_t localDimGridZ = min(dimGridZ - offsetZ, maxGridZ);
+                    size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ);
                     size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
                     size_t dimBlock[3] = {dimBlockX, 1, 1};  // One cumsum per block.
                     size_t sharedBytes = (2*dimBlockX) * sizeof(float);
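The offsetY/offsetZ loops above tile the independent cumsums whenever dimGridY or dimGridZ exceeds the device's maximum grid dimensions; the ternaries replace `min()`, which is not guaranteed to be available for `size_t` operands in this host code. A Python analogue of the tiling (illustrative only):

    def tiles(total, max_per_launch):
        # Yield (offset, extent) pairs covering `total` in launch-sized chunks.
        offset = 0
        while offset < total:
            yield offset, min(total - offset, max_per_launch)
            offset += max_per_launch

    print(list(tiles(10, 4)))  # [(0, 4), (4, 4), (8, 2)]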
@@ -379,9 +398,13 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
                                              (void*) &outputStrides_z,
                                              (void*) &offsetY,
                                              (void*) &offsetZ,
-                                             (void*) deviceBlockSum->ga.data;
+                                             (void*) deviceBlockSum->ga.data
                                             };
-                    int err = GpuKernel_call(k_blockCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                    int err = GpuKernel_call(&k_blockCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                    if (err != GA_NO_ERROR){
+                        PyErr_SetString(PyExc_RuntimeError, "blockCumSum call failed");
+                        return -1;
+                    }

                     if (dimGridX > 1) {
                         // Do a cumsum over the blockSum (recursive).
@@ -402,12 +425,18 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
                                                  (void*) &offsetY,
                                                  (void*) &offsetZ
                                                 };
-                        int err = GpuKernel_call(k_finalCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                        int err = GpuKernel_call(&k_finalCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                        if (err != GA_NO_ERROR){
+                            PyErr_SetString(PyExc_RuntimeError, "finalCumSum call failed");
+                            return -1;
+                        }
                     }

                     // If shape[axis] is odd, the last element is compute manually
                     if (shape[axis] != nbElementsPerCumsum){
                         size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
                         size_t dimBlock[3] = {1, 1, 1};
+                        size_t tmp0 = shape[axis]-2;
+                        size_t tmp1 = shape[axis]-1;
                         void* kernel_params[] = {(void*) input->ga.data,
                                                  (void*) output->ga.data,
                                                  (void*) &inputStrides_x,
@@ -418,22 +447,43 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
                                                  (void*) &outputStrides_z,
                                                  (void*) &offsetY,
                                                  (void*) &offsetZ,
-                                                 (void*) &(shape[axis]-2),
-                                                 (void*) &(shape[axis]-1)
+                                                 (void*) &(tmp0),
+                                                 (void*) &(tmp1)
                                                 };
-                        int err = GpuKernel_call(k_cumadd_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                        int err = GpuKernel_call(&k_cumadd_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                        if (err != GA_NO_ERROR){
+                            PyErr_SetString(PyExc_RuntimeError, "cumadd call failed");
+                            return -1;
+                        }
                     }
                 }
             }

             Py_XDECREF(deviceBlockSum);
             return 0;
         }
-        """
-        return "\n".join(super(GpuKernelBase, self).c_support_code_apply(node, name), code)
+        """ % locals()
+        return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code


 @op_lifter([CumsumOp])
 def use_gpu_cumsumop(node, ctx_name):
-    return GpuCumsum(node.op.axis)
+    if node.inputs[0].dtype == 'float32':
+        axis = node.op.axis
+        x = node.inputs[0]
+
+        if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
+            return None
+
+        if axis is None and x.ndim > 1:
+            x = x.flatten()
+
+        x = GpuFromHost(ctx_name)(x)
+
+        # ``gpu_cumsum`` assume array has been flattened if needed.
+        if axis is None:
+            axis = 0
+
+        return GpuCumsum(axis)(x)
 register_gpu_opt()(use_gpu_cumsumop)
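A hedged sketch of how the registered lifter is meant to be exercised end to end (assumes a CUDA gpuarray device is active so the optimizer actually applies; illustrative, not part of the commit):

    import numpy as np
    import theano
    import theano.tensor as T
    from theano.tensor.extra_ops import cumsum

    x = T.fmatrix('x')  # float32, as required by GpuCumsum
    # With a CUDA gpuarray device configured, use_gpu_cumsumop should
    # rewrite the host CumsumOp node into GpuCumsum(1).
    f = theano.function([x], cumsum(x, axis=1))
    data = np.random.rand(3, 5).astype('float32')
    assert np.allclose(f(data), np.cumsum(data, axis=1))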
@@ -12,7 +12,7 @@ import theano.tensor.tests.test_extra_ops
 from theano.tensor.extra_ops import cumsum, CumsumOp
 from theano.tests import unittest_tools as utt

-from .config import mode_with_gpu, test_ctx_name, test_ctx
+from .config import mode_with_gpu, test_ctx_name
 from ..extra_ops import GpuCumsum
 from ..type import get_context
@@ -22,10 +22,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
     def setUp(self):
         super(TestGpuCumsum, self).setUp()

-        if get_context(test_ctx_name).kind != 'cuda':
+        test_ctx = get_context(test_ctx_name)
+        if test_ctx.kind != 'cuda':
             raise SkipTest("Cuda specific tests")

         self.max_threads_dim0 = test_ctx.maxlsize0
-        self.max_grid_size1 = test_ctx.maxgsize1
+        self.max_grid_size1 = test_ctx.maxgsize2