testgroup / pytensor / Commits / 99cffe57

Commit 99cffe57 authored Apr 19, 2016 by Kelvin Xu
passing tests
Parent: ba81f75f

Showing 4 changed files with 231 additions and 179 deletions (+231 −179):
theano/sandbox/cuda/extra_ops.py                  +1   −1
theano/sandbox/cuda/tests/test_extra_ops.py       +1   −0
theano/sandbox/gpuarray/extra_ops.py              +225 −175
theano/sandbox/gpuarray/tests/test_extra_ops.py   +4   −3
theano/sandbox/cuda/extra_ops.py
@@ -43,7 +43,7 @@ class GpuCumsum(CumsumOp, GpuOp):
         if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
             raise NotImplementedError(
                 'Only cumsum on 1D, 2D and 3D array are supported right now!')
+        print(self.axis)
         if self.axis >= x.ndim or self.axis < -x.ndim:
             raise ValueError('axis(={1}) out of bounds'.format(self.axis))
theano/sandbox/cuda/tests/test_extra_ops.py
@@ -53,6 +53,7 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
         a = np.random.random((42,)).astype("float32")
         cumsum_function = theano.function([x], cumsum(x, axis=axis),
                                           mode=self.mode)
+        theano.printing.debugprint(cumsum_function)

         slicings = [slice(None, None, None),    # Normal strides
                     slice(None, None, 2),       # Stepped strides
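The single added line is a debugging aid: `theano.printing.debugprint` dumps the optimized graph of a compiled function, which makes it easy to check whether the cumsum node was actually lifted to the GPU. A minimal standalone example of the same call (variable names are illustrative, not from this diff):

    import theano
    import theano.tensor as T
    from theano.tensor.extra_ops import cumsum

    x = T.fvector('x')
    f = theano.function([x], cumsum(x))
    theano.printing.debugprint(f)  # prints the graph of `f`, one apply node per line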
theano/sandbox/gpuarray/extra_ops.py
 from __future__ import absolute_import, print_function, division

 import theano
 import numpy
+import os

 from theano import Op, Apply, config
 from theano.tensor.extra_ops import CumsumOp
@@ -11,12 +12,12 @@ except ImportError:
     pass

 from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
-                        infer_context_name)
+                        infer_context_name, GpuFromHost, HideC)
 from .opt import register_opt as register_gpu_opt, op_lifter
 from .type import GpuArrayType


-class GpuCumsum(CumsumOp, GpuKernelBase):
+class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
     """
     Parameters
     ----------
@@ -32,12 +33,17 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
     def __str__(self):
         return "%s{%s}" % (self.__class__.__name__, self.axis)

-    def c_code_cache_version(self):
-        return (1,)
+    def c_code_cache_version_apply(self, node):
+        return None

     def c_headers(self):
-        return ['<numpy_compat.h>', '<gpuarray/types.h>']
+        return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
+
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
+
+    def get_params(self, node):
+        return node.inputs[0].type.context

     def make_node(self, x):
         assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
@@ -48,7 +54,7 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
                 3D arrays are supported right now!')

         if self.axis >= x.ndim or self.axis < -x.ndim:
-            raise ValueError('axis(={1}) out of bounds'.format(self.axis))
+            raise ValueError('axis(={0}) out of bounds'.format(self.axis))

         return Apply(self, [x], [x.type()])
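The one-character change above fixes a real crash path: `str.format` numbers its positional arguments from zero, so with a single argument the old `{1}` placeholder raised an `IndexError` before the intended `ValueError` could even be built:

    >>> 'axis(={1}) out of bounds'.format(7)
    Traceback (most recent call last):
      ...
    IndexError: ...
    >>> 'axis(={0}) out of bounds'.format(7)
    'axis(=7) out of bounds'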
@@ -66,10 +72,12 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         dtype_x = node.inputs[0].dtype
         flags = Kernel.get_flags(dtype_x)
         code = """
-        KERNEL void %(kname)s(float* input, float* output, ssize_t inputStrides_x,
-                              ssize_t inputStrides_y, ssize_t inputStrides_z,
-                              ssize_t outputStrides_x, ssize_t outputStrides_y,
-                              ssize_t outputStrides_z, const int offsetY, const int offsetZ,
+        KERNEL void %(kname)s(float* input, float* output,
+                              ga_ssize inputStrides_x,
+                              ga_ssize inputStrides_y,
+                              ga_ssize inputStrides_z,
+                              ga_ssize outputStrides_x, ga_ssize outputStrides_y,
+                              ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
                               const int beforeLastElementIdx, const int lastElementIdx){
             int idY = blockIdx.y + offsetY;
             int idZ = blockIdx.z + offsetZ;
@@ -100,7 +108,7 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         code = """
         // helper functions
-        WITHIN_KERNEL void k_reductionPhase_%(nodename)s(float* partialCumSum) {
+        WITHIN_KERNEL void k_reductionPhase(float* partialCumSum) {
             // Traverse down from leaves to root building partial sums at internal nodes in the tree.
             for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
                 local_barrier();
@@ -112,9 +120,9 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         }

-        WITHIN_KERNEL void k_fetchData_%(nodename)s(float* partialCumSum, float* input, int globalThreadID,
-                                                    ssize_t dataStrides_x, ssize_t dataStrides_y, ssize_t dataStrides_z,
-                                                    int offsetY, int offsetZ) {
+        WITHIN_KERNEL void k_fetchData(float* partialCumSum, float* input, int globalThreadID,
+                                       ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
+                                       int offsetY, int offsetZ) {
             // blockIdx.y and blockIdx.z represents the current independent cumsum
             int idY = blockIdx.y + offsetY;
             int idZ = blockIdx.z + offsetZ;
             int offset = idY * dataStrides_y + idZ * dataStrides_z;
@@ -125,7 +133,7 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         }

-        WITHIN_KERNEL void k_reversePhase_%(nodename)s(float* partialCumSum) {
+        WITHIN_KERNEL void k_reversePhase(float* partialCumSum) {
             // Traverse back up the tree building the scan from the partial sums
             for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
                 local_barrier();
@@ -137,9 +145,9 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         }

-        WITHIN_KERNEL void k_pushData_%(nodename)s(float* partialCumSum, float* output, int globalThreadID,
-                                                   ssize_t dataStrides_x, ssize_t dataStrides_y, ssize_t dataStrides_z,
-                                                   int offsetY, int offsetZ) {
+        WITHIN_KERNEL void k_pushData(float* partialCumSum, float* output, int globalThreadID,
+                                      ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
+                                      int offsetY, int offsetZ) {
             local_barrier();
             // blockIdx.y and blockIdx.z represents the current independent cumsum
             int idY = blockIdx.y + offsetY;
@@ -152,10 +160,10 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         }

         KERNEL void k_blockCumSum(float* input, float* output,
-                                  size_t nbElementsPerCumsum, ssize_t inputStrides_x,
-                                  ssize_t inputStrides_y, ssize_t inputStrides_z,
-                                  ssize_t outputStrides_x, ssize_t outputStrides_y,
-                                  ssize_t outputStrides_z, int offsetY,
+                                  size_t nbElementsPerCumsum, ga_ssize inputStrides_x,
+                                  ga_ssize inputStrides_y, ga_ssize inputStrides_z,
+                                  ga_ssize outputStrides_x, ga_ssize outputStrides_y,
+                                  ga_ssize outputStrides_z, int offsetY,
                                   int offsetZ, float* blockSum) {
             // Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis.
             // The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case.
@@ -170,16 +178,16 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
             extern __shared__ float partialCumSum[];

             // Load data in shared memory
-            k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);
+            k_fetchData(partialCumSum, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);

             // Use a dichotomy approach to compute the cumsum (i.e. balanced binary tree).
             // The tree is sweeped from the leaves to the root and from the root to the leaves.
             // Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
-            k_reductionPhase_%(nodename)s(partialCumSum);
-            k_reversePhase_%(nodename)s(partialCumSum);
+            k_reductionPhase(partialCumSum);
+            k_reversePhase(partialCumSum);

             // Write the final output to global memory
-            k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_x, , offsetY, offsetZ);
+            k_pushData(partialCumSum, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);

             if (blockSum != NULL){
                 if (threadIdx.x == blockDim.x - 1) {
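As the comments say, the k_reductionPhase/k_reversePhase pair implements the classic work-efficient scan from the linked ScanTalk slides: an up-sweep builds partial sums at the internal nodes of a balanced binary tree, and a down-sweep propagates them back toward the leaves. A rough Python rendering of the two phases, assuming for simplicity a power-of-two length (the kernel itself works on an even-sized prefix and patches the remainder separately):

    def inclusive_scan(a):
        """Work-efficient inclusive scan; len(a) assumed a power of two."""
        out = list(a)
        n = len(out)
        # Up-sweep (reduction phase): build partial sums at internal tree nodes.
        stride = 1
        while stride < n:
            for i in range(2 * stride - 1, n, 2 * stride):
                out[i] += out[i - stride]
            stride *= 2
        # Down-sweep (reverse phase): push the partial sums back to the leaves.
        stride = n // 2
        while stride >= 1:
            for i in range(2 * stride - 1, n - stride, 2 * stride):
                out[i + stride] += out[i]
            stride //= 2
        return out

    assert inclusive_scan([1, 2, 3, 4]) == [1, 3, 6, 10]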
@@ -195,19 +203,19 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         k_var = "k_finalCumSum_" + nodename
         code = """
         KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum,
-                                  ssize_t dataStrides_x, ssize_t dataStrides_y, ssize_t dataStrides_z,
+                                  ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                                   int offsetY, int offsetZ) {
-            int globalThreadID = (blockIdx_x + 1) * blockDim_x + threadIdx_x;
+            int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;

             // Check if current has data to process.
             if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
                 return;
             }

-            int idY = blockIdx_y + offsetY;
-            int idZ = blockIdx_z + offsetZ;
+            int idY = blockIdx.y + offsetY;
+            int idZ = blockIdx.z + offsetZ;

-            const float currentBlockSum = blockSum[blockIdx_x*(gridDim_y*gridDim_z) + idY*gridDim.z + idZ];
+            const float currentBlockSum = blockSum[blockIdx.x*(gridDim.y*gridDim.z) + idY*gridDim.z + idZ];

             int offset = idY * dataStrides_y + idZ * dataStrides_z;
             int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
@@ -224,15 +232,17 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         return kernels

-    def c_code(self, node, name, inp, out, sub):
+    def c_code(self, node, nodename, inp, out, sub):
+        if node.inputs[0].type.context.kind != 'cuda':
+            raise NotImplementedError("cuda only")
         x, = inp
         z, = out
         axis = self.axis if self.axis is not None else 0
         fail = sub['fail']
+        ctx = sub['params']

         code = """
             const size_t* shape = PyGpuArray_DIMS(%(x)s);
             bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);
@@ -242,7 +252,7 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
                 axis += PyGpuArray_NDIM(%(x)s);
             }

-            if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(type)s, GA_C_ORDER, %(ctx)s) == 0){
+            if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s), %(x)s->ga.typecode, GA_C_ORDER, %(ctx)s) != 0){
                 %(fail)s;
             }
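Two fixes land in this one line: the output's typecode now comes from the input (`%(x)s->ga.typecode`) rather than an undefined `%(type)s` substitution, and the error test is inverted, since `theano_prep_output`, like the other C helpers here, returns 0 on success. A minimal Python rendering of the corrected control flow (hypothetical helper names, just to make the polarity explicit):

    def prep_output_or_fail(prep_output, fail):
        # A non-zero return code signals failure; the old `== 0` check
        # ran the failure handler on *success* and skipped it on error.
        if prep_output() != 0:
            fail()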
@@ -274,103 +284,161 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
         return code

-    def c_support_code_apply(self, node, nodename):
-        code = """int cumSum_%(nodename)s(float* input, float* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
-            size_t shape[3] = { 1, 1, 1 };
-            ssize_t inputStrides_x;
-            ssize_t inputStrides_y;
-            ssize_t inputStrides_z;
-            ssize_t outputStrides_x;
-            ssize_t outputStrides_y;
-            ssize_t outputStrides_z;
-
-            switch (PYArray_NDIM(input))
-            {
-            case 1:
-                shape[0] = PyArray_DIMS(input)[0];
-                inputStrides_x = PyGpuArray_STRIDES(input)[0];
-                outputStrides_x = PyGpuArray_STRIDES(output)[0];
-                break;
-            case 2:
-                shape[0] = PyArray_DIMS(input)[0];
-                shape[1] = PyArray_DIMS(input)[1];
-                inputStrides_x = PyGpuArray_STRIDES(input)[0];
-                inputStrides_y = PyGpuArray_STRIDES(input)[1];
-                outputStrides_x = PyGpuArray_STRIDES(output)[0];
-                outputStrides_y = PyGpuArray_STRIDES(output)[1];
-                break;
-            case 3:
-                shape[0] = PyArray_DIMS(input)[0];
-                shape[1] = PyArray_DIMS(input)[1];
-                shape[2] = PyArray_DIMS(input)[2];
-                inputStrides_x = PyGpuArray_STRIDES(input)[0];
-                inputStrides_y = PyGpuArray_STRIDES(input)[1];
-                inputStrides_z = PyGpuArray_STRIDES(input)[2];
-                outputStrides_x = PyGpuArray_STRIDES(output)[0];
-                outputStrides_y = PyGpuArray_STRIDES(output)[1];
-                outputStrides_z = PyGpuArray_STRIDES(output)[2];
-                break;
-            default:
-                return -1;
-            }
-
-            if (shape[axis] <= 1) {
-                output = pygpu_copy(input, GA_ANY_ORDER);
-                return 0;
-            }
-
-            // Perform cumsum on array of even size.
-            size_t nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);
-
-            // Determine how many elements can be processed in one block.
-            size_t dimBlockX = ceil( min(nbElementsPerCumsum, 2*maxThreads) / 2.0);
-
-            // Determine how many blocks are needed in total.
-            size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX));  // Nb. of blocks needed per cumsum.
-            size_t dimGridY;  // Nb. of independent cumsums (width).
-            size_t dimGridZ;  // Nb. of independent cumsums (height).
-            ssize_t tmp;
-            switch (axis)
-            {
-            case 0:
-                dimGridY = shape[1];
-                dimGridZ = shape[2];
-                break;
-            case 1:
-                dimGridY = shape[0];
-                dimGridZ = shape[2];
-                tmp = inputStrides_x;
-                inputStrides_x = inputStrides_y;
-                inputStrides_y = tmp;
-                tmp = outputStrides_x;
-                outputStrides_x = outputStrides_y;
-                outputStrides_y = tmp;
-                break;
-            case 2:
-                dimGridY = shape[1];
-                dimGridZ = shape[0];
-                tmp = inputStrides_x;
-                inputStrides_x = inputStrides_z;
-                inputStrides_z = tmp;
-                tmp = outputStrides_x;
-                outputStrides_x = outputStrides_z;
-                outputStrides_z = tmp;
-                break;
-            default:
-                return -1;
-            }
-
-            const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
-            PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->typecode,
-                                                           GA_C_ORDER, input->context->ctx, Py_None);
-            if (deviceBlockSum == NULL){
-                return -1;
-            }
-
-            // Perform `maxGridY`*`maxGridZ` cumsums in parallel.
-            for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
-                size_t localDimGridY = min(dimGridY - offsetY, maxGridY);
-                for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
-                    size_t localDimGridZ = min(dimGridZ - offsetZ, maxGridZ);
+    def c_support_code_struct(self, node, nodename):
+        code = """
+        int cumSum_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
+            size_t shape[3] = { 1, 1, 1 };
+            ssize_t inputStrides_x;
+            ssize_t inputStrides_y;
+            ssize_t inputStrides_z;
+            ssize_t outputStrides_x;
+            ssize_t outputStrides_y;
+            ssize_t outputStrides_z;
+
+            switch (PyGpuArray_NDIM(input))
+            {
+            case 1:
+                shape[0] = PyGpuArray_DIMS(input)[0];
+                inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
+                outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
+                break;
+            case 2:
+                shape[0] = PyGpuArray_DIMS(input)[0];
+                shape[1] = PyGpuArray_DIMS(input)[1];
+                inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
+                inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
+                outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
+                outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
+                break;
+            case 3:
+                shape[0] = PyGpuArray_DIMS(input)[0];
+                shape[1] = PyGpuArray_DIMS(input)[1];
+                shape[2] = PyGpuArray_DIMS(input)[2];
+                inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
+                inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
+                inputStrides_z = PyGpuArray_STRIDES(input)[2] / sizeof(float);
+                outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
+                outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
+                outputStrides_z = PyGpuArray_STRIDES(output)[2] / sizeof(float);
+                break;
+            default:
+                PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
+                return -1;
+            }
+
+            if (shape[axis] <= 1) {
+                int err = pygpu_move(output, input);
+                return err;
+            }
+
+            // Perform cumsum on array of even size.
+            size_t nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);
+
+            // Determine how many elements can be processed in one block.
+            size_t dimBlockX = ceil((nbElementsPerCumsum > 2*maxThreads ? 2*maxThreads : nbElementsPerCumsum) / 2.0);
+
+            // Determine how many blocks are needed in total.
+            size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX));  // Nb. of blocks needed per cumsum.
+            size_t dimGridY;  // Nb. of independent cumsums (width).
+            size_t dimGridZ;  // Nb. of independent cumsums (height).
+            ssize_t tmp;
+            switch (axis)
+            {
+            case 0:
+                dimGridY = shape[1];
+                dimGridZ = shape[2];
+                break;
+            case 1:
+                dimGridY = shape[0];
+                dimGridZ = shape[2];
+                tmp = inputStrides_x;
+                inputStrides_x = inputStrides_y;
+                inputStrides_y = tmp;
+                tmp = outputStrides_x;
+                outputStrides_x = outputStrides_y;
+                outputStrides_y = tmp;
+                break;
+            case 2:
+                dimGridY = shape[1];
+                dimGridZ = shape[0];
+                tmp = inputStrides_x;
+                inputStrides_x = inputStrides_z;
+                inputStrides_z = tmp;
+                tmp = outputStrides_x;
+                outputStrides_x = outputStrides_z;
+                outputStrides_z = tmp;
+                break;
+            default:
+                PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
+                return -1;
+            }
+
+            const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
+            PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->ga.typecode,
+                                                           GA_C_ORDER, input->context, Py_None);
+            if (deviceBlockSum == NULL){
+                return -1;
+            }
+
+            // Perform `maxGridY`*`maxGridZ` cumsums in parallel.
+            for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
+                size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY);
+                for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
+                    size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ);
+                    size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
+                    size_t dimBlock[3] = {dimBlockX, 1, 1};  // One cumsum per block.
+                    size_t sharedBytes = (2*dimBlockX) * sizeof(float);
+
+                    void* kernel_params[] = {(void*) input->ga.data,
+                                             (void*) output->ga.data,
+                                             (void*) &nbElementsPerCumsum,
+                                             (void*) &inputStrides_x,
+                                             (void*) &inputStrides_y,
+                                             (void*) &inputStrides_z,
+                                             (void*) &outputStrides_x,
+                                             (void*) &outputStrides_y,
+                                             (void*) &outputStrides_z,
+                                             (void*) &offsetY,
+                                             (void*) &offsetZ,
+                                             (void*) deviceBlockSum->ga.data};
+                    int err = GpuKernel_call(&k_blockCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                    if (err != GA_NO_ERROR){
+                        PyErr_SetString(PyExc_RuntimeError, "blockCumSum call failed");
+                        return -1;
+                    }
+
+                    if (dimGridX > 1) {
+                        // Do a cumsum over the blockSum (recursive).
+                        if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
+                            Py_DECREF(deviceBlockSum);
+                            return -1;
+                        }
+
+                        // Since there are more than one block (i.e. `dimGridX > 1`)
+                        // report partial cumsums of previous blocks to subsequents ones.
+                        size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
+                        size_t dimBlock[3] = {dimBlockX, 1, 1};
+                        void* kernel_params[] = {(void*) output->ga.data,
+                                                 (void*) deviceBlockSum->ga.data,
+                                                 (void*) &nbElementsPerCumsum,
+                                                 (void*) &outputStrides_x,
+                                                 (void*) &outputStrides_y,
+                                                 (void*) &outputStrides_z,
+                                                 (void*) &offsetY,
+                                                 (void*) &offsetZ};
+                        int err = GpuKernel_call(&k_finalCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                        if (err != GA_NO_ERROR){
+                            PyErr_SetString(PyExc_RuntimeError, "finalCumSum call failed");
+                            return -1;
+                        }
+                    }
+
+                    // If shape[axis] is odd, the last element is computed manually.
+                    if (shape[axis] != nbElementsPerCumsum){
+                        size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
+                        size_t dimBlock[3] = {1, 1, 1};
+                        size_t tmp0 = shape[axis]-2;
+                        size_t tmp1 = shape[axis]-1;
+                        void* kernel_params[] = {(void*) input->ga.data,
+                                                 (void*) output->ga.data,
+                                                 (void*) &nbElementsPerCumsum,
+                                                 (void*) &inputStrides_x,
+                                                 (void*) &inputStrides_y,
+                                                 (void*) &inputStrides_z,
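One detail that is easy to miss in the rewrite: `PyGpuArray_STRIDES` reports strides in bytes, and the added `/ sizeof(float)` converts them to element strides, which is what the kernels' `idx * dataStrides_x`-style indexing of a `float*` expects. The same conversion in NumPy terms:

    import numpy as np

    a = np.zeros((4, 5), dtype=np.float32)
    byte_strides = a.strides                                   # (20, 4) for this C-contiguous array
    elem_strides = tuple(s // a.itemsize for s in a.strides)   # (5, 1)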
@@ -379,61 +447,43 @@ class GpuCumsum(CumsumOp, GpuKernelBase):
                                                 (void*) &outputStrides_x,
                                                 (void*) &outputStrides_y,
                                                 (void*) &outputStrides_z,
                                                 (void*) &offsetY,
                                                 (void*) &offsetZ,
-                                                (void*) deviceBlockSum->ga.data;
-                                                };
-                    int err = GpuKernel_call(k_blockCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
-
-                    if (dimGridX > 1) {
-                        // Do a cumsum over the blockSum (recursive).
-                        if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
-                            Py_DECREF(deviceBlockSum);
-                            return -1;
-                        }
-
-                        // Since there are more than one block (i.e. `dimGridX > 1`)
-                        // report partial cumsums of previous blocks to subsequents ones.
-                        size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
-                        size_t dimBlock[3] = {dimBlockX, 1, 1};
-                        void* kernel_params[] = {(void*) output->ga.data,
-                                                 (void*) deviceBlockSum->ga.data,
-                                                 (void*) &nbElementsPerCumsum,
-                                                 (void*) &outputStrides_x,
-                                                 (void*) &outputStrides_y,
-                                                 (void*) &outputStrides_z,
-                                                 (void*) &offsetY,
-                                                 (void*) &offsetZ};
-                        int err = GpuKernel_call(k_finalCumSum_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
-                    }
-
-                    // If shape[axis] is odd, the last element is compute manually
-                    if (shape[axis] != nbElementsPerCumsum){
-                        size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
-                        size_t dimBlock[3] = {1, 1, 1};
-                        void* kernel_params[] = {(void*) input->ga.data,
-                                                 (void*) output->ga.data,
-                                                 (void*) &inputStrides_x,
-                                                 (void*) &inputStrides_y,
-                                                 (void*) &inputStrides_z,
-                                                 (void*) &outputStrides_x,
-                                                 (void*) &outputStrides_y,
-                                                 (void*) &outputStrides_z,
-                                                 (void*) &offsetY,
-                                                 (void*) &offsetZ,
-                                                 (void*) &(shape[axis]-2),
-                                                 (void*) &(shape[axis]-1)};
-                        int err = GpuKernel_call(k_cumadd_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
-                    }
-                }
-            }
-            Py_XDECREF(deviceBlockSum);
-            return 0;
-        }
-        """
-        return "\n".join(super(GpuKernelBase, self).c_support_code_apply(node, name), code)
+                                                (void*) &(tmp0),
+                                                (void*) &(tmp1)};
+                        int err = GpuKernel_call(&k_cumadd_%(nodename)s, 3, dimBlock, dimGrid, sharedBytes, kernel_params);
+                        if (err != GA_NO_ERROR){
+                            PyErr_SetString(PyExc_RuntimeError, "cumadd call failed");
+                            return -1;
+                        }
+                    }
+                }
+            }
+            Py_XDECREF(deviceBlockSum);
+            return 0;
+        }
+        """ % locals()
+        return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code
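Stepping back, `cumSum_%(nodename)s` realizes a three-stage strategy: `k_blockCumSum` scans every block independently while writing each block's total into `deviceBlockSum`; those totals are themselves cumsummed by a recursive call to the same routine; and `k_finalCumSum` adds each block's carry into all following blocks, with `k_cumadd` patching in the last element when the length is odd. A hypothetical pure-Python mirror of that control flow (helper names are mine, not from this diff):

    def sequential_scan(xs):
        total, out = 0, []
        for v in xs:
            total += v
            out.append(total)
        return out

    def blockwise_cumsum(data, block_size):
        if len(data) <= 1:                      # mirrors the shape[axis] <= 1 early-out
            return list(data)
        n_even = len(data) - (len(data) % 2)    # scan an even-sized prefix
        blocks = [data[i:i + block_size] for i in range(0, n_even, block_size)]
        out, block_sums = [], []
        for block in blocks:                    # k_blockCumSum: one scan per block
            scanned = sequential_scan(block)
            out.extend(scanned)
            block_sums.append(scanned[-1])      # the per-block total ("blockSum")
        if len(blocks) > 1:
            # Recursive cumsum over the block totals, then k_finalCumSum:
            # add block b-1's running total into every element of block b.
            carry = blockwise_cumsum(block_sums, block_size)
            for b in range(1, len(blocks)):
                for i in range(b * block_size, min((b + 1) * block_size, n_even)):
                    out[i] += carry[b - 1]
        if n_even != len(data):                 # k_cumadd: odd final element
            out.append(out[-1] + data[-1])
        return out

    assert blockwise_cumsum([1, 2, 3, 4, 5], 2) == [1, 3, 6, 10, 15]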
 @op_lifter([CumsumOp])
 def use_gpu_cumsumop(node, ctx_name):
-    return GpuCumsum(node.op.axis)
+    if node.inputs[0].dtype == 'float32':
+        axis = node.op.axis
+        x = node.inputs[0]
+        if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
+            return None
+        if axis is None and x.ndim > 1:
+            x = x.flatten()
+        x = GpuFromHost(ctx_name)(x)
+        # ``gpu_cumsum`` assumes the array has been flattened if needed.
+        if axis is None:
+            axis = 0
+        return GpuCumsum(axis)(x)

 register_gpu_opt()(use_gpu_cumsumop)
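Because the lifter is registered as a graph optimization, user code never constructs `GpuCumsum` directly; compiling an ordinary cumsum graph under a GPU mode is enough. A hedged usage sketch (the exact mode and device depend on the local configuration, as in the test file below):

    import numpy as np
    import theano
    import theano.tensor as T
    from theano.tensor.extra_ops import cumsum

    # With a CUDA context configured, the optimizer above replaces the
    # CumsumOp node with GpuCumsum during compilation (float32 only).
    x = T.fmatrix('x')
    f = theano.function([x], cumsum(x, axis=0))
    print(f(np.arange(6, dtype='float32').reshape(2, 3)))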
theano/sandbox/gpuarray/tests/test_extra_ops.py
@@ -12,7 +12,7 @@ import theano.tensor.tests.test_extra_ops
 from theano.tensor.extra_ops import cumsum, CumsumOp
 from theano.tests import unittest_tools as utt

-from .config import mode_with_gpu, test_ctx_name, test_ctx
+from .config import mode_with_gpu, test_ctx_name
 from ..extra_ops import GpuCumsum
 from ..type import get_context
@@ -22,10 +22,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
     def setUp(self):
         super(TestGpuCumsum, self).setUp()
-        if get_context(test_ctx_name).kind != 'cuda':
+        test_ctx = get_context(test_ctx_name)
+        if test_ctx.kind != 'cuda':
             raise SkipTest("Cuda specific tests")
         self.max_threads_dim0 = test_ctx.maxlsize0
-        self.max_grid_size1 = test_ctx.maxgsize1
+        self.max_grid_size1 = test_ctx.maxgsize2