Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
d87cf8b4
提交
d87cf8b4
authored
4月 20, 2016
作者:
Kelvin Xu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
pep8
上级
99cffe57
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
39 行增加
和
52 行删除
+39
-52
extra_ops.py
theano/sandbox/cuda/extra_ops.py
+1
-1
extra_ops.py
theano/sandbox/gpuarray/extra_ops.py
+23
-35
test_extra_ops.py
theano/sandbox/gpuarray/tests/test_extra_ops.py
+15
-16
没有找到文件。
theano/sandbox/cuda/extra_ops.py
浏览文件 @
d87cf8b4
...
@@ -43,7 +43,7 @@ class GpuCumsum(CumsumOp, GpuOp):
...
@@ -43,7 +43,7 @@ class GpuCumsum(CumsumOp, GpuOp):
if
x
.
ndim
>
GpuCumsum
.
SUPPORTED_NDIMS
:
if
x
.
ndim
>
GpuCumsum
.
SUPPORTED_NDIMS
:
raise
NotImplementedError
(
'Only cumsum on 1D, 2D and 3D array are supported right now!'
)
raise
NotImplementedError
(
'Only cumsum on 1D, 2D and 3D array are supported right now!'
)
print
(
self
.
axis
)
if
self
.
axis
>=
x
.
ndim
or
self
.
axis
<
-
x
.
ndim
:
if
self
.
axis
>=
x
.
ndim
or
self
.
axis
<
-
x
.
ndim
:
raise
ValueError
(
'axis(={1}) out of bounds'
.
format
(
self
.
axis
))
raise
ValueError
(
'axis(={1}) out of bounds'
.
format
(
self
.
axis
))
...
...
theano/sandbox/gpuarray/extra_ops.py
浏览文件 @
d87cf8b4
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
import
theano
import
numpy
import
os
import
os
from
theano
import
Op
,
Apply
,
config
from
theano
import
Apply
from
theano.tensor.extra_ops
import
CumsumOp
from
theano.tensor.extra_ops
import
CumsumOp
try
:
try
:
import
pygpu
from
pygpu
import
gpuarray
from
pygpu
import
gpuarray
except
ImportError
:
except
ImportError
:
pass
pass
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
,
infer_context_name
,
GpuFromHost
,
HideC
)
infer_context_name
,
GpuFromHost
)
from
.opt
import
register_opt
as
register_gpu_opt
,
op_lifter
from
.opt
import
register_opt
as
register_gpu_opt
,
op_lifter
from
.type
import
GpuArrayType
class
GpuCumsum
(
GpuKernelBase
,
HideC
,
CumsumOp
):
class
GpuCumsum
(
GpuKernelBase
):
"""
"""
Parameters
Parameters
----------
----------
...
@@ -34,7 +30,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
...
@@ -34,7 +30,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
return
"
%
s{
%
s}"
%
(
self
.
__class__
.
__name__
,
self
.
axis
)
return
"
%
s{
%
s}"
%
(
self
.
__class__
.
__name__
,
self
.
axis
)
def
c_code_cache_version_apply
(
self
,
node
):
def
c_code_cache_version_apply
(
self
,
node
):
return
None
return
(
1
,)
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
,
'<gpuarray_helper.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
,
'<gpuarray_helper.h>'
]
...
@@ -43,9 +39,9 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
...
@@ -43,9 +39,9 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
return
[
os
.
path
.
dirname
(
__file__
)]
return
[
os
.
path
.
dirname
(
__file__
)]
def
get_params
(
self
,
node
):
def
get_params
(
self
,
node
):
return
node
.
inputs
[
0
]
.
type
.
context
return
node
.
inputs
[
0
]
.
type
.
context
def
make_node
(
self
,
x
):
def
make_node
(
self
,
x
):
assert
x
.
type
.
dtype
==
'float32'
,
"Only float32 supported for GpuCumSum"
assert
x
.
type
.
dtype
==
'float32'
,
"Only float32 supported for GpuCumSum"
x
=
as_gpuarray_variable
(
x
,
infer_context_name
(
x
))
x
=
as_gpuarray_variable
(
x
,
infer_context_name
(
x
))
...
@@ -57,17 +53,10 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
...
@@ -57,17 +53,10 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
raise
ValueError
(
'axis(={0}) out of bounds'
.
format
(
self
.
axis
))
raise
ValueError
(
'axis(={0}) out of bounds'
.
format
(
self
.
axis
))
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
# copied from neighbour.py
def
perform
(
self
,
node
,
inp
,
out
,
ctx
):
# Disable the perform method from the CPU version
Op
.
perform
(
self
,
node
,
inp
,
out
,
ctx
)
def
gpu_kernels
(
self
,
node
,
nodename
):
def
gpu_kernels
(
self
,
node
,
nodename
):
kernels
=
[]
kernels
=
[]
# cumadd
# cumadd
kname
=
"k_cumadd"
kname
=
"k_cumadd"
k_var
=
"k_cumadd_"
+
nodename
k_var
=
"k_cumadd_"
+
nodename
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_x
=
node
.
inputs
[
0
]
.
dtype
flags
=
Kernel
.
get_flags
(
dtype_x
)
flags
=
Kernel
.
get_flags
(
dtype_x
)
...
@@ -77,7 +66,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
...
@@ -77,7 +66,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
ga_ssize inputStrides_y,
ga_ssize inputStrides_y,
ga_ssize inputStrides_z,
ga_ssize inputStrides_z,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_x, ga_ssize outputStrides_y,
ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
ga_ssize outputStrides_z, const int offsetY, const int offsetZ,
const int beforeLastElementIdx, const int lastElementIdx){
const int beforeLastElementIdx, const int lastElementIdx){
int idY = blockIdx.y + offsetY;
int idY = blockIdx.y + offsetY;
int idZ = blockIdx.z + offsetZ;
int idZ = blockIdx.z + offsetZ;
...
@@ -90,22 +79,22 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
...
@@ -90,22 +79,22 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
output[idx_last_output] = input[idx_last_input] + output[idx_beforelast];
output[idx_last_output] = input[idx_last_input] + output[idx_beforelast];
}
}
"""
%
locals
()
"""
%
locals
()
params
=
[
gpuarray
.
GpuArray
,
gpuarray
.
GpuArray
,
gpuarray
.
SSIZE
,
params
=
[
gpuarray
.
GpuArray
,
gpuarray
.
GpuArray
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
'intc'
,
]
]
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
flags
=
flags
,
objvar
=
k_var
))
# blockCumSum
# blockCumSum
kname
=
"k_blockCumSum"
kname
=
"k_blockCumSum"
k_var
=
"k_blockCumSum_"
+
nodename
k_var
=
"k_blockCumSum_"
+
nodename
params
=
[
gpuarray
.
GpuArray
,
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
,
params
=
[
gpuarray
.
GpuArray
,
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
'int32'
,
'int32'
,
gpuarray
.
GpuArray
,]
'int32'
,
'int32'
,
gpuarray
.
GpuArray
,
]
code
=
"""
code
=
"""
// helper functions
// helper functions
WITHIN_KERNEL
WITHIN_KERNEL
void k_reductionPhase(float* partialCumSum) {
void k_reductionPhase(float* partialCumSum) {
...
@@ -199,10 +188,10 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
...
@@ -199,10 +188,10 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
flags
=
flags
,
objvar
=
k_var
))
# k_finalCumSum
# k_finalCumSum
kname
=
"k_finalCumSum"
kname
=
"k_finalCumSum"
k_var
=
"k_finalCumSum_"
+
nodename
k_var
=
"k_finalCumSum_"
+
nodename
code
=
"""
code
=
"""
KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum,
KERNEL void k_finalCumSum(float* output, float* blockSum, size_t nbElementsPerCumsum,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
int offsetY, int offsetZ) {
int offsetY, int offsetZ) {
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
...
@@ -226,12 +215,11 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
...
@@ -226,12 +215,11 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
"""
"""
params
=
[
gpuarray
.
GpuArray
,
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
,
params
=
[
gpuarray
.
GpuArray
,
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
gpuarray
.
SSIZE
,
'int32'
,
'int32'
,]
'int32'
,
'int32'
,
]
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
flags
=
flags
,
objvar
=
k_var
))
return
kernels
return
kernels
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
'cuda'
:
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
'cuda'
:
raise
NotImplementedError
(
"cuda only"
)
raise
NotImplementedError
(
"cuda only"
)
...
@@ -257,9 +245,9 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
...
@@ -257,9 +245,9 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
}
}
{ // Namespace for kernel calls //
{ // Namespace for kernel calls //
size_t max_threads_dim0;
size_t max_threads_dim0;
size_t max_grid_size1;
size_t max_grid_size1;
size_t max_grid_size2;
size_t max_grid_size2;
int err;
int err;
err =
%(ctx)
s->ops->property(
%(ctx)
s->ctx, NULL, NULL, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
err =
%(ctx)
s->ops->property(
%(ctx)
s->ctx, NULL, NULL, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
if (err != GA_NO_ERROR){
if (err != GA_NO_ERROR){
...
@@ -331,7 +319,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
...
@@ -331,7 +319,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
}
}
// Perform cumsum on array of even size.
// Perform cumsum on array of even size.
size_t nbElementsPerCumsum = shape[axis] - (shape[axis]
%% 2
);
size_t nbElementsPerCumsum = shape[axis] - (shape[axis]
%% 2
);
// Determine how many elements can be processed in one block.
// Determine how many elements can be processed in one block.
size_t dimBlockX = ceil((nbElementsPerCumsum > 2*maxThreads ? 2*maxThreads : nbElementsPerCumsum) / 2.0);
size_t dimBlockX = ceil((nbElementsPerCumsum > 2*maxThreads ? 2*maxThreads : nbElementsPerCumsum) / 2.0);
// Determine how many blocks are needed in total.
// Determine how many blocks are needed in total.
size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX)); // Nb. of blocks needed per cumsum.
size_t dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX)); // Nb. of blocks needed per cumsum.
...
@@ -389,7 +377,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
...
@@ -389,7 +377,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
size_t sharedBytes = (2*dimBlockX) * sizeof(float);
size_t sharedBytes = (2*dimBlockX) * sizeof(float);
void* kernel_params[] = {(void*) input->ga.data,
void* kernel_params[] = {(void*) input->ga.data,
(void*) output->ga.data,
(void*) output->ga.data,
(void*) &nbElementsPerCumsum,
(void*) &nbElementsPerCumsum,
(void*) &inputStrides_x,
(void*) &inputStrides_x,
(void*) &inputStrides_y,
(void*) &inputStrides_y,
(void*) &inputStrides_z,
(void*) &inputStrides_z,
...
@@ -417,7 +405,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
...
@@ -417,7 +405,7 @@ class GpuCumsum(GpuKernelBase, HideC, CumsumOp):
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
size_t dimBlock[3] = {dimBlockX, 1, 1};
size_t dimBlock[3] = {dimBlockX, 1, 1};
void* kernel_params[] = {(void*) output->ga.data,
void* kernel_params[] = {(void*) output->ga.data,
(void*) deviceBlockSum->ga.data,
(void*) deviceBlockSum->ga.data,
(void*) &nbElementsPerCumsum,
(void*) &nbElementsPerCumsum,
(void*) &outputStrides_x,
(void*) &outputStrides_x,
(void*) &outputStrides_y,
(void*) &outputStrides_y,
...
...
theano/sandbox/gpuarray/tests/test_extra_ops.py
浏览文件 @
d87cf8b4
...
@@ -10,6 +10,7 @@ import theano
...
@@ -10,6 +10,7 @@ import theano
import
theano.tensor.tests.test_extra_ops
import
theano.tensor.tests.test_extra_ops
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tensor.extra_ops
import
cumsum
,
CumsumOp
from
theano.tests.unittest_tools
import
SkipTest
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
from
.config
import
mode_with_gpu
,
test_ctx_name
from
.config
import
mode_with_gpu
,
test_ctx_name
...
@@ -28,8 +29,6 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -28,8 +29,6 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
self
.
max_threads_dim0
=
test_ctx
.
maxlsize0
self
.
max_threads_dim0
=
test_ctx
.
maxlsize0
self
.
max_grid_size1
=
test_ctx
.
maxgsize2
self
.
max_grid_size1
=
test_ctx
.
maxgsize2
def
test_Strides1D
(
self
):
def
test_Strides1D
(
self
):
x
=
T
.
fvector
(
'x'
)
x
=
T
.
fvector
(
'x'
)
...
@@ -113,16 +112,16 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -113,16 +112,16 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt
.
assert_allclose
(
np
.
cumsum
(
a
[:
i
]),
f
(
a
[:
i
]))
utt
.
assert_allclose
(
np
.
cumsum
(
a
[:
i
]),
f
(
a
[:
i
]))
# Use multiple GPU threadblocks
# Use multiple GPU threadblocks
a
=
np
.
random
.
random
((
block_max_size
+
2
,
))
.
astype
(
"float32"
)
a
=
np
.
random
.
random
((
block_max_size
+
2
,
))
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
# Use recursive cumsum
# Use recursive cumsum
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
a
=
np
.
ones
((
block_max_size
*
(
block_max_size
+
1
)
+
2
,),
dtype
=
"float32"
)
dtype
=
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
),
f
(
a
))
def
test_GpuCumsum2D
(
self
):
def
test_GpuCumsum2D
(
self
):
block_max_size
=
self
.
max_threads_dim0
*
2
block_max_size
=
self
.
max_threads_dim0
*
2
x
=
T
.
fmatrix
(
'x'
)
x
=
T
.
fmatrix
(
'x'
)
for
shape_axis
,
axis
in
zip
([
0
,
1
,
0
,
1
,
0
],
[
0
,
1
,
None
,
-
1
,
-
2
]):
for
shape_axis
,
axis
in
zip
([
0
,
1
,
0
,
1
,
0
],
[
0
,
1
,
None
,
-
1
,
-
2
]):
...
@@ -143,21 +142,21 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -143,21 +142,21 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks
# Use multiple GPU threadblocks
a_shape
=
[
5
,
5
]
a_shape
=
[
5
,
5
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks
# Use multiple GPU gridblocks
a_shape
=
[
4
,
4
]
a_shape
=
[
4
,
4
]
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a_shape
[
1
-
shape_axis
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
),
rtol
=
5e-5
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
),
rtol
=
5e-5
)
# Use recursive cumsum
# Use recursive cumsum
a_shape
=
[
3
,
3
]
a_shape
=
[
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum3D
(
self
):
def
test_GpuCumsum3D
(
self
):
...
@@ -182,32 +181,32 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
...
@@ -182,32 +181,32 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis)
# Use multiple GPU threadblocks (along accumulation axis)
a_shape
=
[
2
,
2
,
2
]
a_shape
=
[
2
,
2
,
2
]
a_shape
[
shape_axis
]
=
block_max_size
+
2
a_shape
[
shape_axis
]
=
block_max_size
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use multiple GPU gridblocks (not along accumulation axis)
# Use multiple GPU gridblocks (not along accumulation axis)
a_shape
=
[
5
,
5
,
5
]
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
1
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
if
axis
is
None
:
# Avoid floating point error
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
a_shape
=
[
5
,
5
,
5
]
a_shape
=
[
5
,
5
,
5
]
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a_shape
[(
shape_axis
+
2
)
%
3
]
=
self
.
max_grid_size1
+
1
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
if
axis
is
None
:
if
axis
is
None
:
# Avoid floating point error
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
# Use recursive cumsum (along accumulation axis)
# Use recursive cumsum (along accumulation axis)
a_shape
=
[
3
,
3
,
3
]
a_shape
=
[
3
,
3
,
3
]
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a_shape
[
shape_axis
]
=
block_max_size
*
(
block_max_size
+
1
)
+
2
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
random
.
random
(
a_shape
)
.
astype
(
"float32"
)
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
a
=
np
.
sign
(
a
-
0.5
)
.
astype
(
"float32"
)
# Avoid floating point error
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
utt
.
assert_allclose
(
np
.
cumsum
(
a
,
axis
=
axis
),
f
(
a
))
def
test_GpuCumsum4D
(
self
):
def
test_GpuCumsum4D
(
self
):
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论