Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
b3ce3640
提交
b3ce3640
authored
3月 11, 2022
作者:
Maxim Kochurov
提交者:
Brandon T. Willard
4月 07, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Remove tests.gpuarray
上级
c803c67e
隐藏空白字符变更
内嵌
并排
正在显示
33 个修改的文件
包含
0 行增加
和
10502 行删除
+0
-10502
__init__.py
tests/gpuarray/__init__.py
+0
-0
tstgpueye.c
tests/gpuarray/c_code/tstgpueye.c
+0
-51
check_dnn_conv.py
tests/gpuarray/check_dnn_conv.py
+0
-1307
config.py
tests/gpuarray/config.py
+0
-43
rnn_support.py
tests/gpuarray/rnn_support.py
+0
-288
run_dnn_conv.py
tests/gpuarray/run_dnn_conv.py
+0
-261
test_abstractconv.py
tests/gpuarray/test_abstractconv.py
+0
-405
test_basic_ops.py
tests/gpuarray/test_basic_ops.py
+0
-660
test_blas.py
tests/gpuarray/test_blas.py
+0
-282
test_blocksparse.py
tests/gpuarray/test_blocksparse.py
+0
-72
test_cgpukernelbase.py
tests/gpuarray/test_cgpukernelbase.py
+0
-77
test_ctc.py
tests/gpuarray/test_ctc.py
+0
-180
test_dnn.py
tests/gpuarray/test_dnn.py
+0
-0
test_elemwise.py
tests/gpuarray/test_elemwise.py
+0
-513
test_extra_ops.py
tests/gpuarray/test_extra_ops.py
+0
-269
test_fft.py
tests/gpuarray/test_fft.py
+0
-268
test_gemmcorr.py
tests/gpuarray/test_gemmcorr.py
+0
-363
test_gemmcorr3d.py
tests/gpuarray/test_gemmcorr3d.py
+0
-295
test_linalg.py
tests/gpuarray/test_linalg.py
+0
-685
test_misc.py
tests/gpuarray/test_misc.py
+0
-19
test_multinomial.py
tests/gpuarray/test_multinomial.py
+0
-373
test_neighbours.py
tests/gpuarray/test_neighbours.py
+0
-9
test_nnet.py
tests/gpuarray/test_nnet.py
+0
-315
test_opt.py
tests/gpuarray/test_opt.py
+0
-1551
test_others.py
tests/gpuarray/test_others.py
+0
-49
test_pickle.py
tests/gpuarray/test_pickle.py
+0
-73
test_pool.py
tests/gpuarray/test_pool.py
+0
-328
test_reduction.py
tests/gpuarray/test_reduction.py
+0
-215
test_rng_mrg.py
tests/gpuarray/test_rng_mrg.py
+0
-194
test_scan.py
tests/gpuarray/test_scan.py
+0
-727
test_sort.py
tests/gpuarray/test_sort.py
+0
-9
test_subtensor.py
tests/gpuarray/test_subtensor.py
+0
-438
test_type.py
tests/gpuarray/test_type.py
+0
-183
没有找到文件。
tests/gpuarray/__init__.py
deleted
100644 → 0
浏览文件 @
c803c67e
tests/gpuarray/c_code/tstgpueye.c
deleted
100644 → 0
浏览文件 @
c803c67e
#section kernels

#kernel eye : *, size, size, size :

#include <cluda.h>

/* The eye name will be used to generate supporting objects.  The only one
   you probably need to care about is the kernel object, which will be named
   'k_' + <the name above> (k_eye in this case).  This name also has to match
   the kernel function name below. */
KERNEL void eye(GLOBAL_MEM DTYPE_OUTPUT_0 *a, ga_size a_off,
                ga_size n, ga_size m) {
  /* Apply the byte offset to reach the start of the output buffer. */
  a = (GLOBAL_MEM DTYPE_OUTPUT_0 *)(((GLOBAL_MEM char *)a) + a_off);
  /* Number of diagonal elements: min(n, m). */
  ga_size nb = n < m ? n : m;
  /* Each work item writes a strided subset of the diagonal. */
  for (ga_size i = LID_0; i < nb; i += LDIM_0) {
    a[i * m + i] = 1;
  }
}
#section support_code_struct

/* Allocate a zeroed (n, m) GPU array in *z and launch the `eye` kernel to
 * set its main diagonal to 1.
 *
 * n, m   : 0-d NumPy arrays holding the requested output dimensions.
 * z      : output PyGpuArrayObject (any previous value is released).
 * params : op params providing the output typecode and GPU context.
 *
 * Returns 0 on success, -1 (with a Python exception set) on failure.
 *
 * Fixes vs. previous revision: the error format string printed "n%lu"
 * instead of "n=%lu", and an unused local `void *args[3];` was removed. */
int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,
                              PyGpuArrayObject **z, PARAMS_TYPE *params) {
  size_t dims[2] = {0, 0};
  size_t ls, gs;
  int err;

  /* Scalar inputs: read the single element of each 0-d array. */
  dims[0] = ((DTYPE_INPUT_0 *)PyArray_DATA(n))[0];
  dims[1] = ((DTYPE_INPUT_1 *)PyArray_DATA(m))[0];

  /* Release any previously-held output before allocating a fresh one. */
  Py_XDECREF(*z);
  *z = pygpu_zeros(2, dims, params->typecode, GA_C_ORDER, params->context,
                   Py_None);
  if (*z == NULL)
    return -1;

  ls = 1;
  gs = 256;
  /* The eye_call name comes from the kernel declaration above. */
  err = eye_call(1, &gs, &ls, 0, (*z)->ga.data, (*z)->ga.offset,
                 dims[0], dims[1]);
  if (err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "gpuarray error: kEye: %s. n=%lu, m=%lu.",
                 GpuKernel_error(&k_eye, err),
                 (unsigned long)dims[0], (unsigned long)dims[1]);
    return -1;
  }
  return 0;
}
tests/gpuarray/check_dnn_conv.py
deleted
100644 → 0
浏览文件 @
c803c67e
#!/usr/bin/env python
# Without args, this script executes all its tests like `pytest -vs`
# python check_dnn_conv.py
# If there is only one arg `infos`, this script prints some infos about
# supported algorithms and data type configurations for current GPU and cuDNN version.
# python check_dnn_conv.py infos
# If there is only one arg `list`, this script prints all test cases without running them.
# python check_dnn_conv.py list
# Else, any arg will be directly passed to pytest.
# python check_dnn_conv.py -xvs # verbose mode, capture output, exit at first error.
import
math
import
sys
from
itertools
import
chain
,
product
import
numpy
as
np
import
pytest
from
aesarat.tensor.type
import
TensorType
import
aesara
import
tests.unittest_tools
as
utt
from
aesara.configdefaults
import
SUPPORTED_DNN_CONV_ALGO_RUNTIME
from
aesara.gpuarray
import
cudnn_defs
from
aesara.gpuarray.dnn
import
GpuDnnConv
,
GpuDnnConvGradI
,
GpuDnnConvGradW
from
aesara.gpuarray.dnn
import
_dnn_conv
as
dnn_conv
from
aesara.gpuarray.dnn
import
_dnn_gradinput
as
dnn_gradinput
from
aesara.gpuarray.dnn
import
_dnn_gradweight
as
dnn_gradweight
from
aesara.gpuarray.dnn
import
version
from
aesara.tensor.nnet.abstract_conv
import
assert_conv_shape
,
get_conv_output_shape
from
aesara.tensor.nnet.corr
import
CorrMM
,
CorrMM_gradInputs
,
CorrMM_gradWeights
from
aesara.tensor.nnet.corr3d
import
Corr3dMM
,
Corr3dMMGradInputs
,
Corr3dMMGradWeights
from
tests.gpuarray.config
import
mode_with_gpu
,
ref_cast
def check_dtype_config_support(dtype, precision):
    """Return whether the current GPU/cuDNN setup supports (dtype, precision).

    We use FWD 2D to check it.  Based on documentation, algo small
    (CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) should support all
    configurations, for both v5.1, v6 and v7.

    Returns ``True`` when a tiny forward convolution compiles and runs,
    ``False`` when the GPU architecture rejects the configuration.
    """
    inputs = aesara.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
    filters = aesara.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
    conv = dnn_conv(inputs, filters, precision=precision, algo="small")
    f = aesara.function([], conv, mode=mode_with_gpu)
    try:
        f()
    except RuntimeError as e:
        # The only acceptable failure is an architecture mismatch;
        # anything else is a genuine bug and should surface.
        assert "CUDNN_STATUS_ARCH_MISMATCH" in str(e)
        return False
    return True
# Load the algorithm/dtype definitions matching the installed cuDNN version
# (version(raises=False) does not raise when cuDNN is unavailable).
cudnn = cudnn_defs.get_definitions(version(raises=False))
class ConvCase:
    """Helper class to describe a special test case quickly.

    Bundles every parameter of one cuDNN convolution test (shapes, algorithm,
    dtype/precision, geometry, alpha/beta) together with the direction of the
    computation (forward, gradient w.r.t. inputs, or gradient w.r.t. weights).
    This handles only 2D and 3D cases.
    """

    # Computation-direction tags.
    FWD, GRADINPUT, GRADWEIGHT = 0, 1, 2

    def __init__(
        self,
        type,
        inputs_shape,
        filters_shape,
        algo=None,
        dtype=None,
        precision=None,
        subsample=None,
        dilation=None,
        border_mode="valid",
        conv_mode="conv",
        alpha=1,
        beta=0,
        should_fail=False,
    ):
        assert type in (ConvCase.FWD, ConvCase.GRADINPUT, ConvCase.GRADWEIGHT)
        # Both shapes must share a rank of 4 (2D conv) or 5 (3D conv).
        assert len(inputs_shape) == len(filters_shape) in (4, 5)
        ndim = len(inputs_shape) - 2
        if dtype is None:
            dtype = aesara.config.floatX
        if precision is None:
            precision = aesara.config.floatX
        if subsample is None:
            subsample = (1,) * ndim
        if dilation is None:
            dilation = (1,) * ndim
        assert dtype in ("float16", "float32", "float64")
        assert precision in ("float16", "float32", "float64")
        assert len(subsample) == len(dilation) == ndim
        assert border_mode in ("valid", "full", "half") or (
            isinstance(border_mode, (list, tuple)) and len(border_mode) == ndim
        )
        assert conv_mode in ("conv", "cross")
        assert alpha != 0
        self.type = type
        self.ndim = ndim
        self.algo = algo
        self.inputs_shape = inputs_shape
        self.filters_shape = filters_shape
        self.dtype = dtype
        self.precision = precision
        self.subsample = subsample
        self.dilation = dilation
        self.border_mode = border_mode
        self.conv_mode = conv_mode
        self.alpha = alpha
        self.beta = beta
        self.should_fail = bool(should_fail)

    def is_fwd(self):
        """True if this case tests the forward convolution."""
        return self.type == ConvCase.FWD

    def is_bwd_filter(self):
        """True if this case tests the gradient w.r.t. the filters."""
        return self.type == ConvCase.GRADWEIGHT

    def is_bwd_data(self):
        """True if this case tests the gradient w.r.t. the inputs."""
        return self.type == ConvCase.GRADINPUT

    def get_case(self):
        """Return ``(algo, dtype, precision, parameters)`` as consumed by the runners."""
        return (
            self.algo,
            self.dtype,
            self.precision,
            (
                self.inputs_shape,
                self.filters_shape,
                self.subsample,
                self.dilation,
                self.border_mode,
                self.conv_mode,
                self.alpha,
                self.beta,
            ),
        )

    @staticmethod
    def fwd(*args, **kwargs):
        """Shortcut to build a forward-convolution case."""
        return ConvCase(ConvCase.FWD, *args, **kwargs)

    @staticmethod
    def bwd_filter(*args, **kwargs):
        """Shortcut to build a gradient-w.r.t.-weights case."""
        return ConvCase(ConvCase.GRADWEIGHT, *args, **kwargs)

    @staticmethod
    def bwd_data(*args, **kwargs):
        """Shortcut to build a gradient-w.r.t.-inputs case."""
        return ConvCase(ConvCase.GRADINPUT, *args, **kwargs)
class ConvCaseGenerator:
    """Main class used to generate test cases.

    This handles only 2D and 3D cases.

    Fix vs. previous revision: ``get_cases`` ended with
    ``return filter(local_filter, product(...))`` — but its own parameter is
    named ``filter`` and shadows the builtin, so the *argument* (``None`` by
    default) was being called, raising ``TypeError``.  A generator expression
    over ``local_filter`` is used instead; the public interface is unchanged.
    """

    def _as_tuple_of_tuples(self, iterable):
        # Normalize a list/tuple of sequences into a tuple of tuples.
        return tuple(tuple(sequence) for sequence in iterable)

    def __init__(
        self,
        ndim,
        alpha=2,
        beta=-3,
        batch_size=2,
        input_channels=3,
        inputs_sizes=None,
        output_channels=2,
        filters_sizes=None,
        subsamples=None,
        dilations=None,
        borders=None,
        with_border_valid=True,
        with_border_half=True,
        with_border_full=True,
    ):
        self.ndim = int(ndim)
        self.alpha = float(alpha)
        self.beta = float(beta)
        self.batch_size = int(batch_size)
        self.input_channels = int(input_channels)
        self.output_channels = int(output_channels)
        assert self.ndim in (2, 3)
        assert self.alpha != 0
        assert self.batch_size > 0
        assert self.input_channels > 0
        assert self.output_channels > 0
        # NB: it is quite arbitrary to choose default values for inputs sizes
        # and filters sizes.  Here, we just put some values that may generate
        # errors in some cases, but that should be OK for other cases.
        # For instance, input size 300 is > 256, that is a limit for certain
        # algorithms (cf. documentation).  Filter size 40 is > 32 and > 16,
        # that are limits for certain algorithms (cf. documentation).
        # We should either manually specify sizes, or give an appropriate
        # filter to this generator before testing values (see `self.get_cases()`).
        if inputs_sizes is None:
            inputs_sizes = ((5,) * self.ndim, (300, 5) + (2,) * (self.ndim - 2))
        if filters_sizes is None:
            filters_sizes = ((4,) * self.ndim, (40, 4) + (2,) * (self.ndim - 2))
        if borders is None:
            borders = ((1,) * self.ndim, tuple(range(1, self.ndim + 1)))
        if subsamples is None:
            subsamples = ((1,) * self.ndim, tuple(range(1, self.ndim + 1)))
        if dilations is None:
            dilations = ((1,) * self.ndim,)
            if cudnn.version >= 6:
                dilations += (tuple(range(1, self.ndim + 1)),)
        for sequence_list in (
            inputs_sizes,
            filters_sizes,
            borders,
            subsamples,
            dilations,
        ):
            # Every provided sequence must have exactly `ndim` entries.
            assert isinstance(sequence_list, (tuple, list)) and all(
                isinstance(sequence, (tuple, list)) and len(sequence) == self.ndim
                for sequence in sequence_list
            ), (self.ndim, sequence_list)
        self.auto_borders = tuple()
        if with_border_valid:
            self.auto_borders += ("valid",)
        if with_border_half:
            self.auto_borders += ("half",)
        if with_border_full:
            self.auto_borders += ("full",)
        self.inputs_sizes = self._as_tuple_of_tuples(inputs_sizes)
        self.filters_sizes = self._as_tuple_of_tuples(filters_sizes)
        self.borders = self._as_tuple_of_tuples(borders)
        self.subsamples = self._as_tuple_of_tuples(subsamples)
        self.dilations = self._as_tuple_of_tuples(dilations)

    @staticmethod
    def get_if_valid_conv_output_shape(case_tuple):
        # Filter function to keep only cases that produce valid convolution
        # output shapes.
        out_shp = get_conv_output_shape(
            case_tuple[0],  # input shape
            case_tuple[1],  # filter shape
            case_tuple[4],  # border mode
            case_tuple[2],  # subsample
            case_tuple[3],  # dilation
        )
        try:
            return assert_conv_shape(out_shp)
        except ValueError:
            return False

    def get_cases(self, filter=None):
        # Generate an iterator of tuples with format:
        # (input shape, filter shape, subsample, dilation, border mode,
        #  convolution mode, alpha, beta)
        # `filter` may be a callable that gets one tuple (with format
        # specified above) and returns a boolean, so that tuple is kept only
        # if filter(tuple) is True.
        all_batch_sizes = (self.batch_size,)
        all_input_channels = (self.input_channels,)
        all_input_sizes = self.inputs_sizes
        all_output_channels = (self.output_channels,)
        all_filter_sizes = self.filters_sizes
        all_subsamples = self.subsamples
        all_dilations = self.dilations
        all_border_modes = self.auto_borders + self.borders
        all_conv_modes = ("conv", "cross")
        all_alphas = (self.alpha,)
        all_betas = (0,) if self.beta == 0 else (0, self.beta)
        all_input_shapes = (
            (bs, ic) + ins
            for bs in all_batch_sizes
            for ic in all_input_channels
            for ins in all_input_sizes
        )
        all_filter_shapes = (
            (oc, ic) + fis
            for oc in all_output_channels
            for ic in all_input_channels
            for fis in all_filter_sizes
        )
        if callable(filter):

            def local_filter(case_tuple):
                return ConvCaseGenerator.get_if_valid_conv_output_shape(
                    case_tuple
                ) and filter(case_tuple)

        else:
            local_filter = ConvCaseGenerator.get_if_valid_conv_output_shape
        # NOTE: the parameter `filter` shadows the builtin, so we must not
        # write `filter(local_filter, ...)` here.
        return (
            case
            for case in product(
                all_input_shapes,
                all_filter_shapes,
                all_subsamples,
                all_dilations,
                all_border_modes,
                all_conv_modes,
                all_alphas,
                all_betas,
            )
            if local_filter(case)
        )
class ConvCaseGeneratorChain:
    """Helper class to concatenate many conv case generators."""

    def __init__(self, *conv_case_generators):
        # Only real ConvCaseGenerator objects may be chained.
        assert all(
            isinstance(gen, ConvCaseGenerator) for gen in conv_case_generators
        )
        self.generators = conv_case_generators

    def get_cases(self, filter=None):
        """Yield the cases of every wrapped generator, in order."""
        return chain(*[gen.get_cases(filter) for gen in self.generators])
class CuDNNV51ConvCaseGenerator:
    """Helper class to generate specific test cases for every algorithm
    supported by cuDNN V5.1.

    Same class exists for cuDNN V6.0 (see below).  This should help avoid
    test cases that are intended to fail according to cuDNN documentation.
    """

    NONE = "none"
    FFT = "fft"
    FFT_TILING = "fft_tiling"
    WINOGRAD = "winograd"
    WINOGRAD_NON_FUSED = "winograd_non_fused"

    # Protected interface.

    def _dilations(self, ndim):
        # Unit dilations only for this cuDNN version.
        return [(1,) * ndim]

    def _fwd_fft(self, ndim):
        input_dims = [(10,) * ndim, (240, 5) + (2,) * (ndim - 2)]
        filter_dims = [tuple(range(9, 9 - ndim, -1))]
        strides = [(1,) * ndim]
        return ConvCaseGenerator(
            ndim=ndim,
            inputs_sizes=input_dims,
            filters_sizes=filter_dims,
            subsamples=strides,
            dilations=self._dilations(ndim),
        )

    def _fwd_fft_tiling(self, ndim, dtype, precision):
        if ndim == 2:
            filter_dims = [(32, 5)]
        if ndim == 3:
            filter_dims = [(16, 5, 5)]
        strides = [(1,) * ndim]
        return ConvCaseGenerator(
            ndim=ndim,
            filters_sizes=filter_dims,
            subsamples=strides,
            dilations=self._dilations(ndim),
        )

    def _fwd_winograd(self, ndim):
        filter_dims = [(3,) * ndim]
        strides = [(1,) * ndim]
        return ConvCaseGenerator(
            ndim=ndim,
            filters_sizes=filter_dims,
            subsamples=strides,
            dilations=self._dilations(ndim),
        )

    def _fwd_winograd_non_fused(self, ndim, dtype, precision):
        filter_dims = [(3,) * ndim]
        if not (dtype == precision == "float16"):
            filter_dims += [(5,) * ndim]
        strides = [(1,) * ndim]
        return ConvCaseGenerator(
            ndim=ndim,
            filters_sizes=filter_dims,
            subsamples=strides,
            dilations=self._dilations(ndim),
        )

    def _gw_fft(self, ndim):
        return self._fwd_fft(ndim)

    def _gw_winograd_non_fused(self, ndim, dtype, precision):
        return self._fwd_winograd_non_fused(ndim, dtype, precision)

    def _gi_fft(self, ndim):
        return self._fwd_fft(ndim)

    def _gi_fft_tiling(self, ndim, dtype, precision):
        return self._fwd_fft_tiling(ndim, dtype, precision)

    def _gi_winograd(self, ndim):
        return self._fwd_winograd(ndim)

    def _gi_winograd_non_fused(self, ndim, dtype, precision):
        return self._fwd_winograd_non_fused(ndim, dtype, precision)

    def _fwd_runtime(self, ndim, dtype, precision):
        return ConvCaseGenerator(ndim=ndim, dilations=self._dilations(ndim))

    def _gw_runtime(self, ndim, dtype, precision):
        return self._fwd_runtime(ndim, dtype, precision)

    def _gi_runtime(self, ndim, dtype, precision):
        return self._fwd_runtime(ndim, dtype, precision)

    # Public interface.

    def fwd(self, algo, ndim, dtype, precision):
        """Return a case generator for forward convolution with `algo`."""
        if algo == self.FFT:
            return self._fwd_fft(ndim)
        if algo == self.FFT_TILING:
            return self._fwd_fft_tiling(ndim, dtype, precision)
        if algo == self.WINOGRAD:
            return self._fwd_winograd(ndim)
        if algo == self.WINOGRAD_NON_FUSED:
            return self._fwd_winograd_non_fused(ndim, dtype, precision)
        if algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            return self._fwd_runtime(ndim, dtype, precision)
        return ConvCaseGenerator(ndim=ndim, dilations=self._dilations(ndim))

    def gw(self, algo, ndim, dtype, precision):
        """Return a case generator for gradient-w.r.t.-weights with `algo`."""
        if algo == self.FFT:
            return self._gw_fft(ndim)
        if algo == self.WINOGRAD_NON_FUSED:
            return self._gw_winograd_non_fused(ndim, dtype, precision)
        if algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            return self._gw_runtime(ndim, dtype, precision)
        return ConvCaseGenerator(ndim=ndim, dilations=self._dilations(ndim))

    def gi(self, algo, ndim, dtype, precision):
        """Return a case generator for gradient-w.r.t.-inputs with `algo`."""
        if algo == self.FFT:
            return self._gi_fft(ndim)
        if algo == self.FFT_TILING:
            return self._gi_fft_tiling(ndim, dtype, precision)
        if algo == self.WINOGRAD:
            return self._gi_winograd(ndim)
        if algo == self.WINOGRAD_NON_FUSED:
            return self._gi_winograd_non_fused(ndim, dtype, precision)
        if algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            return self._gi_runtime(ndim, dtype, precision)
        return ConvCaseGenerator(ndim=ndim, dilations=self._dilations(ndim))
class CuDNNV6ConvCaseGenerator(CuDNNV51ConvCaseGenerator):
    """Case generators for the algorithms as supported by cuDNN V6."""

    def _fwd_none(self, ndim):
        # All dilations allowed.
        return ConvCaseGenerator(ndim=ndim)

    def _fwd_fft_tiling(self, ndim, dtype, precision):
        if ndim == 2:
            strides = [(1, 1)]
            # wDesc's filter height must be greater than convDesc's
            # zero-padding height; wDesc's filter width must be greater
            # than convDesc's zero-padding width.
            sub_generators = []
            if (dtype, precision) != ("float64", "float64"):
                # Filter sizes with every dimension != 1 is not supported
                # for DOUBLE_CONFIG.
                filter_dims = [(32, 5), (10, 10)]
                pads = [(1, 1), (6, 4)]
                sub_generators += [
                    ConvCaseGenerator(
                        ndim=ndim,
                        dilations=self._dilations(ndim),
                        subsamples=strides,
                        filters_sizes=filter_dims,
                        borders=pads,
                    )
                ]
            filter_dims = [(256, 1), (5, 1)]
            pads = [(1, 0), (2, 0)]
            sub_generators += [
                ConvCaseGenerator(
                    ndim=ndim,
                    dilations=self._dilations(ndim),
                    subsamples=strides,
                    filters_sizes=filter_dims,
                    borders=pads,
                )
            ]
            return ConvCaseGeneratorChain(*sub_generators)
        if ndim == 3:
            return super()._fwd_fft_tiling(ndim, dtype, precision)

    def _gw_none(self, ndim):
        return self._fwd_none(ndim)

    def _gw_fft_tiling(self, ndim):
        input_dims = [(247, 1), (20, 1)]
        filter_dims = [(3, 1), (10, 1)]
        strides = [(1,) * ndim]
        pads = [(1, 0), (2, 0)]
        return ConvCaseGenerator(
            ndim=ndim,
            inputs_sizes=input_dims,
            filters_sizes=filter_dims,
            subsamples=strides,
            borders=pads,
            dilations=self._dilations(ndim),
        )

    def _gi_none(self, ndim):
        return self._fwd_none(ndim)

    def _fwd_runtime(self, ndim, dtype, precision):
        if ndim == 2 and dtype == precision == "float16":
            return ConvCaseGenerator(ndim=ndim, dilations=self._dilations(ndim))
        return super()._fwd_runtime(ndim, dtype, precision)

    def _gw_runtime(self, ndim, dtype, precision):
        if ndim == 2 and dtype == precision == "float16":
            return ConvCaseGenerator(ndim=ndim, dilations=self._dilations(ndim))
        return super()._gw_runtime(ndim, dtype, precision)

    def _gi_runtime(self, ndim, dtype, precision):
        if ndim == 2 and dtype == precision == "float16":
            return ConvCaseGenerator(ndim=ndim, dilations=self._dilations(ndim))
        return super()._gi_runtime(ndim, dtype, precision)

    def fwd(self, algo, ndim, dtype, precision):
        """Forward-convolution cases; adds the "none" algorithm over V5.1."""
        if algo == self.NONE:
            return self._fwd_none(ndim)
        return super().fwd(algo, ndim, dtype, precision)

    def gw(self, algo, ndim, dtype, precision):
        """Gradient-w.r.t.-weights cases; adds "none" and fft_tiling."""
        if algo == self.NONE:
            return self._gw_none(ndim)
        if algo == self.FFT_TILING:
            return self._gw_fft_tiling(ndim)
        return super().gw(algo, ndim, dtype, precision)

    def gi(self, algo, ndim, dtype, precision):
        """Gradient-w.r.t.-inputs cases; adds the "none" algorithm."""
        if algo == self.NONE:
            return self._gi_none(ndim)
        return super().gi(algo, ndim, dtype, precision)
# Select the case generator that matches the detected cuDNN version.
cudnn_conv_case_generator = (
    CuDNNV51ConvCaseGenerator() if cudnn.version < 6 else CuDNNV6ConvCaseGenerator()
)
class
BaseTestDnnConv
:
"""
Base class for exhaustive tests. Use its subclasses
to run actual tests.
"""
# Abstract attributes.
ndim
=
2
fwd_algorithms
=
None
bwd_filter_algorithms
=
None
bwd_data_algorithms
=
None
cpu_conv_class
=
None
cpu_gradinput_class
=
None
cpu_gradweight_class
=
None
special_cases
=
[]
# List of special ConvCases.
runtime_shapes
=
(
[]
)
# Tuple of tuples with format: n_times, (inputs_shape, filters_shape)
# Utility methods.
def
_next_ten_exponent
(
self
,
val
):
# Return exponent for the next ten power that follows val.
# val should be a positive integer.
# Examples:
# for 0 to 9, returns 1 (=> 10**1 == 10)
# for 10 to 99, returns 2 (=> 10**2 == 100)
ten_exponent
=
1
while
val
//
10
>
0
:
ten_exponent
+=
1
val
//=
10
return
ten_exponent
def
scale_numpy_arrays_inplace
(
self
,
A
,
B
,
alpha
):
scale_factor
=
1
# Scale down simultaneously A and B if alpha is not 1.
if
alpha
!=
1
:
scale_factor
*=
alpha
# Normalize A and B simultaneously so that any values in these tensors are in interval [0, 1)
max_a
=
math
.
floor
(
abs
(
A
.
max
()))
max_b
=
math
.
floor
(
abs
(
B
.
max
()))
if
max_a
or
max_b
:
m_a
=
self
.
_next_ten_exponent
(
max_a
)
m_b
=
self
.
_next_ten_exponent
(
max_b
)
max_m
=
max
(
m_a
,
m_b
)
scale_factor
*=
10
**
max_m
if
scale_factor
!=
1
:
A
/=
scale_factor
B
/=
scale_factor
def
get_atol_rtol
(
self
,
algo
,
dtype
,
precision
):
if
dtype
==
"float16"
:
# Raise tolerance for float16
return
(
5e-2
,
5e-2
)
if
algo
==
"winograd_non_fused"
and
dtype
==
precision
==
"float32"
:
# Raise tolerance for winograd_non_fused in FLOAT_CONFIG.
return
(
1e-4
,
1e-4
)
return
None
,
None
def
__init__
(
self
):
self
.
dtype_configs
=
cudnn
.
get_supported_dtype_configs
(
check_dtype_config_support
)
def
array_like_conv_output
(
self
,
inputs_shape
,
filters_shape
,
border_mode
,
subsample
,
dilation
,
dtype
):
# Return a random array with inferred convolution output shape.
out_shp
=
get_conv_output_shape
(
inputs_shape
,
filters_shape
,
border_mode
,
subsample
,
dilation
)
out_shp
=
assert_conv_shape
(
out_shp
)
return
np
.
random
.
random
(
out_shp
)
.
astype
(
dtype
)
def
run_conv_fwd
(
self
,
algo
,
dtype
,
precision
,
parameters
):
(
inputs_shape
,
filters_shape
,
subsample
,
dilation
,
border_mode
,
conv_mode
,
alpha
,
beta
,
)
=
parameters
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
dtype
)
filters_val
=
np
.
random
.
random
(
filters_shape
)
.
astype
(
dtype
)
# Scale down the input values to prevent very large absolute errors
# due to float rounding
inputs_val
/=
10
filters_val
/=
10
inputs
=
aesara
.
shared
(
inputs_val
)
filters
=
aesara
.
shared
(
filters_val
)
if
beta
==
0
:
out
=
None
else
:
out
=
self
.
array_like_conv_output
(
inputs_shape
,
filters_shape
,
border_mode
,
subsample
,
dilation
,
dtype
)
out
/=
10
# Compile an Aesara function for the cuDNN implementation
conv
=
dnn_conv
(
img
=
inputs
,
kerns
=
filters
,
alpha
=
alpha
,
beta
=
beta
,
out
=
out
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
algo
=
algo
,
precision
=
precision
,
)
f
=
aesara
.
function
([],
conv
,
mode
=
mode_with_gpu
)
# If conv_mode is 'conv' the reference implementation should use
# filters flipped according to the width, height and time axis
if
conv_mode
==
"conv"
:
if
inputs
.
ndim
==
5
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
,
::
-
1
]
else
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
]
else
:
flipped_filters
=
filters
# Compile an Aesara function for the reference implementation
conv_ref
=
self
.
cpu_conv_class
(
border_mode
=
border_mode
,
subsample
=
subsample
,
filter_dilation
=
dilation
)(
ref_cast
(
inputs
),
flipped_filters
)
f_ref
=
aesara
.
function
([],
conv_ref
,
mode
=
"FAST_RUN"
)
# Compare the results of the two implementations
res_ref
=
f_ref
()
res
=
np
.
asarray
(
f
())
if
algo
in
cudnn
.
deterministic_fwd_algorithms
:
utt
.
assert_allclose
(
res
,
np
.
asarray
(
f
()))
atol
,
rtol
=
self
.
get_atol_rtol
(
algo
,
dtype
,
precision
)
if
beta
==
0
:
cpu_res
=
alpha
*
res_ref
else
:
cpu_res
=
alpha
*
res_ref
+
beta
*
out
self
.
scale_numpy_arrays_inplace
(
cpu_res
,
res
,
alpha
)
utt
.
assert_allclose
(
cpu_res
,
res
,
rtol
=
rtol
,
atol
=
atol
)
def
run_conv_gradinput
(
self
,
algo
,
dtype
,
precision
,
parameters
):
(
inputs_shape
,
filters_shape
,
subsample
,
dilation
,
border_mode
,
conv_mode
,
alpha
,
beta
,
)
=
parameters
if
beta
==
0
:
inputs_val
=
None
else
:
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
dtype
)
inputs_val
/=
10
filters_val
=
np
.
random
.
random
(
filters_shape
)
.
astype
(
dtype
)
topgrad_val
=
self
.
array_like_conv_output
(
inputs_shape
,
filters_shape
,
border_mode
,
subsample
,
dilation
,
dtype
)
# Scale down the input values to prevent absolute errors in utt.assert_allclose.
filters_val
/=
10
topgrad_val
/=
10
filters
=
aesara
.
shared
(
filters_val
)
topgrad
=
aesara
.
shared
(
topgrad_val
)
# Compile an Aesara function for the cuDNN implementation
grad_i
=
dnn_gradinput
(
filters
,
topgrad
,
inputs_shape
,
alpha
=
alpha
,
beta
=
beta
,
out
=
inputs_val
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
algo
=
algo
,
precision
=
precision
,
)
f
=
aesara
.
function
([],
grad_i
,
mode
=
mode_with_gpu
)
# If conv_mode is 'conv' the reference implementation should use
# filters flipped according to the width, height and time axis
if
conv_mode
==
"conv"
:
if
filters
.
ndim
==
5
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
,
::
-
1
]
else
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
]
else
:
flipped_filters
=
filters
# Compile an Aesara function for the reference implementation
grad_i_ref
=
self
.
cpu_gradinput_class
(
border_mode
=
border_mode
,
subsample
=
subsample
,
filter_dilation
=
dilation
)(
ref_cast
(
flipped_filters
),
ref_cast
(
topgrad
),
inputs_shape
[
2
:])
f_ref
=
aesara
.
function
([],
grad_i_ref
,
mode
=
"FAST_RUN"
)
# Compare the results of the two implementations
res_ref
=
f_ref
()
res
=
np
.
asarray
(
f
())
if
algo
in
cudnn
.
deterministic_bwd_data_algorithms
:
utt
.
assert_allclose
(
res
,
np
.
asarray
(
f
()))
atol
,
rtol
=
self
.
get_atol_rtol
(
algo
,
dtype
,
precision
)
if
beta
==
0
:
cpu_res
=
alpha
*
res_ref
else
:
cpu_res
=
alpha
*
res_ref
+
beta
*
inputs_val
self
.
scale_numpy_arrays_inplace
(
cpu_res
,
res
,
alpha
)
utt
.
assert_allclose
(
cpu_res
,
res
,
rtol
=
rtol
,
atol
=
atol
)
def
run_conv_gradweight
(
self
,
algo
,
dtype
,
precision
,
parameters
):
(
inputs_shape
,
filters_shape
,
subsample
,
dilation
,
border_mode
,
conv_mode
,
alpha
,
beta
,
)
=
parameters
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
dtype
)
if
beta
==
0
:
filters_val
=
None
else
:
filters_val
=
np
.
random
.
random
(
filters_shape
)
.
astype
(
dtype
)
filters_val
/=
10
topgrad_val
=
self
.
array_like_conv_output
(
inputs_shape
,
filters_shape
,
border_mode
,
subsample
,
dilation
,
dtype
)
# Scale down the input values to prevent absolute errors in utt.assert_allclose.
inputs_val
/=
10
topgrad_val
/=
10
inputs
=
aesara
.
shared
(
inputs_val
)
topgrad
=
aesara
.
shared
(
topgrad_val
)
# Compile an Aesara function for the cuDNN implementation
grad_w
=
dnn_gradweight
(
inputs
,
topgrad
,
filters_shape
,
alpha
=
alpha
,
beta
=
beta
,
out
=
filters_val
,
border_mode
=
border_mode
,
subsample
=
subsample
,
dilation
=
dilation
,
conv_mode
=
conv_mode
,
algo
=
algo
,
precision
=
precision
,
)
f
=
aesara
.
function
([],
grad_w
,
mode
=
mode_with_gpu
)
# Compile an Aesara function for the reference implementation
grad_w_ref
=
self
.
cpu_gradweight_class
(
border_mode
=
border_mode
,
subsample
=
subsample
,
filter_dilation
=
dilation
)(
ref_cast
(
inputs
),
ref_cast
(
topgrad
),
filters_shape
[
2
:])
if
conv_mode
==
"conv"
:
if
inputs
.
ndim
==
5
:
grad_w_ref
=
grad_w_ref
[:,
:,
::
-
1
,
::
-
1
,
::
-
1
]
else
:
grad_w_ref
=
grad_w_ref
[:,
:,
::
-
1
,
::
-
1
]
f_ref
=
aesara
.
function
([],
grad_w_ref
,
mode
=
"FAST_RUN"
)
# Compare the results of the two implementations
res_ref
=
f_ref
()
res
=
np
.
asarray
(
f
())
if
algo
in
cudnn
.
deterministic_bwd_filter_algorithms
:
utt
.
assert_allclose
(
res
,
np
.
asarray
(
f
()))
atol
,
rtol
=
self
.
get_atol_rtol
(
algo
,
dtype
,
precision
)
if
beta
==
0
:
cpu_res
=
alpha
*
res_ref
else
:
cpu_res
=
alpha
*
res_ref
+
beta
*
filters_val
self
.
scale_numpy_arrays_inplace
(
cpu_res
,
res
,
alpha
)
utt
.
assert_allclose
(
cpu_res
,
res
,
rtol
=
rtol
,
atol
=
atol
)
def
should_fail
(
self
,
function
,
*
args
):
try
:
print
(
"(should fail)"
,
file
=
sys
.
stderr
,
end
=
" "
)
function
(
*
args
)
except
Exception
:
pass
else
:
raise
AssertionError
(
"Should fail"
,
callable
.
__name__
,
*
args
)
def
should_fail_fwd
(
self
,
*
args
):
self
.
should_fail
(
self
.
run_conv_fwd
,
*
args
)
def
should_fail_gradinput
(
self
,
*
args
):
self
.
should_fail
(
self
.
run_conv_gradinput
,
*
args
)
def
should_fail_gradweight
(
self
,
*
args
):
self
.
should_fail
(
self
.
run_conv_gradweight
,
*
args
)
def
get_expected_tcount
(
self
):
"""Utility function to get expected test count without actually running pytest."""
return
(
sum
(
1
for
t
in
self
.
test_fwd
())
+
sum
(
1
for
t
in
self
.
test_gradweight
())
+
sum
(
1
for
t
in
self
.
test_gradinput
())
+
sum
(
1
for
t
in
self
.
test_fwd_runtime_algorithms
())
+
sum
(
1
for
t
in
self
.
test_gradweight_runtime_algorithms
())
+
sum
(
1
for
t
in
self
.
test_gradinput_runtime_algorithms
())
)
# Iterable test methods.
def
test_fwd
(
self
):
for
dtype
,
precision
in
self
.
dtype_configs
:
algos
=
[
algo
for
algo
in
self
.
fwd_algorithms
if
cudnn
.
fwd_algo_supports_dtype_config
(
algo
,
dtype
,
precision
,
self
.
ndim
)
]
for
algo
in
algos
:
for
parameters
in
cudnn_conv_case_generator
.
fwd
(
algo
,
self
.
ndim
,
dtype
,
precision
)
.
get_cases
():
self
.
run_conv_fwd
(
algo
,
dtype
,
precision
,
parameters
)
if
algos
:
# Some algorithms support current data type configuration for current ndim.
# So, an algorithm could be chosen at runtime.
for
algo
in
SUPPORTED_DNN_CONV_ALGO_RUNTIME
:
for
parameters
in
cudnn_conv_case_generator
.
fwd
(
algo
,
self
.
ndim
,
dtype
,
precision
)
.
get_cases
():
self
.
run_conv_fwd
(
algo
,
dtype
,
precision
,
parameters
)
for
dnn_case
in
self
.
special_cases
:
if
dnn_case
.
is_fwd
():
if
dnn_case
.
should_fail
:
self
.
should_fail_fwd
(
dnn_case
.
get_case
())
else
:
self
.
run_conv_fwd
(
dnn_case
.
get_case
())
    def test_gradinput(self):
        """Exercise cuDNN backward-data (gradient w.r.t. inputs) convolution.

        Mirrors :meth:`test_fwd`: explicit bwd-data algorithms per supported
        (dtype, precision) configuration, then runtime algorithm selectors,
        then the subclass's ``special_cases``.
        """
        for dtype, precision in self.dtype_configs:
            # Keep only explicit algorithms valid for this dtype/precision/ndim.
            algos = [
                algo
                for algo in self.bwd_data_algorithms
                if cudnn.bwd_data_algo_supports_dtype_config(
                    algo, dtype, precision, self.ndim
                )
            ]
            for algo in algos:
                for parameters in cudnn_conv_case_generator.gi(
                    algo, self.ndim, dtype, precision
                ).get_cases():
                    self.run_conv_gradinput(algo, dtype, precision, parameters)
            if algos:
                # Some algorithms support current data type configuration for current ndim.
                # So, an algorithm could be chosen at runtime.
                for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
                    for parameters in cudnn_conv_case_generator.gi(
                        algo, self.ndim, dtype, precision
                    ).get_cases():
                        self.run_conv_gradinput(algo, dtype, precision, parameters)
        for dnn_case in self.special_cases:
            if dnn_case.is_bwd_data():
                if dnn_case.should_fail:
                    self.should_fail_gradinput(dnn_case.get_case())
                else:
                    # NOTE(review): single-argument call — see test_fwd for the
                    # same pattern; confirm run_conv_gradinput's signature.
                    self.run_conv_gradinput(dnn_case.get_case())
    def test_gradweight(self):
        """Exercise cuDNN backward-filter (gradient w.r.t. weights) convolution.

        Mirrors :meth:`test_fwd`: explicit bwd-filter algorithms per supported
        (dtype, precision) configuration, then runtime algorithm selectors,
        then the subclass's ``special_cases``.
        """
        for dtype, precision in self.dtype_configs:
            # Keep only explicit algorithms valid for this dtype/precision/ndim.
            algos = [
                algo
                for algo in self.bwd_filter_algorithms
                if cudnn.bwd_filter_algo_supports_dtype_config(
                    algo, dtype, precision, self.ndim
                )
            ]
            for algo in algos:
                for parameters in cudnn_conv_case_generator.gw(
                    algo, self.ndim, dtype, precision
                ).get_cases():
                    self.run_conv_gradweight(algo, dtype, precision, parameters)
            if algos:
                # Some algorithms support current data type configuration for current ndim.
                # So, an algorithm could be chosen at runtime.
                for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
                    for parameters in cudnn_conv_case_generator.gw(
                        algo, self.ndim, dtype, precision
                    ).get_cases():
                        self.run_conv_gradweight(algo, dtype, precision, parameters)
        for dnn_case in self.special_cases:
            if dnn_case.is_bwd_filter():
                if dnn_case.should_fail:
                    self.should_fail_gradweight(dnn_case.get_case())
                else:
                    # NOTE(review): single-argument call — see test_fwd for the
                    # same pattern; confirm run_conv_gradweight's signature.
                    self.run_conv_gradweight(dnn_case.get_case())
    # The 3 following tests are intended to be run with aesara flag `cmodule__debug=True`.
    # The output message should then be analyzed to check if runtime algorithms are
    # reused, reloaded from cache or updated, depending on what we expect from
    # dnn_fwd/dnn_gi/dnn_gw current codes. I currently don't know a better way
    # to efficiently test implemented cuDNN convolution caches.

    def test_fwd_runtime_algorithms(self):
        """Compare GPU forward convolution against the CPU reference for each
        runtime algorithm selector, feeding the compiled function a sequence of
        different runtime shapes to exercise the algorithm cache."""
        dtype = "float32"
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_fwd_runtime_algorithm(algo):
            inputs = TensorType(dtype, _broadcastable)()
            filters = TensorType(dtype, _broadcastable)()
            # Scale down the input values to prevent very large absolute errors
            # due to float rounding
            lower_inputs = inputs / 10
            lower_filters = filters / 10
            conv = dnn_conv(
                img=lower_inputs,
                kerns=lower_filters,
                algo=algo,
                precision=dtype,
                subsample=unit_shape,
                dilation=unit_shape,
            )
            f = aesara.function([inputs, filters], conv, mode=mode_with_gpu)
            # The CPU reference op performs correlation, so flip the filters
            # over every spatial axis to emulate true convolution.
            if self.ndim == 3:
                flipped_filters = lower_filters[:, :, ::-1, ::-1, ::-1]
            else:
                flipped_filters = lower_filters[:, :, ::-1, ::-1]
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(
                ref_cast(lower_inputs), flipped_filters
            )
            f_ref = aesara.function([inputs, filters], conv_ref, mode="FAST_RUN")
            runtime_shapes = self.runtime_shapes
            if algo in ("time_once", "guess_once"):
                # "*_once" selectors pick an algorithm once; a single shape
                # repeated 5 times is enough to exercise them.
                runtime_shapes = [list(runtime_shapes[0])]
                runtime_shapes[0][0] = 5
            for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
                print("Shapes:", inputs_shape, filters_shape)
                for i in range(ntimes):
                    inputs_val = np.random.random(inputs_shape).astype(dtype)
                    filters_val = np.random.random(filters_shape).astype(dtype)
                    gpu_res = np.asarray(f(inputs_val, filters_val))
                    cpu_res = f_ref(inputs_val, filters_val)
                    self.scale_numpy_arrays_inplace(cpu_res, gpu_res, 1)
                    utt.assert_allclose(cpu_res, gpu_res)

        for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            run_fwd_runtime_algorithm(algo)
def
test_gradinput_runtime_algorithms
(
self
):
dtype
=
"float32"
unit_shape
=
(
1
,)
*
self
.
ndim
_broadcastable
=
[
False
]
*
(
2
+
self
.
ndim
)
def
run_gradinput_runtime_algorithm
(
algo
):
aesara
.
config
.
dnn__conv__algo_bwd_data
=
algo
inputs
=
TensorType
(
dtype
,
_broadcastable
)()
filters
=
TensorType
(
dtype
,
_broadcastable
)()
conv
=
dnn_conv
(
img
=
inputs
,
kerns
=
filters
,
algo
=
algo
,
precision
=
dtype
,
subsample
=
unit_shape
,
dilation
=
unit_shape
,
)
grad_i
=
aesara
.
gradient
.
grad
(
conv
.
sum
(),
[
inputs
])
f
=
aesara
.
function
([
inputs
,
filters
],
grad_i
,
mode
=
mode_with_gpu
)
assert
1
==
len
(
[
node
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
if
isinstance
(
node
.
op
,
GpuDnnConvGradI
)
]
)
assert
not
any
(
isinstance
(
node
.
op
,
GpuDnnConv
)
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
)
assert
not
any
(
isinstance
(
node
.
op
,
GpuDnnConvGradW
)
for
node
in
f
.
maker
.
fgraph
.
apply_nodes
)
if
self
.
ndim
==
3
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
,
::
-
1
]
else
:
flipped_filters
=
filters
[:,
:,
::
-
1
,
::
-
1
]
conv_ref
=
self
.
cpu_conv_class
(
subsample
=
unit_shape
)(
ref_cast
(
inputs
),
flipped_filters
)
grad_i_ref
=
aesara
.
gradient
.
grad
(
conv_ref
.
sum
(),
[
inputs
])
f_ref
=
aesara
.
function
([
inputs
,
filters
],
grad_i_ref
,
mode
=
"FAST_RUN"
)
runtime_shapes
=
self
.
runtime_shapes
if
algo
in
(
"time_once"
,
"guess_once"
):
runtime_shapes
=
[
list
(
runtime_shapes
[
0
])]
runtime_shapes
[
0
][
0
]
=
5
for
ntimes
,
(
inputs_shape
,
filters_shape
)
in
runtime_shapes
:
print
(
"Shapes:"
,
inputs_shape
,
filters_shape
)
for
i
in
range
(
ntimes
):
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
dtype
)
filters_val
=
np
.
random
.
random
(
filters_shape
)
.
astype
(
dtype
)
gpu_res
=
f
(
inputs_val
,
filters_val
)
cpu_res
=
f_ref
(
inputs_val
,
filters_val
)
utt
.
assert_allclose
(
cpu_res
,
np
.
asarray
(
gpu_res
))
for
algo
in
SUPPORTED_DNN_CONV_ALGO_RUNTIME
:
run_gradinput_runtime_algorithm
(
algo
)
    def test_gradweight_runtime_algorithms(self):
        """Compare the GPU gradient w.r.t. filters against the CPU reference for
        each runtime algorithm selector, feeding a sequence of runtime shapes
        to exercise the cuDNN algorithm cache."""
        dtype = "float32"
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_gradweight_runtime_algorithm(algo):
            # Scope the flag to this run so it cannot leak into other tests.
            with aesara.config.change_flags(dnn__conv__algo_bwd_filter=algo):
                inputs = TensorType(dtype, _broadcastable)()
                filters = TensorType(dtype, _broadcastable)()
                conv = dnn_conv(
                    img=inputs,
                    kerns=filters,
                    algo=algo,
                    precision=dtype,
                    subsample=unit_shape,
                    dilation=unit_shape,
                )
                grad_w = aesara.gradient.grad(conv.sum(), [filters])
                f = aesara.function([inputs, filters], grad_w, mode=mode_with_gpu)
                # The compiled graph must contain exactly one GradW op and no
                # forward/GradI ops.
                assert 1 == len(
                    [
                        node
                        for node in f.maker.fgraph.apply_nodes
                        if isinstance(node.op, GpuDnnConvGradW)
                    ]
                )
                assert not any(
                    isinstance(node.op, GpuDnnConv)
                    for node in f.maker.fgraph.apply_nodes
                )
                assert not any(
                    isinstance(node.op, GpuDnnConvGradI)
                    for node in f.maker.fgraph.apply_nodes
                )
                # CPU reference op correlates, so flip filters spatially.
                if self.ndim == 3:
                    flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
                else:
                    flipped_filters = filters[:, :, ::-1, ::-1]
                conv_ref = self.cpu_conv_class(subsample=unit_shape)(
                    ref_cast(inputs), flipped_filters
                )
                grad_w_ref = aesara.gradient.grad(conv_ref.sum(), [filters])
                f_ref = aesara.function([inputs, filters], grad_w_ref, mode="FAST_RUN")
                runtime_shapes = self.runtime_shapes
                if algo in ("time_once", "guess_once"):
                    # "*_once" selectors pick an algorithm once; repeat a single
                    # shape 5 times instead of cycling shapes.
                    runtime_shapes = [list(runtime_shapes[0])]
                    runtime_shapes[0][0] = 5
                for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
                    print("Shapes:", inputs_shape, filters_shape)
                    for i in range(ntimes):
                        inputs_val = np.random.random(inputs_shape).astype(dtype)
                        filters_val = np.random.random(filters_shape).astype(dtype)
                        gpu_res = f(inputs_val, filters_val)
                        cpu_res = f_ref(inputs_val, filters_val)
                        utt.assert_allclose(cpu_res, np.asarray(gpu_res))

        for algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
            run_gradweight_runtime_algorithm(algo)
class TestDnnConv2D(BaseTestDnnConv):
    """2D instantiation of the cuDNN convolution test harness.

    Supplies the 2D algorithm lists, the CPU reference ops (CorrMM family),
    a few hand-picked regression cases, and the shape sequence used by the
    runtime-algorithm cache tests.
    """

    ndim = 2

    fwd_algorithms = cudnn.cudnnConvolutionFwdAlgo_t.get_aliases()
    bwd_filter_algorithms = cudnn.cudnnConvolutionBwdFilterAlgo_t.get_aliases()
    bwd_data_algorithms = cudnn.cudnnConvolutionBwdDataAlgo_t.get_aliases()

    cpu_conv_class = CorrMM
    cpu_gradinput_class = CorrMM_gradInputs
    cpu_gradweight_class = CorrMM_gradWeights

    special_cases = [
        # Very tall input: deterministic bwd-filter was broken on old cuDNN.
        ConvCase.bwd_filter(
            algo="deterministic",
            dtype="float32",
            precision="float32",
            inputs_shape=(1, 1, 541211, 10),
            filters_shape=(50, 1, 3, 10),
            border_mode=(1, 0),
            should_fail=(cudnn.version <= 6),
        ),
        # Batch size at the 2**16 boundary.
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65536, 2, 2, 2),
            filters_shape=(1, 2, 2, 2),
        ),
        # NB: Due to current workaround (see dnn_fwd.c), this test won't fail for cuDNN < v6100.
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65537, 2, 2, 2),
            filters_shape=(1, 2, 2, 2),
        ),
    ]

    # (repeat_count, [inputs_shape, filters_shape]) pairs; repeated entries
    # should hit the runtime-algorithm cache.
    runtime_shapes = [
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),  # cache should be used
        (1, [(2, 2, 50, 50), (5, 2, 25, 31)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),  # cache should be used
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),  # cache should be used
        (1, [(1, 2, 3, 4), (6, 2, 2, 1)]),
    ]
class TestDnnConv3D(BaseTestDnnConv):
    """3D instantiation of the cuDNN convolution test harness.

    Supplies the 3D algorithm lists, the CPU reference ops (Corr3dMM family),
    batch-size boundary regression cases, and the shape sequence used by the
    runtime-algorithm cache tests.
    """

    ndim = 3

    fwd_algorithms = cudnn.conv3d_fwd_algorithms
    bwd_filter_algorithms = cudnn.conv3d_bwd_filter_algorithms
    bwd_data_algorithms = cudnn.conv3d_bwd_data_algorithms

    cpu_conv_class = Corr3dMM
    cpu_gradinput_class = Corr3dMMGradInputs
    cpu_gradweight_class = Corr3dMMGradWeights

    special_cases = [
        # Batch size at the 2**16 boundary.
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65536, 2, 2, 2, 2),
            filters_shape=(1, 2, 2, 2, 2),
        ),
        # NB: Due to current workaround (see dnn_fwd.c), this test won't fail for cuDNN < v6100.
        ConvCase.fwd(
            algo="small",
            dtype="float32",
            precision="float32",
            inputs_shape=(65537, 2, 2, 2, 2),
            filters_shape=(1, 2, 2, 2, 2),
        ),
    ]

    # (repeat_count, [inputs_shape, filters_shape]) pairs; repeated entries
    # should hit the runtime-algorithm cache.
    runtime_shapes = [
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),  # cache should be used
        (1, [(2, 2, 50, 50, 5), (5, 2, 25, 31, 4)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),  # cache should be used
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),  # cache should be used
        (1, [(1, 2, 3, 4, 5), (6, 2, 3, 2, 1)]),
    ]
def test_true_half_config_support():
    """Probe test: skip when the GPU lacks TRUE_HALF_CONFIG support."""
    # For cuDNN V5.1 and V6.0:
    # "TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0)"
    supported = check_dtype_config_support("float16", "float16")
    if not supported:
        pytest.skip("FWD: TRUE_HALF_CONFIG not supported on this GPU.")
class CheckDnn:
    """
    Utility functions for scripting and infos printing.
    """

    @staticmethod
    def dtype_config_to_str(dtype_config):
        """Map a ``(dtype, precision)`` pair to its cuDNN configuration name.

        Raises ValueError for any pair other than the four known
        half/pseudo-half/float/double configurations.
        """
        dtype, precision = dtype_config
        if dtype == precision == "float16":
            return "TRUE_HALF_CONFIG"
        if dtype == "float16" and precision == "float32":
            return "PSEUDO_HALF_CONFIG"
        if dtype == precision == "float32":
            return "FLOAT_CONFIG"
        if dtype == precision == "float64":
            return "DOUBLE_CONFIG"
        raise ValueError("unknown data type configuration", dtype_config)

    @staticmethod
    def print_infos(count_tests=True):
        # Print infos about tests and cuDNN supported algorithms and configurations.
        test_2d = TestDnnConv2D()
        test_3d = TestDnnConv3D()
        print()
        print(
            "Available data type configurations:",
            ", ".join(
                CheckDnn.dtype_config_to_str(d)
                for d in cudnn.get_supported_dtype_configs(check_dtype_config_support)
            ),
        )
        print()
        print("2D algorithms:")
        print("FWD :", ", ".join(test_2d.fwd_algorithms))
        print("BWD FILTER :", ", ".join(test_2d.bwd_filter_algorithms))
        print("BWD DATA :", ", ".join(test_2d.bwd_data_algorithms))
        print()
        print("3D algorithms:")
        print("FWD :", ", ".join(test_3d.fwd_algorithms))
        print("BWD FILTER :", ", ".join(test_3d.bwd_filter_algorithms))
        print("BWD DATA :", ", ".join(test_3d.bwd_data_algorithms))
        print()
        if count_tests:
            count_tests_2d = test_2d.get_expected_tcount()
            count_tests_3d = test_3d.get_expected_tcount()
            print(count_tests_2d, "conv2D test cases.")
            print(count_tests_3d, "conv3D test cases.")
            print("1 supplementary test.")
            print(count_tests_2d + count_tests_3d + 1, "total conv tests.")
            print()

    @staticmethod
    def print_tests():
        # Print test cases without running them.
        # NOTE(review): this iterates the test methods as generators yielding
        # (callable, *args) tuples; as written above those methods appear to
        # run cases directly and return None — confirm before relying on this
        # scripting utility.
        for test in (TestDnnConv2D(), TestDnnConv3D()):
            for tcase in test.test_fwd():
                print(tcase[0].__name__, *tcase[1:])
            for tcase in test.test_gradinput():
                print(tcase[0].__name__, *tcase[1:])
            for tcase in test.test_gradweight():
                print(tcase[0].__name__, *tcase[1:])
            for tcase in test.test_fwd_runtime_algorithms():
                print(tcase[0].__name__, *tcase[1:])
            for tcase in test.test_gradinput_runtime_algorithms():
                print(tcase[0].__name__, *tcase[1:])
            for tcase in test.test_gradweight_runtime_algorithms():
                print(tcase[0].__name__, *tcase[1:])
        print(test_true_half_config_support.__name__)
tests/gpuarray/config.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
pytest
import
aesara.gpuarray
import
aesara.tensor
# Module-level GPU setup shared by the tests/gpuarray suite.
# Skip the whole module when pygpu is missing or could not be activated.
if aesara.gpuarray.pygpu is None:
    pytest.skip("pygpu not installed", allow_module_level=True)

init_error = None
# Try to initialize a CUDA device unless one is already active or the user
# forced a device via configuration.
if not aesara.gpuarray.pygpu_activated and not aesara.config.force_device:
    try:
        aesara.gpuarray.init_dev("cuda")
    except Exception as e:
        # Remember the failure so the skip message explains why.
        init_error = e

if not aesara.gpuarray.pygpu_activated:
    if init_error:
        pytest.skip(str(init_error), allow_module_level=True)
    else:
        pytest.skip("pygpu disabled", allow_module_level=True)

test_ctx_name = None

# Build paired compilation modes: one routed through the gpuarray optimizer,
# one explicitly excluding it (used as the CPU reference).
if aesara.config.mode == "FAST_COMPILE":
    mode_with_gpu = (
        aesara.compile.mode.get_mode("FAST_RUN").including("gpuarray").excluding("gpu")
    )
    mode_without_gpu = aesara.compile.mode.get_mode("FAST_RUN").excluding("gpuarray")
else:
    mode_with_gpu = (
        aesara.compile.mode.get_default_mode().including("gpuarray").excluding("gpu")
    )
    mode_without_gpu = aesara.compile.mode.get_default_mode().excluding("gpuarray")

mode_without_gpu.check_py_code = False
# If using float16, cast reference input to float32
def ref_cast(x):
    """Return *x* unchanged unless its dtype is float16, in which case
    return a float32 cast of it (CPU reference graphs run in float32)."""
    if x.type.dtype != "float16":
        return x
    return aesara.tensor.cast(x, "float32")
tests/gpuarray/rnn_support.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
aesara
from
aesara.tensor.math
import
dot
,
sigmoid
,
tanh
class Model:
    """Container collecting layers, their parameters, and extra updates."""

    def __init__(self, name=""):
        self.name = name
        self.layers = []
        self.params = []
        self.other_updates = {}

    def add_layer(self, layer):
        """Register *layer*, absorbing its params and any `other_updates`."""
        self.layers.append(layer)
        self.params.extend(layer.params)
        if hasattr(layer, "other_updates"):
            for pair in layer.other_updates:
                self.other_updates[pair[0]] = pair[1]

    def get_params(self):
        """Return the accumulated parameter list."""
        return self.params
def uniform(stdev, size):
    """uniform distribution with the given stdev and size"""
    # A uniform on [-b, b] with b = stdev * sqrt(3) has standard deviation `stdev`.
    bound = stdev * np.sqrt(3)
    samples = np.random.uniform(low=-bound, high=bound, size=size)
    return samples.astype(aesara.config.floatX)
def linear_transform_weights(input_dim, output_dim, param_list=None, name=""):
    """aesara shared variable given input and output dimension

    Builds an ``(input_dim, output_dim)`` weight matrix initialized from a
    uniform distribution with stdev ``sqrt(2 / input_dim)``, wraps it in an
    aesara shared variable, registers it in *param_list*, and returns it.

    Raises
    ------
    ValueError
        If *param_list* is None: a list is required to collect the parameter.
        (The original used ``assert``, which is stripped under ``python -O``.)
    """
    if param_list is None:
        raise ValueError("param_list is required to collect the new weight")
    weight_initialization = uniform(np.sqrt(2.0 / input_dim), (input_dim, output_dim))
    W = aesara.shared(weight_initialization, name=name)
    param_list.append(W)
    return W
def bias_weights(length, param_list=None, name=""):
    """aesara shared variable for bias unit, given length"""
    # Biases start at zero in the configured float dtype.
    zeros = np.zeros(length).astype(aesara.config.floatX)
    shared_bias = aesara.shared(zeros, name=name)
    if param_list is not None:
        param_list.append(shared_bias)
    return shared_bias
class Layer:
    """Generic Layer Template which all layers should inherit"""

    def __init__(self, name=""):
        # Human-readable identifier plus the trainable-parameter list.
        self.name = name
        self.params = []

    def get_params(self):
        """Return this layer's parameter list."""
        return self.params
class GRU(Layer):
    # Gated recurrent unit built with aesara.scan over the wrapped input layer.

    def __init__(self, input_dim, output_dim, input_layer, s0=None, name=""):
        """Layers information"""
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0  # initial hidden state passed to scan's outputs_info
        self.params = []

        """Layers weights"""

        """self.params is passed so that any parameters could be appended to it"""
        # Input-to-hidden weights (W_*) with their biases (b_w*).
        self.W_r = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_r"
        )
        self.b_wr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wr")

        self.W_i = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_i"
        )
        self.b_wi = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wi")

        self.W_h = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_h"
        )
        self.b_wh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wh")

        # Hidden-to-hidden (recurrent) weights (R_*) with their biases (b_r*).
        self.R_r = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_r"
        )
        self.b_rr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rr")

        self.R_i = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_i"
        )
        # NOTE(review): this bias is named ".b_ru" but is paired with R_i and
        # used in the update gate below — presumably intentional; confirm.
        self.b_ru = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ru")

        self.R_h = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_h"
        )
        self.b_rh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rh")

        """step through processed input to create output"""

        def step(inp, s_prev):
            # Update gate, reset gate, candidate state, then interpolation.
            i_t = sigmoid(
                dot(inp, self.W_i) + dot(s_prev, self.R_i) + self.b_wi + self.b_ru
            )
            r_t = sigmoid(
                dot(inp, self.W_r) + dot(s_prev, self.R_r) + self.b_wr + self.b_rr
            )
            h_hat_t = tanh(
                dot(inp, self.W_h)
                + (r_t * (dot(s_prev, self.R_h) + self.b_rh))
                + self.b_wh
            )
            s_curr = ((1.0 - i_t) * h_hat_t) + (i_t * s_prev)
            return s_curr

        outputs_info = self.s0
        states, updates = aesara.scan(
            fn=step, sequences=[self.X], outputs_info=outputs_info
        )
        self.Y = states

    def output(self):
        # Full sequence of hidden states.
        return self.Y
class LSTM(Layer):
    # LSTM layer built with aesara.scan over the wrapped input layer.

    def __init__(self, input_dim, output_dim, input_layer, s0=None, c0=None, name=""):
        """Layers information"""
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0  # initial hidden state
        self.c0 = c0  # initial cell state
        self.params = []

        """Layers weights"""

        """self.params is passed so that any parameters could be appended to it"""
        # Input-to-hidden weights for input/forget/cell/output gates.
        self.W_i = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_i"
        )
        self.b_wi = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wi")

        self.W_f = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_f"
        )
        self.b_wf = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wf")

        self.W_c = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_c"
        )
        self.b_wc = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wc")

        self.W_o = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W_o"
        )
        self.b_wo = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wo")

        # Recurrent (hidden-to-hidden) weights for the same four gates.
        self.R_i = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_i"
        )
        self.b_ri = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ri")

        self.R_f = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_f"
        )
        self.b_rf = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rf")

        self.R_c = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_c"
        )
        self.b_rc = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rc")

        self.R_o = linear_transform_weights(
            output_dim, output_dim, param_list=self.params, name=name + ".R_o"
        )
        self.b_ro = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ro")

        """step through processed input to create output"""

        def step(x_t, h_tm1, c_tm1):
            # Standard LSTM cell: gates, candidate cell, new cell, new hidden.
            i_t = sigmoid(
                dot(x_t, self.W_i) + dot(h_tm1, self.R_i) + self.b_wi + self.b_ri
            )
            f_t = sigmoid(
                dot(x_t, self.W_f) + dot(h_tm1, self.R_f) + self.b_wf + self.b_rf
            )
            o_t = sigmoid(
                dot(x_t, self.W_o) + dot(h_tm1, self.R_o) + self.b_ro + self.b_wo
            )
            c_hat_t = tanh(
                dot(x_t, self.W_c) + dot(h_tm1, self.R_c) + self.b_wc + self.b_rc
            )
            c_t = f_t * c_tm1 + i_t * c_hat_t
            h_t = o_t * tanh(c_t)
            return h_t, c_t

        outputs_info = [self.s0, self.c0]
        states, updates = aesara.scan(
            fn=step, sequences=[self.X], outputs_info=outputs_info
        )
        # scan returns one sequence per outputs_info entry: hidden then cell.
        self.Y = states[0]
        self.C = states[1]

    def output(self):
        # Full sequence of hidden states (cell states are kept in self.C).
        return self.Y
class FC(Layer):
    """Fully-connected (affine) layer: output = X @ W + b."""

    def __init__(self, input_dim, output_dim, input_layer, name=""):
        self.name = name
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = self.input_layer.output()
        self.params = []
        # Weight matrix and bias are both registered in self.params.
        self.W = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W"
        )
        self.b = bias_weights((output_dim,), param_list=self.params, name=name + ".b")

    def output(self):
        """Affine transform of the wrapped layer's output."""
        return dot(self.X, self.W) + self.b
class WrapperLayer(Layer):
    """Adapter presenting an arbitrary variable *X* through the Layer API."""

    def __init__(self, X, name=""):
        self.name = name
        self.X = X
        self.params = []

    def output(self):
        """Return the wrapped variable unchanged."""
        return self.X
tests/gpuarray/run_dnn_conv.py
deleted
100644 → 0
浏览文件 @
c803c67e
# This script allows to run one specific cuDNN convolution test case.
# This script should not be imported, but only used as a program.
# python run_dnn_conv.py --help # Print help.
# python run_dnn_conv.py {fwd|bwd-filter|bwd-data} {2d|3d} -a <algo> -i <inputShape> -f <filterShape> ...
import
argparse
import
sys
import
aesara
from
aesara.configdefaults
import
SUPPORTED_DNN_CONV_ALGO_RUNTIME
from
aesara.gpuarray.cudnn_defs
import
(
DOUBLE
,
DOUBLE_CONFIG
,
FLOAT
,
FLOAT_CONFIG
,
HALF
,
PSEUDO_HALF_CONFIG
,
TRUE_HALF_CONFIG
,
)
from
aesara.tensor.nnet.abstract_conv
import
get_conv_output_shape
from
tests.gpuarray.check_dnn_conv
import
CheckDnn
,
TestDnnConv2D
,
TestDnnConv3D
,
cudnn
# This module is a standalone command-line tool; refuse to be imported so its
# top-level argument parsing and test execution cannot run as a side effect.
if __name__ != "__main__":
    raise ImportError("This script must not be imported.")
class TupleAction(argparse.Action):
    # Tuple extractor for command line args parser.

    def __call__(self, parser, namespace, values, option_string=None):
        """Parse a comma-separated string such as "1,2,3" into (1, 2, 3)."""
        parsed = tuple(int(piece) for piece in values.split(","))
        setattr(namespace, self.dest, parsed)
class BorderAction(TupleAction):
    # Border extractor for command line args parser.

    def __call__(self, parser, namespace, values, option_string=None):
        """Keep the keywords "valid"/"full"/"half" verbatim; otherwise parse
        the value as a comma-separated tuple of ints (TupleAction)."""
        if values in ("valid", "full", "half"):
            setattr(namespace, self.dest, values)
        else:
            super().__call__(parser, namespace, values, option_string)
# ---- Command-line front end: build choices, parse, validate, then run ----
args = sys.argv[1:]

computations = FWD, BWD_FILTER, BWD_DATA = ("fwd", "gradweight", "gradinput")
# Union of all explicit cuDNN algorithm aliases plus the runtime selectors.
algorithms = (
    tuple(
        sorted(
            list(
                set(
                    cudnn.cudnnConvolutionFwdAlgo_t.get_aliases()
                    + cudnn.cudnnConvolutionBwdFilterAlgo_t.get_aliases()
                    + cudnn.cudnnConvolutionBwdDataAlgo_t.get_aliases()
                )
            )
        )
    )
    + SUPPORTED_DNN_CONV_ALGO_RUNTIME
)
types = (HALF, FLOAT, DOUBLE)
data_type_configurations = dict(
    TRUE_HALF_CONFIG=TRUE_HALF_CONFIG,
    PSEUDO_HALF_CONFIG=PSEUDO_HALF_CONFIG,
    FLOAT_CONFIG=FLOAT_CONFIG,
    DOUBLE_CONFIG=DOUBLE_CONFIG,
)

parser = argparse.ArgumentParser()

parser.add_argument("computation", choices=computations, help="Computation to run.")

parser.add_argument(
    "-a",
    "--algo",
    choices=algorithms,
    required=True,
    help="Algorithm to use for computation.",
)

parser.add_argument(
    "-i",
    "--input-shape",
    action=TupleAction,
    required=True,
    help="Input shape. Comma-separated list of integers (no spaces).",
)

parser.add_argument(
    "-f",
    "--filter-shape",
    action=TupleAction,
    required=True,
    help="Filter shape. Comma-separated list of integers (no spaces).",
)

parser.add_argument(
    "-D",
    "--dtype-config",
    choices=list(sorted(data_type_configurations.keys())),
    default=None,
    help="Data type configuration for (data type; precision). Default (aesara floatX; aesara floatX). "
    "To specify data type configuration, you can either use this option or set data type and "
    'precision separately with "-t" and "-p" options.',
)

parser.add_argument(
    "-t",
    "--dtype",
    choices=types,
    default=None,
    help="Data type (default aesara floatX).",
)

parser.add_argument(
    "-p",
    "--precision",
    choices=types,
    default=None,
    help="Precision (default aesara floatX).",
)

parser.add_argument(
    "-s",
    "--subsample",
    action=TupleAction,
    help="Subsample. Comma-separated list of integers (no spaces). "
    "Default: 1 per dimension.",
)

parser.add_argument(
    "-d",
    "--dilation",
    action=TupleAction,
    help="Dilation. Comma-separated list of integers (no spaces). "
    "Default: 1 per dimension.",
)

parser.add_argument(
    "-b",
    "--border-mode",
    default="valid",
    action=BorderAction,
    help='Border mode. "valid" (default), "full", "half" '
    "or a comma-separated list of integers (no spaces).",
)

parser.add_argument(
    "-c",
    "--conv-mode",
    choices=("conv", "cross"),
    default="conv",
    help="Conv mode (default: conv).",
)

parser.add_argument(
    "-A",
    "--alpha",
    type=float,
    default=1,
    help="alpha (floating), must not be zero. Default 1.",
)

parser.add_argument(
    "-B", "--beta", type=float, default=0, help="beta (floating). Default 0."
)

parser.add_argument(
    "-I",
    "--print-infos",
    action="store_true",
    default=False,
    help="Print some infos before testing.",
)

args = parser.parse_args(args)

test = args.computation

# ---- Validation: shapes must agree and describe a 2D or 3D convolution ----
if len(args.input_shape) != len(args.filter_shape):
    raise ValueError("Expected same length for input shape and filter shape")
if len(args.input_shape) not in (4, 5):
    raise ValueError("Expected length 4 or 5 for input shape")

# Spatial dimensionality: shape is (batch, channels, *spatial).
ndim = len(args.input_shape) - 2

if ndim == 2:
    tests = TestDnnConv2D()
elif ndim == 3:
    tests = TestDnnConv3D()

if args.subsample is None:
    args.subsample = (1,) * ndim

if args.dilation is None:
    args.dilation = (1,) * ndim

if not (ndim == len(args.subsample) == len(args.dilation)):
    raise ValueError(f"Expected parameters sized for {int(ndim)} dimensions.")

if isinstance(args.border_mode, tuple) and ndim != len(args.border_mode):
    raise ValueError(f"Expected borders sized for {int(ndim)} dimensions.")

if args.alpha == 0:
    raise ValueError("Nothing could be computed if alpha is 0.")

# -D is mutually exclusive with -t/-p; default both to aesara's floatX.
if args.dtype_config is None:
    if args.dtype is None:
        args.dtype = aesara.config.floatX
    if args.precision is None:
        args.precision = aesara.config.floatX
else:
    if args.dtype is not None or args.precision is not None:
        raise ValueError(
            "You must specify either -D <data-type-configuration> "
            "or (-t <data-type> -p <precision>), not both."
        )
    args.dtype, args.precision = data_type_configurations[args.dtype_config]

if (args.dtype, args.precision) not in cudnn.get_supported_dtype_configs():
    raise ValueError(
        f"Unsupported data type configuration {args.dtype} {args.precision}."
    )

# For explicit (non-runtime) algorithms, warn when cuDNN does not normally
# support the chosen dtype/precision pair for this computation.
if args.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
    check_config = False
    if test == FWD:
        check_config = cudnn.fwd_algo_supports_dtype_config(
            args.algo, args.dtype, args.precision, ndim
        )
    elif test == BWD_FILTER:
        check_config = cudnn.bwd_filter_algo_supports_dtype_config(
            args.algo, args.dtype, args.precision, ndim
        )
    elif test == BWD_DATA:
        check_config = cudnn.bwd_data_algo_supports_dtype_config(
            args.algo, args.dtype, args.precision, ndim
        )
    if not check_config:
        print(
            "Warning: %s computation does not normally support configuration (%s, %s) for algo %s."
            % (test, args.dtype, args.precision, args.algo),
            file=sys.stderr,
        )

algo = args.algo
dtype = args.dtype
precision = args.precision
# Parameter tuple in the order the run_conv_* helpers expect.
parameters = (
    args.input_shape,
    args.filter_shape,
    args.subsample,
    args.dilation,
    args.border_mode,
    args.conv_mode,
    args.alpha,
    args.beta,
)

if args.print_infos:
    CheckDnn.print_infos(count_tests=False)

print("======================")
print("Running", test, algo, dtype, precision, *parameters)

# ---- Dispatch to the requested computation and report the output shape ----
if test == FWD:
    tests.run_conv_fwd(algo, dtype, precision, parameters)
    expected_output_shape = get_conv_output_shape(
        args.input_shape,
        args.filter_shape,
        args.border_mode,
        args.subsample,
        args.dilation,
    )
elif test == BWD_FILTER:
    tests.run_conv_gradweight(algo, dtype, precision, parameters)
    expected_output_shape = args.filter_shape
elif test == BWD_DATA:
    tests.run_conv_gradinput(algo, dtype, precision, parameters)
    expected_output_shape = args.input_shape

print("Computed shape:", expected_output_shape)
print("... OK")
tests/gpuarray/test_abstractconv.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
pytest
pygpu
=
pytest
.
importorskip
(
"pygpu"
)
gpuarray
=
pygpu
.
gpuarray
from
aesara.gpuarray.blas
import
(
GpuCorr3dMM
,
GpuCorr3dMM_gradInputs
,
GpuCorr3dMM_gradWeights
,
GpuCorrMM
,
GpuCorrMM_gradInputs
,
GpuCorrMM_gradWeights
,
)
from
aesara.gpuarray.dnn
import
(
GpuDnnConv
,
GpuDnnConvGradI
,
GpuDnnConvGradW
,
dnn_available
,
)
from
aesara.gpuarray.type
import
GpuArrayType
,
get_context
,
gpuarray_shared_constructor
from
tests.gpuarray.config
import
mode_with_gpu
,
test_ctx_name
from
tests.tensor.nnet.test_abstract_conv
import
(
BaseTestConv2d
,
BaseTestConv3d
,
TestConv2dTranspose
,
TestConvTypes
,
)
# Convenience 4-D float32 GPU tensor type with no broadcastable dimensions.
gpu_ftensor4 = GpuArrayType(dtype="float32", broadcastable=(False,) * 4)
class TestDnnConv2d(BaseTestConv2d):
    """2D abstract-conv test cases routed through the cuDNN GPU ops.

    Fix: the ``skipif`` decorators below were ``skipif(dnn_available(...))``,
    which skips precisely when cuDNN IS available (so the GPU cases never ran,
    and ``dnn_available.msg`` — the unavailability message — made no sense as
    the reason). They now skip when cuDNN is NOT available.
    """

    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # provide_shape is not used by the cuDNN implementation
        cls.provide_shape = [False]

    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        """Run forward, gradweight and gradinput for one parameter combination.

        i/f: input/filter shapes; s: subsample; b: border mode; flip: filter
        flip; fd: filter dilation (only (1, 1) is supported by this path).
        """
        mode = mode_with_gpu

        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        o = self.get_output_shape(i, f, s, b, fd)

        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConv,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradW,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradI,
        )

    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False
    ):
        """Gradinput-only case; when *expect_error* is set, the op must raise
        RuntimeError or ValueError instead of producing a result."""
        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        mode = mode_with_gpu
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuDnnConvGradI,
                filter_dilation=fd,
            )
        else:
            with pytest.raises((RuntimeError, ValueError)):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuDnnConvGradI,
                    ref=None,
                    filter_dilation=fd,
                )
class TestDnnConv3d(BaseTestConv3d):
    """Run the shared 3D convolution test battery against the cuDNN ops.

    Fix: the ``skipif`` conditions were inverted (skipped when cuDNN *was*
    available); they must skip when cuDNN is unavailable.
    """

    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # provide_shape is not used by the cuDNN implementation
        cls.provide_shape = [False]

    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
        """Check forward, grad-weight and grad-input for one parameter set."""
        mode = mode_with_gpu
        if fd != (1, 1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConv,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradW,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradI,
        )

    @pytest.mark.skipif(not dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False
    ):
        """Check grad-input only; with expect_error=True an invalid shape
        combination must raise RuntimeError or ValueError."""
        if fd != (1, 1, 1):
            pytest.skip("Doesn't have CUDNN implementation")
        mode = mode_with_gpu
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuDnnConvGradI,
                filter_dilation=fd,
            )
        else:
            with pytest.raises((RuntimeError, ValueError)):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuDnnConvGradI,
                    ref=None,
                    filter_dilation=fd,
                )
class TestCorrMMConv2d(BaseTestConv2d):
    """Run the shared 2D convolution test battery against the GpuCorrMM ops
    (cuDNN rewrites excluded from the compile mode)."""

    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # Exclude cuDNN so the graph falls back to the CorrMM implementation.
        cls.mode = mode_with_gpu.excluding("cudnn")

    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        """Check forward, grad-weight and grad-input for one parameter set."""
        mode = self.mode
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            # The forward graph may be implemented by any of the three ops.
            target_op=(GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs),
            filter_dilation=fd,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorrMM_gradWeights,
            filter_dilation=fd,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorrMM_gradInputs,
            filter_dilation=fd,
        )

    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False
    ):
        """Check grad-input only; with expect_error=True an invalid shape
        combination must raise ValueError."""
        mode = self.mode
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuCorrMM_gradInputs,
                filter_dilation=fd,
            )
        else:
            with pytest.raises(ValueError):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuCorrMM_gradInputs,
                    ref=None,
                    filter_dilation=fd,
                )
class TestCorrMMConv3d(BaseTestConv3d):
    """Run the shared 3D convolution test battery against the GpuCorr3dMM ops
    (cuDNN rewrites excluded from the compile mode)."""

    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # Exclude cuDNN so the graph falls back to the Corr3dMM implementation.
        cls.mode = mode_with_gpu.excluding("cudnn")

    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
        """Check forward, grad-weight and grad-input for one parameter set."""
        mode = self.mode
        o = self.get_output_shape(i, f, s, b, fd)
        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            # The forward graph may be implemented by any of the three ops.
            target_op=(GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs),
            filter_dilation=fd,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorr3dMM_gradWeights,
            filter_dilation=fd,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuCorr3dMM_gradInputs,
            filter_dilation=fd,
        )

    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False
    ):
        """Check grad-input only; with expect_error=True an invalid shape
        combination must raise ValueError."""
        mode = self.mode
        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuCorr3dMM_gradInputs,
                filter_dilation=fd,
            )
        else:
            with pytest.raises(ValueError):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuCorr3dMM_gradInputs,
                    ref=None,
                    filter_dilation=fd,
                )
class TestDnnConvTypes(TestConvTypes):
    """Re-run the type-checking convolution tests with GPU tensor variables
    and a GPU-resident constant."""

    def setup_method(self):
        self.input = gpu_ftensor4()
        self.filters = gpu_ftensor4()
        self.topgrad = gpu_ftensor4()
        # A constant living on the GPU, same rank as the symbolic inputs.
        self.constant_tensor = gpuarray.array(
            np.zeros((3, 5, 7, 11), dtype="float32"),
            context=get_context(test_ctx_name),
        )
        # NOTE(review): super().setup_method() runs *after* the attributes
        # above are set — preserved as in the original.
        super().setup_method()
class TestConv2dTranspose(TestConv2dTranspose):
    # Shadow the imported TestConv2dTranspose with a subclass that runs the
    # same suite under the GPU compile mode.
    mode = mode_with_gpu
tests/gpuarray/test_basic_ops.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
pytest
import
aesara
import
aesara.tensor
as
at
from
aesara.gpuarray.basic_ops
import
(
GpuAlloc
,
GpuAllocEmpty
,
GpuContiguous
,
GpuEye
,
GpuFromHost
,
GpuJoin
,
GpuReshape
,
GpuSplit
,
GpuToGpu
,
GpuTri
,
HostFromGpu
,
gpu_contiguous
,
gpu_join
,
host_from_gpu
,
)
from
aesara.gpuarray.elemwise
import
GpuDimShuffle
,
GpuElemwise
from
aesara.gpuarray.subtensor
import
GpuSubtensor
from
aesara.gpuarray.type
import
GpuArrayType
,
get_context
,
gpuarray_shared_constructor
from
aesara.tensor.basic
import
Alloc
,
MakeVector
,
Split
,
alloc
from
aesara.tensor.shape
import
Shape
,
Shape_i
from
aesara.tensor.type
import
TensorType
,
fmatrix
,
iscalar
,
lscalar
,
matrix
# Don't import test classes otherwise they get tested as part of the file
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
mode_without_gpu
,
test_ctx_name
from
tests.tensor.test_basic
import
(
TestAlloc
,
TestComparison
,
TestJoinAndSplit
,
TestReshape
,
)
from
tests.tensor.utils
import
random
,
safe_make_node
# Skip this whole test module when the pygpu package is not installed.
pygpu = pytest.importorskip("pygpu")
gpuarray = pygpu.gpuarray

# Module-wide RNG seeded from the shared test-suite seed helper so runs
# are reproducible.
rng = np.random.default_rng(seed=utt.fetch_seed())
def inplace_func(
    inputs,
    outputs,
    mode=None,
    allow_input_downcast=False,
    on_unused_input="raise",
    name=None,
):
    """Compile an aesara function that accepts in-place ops, defaulting the
    compile mode to the GPU test mode."""
    if mode is None:
        mode = mode_with_gpu
    return aesara.function(
        inputs,
        outputs,
        mode=mode,
        allow_input_downcast=allow_input_downcast,
        accept_inplace=True,
        on_unused_input=on_unused_input,
        name=name,
    )
def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
    """Build a shared variable, preferring the GPU constructor and falling
    back to the tensor then scalar constructors on TypeError.

    NOTE(review): returns None implicitly if every constructor raises
    TypeError — preserved as in the original.
    """
    from aesara.tensor.sharedvar import scalar_constructor, tensor_constructor

    for c in (gpuarray_shared_constructor, tensor_constructor, scalar_constructor):
        try:
            return c(
                value, name=name, strict=strict, allow_downcast=allow_downcast, **kwargs
            )
        except TypeError:
            continue
def rand_gpuarray(*shape, **kwargs):
    """Return a random GPU array of ``shape`` with values in [-1, 1).

    Keyword arguments: ``dtype`` (default ``aesara.config.floatX``) and
    ``cls`` (passed to ``gpuarray.array``).  Any other keyword raises
    TypeError.

    Fix: the original ``raise TypeError("Unexpected argument %s", key)``
    passed the key as a second exception argument instead of formatting it
    into the message.
    """
    r = rng.random(shape) * 2 - 1
    dtype = kwargs.pop("dtype", aesara.config.floatX)
    cls = kwargs.pop("cls", None)
    if len(kwargs) != 0:
        raise TypeError("Unexpected argument %s" % list(kwargs.keys())[0])
    return gpuarray.array(r, dtype=dtype, cls=cls, context=get_context(test_ctx_name))
def makeTester(
    name,
    op,
    gpu_op,
    cases,
    checks=None,
    mode_gpu=mode_with_gpu,
    mode_nogpu=mode_without_gpu,
    skip=False,
    eps=1e-10,
):
    """Build a test-class that runs ``op`` on CPU and GPU for each entry of
    ``cases`` and compares the results.

    Parameters
    ----------
    name : str
        Name given to the generated class.
    op, gpu_op
        Reference op and the GPU op expected to appear in the compiled graph.
    cases : dict
        Maps a test name to the list of input values.
    checks : dict, optional
        Maps a description to a ``check(inputs, outputs)`` predicate.
    skip : bool or str
        Truthy value skips all cases (used as the skip reason).
    """
    if checks is None:
        checks = {}
    # Rebind under private names so the class body below can assign class
    # attributes of the same names without self-reference.
    _op = op
    _gpu_op = gpu_op
    _cases = cases
    _skip = skip
    _checks = checks

    class Checker(utt.OptimizationTestMixin):
        op = staticmethod(_op)
        gpu_op = staticmethod(_gpu_op)
        cases = _cases
        skip = _skip
        checks = _checks

        def setup_method(self):
            # Resolve the class by its dotted name; fails loudly if the
            # generated class was not installed in its module namespace.
            eval(self.__class__.__module__ + "." + self.__class__.__name__)

        def test_all(self):
            if skip:
                pytest.skip(skip)
            for testname, inputs in cases.items():
                # Promote plain Python floats to floatX arrays in place.
                for _ in range(len(inputs)):
                    if type(inputs[_]) is float:
                        inputs[_] = np.asarray(inputs[_], dtype=aesara.config.floatX)
                self.run_case(testname, inputs)

        def run_case(self, testname, inputs):
            # Two independent shared-variable copies: one for the CPU
            # reference graph, one for the GPU graph under test.
            inputs_ref = [aesara.shared(inp) for inp in inputs]
            inputs_tst = [aesara.shared(inp) for inp in inputs]
            try:
                node_ref = safe_make_node(self.op, *inputs_ref)
                node_tst = safe_make_node(self.op, *inputs_tst)
            except Exception as exc:
                err_msg = (
                    "Test %s::%s: Error occurred while making "
                    "a node with inputs %s"
                ) % (self.gpu_op, testname, inputs)
                exc.args += (err_msg,)
                raise
            try:
                f_ref = inplace_func([], node_ref.outputs, mode=mode_nogpu)
                f_tst = inplace_func([], node_tst.outputs, mode=mode_gpu)
            except Exception as exc:
                err_msg = (
                    "Test %s::%s: Error occurred while trying to "
                    "make a Function"
                ) % (self.gpu_op, testname)
                exc.args += (err_msg,)
                raise
            # The GPU graph must contain exactly one instance of gpu_op.
            self.assertFunctionContains1(f_tst, self.gpu_op)
            ref_e = None
            try:
                expecteds = f_ref()
            except Exception as exc:
                ref_e = exc
            try:
                variables = f_tst()
            except Exception as exc:
                if ref_e is None:
                    err_msg = (
                        "Test %s::%s: exception when calling the "
                        "Function"
                    ) % (self.gpu_op, testname)
                    exc.args += (err_msg,)
                    raise
                else:
                    # if we raised an exception of the same type we're good.
                    if isinstance(exc, type(ref_e)):
                        return
                    else:
                        err_msg = (
                            "Test %s::%s: exception raised during test "
                            "call was not the same as the reference "
                            "call (got: %s, expected %s)"
                            % (self.gpu_op, testname, type(exc), type(ref_e))
                        )
                        exc.args += (err_msg,)
                        raise
            # Compare every output against the CPU reference.
            for i, (variable, expected) in enumerate(zip(variables, expecteds)):
                condition = (
                    variable.dtype != expected.dtype
                    or variable.shape != expected.shape
                    or not TensorType.values_eq_approx(variable, expected)
                )
                assert not condition, (
                    "Test %s::%s: Output %s gave the wrong "
                    "value. With inputs %s, expected %s "
                    "(dtype %s), got %s (dtype %s)."
                    % (
                        self.op,
                        testname,
                        i,
                        inputs,
                        expected,
                        expected.dtype,
                        variable,
                        variable.dtype,
                    )
                )
            # Run any extra user-supplied predicates.
            for description, check in self.checks.items():
                assert check(inputs, variables), (
                    "Test %s::%s: Failed check: %s "
                    "(inputs were %s, outputs were %s)"
                ) % (self.op, testname, description, inputs, variables)

    Checker.__name__ = name
    if hasattr(Checker, "__qualname__"):
        Checker.__qualname__ = name
    return Checker
def test_transfer_cpu_gpu():
    """Round-trip a matrix host->GPU and GPU->host and compare values."""
    a = fmatrix("a")
    g = GpuArrayType(dtype="float32", broadcastable=(False, False))("g")

    av = np.asarray(rng.random((5, 4)), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))

    f = aesara.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)

    f = aesara.function([g], host_from_gpu(g))
    fv = f(gv)
    assert np.all(fv == av)
def test_transfer_gpu_gpu():
    """A GpuToGpu transfer must survive compilation (the transfer-cutting
    rewrites are excluded) and preserve values."""
    g = GpuArrayType(
        dtype="float32", broadcastable=(False, False), context_name=test_ctx_name
    )()

    av = np.asarray(rng.random((5, 4)), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))
    # Keep the explicit transfer in the graph.
    mode = mode_with_gpu.excluding("cut_gpua_host_transfers", "local_cut_gpua_host_gpua")
    f = aesara.function([g], GpuToGpu(test_ctx_name)(g), mode=mode)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuToGpu)
    fv = f(gv)
    assert GpuArrayType.values_eq(fv, gv)
def test_transfer_strided():
    # This is just to ensure that it works in aesara
    # libgpuarray has a much more comprehensive suit of tests to
    # ensure correctness
    a = fmatrix("a")
    g = GpuArrayType(dtype="float32", broadcastable=(False, False))("g")

    av = np.asarray(rng.random((5, 8)), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))

    # Make both views non-contiguous (every second column).
    av = av[:, ::2]
    gv = gv[:, ::2]

    f = aesara.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)

    f = aesara.function([g], host_from_gpu(g))
    fv = f(gv)
    assert np.all(fv == av)
def gpu_alloc_expected(x, *shp):
    """Reference implementation of GpuAlloc: a GPU array of shape ``shp``
    filled (with broadcasting) by ``x``."""
    g = gpuarray.empty(shp, dtype=x.dtype, context=get_context(test_ctx_name))
    g[:] = x
    return g
# Generated test class comparing CPU alloc against GpuAlloc.
TestGpuAlloc = makeTester(
    name="GpuAllocTester",
    # The +1 is there to allow the lift to the GPU.
    op=lambda *args: alloc(*args) + 1,
    gpu_op=GpuAlloc(test_ctx_name),
    cases=dict(
        correct01=(random(), np.int32(7)),
        # just gives a DeepCopyOp with possibly wrong results on the CPU
        # correct01_bcast=(random(1), np.int32(7)),
        correct02=(random(), np.int32(4), np.int32(7)),
        correct12=(random(7), np.int32(4), np.int32(7)),
        correct13=(random(7), np.int32(2), np.int32(4), np.int32(7)),
        correct23=(random(4, 7), np.int32(2), np.int32(4), np.int32(7)),
        bad_shape12=(random(7), np.int32(7), np.int32(5)),
    ),
)
class TestGPUAlloc(TestAlloc):
    """Run the shared Alloc test battery on the GPU."""

    dtype = "float32"
    mode = mode_with_gpu
    shared = staticmethod(gpuarray_shared_constructor)
    # Two GPU allocs plus the CPU Alloc, as expected by TestAlloc.
    allocs = [GpuAlloc(test_ctx_name), GpuAlloc(test_ctx_name), Alloc()]
def test_alloc_empty():
    """GpuAllocEmpty produces arrays of the requested shape/dtype and
    identical GpuAllocEmpty nodes are merged into one."""
    for dt in ["float32", "int8"]:
        f = aesara.function([], GpuAllocEmpty(dt, context_name=test_ctx_name)(2, 3))
        assert len(f.maker.fgraph.apply_nodes) == 1
        out = f()
        assert out.shape == (2, 3)
        assert out.dtype == dt

    f = aesara.function(
        [],
        [
            GpuAllocEmpty("uint64", test_ctx_name)(3, 2),
            GpuAllocEmpty("uint64", test_ctx_name)(3, 2),
        ],
    )
    out = f()
    assert out[0].shape == (3, 2)
    assert out[0].dtype == "uint64"
    assert out[1].shape == (3, 2)
    assert out[1].dtype == "uint64"
    # The two identical nodes must have been merged.
    assert (
        len(
            [
                node
                for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuAllocEmpty)
            ]
        )
        == 1
    )
def test_shape():
    """``x.shape`` on a GPU variable lowers to Shape_i nodes (or a single
    Shape node when that rewrite is excluded)."""
    x = GpuArrayType(dtype="float32", broadcastable=[False, False, False])()
    v = gpuarray.zeros((3, 4, 5), dtype="float32", context=get_context(test_ctx_name))
    f = aesara.function([x], x.shape)
    topo = f.maker.fgraph.toposort()
    assert np.all(f(v) == (3, 4, 5))
    if aesara.config.mode != "FAST_COMPILE":
        assert len(topo) == 4
        assert isinstance(topo[0].op, Shape_i)
        assert isinstance(topo[1].op, Shape_i)
        assert isinstance(topo[2].op, Shape_i)
        assert isinstance(topo[3].op, MakeVector)
    # Without the shape->Shape_i rewrite a single Shape op remains.
    mode = mode_with_gpu.excluding("local_shape_to_shape_i")
    f = aesara.function([x], x.shape, mode=mode)
    topo = f.maker.fgraph.toposort()
    assert np.all(f(v) == (3, 4, 5))
    assert len(topo) == 1
    assert isinstance(topo[0].op, Shape)
def test_gpu_contiguous():
    """gpu_contiguous of a strided GPU subtensor yields C-contiguous output.

    Fix: ``np.random.random(4, 5)`` raises TypeError — the shape must be
    passed as a single tuple: ``np.random.random((4, 5))``.
    """
    a = fmatrix("a")
    i = iscalar("i")
    a_val = np.asarray(np.random.random((4, 5)), dtype="float32")
    # The reshape is needed otherwise we make the subtensor on the CPU
    # to transfer less data.
    f = aesara.function(
        [a, i], gpu_contiguous(a.reshape((5, 4))[::i]), mode=mode_with_gpu
    )
    topo = f.maker.fgraph.toposort()
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert any(isinstance(node.op, GpuContiguous) for node in topo)
    assert f(a_val, 1).flags.c_contiguous
    assert f(a_val, 2).flags.c_contiguous
    assert f(a_val, 2).flags.c_contiguous
class TestGPUReshape(TestReshape):
    """Run the shared Reshape test battery against GpuReshape."""

    def setup_method(self):
        self.shared = gpuarray_shared_constructor
        self.op = GpuReshape
        self.mode = mode_with_gpu
        # Ops the base class should ignore when inspecting the graph.
        self.ignore_topo = (
            HostFromGpu,
            GpuFromHost,
            aesara.compile.DeepCopyOp,
            GpuDimShuffle,
            GpuElemwise,
            Shape_i,
            MakeVector,
        )
        assert self.op == GpuReshape
class TestGPUComparison(TestComparison):
    """Run the shared comparison-op test battery on the GPU."""

    def setup_method(self):
        self.mode = mode_with_gpu
        self.shared = gpuarray_shared_constructor
        self.dtypes = ["float64", "float32"]
class TestGPUJoinAndSplit(TestJoinAndSplit):
    """Run the shared Join/Split test battery against GpuJoin/GpuSplit."""

    def setup_method(self):
        self.mode = mode_with_gpu.excluding("constant_folding")
        self.join_op = GpuJoin()
        self.split_op_class = GpuSplit
        # Use join instead of MakeVector since there is no MakeVector on GPU
        self.make_vector_op = GpuJoin()
        # this is to avoid errors with limited devices
        self.floatX = "float32"
        self.hide_error = aesara.config.mode not in ["DebugMode", "DEBUG_MODE"]

        def shared(x, **kwargs):
            # Pin shared variables to the test GPU context.
            return gpuarray_shared_constructor(x, target=test_ctx_name, **kwargs)

        self.shared = shared

    def test_gpusplit_opt(self):
        # Test that we move the node to the GPU
        # Also test float16 computation at the same time.
        rng = np.random.default_rng(seed=utt.fetch_seed())
        m = self.shared(rng.random((4, 6)).astype("float16"))
        o = Split(2)(m, 0, [2, 2])
        assert o[0].dtype == "float16"
        f = aesara.function([], o, mode=self.mode)
        assert any(
            [
                isinstance(node.op, self.split_op_class)
                for node in f.maker.fgraph.toposort()
            ]
        )
        o1, o2 = f()
        assert np.allclose(o1, m.get_value(borrow=True)[:2])
        assert np.allclose(o2, m.get_value(borrow=True)[2:])
def test_gpujoin_gpualloc():
    """zeros_like/ones_like + join lower to GpuAlloc/gpu_join on the GPU and
    match the CPU result.

    Fix: ``np.random.random(4, 5)`` / ``np.random.random(3, 5)`` raise
    TypeError — the shape must be a single tuple.
    """
    a = fmatrix("a")
    a_val = np.asarray(np.random.random((4, 5)), dtype="float32")
    b = fmatrix("b")
    b_val = np.asarray(np.random.random((3, 5)), dtype="float32")

    f = aesara.function(
        [a, b],
        at.join(0, at.zeros_like(a), at.ones_like(b)) + 4,
        mode=mode_without_gpu,
    )
    f_gpu = aesara.function(
        [a, b], at.join(0, at.zeros_like(a), at.ones_like(b)), mode=mode_with_gpu
    )
    f_gpu2 = aesara.function(
        [a, b],
        at.join(0, at.zeros_like(a), at.ones_like(b)) + 4,
        mode=mode_with_gpu,
    )
    # CPU graph: two allocs and one join.
    assert sum([node.op == at.alloc for node in f.maker.fgraph.toposort()]) == 2
    assert sum([node.op == at.join_ for node in f.maker.fgraph.toposort()]) == 1
    # GPU graphs: the same structure, lifted to GPU ops.
    assert (
        sum([isinstance(node.op, GpuAlloc) for node in f_gpu.maker.fgraph.toposort()])
        == 2
    )
    assert sum([node.op == gpu_join for node in f_gpu.maker.fgraph.toposort()]) == 1
    assert (
        sum([isinstance(node.op, GpuAlloc) for node in f_gpu2.maker.fgraph.toposort()])
        == 2
    )
    assert sum([node.op == gpu_join for node in f_gpu2.maker.fgraph.toposort()]) == 1
    assert np.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
def test_gpueye():
    """at.eye on the GPU matches np.eye for several dtypes, shapes and
    diagonal offsets, and compiles to a GpuEye node."""

    def check(dtype, N, M_=None, k=0):
        # Aesara does not accept None as a tensor.
        # So we must use a real value.
        M = M_
        # Currently DebugMode does not support None as inputs even if this is
        # allowed.
        if M is None:
            M = N
        N_symb = iscalar()
        M_symb = iscalar()
        k_symb = iscalar()
        # The +1 (undone below) forces computation onto the GPU.
        out = at.eye(N_symb, M_symb, k_symb, dtype=dtype) + np.array(1).astype(dtype)
        f = aesara.function([N_symb, M_symb, k_symb], out, mode=mode_with_gpu)
        result = np.asarray(f(N, M, k)) - np.array(1).astype(dtype)
        assert np.allclose(result, np.eye(N, M_, k, dtype=dtype))
        assert result.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuEye) for node in f.maker.fgraph.toposort())

    for dtype in ["float32", "int32", "float16"]:
        check(dtype, 3)
        # M != N, k = 0
        check(dtype, 3, 5)
        check(dtype, 5, 3)
        # N == M, k != 0
        check(dtype, 3, 3, 1)
        check(dtype, 3, 3, -1)
        # N < M, k != 0
        check(dtype, 3, 5, 1)
        check(dtype, 3, 5, -1)
        # N > M, k != 0
        check(dtype, 5, 3, 1)
        check(dtype, 5, 3, -1)
        # k > M, -k > N, k > M, k > N
        check(dtype, 5, 3, 3)
        check(dtype, 3, 5, 3)
        check(dtype, 5, 3, -3)
        check(dtype, 3, 5, -3)
        check(dtype, 5, 3, 6)
        check(dtype, 3, 5, -6)
def test_hostfromgpu_shape_i():
    # Test that the shape is lifted over hostfromgpu
    #
    # Fix: ``np.random.random(5, 4)`` raises TypeError — the shape must be a
    # single tuple: ``np.random.random((5, 4))`` (two occurrences).
    m = mode_with_gpu.including(
        "local_dot_to_dot22", "local_dot22_to_dot22scalar", "specialize"
    )
    a = fmatrix("a")
    ca = aesara.gpuarray.type.GpuArrayType("float32", (False, False))()
    av = np.asarray(np.random.random((5, 4)), dtype="float32")
    cv = gpuarray.asarray(
        np.random.random((5, 4)), dtype="float32", context=get_context(test_ctx_name)
    )

    f = aesara.function([a], GpuFromHost(test_ctx_name)(a), mode=m)
    assert any(isinstance(x.op, GpuFromHost) for x in f.maker.fgraph.toposort())
    f = aesara.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    # The shape is computed directly from the host input: two Shape_i
    # nodes combined by MakeVector, no transfer.
    assert isinstance(topo[0].op, Shape_i)
    assert isinstance(topo[1].op, Shape_i)
    assert isinstance(topo[2].op, MakeVector)
    assert tuple(f(av)) == (5, 4)

    f = aesara.function([ca], host_from_gpu(ca), mode=m)
    assert host_from_gpu in [x.op for x in f.maker.fgraph.toposort()]
    f = aesara.function([ca], host_from_gpu(ca).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, Shape_i)
    assert isinstance(topo[1].op, Shape_i)
    assert isinstance(topo[2].op, MakeVector)
    assert tuple(f(cv)) == (5, 4)
def test_Gpujoin_inplace():
    # Test Gpujoin to work inplace.
    #
    # This function tests the case when several elements are passed to the
    # Gpujoin function but all except one of them are empty. In this case
    # Gpujoin should work inplace and the output should be the view of the
    # non-empty element.
    s = lscalar()
    data = np.array([3, 4, 5], dtype=aesara.config.floatX)
    x = gpuarray_shared_constructor(data, borrow=True)
    z = at.zeros((s,))

    join = GpuJoin(view=0)
    c = join(0, x, z)
    f = aesara.function([s], aesara.Out(c, borrow=True))
    if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
        # With an empty second input the output must be the very same GPU
        # buffer as the shared variable's value (a view, not a copy).
        assert x.get_value(borrow=True, return_internal_type=True) is f(0)
    assert np.allclose(f(0), [3, 4, 5])
def test_gpu_tril_triu():
    """at.tril/at.triu on the GPU match np.tril/np.triu for several dtypes,
    shapes and diagonal offsets, and compile to a GpuTri node."""

    def check_l(m, k=0):
        m_symb = matrix(dtype=m.dtype)
        k_symb = iscalar()
        f = aesara.function(
            [m_symb, k_symb], at.tril(m_symb, k_symb), mode=mode_with_gpu
        )
        result = f(m, k)
        assert np.allclose(result, np.tril(m, k))
        # `dtype` is the enclosing loop variable (resolved at call time).
        assert result.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuTri) for node in f.maker.fgraph.toposort())

    def check_u(m, k=0):
        m_symb = matrix(dtype=m.dtype)
        k_symb = iscalar()
        f = aesara.function(
            [m_symb, k_symb], at.triu(m_symb, k_symb), mode=mode_with_gpu
        )
        result = f(m, k)
        assert np.allclose(result, np.triu(m, k))
        assert result.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuTri) for node in f.maker.fgraph.toposort())

    test_rng = np.random.default_rng(seed=utt.fetch_seed())

    for dtype in ["float64", "float32", "float16"]:
        # try a big one
        m = np.asarray(test_rng.random((5000, 5000)) * 2 - 1, dtype=dtype)
        check_l(m, 0)
        check_l(m, 1)
        check_l(m, -1)
        check_u(m, 0)
        check_u(m, 1)
        check_u(m, -1)

        m = np.asarray(test_rng.random((10, 10)) * 2 - 1, dtype=dtype)
        check_l(m, 0)
        check_l(m, 1)
        check_l(m, -1)
        check_u(m, 0)
        check_u(m, 1)
        check_u(m, -1)

        # Non-square input.
        m = np.asarray(test_rng.random((10, 5)) * 2 - 1, dtype=dtype)
        check_l(m, 0)
        check_l(m, 1)
        check_l(m, -1)
        check_u(m, 0)
        check_u(m, 1)
        check_u(m, -1)
def test_gputri():
    """at.tri on the GPU matches np.tri for several dtypes, shapes and
    diagonal offsets, and compiles to a GpuTri node."""

    def check(dtype, N, M_=None, k=0):
        # Aesara does not accept None as a tensor.
        # So we must use a real value.
        M = M_
        # Currently DebugMode does not support None as inputs even if this is
        # allowed.
        if M is None:
            M = N
        N_symb = iscalar()
        M_symb = iscalar()
        k_symb = iscalar()
        # The +1 (undone below) forces computation onto the GPU.
        out = at.tri(N_symb, M_symb, k_symb, dtype=dtype) + np.array(1).astype(dtype)
        f = aesara.function([N_symb, M_symb, k_symb], out, mode=mode_with_gpu)
        result = np.asarray(f(N, M, k)) - np.array(1).astype(dtype)
        assert np.allclose(result, np.tri(N, M_, k, dtype=dtype))
        assert result.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuTri) for node in f.maker.fgraph.toposort())

    for dtype in ("float64", "float32", "int32", "float16"):
        # try a big one
        check(dtype, 1000, 1000, 0)
        check(dtype, 1000, 1000, -400)
        check(dtype, 1000, 1000, 400)

        check(dtype, 5)
        # M != N, k = 0
        check(dtype, 3, 5)
        check(dtype, 5, 3)
        # N == M, k != 0
        check(dtype, 3, 3, 1)
        check(dtype, 3, 3, -1)
        # N < M, k != 0
        check(dtype, 3, 5, 1)
        check(dtype, 3, 5, -1)
        # N > M, k != 0
        check(dtype, 5, 3, 1)
        check(dtype, 5, 3, -1)
        # k > M, -k > N, k > M, k > N
        check(dtype, 5, 3, 3)
        check(dtype, 3, 5, 3)
        check(dtype, 5, 3, -3)
        check(dtype, 3, 5, -3)
        check(dtype, 5, 3, 6)
        check(dtype, 3, 5, -6)
tests/gpuarray/test_blas.py
deleted
100644 → 0
Browse file @
c803c67e
import
itertools
import
numpy
as
np
import
aesara
from
aesara.configdefaults
import
config
from
aesara.gpuarray
import
gpuarray_shared_constructor
from
aesara.gpuarray.blas
import
(
GpuGemm
,
GpuGer
,
gpu_dot22
,
gpugemm_inplace
,
gpugemm_no_inplace
,
gpugemmbatch_inplace
,
gpugemv_inplace
,
gpugemv_no_inplace
,
gpuger_inplace
,
gpuger_no_inplace
,
)
from
aesara.tensor.blas
import
(
BatchedDot
,
_dot22
,
batched_dot
,
gemm_inplace
,
gemv
,
gemv_inplace
,
)
from
aesara.tensor.math
import
dot
from
aesara.tensor.type
import
matrix
,
tensor
,
tensor3
,
vector
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
test_ctx_name
from
tests.gpuarray.test_basic_ops
import
makeTester
,
rand
from
tests.tensor.test_blas
import
BaseGemv
,
TestGer
# Generated test class comparing CPU gemv_inplace against gpugemv_inplace
# for (y, alpha, A, x, beta) input tuples.
TestGpuGemv = makeTester(
    "GpuGemvTester",
    op=gemv_inplace,
    gpu_op=gpugemv_inplace,
    # It doesn't support float16
    cases=dict(
        dot_vv=[rand(1), 1.0, rand(1, 2), rand(2), 0.0],
        dot_vm=[rand(3), 1.0, rand(3, 2), rand(2), 0.0],
        float32=[
            rand(3).astype("float32"),
            np.float32(1),
            rand(3, 2).astype("float32"),
            rand(2).astype("float32"),
            np.float32(0),
        ],
        float64=[
            rand(3).astype("float64"),
            np.float64(1),
            rand(3, 2).astype("float64"),
            rand(2).astype("float64"),
            np.float64(0),
        ],
        # test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
        # test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
        # test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
        test_stride=[rand(3)[::-1], 1.0, rand(3, 2)[::-1], rand(2)[::-1], 0.0],
    ),
)
def test_float16():
    """float16 BLAS paths: gemv (compiled to GpuGemm), gpugemm_no_inplace
    and gpu_dot22 all match the NumPy reference."""
    # gemv (gemm called)
    float16_data = [
        rand(3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    float16_shared = [
        gpuarray_shared_constructor(val, target=test_ctx_name) for val in float16_data
    ]
    o = gemv(*float16_shared)
    f = aesara.function([], o, mode=mode_with_gpu)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)
    topo = f.maker.fgraph.toposort()
    assert any(isinstance(n.op, GpuGemm) for n in topo)

    # gemm
    float16_data = [
        rand(3, 3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3, 3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    float16_shared = [
        gpuarray_shared_constructor(val, target=test_ctx_name) for val in float16_data
    ]
    o = gpugemm_no_inplace(*float16_shared)
    f = aesara.function([], o)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)

    # dot22
    float16_data = [rand(3, 3).astype("float16"), rand(3, 3).astype("float16")]
    float16_shared = [gpuarray_shared_constructor(val) for val in float16_data]
    o = gpu_dot22(*float16_shared)
    f = aesara.function([], o)
    x, y = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), np.dot(x, y))
class TestGpuSgemv(BaseGemv, utt.OptimizationTestMixin):
    """Run the shared gemv test battery against the GPU gemv ops in
    single precision."""

    mode = mode_with_gpu
    dtype = "float32"
    gemv = gpugemv_no_inplace
    gemv_inplace = gpugemv_inplace

    @staticmethod
    def shared(val):
        # Prefer a GPU shared variable; fall back to a plain one for
        # values the GPU constructor rejects.
        try:
            return gpuarray_shared_constructor(val)
        except TypeError:
            return aesara.shared(val)
# Generated test class comparing the CPU gemm_inplace against the GPU
# implementation on a grid of (z, alpha, x, y, beta) fixtures.
TestGpuGemm = makeTester(
    "GpuGemmTester",
    op=gemm_inplace,
    gpu_op=gpugemm_inplace,
    # float16 tested in test_float16
    cases=dict(
        test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
        test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
        test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
        test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
        test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
        test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
        test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
        test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
        float32=[
            rand(3, 4).astype("float32"),
            np.float32(-1.0),
            rand(3, 5).astype("float32"),
            rand(5, 4).astype("float32"),
            np.float32(-1.1),
        ],
        float64=[
            rand(3, 4).astype("float64"),
            np.float64(-1.0),
            rand(3, 5).astype("float64"),
            rand(5, 4).astype("float64"),
            np.float64(-1.1),
        ],
        # test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
        # test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
        # test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
        # test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
    ),
)
# Batched-gemm fixtures: for every strictly increasing (b, m, k, n) pulled
# from the pool below, build (z, alpha, x, y, beta) with z:(b,m,n),
# x:(b,m,k), y:(b,k,n).
gemm_batched_tests = {
    "test_b%im%ik%in%i"
    % (b, m, k, n): [rand(b, m, n), rand(), rand(b, m, k), rand(b, k, n), rand()]
    for b, m, k, n in itertools.combinations([2, 3, 5, 7, 11, 13], 4)
}
# The original code repeated the same five-element fixture literal three
# times, once per float width.  Build the per-dtype entries in one loop
# instead; the rand() draw order (and hence the generated data) is identical.
for _dtype in ("float16", "float32", "float64"):
    gemm_batched_tests[_dtype] = [
        rand(3, 4, 7).astype(_dtype),
        rand().astype(_dtype),
        rand(3, 4, 4).astype(_dtype),
        rand(3, 4, 7).astype(_dtype),
        rand().astype(_dtype),
    ]
del _dtype
# Generated test class comparing a reference alpha*BatchedDot(x, y) + beta*z
# graph on CPU against the in-place GPU batched gemm.
TestGpuGemmBatch = makeTester(
    "GpuGemmBatchTester",
    op=lambda z, alpha, x, y, beta: alpha * BatchedDot()(x, y) + beta * z,
    gpu_op=gpugemmbatch_inplace,
    cases=gemm_batched_tests,
)
class TestGpuGemmBatchStrided:
    def test_basic(self):
        # Regression test for a strided-input crash.
        # Reported in https://github.com/Theano/Theano/issues/5730
        x = tensor3()
        y = tensor3()
        z = batched_dot(x, y[:, 0, :, np.newaxis])
        f = aesara.function([x, y], z, mode=mode_with_gpu)
        x_num = np.arange(32 * 19 * 600, dtype=config.floatX).reshape((32, 19, 600))
        y_num = np.arange(7 * 32 * 600, dtype=config.floatX).reshape((32, 7, 600))
        f(x_num, y_num)
        # The batched gemm should have been made in-place by the optimizer.
        assert f.maker.fgraph.toposort()[-2].op.inplace
class TestGpuSger(TestGer):
    """Run the generic `ger` (rank-1 update) test-suite with the GPU ops."""

    def setup_method(self):
        self.mode = mode_with_gpu
        dtype = self.dtype = "float32"  # optimization isn't dtype-dependent

        self.A = tensor(dtype=dtype, broadcastable=(False, False))
        self.a = tensor(dtype=dtype, broadcastable=())
        self.x = tensor(dtype=dtype, broadcastable=(False,))
        self.y = tensor(dtype=dtype, broadcastable=(False,))

        self.ger_destructive = gpuger_inplace

        # data on the gpu make the op always inplace
        self.ger = gpuger_inplace
        self.gemm = gpugemm_inplace
        super().setup_method()
class TestGpuSgerNoTransfer(TestGpuSger):
    """Same as TestGpuSger, but with shared values created directly on the GPU."""

    shared = staticmethod(gpuarray_shared_constructor)
class TestGpuGer_OpContract(utt.OpContractTestMixin):
    """Op-contract checks (eq/hash/clone) for the GPU ger variants."""

    def setup_method(self):
        self.ops = [gpuger_no_inplace, gpuger_inplace]

    def clone(self, op):
        return GpuGer(inplace=op.inplace)
# Generated test class comparing the CPU _dot22 against the GPU version on a
# few matrix-product shapes (including unit dimensions).
TestGpuDot22 = makeTester(
    "GpuDot22Tester",
    op=_dot22,
    gpu_op=gpu_dot22,
    cases=dict(
        test1=[rand(3, 4), rand(4, 5)],
        test2=[rand(1, 4), rand(4, 5)],
        test3=[rand(3, 1), rand(1, 5)],
        test4=[rand(3, 4), rand(4, 1)],
        # test5=[rand(0, 4), rand(4, 5)],
        # test6=[rand(3, 0), rand(0, 5)],
        # test7=[rand(3, 4), rand(4, 0)],
        # test8=[rand(0, 4), rand(4, 0)],
        # test9=[rand(0, 0), rand(0, 0)],
    ),
)
def test_gemv_zeros():
    """A gemv with an empty inner dimension must return a zero vector."""
    W = matrix()
    v = vector()
    f = aesara.function([W, v], W.dot(v), mode=mode_with_gpu)

    # Apply to a (dim, 0) matrix and an empty vector of shape (0,).
    dim = 1000
    A = np.zeros((dim, 0), dtype=aesara.config.floatX)
    b = np.zeros((0,), dtype=aesara.config.floatX)
    tmp = f(A, b)
    assert np.allclose(tmp, np.zeros((dim,)))
def test_gemv_dot_strides():
    # Negative-stride operand for a GPU dot/gemv.
    # Reported in https://github.com/Theano/Theano/issues/6142
    xv = rand(5)
    yv = rand(5, 1)
    x = gpuarray_shared_constructor(xv)
    y = gpuarray_shared_constructor(yv, broadcastable=(False, True))
    f = aesara.function([], dot(x, y[::-1]), mode=mode_with_gpu)
    out = f()
    utt.assert_allclose(out, np.dot(xv, yv[::-1]))
tests/gpuarray/test_blocksparse.py
deleted
100644 → 0
View file @
c803c67e
import
numpy
as
np
import
pytest
import
aesara
import
tests.unittest_tools
as
utt
from
aesara.gpuarray.blocksparse
import
(
GpuSparseBlockGemv
,
GpuSparseBlockOuter
,
gpu_sparse_block_gemv
,
gpu_sparse_block_outer
,
)
from
aesara.gpuarray.type
import
gpuarray_shared_constructor
from
aesara.tensor.type
import
fmatrix
,
ftensor3
,
lmatrix
from
tests.gpuarray.config
import
mode_with_gpu
,
test_ctx_name
from
tests.tensor.nnet.test_blocksparse
import
TestBlockSparseGemvAndOuter
class TestBlockSparseGemvAndOuterGPUarray(TestBlockSparseGemvAndOuter):
    """Run the block-sparse gemv/outer test-suite with the GPU ops."""

    def setup_method(self):
        self.mode = mode_with_gpu.excluding("constant_folding")
        self.gemv_op = gpu_sparse_block_gemv
        self.outer_op = gpu_sparse_block_outer
        self.gemv_class = GpuSparseBlockGemv
        self.outer_class = GpuSparseBlockOuter
        super().setup_method()

    @pytest.mark.skip(
        reason="""
    This test is temporarily disabled since we disabled the output_merge
    and alpha_merge optimizations for blocksparse due to brokenness.
    Re-enable when those are re-added.
    """
    )
    def test_blocksparse_grad_merge(self):
        b = fmatrix()
        h = ftensor3()
        iIdx = lmatrix()
        oIdx = lmatrix()

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

        W = gpuarray_shared_constructor(W_val, context=test_ctx_name)
        o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        gW = aesara.grad(o.sum(), W)

        lr = np.asarray(0.05, dtype="float32")

        upd = W - lr * gW

        f1 = aesara.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode_with_gpu)

        # Make sure the lr update was merged.
        assert isinstance(f1.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter)

        # Exclude the merge optimizations.
        mode = mode_with_gpu.excluding("local_merge_blocksparse_alpha")
        mode = mode.excluding("local_merge_blocksparse_output")

        f2 = aesara.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)

        # Make sure the lr update is not merged.
        assert not isinstance(f2.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter)

        f2(h_val, iIdx_val, b_val, oIdx_val)
        W_ref = W.get_value()

        # reset the var
        W.set_value(W_val)
        f1(h_val, iIdx_val, b_val, oIdx_val)
        W_opt = W.get_value()

        utt.assert_allclose(W_ref, W_opt)
tests/gpuarray/test_cgpukernelbase.py
deleted
100644 → 0
View file @
c803c67e
import
numpy
as
np
import
pytest
import
aesara
from
aesara
import
config
from
aesara
import
tensor
as
at
from
aesara.gpuarray.basic_ops
import
CGpuKernelBase
from
aesara.gpuarray.type
import
GpuArrayType
,
get_context
,
gpu_context_type
from
aesara.gradient
import
grad_undefined
from
aesara.graph.basic
import
Apply
from
aesara.link.c.params_type
import
ParamsType
from
aesara.scalar
import
int32
as
int_t
class GpuEye(CGpuKernelBase):
    """Eye for GPU.

    This is an implementation to test that `CGpuKernelBase` works and also
    to use as an example in the docs.  It is not used for user graphs.
    """

    __props__ = ("dtype", "context_name")
    params_type = ParamsType(typecode=int_t, context=gpu_context_type)

    def __init__(self, dtype=None, context_name=None):
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name
        super().__init__(["c_code/tstgpueye.c"], "APPLY_SPECIFIC(tstgpueye)")

    def get_params(self, node):
        # Imported lazily so merely loading this Op from cache does not
        # initialize the GPU back-end.
        pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray")
        return self.params_type.get_params(
            typecode=pygpu_gpuarray.dtype_to_typecode(self.dtype),
            context=get_context(self.context_name),
        )

    def c_headers(self, **kwargs):
        return ["<gpuarray/types.h>", "<gpuarray/kernel.h>"]

    def make_node(self, n, m):
        n = at.as_tensor_variable(n)
        m = at.as_tensor_variable(m)
        assert n.ndim == 0
        assert m.ndim == 0
        otype = GpuArrayType(
            dtype=self.dtype,
            broadcastable=(False, False),
            context_name=self.context_name,
        )
        return Apply(self, [n, m], [otype()])

    def infer_shape(self, fgraph, node, in_shapes):
        # The output shape is exactly the two scalar inputs (n, m).
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        return [grad_undefined(self, i, inp[i]) for i in range(2)]
def test_cgpukernelbase():
    # Import inside the function to prevent the back-end from being
    # initialized when reloading the GpuEye object from cache.
    from .config import mode_with_gpu, test_ctx_name

    op = GpuEye(dtype="int32", context_name=test_ctx_name)

    f = aesara.function([], op(4, 5), mode=mode_with_gpu)
    r = f()

    assert r.dtype == "int32"
    assert (np.asarray(r) == np.eye(4, 5, dtype="int32")).all()
tests/gpuarray/test_ctc.py
deleted
100644 → 0
View file @
c803c67e
import
numpy
as
np
import
pytest
import
aesara
import
aesara.gpuarray
from
aesara.gpuarray.ctc
import
GpuConnectionistTemporalClassification
,
gpu_ctc
from
aesara.gradient
import
grad
from
aesara.tensor.math
import
mean
from
aesara.tensor.nnet.ctc
import
(
ConnectionistTemporalClassification
,
ctc
,
ctc_available
,
)
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
mode_without_gpu
from
tests.tensor.nnet.test_ctc
import
setup_ctc_case
,
setup_grad_case
,
setup_torch_case
@pytest.mark.skipif(not ctc_available(), reason="Optional library warp-ctc not available")
class TestCTC:
    """Compare the GPU warp-ctc Op against the CPU Op and reference values."""

    def check_ctc(self, activations, labels, input_length, expected_costs, expected_grads):
        # Create symbolic variables
        t_activations = aesara.shared(activations, name="activations")
        t_activation_times = aesara.shared(input_length, name="activation_times")
        t_labels = aesara.shared(labels, name="labels")

        inputs = [t_activations, t_labels, t_activation_times]

        # Execute several tests for each test case
        self.check_expected_values(
            t_activations, t_labels, t_activation_times, expected_costs, expected_grads
        )
        self.compare_gpu_and_cpu_values(*inputs)
        self.check_grads_disabled(*inputs)
        self.run_gpu_optimization_with_grad(*inputs)
        self.run_gpu_optimization_no_grad(*inputs)

    def setup_cpu_op(
        self, activations, labels, input_length, compute_grad=True, mode=mode_without_gpu
    ):
        cpu_ctc_cost = ctc(activations, labels, input_length)
        outputs = [cpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            cpu_ctc_grad = grad(mean(cpu_ctc_cost), activations)
            outputs += [cpu_ctc_grad]
        return aesara.function([], outputs, mode=mode)

    def setup_gpu_op(self, activations, labels, input_length, compute_grad=True):
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        outputs = [gpu_ctc_cost]
        if compute_grad:
            # Symbolic gradient of CTC cost
            gpu_ctc_grad = grad(mean(gpu_ctc_cost), activations)
            outputs += [gpu_ctc_grad]
        return aesara.function([], outputs, mode=mode_with_gpu)

    def check_expected_values(
        self, activations, labels, input_length, expected_costs, expected_grads
    ):
        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()
        # Transfer costs and gradients from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        grad_from_gpu = np.asarray(gpu_grad)
        # Check that results are in conformance with expected values
        utt.assert_allclose(expected_grads / cost_from_gpu.shape[0], grad_from_gpu)
        utt.assert_allclose(expected_costs, cost_from_gpu)

    def compare_gpu_and_cpu_values(self, activations, labels, input_length):
        cpu_train = self.setup_cpu_op(activations, labels, input_length)
        cpu_cost, cpu_grad = cpu_train()
        gpu_train = self.setup_gpu_op(activations, labels, input_length)
        gpu_cost, gpu_grad = gpu_train()
        # Transfer costs and gradients from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        grad_from_gpu = np.asarray(gpu_grad)
        # CPU and GPU implementations must agree
        utt.assert_allclose(cpu_grad, grad_from_gpu)
        utt.assert_allclose(cpu_cost, cost_from_gpu)

    def check_grads_disabled(self, activations, labels, input_length):
        """
        Check if optimization to disable gradients is working
        """
        gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
        gpu_ctc_function = aesara.function([], [gpu_ctc_cost])
        for node in gpu_ctc_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, GpuConnectionistTemporalClassification):
                assert node.op.compute_grad is False

    def run_gpu_optimization_with_grad(self, activations, labels, input_length):
        # Compile CPU function with optimization
        cpu_lifted_train = self.setup_cpu_op(
            activations, labels, input_length, mode=mode_with_gpu
        )
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_train)

    def run_gpu_optimization_no_grad(self, activations, labels, input_length):
        cpu_train = self.setup_cpu_op(activations, labels, input_length, compute_grad=False)
        cpu_cost = cpu_train()
        # Compile CPU function with optimization
        cpu_lifted_test = self.setup_cpu_op(
            activations, labels, input_length, compute_grad=False, mode=mode_with_gpu
        )
        # Check whether Op is lifted to the GPU
        assert self.has_only_gpu_op(cpu_lifted_test)
        gpu_cost = cpu_lifted_test()
        # Transfer costs from GPU memory to host
        cost_from_gpu = np.asarray(gpu_cost)
        # Compare values from CPU and GPU Ops
        utt.assert_allclose(cpu_cost, cost_from_gpu)

    def has_only_gpu_op(self, function):
        has_cpu_instance = False
        has_gpu_instance = False
        for node in function.maker.fgraph.apply_nodes:
            if isinstance(node.op, ConnectionistTemporalClassification):
                has_cpu_instance = True
            if isinstance(node.op, GpuConnectionistTemporalClassification):
                has_gpu_instance = True
        return has_gpu_instance and (not has_cpu_instance)

    # Test obtained from Torch tutorial at:
    # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
    def test_torch_case(self):
        (
            activations,
            labels,
            activation_times,
            expected_costs,
            expected_grads,
        ) = setup_torch_case()
        self.check_ctc(
            activations, labels, activation_times, expected_costs, expected_grads
        )

    def test_ctc(self):
        (
            activations,
            labels,
            input_length,
            expected_costs,
            expected_grads,
        ) = setup_ctc_case()
        self.check_ctc(activations, labels, input_length, expected_costs, expected_grads)

    def test_verify_grad(self):
        def ctc_op_functor(labels, in_lengths):
            def wrapper(acts):
                # Create auxiliary symbolic variables
                t_activation_times = aesara.shared(in_lengths, name="activation_times")
                t_labels = aesara.shared(labels, name="labels")
                return gpu_ctc(acts, t_labels, t_activation_times)

            return wrapper

        activations, labels, activation_times = setup_grad_case()

        ctc_op = ctc_op_functor(labels, activation_times)

        utt.verify_grad(ctc_op, [activations], mode=mode_with_gpu)
tests/gpuarray/test_dnn.py
deleted
100644 → 0
View file @
c803c67e
This source diff could not be displayed because it is too large. You can
view the blob
instead.
tests/gpuarray/test_elemwise.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
pytest
import
aesara
import
aesara.scalar
as
aes
import
aesara.tensor
as
at
pygpu
=
pytest
.
importorskip
(
"pygpu"
)
gpuarray
=
pygpu
.
ndgpuarray
from
copy
import
copy
from
aesara.compile.debugmode
import
DebugMode
from
aesara.compile.mode
import
Mode
from
aesara.gpuarray.dnn
import
GpuDnnReduction
from
aesara.gpuarray.elemwise
import
(
GpuCAReduceCPY
,
GpuCAReduceCuda
,
GpuDimShuffle
,
GpuElemwise
,
GpuErfcinv
,
GpuErfinv
,
)
from
aesara.gpuarray.type
import
GpuArrayType
,
get_context
,
gpuarray_shared_constructor
from
aesara.link.basic
import
PerformLinker
from
aesara.link.c.basic
import
CLinker
from
aesara.tensor.math
import
erfcinv
,
erfinv
,
mul
,
tanh
from
aesara.tensor.type
import
bvector
,
float_dtypes
,
fmatrix
,
fvector
,
vector
from
tests.gpuarray.config
import
mode_with_gpu
,
mode_without_gpu
,
test_ctx_name
from
tests.gpuarray.test_basic_ops
import
rand_gpuarray
from
tests.tensor
import
test_elemwise
from
tests.unittest_tools
import
assert_allclose
# This is actually a test for GpuElemwise
class TestGpuBroadcast(test_elemwise.TestBroadcast):
    cop = GpuElemwise
    ctype = GpuArrayType
    # The order is important
    linkers = [PerformLinker, CLinker]

    def rand_cval(self, shp):
        # Random constant values are created directly on the GPU.
        return rand_gpuarray(*shp, cls=gpuarray)
def test_elemwise_pow():
    # Test that GpuElemwise(pow) can compile with any combination of integer
    # or float input dtype.
    dtypes = [
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "int8",
        "int16",
        "int32",
        "int64",
        "float16",
        "float32",
        "float64",
    ]
    for dtype_base in dtypes:
        for dtype_exp in dtypes:
            # Compile a gpu function with the specified dtypes
            base_val = np.random.randint(0, 5, size=10).astype(dtype_base)
            exp_val = np.random.randint(0, 3, size=10).astype(dtype_exp)

            base = vector(dtype=dtype_base)
            exp = gpuarray_shared_constructor(exp_val)
            assert exp.dtype == dtype_exp
            output = base ** exp
            f = aesara.function([base], output, mode=mode_with_gpu)

            # We don't transfer to the GPU when the output dtype is int*
            n = len(
                [n for n in f.maker.fgraph.apply_nodes if isinstance(n.op, GpuElemwise)]
            )
            assert n == (output.dtype in float_dtypes)

            # Call the function to make sure the output is valid
            out = f(base_val)
            expected_out = base_val ** exp_val
            assert_allclose(out, expected_out)
class TestMathErrorFunctions:
    """Compare the GPU and host implementations of erfinv/erfcinv against
    scipy references, on a large array covering (and exceeding) both domains.

    The erfinv and erfcinv tests were verbatim copies of each other except
    for the op under test; the shared body now lives in
    `_check_elemwise_inverse_erf`.
    """

    dtypes = ["float64", "float32", "float16"]
    # Per-dtype fixtures, filled in once by setup_class.
    default_arrays = {}
    expected_erfinv_outputs = {}
    expected_erfcinv_outputs = {}

    @classmethod
    def setup_class(cls):
        scipy_special = pytest.importorskip("scipy.special")
        # NB: erfinv is defined in ]-1;1[, and erfcinv is defined in ]0;2[,
        # so we just take some values in an interval that covers both domains
        # (this will also allow to test some values outside the domains).
        # We take [-5;5[ by default and we concatenate it 1000 times
        # to have the GPU ops run on large data.
        default_array = [x / 10.0 for x in range(-50, 50)] * 1000
        for dtype in cls.dtypes:
            numpy_array = np.asarray(default_array, dtype=dtype)
            cls.default_arrays[dtype] = numpy_array
            cls.expected_erfinv_outputs[dtype] = scipy_special.erfinv(numpy_array)
            cls.expected_erfcinv_outputs[dtype] = scipy_special.erfcinv(numpy_array)
        # Since there are infinite values, we need to disable that check
        # in DebugMode if needed
        if isinstance(mode_with_gpu, DebugMode):
            cls.mode_with_gpu = copy(mode_with_gpu)
            cls.mode_with_gpu.check_isfinite = False
        else:
            cls.mode_with_gpu = mode_with_gpu
        if isinstance(mode_without_gpu, DebugMode):
            cls.mode_without_gpu = copy(mode_without_gpu)
            cls.mode_without_gpu.check_isfinite = False
        else:
            cls.mode_without_gpu = mode_without_gpu

    def check_gpu_scalar_op(self, aesara_function, scalar_optype):
        # Return True iff the compiled graph contains a GpuElemwise whose
        # scalar op is an instance of `scalar_optype`; debug-print the graph
        # otherwise so the failure is diagnosable.
        for node in aesara_function.maker.fgraph.apply_nodes:
            if isinstance(node.op, GpuElemwise) and isinstance(
                node.op.scalar_op, scalar_optype
            ):
                return True
        aesara.printing.debugprint(aesara_function)
        return False

    def _check_elemwise_inverse_erf(self, op, gpu_scalar_op, op_name, expected_outputs):
        """Shared body of the erfinv/erfcinv element-wise tests.

        `op` is the symbolic op under test, `gpu_scalar_op` the GPU scalar op
        class expected in the compiled graph, `op_name` the label used in the
        compiled-function names, and `expected_outputs` the per-dtype scipy
        reference values.
        """
        for dtype in self.dtypes:
            vec = vector(dtype=dtype)
            output = op(vec)
            f_host = aesara.function(
                [vec],
                output,
                name="HOST/" + op_name + "/" + dtype,
                mode=self.mode_without_gpu,
            )
            f_gpu = aesara.function(
                [vec],
                output,
                name="GPU/" + op_name + "/" + dtype,
                mode=self.mode_with_gpu,
            )
            # The host graph must not contain any GPU element-wise node.
            assert (
                len(
                    [
                        n
                        for n in f_host.maker.fgraph.apply_nodes
                        if isinstance(n.op, GpuElemwise)
                    ]
                )
                == 0
            )
            if not aesara.config.device.startswith("opencl"):
                assert self.check_gpu_scalar_op(f_gpu, gpu_scalar_op), (
                    f'Function graph does not contains scalar op "{gpu_scalar_op.__name__}".'
                )
            vector_val = self.default_arrays[dtype]
            f_host(vector_val)
            f_gpu(vector_val)
            out_host = f_host(vector_val)
            out_gpu = f_gpu(vector_val)
            assert_allclose(out_host, out_gpu)
            assert_allclose(expected_outputs[dtype], out_gpu)

    def test_elemwise_erfinv(self):
        self._check_elemwise_inverse_erf(
            erfinv, GpuErfinv, "erfinv", self.expected_erfinv_outputs
        )

    def test_elemwise_erfcinv(self):
        self._check_elemwise_inverse_erf(
            erfcinv, GpuErfcinv, "erfcinv", self.expected_erfcinv_outputs
        )
class TestFloat16:
    """Compile and run element-wise graphs mixing float16 with other dtypes."""

    def test_composite_elemwise_float16(self):
        # A composite mixing float16 with int8/float32 casts must compile.
        w = bvector()
        x = vector(dtype="float16")
        y = fvector()

        cz = tanh(x + at.cast(y, "float16"))
        o = (
            cz
            - cz ** 2
            + at.cast(x, "int16")
            + at.cast(x, "float32")
            + at.cast(w, "float16")
            - at.constant(np.float16(1.0))
        )
        aesara.function([w, x, y], o, mode=mode_with_gpu)

        # A switch over a float16 product must also compile.
        v = vector(dtype="uint8")
        w = vector(dtype="float16")
        x = vector(dtype="float16")
        y = vector(dtype="float16")
        z = vector(dtype="float16")
        o = at.switch(v, mul(w, x, y), z)
        aesara.function([v, w, x, y, z], o, mode=mode_with_gpu)

    def test_cast_float16(self):
        f16 = vector(dtype="float16")
        f32 = fvector()
        i8 = bvector()
        f = aesara.function(
            [f16, f32, i8],
            [
                f16.astype("float32"),
                f32.astype("float16"),
                f32.astype("float64"),
                f16.astype("int8"),
                f32.astype("int8"),
                i8.astype("float16"),
                i8.astype("float32"),
            ],
            mode=mode_with_gpu,
        )

        d1 = (np.random.rand(4) * 10).astype("float16")
        d2 = (np.random.rand(5) * 10).astype("float32")
        d3 = (np.random.rand(6) * 10).astype("int8")

        res = f(d1, d2, d3)

        for i, out in enumerate(f.outputs):
            dtype = out.variable.dtype
            assert res[i].dtype == dtype
            # Match each output back to the input it was cast from.
            inp = out.variable.owner.inputs[0]
            if inp.dtype == "float16":
                d = d1
            elif inp.dtype == "float32":
                d = d2
            else:
                d = d3
            assert_allclose(d.astype(dtype), res[i])
class TestGpuDimShuffle(test_elemwise.TestDimShuffle):
    """Run the generic DimShuffle test-suite against the GPU op."""

    op = GpuDimShuffle
class TestGpuCAReduceCPY(test_elemwise.TestCAReduce):
    """Run the generic CAReduce test-suite against the GPU (CPY) reduction op.

    The four linker/NaN test variants previously repeated the same nested
    loops verbatim; they now delegate to `_run_with_linker`.
    """

    dtypes = ["float32"]
    bin_dtypes = ["uint8", "int8"]
    op = GpuCAReduceCPY
    reds = [aes.add, aes.mul]
    pre_scalar_op = None
    mode = mode_with_gpu

    def _run_with_linker(self, linker, test_nan=False):
        # Shared driver for the perform/c (+nan) variants.  The non-NaN path
        # calls `with_mode` without a `test_nan` argument exactly like the
        # original code did, so its default behavior is preserved.
        if test_nan:
            # NaN semantics only make sense for float dtypes.
            for dtype in self.dtypes:
                if not dtype.startswith("float"):
                    continue
                for op in self.reds:
                    self.with_mode(
                        Mode(linker=linker, optimizer=mode_with_gpu.optimizer),
                        op,
                        dtype=dtype,
                        test_nan=True,
                        pre_scalar_op=self.pre_scalar_op,
                    )
        else:
            for dtype in self.dtypes + self.bin_dtypes:
                for op in self.reds:
                    self.with_mode(
                        Mode(linker=linker, optimizer=mode_with_gpu.optimizer),
                        op,
                        dtype=dtype,
                        pre_scalar_op=self.pre_scalar_op,
                    )

    def test_perform(self):
        self._run_with_linker("py")

    def test_perform_nan(self):
        self._run_with_linker("py", test_nan=True)

    def test_c(self):
        self._run_with_linker("c")

    def test_c_nan(self):
        self._run_with_linker("c", test_nan=True)

    def test_infer_shape(self):
        for dtype in self.dtypes:
            super().test_infer_shape(dtype)
class TestGpuCAReduceCuda(TestGpuCAReduceCPY):
    """CAReduce tests for the CUDA-specific reduction kernels.

    Each entry in `cases` is ``(input_shape, axes)``; the trailing bit-mask
    comments (one bit per dimension, 1 = reduced) label the kernel pattern a
    case exercises.
    """

    dtypes = ["float32", "int64"]
    bin_dtypes = ["uint8", "int8"]

    cases = [
        ((5, 6), None),
        ((5, 6), (0, 1)),
        ((5, 6), (0,)),
        ((5, 6), (1,)),
        ((5, 6), (-1,)),
        ((5, 6), (-2,)),
        # ((5, 6), ()), #reduce on no axis(copy) isn't implemented
        # ((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
        # ((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
        ((5, 0), None),
        ((5, 0), (0,)),
        ((5, 0), (1,)),
        # ((5, 0), ()), reduce on no axis isn't implemented
        # ((), None), reduce on no axis isn't implemented
        # ((), ()) reduce on no axis isn't implemented
        # Test all GPU cases implemented
        ((1, 0), (1,)),
        ((0, 1), (1,)),
        ((0, 0), (1,)),
        ((0, 0, 0), (1, 2)),
        ((0, 0, 0, 0), (1, 2, 3)),
        ((2, 1), (1,)),
        ((1, 2), (1,)),
        ((100, 3, 1300), [1]),
        ((0,), [0]),
        ((5,), [0]),
        ((0, 0), [0, 1]),
        ((1, 0), [0, 1]),
        ((5, 4), [0, 1]),
        ((33, 31), [0, 1]),
        ((5, 4), [1]),
        ((5, 4), [0]),
        # need something bigger then 32 for some opt test.
        ((5, 4, 3), [0]),
        ((5, 4, 3), [1]),
        ((5, 4, 3), [0, 1]),
        ((5, 4, 3), [2]),
        ((5, 4, 3), [1, 2]),
        ((5, 4, 3), [0, 1, 2]),
        ((0, 0, 0, 0), [0, 1, 2, 3]),
        ((5, 4, 3, 20), [2, 3]),
        ((5, 4, 3, 2), [0, 1, 2, 3]),
        ((5, 4, 3, 2), [0, 2, 3]),
        ((5, 4, 3, 2), [1, 2, 3]),
        # test shape bigger then 4096 on each dimension to make sure that we
        # work correctly when we don't have enough thread/block in each
        # dimensions
        ((4100, 3), [0]), ((3, 4101), [0]),  # 10
        ((1024, 33), [0]), ((33, 1024), [0]),  # 10
        ((1025, 33), [0]), ((33, 1025), [0]),  # 10
        ((4100, 3), [1]), ((3, 4101), [1]),  # 01
        ((1024, 33), [1]), ((33, 1024), [1]),  # 01
        ((1025, 33), [1]), ((33, 1025), [1]),  # 01
        ((4100, 3), [0, 1]), ((3, 4101), [0, 1]),  # 11
        ((1024, 33), [0, 1]), ((33, 1024), [0, 1]),  # 01
        ((1025, 33), [0, 1]), ((33, 1025), [0, 1]),  # 01
        ((4100, 4, 3), [0]), ((5, 4100, 3), [0]), ((5, 4, 4100), [0]), ((3, 65536, 1), [0]),  # 100
        ((4100, 4, 3), [1]), ((5, 4100, 3), [1]), ((5, 4, 4100), [1]),  # 010
        ((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]),  # 001
        ((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]),  # 110
        ((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]),  # 011
        ((4100, 4, 3), [0, 2]), ((5, 4100, 3), [0, 2]), ((5, 4, 4100), [0, 2]),  # 101
        ((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]),  # 111
        ((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]),  # 111
        # reduce over 2d
        ((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]),  # 0011
        ((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]),  # 0101
        # Masks 0110 and 1001 are handled by reshape and 1010/1100 are either
        # unimplemented or handled by reshape, so those 2d families are
        # intentionally absent here.
        # reduce over 3d
        # 3d not tested: 1101, 1110, 1111 (handled by reshape where supported)
        ((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]),
        # ((4,3,2,4100),[0,2,3]), # 1011
        ((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]),  # 0111
        ((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]),  # 0111
        # reduce over 4d
        ((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]),
        ((128, 1, 3, 3), [0, 1, 2, 3]),  # 1111
        # Patterns implemented through reshape (single-axis 1000/0100/0010/0001
        # and 5-d full reductions) are skipped here: this suite tests the op
        # directly, not the reshape optimization.
    ]

    op = GpuCAReduceCuda
    reds = [aes.add, aes.mul, aes.scalar_maximum, aes.scalar_minimum]
    pre_scalar_op = None

    # The python-linker variants were disabled in the original suite
    # (no-op overrides); kept as-is.
    def test_perform_noopt(self):
        return

    def test_perform(self):
        return

    def test_perform_nan(self):
        return

    def setup_method(self):
        super().setup_method()
        if get_context(test_ctx_name).kind != b"cuda":
            pytest.skip("Cuda specific tests")
class TestGpuReduceDtype(test_elemwise.TestReduceDtype):
    """Run the reduce-dtype test-suite against the CUDA reduction ops."""

    mode = mode_with_gpu.excluding("local_cut_useless_reduce")

    # GpuDnnReduction doesn't cover all cases, but should cover some
    op = (GpuCAReduceCuda, GpuDnnReduction)
    # Currently we don't support reduction on 0 axis
    # NOTE(review): `1` appears twice in this list in the original code
    # (possibly `-1` was intended); kept unchanged to preserve behavior.
    axes = [None, 0, 1, 1, [0], [1], [0, 1]]
    # We don't support complex dtype
    dtypes = [
        "int8",
        "int16",
        "int32",
        "int64",
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "float32",
        "float64",
    ]

    def setup_method(self):
        if get_context(test_ctx_name).kind != b"cuda":
            pytest.skip("Cuda specific tests")
def speed_reduce10():
    # Manual benchmark helper (not collected by pytest): sum a 1000x1000
    # float32 matrix along axis 0, both directly and transposed.
    data = np.random.rand(1000, 1000).astype("float32")
    m = fmatrix()
    f = aesara.function(
        [m], [m.sum(axis=0), m.T.sum(axis=0)], mode=mode_with_gpu
    )
    f(data)
tests/gpuarray/test_extra_ops.py
deleted
100644 → 0
View file @
c803c67e
from
functools
import
partial
from
itertools
import
product
import
numpy
as
np
import
pytest
import
aesara
import
aesara.tensor.math
as
tm
from
aesara.gpuarray.extra_ops
import
GpuCumOp
from
aesara.gpuarray.type
import
get_context
from
aesara.tensor.extra_ops
import
CumOp
from
aesara.tensor.type
import
fmatrix
,
ftensor3
,
ftensor4
,
fvector
,
tensor3
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
test_ctx_name
from
tests.tensor.test_extra_ops
import
TestCumOp
class TestGpuCumOp(TestCumOp):
    """GPU version of the ``CumOp`` (cumsum/cumprod) test suite.

    Fixes over the removed original:

    * ``@pytest.mark.parametrized`` was a typo for
      ``@pytest.mark.parametrize`` — with the typo, pytest treats it as an
      unknown mark and the tests error out with "fixture 'mode' not found".
    * ``a[slices]`` indexed an ndarray with a *list* of slices, which is
      deprecated (and later an error) in NumPy; a tuple is used instead.
    """

    mode = mode_with_gpu

    def setup_method(self):
        """Record GPU limits and relax the float32 tolerance."""
        super().setup_method()
        test_ctx = get_context(test_ctx_name)
        if test_ctx.kind != b"cuda":
            pytest.skip("Cuda specific tests")
        self.max_threads_dim0 = test_ctx.maxlsize0
        self.max_grid_size1 = test_ctx.maxgsize2
        self.op_class = CumOp

        # The CPU implementation is not so accurate, which throws out DebugMode.
        # Since propagating .tag.values_eq_approx to the output of every
        # GpuFromHost seems overkill, we just relax the rtol for these tests
        self.old_rtol = tm.float32_rtol
        tm.float32_rtol *= 2

    def teardown_method(self):
        super().teardown_method()
        # Restore rtol
        tm.float32_rtol = self.old_rtol

    @pytest.mark.skipif(
        aesara.config.floatX != "float32",
        reason=f"Gpucumop not implemented for dtype {aesara.config.floatX}",
    )
    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_infer_shape(self, mode):
        """Shape inference must hold for every (possibly negative) axis."""
        op_class = partial(self.op_class, mode=mode)
        x = tensor3("x")
        a = np.random.random((3, 5, 2)).astype(aesara.config.floatX)

        for axis in range(-len(a.shape), len(a.shape)):
            self._compile_and_check([x], [op_class(axis=axis)(x)], [a], GpuCumOp)

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_Strides1D(self, mode):
        """1-d inputs with normal, stepped and negative strides."""
        op_class = partial(self.op_class, mode=mode)
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        x = fvector("x")

        for axis in (0, None, -1):
            a = np.random.random((42,)).astype("float32")
            cumop_function = aesara.function(
                [x], op_class(axis=axis)(x), mode=self.mode
            )

            slicings = [
                slice(None, None, None),  # Normal strides
                slice(None, None, 2),  # Stepped strides
                slice(None, None, -1),  # Negative strides
            ]

            # Cartesian product of all slicings to test.
            for slicing in product(slicings, repeat=x.ndim):
                f = aesara.function(
                    [x], op_class(axis=axis)(x[slicing]), mode=self.mode
                )
                # The graph must actually contain the GPU op.
                assert [
                    n
                    for n in f.maker.fgraph.toposort()
                    if isinstance(n.op, GpuCumOp)
                ]
                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
                utt.assert_allclose(
                    np_func(a[slicing], axis=axis), cumop_function(a[slicing])
                )

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_Strides2D(self, mode):
        """2-d inputs with normal, stepped and negative strides."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        x = fmatrix("x")

        for axis in (0, 1, None, -1, -2):
            a = np.random.random((42, 30)).astype("float32")
            cumop_function = aesara.function(
                [x], op_class(axis=axis)(x), mode=self.mode
            )

            slicings = [
                slice(None, None, None),  # Normal strides
                slice(None, None, 2),  # Stepped strides
                slice(None, None, -1),  # Negative strides
            ]

            # Cartesian product of all slicings to test.
            for slicing in product(slicings, repeat=x.ndim):
                f = aesara.function(
                    [x], op_class(axis=axis)(x[slicing]), mode=self.mode
                )
                assert [
                    n
                    for n in f.maker.fgraph.toposort()
                    if isinstance(n.op, GpuCumOp)
                ]
                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
                utt.assert_allclose(
                    np_func(a[slicing], axis=axis), cumop_function(a[slicing])
                )

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_Strides3D(self, mode):
        """3-d inputs with normal, stepped and negative strides."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        x = ftensor3("x")

        for axis in (0, 1, 2, None, -1, -2, -3):
            a = np.random.random((42, 30, 25)).astype("float32")
            cumop_function = aesara.function(
                [x], op_class(axis=axis)(x), mode=self.mode
            )

            slicings = [
                slice(None, None, None),  # Normal strides
                slice(None, None, 2),  # Stepped strides
                slice(None, None, -1),  # Negative strides
            ]

            # Cartesian product of all slicings to test.
            for slicing in product(slicings, repeat=x.ndim):
                f = aesara.function(
                    [x], op_class(axis=axis)(x[slicing]), mode=self.mode
                )
                assert [
                    n
                    for n in f.maker.fgraph.toposort()
                    if isinstance(n.op, GpuCumOp)
                ]
                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
                utt.assert_allclose(
                    np_func(a[slicing], axis=axis), cumop_function(a[slicing])
                )

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp1D(self, mode):
        """1-d cumop, including multi-threadblock and recursive kernels."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2

        x = fvector("x")
        f = aesara.function([x], op_class(axis=0)(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)]

        # Extensive testing for the first 1025 sizes
        a = np.random.random(1025).astype("float32")
        for i in range(a.shape[0]):
            utt.assert_allclose(np_func(a[:i]), f(a[:i]))

        # Use multiple GPU threadblocks
        a = np.random.random((block_max_size + 2,)).astype("float32")
        utt.assert_allclose(np_func(a), f(a))

        # Use recursive cumop
        a = np.ones((block_max_size * (block_max_size + 1) + 2,), dtype="float32")
        utt.assert_allclose(np_func(a), f(a))

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp2D(self, mode):
        """2-d cumop over every axis, including grid/threadblock limits."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2

        x = fmatrix("x")
        for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]):
            f = aesara.function([x], op_class(axis=axis)(x), mode=self.mode)
            assert [
                n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)
            ]

            # Extensive testing for the first 1025 sizes
            a_shape = [5, 5]
            a_shape[shape_axis] = 1025
            a = np.random.random(a_shape).astype("float32")
            slices = [slice(None), slice(None)]
            for i in range(a.shape[shape_axis]):
                slices[shape_axis] = slice(i)
                # Index with a tuple: list-of-slices indexing is deprecated
                # in NumPy.
                fa = f(a[tuple(slices)])
                npa = np_func(a[tuple(slices)], axis=axis)
                utt.assert_allclose(npa, fa)

            # Use multiple GPU threadblocks
            a_shape = [5, 5]
            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))

            # Use multiple GPU gridblocks
            a_shape = [4, 4]
            a_shape[1 - shape_axis] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a), rtol=5e-5)

            # Use recursive cumop
            a_shape = [3, 3]
            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
            utt.assert_allclose(np_func(a, axis=axis), f(a))

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp3D(self, mode):
        """3-d cumop over every axis, including grid/threadblock limits."""
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2

        x = ftensor3("x")
        for shape_axis, axis in zip(
            [0, 1, 2, 0, 2, 1, 0], [0, 1, 2, None, -1, -2, -3]
        ):
            f = aesara.function([x], op_class(axis=axis)(x), mode=self.mode)
            assert [
                n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)
            ]

            # Extensive testing for the first 1025 sizes
            a_shape = [5, 5, 5]
            a_shape[shape_axis] = 1025
            a = np.random.rand(*a_shape).astype("float32")
            slices = [slice(None), slice(None), slice(None)]
            for i in range(a.shape[shape_axis]):
                slices[shape_axis] = slice(i)
                # Tuple indexing, as above.
                fa = f(a[tuple(slices)])
                npa = np_func(a[tuple(slices)], axis=axis)
                utt.assert_allclose(npa, fa)

            # Use multiple GPU threadblocks (along accumulation axis)
            a_shape = [2, 2, 2]
            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))

            # Use multiple GPU gridblocks (not along accumulation axis)
            a_shape = [5, 5, 5]
            a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))

            a_shape = [5, 5, 5]
            a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))

            # Use recursive cumop (along accumulation axis)
            a_shape = [3, 3, 3]
            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
            a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
            utt.assert_allclose(np_func(a, axis=axis), f(a))

    @pytest.mark.parametrize("mode", ["mul", "add"])
    def test_GpuCumOp4D(self, mode):
        """4-d input: no GPU kernel exists, so the CPU op must remain."""
        op_class = partial(self.op_class, mode=mode)
        # Should not use the GPU version.
        x = ftensor4("x")
        f = aesara.function([x], op_class(axis=1)(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort() if isinstance(n.op, CumOp)]
tests/gpuarray/test_fft.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
pytest
import
aesara
import
aesara.gpuarray.fft
from
aesara.gpuarray.fft
import
pycuda_available
,
pygpu_available
,
skcuda_available
from
aesara.tensor.type
import
matrix
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
# Skip tests if pygpu is not available.
# These guards run at import time: if any of the optional GPU/CUDA
# packages is missing, the whole module is skipped rather than erroring.
if not pygpu_available:  # noqa
    pytest.skip("Optional package pygpu not available", allow_module_level=True)
if not skcuda_available:  # noqa
    pytest.skip("Optional package scikit-cuda not available", allow_module_level=True)
if not pycuda_available:  # noqa
    pytest.skip("Optional package pycuda not available", allow_module_level=True)


# Transform sizes
N = 32
class TestFFT:
    """Tests for the GPU real FFT ops (``curfft``/``cuirfft``) against the
    NumPy reference implementations.

    All forward results come back as a real tensor with a trailing
    dimension of size 2 holding (real, imaginary) parts.
    """

    def test_1Dfft(self):
        """Round-trip a (1, N) signal through curfft/cuirfft and check the
        gradients of both ops."""
        inputs_val = np.random.random((1, N)).astype("float32")

        x = matrix("x", dtype="float32")
        rfft = aesara.gpuarray.fft.curfft(x)
        f_rfft = aesara.function([x], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft(inputs_val)
        # Recombine the trailing (real, imag) pair into a complex array.
        res_rfft_comp = np.asarray(res_rfft[:, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, 1]
        )

        rfft_ref = np.fft.rfft(inputs_val, axis=1)

        utt.assert_allclose(rfft_ref, res_rfft_comp)

        # Inverse transform must recover the original input.
        m = rfft.type()
        irfft = aesara.gpuarray.fft.cuirfft(m)
        f_irfft = aesara.function([m], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft(res_rfft)

        utt.assert_allclose(inputs_val, np.asarray(res_irfft))

        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1

        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp)

        inputs_val = np.random.random((1, N)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)

        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp)

        inputs_val = np.random.random((1, N // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)

    def test_rfft(self):
        """2-d forward transform of a batched (1, N, N) input vs np.fft.rfftn."""
        inputs_val = np.random.random((1, N, N)).astype("float32")
        inputs = aesara.shared(inputs_val)

        rfft = aesara.gpuarray.fft.curfft(inputs)
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )

        rfft_ref = np.fft.rfftn(inputs_val, axes=(1, 2))

        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)

    def test_irfft(self):
        """2-d inverse transform: curfft->cuirfft round-trip, and cuirfft
        of random (real, imag) data vs np.fft.irfftn."""
        inputs_val = np.random.random((1, N, N)).astype("float32")
        inputs = aesara.shared(inputs_val)

        fft = aesara.gpuarray.fft.curfft(inputs)
        f_fft = aesara.function([], fft, mode=mode_with_gpu)
        res_fft = f_fft()

        m = fft.type()
        ifft = aesara.gpuarray.fft.cuirfft(m)
        f_ifft = aesara.function([m], ifft, mode=mode_with_gpu)
        res_ifft = f_ifft(res_fft)

        utt.assert_allclose(inputs_val, np.asarray(res_ifft))

        inputs_val = np.random.random((1, N, N, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)

        irfft = aesara.gpuarray.fft.cuirfft(inputs)
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
        inputs_ref = inputs_val[..., 0] + inputs_val[..., 1] * 1j

        irfft_ref = np.fft.irfftn(inputs_ref, axes=(1, 2))

        utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)

    def test_type(self):
        """The GPU FFT ops only accept float32; float64 must be rejected."""
        inputs_val = np.random.random((1, N)).astype("float64")
        inputs = aesara.shared(inputs_val)

        with pytest.raises(AssertionError):
            aesara.gpuarray.fft.curfft(inputs)
        with pytest.raises(AssertionError):
            aesara.gpuarray.fft.cuirfft(inputs)

    def test_norm(self):
        """Check the 'ortho' (unitary) and 'no_norm' scaling conventions for
        both forward and inverse transforms."""
        inputs_val = np.random.random((1, N, N)).astype("float32")
        inputs = aesara.shared(inputs_val)

        # Unitary normalization
        rfft = aesara.gpuarray.fft.curfft(inputs, norm="ortho")
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )

        rfft_ref = np.fft.rfftn(inputs_val, axes=(1, 2))

        # For an NxN transform, the unitary result is the plain one over N.
        utt.assert_allclose(rfft_ref / N, res_rfft_comp, atol=1e-4, rtol=1e-4)

        # No normalization
        rfft = aesara.gpuarray.fft.curfft(inputs, norm="no_norm")
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )

        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)

        # Inverse FFT inputs
        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)
        inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]

        # Unitary normalization inverse FFT
        irfft = aesara.gpuarray.fft.cuirfft(inputs, norm="ortho")
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()

        irfft_ref = np.fft.irfftn(inputs_ref, axes=(1, 2))

        utt.assert_allclose(irfft_ref * N, res_irfft, atol=1e-4, rtol=1e-4)

        # No normalization inverse FFT
        irfft = aesara.gpuarray.fft.cuirfft(inputs, norm="no_norm")
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()

        utt.assert_allclose(irfft_ref * N ** 2, res_irfft, atol=1e-4, rtol=1e-4)

    def test_grad(self):
        """Numerical gradient checks for curfft/cuirfft, with and without
        normalization."""
        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1

        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp)

        inputs_val = np.random.random((1, N, N)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)

        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp)

        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)

        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp, norm="ortho")

        inputs_val = np.random.random((1, N, N)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)

        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp, norm="no_norm")

        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)

    def test_odd(self):
        """Same round-trip/normalization/gradient checks with an odd
        transform size (M = N - 1), which requires ``is_odd=True`` on the
        inverse."""
        M = N - 1
        inputs_val = np.random.random((1, M, M)).astype("float32")
        inputs = aesara.shared(inputs_val)

        rfft = aesara.gpuarray.fft.curfft(inputs)
        f_rfft = aesara.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
            res_rfft[:, :, :, 1]
        )

        rfft_ref = np.fft.rfftn(inputs_val, s=(M, M), axes=(1, 2))

        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)

        m = rfft.type()
        ifft = aesara.gpuarray.fft.cuirfft(m, is_odd=True)
        f_ifft = aesara.function([m], ifft, mode=mode_with_gpu)
        res_ifft = f_ifft(res_rfft)

        utt.assert_allclose(inputs_val, np.asarray(res_ifft))

        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)

        irfft = aesara.gpuarray.fft.cuirfft(inputs, norm="ortho", is_odd=True)
        f_irfft = aesara.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
        inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]

        irfft_ref = np.fft.irfftn(inputs_ref, s=(M, M), axes=(1, 2)) * M

        utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)

        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1

        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp)

        inputs_val = np.random.random((1, M, M)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)

        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp, is_odd=True)

        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)

        def f_rfft(inp):
            return aesara.gpuarray.fft.curfft(inp, norm="ortho")

        inputs_val = np.random.random((1, M, M)).astype("float32")
        utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)

        def f_irfft(inp):
            return aesara.gpuarray.fft.cuirfft(inp, norm="no_norm", is_odd=True)

        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
        utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)

    def test_params(self):
        """Invalid ``norm``/``is_odd`` values must raise ``ValueError``."""
        inputs_val = np.random.random((1, N)).astype("float32")
        inputs = aesara.shared(inputs_val)

        with pytest.raises(ValueError):
            aesara.gpuarray.fft.curfft(inputs, norm=123)

        inputs_val = np.random.random((1, N // 2 + 1, 2)).astype("float32")
        inputs = aesara.shared(inputs_val)

        with pytest.raises(ValueError):
            aesara.gpuarray.fft.cuirfft(inputs, norm=123)
        with pytest.raises(ValueError):
            aesara.gpuarray.fft.cuirfft(inputs, is_odd=123)
tests/gpuarray/test_gemmcorr.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
aesara
from
aesara.configdefaults
import
config
from
aesara.gpuarray.blas
import
GpuCorrMM
,
GpuCorrMM_gradInputs
,
GpuCorrMM_gradWeights
from
aesara.gpuarray.type
import
gpuarray_shared_constructor
from
aesara.tensor.nnet.corr
import
CorrMM
,
CorrMM_gradInputs
,
CorrMM_gradWeights
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
mode_without_gpu
,
ref_cast
from
tests.tensor.nnet.test_abstract_conv
import
(
TestAsymmetricPadding
,
TestCausalConv
,
TestGroupedConvNoOptim
,
TestUnsharedConv
,
)
class TestCorrMM:
    """Compare the GPU GEMM-based correlation ops against the CPU ``CorrMM``
    reference ops.

    Improvement over the removed original: the many copy-pasted
    ``run_*`` invocations in the ``test_*`` methods are collapsed into
    data-driven loops over the exact same parameter sets.
    """

    def run_conv_valid(
        self,
        inputs_shape,
        filters_shape,
        border_mode="valid",
        filter_dilation=(1, 1),
        subsample=(1, 1),
        unshared=False,
        verify_grad=False,
    ):
        """Run one forward correlation on GPU and CPU and compare results.

        Shapes are given channels-last and permuted here to the
        channels-first layout the ops expect; unshared filters carry two
        extra output-position dimensions.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        if unshared:
            filters_shape = [filters_shape[i] for i in (0, 1, 2, 5, 3, 4)]
        else:
            filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]

        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)

        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

        # CPU reference graph.
        conv_ref = CorrMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
            unshared=unshared,
        )(ref_cast(inputs), ref_cast(filters))
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)

        # GPU graph under test.
        conv = GpuCorrMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
            unshared=unshared,
        )(inputs, filters)
        f = aesara.function([], conv, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(
                GpuCorrMM(
                    border_mode=border_mode,
                    filter_dilation=filter_dilation,
                    subsample=subsample,
                    unshared=unshared,
                ),
                [inputs_val, filters_val],
                mode=mode_with_gpu,
            )

    def test_valid(self):
        """Forward pass for several subsampling configurations."""
        for subsample in [(1, 1), (2, 2), (3, 3), (3, 2), (1, 2)]:
            self.run_conv_valid(
                inputs_shape=(16, 20, 12, 1),
                filters_shape=(10, 6, 12, 1),
                subsample=subsample,
            )

    def test_border_mode(self):
        """Forward pass for every supported padding specification."""
        for border_mode in ["valid", "half", "full", (0, 0), (1, 2), (3, 2)]:
            self.run_conv_valid(
                inputs_shape=(16, 20, 12, 1),
                filters_shape=(10, 6, 12, 1),
                border_mode=border_mode,
            )

    def test_filter_dilation(self):
        """Forward pass with dilated filters under each padding mode."""
        inputs_shape = [16, 20, 12, 1]
        filters_shape = [10, 6, 5, 1]

        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ["valid", "half", "full"]:
                self.run_conv_valid(
                    inputs_shape=inputs_shape,
                    filters_shape=filters_shape,
                    filter_dilation=filter_dilation,
                    border_mode=border_mode,
                )

    def test_verify_gradients(self):
        # use a small example to check the gradients
        inputs_shape = [2, 7, 9, 1]
        filters_shape = [1, 3, 3, 1]

        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ["valid", "half", "full", (2, 1)]:
                self.run_conv_valid(
                    inputs_shape=inputs_shape,
                    filters_shape=filters_shape,
                    filter_dilation=filter_dilation,
                    border_mode=border_mode,
                    verify_grad=True,
                )

    def test_unshared(self):
        """Unshared (locally-connected) correlation: subsampling cases,
        then border-mode cases; the filters' output-position dimensions
        vary with the chosen stride/padding."""
        for filters_shape, subsample in [
            ((10, 15, 1, 6, 12, 1), (1, 1)),
            ((10, 8, 1, 6, 12, 1), (2, 2)),
            ((10, 5, 1, 6, 12, 1), (3, 3)),
            ((10, 5, 1, 6, 12, 1), (3, 2)),
            ((10, 15, 1, 6, 12, 1), (1, 2)),
        ]:
            self.run_conv_valid(
                inputs_shape=(16, 20, 12, 1),
                filters_shape=filters_shape,
                subsample=subsample,
                unshared=True,
            )
        for filters_shape, border_mode in [
            ((10, 15, 1, 6, 12, 1), "valid"),
            ((10, 21, 13, 6, 12, 1), "half"),
            ((10, 25, 23, 6, 12, 1), "full"),
            ((10, 15, 1, 6, 12, 1), (0, 0)),
            ((10, 17, 5, 6, 12, 1), (1, 2)),
            ((10, 21, 5, 6, 12, 1), (3, 2)),
        ]:
            self.run_conv_valid(
                inputs_shape=(16, 20, 12, 1),
                filters_shape=filters_shape,
                border_mode=border_mode,
                unshared=True,
            )

    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape, subsample=(1, 1)):
        """Compare the GPU weight-gradient op against the CPU reference."""
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]

        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        dCdH_val = np.random.random(dCdH_shape).astype(config.floatX)

        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        shape = gpuarray_shared_constructor(np.array(filters_shape[2:]))

        # With subsampling the output shape is ambiguous, so the spatial
        # filter shape must be passed explicitly.
        if subsample == (1, 1):
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH)
            )
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(inputs, dCdH)
        else:
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH), shape=shape
            )
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape
            )
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        f = aesara.function([], conv_gemm, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradweight(self):
        """Weight gradient for several subsampling configurations."""
        for inputs_shape, filters_shape, dCdH_shape, subsample in [
            ((16, 10, 12, 1), (10, 6, 12, 1), (16, 5, 1, 10), (1, 1)),
            ((16, 20, 10, 1), (10, 6, 4, 1), (16, 8, 4, 10), (2, 2)),
            ((16, 20, 10, 1), (10, 6, 3, 1), (16, 5, 3, 10), (3, 3)),
            ((16, 20, 12, 1), (10, 6, 12, 1), (16, 8, 1, 10), (2, 1)),
        ]:
            self.run_gradweight(
                inputs_shape=inputs_shape,
                filters_shape=filters_shape,
                dCdH_shape=dCdH_shape,
                subsample=subsample,
            )

    def run_gradinput(self, inputs_shape, filters_shape, subsample=(1, 1)):
        """Compare the GPU input-gradient op against the CPU reference."""
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)

        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

        # Reconstruct the bottom (input image) spatial shape from the
        # topgrad shape, the stride and the filter size.
        bottom_height = (inputs_shape[2] - 1) * subsample[0] + filters_shape[2]
        bottom_width = (inputs_shape[3] - 1) * subsample[1] + filters_shape[3]
        bottom_shape = gpuarray_shared_constructor(
            np.array([bottom_height, bottom_width])
        )

        if subsample == (1, 1):
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=ref_cast(filters), topgrad=ref_cast(inputs)
            )
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs
            )
        else:
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=ref_cast(filters), topgrad=ref_cast(inputs), shape=bottom_shape
            )
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape
            )
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
        f = aesara.function([], conv_gemm, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradinput(self):
        """Input gradient for several subsampling configurations."""
        for subsample in [(1, 1), (2, 2), (3, 3), (3, 1)]:
            self.run_gradinput(
                inputs_shape=(16, 15, 12, 10),
                filters_shape=(10, 6, 12, 1),
                subsample=subsample,
            )

    def test_large_input(self):
        # This tests the number-of-threads computation
        # by making (channels * height) > (max_threads_dim ** 2).
        # (See also issue #5165.)
        self.run_conv_valid(
            inputs_shape=(1, 1024, 3, 1024),
            filters_shape=(1, 1, 1, 1024),
            verify_grad=False,
        )
        self.run_gradinput(
            inputs_shape=(1, 1024, 3, 1), filters_shape=(1, 1, 1, 1024)
        )
class TestGroupGpuCorr2d(TestGroupedConvNoOptim):
    """Run the grouped-convolution test suite with the GPU CorrMM ops.

    cuDNN is excluded from the mode so the GEMM-based ops under test are
    actually the ones executed.
    """

    mode = mode_with_gpu.excluding("cudnn")
    conv_op = GpuCorrMM
    conv_gradw_op = GpuCorrMM_gradWeights
    conv_gradi_op = GpuCorrMM_gradInputs
    flip_filter = True
    is_dnn = False
class TestUnsharedGpuCorr2d(TestUnsharedConv):
    """Run the unshared (locally-connected) conv tests with the GPU ops."""

    mode = mode_with_gpu
    conv2d_op = GpuCorrMM
    conv2d_gradw_op = GpuCorrMM_gradWeights
    conv2d_gradi_op = GpuCorrMM_gradInputs
class TestAsymmetricGpu(TestAsymmetricPadding):
    """Run the asymmetric-padding conv tests with the GPU ops."""

    mode = mode_with_gpu
    conv2d_op = GpuCorrMM
    conv2d_gradw_op = GpuCorrMM_gradWeights
    conv2d_gradi_op = GpuCorrMM_gradInputs
class TestCausalGpuCorr(TestCausalConv):
    """Run the causal-convolution test suite in the GPU mode."""

    mode = mode_with_gpu
tests/gpuarray/test_gemmcorr3d.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
aesara
from
aesara.configdefaults
import
config
from
aesara.gpuarray.blas
import
(
GpuCorr3dMM
,
GpuCorr3dMM_gradInputs
,
GpuCorr3dMM_gradWeights
,
)
from
aesara.gpuarray.type
import
gpuarray_shared_constructor
from
aesara.tensor.nnet.corr3d
import
Corr3dMM
,
Corr3dMMGradInputs
,
Corr3dMMGradWeights
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
mode_without_gpu
,
ref_cast
from
tests.tensor.nnet.test_abstract_conv
import
TestGroupedConv3dNoOptim
class
TestCorr3dMM
:
    def run_conv_valid(
        self,
        inputs_shape,
        filters_shape,
        border_mode="valid",
        filter_dilation=(1, 1, 1),
        subsample=(1, 1, 1),
        verify_grad=False,
    ):
        """Run one forward 3-d correlation on GPU and CPU and compare.

        Shapes are given channels-last and permuted here to the
        channels-first layout the ops expect.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]

        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)

        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

        # CPU reference graph.
        conv_ref = Corr3dMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
        )(ref_cast(inputs), ref_cast(filters))
        f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)

        # GPU graph under test.
        conv = GpuCorr3dMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
        )(inputs, filters)
        f = aesara.function([], conv, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(
                GpuCorr3dMM(
                    border_mode=border_mode,
                    filter_dilation=filter_dilation,
                    subsample=subsample,
                ),
                [inputs_val, filters_val],
                mode=mode_with_gpu,
            )
def
test_valid
(
self
):
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
16
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
)
)
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
subsample
=
(
2
,
2
,
2
),
)
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
subsample
=
(
2
,
2
,
2
),
)
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
subsample
=
(
3
,
3
,
3
),
)
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
subsample
=
(
3
,
3
,
3
),
)
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
subsample
=
(
3
,
2
,
1
),
)
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
subsample
=
(
1
,
2
,
3
),
)
def
test_border_mode
(
self
):
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
border_mode
=
"valid"
,
)
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
border_mode
=
"half"
,
)
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
border_mode
=
"full"
,
)
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
border_mode
=
(
0
,
0
,
0
),
)
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
border_mode
=
(
1
,
2
,
3
),
)
self
.
run_conv_valid
(
inputs_shape
=
(
16
,
20
,
12
,
15
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
border_mode
=
(
3
,
2
,
1
),
)
def
test_filter_dilation
(
self
):
inputs_shape
=
[
16
,
20
,
12
,
15
,
1
]
filters_shape
=
[
10
,
6
,
5
,
4
,
1
]
for
filter_dilation
in
[(
2
,
1
,
1
),
(
1
,
2
,
1
),
(
1
,
1
,
2
)]:
for
border_mode
in
[
"valid"
,
"half"
,
"full"
]:
self
.
run_conv_valid
(
inputs_shape
=
inputs_shape
,
filters_shape
=
filters_shape
,
filter_dilation
=
filter_dilation
,
border_mode
=
border_mode
,
)
def
test_verify_gradients
(
self
):
# use a small example to check the gradients
inputs_shape
=
[
2
,
7
,
9
,
6
,
1
]
filters_shape
=
[
1
,
3
,
3
,
2
,
1
]
for
filter_dilation
in
[(
2
,
1
,
1
),
(
1
,
2
,
1
),
(
1
,
1
,
2
)]:
for
border_mode
in
[
"valid"
,
"half"
,
"full"
,
(
2
,
1
,
3
)]:
self
.
run_conv_valid
(
inputs_shape
=
inputs_shape
,
filters_shape
=
filters_shape
,
filter_dilation
=
filter_dilation
,
border_mode
=
border_mode
,
verify_grad
=
True
,
)
def
run_gradweight
(
self
,
inputs_shape
,
filters_shape
,
dCdH_shape
,
subsample
=
(
1
,
1
,
1
)
):
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
dCdH_shape
=
[
dCdH_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
dCdH_val
=
np
.
random
.
random
(
dCdH_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
dCdH
=
gpuarray_shared_constructor
(
dCdH_val
)
shape
=
gpuarray_shared_constructor
(
np
.
array
(
filters_shape
[
2
:]))
if
subsample
==
(
1
,
1
,
1
):
conv_ref
=
Corr3dMMGradWeights
(
subsample
=
subsample
)(
ref_cast
(
inputs
),
ref_cast
(
dCdH
)
)
conv_gemm
=
GpuCorr3dMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
)
else
:
conv_ref
=
Corr3dMMGradWeights
(
subsample
=
subsample
)(
ref_cast
(
inputs
),
ref_cast
(
dCdH
),
shape
=
shape
)
conv_gemm
=
GpuCorr3dMM_gradWeights
(
subsample
=
subsample
)(
inputs
,
dCdH
,
shape
=
shape
)
f_ref
=
aesara
.
function
([],
conv_ref
,
mode
=
mode_without_gpu
)
f
=
aesara
.
function
([],
conv_gemm
,
mode
=
mode_with_gpu
)
res_ref
=
f_ref
()
res
=
f
()
utt
.
assert_allclose
(
res_ref
,
res
)
def
test_gradweight
(
self
):
self
.
run_gradweight
(
inputs_shape
=
(
16
,
10
,
12
,
16
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
dCdH_shape
=
(
16
,
5
,
1
,
13
,
10
),
subsample
=
(
1
,
1
,
1
),
)
self
.
run_gradweight
(
inputs_shape
=
(
16
,
20
,
10
,
16
,
1
),
filters_shape
=
(
10
,
6
,
4
,
4
,
1
),
dCdH_shape
=
(
16
,
8
,
4
,
7
,
10
),
subsample
=
(
2
,
2
,
2
),
)
self
.
run_gradweight
(
inputs_shape
=
(
16
,
20
,
10
,
16
,
1
),
filters_shape
=
(
10
,
6
,
3
,
4
,
1
),
dCdH_shape
=
(
16
,
5
,
3
,
5
,
10
),
subsample
=
(
3
,
3
,
3
),
)
self
.
run_gradweight
(
inputs_shape
=
(
16
,
20
,
12
,
16
,
1
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
dCdH_shape
=
(
16
,
8
,
1
,
5
,
10
),
subsample
=
(
2
,
1
,
3
),
)
def
run_gradinput
(
self
,
inputs_shape
,
filters_shape
,
subsample
=
(
1
,
1
,
1
)):
inputs_shape
=
[
inputs_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
filters_shape
=
[
filters_shape
[
i
]
for
i
in
(
0
,
4
,
1
,
2
,
3
)]
inputs_val
=
np
.
random
.
random
(
inputs_shape
)
.
astype
(
config
.
floatX
)
filters_val
=
np
.
random
.
random
(
filters_shape
)
.
astype
(
config
.
floatX
)
inputs
=
gpuarray_shared_constructor
(
inputs_val
)
filters
=
gpuarray_shared_constructor
(
filters_val
)
bottom_height
=
(
inputs_shape
[
2
]
-
1
)
*
subsample
[
0
]
+
filters_shape
[
2
]
bottom_width
=
(
inputs_shape
[
3
]
-
1
)
*
subsample
[
1
]
+
filters_shape
[
3
]
bottom_depth
=
(
inputs_shape
[
4
]
-
1
)
*
subsample
[
2
]
+
filters_shape
[
4
]
bottom_shape
=
gpuarray_shared_constructor
(
np
.
array
([
bottom_height
,
bottom_width
,
bottom_depth
])
)
if
subsample
==
(
1
,
1
,
1
):
conv_ref
=
Corr3dMMGradInputs
(
subsample
=
subsample
)(
kern
=
ref_cast
(
filters
),
topgrad
=
ref_cast
(
inputs
)
)
conv_gemm
=
GpuCorr3dMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
)
else
:
conv_ref
=
Corr3dMMGradInputs
(
subsample
=
subsample
)(
kern
=
ref_cast
(
filters
),
topgrad
=
ref_cast
(
inputs
),
shape
=
bottom_shape
)
conv_gemm
=
GpuCorr3dMM_gradInputs
(
subsample
=
subsample
)(
kern
=
filters
,
topgrad
=
inputs
,
shape
=
bottom_shape
)
f_ref
=
aesara
.
function
([],
conv_ref
,
mode
=
mode_without_gpu
)
f
=
aesara
.
function
([],
conv_gemm
,
mode
=
mode_with_gpu
)
res_ref
=
f_ref
()
res
=
f
()
utt
.
assert_allclose
(
res_ref
,
res
)
def
test_gradinput
(
self
):
self
.
run_gradinput
(
inputs_shape
=
(
16
,
15
,
12
,
12
,
10
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
)
)
self
.
run_gradinput
(
inputs_shape
=
(
16
,
15
,
12
,
12
,
10
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
subsample
=
(
2
,
2
,
2
),
)
self
.
run_gradinput
(
inputs_shape
=
(
16
,
15
,
12
,
12
,
10
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
subsample
=
(
3
,
3
,
3
),
)
self
.
run_gradinput
(
inputs_shape
=
(
16
,
15
,
12
,
12
,
10
),
filters_shape
=
(
10
,
6
,
12
,
4
,
1
),
subsample
=
(
3
,
1
,
2
),
)
def
test_large_input
(
self
):
# This tests the number-of-threads computation
# by making (channels * height) > (max_threads_dim ** 2).
# (See also issue #5165.)
self
.
run_conv_valid
(
inputs_shape
=
(
1
,
1024
,
3
,
3
,
1024
),
filters_shape
=
(
1
,
1
,
1
,
1
,
1024
),
verify_grad
=
False
,
)
self
.
run_gradinput
(
inputs_shape
=
(
1
,
1024
,
3
,
3
,
1
),
filters_shape
=
(
1
,
1
,
1
,
1
,
1024
)
)
class TestGroupGpuCorr3d(TestGroupedConv3dNoOptim):
    """Grouped 3d convolution tests run against the GpuCorr3dMM ops."""

    # cuDNN is excluded so the GEMM-based correlation ops are exercised.
    mode = mode_with_gpu.excluding("cudnn")
    conv_op = GpuCorr3dMM
    conv_gradw_op = GpuCorr3dMM_gradWeights
    conv_gradi_op = GpuCorr3dMM_gradInputs
tests/gpuarray/test_linalg.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
pytest
from
numpy.linalg.linalg
import
LinAlgError
import
aesara
from
aesara.configdefaults
import
config
from
aesara.gpuarray
import
gpuarray_shared_constructor
from
aesara.gpuarray.linalg
import
(
GpuCholesky
,
GpuCublasTriangularSolve
,
GpuCusolverSolve
,
GpuMagmaCholesky
,
GpuMagmaEigh
,
GpuMagmaMatrixInverse
,
GpuMagmaQR
,
GpuMagmaSVD
,
cusolver_available
,
gpu_cholesky
,
gpu_matrix_inverse
,
gpu_qr
,
gpu_solve
,
gpu_solve_lower_triangular
,
gpu_svd
,
)
from
aesara.tensor.nlinalg
import
SVD
,
MatrixInverse
,
QRFull
,
eigh
,
matrix_inverse
,
qr
from
aesara.tensor.slinalg
import
Cholesky
,
cholesky
from
aesara.tensor.type
import
fmatrix
,
matrix
,
tensor3
,
vector
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
mode_without_gpu
from
tests.gpuarray.test_basic_ops
import
random
@pytest.mark.skipif(
    not cusolver_available,
    reason="Optional package scikits.cuda.cusolver not available",
)
class TestCusolver:
    """Tests for the cuSOLVER/cuBLAS-backed GPU linear solve ops."""

    def run_gpu_solve(self, A_val, x_val, A_struct=None):
        """Solve A x = b and A^T x = b on the GPU and check both recover x."""
        b_val = np.dot(A_val, x_val)
        b_val_trans = np.dot(A_val.T, x_val)
        A = matrix("A", dtype="float32")
        b = matrix("b", dtype="float32")
        b_trans = matrix("b", dtype="float32")
        if A_struct is None:
            solver = gpu_solve(A, b)
            solver_trans = gpu_solve(A, b_trans, trans="T")
        else:
            solver = gpu_solve(A, b, A_struct)
            solver_trans = gpu_solve(A, b_trans, A_struct, trans="T")
        fn = aesara.function(
            [A, b, b_trans], [solver, solver_trans], mode=mode_with_gpu
        )
        res = fn(A_val, b_val, b_val_trans)
        utt.assert_allclose(x_val, np.array(res[0]))
        utt.assert_allclose(x_val, np.array(res[1]))

    def test_diag_solve(self):
        np.random.seed(1)
        A_val = np.asarray(
            [[2, 0, 0], [0, 1, 0], [0, 0, 1]], dtype="float32"
        )
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_bshape_solve(self):
        # Test when shape of b (k, m) is such as m > k
        np.random.seed(1)
        A_val = np.asarray(
            [[2, 0, 0], [0, 1, 0], [0, 0, 1]], dtype="float32"
        )
        x_val = np.random.uniform(
            -0.4, 0.4, (A_val.shape[1], A_val.shape[1] + 1)
        ).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_sym_solve(self):
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_sym = np.dot(A_val, A_val.T)
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_sym, x_val, "symmetric")

    def test_orth_solve(self):
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        # The left singular vectors form an orthogonal matrix.
        A_orth = np.linalg.svd(A_val)[0]
        x_val = np.random.uniform(-0.4, 0.4, (A_orth.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_orth, x_val)

    def test_uni_rand_solve(self):
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_linalgerrsym_solve(self):
        # A singular symmetric system must raise LinAlgError.
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
        A_val = np.dot(A_val.T, A_val)
        # make A singular
        A_val[:, 2] = A_val[:, 1] + A_val[:, 3]
        A = matrix("A", dtype="float32")
        b = matrix("b", dtype="float32")
        solver = gpu_solve(A, b, "symmetric")
        fn = aesara.function([A, b], [solver], mode=mode_with_gpu)
        with pytest.raises(LinAlgError):
            fn(A_val, x_val)

    def test_linalgerr_solve(self):
        # A singular general system must raise LinAlgError.
        np.random.seed(1)
        A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = np.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
        # make A singular
        A_val[:, 2] = 0
        A = matrix("A", dtype="float32")
        b = matrix("b", dtype="float32")
        solver = gpu_solve(A, b, trans="T")
        fn = aesara.function([A, b], [solver], mode=mode_with_gpu)
        with pytest.raises(LinAlgError):
            fn(A_val, x_val)

    def verify_solve_grad(self, m, n, A_structure, lower, rng):
        """Numeric gradient check of the solve op for one system shape."""
        # ensure diagonal elements of A relatively large to avoid numerical
        # precision issues
        A_val = (rng.normal(size=(m, m)) * 0.5 + np.eye(m)).astype(config.floatX)
        if A_structure == "lower_triangular":
            A_val = np.tril(A_val)
        elif A_structure == "upper_triangular":
            A_val = np.triu(A_val)
        if n is None:
            b_val = rng.normal(size=m).astype(config.floatX)
        else:
            b_val = rng.normal(size=(m, n)).astype(config.floatX)
        eps = None
        if config.floatX == "float64":
            eps = 2e-8
        if A_structure in ("lower_triangular", "upper_triangular"):
            solve_op = GpuCublasTriangularSolve(lower=lower)
        else:
            solve_op = GpuCusolverSolve(A_structure="general")
        utt.verify_grad(solve_op, [A_val, b_val], 3, rng, eps=eps)

    def test_solve_grad(self):
        rng = np.random.default_rng(utt.fetch_seed())
        for A_structure in ["general", "lower_triangular", "upper_triangular"]:
            lower = A_structure == "lower_triangular"
            # self.verify_solve_grad(5, None, A_structure, lower, rng)
            self.verify_solve_grad(6, 1, A_structure, lower, rng)
            self.verify_solve_grad(4, 3, A_structure, lower, rng)
        # lower should have no effect for A_structure == 'general' so also
        # check lower=True case
        self.verify_solve_grad(4, 3, "general", lower=True, rng=rng)
@pytest.mark.skipif(
    not cusolver_available,
    reason="Optional package scikits.cuda.cusolver not available",
)
class TestGpuCholesky:
    """float32 tests for the cuSOLVER-backed GpuCholesky op."""

    def get_gpu_cholesky_func(self, lower=True, inplace=False):
        """Compile and return a function applying GpuCholesky to one matrix."""
        A = matrix("A", dtype="float32")
        chol_A = GpuCholesky(lower=lower, inplace=inplace)(A)
        return aesara.function(
            [A], chol_A, accept_inplace=inplace, mode=mode_with_gpu
        )

    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
        """Check the op output against np.linalg.cholesky."""
        expected = np.linalg.cholesky(A_val)
        if not lower:
            expected = expected.T
        fn = self.get_gpu_cholesky_func(lower, inplace)
        actual = np.array(fn(A_val))
        utt.assert_allclose(actual, expected)

    def test_gpu_cholesky_opt(self):
        # cholesky() should be lifted to GpuCholesky by the GPU optimizer.
        A = matrix("A", dtype="float32")
        fn = aesara.function([A], cholesky(A), mode=mode_with_gpu)
        assert any(
            isinstance(node.op, GpuCholesky) for node in fn.maker.fgraph.toposort()
        )

    def test_invalid_input_fail_non_square(self):
        # Invalid Cholesky input test with non-square matrix as input.
        A_val = np.random.normal(size=(3, 2)).astype("float32")
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(ValueError):
            fn(A_val)

    def test_invalid_input_fail_vector(self):
        # Invalid Cholesky input test with vector as input.
        with pytest.raises(AssertionError):
            GpuCholesky(lower=True, inplace=False)(vector("A", dtype="float32"))

    def test_invalid_input_fail_tensor3(self):
        # Invalid Cholesky input test with 3D tensor as input.
        with pytest.raises(AssertionError):
            GpuCholesky(lower=True, inplace=False)(tensor3("A", dtype="float32"))

    @utt.assertFailure_fast
    def test_diag_chol(self):
        # Diagonal matrix input Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                # make sure all diagonal elements are positive so positive-definite
                A_val = np.diag(np.random.uniform(size=5).astype("float32") + 1)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    @utt.assertFailure_fast
    def test_dense_chol_lower(self):
        # Dense matrix input lower-triangular Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                M_val = np.random.normal(size=(3, 3)).astype("float32")
                # A = M.dot(M) will be positive definite for all non-singular M
                A_val = M_val.dot(M_val.T)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_invalid_input_fail_non_symmetric(self):
        # Invalid Cholesky input test with non-symmetric input.
        # (Non-symmetric real input must also be non-positive definite).
        A_val = None
        while True:
            A_val = np.random.normal(size=(3, 3)).astype("float32")
            if not np.allclose(A_val, A_val.T):
                break
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)

    def test_invalid_input_fail_negative_definite(self):
        # Invalid Cholesky input test with negative-definite input.
        M_val = np.random.normal(size=(3, 3)).astype("float32")
        # A = -M.dot(M) will be negative definite for all non-singular M
        A_val = -M_val.dot(M_val.T)
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)
@pytest.mark.skipif(
    not cusolver_available,
    reason="Optional package scikits.cuda.cusolver not available",
)
class TestGpuCholesky64:
    """float64 tests for the cuSOLVER-backed GpuCholesky op.

    NOTE(review): this class duplicates TestGpuCholesky with only the dtype
    changed; the two could be unified via parametrization.
    """

    def get_gpu_cholesky_func(self, lower=True, inplace=False):
        """Compile and return a function applying GpuCholesky to one matrix."""
        A = matrix("A", dtype="float64")
        chol_A = GpuCholesky(lower=lower, inplace=inplace)(A)
        return aesara.function(
            [A], chol_A, accept_inplace=inplace, mode=mode_with_gpu
        )

    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
        """Check the op output against np.linalg.cholesky."""
        expected = np.linalg.cholesky(A_val)
        if not lower:
            expected = expected.T
        fn = self.get_gpu_cholesky_func(lower, inplace)
        actual = np.array(fn(A_val))
        utt.assert_allclose(actual, expected)

    def test_gpu_cholesky_opt(self):
        # cholesky() should be lifted to GpuCholesky by the GPU optimizer.
        A = matrix("A", dtype="float64")
        fn = aesara.function([A], cholesky(A), mode=mode_with_gpu)
        assert any(
            isinstance(node.op, GpuCholesky) for node in fn.maker.fgraph.toposort()
        )

    def test_invalid_input_fail_non_square(self):
        # Invalid Cholesky input test with non-square matrix as input.
        A_val = np.random.normal(size=(3, 2)).astype("float64")
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(ValueError):
            fn(A_val)

    def test_invalid_input_fail_vector(self):
        # Invalid Cholesky input test with vector as input.
        with pytest.raises(AssertionError):
            GpuCholesky(lower=True, inplace=False)(vector("A", dtype="float64"))

    def test_invalid_input_fail_tensor3(self):
        # Invalid Cholesky input test with 3D tensor as input.
        with pytest.raises(AssertionError):
            GpuCholesky(lower=True, inplace=False)(tensor3("A", dtype="float64"))

    @utt.assertFailure_fast
    def test_diag_chol(self):
        # Diagonal matrix input Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                # make sure all diagonal elements are positive so positive-definite
                A_val = np.diag(np.random.uniform(size=5).astype("float64") + 1)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    @utt.assertFailure_fast
    def test_dense_chol_lower(self):
        # Dense matrix input lower-triangular Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                M_val = np.random.normal(size=(3, 3)).astype("float64")
                # A = M.dot(M) will be positive definite for all non-singular M
                A_val = M_val.dot(M_val.T)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_invalid_input_fail_non_symmetric(self):
        # Invalid Cholesky input test with non-symmetric input.
        # (Non-symmetric real input must also be non-positive definite).
        A_val = None
        while True:
            A_val = np.random.normal(size=(3, 3)).astype("float64")
            if not np.allclose(A_val, A_val.T):
                break
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)

    def test_invalid_input_fail_negative_definite(self):
        # Invalid Cholesky input test with negative-definite input.
        M_val = np.random.normal(size=(3, 3)).astype("float64")
        # A = -M.dot(M) will be negative definite for all non-singular M
        A_val = -M_val.dot(M_val.T)
        fn = self.get_gpu_cholesky_func(True, False)
        with pytest.raises(LinAlgError):
            fn(A_val)
@pytest.mark.skipif(
    not config.magma__enabled, reason="Magma is not enabled, skipping test"
)
class TestMagma:
    """Tests for the Magma-backed GPU linear-algebra ops (matrix inverse,
    SVD, Cholesky, QR, eigh) and their graph optimizations."""

    def test_magma_opt_float16(self):
        # Each CPU op should be lifted to its Magma GPU counterpart, even for
        # float16 inputs.
        ops_to_gpu = [
            (MatrixInverse(), GpuMagmaMatrixInverse),
            (SVD(), GpuMagmaSVD),
            (QRFull(mode="reduced"), GpuMagmaQR),
            # TODO: add support for float16 to Eigh numpy
            # (Eigh(), GpuMagmaEigh),
            (Cholesky(), GpuMagmaCholesky),
        ]
        for op, gpu_op in ops_to_gpu:
            A = matrix("A", dtype="float16")
            fn = aesara.function(
                [A], op(A), mode=mode_with_gpu.excluding("cusolver")
            )
            assert any(
                isinstance(node.op, gpu_op) for node in fn.maker.fgraph.toposort()
            )

    def test_gpu_matrix_inverse(self):
        A = fmatrix("A")
        fn = aesara.function([A], gpu_matrix_inverse(A), mode=mode_with_gpu)
        N = 1000
        test_rng = np.random.default_rng(seed=1)
        # Copied from tests.tensor.utils.random.
        A_val = test_rng.random((N, N)).astype("float32") * 2 - 1
        A_val_inv = fn(A_val)
        utt.assert_allclose(np.eye(N), np.dot(A_val_inv, A_val), atol=1e-2)

    @utt.assertFailure_fast
    def test_gpu_matrix_inverse_inplace(self):
        N = 1000
        test_rng = np.random.default_rng(seed=1)
        A_val_gpu = gpuarray_shared_constructor(
            test_rng.random((N, N)).astype("float32") * 2 - 1
        )
        A_val_copy = A_val_gpu.get_value()
        A_val_gpu_inv = GpuMagmaMatrixInverse()(A_val_gpu)
        fn = aesara.function(
            [], A_val_gpu_inv, mode=mode_with_gpu, updates=[(A_val_gpu, A_val_gpu_inv)]
        )
        assert any(
            node.op.inplace
            for node in fn.maker.fgraph.toposort()
            if isinstance(node.op, GpuMagmaMatrixInverse)
        )
        fn()
        utt.assert_allclose(
            np.eye(N), np.dot(A_val_gpu.get_value(), A_val_copy), atol=5e-3
        )

    @utt.assertFailure_fast
    def test_gpu_matrix_inverse_inplace_opt(self):
        A = fmatrix("A")
        fn = aesara.function([A], matrix_inverse(A), mode=mode_with_gpu)
        assert any(
            node.op.inplace
            for node in fn.maker.fgraph.toposort()
            if isinstance(node.op, GpuMagmaMatrixInverse)
        )

    def run_gpu_svd(self, A_val, full_matrices=True, compute_uv=True):
        """Compile and run gpu_svd on ``A_val``; return its outputs."""
        A = fmatrix("A")
        f = aesara.function(
            [A],
            gpu_svd(A, full_matrices=full_matrices, compute_uv=compute_uv),
            mode=mode_with_gpu,
        )
        return f(A_val)

    def assert_column_orthonormal(self, Ot):
        utt.assert_allclose(np.dot(Ot.T, Ot), np.eye(Ot.shape[1]))

    def check_svd(self, A, U, S, VT, rtol=None, atol=None):
        """Check that U @ diag(S) @ VT reconstructs A."""
        S_m = np.zeros_like(A)
        np.fill_diagonal(S_m, S)
        utt.assert_allclose(np.dot(np.dot(U, S_m), VT), A, rtol=rtol, atol=atol)

    def test_gpu_svd_wide(self):
        A = random(100, 50).astype("float32")
        M, N = A.shape
        U, S, VT = self.run_gpu_svd(A)
        self.assert_column_orthonormal(U)
        self.assert_column_orthonormal(VT.T)
        self.check_svd(A, U, S, VT)
        U, S, VT = self.run_gpu_svd(A, full_matrices=False)
        # BUG FIX: was ``assert U.shape[1], min(M, N)`` — the comma form makes
        # the second expression the assert *message*, so the dimension was
        # never actually compared.  Same for VT below and in test_gpu_svd_tall.
        assert U.shape[1] == min(M, N)
        self.assert_column_orthonormal(U)
        assert VT.shape[0] == min(M, N)
        self.assert_column_orthonormal(VT.T)

    def test_gpu_svd_tall(self):
        A = random(50, 100).astype("float32")
        M, N = A.shape
        U, S, VT = self.run_gpu_svd(A)
        self.assert_column_orthonormal(U)
        self.assert_column_orthonormal(VT.T)
        self.check_svd(A, U, S, VT)
        U, S, VT = self.run_gpu_svd(A, full_matrices=False)
        # BUG FIX: see test_gpu_svd_wide — compare, don't just test truthiness.
        assert U.shape[1] == min(M, N)
        self.assert_column_orthonormal(U)
        assert VT.shape[0] == min(M, N)
        self.assert_column_orthonormal(VT.T)

    def test_gpu_singular_values(self):
        A = fmatrix("A")
        f_cpu = aesara.function(
            [A],
            aesara.tensor.nlinalg.svd(A, compute_uv=False),
            mode=mode_without_gpu,
        )
        f_gpu = aesara.function(
            [A], gpu_svd(A, compute_uv=False), mode=mode_with_gpu
        )
        A_val = random(50, 100).astype("float32")
        utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
        A_val = random(100, 50).astype("float32")
        utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))

    def run_gpu_cholesky(self, A_val, lower=True):
        """Compile and run GpuMagmaCholesky on ``A_val``."""
        A = fmatrix("A")
        f = aesara.function(
            [A],
            GpuMagmaCholesky(lower=lower)(A),
            mode=mode_with_gpu.excluding("cusolver"),
        )
        return f(A_val)

    def rand_symmetric(self, N):
        """Return a random float32 symmetric positive-definite N x N matrix."""
        A = random(N, N).astype("float32")
        # ensure that eigenvalues are not too small which sometimes results in
        # magma cholesky failure due to gpu limited numerical precision
        D, W = np.linalg.eigh(A)
        D[D < 1] = 1
        V_m = np.zeros_like(A)
        np.fill_diagonal(V_m, D)
        return np.dot(np.dot(W.T, V_m), W)

    def check_cholesky(self, N, lower=True, rtol=None, atol=None):
        A = self.rand_symmetric(N)
        L = self.run_gpu_cholesky(A, lower=lower)
        if not lower:
            L = L.T
        utt.assert_allclose(np.dot(L, L.T), A, rtol=rtol, atol=atol)

    def test_gpu_cholesky(self):
        self.check_cholesky(1000, atol=1e-3)
        self.check_cholesky(1000, lower=False, atol=1e-3)

    def test_gpu_cholesky_opt(self):
        A = matrix("A", dtype="float32")
        fn = aesara.function(
            [A], cholesky(A), mode=mode_with_gpu.excluding("cusolver")
        )
        assert any(
            isinstance(node.op, GpuMagmaCholesky)
            for node in fn.maker.fgraph.toposort()
        )

    @utt.assertFailure_fast
    def test_gpu_cholesky_inplace(self):
        A = self.rand_symmetric(1000)
        A_gpu = gpuarray_shared_constructor(A)
        A_copy = A_gpu.get_value()
        C = GpuMagmaCholesky()(A_gpu)
        fn = aesara.function([], C, mode=mode_with_gpu, updates=[(A_gpu, C)])
        assert any(
            node.op.inplace
            for node in fn.maker.fgraph.toposort()
            if isinstance(node.op, GpuMagmaCholesky)
        )
        fn()
        L = A_gpu.get_value()
        utt.assert_allclose(np.dot(L, L.T), A_copy, atol=1e-3)

    @utt.assertFailure_fast
    def test_gpu_cholesky_inplace_opt(self):
        A = fmatrix("A")
        fn = aesara.function([A], GpuMagmaCholesky()(A), mode=mode_with_gpu)
        assert any(
            node.op.inplace
            for node in fn.maker.fgraph.toposort()
            if isinstance(node.op, GpuMagmaCholesky)
        )

    def run_gpu_qr(self, A_val, complete=True):
        """Compile and run gpu_qr on ``A_val``."""
        A = fmatrix("A")
        fn = aesara.function(
            [A], gpu_qr(A, complete=complete), mode=mode_with_gpu
        )
        return fn(A_val)

    def check_gpu_qr(self, M, N, complete=True, rtol=None, atol=None):
        A = random(M, N).astype("float32")
        if complete:
            Q_gpu, R_gpu = self.run_gpu_qr(A, complete=complete)
        else:
            R_gpu = self.run_gpu_qr(A, complete=complete)
        Q_np, R_np = np.linalg.qr(A, mode="reduced")
        utt.assert_allclose(R_np, R_gpu, rtol=rtol, atol=atol)
        if complete:
            utt.assert_allclose(Q_np, Q_gpu, rtol=rtol, atol=atol)

    def test_gpu_qr(self):
        self.check_gpu_qr(1000, 500, atol=1e-3)
        self.check_gpu_qr(1000, 500, complete=False, atol=1e-3)
        self.check_gpu_qr(500, 1000, atol=1e-3)
        self.check_gpu_qr(500, 1000, complete=False, atol=1e-3)

    def test_gpu_qr_opt(self):
        A = fmatrix("A")
        fn = aesara.function([A], qr(A), mode=mode_with_gpu)
        assert any(
            isinstance(node.op, GpuMagmaQR) and node.op.complete
            for node in fn.maker.fgraph.toposort()
        )

    def test_gpu_qr_incomplete_opt(self):
        A = fmatrix("A")
        fn = aesara.function([A], qr(A, mode="r"), mode=mode_with_gpu)
        assert any(
            isinstance(node.op, GpuMagmaQR) and not node.op.complete
            for node in fn.maker.fgraph.toposort()
        )

    def run_gpu_eigh(self, A_val, UPLO="L", compute_v=True):
        """Compile and run GpuMagmaEigh on ``A_val``."""
        A = fmatrix("A")
        fn = aesara.function(
            [A],
            GpuMagmaEigh(UPLO=UPLO, compute_v=compute_v)(A),
            mode=mode_with_gpu,
        )
        return fn(A_val)

    def check_gpu_eigh(self, N, UPLO="L", compute_v=True, rtol=None, atol=None):
        A = random(N, N).astype("float32")
        A = np.dot(A.T, A)
        d_np, v_np = np.linalg.eigh(A, UPLO=UPLO)
        if compute_v:
            d_gpu, v_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=compute_v)
        else:
            d_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=False)
        utt.assert_allclose(d_np, d_gpu, rtol=rtol, atol=atol)
        if compute_v:
            # Eigenvectors must be orthonormal and reconstruct A.
            utt.assert_allclose(
                np.eye(N), np.dot(v_gpu, v_gpu.T), rtol=rtol, atol=atol
            )
            D_m = np.zeros_like(A)
            np.fill_diagonal(D_m, d_gpu)
            utt.assert_allclose(
                A, np.dot(np.dot(v_gpu, D_m), v_gpu.T), rtol=rtol, atol=atol
            )

    def test_gpu_eigh(self):
        self.check_gpu_eigh(1000, UPLO="L", atol=1e-3)
        self.check_gpu_eigh(1000, UPLO="U", atol=1e-3)
        self.check_gpu_eigh(1000, UPLO="L", compute_v=False, atol=1e-3)
        self.check_gpu_eigh(1000, UPLO="U", compute_v=False, atol=1e-3)

    def test_gpu_eigh_opt(self):
        A = fmatrix("A")
        fn = aesara.function([A], eigh(A), mode=mode_with_gpu)
        assert any(
            isinstance(node.op, GpuMagmaEigh)
            for node in fn.maker.fgraph.toposort()
        )
# mostly copied from aesara/tensor/tests/test_slinalg.py
def test_cholesky_grad():
    """Gradient check of GPU Cholesky (default, lower and upper variants)."""
    rng = np.random.default_rng(utt.fetch_seed())
    r = rng.standard_normal((5, 5)).astype(config.floatX)
    # The r.dot(r.T) products are built inside the graph because Cholesky
    # needs symmetric positive-definite inputs.
    # Check the default.
    utt.verify_grad(lambda r: gpu_cholesky(r.dot(r.T)), [r], 3, rng)
    # Explicit lower-triangular.
    utt.verify_grad(lambda r: GpuCholesky(lower=True)(r.dot(r.T)), [r], 3, rng)
    # Explicit upper-triangular.
    utt.verify_grad(lambda r: GpuCholesky(lower=False)(r.dot(r.T)), [r], 3, rng)
def test_cholesky_grad_indef():
    """The Cholesky gradient must raise on an indefinite input matrix."""
    x = matrix()
    mat = np.array([[1, 0.2], [0.2, -2]]).astype(config.floatX)
    chol_op = GpuCholesky(lower=True)
    chol_f = aesara.function([x], aesara.gradient.grad(chol_op(x).sum(), [x]))
    with pytest.raises(LinAlgError):
        chol_f(mat)
    # Disabled on_error='nan' variant (kept for reference):
    # cholesky = GpuCholesky(lower=True, on_error='nan')
    # chol_f = function([x], grad(gpu_cholesky(x).sum(), [x]))
    # assert np.all(np.isnan(chol_f(matrix)))
def test_lower_triangular_and_cholesky_grad():
    """Gradient check through Cholesky + lower-triangular solve.

    A random lower triangular system is ill-conditioned, so a smaller N is
    used under float32.

    Reference
    ---------
    Viswanath, Divakar, and L. N. Trefethen. "Condition numbers of random
    triangular matrices." SIAM Journal on Matrix Analysis and Applications
    19.2 (1998): 564-581.
    """
    N = 100 if config.floatX == "float64" else 5
    rng = np.random.default_rng(utt.fetch_seed())
    r = rng.standard_normal((N, N)).astype(config.floatX)
    y = rng.random((N, 1)).astype(config.floatX)

    def f(r, y):
        PD = r.dot(r.T)
        L = gpu_cholesky(PD)
        A = gpu_solve_lower_triangular(L, y)
        AAT = aesara.tensor.dot(A, A.T)
        B = AAT + aesara.tensor.eye(N)
        LB = gpu_cholesky(B)
        return aesara.tensor.sum(aesara.tensor.log(aesara.tensor.diag(LB)))

    utt.verify_grad(f, [r, y], 3, rng)
tests/gpuarray/test_misc.py
deleted
100644 → 0
浏览文件 @
c803c67e
# Test that normally could be outside gpuarray, to have all gpuarray
# tests in the same directory, we put them here.
import
numpy
as
np
import
aesara
from
aesara.compile.nanguardmode
import
NanGuardMode
from
aesara.tensor.type
import
vector
from
tests.gpuarray.config
import
mode_with_gpu
def test_nan_guard_mode():
    """NanGuardMode works with the GPU optimizer.

    Also checks that abs on uint* and bool dtypes has C code.
    """
    for dtype in ["uint8", "int64", "bool"]:
        x = vector(dtype=dtype)
        y = x + 1
        guard_mode = NanGuardMode(
            nan_is_error=True, optimizer=mode_with_gpu.optimizer
        )
        f = aesara.function([x], y, mode=guard_mode)
        d = np.asarray([23, 7]).astype(dtype)
        assert np.allclose(f(d), d + 1)
tests/gpuarray/test_multinomial.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
pytest
import
aesara
import
tests.unittest_tools
as
utt
from
aesara
import
function
from
aesara.configdefaults
import
config
from
aesara.gpuarray.multinomial
import
(
GPUAChoiceFromUniform
,
GPUAMultinomialFromUniform
,
)
from
aesara.sandbox
import
multinomial
from
aesara.sandbox.rng_mrg
import
MRG_RandomStream
as
RandomStream
from
aesara.tensor.type
import
fmatrix
,
frow
,
fvector
,
iscalar
,
matrix
,
vector
from
tests.gpuarray.config
import
mode_with_gpu
def test_multinomial_output_dtype():
    # This tests the MultinomialFromUniform Op directly, not going through the
    # multinomial() call in GPU random generation.
    p = fmatrix()
    u = fvector()
    for dtype in ["int64", "float32", "float16", "float64", "int32", "auto"]:
        m = aesara.sandbox.multinomial.MultinomialFromUniform(dtype)(p, u)
        # the m*2 allows the multinomial to reuse output
        f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            type(node.op) is GPUAMultinomialFromUniform
            for node in f.maker.fgraph.toposort()
        )
        # test that both first and second samples can be drawn
        utt.assert_allclose(
            f([[1, 0], [0, 1]], [0.1, 0.1]), [[2, 0], [0, 2]]
        )
        # test that both second labels can be drawn
        r = f([[0.2, 0.8], [0.3, 0.7]], [0.31, 0.31])
        utt.assert_allclose(r, [[0, 2], [0, 2]])
        # test that both first labels can be drawn
        r = f([[0.2, 0.8], [0.3, 0.7]], [0.21, 0.21])
        utt.assert_allclose(r, [[0, 2], [2, 0]])
        # change the size to make sure output gets reallocated ok
        # and also make sure that the GPU version doesn't screw up the
        # transposed-ness
        r = f([[0.2, 0.8]], [0.25])
        utt.assert_allclose(r, [[0, 2]])
def test_multinomial_input_dtype():
    # This tests the MultinomialFromUniform Op directly, not going through the
    # multinomial() call in GPU random generation.
    for idtype in ("float32", "float16", "float64"):
        for odtype in ("float32", "float16", "float64", "int32"):
            p = matrix("p", idtype)
            u = vector("u", idtype)
            # p = dmatrix('p')
            # u = dvector('u')
            m = aesara.sandbox.multinomial.MultinomialFromUniform(odtype)(p, u)

            # the m*2 allows the multinomial to reuse output
            f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)

            assert any(
                type(node.op) is GPUAMultinomialFromUniform
                for node in f.maker.fgraph.toposort()
            )

            # test that both first and second samples can be drawn
            utt.assert_allclose(f([[1, 0], [0, 1]], [0.1, 0.1]), [[2, 0], [0, 2]])

            # test that both second labels can be drawn
            r = f([[0.2, 0.8], [0.3, 0.7]], [0.31, 0.31])
            utt.assert_allclose(r, [[0, 2], [0, 2]])

            # test that both first labels can be drawn
            r = f([[0.2, 0.8], [0.3, 0.7]], [0.21, 0.21])
            utt.assert_allclose(r, [[0, 2], [2, 0]])

            # change the size to make sure output gets reallocated ok
            # and also make sure that the GPU version doesn't screw up the
            # transposed-ness
            r = f([[0.2, 0.8]], [0.25])
            utt.assert_allclose(r, [[0, 2]])
# TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large():
    # DEBUG_MODE will test this on GPU
    p = fmatrix()
    u = fvector()
    m = aesara.sandbox.multinomial.MultinomialFromUniform("auto")(p, u)
    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode_with_gpu)
    assert any(
        type(node.op) is GPUAMultinomialFromUniform
        for node in f.maker.fgraph.toposort()
    )

    pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    mval = f(pval, uval)

    assert mval.shape == pval.shape
    if config.cast_policy == "custom":
        assert mval.dtype == pval.dtype
    elif config.cast_policy == "numpy+floatX":
        assert mval.dtype == config.floatX
    elif config.cast_policy == "numpy":
        assert mval.dtype == "float64"
    else:
        raise NotImplementedError(config.cast_policy)
    utt.assert_allclose(mval.sum(axis=1), 2)
    asdf = np.asarray([0, 0, 2, 0]) + 0 * pval  # broadcast over all rows
    utt.assert_allclose(mval, asdf)
def test_gpu_opt_dtypes():
    # Test if the returned samples are of the datatype specified
    for dtype in ("uint32", "float32", "int64", "float64"):
        p = fmatrix()
        u = fvector()
        m = aesara.sandbox.multinomial.MultinomialFromUniform(dtype)(p, u)
        f = function([p, u], m, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            type(node.op) is GPUAMultinomialFromUniform
            for node in f.maker.fgraph.toposort()
        )

        pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = np.ones_like(pval[:, 0]) * 0.5
        samples = f(pval, uval)
        assert samples.dtype == dtype, f"{samples.dtype} != {dtype}"
def test_gpu_opt():
    # Does have some overlap with test_multinomial_0
    # We test the case where we put the op on the gpu when the output
    # is moved to the gpu.
    p = fmatrix()
    u = fvector()
    m = aesara.sandbox.multinomial.MultinomialFromUniform("auto")(p, u)
    assert m.dtype == "float32", m.dtype

    f = function([p, u], m, allow_input_downcast=True, mode=mode_with_gpu)
    assert any(
        type(node.op) is GPUAMultinomialFromUniform
        for node in f.maker.fgraph.toposort()
    )
    pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    f(pval, uval)

    # Test with a row, it was failing in the past.
    r = frow()
    m = aesara.sandbox.multinomial.MultinomialFromUniform("auto")(r, u)
    assert m.dtype == "float32", m.dtype

    f = function([r, u], m, allow_input_downcast=True, mode=mode_with_gpu)
    assert any(
        type(node.op) is GPUAMultinomialFromUniform
        for node in f.maker.fgraph.toposort()
    )
    pval = np.arange(1 * 4, dtype="float32").reshape((1, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = np.ones_like(pval[:, 0]) * 0.5
    f(pval, uval)
class TestOPWor:
    def test_select_distinct(self):
        # Tests that ChoiceFromUniform always selects distinct elements
        p = fmatrix()
        u = fvector()
        n = iscalar()
        m = multinomial.ChoiceFromUniform(odtype="auto")(p, u, n)
        f = function([p, u, n], m, allow_input_downcast=True)

        n_elements = 1000
        all_indices = range(n_elements)
        np.random.seed(12345)
        for i in (5, 10, 50, 100, 500, n_elements):
            uni = np.random.rand(i).astype(config.floatX)
            pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
            pvals /= pvals.sum(1)
            res = np.squeeze(f(pvals, uni, i))
            assert len(res) == i, res
            assert np.all(np.in1d(np.unique(res), all_indices)), res

    def test_fail_select_alot(self):
        # Tests that ChoiceFromUniform fails when asked to sample more
        # elements than the actual number of elements
        p = fmatrix()
        u = fvector()
        n = iscalar()
        m = multinomial.ChoiceFromUniform(odtype="auto")(p, u, n)
        f = function([p, u, n], m, allow_input_downcast=True)

        n_elements = 100
        n_selected = 200
        np.random.seed(12345)
        uni = np.random.rand(n_selected).astype(config.floatX)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        with pytest.raises(ValueError):
            f(pvals, uni, n_selected)

    def test_select_proportional_to_weight(self):
        # Tests that ChoiceFromUniform selects elements, on average,
        # proportional to the their probabilities
        p = fmatrix()
        u = fvector()
        n = iscalar()
        m = multinomial.ChoiceFromUniform(odtype="auto")(p, u, n)
        f = function([p, u, n], m, allow_input_downcast=True)

        n_elements = 100
        n_selected = 10
        mean_rtol = 0.0005
        np.random.seed(12345)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        avg_pvals = np.zeros((n_elements,), dtype=config.floatX)

        for rep in range(10000):
            uni = np.random.rand(n_selected).astype(config.floatX)
            res = np.squeeze(f(pvals, uni, n_selected))
            avg_pvals[res] += 1
        avg_pvals /= avg_pvals.sum()
        avg_diff = np.mean(abs(avg_pvals - pvals))
        assert avg_diff < mean_rtol, avg_diff
class TestFunctionWor:
    def test_select_distinct(self):
        # Tests that multinomial_wo_replacement always selects distinct elements
        th_rng = RandomStream(12345)
        p = fmatrix()
        n = iscalar()
        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
        f = function([p, n], m, allow_input_downcast=True)

        n_elements = 1000
        all_indices = range(n_elements)
        np.random.seed(12345)
        for i in (5, 10, 50, 100, 500, n_elements):
            pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
            pvals /= pvals.sum(1)
            res = np.squeeze(f(pvals, i))
            assert len(res) == i
            assert np.all(np.in1d(np.unique(res), all_indices)), res

    def test_fail_select_alot(self):
        # Tests that multinomial_wo_replacement fails when asked to sample more
        # elements than the actual number of elements
        th_rng = RandomStream(12345)
        p = fmatrix()
        n = iscalar()
        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
        f = function([p, n], m, allow_input_downcast=True)

        n_elements = 100
        n_selected = 200
        np.random.seed(12345)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        with pytest.raises(ValueError):
            f(pvals, n_selected)

    def test_select_proportional_to_weight(self):
        # Tests that multinomial_wo_replacement selects elements, on average,
        # proportional to the their probabilities
        th_rng = RandomStream(12345)
        p = fmatrix()
        n = iscalar()
        m = th_rng.multinomial_wo_replacement(pvals=p, n=n)
        f = function([p, n], m, allow_input_downcast=True)

        n_elements = 100
        n_selected = 10
        mean_rtol = 0.0005
        np.random.seed(12345)
        pvals = np.random.randint(1, 100, (1, n_elements)).astype(config.floatX)
        pvals /= pvals.sum(1)
        avg_pvals = np.zeros((n_elements,), dtype=config.floatX)

        for rep in range(10000):
            res = np.squeeze(f(pvals, n_selected))
            avg_pvals[res] += 1
        avg_pvals /= avg_pvals.sum()
        avg_diff = np.mean(abs(avg_pvals - pvals))
        assert avg_diff < mean_rtol
def test_gpu_opt_wor():
    # We test the case where we put the op on the gpu when the output
    # is moved to the gpu.
    p = fmatrix()
    u = fvector()
    n = iscalar()
    for replace in (False, True):
        m = multinomial.ChoiceFromUniform(odtype="auto", replace=replace)(p, u, n)
        assert m.dtype == "int64", m.dtype

        f = function([p, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            type(node.op) is GPUAChoiceFromUniform
            for node in f.maker.fgraph.toposort()
        )
        n_samples = 3
        pval = np.arange(10000 * 4, dtype="float32").reshape((10000, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = np.ones(pval.shape[0] * n_samples) * 0.5
        f(pval, uval, n_samples)

        # Test with a row, it was failing in the past.
        r = frow()
        m = multinomial.ChoiceFromUniform("auto", replace=replace)(r, u, n)
        assert m.dtype == "int64", m.dtype

        f = function([r, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
        assert any(
            type(node.op) is GPUAChoiceFromUniform
            for node in f.maker.fgraph.toposort()
        )
        pval = np.arange(1 * 4, dtype="float32").reshape((1, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = np.ones_like(pval[:, 0]) * 0.5
        f(pval, uval, 1)
tests/gpuarray/test_neighbours.py
deleted
100644 → 0
浏览文件 @
c803c67e
from
aesara.gpuarray.neighbours
import
GpuImages2Neibs
from
tests.gpuarray.config
import
mode_with_gpu
from
tests.tensor.nnet
import
test_neighbours
class TestGpuImages2Neibs(test_neighbours.TestImages2Neibs):
    # Re-run the CPU Images2Neibs test suite against the GPU implementation.
    mode = mode_with_gpu
    op = GpuImages2Neibs
    dtypes = ["int64", "float32", "float64"]
tests/gpuarray/test_nnet.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
aesara
import
aesara.tensor
as
at
import
tests.unittest_tools
as
utt
from
aesara.gpuarray.nnet
import
(
GpuCrossentropySoftmax1HotWithBiasDx
,
GpuCrossentropySoftmaxArgmax1HotWithBias
,
GpuSoftmax
,
GpuSoftmaxWithBias
,
)
from
aesara.gradient
import
grad
from
aesara.tensor.math
import
argmax
,
log
,
mean
from
aesara.tensor.nnet
import
crossentropy_softmax_1hot_with_bias_dx
from
aesara.tensor.type
import
fmatrix
,
fvector
,
lvector
,
matrix
,
vector
from
tests.gpuarray.config
import
mode_with_gpu
,
mode_without_gpu
# GPU compilation mode with the cuDNN optimizations excluded, so tests below
# exercise the plain GpuSoftmax kernels instead of cuDNN's implementation.
mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    # This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias
    # We check that we loop when their is too much threads
    n_in = 1000
    batch_size = 4097
    n_out = 1250
    if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
        n_in = 4098
        n_out = 4099

    y = lvector("y")
    b = fvector("b")

    # we precompute the dot with big shape before to allow the test of
    # GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
    # (the launch timed out and was terminated) on GPU card not
    # powerful enough. We need the big shape to check for corner
    # case.
    dot_result = fmatrix("dot_result")

    xx = np.asarray(np.random.rand(batch_size, n_in), dtype=np.float32)
    yy = np.ones((batch_size,), dtype="int32")
    b_values = np.zeros((n_out,), dtype="float32")
    W_values = np.asarray(np.random.rand(n_in, n_out), dtype="float32")
    dot_value = np.asarray(np.dot(xx, W_values), dtype="float32")
    del W_values

    p_y_given_x = aesara.tensor.nnet.softmax(dot_result + b)
    y_pred = argmax(p_y_given_x, axis=-1)
    loss = -mean(log(p_y_given_x)[at.arange(y.shape[0]), y])
    dW = grad(loss, dot_result)

    classify = aesara.function(
        inputs=[y, b, dot_result], outputs=[loss, y_pred, dW], mode=mode_without_gpu
    )
    classify_gpu = aesara.function(
        inputs=[y, b, dot_result], outputs=[loss, y_pred, dW], mode=mode_with_gpu
    )

    assert any(
        isinstance(node.op, aesara.tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias)
        for node in classify.maker.fgraph.toposort()
    )
    assert any(
        isinstance(node.op, GpuCrossentropySoftmaxArgmax1HotWithBias)
        for node in classify_gpu.maker.fgraph.toposort()
    )

    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)

    assert len(out) == len(gout) == 3
    utt.assert_allclose(out[0], gout[0])
    utt.assert_allclose(out[2], gout[2], atol=3e-6)
    utt.assert_allclose(out[1], gout[1])
def test_GpuCrossentropySoftmax1HotWithBiasDx():
    # This is basic test for GpuCrossentropySoftmax1HotWithBiasDx
    # We check that we loop when their is too much threads
    batch_size = 4097
    n_out = 1250
    if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
        n_out = 4099

    softmax_output_value = np.random.rand(batch_size, n_out).astype("float32")
    dnll_value = np.asarray(np.random.rand(batch_size), dtype="float32")
    y_idx_value = np.random.randint(low=0, high=5, size=batch_size)

    softmax_output = fmatrix()
    # NOTE(review): normalizes by shape[1] along the reshape — looks like it
    # may have been intended to be shape[0]; preserved as-is.
    softmax_output /= softmax_output.sum(axis=1).reshape(softmax_output.shape[1], 1)
    op = crossentropy_softmax_1hot_with_bias_dx(dnll_value, softmax_output, y_idx_value)

    cpu_f = aesara.function([softmax_output], op, mode=mode_without_gpu)
    gpu_f = aesara.function([softmax_output], op, mode=mode_with_gpu)
    # aesara.printing.debugprint(cpu_f)
    # aesara.printing.debugprint(gpu_f)

    assert any(
        isinstance(node.op, aesara.tensor.nnet.CrossentropySoftmax1HotWithBiasDx)
        for node in cpu_f.maker.fgraph.toposort()
    )
    assert any(
        isinstance(node.op, GpuCrossentropySoftmax1HotWithBiasDx)
        for node in gpu_f.maker.fgraph.toposort()
    )

    cpu_out = cpu_f(softmax_output_value)
    gpu_out = gpu_f(softmax_output_value)

    rtol = 1e-5
    atol = 1e-6
    utt.assert_allclose(cpu_out, gpu_out, rtol=rtol, atol=atol)
def test_softmax_with_bias_float16():
    # Exercise every mixed-precision combination involving float16.
    softmax_with_bias_unittest_template(dtypeInput="float16", dtypeBias="float32")
    softmax_with_bias_unittest_template(dtypeInput="float16", dtypeBias="float16")
    softmax_with_bias_unittest_template(dtypeInput="float32", dtypeBias="float16")
def test_softmax_with_bias_float32():
    # Pure single-precision case.
    softmax_with_bias_unittest_template(dtypeInput="float32", dtypeBias="float32")
def test_softmax_with_bias_float64():
    # Exercise every combination involving float64.
    softmax_with_bias_unittest_template(dtypeInput="float32", dtypeBias="float64")
    softmax_with_bias_unittest_template(dtypeInput="float64", dtypeBias="float32")
    softmax_with_bias_unittest_template(dtypeInput="float64", dtypeBias="float64")
def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
    # This is a basic test for GpuSoftmaxWithBias.
    #
    # We check that we loop when there are too many blocks.
    #
    # TODO: check that we loop when there are too many threads. (THIS IS
    # NOT IMPLEMENTED)
    x = matrix("x", dtype=dtypeInput)
    b = vector("b", dtype=dtypeBias)

    z = aesara.tensor.nnet.softmax_with_bias(x, b)

    f = aesara.function([x, b], z, mode=mode_without_gpu)
    f_gpu = aesara.function([x, b], z, mode=mode_with_gpu)
    assert f.maker.fgraph.toposort()[-1].op == aesara.tensor.nnet.softmax_with_bias
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op, GpuSoftmaxWithBias)

    def cmp(n, m):
        data = np.random.uniform(1e-7, 1, (n, m)).astype(dtype=dtypeInput)
        b_data = np.random.uniform(1e-7, 1, (m,)).astype(dtype=dtypeBias)
        out = f(data, b_data)
        gout = f_gpu(data, b_data)
        utt.assert_allclose(out, gout)

    cmp(2, 5)
    # we need to test n>32*1024 to check that we make the block loop.
    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(784, 784)
    cmp(4, 1000)
    cmp(4, 1024)
    cmp(4, 2000)
    cmp(4, 2024)
    # GTX285 don't have enough shared mem for this case.
    cmp(4, 4074)
    # The GTX580, 680 and kepler don't have enough shared memory.
    cmp(2, 10000)
    cmp(128, 16 * 1024)
    cmp(128, 64 * 1024)
def test_softmax_float16():
    softmax_unittest_template("float16")
def test_softmax_float32():
    softmax_unittest_template("float32")
def test_softmax_float64():
    softmax_unittest_template("float64")
def softmax_unittest_template(dtypeInput):
    # This is basic test for GpuSoftmax.
    #
    # We check that we loop when their is too much block
    # We use slower code when there isn't enough shared memory
    x = matrix("x", dtype=dtypeInput)

    z = aesara.tensor.nnet.softmax(x)

    f = aesara.function([x], z, mode=mode_without_gpu)
    f_gpu = aesara.function([x], z, mode=mode_wo_cudnn)
    assert f.maker.fgraph.toposort()[-1].op == aesara.tensor.nnet.softmax_legacy
    assert isinstance(f_gpu.maker.fgraph.toposort()[-2].op, GpuSoftmax)

    def cmp(n, m):
        data = np.random.uniform(0, 1, (n, m)).astype(dtype=dtypeInput)
        out = f(data)
        gout = f_gpu(data)
        utt.assert_allclose(out, gout)

    # we need to test n>32*1024 to check that we make the block loop.
    cmp(2, 5)
    cmp(2 << 15, 5)
    cmp(4074, 400)
    cmp(784, 784)
    cmp(4, 1000)
    cmp(4, 1024)
    cmp(4, 2000)
    cmp(4, 2024)
    # The GTX285 don't have enough shared memory.
    cmp(4, 4074)
    # The GTX580, 680 and kepler don't have enough shared memory.
    cmp(2, 10000)
    cmp(128, 16 * 1024)
    cmp(128, 64 * 1024)
class TestSoftMax:
    gpu_op = GpuSoftmax
    mode = mode_wo_cudnn

    def _test_softmax(self, x, x_gpu, f_z, f_gpu_z, cmp):
        # This is basic test for GpuSoftmax and GpuDnnSoftmax
        #
        # We check that we loop when there is too much block
        # We use slower code when there isn't enough shared memory
        f_z_out = f_z(x)
        f_gpu_z_out = f_gpu_z(x_gpu)

        f = aesara.function([x], f_z_out, mode=mode_without_gpu)
        f_gpu = aesara.function([x_gpu], f_gpu_z_out, mode=self.mode)
        self._check_types(f, f_gpu, aesara.tensor.nnet.Softmax, self.gpu_op)

        # we need to test n>32*1024 to check that we make the block loop.
        cmp(1, 5, f, f_gpu)
        cmp(2, 5, f, f_gpu)
        cmp(10, 5, f, f_gpu)
        cmp(100, 5, f, f_gpu)
        cmp(1000, 5, f, f_gpu)
        cmp(10000, 5, f, f_gpu)
        cmp(4074, 400, f, f_gpu)
        cmp(784, 784, f, f_gpu)
        cmp(4, 1000, f, f_gpu)
        cmp(4, 1024, f, f_gpu)
        cmp(4, 2000, f, f_gpu)
        cmp(4, 2024, f, f_gpu)
        # The GTX285 don't have enough shared memory.
        cmp(4, 4074, f, f_gpu)
        # The GTX580, 680 and kepler don't have enough shared memory.
        cmp(2, 10000, f, f_gpu)
        cmp(128, 16 * 1024, f, f_gpu)
        cmp(128, 64 * 1024, f, f_gpu)
        # cudnn permits no more than 2^15 - 1 rows
        cmp((2 << 15) - 1, 5, f, f_gpu)
        cmp(5, 2 << 15, f, f_gpu)
        return f, f_gpu

    def _cmp(self, n, m, f, f_gpu):
        data = np.arange(n * m, dtype="float32").reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        utt.assert_allclose(out, gout)

    def _check_types(self, graph, graph_gpu, f_type, f_gpu_type):
        assert isinstance(graph.maker.fgraph.toposort()[-1].op, f_type)
        gpu_nodes = [
            node
            for node in graph_gpu.maker.fgraph.toposort()
            if isinstance(node.op, f_gpu_type)
        ]
        assert len(gpu_nodes) == 1

    def test_softmax(self):
        x = fmatrix("x")
        z = aesara.tensor.nnet.softmax_legacy

        f, f_gpu = self._test_softmax(x, x, z, z, self._cmp)

        self._cmp(2 << 15, 5, f, f_gpu)

    def test_softmax_shape_0(self):
        x = fmatrix("x")
        z = aesara.tensor.nnet.softmax_legacy

        f, f_gpu = self._test_softmax(x, x, z, z, self._cmp)

        # Aesara can handle that case, but cudnn can't
        self._cmp(0, 10, f, f_gpu)
tests/gpuarray/test_opt.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
pytest
import
aesara
import
aesara.gpuarray
import
aesara.tensor.slinalg
as
slinalg
from
aesara
import
tensor
as
at
from
aesara.breakpoint
import
PdbBreakpoint
from
aesara.configdefaults
import
config
from
aesara.gpuarray
import
basic_ops
,
blas
,
dnn
,
opt
from
aesara.gpuarray.basic_ops
import
(
GpuAlloc
,
GpuAllocEmpty
,
GpuFromHost
,
GpuReshape
,
HostFromGpu
,
host_from_gpu
,
)
from
aesara.gpuarray.blas
import
GpuGemm
from
aesara.gpuarray.dnn
import
GpuDnnReduction
from
aesara.gpuarray.elemwise
import
(
Elemwise
,
GpuCAReduceCPY
,
GpuCAReduceCuda
,
GpuElemwise
,
max_inputs_to_GpuElemwise
,
)
from
aesara.gpuarray.linalg
import
GpuCholesky
,
GpuCusolverSolve
,
cusolver_available
from
aesara.gpuarray.subtensor
import
GpuSubtensor
from
aesara.gpuarray.type
import
GpuArrayType
,
get_context
,
gpuarray_shared_constructor
from
aesara.graph.opt
import
check_stack_trace
from
aesara.raise_op
import
Assert
,
assert_op
from
aesara.tensor.basic
import
Alloc
,
AllocEmpty
,
MakeVector
,
Rebroadcast
from
aesara.tensor.blas
import
batched_dot
from
aesara.tensor.math
import
dot
,
eq
,
exp
,
gt
,
tanh
from
aesara.tensor.nnet
import
abstract_conv
from
aesara.tensor.type
import
(
TensorType
,
bmatrix
,
cscalar
,
fmatrix
,
fscalar
,
ftensor4
,
iscalar
,
ivector
,
lscalar
,
lvector
,
matrix
,
scalar
,
tensor3
,
vector
,
)
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
mode_without_gpu
,
test_ctx_name
from
tests.tensor.test_basic
import
TestSpecifyShape
from
tests.test_ifelse
import
TestIfelse
def _check_stack_trace(thing):
    from aesara.tensor.shape import Shape, Shape_i

    def _ops_to_check(op):
        if not isinstance(op, aesara.graph.op.Op):
            op = op.op  # assume it is an apply node
        return not isinstance(
            op,
            (
                Shape_i,
                Shape,
                aesara.compile.ops.DeepCopyOp,
                MakeVector,
                aesara.tensor.subtensor.Subtensor,
                aesara.tensor.elemwise.Elemwise,
                aesara.ifelse.IfElse,
                GpuFromHost,
                HostFromGpu,
            ),
        )

    return check_stack_trace(thing, ops_to_check=_ops_to_check, bug_print="ignore")
def test_local_assert():
    x = fmatrix()
    a = assert_op(x, eq(x, 0).any())
    f = aesara.function([x], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    a_op = [n for n in topo if isinstance(n.op, Assert)]
    assert len(a_op) == 1
    assert isinstance(a_op[0].inputs[0].type, GpuArrayType)
def test_local_remove_all_assert():
    x = fmatrix()
    a = assert_op(x, eq(x, 0).any())

    # By default `unsafe` should not be there
    f = aesara.function([x], a, mode=mode_with_gpu.excluding("unsafe"))
    topo = f.maker.fgraph.toposort()
    a_op = [n for n in topo if isinstance(n.op, Assert)]
    assert len(a_op) == 1

    # Put `unsafe`
    f = aesara.function([x], a, mode=mode_with_gpu.including("unsafe"))
    topo = f.maker.fgraph.toposort()
    a_op = [n for n in topo if isinstance(n.op, Assert)]
    assert len(a_op) == 0

    # Remove `unsafe`
    f = aesara.function([x], a, mode=mode_with_gpu.excluding("unsafe"))
    topo = f.maker.fgraph.toposort()
    a_op = [n for n in topo if isinstance(n.op, Assert)]
    assert len(a_op) == 1
def test_local_gpu_contiguous_gpu_contiguous():
    # Two chained gpu_contiguous calls must collapse into a single node.
    a = fmatrix()
    o1 = basic_ops.gpu_contiguous(a)
    o2 = basic_ops.gpu_contiguous(o1)
    f1 = aesara.function([a], o1, mode=mode_with_gpu)
    f2 = aesara.function([a], o2, mode=mode_with_gpu)
    for fn in (f1, f2):
        contiguous_nodes = [
            node
            for node in fn.maker.fgraph.toposort()
            if isinstance(node.op, basic_ops.GpuContiguous)
        ]
        assert 1 == len(contiguous_nodes)
    assert _check_stack_trace(f1)
    assert _check_stack_trace(f2)
def test_local_gpu_contiguous():
    # cpu_contiguous must be replaced by its GPU counterpart.
    a = fmatrix()
    o = aesara.tensor.extra_ops.cpu_contiguous(a)
    f = aesara.function([a], o, mode=mode_with_gpu)
    contiguous_nodes = [
        node
        for node in f.maker.fgraph.toposort()
        if isinstance(node.op, basic_ops.GpuContiguous)
    ]
    assert 1 == len(contiguous_nodes)
    f([[2.0]])
    assert _check_stack_trace(f)
def test_flatten():
    m = fmatrix()
    f = aesara.function([m], m.flatten(), mode=mode_with_gpu)
    val = np.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.flatten())
    assert res.shape == val.flatten().shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]

    val = np.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.flatten())
    assert res.shape == val.flatten().shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    assert _check_stack_trace(f)

    f = aesara.function(
        [m], m.flatten(ndim=2), mode=mode_with_gpu.excluding("local_useless_reshape")
    )
    val = np.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val)
    assert res.shape == val.shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    assert _check_stack_trace(f)

    m = tensor3()
    f = aesara.function([m], m.flatten(ndim=2), mode=mode_with_gpu)
    val = np.random.rand(10, 11, 12).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.reshape(10, -1))
    assert res.shape == val.reshape(10, -1).shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    assert _check_stack_trace(f)
def test_reduce():
    kind = get_context(test_ctx_name).kind

    for method, param in [
        ("sum", dict(acc_dtype="float32")),
        ("prod", dict(acc_dtype="float32")),
        ("max", {}),
        ("min", {}),
    ]:
        m = fmatrix()
        f = aesara.function(
            [m], getattr(m, method)(axis=0, **param), mode=mode_with_gpu
        )
        # assert _check_stack_trace(f) this op is ok but since
        # it is using GpuCAReduceCuda that has an empty stack
        # trace, this assertion gives error.
        val = np.random.rand(10, 11).astype("float32")
        res = f(val)
        utt.assert_allclose(res, getattr(val, method)(axis=0))
        assert res.shape == (11,)
        topo = f.maker.fgraph.toposort()
        ops = [type(node.op) for node in topo]

        has_reduce = (
            GpuCAReduceCuda in ops or GpuCAReduceCPY in ops or GpuDnnReduction in ops
        )
        if kind == b"opencl" and method in ["max", "min"]:
            assert not has_reduce
        else:
            assert has_reduce
def test_local_gpualloc_memset_0():
    i = iscalar()
    z = np.zeros((1,), dtype="float32")
    o = np.ones((1,), dtype="float32")
    ones = np.ones((2,), dtype="float32")

    # Test with 0 from CPU op.
    # Should not be transferred as the only client is the output
    a = at.alloc(z, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, Alloc)
    assert (np.asarray(f(6)) == 0).all()
    assert _check_stack_trace(f)

    # Test with 0 from CPU op.
    # Should be transferred as it is used by another op.
    a = at.alloc(z, i)
    f = aesara.function([i], a.cumsum(), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, GpuAlloc)
    assert (np.asarray(f(6)) == 0).all()
    assert _check_stack_trace(f)

    # Test with 0
    a = GpuAlloc(test_ctx_name)(z, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0
    assert (np.asarray(f(6)) == 0).all()
    assert _check_stack_trace(f)

    # Test with 1
    a = GpuAlloc(test_ctx_name)(o, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc)
    assert not topo[0].op.memset_0
    assert (np.asarray(f(6)) == 1).all()
    assert _check_stack_trace(f)

    # Test with 1, 1
    a = GpuAlloc(test_ctx_name)(ones, i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc)
    assert not topo[0].op.memset_0
    assert (np.asarray(f(2)) == 1).all()
    assert _check_stack_trace(f)
def test_local_gpualloc_empty():
    i = iscalar()
    ii = iscalar()

    # Test with vector
    # Should not be moved as the only client is the output
    a = AllocEmpty("float32")(i)
    f = aesara.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, AllocEmpty)
    # This return not initialized data, so we can only check the shape
    assert f(3).shape == (3,)
    assert _check_stack_trace(f)

    # Test with vector
    # Should be moved
    a = AllocEmpty("float32")(i)
    f = aesara.function([i], a.cumsum(), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, GpuAllocEmpty)
    # This return not initialized data, so we can only check the shape
    assert f(3).shape == (3,)
    assert _check_stack_trace(f)

    # Test with matrix
    a = AllocEmpty("float32")(i, ii)
    f = aesara.function([i, ii], a.cumsum(axis=0), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, GpuAllocEmpty)
    # This return not initialized data, so we can only check the shape
    assert f(3, 4).shape == (3, 4)
    assert _check_stack_trace(f)
def test_rebroadcast():
    d = np.random.rand(10, 10).astype("float32")
    v = fmatrix()
    up = at.unbroadcast(v.sum().dimshuffle("x", "x"), 0, 1)
    f = aesara.function([v], [up], mode=mode_with_gpu)

    f(d)

    topo = f.maker.fgraph.toposort()
    rebrs = [node for node in topo if isinstance(node.op, Rebroadcast)]
    assert len(rebrs) == 1
    rebr = rebrs[0]

    assert isinstance(rebr.inputs[0].type, GpuArrayType)
    assert isinstance(rebr.outputs[0].type, GpuArrayType)
    assert _check_stack_trace(f)
class TestSpecifyShape(TestSpecifyShape):
    # Re-run the CPU SpecifyShape tests with GPU mode and GPU input type.
    mode = mode_with_gpu
    input_type = GpuArrayType
class TestGpuIfelse(TestIfelse):
    mode = mode_with_gpu

    @staticmethod
    def cast_output(v):
        return basic_ops.as_gpuarray_variable(v, test_ctx_name)

    shared = staticmethod(gpuarray_shared_constructor)

    def get_ifelse(self, n):
        return aesara.ifelse.IfElse(n, gpu=True, as_view=True)

    def test_lifter_with_inputs_of_graph(self):
        x = vector()
        cond = iscalar()
        f = aesara.function(
            [x, cond],
            aesara.ifelse.ifelse(cond, x.mean(), x.sum()),
            mode=mode_with_gpu,
        )
        assert f(np.float32([1, 2, 3]), 0) == 6
        assert _check_stack_trace(f)

        x = vector()
        cond = scalar()
        f = aesara.function(
            [x, cond],
            aesara.ifelse.ifelse(cond, x.mean(), x.sum()),
            mode=mode_with_gpu,
        )
        assert f(np.float32([1, 2, 3]), 0) == 6
        assert _check_stack_trace(f)

    def test_lifter_with_shared_var(self):
        x = lscalar("x")
        y = gpuarray_shared_constructor(
            np.asarray(1, dtype="float32"), target=test_ctx_name
        )
        z = at.constant(2.0)

        a = aesara.ifelse.ifelse(x, y, z)
        with config.change_flags(on_opt_error="raise"):
            aesara.function([x], [a], mode=mode_with_gpu)
def test_print_op():
    """A ``Print`` op in the graph must not prevent the GPU lift: the
    expected topology is host->gpu transfer, Print, GpuElemwise, gpu->host."""
    inp = fmatrix()
    doubled = aesara.printing.Print()(inp) * 2
    fn = aesara.function([inp], doubled, mode=mode_with_gpu)
    nodes = fn.maker.fgraph.toposort()
    assert isinstance(nodes[0].op, GpuFromHost)
    assert isinstance(nodes[1].op, aesara.printing.Print)
    assert isinstance(nodes[2].op, GpuElemwise)
    assert nodes[3].op == host_from_gpu
    assert _check_stack_trace(fn)
    fn(np.random.random((5, 5)).astype("float32"))
def test_pdbbreakpoint_op():
    """A ``PdbBreakpoint`` op must not block the GPU lift of the
    computation that follows it."""
    # Test that PdbBreakpoint ops don't block gpu optimization
    b = fmatrix()
    # Create a function composed of a breakpoint followed by
    # some computation
    condition = gt(b.sum(), 0)
    b_monitored = PdbBreakpoint(name="TestBreakpoint")(condition, b)
    output = b_monitored ** 2
    f = aesara.function([b], output, mode=mode_with_gpu)
    # Ensure that, in the compiled function, the computation following the
    # breakpoint has been moved to the gpu.
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[-2].op, GpuElemwise)
    assert topo[-1].op == host_from_gpu
    assert _check_stack_trace(f)
def test_local_gpu_elemwise_careduce():
    """Elemwise followed by a reduction must fuse into a single
    ``GpuCAReduceCuda`` carrying the elemwise as its ``pre_scalar_op``
    (cuDNN excluded so the CUDA reduction path is exercised)."""
    mode_with_gpu_no_cudnn = mode_with_gpu.excluding("cudnn")
    x = matrix()

    def fn_sum_square(x, axis):
        return (x * x).sum(axis=axis)

    def fn_sum_abs(x, axis):
        return abs(x).sum(axis=axis)

    def fn_max_abs(x, axis):
        return abs(x).max(axis=axis)

    for fn, pre_scalar_op in (
        (fn_sum_square, aesara.scalar.sqr),
        (fn_sum_abs, aesara.scalar.abs_),
        (fn_max_abs, aesara.scalar.abs_),
    ):
        for axis in (None, 0, 1):
            o = fn(x, axis)
            f = aesara.function([x], o, mode=mode_with_gpu_no_cudnn)
            topo = f.maker.fgraph.toposort()
            assert len(topo) == 3
            assert isinstance(topo[1].op, GpuCAReduceCuda)
            assert topo[1].op.pre_scalar_op == pre_scalar_op
            assert _check_stack_trace(f)
            data = np.random.rand(3, 4).astype(config.floatX)
            utt.assert_allclose(fn(data, axis), f(data))
def test_local_lift_dot22scalar():
    """``Dot22Scalar`` must be replaced by ``GpuGemm`` on the GPU, with
    results matching the CPU function."""
    x = matrix()
    y = matrix()
    a = scalar()
    o = aesara.tensor.blas.Dot22Scalar()(x, y, a)
    f_cpu = aesara.function([x, y, a], o)
    f_gpu = aesara.function([x, y, a], o, mode=mode_with_gpu)
    assert not any(
        isinstance(n.op, aesara.tensor.blas.Dot22Scalar)
        for n in f_gpu.maker.fgraph.apply_nodes
    )
    assert any(isinstance(n.op, GpuGemm) for n in f_gpu.maker.fgraph.apply_nodes)
    x_val = np.random.random((2, 3)).astype(config.floatX)
    y_val = np.random.random((3, 4)).astype(config.floatX)
    a_val = 0.5
    utt.assert_allclose(f_cpu(x_val, y_val, a_val), f_gpu(x_val, y_val, a_val))
    assert _check_stack_trace(f_gpu)
def test_local_gpu_subtensor():
    """``Subtensor`` placement policy: stay on CPU when the input lives on
    the CPU and the slice is the lone client; move to GPU when the input
    has several GPU clients, to avoid transferring the full tensor twice."""
    # Test shared forced on CPU.
    t = aesara.shared(np.zeros(20, "float32"))
    f = aesara.function([], t[3:4], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert not any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)

    # Test graph input.
    t = fmatrix()
    f = aesara.function([t], t[3:4], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert not any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)

    # Test multiple use of the input
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = fmatrix()
    f = aesara.function([t], [t[3:4], t + 1], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)

    # Test multiple use of the input + input as output
    # We want the subtensor to be on the GPU to prevent multiple transfer.
    t = fmatrix()
    f = aesara.function([t], [t[3:4], t + 1, t], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert not any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert _check_stack_trace(f)

    # Test shared forced on CPU end we do computation on the output of
    # the subtensor.
    t = aesara.shared(np.zeros(20, "float32"))
    f = aesara.function([], t[3:4] + 1, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert any(type(node.op) is aesara.tensor.subtensor.Subtensor for node in topo)
    assert not any(isinstance(node.op, GpuSubtensor) for node in topo)
    # Our optimizer isn't smart enough to move to the GPU Elemwise.
    # If it where just a little bit smarter, it could wrongly move it to the GPU.
    # If it where super smart, it would know it should not move it to the GPU.
    assert any(
        isinstance(node.op, aesara.tensor.elemwise.Elemwise) for node in topo
    )
    assert _check_stack_trace(f)
def test_local_gpu_elemwise():
    """Mixed-dtype elemwise graphs (int8 upcast to float32) must compile to
    a single ``GpuElemwise`` with no CPU ``Elemwise`` left, whether the
    Composite is built during GPU optimization or already exists on the CPU.

    Everything after the bare ``return`` below is dead code kept for a
    not-yet-implemented multi-output feature.
    """
    # Test local_gpu_elemwise when there is a dtype upcastable to float32
    a = bmatrix()
    b = fmatrix()
    c = fmatrix()
    a_v = (np.random.rand(4, 5) * 10).astype("int8")
    b_v = (np.random.rand(4, 5) * 10).astype("float32")
    c_v = (np.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    f = aesara.function([a, b, c], a + b + c, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
    assert _check_stack_trace(f)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = aesara.scalar.int8()
    b_s = aesara.scalar.float32()
    c_s = aesara.scalar.float32()
    out_s = aesara.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = aesara.tensor.elemwise.Elemwise(out_s)
    f = aesara.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)
    assert _check_stack_trace(f)

    return
    # Not yet implemented
    # Test multiple output
    a_s = aesara.scalar.float32()
    a = fmatrix()
    from aesara.scalar.basic import identity

    out_s = aesara.scalar.Composite(
        [a_s, b_s, c_s], [identity(a_s), identity(c_s), identity(b_s)]
    )
    outs_op = aesara.tensor.elemwise.Elemwise(out_s)
    f = aesara.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v)
    utt.assert_allclose(out[1], c_v)
    utt.assert_allclose(out[2], b_v)
    assert _check_stack_trace(f)

    # Test multiple output
    out_s = aesara.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
    outs_op = aesara.tensor.elemwise.Elemwise(out_s)
    f = aesara.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == aesara.tensor.elemwise.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    # NOTE(review): the Composite's second output is a_s * b_s, but this
    # (unreachable) assertion compares against a_v * c_v — looks like a
    # latent bug to confirm if this branch is ever re-enabled.
    utt.assert_allclose(out[1], a_v * c_v)
    assert _check_stack_trace(f)

    # Test non-contiguous input
    c = gpuarray_shared_constructor(np.asarray(c_v, dtype="float32"))
    f = aesara.function(
        [a, b], outs_op(a[::2], b[::2], c[::2]), mode=mode_with_gpu
    )
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
    assert _check_stack_trace(f)
def test_many_arg_elemwise():
    """``add``/``mul`` with very many (64) arguments must still compile to
    GPU elemwise nodes (splitting when the per-node input limit is
    exceeded) and agree with the CPU result."""
    # This test checks whether the + and * elemwise ops can handle
    # extremely large numbers of arguments on gpu.
    rng = np.random.default_rng([1, 2, 3])
    nb_of_inputs_overflows = []
    for num_args in [64]:
        for op_to_test in [aesara.tensor.add, aesara.tensor.mul]:
            for nb_dim in [2, 8]:
                shapes = [rng.integers(1, 5) for i in range(nb_dim)]
                args = [
                    np.cast["float32"](rng.standard_normal(shapes))
                    for arg in range(0, num_args)
                ]
                symb_args = [
                    TensorType("float32", (False,) * nb_dim)()
                    for arg in range(0, num_args)
                ]
                outputs = []
                for mode in [mode_with_gpu, mode_without_gpu]:
                    # test the optimization local_gpua_elemwise
                    output = op_to_test(*symb_args)
                    f = aesara.function(symb_args, output, mode=mode)
                    outputs.append(f(*args))
                    # assert that the test was done on the gpu.
                    if mode is mode_with_gpu:
                        # Record by how much num_args exceeds (negative) or
                        # fits under (non-negative) the GpuElemwise limit.
                        nb_of_inputs_overflows.append(
                            max_inputs_to_GpuElemwise(output.owner) - num_args
                        )
                        nodelst = [node for node in f.maker.fgraph.apply_nodes]
                        assert any(
                            isinstance(node.op, GpuElemwise) for node in nodelst
                        )
                        assert not any(
                            isinstance(node.op, Elemwise)
                            for node in nodelst
                            if not isinstance(node.op, GpuElemwise)
                        )
                results_gpu, results_cpu = outputs
                utt.assert_allclose(results_gpu, results_cpu)
    # Make sure we test at least one case with no number of inputs overflow
    assert any(overflow >= 0 for overflow in nb_of_inputs_overflows)
    # Make sure we test at least one case with number of inputs overflow
    assert any(overflow < 0 for overflow in nb_of_inputs_overflows)
def test_not_useless_scalar_gpuelemwise():
    """A scalar elemwise whose result is consumed on the CPU must stay a
    CPU ``Elemwise`` rather than being uselessly moved to the GPU."""
    # We don't want to move elemwise on scalar on the GPU when the
    # result will not be used on the GPU!
    with config.change_flags(warn_float64="ignore"):
        X = fmatrix()
        x = np.random.standard_normal((32, 32)).astype(np.float32)
        m1 = aesara.shared(np.random.standard_normal((32, 32)).astype(np.float32))
        loss = (X - dot(X, m1)).norm(L=2)
        lr = aesara.shared(np.asarray(0.001, dtype=np.float32))
        grad = aesara.grad(loss, m1)
        train = aesara.function(
            inputs=[X], updates=[(m1, m1 - lr * grad)], mode=mode_with_gpu
        )
        train(x)
        topo = train.maker.fgraph.toposort()
        gemms = [app for app in topo if isinstance(app.op, GpuGemm)]
        assert len(gemms) == 2
        # The scalar factor feeding the second gemm must come from a CPU
        # Elemwise, not a GpuElemwise.
        assert isinstance(
            gemms[1].inputs[1].owner.op, aesara.tensor.elemwise.Elemwise
        )
def test_local_lift_abstractconv_gpu_shape():
    """The GPU lift of ``AbstractConv2d_gradWeights`` (with a symbolic
    output shape) must compile cleanly and keep stack traces."""
    with config.change_flags(on_opt_error="raise"):
        out_shape = ivector()
        img = ftensor4()
        topgrad = ftensor4()
        grad_w = aesara.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights()(
            img, topgrad, out_shape
        )
        fn = aesara.function([out_shape, img, topgrad], grad_w, mode=mode_with_gpu)
        assert _check_stack_trace(fn)
def test_local_assert_no_cpu_op():
    """With the ``assert_no_cpu_op`` pass enabled and elemwise lifting
    disabled, compilation must raise when a CPU op remains; with the flag
    set to ``ignore`` it must compile."""
    rng = np.random.default_rng(utt.fetch_seed())
    m = rng.uniform(-1, 1, (10, 10)).astype("float32")
    ms = gpuarray_shared_constructor(m, name="m_shared")
    out = tanh(ms).dot(ms.T)
    # Keep the tanh on the CPU by excluding the elemwise lifter.
    mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
    mode_local_assert = mode_local_assert.excluding("local_gpua_elemwise")
    with config.change_flags(assert_no_cpu_op="raise", on_opt_error="ignore"):
        with pytest.raises(AssertionError):
            aesara.function([], out, mode=mode_local_assert)
    with config.change_flags(assert_no_cpu_op="ignore"):
        f = aesara.function([], out, mode=mode_local_assert)
        assert _check_stack_trace(f)
def test_no_complex():
    """A graph mixing a complex scalar with float scalars must still
    compile under the GPU mode and keep stack traces."""
    width_var = cscalar()
    freq_var = fscalar()
    signal_var = fscalar()
    stft_out = exp(width_var * freq_var) * signal_var
    fn = aesara.function(
        [width_var, freq_var, signal_var], stft_out, mode=mode_with_gpu
    )
    assert _check_stack_trace(fn)
@utt.assertFailure_fast
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_local_lift_solve():
    """``slinalg.solve`` must be lifted to an inplace ``GpuCusolverSolve``
    (inplace because nothing else reuses its input) and match the CPU
    result."""
    A = fmatrix()
    b = fmatrix()
    o = slinalg.solve(A, b)
    f_cpu = aesara.function([A, b], o, mode_without_gpu)
    f_gpu = aesara.function([A, b], o, mode=mode_with_gpu)
    assert not any(
        isinstance(n.op, slinalg.Solve) for n in f_gpu.maker.fgraph.apply_nodes
    )
    assert any(
        isinstance(n.op, GpuCusolverSolve) and n.op.inplace
        for n in f_gpu.maker.fgraph.apply_nodes
    )
    A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
    assert _check_stack_trace(f_gpu)
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_gpu_solve_not_inplace():
    """When the solve's input is reused by another op (here ``dot``), the
    lifted ``GpuCusolverSolve`` must NOT be inplace."""
    A = fmatrix()
    b = fmatrix()
    s = slinalg.solve(A, b)
    o = dot(A, s)
    f_cpu = aesara.function([A, b], o, mode_without_gpu)
    f_gpu = aesara.function([A, b], o, mode=mode_with_gpu)
    count_not_inplace = len(
        [
            n.op
            for n in f_gpu.maker.fgraph.apply_nodes
            if isinstance(n.op, GpuCusolverSolve) and not n.op.inplace
        ]
    )
    assert count_not_inplace == 1, count_not_inplace
    A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
@utt.assertFailure_fast
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_local_lift_cholesky():
    """``slinalg.cholesky`` must be lifted to an inplace ``GpuCholesky``
    and agree with the CPU result on a positive-definite input."""
    A = fmatrix()
    o = slinalg.cholesky(A)
    f_cpu = aesara.function([A], o, mode=mode_without_gpu)
    f_gpu = aesara.function([A], o, mode=mode_with_gpu)
    assert not any(
        isinstance(n.op, slinalg.Cholesky)
        for n in f_gpu.maker.fgraph.apply_nodes
    )
    # GpuCholesky op in this graph should be inplace (as his input is not reused by other op).
    assert any(
        isinstance(n.op, GpuCholesky) and n.op.inplace
        for n in f_gpu.maker.fgraph.apply_nodes
    )
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # A = M.dot(M) will be positive definite for all non-singular M
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
@pytest.mark.skipif(not cusolver_available, reason="No cuSolver or SciPy")
def test_gpu_cholesky_not_inplace():
    """When the cholesky input is reused by another op, the lifted
    ``GpuCholesky`` must NOT be inplace."""
    A = fmatrix()
    A_squared = A ** 2
    B = slinalg.cholesky(A_squared)
    D = B + A_squared
    f_cpu = aesara.function([A], D, mode=mode_without_gpu)
    f_gpu = aesara.function([A], D, mode=mode_with_gpu)
    # GpuCholesky op in this graph should NOT be inplace (as his input is reused in another op)
    count_cholesky_not_inplace = len(
        [
            n.op
            for n in f_gpu.maker.fgraph.apply_nodes
            if isinstance(n.op, GpuCholesky) and not n.op.inplace
        ]
    )
    assert count_cholesky_not_inplace == 1, count_cholesky_not_inplace
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # A = M.dot(M) will be positive definite for all non-singular M
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def test_local_gpua_advanced_incsubtensor():
    """Regression test for gh-5589: chained ``set_subtensor`` on a
    ``nonzero``-indexed view must compile without error."""
    # test a corner case reported at gh-5589
    target = ftensor4()
    y = target.dimshuffle(1, 0, 2, 3).flatten(ndim=1)
    w = at.ones_like(y)
    w = aesara.tensor.subtensor.set_subtensor(w[eq(y, 1.0).nonzero()], 100)
    w = aesara.tensor.subtensor.set_subtensor(w[eq(y, -1.0).nonzero()], 0)
    f = aesara.function([target], w)
    assert _check_stack_trace(f)
def test_batched_dot_lifter():
    """``batched_dot`` lifting must cope with 2D/3D operands and mixed
    dtypes by inserting the needed dimshuffles and casts."""
    # The CPU Op accepts 2D and 3D inputs, as well as mixed dtypes.
    # Make sure the lifter adds the appropriate dimshuffles and casts
    rng = np.random.default_rng(utt.fetch_seed())

    def randX(*args):
        return rng.random(args).astype(config.floatX)

    cases = [
        (randX(3, 5, 7), randX(3, 7)),
        (randX(3, 5), randX(3, 5, 7)),
        (randX(3, 5), randX(3, 5)),
        (rng.random((3, 5, 7)).astype("float32"), randX(3, 7, 9)),
        (rng.random((3, 5, 7)).astype("float64"), randX(3, 7, 9)),
    ]
    for x_val, y_val in cases:
        x = TensorType(
            broadcastable=[s == 1 for s in x_val.shape], dtype=x_val.dtype
        )("x")
        y = TensorType(
            broadcastable=[s == 1 for s in y_val.shape], dtype=y_val.dtype
        )("y")
        z = batched_dot(x, y)
        f = aesara.function([x, y], z, mode=mode_with_gpu)
        f(x_val, y_val)
        assert check_stack_trace(f, ops_to_check="all")
def test_crossentropycategorical1hot_lifter():
    """``crossentropy_categorical_1hot`` and its gradient op must both be
    replaced by GPU equivalents, and the function must run."""
    rng = np.random.default_rng(utt.fetch_seed())
    x = matrix()
    y = lvector()
    z = aesara.tensor.nnet.crossentropy_categorical_1hot(x, y)
    gx = aesara.grad(z.mean(), x)
    f = aesara.function([x, y], [z, gx], mode=mode_with_gpu)
    assert not any(
        isinstance(
            n.op,
            (
                aesara.tensor.nnet.CrossentropyCategorical1Hot,
                aesara.tensor.nnet.CrossentropyCategorical1HotGrad,
            ),
        )
        for n in f.maker.fgraph.apply_nodes
    )
    f(
        rng.uniform(0.1, 0.9, (13, 5)).astype(config.floatX),
        rng.integers(5, size=(13,)),
    )
class
TestConv_opt
:
    def optimizer_2d(
        self,
        input_shapes,
        direction,
        include_tags,
        exclude_tags,
        op,
        border_mode="valid",
        subsample=(1, 1),
        filter_dilation=(1, 1),
        num_groups=1,
        unshared=False,
        optimiser=None,
    ):
        """Build a 2d abstract convolution (forward, gradWeights, or
        gradInputs per ``direction`` = 0/1/2), run the conv meta-optimizer
        with the given include/exclude tags, and assert that the graph
        contains ``op`` and matches the reference result.

        When ``op`` is None, assert instead that ``optimiser.transform``
        declines (returns None) for this configuration.
        """
        inp1 = aesara.shared(
            np.random.random(input_shapes[0]).astype(config.floatX)
        )
        inp2 = aesara.shared(
            np.random.random(input_shapes[1]).astype(config.floatX)
        )
        if op is None:
            # Feed GPU variables directly so the optimizer is exercised
            # without any lifting step.
            inp1 = basic_ops.as_gpuarray_variable(inp1, test_ctx_name)
            inp2 = basic_ops.as_gpuarray_variable(inp2, test_ctx_name)
        if direction == 0:
            conv_op = abstract_conv.AbstractConv2d(
                input_shapes[0],
                input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
                unshared=unshared,
            )(inp1, inp2)
        if direction == 1:
            conv_op = abstract_conv.AbstractConv2d_gradWeights(
                imshp=input_shapes[0],
                kshp=input_shapes[2],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
                unshared=unshared,
            )(inp1, inp2, input_shapes[2][-2:])
        if direction == 2:
            conv_op = abstract_conv.AbstractConv2d_gradInputs(
                imshp=input_shapes[2],
                kshp=input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
                unshared=unshared,
            )(inp2, inp1, input_shapes[2][-2:])
        with config.change_flags(
            metaopt__optimizer_including=include_tags,
            metaopt__optimizer_excluding=exclude_tags,
        ):
            mode = (
                mode_with_gpu.including("conv_meta")
                .excluding("conv_dnn")
                .excluding("conv_gemm")
            )
            # All meta optimizer compile a new function. This need to know
            # the current linker, but this information is not available,
            # so it use the default mode.
            if op is None:
                # No convolutions optimization takes place
                assert optimiser.transform(None, conv_op.owner) is None
            else:
                ref_func = aesara.function([], conv_op, mode=mode_with_gpu)
                with config.change_flags(mode=mode):
                    conv_func = aesara.function([], conv_op, mode=mode)
                assert any(
                    [
                        isinstance(node.op, op)
                        for node in conv_func.maker.fgraph.toposort()
                    ]
                )
                utt.assert_allclose(conv_func(), ref_func())
    def optimizer_3d(
        self,
        input_shapes,
        direction,
        include_tags,
        exclude_tags,
        op,
        border_mode="valid",
        subsample=(1, 1, 1),
        filter_dilation=(1, 1, 1),
        num_groups=1,
        optimiser=None,
    ):
        """3d counterpart of :meth:`optimizer_2d`; additionally supports
        ``op == "conv3d2d"``, whose resulting graph cannot be matched by
        op class and is only checked numerically."""
        inp1 = aesara.shared(
            np.random.random(input_shapes[0]).astype(config.floatX)
        )
        inp2 = aesara.shared(
            np.random.random(input_shapes[1]).astype(config.floatX)
        )
        if op is None:
            inp1 = basic_ops.as_gpuarray_variable(inp1, None)
            inp2 = basic_ops.as_gpuarray_variable(inp2, None)
        if direction == 0:
            conv_op = abstract_conv.AbstractConv3d(
                input_shapes[0],
                input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
            )(inp1, inp2)
        if direction == 1:
            conv_op = abstract_conv.AbstractConv3d_gradWeights(
                input_shapes[0],
                input_shapes[2],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
            )(inp1, inp2, input_shapes[2][-3:])
        if direction == 2:
            conv_op = abstract_conv.AbstractConv3d_gradInputs(
                input_shapes[2],
                input_shapes[1],
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
                num_groups=num_groups,
            )(inp2, inp1, input_shapes[2][-3:])
        with config.change_flags(
            metaopt__optimizer_including=include_tags,
            metaopt__optimizer_excluding=exclude_tags,
        ):
            mode = (
                mode_with_gpu.including("conv_meta")
                .excluding("conv_dnn")
                .excluding("conv_gemm")
            )
            # All meta optimizer compile a new function. This need to know
            # the current linker, but this information is not available,
            # so it use the default mode.
            if op is None:
                # No convolutions optimization takes place
                assert optimiser.transform(None, conv_op.owner) is None
                return
            elif op != "conv3d2d":
                with config.change_flags(mode=mode):
                    conv_func = aesara.function([], conv_op, mode=mode)
                assert any(
                    [
                        isinstance(node.op, op)
                        for node in conv_func.maker.fgraph.toposort()
                    ]
                )
            else:
                with config.change_flags(mode=mode):
                    conv_func = aesara.function(
                        [], conv_op, mode=mode_with_gpu.including("conv_meta")
                    )
            ref_func = aesara.function([], conv_op, mode=mode_with_gpu)
            utt.assert_allclose(conv_func(), ref_func())
    @pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
    def test_optimizers_2d(self):
        """Exercise the 2d conv meta-optimizer for all three directions
        against both the GEMM and cuDNN implementations, with default and
        "alternative" algorithm selections."""
        imshp2d = [(2, 3, 5, 5), (2, 2, 5, 7), (2, 1, 3, 3)]
        kshp2d = [(4, 3, 3, 3), (3, 2, 3, 5), (4, 1, 1, 1)]
        tshp2d = [(2, 4, 3, 3), (2, 3, 3, 3), (2, 4, 3, 3)]
        for imshp, kshp, tshp in zip(imshp2d, kshp2d, tshp2d):
            # forward passes
            self.optimizer_2d(
                [imshp, kshp, tshp], 0, "", "conv_dnn:alternative", blas.GpuCorrMM
            )
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM_gradWeights,
            )
            self.optimizer_2d(
                [imshp, kshp, tshp], 0, "", "conv_gemm:alternative", dnn.GpuDnnConv
            )
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConvGradW,
            )
            # backwards wrt weights
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradWeights,
            )
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM,
            )
            self.optimizer_2d(
                [imshp, tshp, kshp], 1, "", "conv_gemm:alternative", dnn.GpuDnnConvGradW
            )
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
            # backwards wrt to inputs
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradInputs,
            )
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM,
            )
            self.optimizer_2d(
                [tshp, kshp, imshp], 2, "", "conv_gemm:alternative", dnn.GpuDnnConvGradI
            )
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
    @pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
    def test_optimizers_3d(self):
        """Exercise the 3d conv meta-optimizer for all three directions,
        including the conv3d2d fallback for the forward pass."""
        imshp3d = [(2, 3, 5, 5, 5), (2, 2, 5, 7, 5), (2, 1, 3, 3, 3)]
        kshp3d = [(4, 3, 3, 3, 3), (3, 2, 3, 5, 3), (4, 1, 1, 1, 1)]
        tshp3d = [(2, 4, 3, 3, 3), (2, 3, 3, 3, 3), (2, 4, 3, 3, 3)]
        for imshp, kshp, tshp in zip(imshp3d, kshp3d, tshp3d):
            # forwards passes
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM,
            )
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_dnn:default:conv3d2d",
                blas.GpuCorr3dMM_gradWeights,
            )
            self.optimizer_3d([imshp, kshp, tshp], 0, "conv3d2d", "default", "conv3d2d")
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "alternative",
                "conv_gemm:default:conv3d2d",
                dnn.GpuDnnConvGradW,
            )
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConv,
            )
            # backward pass wrt weight
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative",
                blas.GpuCorr3dMM_gradWeights,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorr3dMM,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp], 1, "", "conv_gemm:alternative", dnn.GpuDnnConvGradW
            )
            # backward pass wrt inputs
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative",
                blas.GpuCorr3dMM_gradInputs,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorr3dMM,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConv,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp], 2, "", "conv_gemm:alternative", dnn.GpuDnnConvGradI
            )
    @pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
    def test_optimizers_non_default(self):
        """Exercise the conv meta-optimizer with non-default parameters:
        full border mode, filter dilation, grouped convolutions, and
        unshared (locally-connected) 2d convolutions."""
        # conv2d forward pass with Non-default border_mode and filter_dilation
        imshp2d = [(2, 3, 5, 5), (4, 2, 5, 5)]
        kshp2d = [(4, 3, 3, 3), (3, 2, 3, 3)]
        filter_dilation = [(1, 1), (2, 2)]
        for imshp, kshp, fdil in zip(imshp2d, kshp2d, filter_dilation):
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_dnn:default",
                blas.GpuCorrMM_gradInputs,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConv,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_2d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_gemm:default",
                dnn.GpuDnnConvGradI,
                border_mode="full",
                filter_dilation=fdil,
            )
        # conv3d forward pass with Non-default border_mode and filter_dilation
        imshp3d = [(2, 3, 5, 5, 5), (4, 2, 5, 5, 5)]
        kshp3d = [(4, 3, 3, 3, 3), (3, 2, 3, 3, 3)]
        filter_dilation = [(1, 1, 1), (2, 2, 2)]
        for imshp, kshp, fdil in zip(imshp3d, kshp3d, filter_dilation):
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_dnn:default:conv3d2d",
                blas.GpuCorr3dMM_gradInputs,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConv,
                border_mode="full",
                filter_dilation=fdil,
            )
            self.optimizer_3d(
                [imshp, kshp],
                0,
                "alternative",
                "conv_gemm:default:conv3d2d",
                dnn.GpuDnnConvGradI,
                border_mode="full",
                filter_dilation=fdil,
            )
        # test non default num_groups for default optimizers
        imshp2d = [(2, 6, 5, 5), (2, 4, 5, 5)]
        kshp2d = [(3, 2, 3, 3), (2, 2, 3, 3)]
        tshp2d = [(2, 3, 3, 3), (2, 2, 3, 3)]
        num_groups = [3, 2]
        for imshp, kshp, tshp, groups in zip(imshp2d, kshp2d, tshp2d, num_groups):
            # forward pass
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM,
                num_groups=groups,
            )
            self.optimizer_2d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConv,
                num_groups=groups,
            )
            # grad with respect to weights
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradWeights,
                num_groups=groups,
            )
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConvGradW,
                num_groups=groups,
            )
            # grad with respect to inputs
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative",
                blas.GpuCorrMM_gradInputs,
                num_groups=groups,
            )
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_gemm:alternative",
                dnn.GpuDnnConvGradI,
                num_groups=groups,
            )
        # test unshared for default optimizers
        imshp2d = [(2, 2, 4, 4), (3, 2, 5, 3)]
        kshp2d = [(2, 2, 2, 2, 3, 3), (2, 3, 1, 2, 3, 3)]
        tshp2d = [(2, 2, 2, 2), (3, 2, 3, 1)]
        # NOTE(review): ``groups`` is unused in this loop — zipping with
        # num_groups here only limits the iteration count; confirm intent.
        for imshp, kshp, tshp, groups in zip(imshp2d, kshp2d, tshp2d, num_groups):
            # forward pass
            self.optimizer_2d(
                [imshp, kshp, tshp], 0, "", "alternative", blas.GpuCorrMM, unshared=True
            )
            # grad with respect to weights
            self.optimizer_2d(
                [imshp, tshp, kshp],
                1,
                "",
                "alternative",
                blas.GpuCorrMM_gradWeights,
                unshared=True,
            )
            # grad with respect to inputs
            self.optimizer_2d(
                [tshp, kshp, imshp],
                2,
                "",
                "alternative",
                blas.GpuCorrMM_gradInputs,
                unshared=True,
            )
        imshp3d = [(2, 6, 5, 5, 5), (2, 4, 5, 5, 5)]
        kshp3d = [(3, 2, 3, 3, 3), (2, 2, 3, 3, 3)]
        tshp3d = [(2, 3, 3, 3, 3), (2, 2, 3, 3, 3)]
        num_groups = [3, 2]
        for imshp, kshp, tshp, groups in zip(imshp3d, kshp3d, tshp3d, num_groups):
            # forward pass
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM,
                num_groups=groups,
            )
            self.optimizer_3d(
                [imshp, kshp, tshp],
                0,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConv,
                num_groups=groups,
            )
            # grad with respect to weights
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM_gradWeights,
                num_groups=groups,
            )
            self.optimizer_3d(
                [imshp, tshp, kshp],
                1,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConvGradW,
                num_groups=groups,
            )
            # grad with respect to inputs
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_dnn:alternative:conv3d2d",
                blas.GpuCorr3dMM_gradInputs,
                num_groups=groups,
            )
            self.optimizer_3d(
                [tshp, kshp, imshp],
                2,
                "",
                "conv_gemm:alternative:conv3d2d",
                dnn.GpuDnnConvGradI,
                num_groups=groups,
            )
@pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
def test_returns_none_2d(self):
    """Check that the 2d "alternative" conv rewrites decline (return None)
    whenever a non-default option makes them inapplicable.

    Each case calls self.optimizer_2d with expected op None, meaning the
    local optimizer under test must NOT fire for that configuration.
    """
    # values given don't matter since it returns None
    imshp = (2, 3, 5, 5)
    kshp = (4, 3, 3, 3)
    tshp = (2, 4, 3, 3)
    # direction 0 = forward pass, 1 = grad wrt weights, 2 = grad wrt inputs
    conv_direction = [0, 1, 2]
    # One [gemm_alt, cudnn_alt] optimizer pair per convolution direction.
    optimisers = [
        [opt.local_abstractconv_gemm_alt, opt.local_abstractconv_cudnn_alt],
        [
            opt.local_abstractconv_gemm_gradweights_alt,
            opt.local_abstractconv_cudnn_alt,
        ],
        [
            opt.local_abstractconv_gradinputs_gemm_alt,
            opt.local_abstractconv_cudnn_alt,
        ],
    ]
    # test that non default subsample returns None
    for opt_direction, direction in zip(optimisers, conv_direction):
        for optimiser in opt_direction:
            self.optimizer_2d(
                [imshp, kshp, tshp],
                direction,
                "",
                "",
                None,
                subsample=(2, 2),
                optimiser=optimiser,
            )
    # test that non default num_groups returns None
    for opt_direction, direction in zip(optimisers, conv_direction):
        for optimiser in opt_direction:
            self.optimizer_2d(
                [imshp, kshp, tshp],
                direction,
                "",
                "",
                None,
                num_groups=3,
                optimiser=optimiser,
            )
    # test that border_mode=half returns None
    for opt_direction, direction in zip(optimisers, conv_direction):
        for optimiser in opt_direction:
            self.optimizer_2d(
                [imshp, kshp, tshp],
                direction,
                "",
                "",
                None,
                border_mode="half",
                optimiser=optimiser,
            )
    # test that Non-default filter dilation return None for
    # direction 1
    for optimiser in optimisers[1]:
        self.optimizer_2d(
            [imshp, kshp, tshp],
            1,
            "",
            "",
            None,
            filter_dilation=(2, 2),
            optimiser=optimiser,
        )
    # Unshared convolution: kernel shape gains two extra leading dims,
    # and the image/kernel/output shapes are permuted per direction.
    imshp = (2, 2, 4, 4)
    kshp = (2, 2, 2, 2, 3, 3)
    tshp = (2, 2, 2, 2)
    shape_perms = [[imshp, kshp, tshp], [imshp, tshp, kshp], [tshp, kshp, imshp]]
    # test unshared convolution returns None
    for opt_direction, direction, perms in zip(
        optimisers, conv_direction, shape_perms
    ):
        for optimiser in opt_direction:
            self.optimizer_2d(
                perms, direction, "", "", None, unshared=True, optimiser=optimiser
            )
@pytest.mark.skipif(config.cxx == "", reason="Need a c compiler.")
def test_returns_none_3d(self):
    """3d counterpart of test_returns_none_2d: the 3d "alternative"
    conv rewrites must decline (return None) for non-default subsample,
    num_groups, border_mode, or (direction 1) filter_dilation.
    """
    imshp = (2, 3, 5, 5, 5)
    kshp = (4, 3, 3, 3, 3)
    tshp = (2, 4, 3, 3, 3)
    # direction 0 = forward pass, 1 = grad wrt weights, 2 = grad wrt inputs
    conv_direction = [0, 1, 2]
    # One [gemm_alt, cudnn_alt] optimizer pair per convolution direction.
    optimisers = [
        [opt.local_abstractconv3d_alt, opt.local_abstractconv3d_cudnn_alt],
        [
            opt.local_abstractconv3d_gemm_gradweights_alt,
            opt.local_abstractconv3d_cudnn_alt,
        ],
        [
            opt.local_abstractconv3d_gradinputs_gemm_alt,
            opt.local_abstractconv3d_cudnn_alt,
        ],
    ]
    # test that non default subsample returns None
    for opt_direction, direction in zip(optimisers, conv_direction):
        for optimiser in opt_direction:
            self.optimizer_3d(
                [imshp, kshp, tshp],
                direction,
                "",
                "",
                None,
                subsample=(2, 2, 2),
                optimiser=optimiser,
            )
    # test that non default num_groups returns None
    for opt_direction, direction in zip(optimisers, conv_direction):
        for optimiser in opt_direction:
            self.optimizer_3d(
                [imshp, kshp, tshp],
                direction,
                "",
                "",
                None,
                num_groups=3,
                optimiser=optimiser,
            )
    # test that border_mode=half returns None
    for opt_direction, direction in zip(optimisers, conv_direction):
        for optimiser in opt_direction:
            self.optimizer_3d(
                [imshp, kshp, tshp],
                direction,
                "",
                "",
                None,
                border_mode="half",
                optimiser=optimiser,
            )
    # test that Non-default filter dilation return None for
    # direction 1
    for optimiser in optimisers[1]:
        self.optimizer_3d(
            [imshp, kshp, tshp],
            1,
            "",
            "",
            None,
            filter_dilation=(2, 2, 2),
            optimiser=optimiser,
        )
tests/gpuarray/test_others.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
pytest
pygpu
=
pytest
.
importorskip
(
"pygpu"
)
from
aesara.gpuarray.basic_ops
import
GpuFromHost
,
HostFromGpu
from
aesara.gpuarray.type
import
(
GpuArraySharedVariable
,
GpuArrayType
,
get_context
,
gpuarray_shared_constructor
,
)
from
aesara.misc.pkl_utils
import
dump
,
load
from
tests.gpuarray.config
import
mode_with_gpu
,
test_ctx_name
from
tests.misc.test_may_share_memory
import
may_share_memory_core
from
tests.tensor
import
test_opt
class TestFusion(test_opt.TestFusion):
    """Re-run the CPU elemwise-fusion test suite on the GPU backend.

    Inherits every test from tests.tensor.test_opt.TestFusion and only
    swaps the compilation mode, shared constructor, and the ops that the
    toposort checks should ignore.
    """

    # Exclude local_dnn_reduction so fusion itself is what gets exercised.
    mode = mode_with_gpu.excluding("local_dnn_reduction")
    # Place shared variables on the GPU.
    _shared = staticmethod(gpuarray_shared_constructor)
    # Host<->GPU transfer ops are expected in the graph; skip them when
    # counting fused nodes.
    topo_exclude = (GpuFromHost, HostFromGpu)
def test_may_share_memory():
    """Run the shared may_share_memory core checks on two GPU buffers."""
    context = get_context(test_ctx_name)
    first_buffer = pygpu.empty((5, 4), context=context)
    second_buffer = pygpu.empty((5, 4), context=context)
    may_share_memory_core(first_buffer, second_buffer)
def test_dump_load():
    """Round-trip a GpuArraySharedVariable through dump/load.

    Checks that the variable's name and value survive serialization.

    Fix: the original wrote a pickle file literally named "test" into the
    current working directory and never removed it; use a temporary
    directory so the file is always cleaned up, even on assertion failure.
    """
    import os
    import tempfile

    x = GpuArraySharedVariable(
        "x",
        GpuArrayType("float32", (1, 1), name="x", context_name=test_ctx_name),
        [[1]],
        False,
    )
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "test")
        with open(path, "wb") as f:
            dump(x, f)
        with open(path, "rb") as f:
            x = load(f)
    assert x.name == "x"
    np.testing.assert_allclose(x.get_value(), [[1]])
tests/gpuarray/test_pickle.py
deleted
100644 → 0
浏览文件 @
c803c67e
"""
Some pickle test when pygpu isn't there. The test when pygpu is
available are in test_type.py.
This is needed as we skip all the test file when pygpu isn't there in
regular test file.
"""
import
os
import
sys
from
pickle
import
Unpickler
import
numpy
as
np
import
pytest
from
aesara.configdefaults
import
config
from
aesara.gpuarray.type
import
ContextNotDefined
# Feature flag: record whether pygpu is importable so the tests below can
# be skipped/selected accordingly (this module specifically tests the
# pygpu-absent unpickling path).
try:
    import pygpu  # noqa: F401

    have_pygpu = True
except ImportError:
    have_pygpu = False
@pytest.mark.skip(reason="These tests relied on saved/versioned pickled files.")
@pytest.mark.skipif(have_pygpu, reason="pygpu active")
def test_unpickle_gpuarray_as_numpy_ndarray_flag1():
    """With unpickle_gpu_on_cpu disabled and pygpu absent, loading a
    pickled GpuArray must raise (ImportError or ContextNotDefined)."""
    # Save and restore the config flag so other tests are unaffected.
    oldflag = config.experimental__unpickle_gpu_on_cpu
    config.experimental__unpickle_gpu_on_cpu = False
    try:
        testfile_dir = os.path.dirname(os.path.realpath(__file__))
        fname = "GpuArray.pkl"
        with open(os.path.join(testfile_dir, fname), "rb") as fp:
            # latin1 encoding: the fixture was pickled under Python 2.
            u = Unpickler(fp, encoding="latin1")
            with pytest.raises((ImportError, ContextNotDefined)):
                u.load()
    finally:
        config.experimental__unpickle_gpu_on_cpu = oldflag
@pytest.mark.skip(reason="These tests relied on saved/versioned pickled files.")
def test_unpickle_gpuarray_as_numpy_ndarray_flag2():
    """With unpickle_gpu_on_cpu enabled, a pickled GpuArray must load as a
    plain numpy.ndarray with the expected contents."""
    # Save and restore the config flag so other tests are unaffected.
    oldflag = config.experimental__unpickle_gpu_on_cpu
    config.experimental__unpickle_gpu_on_cpu = True
    try:
        testfile_dir = os.path.dirname(os.path.realpath(__file__))
        fname = "GpuArray.pkl"
        with open(os.path.join(testfile_dir, fname), "rb") as fp:
            # latin1 encoding: the fixture was pickled under Python 2.
            u = Unpickler(fp, encoding="latin1")
            try:
                mat = u.load()
            except ImportError:
                # Windows sometimes fail with nonsensical errors like:
                #   ImportError: No module named type
                #   ImportError: No module named copy_reg
                # when "type" and "copy_reg" are builtin modules.
                if sys.platform == "win32":
                    exc_type, exc_value, exc_trace = sys.exc_info()
                    raise
                raise
        assert isinstance(mat, np.ndarray)
        assert mat[0] == -42.0
    finally:
        config.experimental__unpickle_gpu_on_cpu = oldflag
tests/gpuarray/test_pool.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
copy
import
itertools
import
numpy
as
np
import
pytest
import
aesara
from
aesara
import
tensor
as
at
from
aesara.gpuarray.pool
import
(
GpuAveragePoolGrad
,
GpuDownsampleFactorMaxGradGrad
,
GpuMaxPoolGrad
,
GpuPool
,
)
from
aesara.gradient
import
Lop
,
Rop
,
grad
from
aesara.tensor.signal.pool
import
(
AveragePoolGrad
,
DownsampleFactorMaxGradGrad
,
MaxPoolGrad
,
Pool
,
)
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
mode_without_gpu
from
tests.gpuarray.test_basic_ops
import
random
class TestPool:
    """Interface-level checks for GpuPool argument validation."""

    def test_pool_py_interface(self):
        """Invalid ws/pad combinations must raise in the Python interface."""
        shp = (2, 2, 2, 2)
        inp = aesara.shared(random(*shp), "a")
        inp = at.as_tensor_variable(inp)
        with pytest.raises(ValueError):
            # test when pad >= ws
            ds_op = GpuPool(ignore_border=True, ndim=2)
            ds_op(inp, [2, 2], pad=[3, 3])
        with pytest.raises(ValueError):
            # test when ignore_border and pad >= 0
            ds_op = GpuPool(ignore_border=False, ndim=2)
            ds_op(inp, [2, 2], pad=[1, 1])

    def test_pool_c_interface(self):
        """Same validation must happen in the C implementation (pad given
        symbolically so the check is deferred to execution)."""
        gpu_mode = mode_with_gpu.excluding("cudnn")
        gpu_mode.check_py_code = False
        shp = (2, 2, 2, 2)
        inp = aesara.shared(random(*shp), "a")
        inp = at.as_tensor_variable(inp)
        with pytest.raises(ValueError):
            # test when ignore_border and pad >= 0
            ds_op = GpuPool(ignore_border=False, ndim=2)
            pad = at.as_tensor_variable([1, 1])
            f = aesara.function([], ds_op(inp, [2, 2], pad=pad), mode=gpu_mode)
            f()

    def test_pool_big_ws(self):
        """A pooling window larger than the input must still execute."""
        gpu_mode = mode_with_gpu.excluding("cudnn")
        gpu_mode.check_py_code = False
        shp = (2, 2, 2, 2)
        inp = aesara.shared(random(*shp), "a")
        inp = at.as_tensor_variable(inp)
        ds_op = GpuPool(ignore_border=False, mode="average_exc_pad", ndim=2)
        pad = at.as_tensor_variable([0, 0])
        f = aesara.function(
            [], ds_op(inp, [5, 5], stride=[1, 1], pad=pad), mode=gpu_mode
        )
        f()
def test_pool2d():
    """Compare GPU vs CPU 2d pooling (forward, grad, Rop, grad-grad)
    across many shapes, window sizes, strides, modes and paddings.

    For every configuration the GPU graph must contain the GPU op, the
    reference graph the CPU op, and their numeric outputs must match.
    """
    shps = [
        (1, 12),
        (1, 1, 12),
        (1, 1, 1, 12),
        (1, 1, 2, 2),
        (1, 1, 1, 1),
        (1, 1, 4, 4),
        (1, 1, 10, 11),
        (1, 2, 2, 2),
        (3, 5, 4, 4),
        (25, 1, 7, 7),
        (1, 1, 12, 12),
        (1, 1, 2, 14),
        (1, 1, 12, 14),
        (1, 1, 14, 14),
        (1, 1, 16, 16),
        (1, 1, 18, 18),
        (1, 1, 24, 24),
        (1, 6, 24, 24),
        (10, 1, 24, 24),
        (10, 6, 24, 24),
        (30, 6, 12, 12),
        (30, 2, 24, 24),
        (30, 6, 24, 24),
        (10, 10, 10, 11),
        (1, 1, 10, 1025),
        (1, 1, 10, 1023),
        (1, 1, 1025, 10),
        (1, 1, 1023, 10),
        (3, 2, 16, 16, 16),
        (3, 2, 6, 6, 6, 5),
        (3, 2, 6, 6, 6, 5, 7),
    ]
    # Shuffle so long-running configurations are spread across runs.
    np.random.default_rng(utt.fetch_seed()).shuffle(shps)
    test_ws = (2, 2), (3, 2), (1, 1)
    test_st = (2, 2), (3, 2), (1, 1)
    test_mode = ["max", "sum", "average_inc_pad", "average_exc_pad"]
    ref_mode = copy.copy(mode_without_gpu)
    ref_mode.check_py_code = False
    gpu_mode = mode_with_gpu.excluding("cudnn")
    gpu_mode.check_py_code = False
    for shp in shps:
        for mode, ws, st in itertools.product(test_mode, test_ws, test_st):
            # Skip windows larger than the two pooled (trailing) dims.
            if ws[0] > shp[-2] or ws[1] > shp[-1]:
                continue
            for ignore_border, pad in zip((True, False), [(1, 1), (0, 0)]):
                if pad[0] >= ws[0] or pad[1] >= ws[1]:
                    continue
                if mode == "average_exc_pad" and (pad[0] > 0 or pad[1] > 0):
                    continue
                # print('test_pool2d', shp, ws, st, pad, mode, ignore_border)
                ds_op = Pool(ndim=len(ws), mode=mode, ignore_border=ignore_border)
                a = aesara.shared(random(*shp), "a")
                a_pooled = ds_op(at.as_tensor_variable(a), ws, st, pad)
                # Forward pass: GPU vs reference.
                f = aesara.function([], a_pooled, mode=gpu_mode)
                f2 = aesara.function([], a_pooled, mode=ref_mode)
                assert any(
                    [isinstance(node.op, GpuPool) for node in f.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, Pool) for node in f2.maker.fgraph.toposort()]
                )
                assert np.allclose(f(), f2()), (shp, ws, st, pad, mode, ignore_border)
                # First-order gradient: GPU vs reference.
                a_pooled_grad = grad(a_pooled.sum(), a)
                g = aesara.function([], a_pooled_grad, mode=gpu_mode)
                g2 = aesara.function([], a_pooled_grad, mode=ref_mode)
                if mode == "max":
                    gop = GpuMaxPoolGrad
                    gop2 = MaxPoolGrad
                else:
                    gop = GpuAveragePoolGrad
                    gop2 = AveragePoolGrad
                assert any(
                    [isinstance(node.op, gop) for node in g.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, gop2) for node in g2.maker.fgraph.toposort()]
                )
                assert np.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border)
                # test rop and grad grad for max pooling
                # for average pooling grad grad is just average pooling grad
                if mode != "max":
                    continue
                ea = aesara.shared(random(*shp), "ea")
                gr = aesara.function([], Rop(a_pooled, a, ea), mode=gpu_mode)
                gr2 = aesara.function([], Rop(a_pooled, a, ea), mode=ref_mode)
                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gr.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gr2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gr(), gr2()), (shp, ws, st, pad, mode, ignore_border)
                # Second-order (grad-grad) via Lop.
                ggf = Lop(grad((a_pooled ** 2).sum(), a), a, a)
                gg = aesara.function([], ggf, mode=gpu_mode)
                gg2 = aesara.function([], ggf, mode=ref_mode)
                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gg.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gg2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gg(), gg2()), (shp, ws, st, pad, mode, ignore_border)
def test_pool3d():
    """3d counterpart of test_pool2d: compare GPU vs CPU pooling
    (forward, grad, Rop, grad-grad) over the three trailing dims."""
    shps = [
        (1, 1, 12),
        (1, 1, 1, 1, 1),
        (1, 1, 1, 1, 1025),
        (1, 1, 2, 2, 2),
        (1, 1, 7, 7, 7),
        (1, 1, 9, 10, 11),
        (1, 6, 18, 18, 18),
        (1, 1, 6, 24, 24),
        (1, 10, 1, 24, 24),
        (1, 10, 6, 24, 24),
        (1, 30, 6, 12, 12),
        (1, 30, 2, 24, 24),
        (1, 30, 6, 24, 24),
        (1, 10, 10, 10, 11),
        (1, 1, 10, 10, 1025),
        (1, 1, 10, 10, 1023),
        (1, 1, 10, 1025, 10),
        (1, 1, 10, 1023, 10),
        (3, 2, 6, 6, 6, 5),
        (3, 2, 6, 6, 6, 5, 7),
    ]
    # Shuffle so long-running configurations are spread across runs.
    np.random.default_rng(utt.fetch_seed()).shuffle(shps)
    test_ws = (2, 2, 2), (3, 2, 3), (1, 1, 1)
    test_st = (2, 2, 2), (2, 3, 2), (1, 1, 1)
    test_mode = ["max", "sum", "average_inc_pad", "average_exc_pad"]
    ref_mode = copy.copy(mode_without_gpu)
    ref_mode.check_py_code = False
    gpu_mode = mode_with_gpu.excluding("cudnn")
    gpu_mode.check_py_code = False
    for shp in shps:
        for mode, ws, st in itertools.product(test_mode, test_ws, test_st):
            # Skip windows larger than the three pooled (trailing) dims.
            if ws[0] > shp[-3] or ws[1] > shp[-2] or ws[2] > shp[-1]:
                continue
            for ignore_border, pad in zip((True, False), [(1, 1, 1), (0, 0, 0)]):
                if pad[0] >= ws[0] or pad[1] >= ws[1] or pad[2] >= ws[2]:
                    continue
                if mode == "average_exc_pad" and (
                    pad[0] > 0 or pad[1] > 0 or pad[2] > 0
                ):
                    continue
                # print('test_pool3d', shp, ws, st, pad, mode, ignore_border)
                ds_op = Pool(ndim=len(ws), mode=mode, ignore_border=ignore_border)
                a = aesara.shared(random(*shp), "a")
                a_pooled = ds_op(at.as_tensor_variable(a), ws, st, pad)
                # Forward pass: GPU vs reference.
                f = aesara.function([], a_pooled, mode=gpu_mode)
                f2 = aesara.function([], a_pooled, mode=ref_mode)
                assert any(
                    [isinstance(node.op, GpuPool) for node in f.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, Pool) for node in f2.maker.fgraph.toposort()]
                )
                assert np.allclose(f(), f2()), (shp, ws, st, pad, mode, ignore_border)
                # First-order gradient: GPU vs reference.
                a_pooled_grad = grad(a_pooled.sum(), a)
                g = aesara.function([], a_pooled_grad, mode=gpu_mode)
                g2 = aesara.function([], a_pooled_grad, mode=ref_mode)
                if mode == "max":
                    gop = GpuMaxPoolGrad
                    gop2 = MaxPoolGrad
                else:
                    gop = GpuAveragePoolGrad
                    gop2 = AveragePoolGrad
                assert any(
                    [isinstance(node.op, gop) for node in g.maker.fgraph.toposort()]
                )
                assert any(
                    [isinstance(node.op, gop2) for node in g2.maker.fgraph.toposort()]
                )
                assert np.allclose(g(), g2()), (shp, ws, st, pad, mode, ignore_border)
                # test rop and grad grad for max pooling
                # for average pooling grad grad is just average pooling grad
                if mode != "max":
                    continue
                ea = aesara.shared(random(*shp), "ea")
                gr = aesara.function([], Rop(a_pooled, a, ea), mode=gpu_mode)
                gr2 = aesara.function([], Rop(a_pooled, a, ea), mode=ref_mode)
                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gr.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gr2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gr(), gr2()), (shp, ws, st, pad, mode, ignore_border)
                # Second-order (grad-grad) via Lop.
                ggf = Lop(grad((a_pooled ** 2).sum(), a), a, a)
                gg = aesara.function([], ggf, mode=gpu_mode)
                gg2 = aesara.function([], ggf, mode=ref_mode)
                assert any(
                    [
                        isinstance(node.op, GpuDownsampleFactorMaxGradGrad)
                        for node in gg.maker.fgraph.toposort()
                    ]
                )
                assert any(
                    [
                        isinstance(node.op, DownsampleFactorMaxGradGrad)
                        for node in gg2.maker.fgraph.toposort()
                    ]
                )
                assert np.allclose(gg(), gg2()), (shp, ws, st, pad, mode, ignore_border)
tests/gpuarray/test_reduction.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
math
import
numpy
as
np
import
pytest
import
aesara
import
aesara.tensor
as
at
from
aesara.gpuarray
import
GpuArrayType
from
aesara.gpuarray.dnn
import
GpuDnnReduction
from
aesara.gpuarray.reduction
import
GpuMaxAndArgmax
from
aesara.tensor.math
import
argmax
from
aesara.tensor.math
import
max
as
at_max
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
mode_without_gpu
from
tests.gpuarray.test_basic_ops
import
rand_gpuarray
# Number of values to be used in test tensors (except with 0-D tensors!).
test_size = 10000

# NB: This order of "unsorted axes" is arbitrary and is here
# just to have the same information on profile output
# from one test to another.
unsorted_axes = (2, 4, 0, 3, 1)

# Seed from OS entropy: these tests deliberately use fresh random data.
np.random.seed()
def numpy_random_array(shapes):
    """Return a standard-normal array with the given shape, cast to
    Aesara's configured floatX dtype."""
    n_elements = 1
    for extent in shapes:
        n_elements *= extent
    flat = np.random.normal(size=n_elements).astype(aesara.config.floatX)
    return flat.reshape(shapes)
def numpy_maxandargmax(X, axis=None):
    """Pure-NumPy reference for (max, argmax) reduced over one or more axes.

    Returns a pair (max, argmax) where argmax indexes the flattened
    reduced axes, mirroring Aesara's MaxAndArgmax.perform().
    """
    # Normalize `axis` into a sorted, duplicate-free tuple of ints.
    if axis is None:
        reduce_axes = tuple(range(X.ndim))
    else:
        if not isinstance(axis, (tuple, list)):
            axis = [int(axis)]
        reduce_axes = tuple(sorted(set(axis)))
    ref_max = np.max(X, axis=reduce_axes)
    # NumPy's argmax only takes a single axis: move the kept axes to the
    # front, collapse all reduced axes into one, then argmax over it.
    front_axes = np.array(
        [d for d in range(X.ndim) if d not in reduce_axes], dtype="int64"
    )
    shuffled = np.transpose(X, np.concatenate((front_axes, reduce_axes)))
    kept_shape = shuffled.shape[: len(front_axes)]
    collapsed = shuffled.shape[len(front_axes) :]
    flat_shape = tuple(int(s) for s in kept_shape + (np.prod(collapsed),))
    return (ref_max, np.argmax(shuffled.reshape(flat_shape), axis=-1))
def check_if_gpu_reduce_in_graph(aesara_function):
    """Assert the compiled function's graph has at least one GPU reduction node."""
    gpu_reduce_ops = (GpuMaxAndArgmax, GpuDnnReduction)
    nodes = aesara_function.maker.fgraph.apply_nodes
    assert any(isinstance(node.op, gpu_reduce_ops) for node in nodes)
def check_if_gpu_reduce_not_in_graph(aesara_function):
    """Assert the compiled function's graph has no GPU reduction node."""
    gpu_reduce_ops = (GpuMaxAndArgmax, GpuDnnReduction)
    nodes = aesara_function.maker.fgraph.apply_nodes
    # `all(not isinstance(...))` in the original; equivalent via De Morgan.
    assert not any(isinstance(node.op, gpu_reduce_ops) for node in nodes)
class BaseTest:
    """Shared machinery for max+argmax reduction tests.

    Subclasses set `tensor_size` (ndim, 0..5) and optionally `shape`;
    each test reduces a random tensor on CPU and GPU and compares both
    against the NumPy reference numpy_maxandargmax().
    """

    # This attribute must be set in subclasses.
    tensor_size = None
    # Explicit shape override; computed from tensor_size when None.
    shape = None
    dtype = aesara.config.floatX

    def get_shape(self):
        # Roughly test_size total elements, split evenly across dims.
        if self.tensor_size == 0:
            return []
        return [
            int(math.ceil(math.pow(test_size, 1 / self.tensor_size)))
        ] * self.tensor_size

    def setup_method(self):
        if not isinstance(self.tensor_size, int):
            pytest.skip("No tensor ndim defined.")
        if self.tensor_size < 0 or self.tensor_size > 5:
            pytest.skip(
                "We allow from 0 (included) to 5 (included) dimensons for these tests."
            )
        if self.shape is None:
            self.shape = self.get_shape()

    def get_host_tensor(self):
        # Symbolic CPU tensor of the right ndim/dtype.
        broadcastable = (False,) * self.tensor_size
        return at.tensor(self.dtype, broadcastable)

    def get_gpu_tensor(self):
        # Symbolic GPU tensor of the right ndim/dtype.
        broadcastable = (False,) * self.tensor_size
        return GpuArrayType(self.dtype, broadcastable)()

    def get_host_value(self):
        return numpy_random_array(self.shape)

    def get_gpu_value(self):
        return rand_gpuarray(*self.shape)

    # NB: In compute_host() and compute_gpu(),
    # the first call of the aesara function should be ignored in profiling,
    # with Aesara config flag profiling__ignore_first_call=True.
    def compute_host(self, test_tensor, axis):
        """Run max/argmax on CPU and compare against the NumPy reference."""
        M = self.get_host_tensor()
        f = aesara.function(
            [M],
            [at_max(M, axis=axis), argmax(M, axis=axis)],
            name="shape:" + str(test_tensor.shape) + "/axis:" + str(axis) + "/HOST",
            mode=mode_without_gpu,
        )
        check_if_gpu_reduce_not_in_graph(f)
        f(test_tensor)
        aesara_max, aesara_argmax = f(test_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis)
        utt.assert_allclose(ref_max, aesara_max)
        utt.assert_allclose(ref_argmax, aesara_argmax)

    def compute_gpu(self, test_gpu_tensor, test_host_tensor, axis):
        """Run max/argmax on GPU and compare against the NumPy reference."""
        M = self.get_gpu_tensor()
        f = aesara.function(
            [M],
            [at_max(M, axis=axis), argmax(M, axis=axis)],
            name="shape:" + str(test_gpu_tensor.shape) + "/axis:" + str(axis) + "/GPU",
            mode=mode_with_gpu,
        )
        check_if_gpu_reduce_in_graph(f)
        f(test_gpu_tensor)
        aesara_max, aesara_argmax = f(test_gpu_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis)
        utt.assert_allclose(ref_max, aesara_max)
        utt.assert_allclose(ref_argmax, aesara_argmax)

    def compute(self, axis=None):
        # We want to run CPU op and GPU op on the same tensor randomly generated.
        test_gpu_tensor = self.get_gpu_value()
        test_host_tensor = np.asarray(test_gpu_tensor)
        self.compute_host(test_host_tensor, axis)
        self.compute_gpu(test_gpu_tensor, test_host_tensor, axis)

    def compute_axis(self, pos):
        # Single-axis reduction; skipped for vectors (same as axis=None).
        if self.tensor_size != 1 and 0 <= pos < self.tensor_size:
            self.compute(pos)

    def compute_some_axes(self, count):
        # Multi-axis reduction over `count` axes taken from unsorted_axes.
        if 0 <= count < self.tensor_size:
            self.compute([i for i in unsorted_axes if i < self.tensor_size][:count])

    # Equivalent to test reduction on all axes.
    def test_none(self):
        self.compute(None)

    def test_axis_1(self):
        self.compute_axis(0)

    def test_axis_2(self):
        self.compute_axis(1)

    def test_axis_3(self):
        self.compute_axis(2)

    def test_axis_4(self):
        self.compute_axis(3)

    def test_axis_5(self):
        self.compute_axis(4)

    # For the tests below, we expect CPU op to run with Python implementation.
    def test_2_axes(self):
        self.compute_some_axes(2)

    def test_3_axes(self):
        self.compute_some_axes(3)

    def test_4_axes(self):
        self.compute_some_axes(4)
class TestScalar(BaseTest):
    # 0-d tensor: reductions of a scalar.
    tensor_size = 0
class TestVector(BaseTest):
    # 1-d tensor.
    tensor_size = 1
# Special case
class TestRow(BaseTest):
    # 2-d tensor with a broadcast-like first dim of size 1.
    tensor_size = 2
    shape = [1, test_size]
# Special case
class TestColumn(BaseTest):
    # 2-d tensor with a broadcast-like second dim of size 1.
    tensor_size = 2
    shape = [test_size, 1]
class TestMatrix(BaseTest):
    # 2-d tensor.
    tensor_size = 2
class TestTensor5(BaseTest):
    # 5-d tensor (the maximum ndim these tests support).
    tensor_size = 5
tests/gpuarray/test_rng_mrg.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
functools
import
numpy
as
np
import
aesara
from
aesara
import
tensor
as
at
from
aesara.configdefaults
import
config
from
aesara.gpuarray.rng_mrg
import
GPUA_mrg_uniform
from
aesara.gpuarray.type
import
gpuarray_shared_constructor
from
aesara.sandbox
import
rng_mrg
from
aesara.sandbox.rng_mrg
import
MRG_RandomStream
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
as
mode
from
tests.sandbox.test_rng_mrg
import
java_samples
,
rng_mrg_overflow
from
tests.sandbox.test_rng_mrg
import
test_f16_nonzero
as
cpu_f16_nonzero
def test_consistency_GPUA_serial():
    # Verify that the random numbers generated by GPUA_mrg_uniform, serially,
    # are the same as the reference (Java) implementation by L'Ecuyer et al.
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7
    samples = []
    curr_rstate = np.array([seed] * 6, dtype="int32")
    for i in range(n_streams):
        stream_rstate = curr_rstate.copy()
        for j in range(n_substreams):
            # One MRG state row per substream; drawn one sample at a time.
            substream_rstate = np.array([stream_rstate.copy()], dtype="int32")
            # Transfer to device
            rstate = gpuarray_shared_constructor(substream_rstate)
            new_rstate, sample = GPUA_mrg_uniform.new(
                rstate, ndim=None, dtype="float32", size=(1,)
            )
            rstate.default_update = new_rstate
            # Not really necessary, just mimicking
            # rng_mrg.MRG_RandomStream' behavior
            sample.rstate = rstate
            sample.update = (rstate, new_rstate)
            # We need the sample back in the main memory
            cpu_sample = at.as_tensor_variable(sample)
            f = aesara.function([], cpu_sample, mode=mode)
            for k in range(n_samples):
                s = f()
                samples.append(s)
            # next substream
            stream_rstate = rng_mrg.ff_2p72(stream_rstate)
        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)
    samples = np.array(samples).flatten()
    assert np.allclose(samples, java_samples)
def test_consistency_GPUA_parallel():
    # Verify that the random numbers generated by GPUA_mrg_uniform, in
    # parallel, are the same as the reference (Java) implementation by
    # L'Ecuyer et al.
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel
    samples = []
    curr_rstate = np.array([seed] * 6, dtype="int32")
    for i in range(n_streams):
        stream_samples = []
        # Build one state row per substream by repeated 2^72 fast-forward.
        rstate = [curr_rstate.copy()]
        for j in range(1, n_substreams):
            rstate.append(rng_mrg.ff_2p72(rstate[-1]))
        rstate = np.asarray(rstate)
        rstate = gpuarray_shared_constructor(rstate)
        new_rstate, sample = GPUA_mrg_uniform.new(
            rstate, ndim=None, dtype="float32", size=(n_substreams,)
        )
        rstate.default_update = new_rstate
        # Not really necessary, just mimicking
        # rng_mrg.MRG_RandomStream' behavior
        sample.rstate = rstate
        sample.update = (rstate, new_rstate)
        # We need the sample back in the main memory
        cpu_sample = at.as_tensor_variable(sample)
        f = aesara.function([], cpu_sample, mode=mode)
        for k in range(n_samples):
            s = f()
            stream_samples.append(s)
        # Transpose so samples interleave per-substream like the serial case.
        samples.append(np.array(stream_samples).T.flatten())
        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)
    samples = np.array(samples).flatten()
    assert np.allclose(samples, java_samples)
def test_GPUA_full_fill():
    # Make sure the whole sample buffer is filled. Also make sure
    # large samples are consistent with CPU results.
    # This needs to be large to trigger the problem on GPU
    size = (10, 1000)
    R = MRG_RandomStream(234)
    uni = R.uniform(size, nstreams=60 * 256)
    f_cpu = aesara.function([], uni)
    # Reuse the CPU stream's final state as the GPU starting state so both
    # draw the same sequence.
    rstate_gpu = gpuarray_shared_constructor(R.state_updates[-1][0].get_value())
    new_rstate, sample = GPUA_mrg_uniform.new(
        rstate_gpu, ndim=None, dtype="float32", size=size
    )
    rstate_gpu.default_update = new_rstate
    f_gpu = aesara.function([], sample, mode=mode)
    utt.assert_allclose(f_cpu(), f_gpu())
def test_overflow_gpu_new_backend():
    """Check GPUA_mrg_uniform size handling: sizes whose element count
    overflows must raise, reasonable and int32 sizes must not."""
    seed = 12345
    n_substreams = 7
    curr_rstate = np.array([seed] * 6, dtype="int32")
    rstate = [curr_rstate.copy()]
    for j in range(1, n_substreams):
        rstate.append(rng_mrg.ff_2p72(rstate[-1]))
    rstate = np.asarray(rstate)
    rstate = gpuarray_shared_constructor(rstate)
    fct = functools.partial(GPUA_mrg_uniform.new, rstate, ndim=None, dtype="float32")
    # should raise error as the size overflows
    sizes = [
        (2 ** 31,),
        (2 ** 32,),
        (
            2 ** 15,
            2 ** 16,
        ),
        (2, 2 ** 15, 2 ** 15),
    ]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=True)
    # should not raise error
    sizes = [(2 ** 5,), (2 ** 5, 2 ** 5), (2 ** 5, 2 ** 5, 2 ** 5)]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
    # should support int32 sizes
    sizes = [(np.int32(2 ** 10),), (np.int32(2), np.int32(2 ** 10), np.int32(2 ** 10))]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
def test_validate_input_types_gpuarray_backend():
    """Build mrg_uniform from a GPU-backed int32 state matrix while
    compute_test_value='raise' is active; construction must not raise."""
    with config.change_flags(compute_test_value="raise"):
        initial_state = np.zeros((7, 6), dtype="int32")
        shared_state = gpuarray_shared_constructor(initial_state)
        rng_mrg.mrg_uniform.new(shared_state, ndim=None, dtype="float32", size=(3,))
def test_f16_nonzero():
    """Run the CPU float16-nonzero test with GPU shared variables.

    Temporarily registers the gpuarray shared constructor so that
    aesara.shared() places values on the GPU; always deregisters it.
    """
    try:
        # To have aesara.shared(x) try to move on the GPU
        aesara.compile.shared_constructor(gpuarray_shared_constructor)
        cpu_f16_nonzero(mode=mode, op_to_check=GPUA_mrg_uniform)
    finally:
        aesara.compile.shared_constructor(gpuarray_shared_constructor, remove=True)
def test_cpu_target_with_shared_variable():
    """With target="cpu", uniform sampling must stay on the CPU even when
    the input shared variable lives on the GPU."""
    srng = MRG_RandomStream()
    s = np.random.rand(2, 3).astype("float32")
    x = gpuarray_shared_constructor(s, name="x")
    try:
        # To have aesara.shared(x) try to move on the GPU
        aesara.compile.shared_constructor(gpuarray_shared_constructor)
        y = srng.uniform(x.shape, target="cpu")
        y.name = "y"
        z = (x * y).sum()
        z.name = "z"
        fz = aesara.function([], z, mode=mode)
        # The compiled graph must contain no GPU sampling op.
        nodes = fz.maker.fgraph.toposort()
        assert not any(isinstance(node.op, GPUA_mrg_uniform) for node in nodes)
    finally:
        aesara.compile.shared_constructor(gpuarray_shared_constructor, remove=True)
tests/gpuarray/test_scan.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
numpy
as
np
import
pytest
import
aesara
import
aesara.sandbox.rng_mrg
from
aesara
import
gpuarray
from
aesara
import
tensor
as
at
from
aesara.gpuarray.basic_ops
import
GpuFromHost
,
HostFromGpu
from
aesara.gpuarray.elemwise
import
GpuElemwise
from
aesara.scan.basic
import
scan
from
aesara.scan.checkpoints
import
scan_checkpoints
from
aesara.scan.op
import
Scan
from
aesara.tensor.math
import
dot
from
aesara.tensor.math
import
sum
as
at_sum
from
aesara.tensor.type
import
fscalar
,
ftensor3
,
fvector
,
iscalar
,
vector
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
test_ctx_name
# Fix: the module name was misspelled "pygpy.gpuarray", which never exists,
# so importorskip unconditionally skipped this whole module even when pygpu
# was installed. The correct package is "pygpu".
pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray")

GpuArrayException = pygpu_gpuarray.GpuArrayException

# Pick an optimizing mode: FAST_COMPILE has too few rewrites for these
# tests, so upgrade it to FAST_RUN.
if aesara.config.mode == "FAST_COMPILE":
    mode_with_opt = aesara.compile.mode.get_mode("FAST_RUN")
else:
    mode_with_opt = aesara.compile.mode.get_default_mode()
# DebugMode is too slow for scan tests; fall back to FAST_RUN there too.
if aesara.config.mode in ("DEBUG_MODE", "DebugMode"):
    mode_nodebug = aesara.compile.mode.get_mode("FAST_RUN")
else:
    mode_nodebug = mode_with_opt
class TestScan:
    """GPU regression tests for ``scan``: transfer placement and mixed dtypes."""

    def test_one_sequence_one_output_weights_gpu1(self):
        # Simple RNN step: one sequence, one recurrent output, two weights.
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")
        # Exclude the input-transfer optimizer so the explicit GpuFromHost
        # below exercises the first case of the scan-to-GPU optimization.
        mode = mode_with_gpu.excluding("InputToGpuOptimizer")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode,
        )
        output = GpuFromHost(test_ctx_name)(output)
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=mode,
        )
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()
        v_u = np.asarray(v_u, dtype="float32")
        v_x0 = np.asarray(v_x0, dtype="float32")
        W = np.asarray(W, dtype="float32")
        W_in = np.asarray(W_in, dtype="float32")

        # compute the output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)

        # TO DEL
        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo if isinstance(node.op, scan.op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]

        topo = f2.maker.fgraph.toposort()
        # All transfers should have been moved outside/around the scan:
        # no host transfers, exactly 4 device transfers (the 4 inputs).
        assert sum([isinstance(node.op, HostFromGpu) for node in topo]) == 0
        assert sum([isinstance(node.op, GpuFromHost) for node in topo]) == 4

        scan_node = [node for node in topo if isinstance(node.op, scan.op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any(isinstance(node.op, GpuElemwise) for node in scan_node_topo)
        assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
        assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)

    # This second version test the second case in the optimizer to the gpu.
    def test_one_sequence_one_output_weights_gpu2(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode_with_gpu,
        )
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=mode_with_gpu,
        )

        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)

        topo = f2.maker.fgraph.toposort()
        # Here the host output is fetched once, so exactly one HostFromGpu.
        assert sum([isinstance(node.op, HostFromGpu) for node in topo]) == 1
        assert sum([isinstance(node.op, GpuFromHost) for node in topo]) == 4

        scan_node = [node for node in topo if isinstance(node.op, scan.op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any(isinstance(node.op, GpuElemwise) for node in scan_node_topo)
        assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
        assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)

    # This third test checks that scan can deal with a mixture of dtypes as
    # outputs when is running on GPU
    def test_gpu3_mixture_dtype_outputs(self):
        # Step returns a float32 recurrence and an int64 non-recurrent output.
        def f_rnn(u_t, x_tm1, W_in, W):
            return (u_t * W_in + x_tm1 * W, at.cast(u_t + x_tm1, "int64"))

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")
        output, updates = scan(
            f_rnn,
            u,
            [x0, None],
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode_with_gpu,
        )
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=mode_with_gpu,
        )

        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the output in numpy
        v_out1 = np.zeros((4,))
        v_out2 = np.zeros((4,), dtype="int64")
        v_out1[0] = v_u[0] * W_in + v_x0 * W
        v_out2[0] = v_u[0] + v_x0
        for step in range(1, 4):
            v_out1[step] = v_u[step] * W_in + v_out1[step - 1] * W
            v_out2[step] = np.int64(v_u[step] + v_out1[step - 1])

        aesara_out1, aesara_out2 = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_out1, v_out1)
        utt.assert_allclose(aesara_out2, v_out2)

        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo if isinstance(node.op, scan.op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        # The scan itself must have been moved to the GPU.
        assert scan_node.op.gpua

        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
        assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)

    def test_gpu4_gibbs_chain(self):
        # Ten steps of a Bernoulli resampling chain using MRG random streams.
        rng = np.random.default_rng(utt.fetch_seed())
        v_vsample = np.array(
            rng.binomial(
                1,
                0.5,
                size=(3, 20),
            ),
            dtype="float32",
        )
        vsample = aesara.shared(v_vsample)
        trng = aesara.sandbox.rng_mrg.MRG_RandomStream(utt.fetch_seed())

        def f(vsample_tm1):
            return (
                trng.binomial(vsample_tm1.shape, n=1, p=0.3, dtype="float32")
                * vsample_tm1
            )

        aesara_vsamples, updates = scan(
            f,
            [],
            vsample,
            [],
            n_steps=10,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode_with_gpu,
        )
        my_f = aesara.function(
            [],
            aesara_vsamples[-1],
            updates=updates,
            allow_input_downcast=True,
            mode=mode_with_gpu,
        )
        # Correctness is left to DebugMode; this is mostly a
        # "does the graph compile" kind of test.
        my_f()
class ScanGpuTests:
    """
    This class defines a number of tests for Scan on GPU as well as a few
    helper functions for these tests. The GPU tests defined in this class are
    independent of the GPU backend used. Because of this, a class inheriting
    from ScanGpuTests should define the following attributes and methods to
    make the tests run on a specific backend:

    - self.gpu_backend : Reference to the backend module
    - self.mode_with_opt : Compilation mode to force usage of the gpu backend
    - self.is_scan_on_gpu(node) : Method to determine whether a scan node has
                                  been moved to run on a gpu under the
                                  specific backend. Returns a boolean.
    """

    def test_one_sequence_one_output_weights_gpu1(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")

        # The following line is needed to have the first case being used
        # Otherwise, it is the second that is tested.
        mode = self.mode_with_gpu.excluding("InputToGpuOptimizer")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode,
        )

        output = self.gpu_backend.gpu_from_host(output)
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )

        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        v_u = np.asarray(v_u, dtype="float32")
        v_x0 = np.asarray(v_x0, dtype="float32")
        W = np.asarray(W, dtype="float32")
        W_in = np.asarray(W_in, dtype="float32")

        # compute the output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)

        # TO DEL
        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]

        topo = f2.maker.fgraph.toposort()
        # No host transfers; exactly 4 device transfers (the 4 inputs).
        assert (
            sum([isinstance(node.op, self.gpu_backend.HostFromGpu) for node in topo])
            == 0
        )
        assert (
            sum([isinstance(node.op, self.gpu_backend.GpuFromHost) for node in topo])
            == 4
        )

        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any(
            [
                isinstance(node.op, self.gpu_backend.GpuElemwise)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.HostFromGpu)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.GpuFromHost)
                for node in scan_node_topo
            ]
        )

    # This second version test the second case in the optimizer to the gpu.
    def test_one_sequence_one_output_weights_gpu2(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=self.mode_with_gpu,
        )
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )

        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        aesara_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_values, v_out)

        topo = f2.maker.fgraph.toposort()
        # One host fetch of the final output; 4 device transfers for inputs.
        assert (
            sum([isinstance(node.op, self.gpu_backend.HostFromGpu) for node in topo])
            == 1
        )
        assert (
            sum([isinstance(node.op, self.gpu_backend.GpuFromHost) for node in topo])
            == 4
        )

        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any(
            [
                isinstance(node.op, self.gpu_backend.GpuElemwise)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.HostFromGpu)
                for node in scan_node_topo
            ]
        )
        assert not any(
            [
                isinstance(node.op, self.gpu_backend.GpuFromHost)
                for node in scan_node_topo
            ]
        )

    # This third test checks that scan can deal with a mixture of dtypes as
    # outputs when is running on GPU
    def test_gpu3_mixture_dtype_outputs(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return (u_t * W_in + x_tm1 * W, at.cast(u_t + x_tm1, "int64"))

        u = fvector("u")
        x0 = fscalar("x0")
        W_in = fscalar("win")
        W = fscalar("w")
        output, updates = scan(
            f_rnn,
            u,
            [x0, None],
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=self.mode_with_gpu,
        )
        f2 = aesara.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )

        # get random initial values
        rng = np.random.default_rng(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        # compute the output in numpy
        v_out1 = np.zeros((4,))
        v_out2 = np.zeros((4,), dtype="int64")
        v_out1[0] = v_u[0] * W_in + v_x0 * W
        v_out2[0] = v_u[0] + v_x0
        for step in range(1, 4):
            v_out1[step] = v_u[step] * W_in + v_out1[step - 1] * W
            v_out2[step] = np.int64(v_u[step] + v_out1[step - 1])

        aesara_out1, aesara_out2 = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(aesara_out1, v_out1)
        utt.assert_allclose(aesara_out2, v_out2)

        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        assert self.is_scan_on_gpu(scan_node)

    def test_gibbs_chain(self):
        # Ten steps of a Bernoulli resampling chain using MRG random streams.
        rng = np.random.default_rng(utt.fetch_seed())
        v_vsample = np.array(
            rng.binomial(
                1,
                0.5,
                size=(3, 20),
            ),
            dtype="float32",
        )
        vsample = aesara.shared(v_vsample)
        trng = aesara.sandbox.rng_mrg.MRG_RandomStream(utt.fetch_seed())

        def f(vsample_tm1):
            return (
                trng.binomial(vsample_tm1.shape, n=1, p=0.3, dtype="float32")
                * vsample_tm1
            )

        aesara_vsamples, updates = scan(
            f,
            [],
            vsample,
            [],
            n_steps=10,
            truncate_gradient=-1,
            go_backwards=False,
            mode=self.mode_with_gpu,
        )
        my_f = aesara.function(
            [],
            aesara_vsamples[-1],
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )
        # Correctness is left to DebugMode; this was anyway more of a
        # "does the graph compile" kind of test.
        my_f()

    def test_gpu_memory_usage(self):
        # This test validates that the memory usage of the defined aesara
        # function is reasonable when executed on the GPU. It checks for
        # a bug in which one of scan's optimization was not applied which
        # made the scan node compute large and unnecessary outputs which
        # brought memory usage on the GPU to ~12G.

        # Dimensionality of input and output data (not one-hot coded)
        n_in = 100
        n_out = 100
        # Number of neurons in hidden layer
        n_hid = 4000
        # Number of minibatches
        mb_size = 2
        # Time steps in minibatch
        mb_length = 200

        # Define input variables
        xin = ftensor3(name="xin")
        yout = ftensor3(name="yout")

        # Initialize the network parameters
        U = aesara.shared(np.zeros((n_in, n_hid), dtype="float32"), name="W_xin_to_l1")
        V = aesara.shared(np.zeros((n_hid, n_hid), dtype="float32"), name="W_l1_to_l1")
        W = aesara.shared(np.zeros((n_hid, n_out), dtype="float32"), name="W_l1_to_l2")
        nparams = [U, V, W]

        # Build the forward pass
        l1_base = dot(xin, U)

        def scan_l(baseline, last_step):
            return baseline + dot(last_step, V)

        zero_output = at.alloc(np.asarray(0.0, dtype="float32"), mb_size, n_hid)

        l1_out, _ = scan(
            scan_l,
            sequences=[l1_base],
            outputs_info=[zero_output],
            mode=self.mode_with_gpu_nodebug,
        )

        l2_out = dot(l1_out, W)

        # Compute the cost and take the gradient wrt params
        cost = at_sum((l2_out - yout) ** 2)
        grads = aesara.grad(cost, nparams)
        updates = list(zip(nparams, (n - g for n, g in zip(nparams, grads))))

        # Compile the aesara function
        feval_backprop = aesara.function(
            [xin, yout], cost, updates=updates, mode=self.mode_with_gpu_nodebug
        )

        # Validate that the PushOutScanOutput optimization has been applied
        # by checking the number of outputs of the grad Scan node in the
        # compiled function.
        nodes = feval_backprop.maker.fgraph.toposort()
        scan_nodes = [n for n in nodes if isinstance(n.op, Scan)]

        # The grad scan is always the 2nd one according to toposort. If the
        # optimization has been applied, it has 2 outputs, otherwise 3.
        grad_scan_node = scan_nodes[1]
        assert len(grad_scan_node.outputs) == 2, len(grad_scan_node.outputs)

        # Call the aesara function to ensure the absence of a memory error
        feval_backprop(
            np.zeros((mb_length, mb_size, n_in), dtype="float32"),
            np.zeros((mb_length, mb_size, n_out), dtype="float32"),
        )

    def test_memory_reuse_gpudimshuffle(self):
        # Test the memory pre-allocation feature in scan when one output is
        # the result of a GpuDimshuffle (because an optimization in
        # GpuDimshuffle can cause issues with the memory pre-allocation
        # where it falsely thinks that a pre-allocated memory region has
        # been used when it hasn't).
        def inner_fn(seq1, recurrent_out):
            temp = seq1 + recurrent_out.sum()
            output1 = temp.dimshuffle(1, 0)
            output2 = temp.sum() + recurrent_out
            return output1, output2

        input1 = ftensor3()
        init = ftensor3()
        outputs_info = [None, init]

        out, _ = scan(
            inner_fn,
            sequences=[input1],
            outputs_info=outputs_info,
            mode=self.mode_with_gpu,
        )

        out1 = out[0].flatten()
        out2 = out[1].flatten()

        fct = aesara.function([input1, init], [out1, out2], mode=self.mode_with_gpu)

        output = fct(
            np.ones((2, 1, 1), dtype="float32"), np.ones((1, 1, 1), dtype="float32")
        )
        expected_output = (
            np.array([2, 4], dtype="float32"),
            np.array([3, 7], dtype="float32"),
        )
        utt.assert_allclose(output, expected_output)
class TestScanGpuarray(ScanGpuTests):
    """
    Concrete runner for the GPU scan tests defined in ``ScanGpuTests``,
    bound to the gpuarray backend.
    """

    def setup_method(self):
        self.gpu_backend = gpuarray

        # The backend module has no ``gpu_from_host`` helper, so patch one
        # in.  This is unfortunate, but required.
        def gpu_from_host(variable):
            return gpuarray.GpuFromHost(None)(variable)

        self.gpu_backend.gpu_from_host = gpu_from_host

        self.mode_with_gpu = mode_with_opt.including("gpuarray", "scan")
        self.mode_with_gpu_nodebug = mode_nodebug.including("gpuarray", "scan")

        # Without pygpu the backend cannot run at all: skip.
        if not self.gpu_backend.pygpu_activated:
            pytest.skip("Optional package pygpu disabled")

    def is_scan_on_gpu(self, node):
        """Return True when the given scan node was moved to the GPU."""
        return node.op.info.get("gpua", False)
class TestScanCheckpoint:
    """Compare plain ``scan`` against ``scan_checkpoints`` for GPU memory use."""

    def setup_method(self):
        # Build k-step elementwise-power graphs with and without checkpoints.
        self.k = iscalar("k")
        self.A = vector("A")
        result, _ = scan(
            fn=lambda prior_result, A: prior_result * A,
            outputs_info=at.ones_like(self.A),
            non_sequences=self.A,
            n_steps=self.k,
        )
        result_check, _ = scan_checkpoints(
            fn=lambda prior_result, A: prior_result * A,
            outputs_info=at.ones_like(self.A),
            non_sequences=self.A,
            n_steps=self.k,
            save_every_N=100,
        )
        self.result = result[-1]
        self.result_check = result_check[-1]
        self.grad_A = aesara.grad(self.result.sum(), self.A)
        self.grad_A_check = aesara.grad(self.result_check.sum(), self.A)

    def test_memory(self):
        # Imported here so collection does not require a GPU context.
        from tests.gpuarray.config import mode_with_gpu  # noqa

        f = aesara.function(
            inputs=[self.A, self.k], outputs=self.grad_A, mode=mode_with_gpu
        )
        f_check = aesara.function(
            inputs=[self.A, self.k], outputs=self.grad_A_check, mode=mode_with_gpu
        )
        # Size the input from the free GPU memory so the non-checkpointed
        # version is guaranteed to exhaust it.
        free_gmem = aesara.gpuarray.type._context_reg[None].free_gmem
        data = np.ones(free_gmem // 3000, dtype=np.float32)

        # Check that it works with the checkpoints
        size = 1000
        if isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
            size = 100
        f_check(data, size)

        # Check that the basic scan fails in that case
        # Skip that check in DebugMode, as it can fail in different ways
        if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
            with pytest.raises(GpuArrayException):
                f(data, 1000)
tests/gpuarray/test_sort.py — deleted (file mode 100644 → 0). View file @ c803c67e
from
aesara.gpuarray.sort
import
GpuTopKOp
from
tests.gpuarray.config
import
mode_with_gpu
from
tests.tensor.test_sort
import
TestTopK
class TestGpuTopK(TestTopK):
    """Run the generic top-k test suite against the GPU op and mode."""

    mode = mode_with_gpu
    dtype = "float32"
    op_class = GpuTopKOp
tests/gpuarray/test_subtensor.py — deleted (file mode 100644 → 0). View file @ c803c67e
import
numpy
as
np
import
aesara
from
aesara.compile
import
DeepCopyOp
from
aesara.gpuarray.basic_ops
import
GpuContiguous
,
GpuFromHost
,
HostFromGpu
from
aesara.gpuarray.elemwise
import
GpuDimShuffle
from
aesara.gpuarray.subtensor
import
(
GpuAdvancedIncSubtensor
,
GpuAdvancedIncSubtensor1
,
GpuAdvancedIncSubtensor1_dev20
,
GpuAdvancedSubtensor
,
GpuAdvancedSubtensor1
,
GpuAllocDiag
,
GpuExtractDiag
,
GpuIncSubtensor
,
GpuSubtensor
,
)
from
aesara.gpuarray.type
import
gpuarray_shared_constructor
from
aesara.tensor.basic
import
AllocDiag
,
ExtractDiag
from
aesara.tensor.math
import
sum
as
at_sum
from
aesara.tensor.subtensor
import
advanced_inc_subtensor1
,
inc_subtensor
from
aesara.tensor.type
import
ivectors
,
matrix
,
tensor
,
tensor4
,
vector
from
tests
import
unittest_tools
as
utt
from
tests.gpuarray.config
import
mode_with_gpu
,
test_ctx_name
from
tests.tensor.test_basic
import
TestAllocDiag
from
tests.tensor.test_subtensor
import
TestAdvancedSubtensor
,
TestSubtensor
class TestGPUSubtensor(TestSubtensor):
    """Run the generic subtensor test suite against the gpuarray ops (float32)."""

    def setup_method(self):
        # Shared-variable factory pinned to the test GPU context.
        def make_shared(value, **kwargs):
            return gpuarray_shared_constructor(value, target=test_ctx_name, **kwargs)

        self.shared = make_shared
        self.sub = GpuSubtensor
        self.inc_sub = GpuIncSubtensor
        self.adv_sub1 = GpuAdvancedSubtensor1
        self.adv_incsub1 = GpuAdvancedIncSubtensor1
        self.adv_sub = GpuAdvancedSubtensor
        self.dimshuffle = GpuDimShuffle
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float32"
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp, GpuContiguous)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuSubtensor
        super().setup_method()
class TestGPUSubtensorF16(TestSubtensor):
    """Same as ``TestGPUSubtensor`` but exercised at float16 precision."""

    def setup_method(self):
        # Shared-variable factory pinned to the test GPU context.
        def make_shared(value, **kwargs):
            return gpuarray_shared_constructor(value, target=test_ctx_name, **kwargs)

        self.shared = make_shared
        self.sub = GpuSubtensor
        self.inc_sub = GpuIncSubtensor
        self.adv_sub1 = GpuAdvancedSubtensor1
        self.adv_incsub1 = GpuAdvancedIncSubtensor1
        self.adv_sub = GpuAdvancedSubtensor
        self.dimshuffle = GpuDimShuffle
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float16"  # use floatX?
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp, GpuContiguous)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuSubtensor
        super().setup_method()
def test_advinc_subtensor1():
    """Test the second case in the opt ``local_gpu_advanced_incsubtensor1``.

    For 2-d and 3-d inputs, check that ``advanced_inc_subtensor1`` is lowered
    to exactly one ``GpuAdvancedIncSubtensor1`` node and that its result
    matches the NumPy reference (``np.add.at``).
    """
    # Loop-invariant: bind the constructor once instead of on every iteration.
    shared = gpuarray_shared_constructor
    for shp in [(3, 3), (3, 3, 3)]:
        xval = np.arange(np.prod(shp), dtype="float32").reshape(shp) + 1
        # np.full replaces the empty-then-fill idiom; same values, one step.
        yval = np.full((2,) + shp[1:], 10, dtype="float32")
        x = shared(xval, name="x")
        y = tensor(dtype="float32", broadcastable=(False,) * len(shp), name="y")
        expr = advanced_inc_subtensor1(x, y, [0, 2])
        f = aesara.function([y], expr, mode=mode_with_gpu)
        assert (
            sum(
                isinstance(node.op, GpuAdvancedIncSubtensor1)
                for node in f.maker.fgraph.toposort()
            )
            == 1
        )
        rval = f(yval)
        # NumPy reference: unbuffered in-place add at rows 0 and 2.
        rep = xval.copy()
        np.add.at(rep, [0, 2], yval)
        assert np.allclose(rval, rep)
def test_advinc_subtensor1_dtype():
    # Test the mixed dtype case
    shp = (3, 4)
    for dtype1, dtype2 in [
        ("float32", "int8"),
        ("float32", "float64"),
        ("uint64", "int8"),
        ("int64", "uint8"),
        ("float16", "int8"),
        ("float16", "float64"),
        ("float16", "float16"),
    ]:
        shared = gpuarray_shared_constructor
        xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
        yval = np.empty((2,) + shp[1:], dtype=dtype2)
        yval[:] = 10
        x = shared(xval, name="x")
        y = tensor(
            dtype=yval.dtype,
            broadcastable=(False,) * len(yval.shape),
            name="y",
        )
        expr = advanced_inc_subtensor1(x, y, [0, 2])
        f = aesara.function([y], expr, mode=mode_with_gpu)
        # The mixed-dtype path must select the dev20 variant of the op.
        assert (
            sum(
                [
                    isinstance(node.op, GpuAdvancedIncSubtensor1_dev20)
                    for node in f.maker.fgraph.toposort()
                ]
            )
            == 1
        )
        rval = f(yval)
        # NumPy reference: unbuffered in-place add at rows 0 and 2.
        rep = xval.copy()
        np.add.at(rep, [[0, 2]], yval)
        assert np.allclose(rval, rep)
@aesara.config.change_flags(deterministic="more")
def test_deterministic_flag():
    # With deterministic="more", the non-atomic (deterministic)
    # GpuAdvancedIncSubtensor1 must be chosen over the dev20 variant.
    shp = (3, 4)
    for dtype1, dtype2 in [("float32", "int8")]:
        shared = gpuarray_shared_constructor
        xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
        yval = np.empty((2,) + shp[1:], dtype=dtype2)
        yval[:] = 10
        x = shared(xval, name="x")
        y = tensor(
            dtype=yval.dtype,
            broadcastable=(False,) * len(yval.shape),
            name="y",
        )
        expr = advanced_inc_subtensor1(x, y, [0, 2])
        f = aesara.function([y], expr, mode=mode_with_gpu)
        assert (
            sum(
                [
                    isinstance(node.op, GpuAdvancedIncSubtensor1)
                    for node in f.maker.fgraph.toposort()
                ]
            )
            == 1
        )
        rval = f(yval)
        # NumPy reference: unbuffered in-place add at rows 0 and 2.
        rep = xval.copy()
        np.add.at(rep, [[0, 2]], yval)
        assert np.allclose(rval, rep)
def test_advinc_subtensor1_vector_scalar():
    # Test the case where x is a vector and y a scalar
    shp = (3,)
    for dtype1, dtype2 in [
        ("float32", "int8"),
        ("float32", "float64"),
        ("float16", "int8"),
        ("float16", "float64"),
        ("float16", "float16"),
        ("int8", "int8"),
        ("int16", "int16"),
    ]:
        shared = gpuarray_shared_constructor
        xval = np.arange(np.prod(shp), dtype=dtype1).reshape(shp) + 1
        yval = np.asarray(10, dtype=dtype2)
        x = shared(xval, name="x")
        y = tensor(
            dtype=yval.dtype,
            broadcastable=(False,) * len(yval.shape),
            name="y",
        )
        expr = advanced_inc_subtensor1(x, y, [0, 2])
        f = aesara.function([y], expr, mode=mode_with_gpu)
        # Either op variant is acceptable here, but exactly one must appear.
        assert (
            sum(
                [
                    isinstance(
                        node.op,
                        (GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1),
                    )
                    for node in f.maker.fgraph.toposort()
                ]
            )
            == 1
        )
        rval = f(yval)
        rep = xval.copy()
        rep[[0, 2]] += yval
        assert np.allclose(rval, rep)
def test_incsub_f16():
    # Check both advanced and basic inc_subtensor at float16 precision.
    shp = (3, 3)
    shared = gpuarray_shared_constructor
    xval = np.arange(np.prod(shp), dtype="float16").reshape(shp) + 1
    yval = np.empty((2,) + shp[1:], dtype="float16")
    yval[:] = 2
    x = shared(xval, name="x")
    y = tensor(dtype="float16", broadcastable=(False,) * len(shp), name="y")

    # Advanced increment at rows 0 and 2.
    expr = advanced_inc_subtensor1(x, y, [0, 2])
    f = aesara.function([y], expr, mode=mode_with_gpu)
    assert (
        sum(
            [
                isinstance(node.op, GpuAdvancedIncSubtensor1)
                for node in f.maker.fgraph.toposort()
            ]
        )
        == 1
    )
    rval = f(yval)
    rep = xval.copy()
    np.add.at(rep, [[0, 2]], yval)
    assert np.allclose(rval, rep)

    # Basic increment on the slice x[1:].
    expr = inc_subtensor(x[1:], y)
    f = aesara.function([y], expr, mode=mode_with_gpu)
    assert (
        sum([isinstance(node.op, GpuIncSubtensor) for node in f.maker.fgraph.toposort()])
        == 1
    )
    rval = f(yval)
    rep = xval.copy()
    rep[1:] += yval
    assert np.allclose(rval, rep)
def test_incsub_offset():
    # Test for https://github.com/Theano/Theano/issues/5670

    # Build a GPU variable whose value will carry a memory offset (the
    # slice below views the shared buffer starting at element 1).
    x = gpuarray_shared_constructor(np.zeros(5, dtype=aesara.config.floatX))
    offset_view = x[1:]

    # Apply inc_subtensor on the offset view.
    y = vector()
    z = inc_subtensor(offset_view[2:], y)

    # Route the result through `updates` so the increment may run in place.
    f = aesara.function([y], z, updates={x: z}, mode=mode_with_gpu)

    expected = np.array([0, 0, 1, 2], dtype=aesara.config.floatX)
    utt.assert_allclose(f([1, 2]), expected)
class TestGPUAdvancedSubtensor(TestAdvancedSubtensor):
    """Run the advanced-subtensor suite against the gpuarray ops (float32)."""

    def setup_method(self):
        self.shared = gpuarray_shared_constructor
        self.sub = GpuAdvancedSubtensor
        self.inc_sub = GpuAdvancedIncSubtensor
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float32"  # floatX?
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuAdvancedSubtensor
        super().setup_method()
class TestGPUAdvancedSubtensorF16(TestAdvancedSubtensor):
    """Same as ``TestGPUAdvancedSubtensor`` but exercised at float16."""

    def setup_method(self):
        self.shared = gpuarray_shared_constructor
        self.sub = GpuAdvancedSubtensor
        self.mode = mode_with_gpu
        # avoid errors with limited devices
        self.dtype = "float16"  # floatX?
        self.ignore_topo = (HostFromGpu, GpuFromHost, DeepCopyOp)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuAdvancedSubtensor
        super().setup_method()
def test_adv_subtensor():
    # Test the advancedsubtensor on gpu.
    shp = (2, 3, 4)
    shared = gpuarray_shared_constructor
    xval = np.arange(np.prod(shp), dtype=aesara.config.floatX).reshape(shp)
    idx1, idx2 = ivectors("idx1", "idx2")
    # Mixed indexing: integer vectors, newaxis (None), and a basic slice.
    idxs = [idx1, None, slice(0, 2, 1), idx2, None]
    x = shared(xval, name="x")
    expr = x[idxs]
    f = aesara.function([idx1, idx2], expr, mode=mode_with_gpu)
    assert (
        sum(
            [
                isinstance(node.op, GpuAdvancedSubtensor)
                for node in f.maker.fgraph.toposort()
            ]
        )
        == 1
    )
    idx1_val = [0, 1]
    idx2_val = [0, 1]
    rval = f(idx1_val, idx2_val)
    # NumPy reference using the same index expression.
    rep = xval[idx1_val, None, slice(0, 2, 1), idx2_val, None]
    assert np.allclose(rval, rep)
class TestGpuExtractDiag:
    """Check ``GpuExtractDiag`` against NumPy's ``diagonal``."""

    # (offset, axis1, axis2) combinations shared by the 4-d tensor tests.
    # Factored out: the two dtype variants previously duplicated this list.
    _TENSOR_CASES = [
        (1, 0, 1),
        (-1, 0, 1),
        (0, 1, 0),
        (-2, 1, 0),
        (-3, 1, 0),
        (-2, 2, 0),
        (3, 3, 0),
        (-1, 3, 2),
        (2, 2, 3),
        (-1, 2, 1),
        (1, 3, 1),
        (-1, 1, 3),
    ]

    def test_extractdiag_opt(self):
        # The host ExtractDiag op must be lifted to GpuExtractDiag.
        x = matrix()
        fn = aesara.function([x], ExtractDiag()(x), mode=mode_with_gpu)
        assert any(
            isinstance(node.op, GpuExtractDiag)
            for node in fn.maker.fgraph.toposort()
        )

    def test_matrix(self):
        # 2-d input at offsets 0, 2 and -3 against np.diagonal.
        x = matrix()
        np_x = np.arange(77).reshape(7, 11).astype(aesara.config.floatX)
        fn = aesara.function([x], GpuExtractDiag()(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np_x.diagonal())
        fn = aesara.function([x], GpuExtractDiag(2)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np_x.diagonal(2))
        fn = aesara.function([x], GpuExtractDiag(-3)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np_x.diagonal(-3))

    def _check_tensor_cases(self, dtype):
        # Compare GpuExtractDiag with np.diagonal on a 4-d input for every
        # (offset, axis1, axis2) combination in _TENSOR_CASES.
        x = tensor4()
        np_x = np.arange(30107).reshape(7, 11, 17, 23).astype(dtype)
        for offset, axis1, axis2 in self._TENSOR_CASES:
            assert np.allclose(
                GpuExtractDiag(offset, axis1, axis2)(x).eval({x: np_x}),
                np_x.diagonal(offset, axis1, axis2),
            )

    def test_tensor(self):
        self._check_tensor_cases(aesara.config.floatX)

    def test_tensor_float16(self):
        self._check_tensor_cases("float16")
class TestGpuAllocDiag(TestAllocDiag):
    """Check ``GpuAllocDiag`` (value + gradient) against NumPy's ``diag``."""

    def setup_method(self):
        self.alloc_diag = GpuAllocDiag
        self.mode = mode_with_gpu
        super().setup_method()

    def test_allocdiag_opt(self):
        # The host AllocDiag op must be lifted to GpuAllocDiag.
        x = vector()
        fn = aesara.function([x], AllocDiag()(x), mode=mode_with_gpu)
        assert any(
            isinstance(node.op, GpuAllocDiag)
            for node in fn.maker.fgraph.toposort()
        )

    def test_matrix(self):
        # Vector input at offsets 0, 2 and -3 against np.diag.
        x = vector()
        np_x = np.arange(7).astype(aesara.config.floatX)
        fn = aesara.function([x], GpuAllocDiag()(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np.diag(np_x))
        fn = aesara.function([x], GpuAllocDiag(2)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np.diag(np_x, 2))
        fn = aesara.function([x], GpuAllocDiag(-3)(x), mode=mode_with_gpu)
        assert np.allclose(fn(np_x), np.diag(np_x, -3))

    def _check_grad_at_offset(self, x, np_x, offset):
        # The gradient wrt the input vector must equal the matching
        # diagonal of the gradient wrt the full output matrix.
        # GpuAllocDiag(0) is the default-offset op (AllocDiag offset=0).
        mtx_x = GpuAllocDiag(offset)(x)
        sum_mtx_x = at_sum(mtx_x)
        grad_x = aesara.grad(sum_mtx_x, x)
        grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)

        fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
        fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)

        computed_grad_x = fn_grad_x(np_x)
        computed_grad_mtx_x = fn_grad_mtx_x(np_x)
        true_grad_x = np.diagonal(computed_grad_mtx_x, offset)
        assert np.allclose(computed_grad_x, true_grad_x)

    def test_grad(self):
        # Decomposed: the zero/positive/negative-offset checks were three
        # verbatim copies of the same block.
        x = vector()
        np_x = np.random.randn(7).astype(aesara.config.floatX)
        for offset in (0, 2, -3):
            self._check_grad_at_offset(x, np_x, offset)
tests/gpuarray/test_type.py
deleted
100644 → 0
浏览文件 @
c803c67e
import
os
from
pickle
import
Unpickler
import
numpy
as
np
import
pytest
import
aesara
from
aesara.compile.ops
import
DeepCopyOp
,
ViewOp
from
aesara.configdefaults
import
config
from
aesara.gpuarray.type
import
GpuArrayType
,
gpuarray_shared_constructor
from
aesara.tensor.basic
import
Rebroadcast
from
aesara.tensor.shape
import
specify_shape
from
aesara.tensor.type
import
row
from
tests.gpuarray.config
import
test_ctx_name
from
tests.gpuarray.test_basic_ops
import
rand_gpuarray
# Skip this entire test module when the optional ``pygpu`` package is
# not installed; the bound name is also used directly in the tests below.
pygpu = pytest.importorskip("pygpu")

# Disabled for now
# from tests.tensor.test_sharedvar import makeSharedTester
def test_deep_copy():
    """An identity function on a GPU vector compiles to a ``DeepCopyOp``
    and returns a value equal to its input."""
    for dtype in ("float16", "float32"):
        value = rand_gpuarray(20, dtype=dtype)
        vec = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")

        fn = aesara.function([vec], vec)

        first_node = fn.maker.fgraph.toposort()[0]
        assert isinstance(first_node.op, DeepCopyOp)

        result = fn(value)
        assert GpuArrayType.values_eq(result, value)
def test_view():
    """``ViewOp`` must survive compilation (with its removal rewrite
    disabled) and return a value equal to its input."""
    # The rewrite that strips ViewOp is excluded so the op stays in the graph.
    compile_mode = aesara.compile.get_default_mode().excluding("local_view_op")
    for dtype in ("float16", "float32"):
        value = rand_gpuarray(20, dtype=dtype)
        vec = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")

        fn = aesara.function([vec], ViewOp()(vec), mode=compile_mode)

        first_node = fn.maker.fgraph.toposort()[0]
        assert isinstance(first_node.op, ViewOp)

        result = fn(value)
        assert GpuArrayType.values_eq(result, value)
def test_rebroadcast():
    """``Rebroadcast`` on a length-1 GPU vector compiles, stays in the
    graph, and evaluates to a value equal to its input."""
    for dtype in ("float16", "float32"):
        value = rand_gpuarray(1, dtype=dtype)
        vec = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")

        fn = aesara.function([vec], Rebroadcast((0, True))(vec))

        first_node = fn.maker.fgraph.toposort()[0]
        assert isinstance(first_node.op, Rebroadcast)

        result = fn(value)
        assert GpuArrayType.values_eq(result, value)
def test_values_eq_approx():
    """``values_eq_approx`` is reflexive and detects single-element changes."""
    base = rand_gpuarray(20, dtype="float32")
    assert GpuArrayType.values_eq_approx(base, base)

    # Shift one element by a full unit: no longer approximately equal.
    shifted = base.copy()
    shifted[0] = np.asarray(shifted[0]) + 1.0
    assert not GpuArrayType.values_eq_approx(base, shifted)

    # Flip the sign of one element: also not approximately equal.
    negated = base.copy()
    negated[0] = -np.asarray(negated[0])
    assert not GpuArrayType.values_eq_approx(base, negated)
def test_specify_shape():
    """``specify_shape`` with the matching shape accepts a GPU input."""
    for dtype in ("float16", "float32"):
        value = rand_gpuarray(20, dtype=dtype)
        vec = GpuArrayType(dtype=dtype, broadcastable=(False,))("g")
        fn = aesara.function([vec], specify_shape(vec, [20]))
        fn(value)
def test_filter_float():
    """A GPU shared scalar must accept a plain Python float in ``updates``."""
    # Temporarily register the GPU shared constructor.
    aesara.compile.shared_constructor(gpuarray_shared_constructor)
    try:
        shared_var = aesara.shared(
            np.array(0.0, dtype="float32"), target=test_ctx_name
        )
        aesara.function([], updates=[(shared_var, 0.0)])
    finally:
        # Undo the registration so later tests see the default constructors.
        del aesara.compile.sharedvalue.shared.constructors[-1]
def test_filter_variable():
    """``filter_variable`` accepts variables with a more restrictive
    broadcast pattern, for GPU and CPU inputs alike."""
    floatX = aesara.config.floatX
    gpu_row_type = GpuArrayType(dtype=floatX, broadcastable=(True, False))
    gpu_matrix_type = GpuArrayType(dtype=floatX, broadcastable=(False, False))

    # A GPU row is accepted where a GPU matrix is expected.
    filtered = gpu_matrix_type.filter_variable(gpu_row_type())
    assert filtered.type == gpu_matrix_type

    # On CPU as well
    filtered = gpu_matrix_type.filter_variable(row())
    assert filtered.type == gpu_matrix_type
def test_gpuarray_shared_scalar():
    """Scalars are rejected as GPU shared variables unless a target
    context is given explicitly."""
    # By default, we don't put scalar as shared variable on the GPU
    with pytest.raises(TypeError):
        gpuarray_shared_constructor(np.asarray(1, dtype="float32"))
    # But we can force that
    gpuarray_shared_constructor(
        np.asarray(1, dtype="float32"), target=test_ctx_name
    )
def test_unpickle_gpuarray_as_numpy_ndarray_flag0():
    """With ``experimental__unpickle_gpu_on_cpu`` disabled, a pickled
    GpuArray unpickles as a ``pygpu`` GpuArray with its data intact.

    Test when pygpu isn't there for unpickle are in test_pickle.py.
    """
    saved_flag = config.experimental__unpickle_gpu_on_cpu
    config.experimental__unpickle_gpu_on_cpu = False
    try:
        pkl_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "GpuArray.pkl"
        )
        with open(pkl_path, "rb") as fp:
            mat = Unpickler(fp, encoding="latin1").load()
        assert isinstance(mat, pygpu.gpuarray.GpuArray)
        assert np.asarray(mat)[0] == -42.0
    finally:
        # Always restore the global flag for subsequent tests.
        config.experimental__unpickle_gpu_on_cpu = saved_flag
# These tests are disabled because they expect the impossible
# @makeSharedTester(
# shared_constructor_=gpuarray_shared_constructor,
# dtype_=aesara.config.floatX,
# get_value_borrow_true_alias_=True,
# shared_borrow_true_alias_=True,
# set_value_borrow_true_alias_=True,
# set_value_inplace_=True,
# set_cast_value_inplace_=False,
# shared_constructor_accept_ndarray_=True,
# internal_type_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray),
# test_internal_type_=lambda a: isinstance(a, pygpu.gpuarray.GpuArray),
# aesara_fct_=aesara.tensor.exp,
# ref_fct_=np.exp,
# cast_value_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray))
# class TestSharedOptions(object):
# pass
# @makeSharedTester(
# shared_constructor_=gpuarray_shared_constructor,
# dtype_=aesara.config.floatX,
# get_value_borrow_true_alias_=False,
# shared_borrow_true_alias_=False,
# set_value_borrow_true_alias_=False,
# set_value_inplace_=True,
# set_cast_value_inplace_=True,
# shared_constructor_accept_ndarray_=True,
# internal_type_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray),
# test_internal_type_=lambda a: isinstance(a, pygpu.gpuarray.GpuArray),
# aesara_fct_=aesara.tensor.exp,
# ref_fct_=np.exp,
# cast_value_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
# cls=pygpu._array.ndgpuarray))
# class TestSharedOptions2(object):
# pass
def test_set_value_non_contiguous():
    """``set_value`` must cope with a non-contiguous internal value."""
    shared_var = gpuarray_shared_constructor(
        np.asarray([[1.0, 2.0], [1.0, 2.0], [5, 6]])
    )
    # Borrow a strided (every-other-row) view of the internal buffer,
    # making the shared value non-contiguous.
    internal = shared_var.get_value(borrow=True, return_internal_type=True)
    shared_var.set_value(internal[::2], borrow=True)
    current = shared_var.get_value(borrow=True, return_internal_type=True)
    assert not current.flags["C_CONTIGUOUS"]
    # In the past, this failed
    shared_var.set_value([[0, 0], [1, 1]])
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论