Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
43411345
提交
43411345
authored
12月 03, 2016
作者:
Gijs van Tulder
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Batch normalization optimizations for old gpu backend.
上级
ef21cb58
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
846 行增加
和
86 行删除
+846
-86
dnn.py
theano/sandbox/cuda/dnn.py
+507
-30
opt.py
theano/sandbox/cuda/opt.py
+25
-0
test_dnn.py
theano/sandbox/cuda/tests/test_dnn.py
+314
-56
没有找到文件。
theano/sandbox/cuda/dnn.py
浏览文件 @
43411345
...
...
@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
assert_conv_shape
)
from
theano.tensor.signal.pool
import
(
Pool
,
MaxPoolGrad
,
AveragePoolGrad
)
from
theano.tensor.nnet
import
bn
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda
import
GpuOp
,
dnn_available
...
...
@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase):
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
"""
__props__
=
(
'mode'
,
'epsilon'
)
...
...
@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s;
result
=
"""
cudnnStatus_t err
%(name)
s;
cudnnBatchNormMode_t mode
%(name)
s =
%(mode)
s;
double exponentialAverageFactor
%(name)
s =
%(exp_avg_factor)
f;
double epsilon
%(name)
s =
%(epsilon)
e;
"""
%
dict
(
name
=
name
,
mode
=
mode
,
exp_avg_factor
=
0
,
# deliberately unused
epsilon
=
self
.
epsilon
)
return
result
def
c_code_cache_version
(
self
):
return
(
3
,
version
())
return
(
4
,
version
())
class
GpuDnnBatchNormInference
(
GpuDnnBatchNormBase
):
...
...
@@ -2422,8 +2438,21 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
Note: scale, bias, mean and variance must follow the same tensor layout!
"""
__props__
=
(
'mode'
,
'epsilon'
,
'inplace'
)
tensor_descs
=
[
'bn_input'
,
'bn_output'
,
'bn_params'
]
def
__init__
(
self
,
mode
=
'per-activation'
,
epsilon
=
1e-4
,
inplace
=
False
):
super
(
GpuDnnBatchNormInference
,
self
)
.
__init__
(
mode
=
mode
,
epsilon
=
epsilon
)
self
.
inplace
=
inplace
if
self
.
inplace
:
self
.
destroy_map
=
{
0
:
[
0
]}
def
get_op_params
(
self
):
params
=
[]
if
self
.
inplace
:
params
.
append
((
'INPLACE_OUTPUT'
,
'1'
))
return
params
def
infer_shape
(
self
,
node
,
shape
):
# output shape equals shape of x
return
[
shape
[
0
]]
...
...
@@ -2460,10 +2489,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0)
}
// build and prepare the output variable
#ifdef INPLACE_OUTPUT
Py_XDECREF(
%(outp)
s);
%(outp)
s =
%(inp)
s;
Py_INCREF(
%(outp)
s);
#else
if (CudaNdarray_prep_output(&
%(outp)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(inp)
s)) != 0)
{
%(fail)
s
}
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(
%(outp)
s, bn_output_
%(name)
s) != 0)
...
...
@@ -2494,6 +2529,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference(
"""
%
dict
(
name
=
name
,
inp
=
inp
,
scale
=
scale
,
bias
=
bias
,
est_mean
=
est_mean
,
est_var
=
est_var
,
outp
=
outp
,
fail
=
sub
[
'fail'
])
# add params
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
,
check_input
=
False
)
result
=
"""
%(define_macros)
s
{
%(code)
s
}
%(undef_macros)
s
"""
%
dict
(
code
=
result
,
define_macros
=
define_macros
,
undef_macros
=
undef_macros
)
return
result
def
grad
(
self
,
inputs
,
grads
):
...
...
@@ -2537,28 +2582,84 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase):
Note: scale and bias must follow the same tensor layout!
"""
__props__
=
(
'mode'
,
'epsilon'
,
'running_averages'
,
'inplace_running_mean'
,
'inplace_running_var'
,
'inplace_output'
)
tensor_descs
=
[
'bn_input'
,
'bn_output'
,
'bn_params'
]
def
__init__
(
self
,
mode
=
'per-activation'
,
epsilon
=
1e-4
,
running_average_factor
=
0
,
running_averages
=
False
,
inplace_running_mean
=
False
,
inplace_running_var
=
False
,
inplace_output
=
False
):
super
(
GpuDnnBatchNorm
,
self
)
.
__init__
(
mode
=
mode
,
epsilon
=
epsilon
)
self
.
running_average_factor
=
running_average_factor
self
.
running_averages
=
running_averages
self
.
inplace_output
=
inplace_output
self
.
inplace_running_mean
=
inplace_running_mean
self
.
inplace_running_var
=
inplace_running_var
self
.
destroy_map
=
{}
if
self
.
inplace_output
:
self
.
destroy_map
[
0
]
=
[
0
]
if
self
.
running_averages
and
self
.
inplace_running_mean
:
self
.
destroy_map
[
3
]
=
[
3
]
if
self
.
running_averages
and
self
.
inplace_running_var
:
self
.
destroy_map
[
4
]
=
[
4
]
def
get_op_params
(
self
):
params
=
[]
if
self
.
inplace_output
:
params
.
append
((
'INPLACE_OUTPUT'
,
'1'
))
if
self
.
running_averages
:
params
.
append
((
'RUNNING_AVERAGES'
,
'1'
))
if
self
.
inplace_running_mean
:
params
.
append
((
'INPLACE_RUNNING_MEAN'
,
'1'
))
if
self
.
inplace_running_var
:
params
.
append
((
'INPLACE_RUNNING_VAR'
,
'1'
))
return
params
def
infer_shape
(
self
,
node
,
shape
):
# first output equals shape of x
#
second and third output
equal shape of scale
return
[
shape
[
0
]
,
shape
[
1
],
shape
[
1
]]
#
other outputs
equal shape of scale
return
[
shape
[
0
]
]
+
[
shape
[
1
]]
*
(
len
(
node
.
outputs
)
-
1
)
def
make_node
(
self
,
x
,
scale
,
bias
):
def
make_node
(
self
,
x
,
scale
,
bias
,
running_mean
=
None
,
running_var
=
None
):
assert
x
.
ndim
==
scale
.
ndim
==
bias
.
ndim
assert
x
.
ndim
in
(
4
,
5
)
assert
self
.
running_averages
==
(
running_mean
is
not
None
)
==
(
running_var
is
not
None
)
assert
(
running_mean
is
None
or
running_mean
.
ndim
==
x
.
ndim
)
assert
(
running_var
is
None
or
running_var
.
ndim
==
x
.
ndim
)
x
=
as_cuda_ndarray_variable
(
x
)
scale
=
as_cuda_ndarray_variable
(
scale
)
bias
=
as_cuda_ndarray_variable
(
bias
)
assert
x
.
ndim
==
scale
.
ndim
==
bias
.
ndim
assert
x
.
ndim
in
(
4
,
5
)
return
Apply
(
self
,
[
x
,
scale
,
bias
],
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()])
inputs
=
[
x
,
scale
,
bias
]
output_types
=
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()]
if
running_mean
is
not
None
and
running_var
is
not
None
:
inputs
.
append
(
as_cuda_ndarray_variable
(
running_mean
))
inputs
.
append
(
as_cuda_ndarray_variable
(
running_var
))
output_types
.
append
(
scale
.
type
())
output_types
.
append
(
scale
.
type
())
return
Apply
(
self
,
inputs
,
output_types
)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
# super call to prepare common configuration
result
=
super
(
GpuDnnBatchNorm
,
self
)
.
c_code
(
node
,
name
,
inputs
,
outputs
,
sub
)
# give sensible names to inputs and outputs
inp
,
scale
,
bias
=
inputs
outp
,
x_mean
,
x_invstd
=
outputs
inp
,
scale
,
bias
=
inputs
[:
3
]
outp
,
x_mean
,
x_invstd
=
outputs
[:
3
]
if
self
.
running_averages
:
running_average_factor
=
self
.
running_average_factor
in_running_mean
=
inputs
[
3
]
in_running_var
=
inputs
[
4
]
out_running_mean
=
outputs
[
3
]
out_running_var
=
outputs
[
4
]
else
:
running_average_factor
=
0.
in_running_mean
=
'NULL'
in_running_var
=
'NULL'
out_running_mean
=
'NULL'
out_running_var
=
'NULL'
# set input tensor descriptors from input tensors
result
+=
"""
...
...
@@ -2579,6 +2680,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp
{
%(fail)
s
}
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(
%(out_running_mean)
s);
CudaNdarray *running_mean
%(name)
s =
%(in_running_mean)
s;
Py_INCREF(running_mean
%(name)
s);
#else
if ((CudaNdarray_prep_output(&
%(out_running_mean)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(scale)
s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(
%(out_running_mean)
s,
%(in_running_mean)
s) != 0))
{
%(fail)
s
}
CudaNdarray *running_mean
%(name)
s =
%(out_running_mean)
s;
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(
%(out_running_var)
s);
CudaNdarray *running_var
%(name)
s =
%(in_running_var)
s;
Py_INCREF(running_var
%(name)
s);
#else
if ((CudaNdarray_prep_output(&
%(out_running_var)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(scale)
s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(
%(out_running_var)
s,
%(in_running_var)
s) != 0))
{
%(fail)
s
}
CudaNdarray *running_var
%(name)
s =
%(out_running_var)
s;
#endif
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(
%(outp)
s, bn_output_
%(name)
s) != 0)
...
...
@@ -2601,25 +2728,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining(
bn_params_
%(name)
s,
CudaNdarray_DEV_DATA(
%(scale)
s),
CudaNdarray_DEV_DATA(
%(bias)
s),
exponentialAverageFactor
%(name)
s,
NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused
#ifdef RUNNING_AVERAGES
%(running_average_factor)
f,
CudaNdarray_DEV_DATA(running_mean
%(name)
s),
CudaNdarray_DEV_DATA(running_var
%(name)
s),
#else
0,
NULL,
NULL,
#endif
epsilon
%(name)
s,
CudaNdarray_DEV_DATA(
%(x_mean)
s),
CudaNdarray_DEV_DATA(
%(x_invstd)
s)
);
}
#ifdef RUNNING_AVERAGES
%(out_running_mean)
s = running_mean
%(name)
s;
%(out_running_var)
s = running_var
%(name)
s;
#endif
"""
%
dict
(
name
=
name
,
inp
=
inp
,
scale
=
scale
,
bias
=
bias
,
outp
=
outp
,
x_mean
=
x_mean
,
x_invstd
=
x_invstd
,
fail
=
sub
[
'fail'
])
x_mean
=
x_mean
,
x_invstd
=
x_invstd
,
running_average_factor
=
running_average_factor
,
in_running_mean
=
in_running_mean
,
in_running_var
=
in_running_var
,
out_running_mean
=
out_running_mean
,
out_running_var
=
out_running_var
,
fail
=
sub
[
'fail'
])
# add params
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
,
check_input
=
False
)
result
=
"""
%(define_macros)
s
{
%(code)
s
}
%(undef_macros)
s
"""
%
dict
(
code
=
result
,
define_macros
=
define_macros
,
undef_macros
=
undef_macros
)
return
result
def
grad
(
self
,
inputs
,
grads
):
x
,
scale
,
bias
=
inputs
x
,
scale
,
bias
=
inputs
[:
3
]
dy
=
grads
[
0
]
_
,
x_mean
,
x_invstd
=
self
(
x
,
scale
,
bias
)
return
GpuDnnBatchNormGrad
(
self
.
mode
,
self
.
epsilon
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
)
_
,
x_mean
,
x_invstd
=
self
(
*
inputs
)[:
3
]
disconnected_outputs
=
[]
# Optional running_mean and running_var.
for
i
in
range
(
3
,
len
(
inputs
)):
disconnected_outputs
.
append
(
DisconnectedType
()())
return
GpuDnnBatchNormGrad
(
self
.
mode
,
self
.
epsilon
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
)
+
disconnected_outputs
def
connection_pattern
(
self
,
node
):
patterns
=
[[
True
,
True
,
True
],
# x
[
True
,
True
,
True
],
# scale
[
True
,
True
,
True
]]
# bias
# Optional running_mean and running_var are only
# connected to their new values.
for
i
in
range
(
3
,
len
(
node
.
inputs
)):
patterns
[
0
]
.
append
(
True
)
for
pattern
in
patterns
[
1
:]:
pattern
.
append
(
False
)
patterns
.
append
([
False
]
*
(
i
)
+
[
True
])
return
patterns
class
GpuDnnBatchNormGrad
(
GpuDnnBatchNormBase
):
...
...
@@ -2722,7 +2890,8 @@ err%(name)s = cudnnBatchNormalizationBackward(
def
dnn_batch_normalization_train
(
inputs
,
gamma
,
beta
,
mode
=
'per-activation'
,
epsilon
=
1e-4
):
epsilon
=
1e-4
,
running_average_factor
=
0.1
,
running_mean
=
None
,
running_var
=
None
):
"""
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
...
...
@@ -2742,6 +2911,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns
-------
...
...
@@ -2751,6 +2937,12 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Means of `inputs` across the normalization axes.
invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes
-----
...
...
@@ -2762,31 +2954,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
invstd = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) +
\\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) +
\\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4).
"""
ndim
=
inputs
.
ndim
if
ndim
>
5
:
raise
ValueError
(
"dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got
%
d"
%
ndim
)
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
"as inputs; got
%
d and
%
d instead of
%
d"
%
(
gamma
.
ndim
,
beta
.
ndim
,
ndim
))
if
(
running_mean
is
None
)
!=
(
running_var
is
None
):
raise
ValueError
(
"running_mean and running_var must either both be "
"given or both be None"
)
if
running_mean
is
not
None
and
running_mean
.
ndim
!=
ndim
:
raise
ValueError
(
"running_mean must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_mean
.
ndim
,
ndim
))
if
running_var
is
not
None
and
running_var
.
ndim
!=
ndim
:
raise
ValueError
(
"running_var must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_var
.
ndim
,
ndim
))
if
epsilon
<
1e-5
:
raise
ValueError
(
"epsilon must be at least 1e-5, got
%
f"
%
epsilon
)
running_averages
=
(
running_var
is
not
None
and
running_var
is
not
None
)
if
ndim
<
4
:
inputs
=
theano
.
tensor
.
shape_padright
(
inputs
,
4
-
ndim
)
gamma
=
theano
.
tensor
.
shape_padright
(
gamma
,
4
-
ndim
)
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
batchnorm_op
=
GpuDnnBatchNorm
(
mode
=
mode
,
epsilon
=
epsilon
)
result
=
tuple
(
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
)))
if
running_averages
:
running_mean
=
theano
.
tensor
.
shape_padright
(
running_mean
,
4
-
ndim
)
running_var
=
theano
.
tensor
.
shape_padright
(
running_var
,
4
-
ndim
)
elif
ndim
>
5
:
inputs_shape
=
inputs
.
shape
params_shape
=
gamma
.
shape
inputs
=
theano
.
tensor
.
flatten
(
inputs
,
5
)
gamma
=
theano
.
tensor
.
flatten
(
gamma
,
5
)
beta
=
theano
.
tensor
.
flatten
(
beta
,
5
)
if
running_averages
:
running_mean
=
theano
.
tensor
.
flatten
(
running_mean
,
5
)
running_var
=
theano
.
tensor
.
flatten
(
running_var
,
5
)
batchnorm_op
=
GpuDnnBatchNorm
(
mode
=
mode
,
epsilon
=
epsilon
,
running_average_factor
=
running_average_factor
,
running_averages
=
running_averages
)
if
running_averages
:
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
running_mean
=
gpu_contiguous
(
running_mean
),
running_var
=
gpu_contiguous
(
running_var
))
if
new_running_mean
.
broadcastable
!=
running_mean
.
broadcastable
:
new_running_mean
=
tensor
.
patternbroadcast
(
new_running_mean
,
running_mean
.
broadcastable
)
if
new_running_var
.
broadcastable
!=
running_var
.
broadcastable
:
new_running_var
=
tensor
.
patternbroadcast
(
new_running_var
,
running_var
.
broadcastable
)
result
=
(
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
)
else
:
result
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
))
if
ndim
<
4
:
result
=
tuple
(
theano
.
tensor
.
flatten
(
r
,
ndim
)
for
r
in
result
)
elif
ndim
>
5
:
result
=
(
theano
.
tensor
.
reshape
(
result
[
0
],
inputs_shape
),)
+
tuple
(
theano
.
tensor
.
reshape
(
r
,
params_shape
)
for
r
in
result
[
1
:])
return
result
...
...
@@ -2839,9 +3078,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
ndim
=
inputs
.
ndim
if
ndim
>
5
:
raise
ValueError
(
"dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got
%
d"
%
ndim
)
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
"as inputs; got
%
d and
%
d instead of
%
d"
%
...
...
@@ -2859,12 +3095,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
mean
=
theano
.
tensor
.
shape_padright
(
mean
,
4
-
ndim
)
var
=
theano
.
tensor
.
shape_padright
(
var
,
4
-
ndim
)
elif
ndim
>
5
:
inputs_shape
=
inputs
.
shape
inputs
=
theano
.
tensor
.
flatten
(
inputs
,
5
)
gamma
=
theano
.
tensor
.
flatten
(
gamma
,
5
)
beta
=
theano
.
tensor
.
flatten
(
beta
,
5
)
mean
=
theano
.
tensor
.
flatten
(
mean
,
5
)
var
=
theano
.
tensor
.
flatten
(
var
,
5
)
batchnorm_op
=
GpuDnnBatchNormInference
(
mode
=
mode
,
epsilon
=
epsilon
)
result
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
gpu_contiguous
(
mean
),
gpu_contiguous
(
var
))
if
ndim
<
4
:
result
=
theano
.
tensor
.
flatten
(
result
,
ndim
)
elif
ndim
>
5
:
result
=
theano
.
tensor
.
reshape
(
result
,
inputs_shape
)
return
result
...
...
@@ -3334,3 +3579,235 @@ def local_abstractconv3d_cudnn(node):
subsample
=
node
.
op
.
subsample
,
conv_mode
=
conv_mode
)
return
[
rval
]
@local_optimizer
([
bn
.
AbstractBatchNormTrain
])
def
local_abstract_batch_norm_train_cudnn
(
node
):
if
not
isinstance
(
node
.
op
,
bn
.
AbstractBatchNormTrain
):
return
None
x
,
scale
,
bias
,
epsilon
,
running_average_factor
=
node
.
inputs
[:
5
]
running_mean
=
node
.
inputs
[
5
]
if
len
(
node
.
inputs
)
>
5
else
None
running_var
=
node
.
inputs
[
6
]
if
len
(
node
.
inputs
)
>
6
else
None
# input on gpu? TODO what about the output?
x_on_gpu
=
(
isinstance
(
x
.
type
,
CudaNdarrayType
)
or
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)))
if
not
x_on_gpu
:
return
None
# convert axes to cuDNN mode
axes
=
tuple
(
node
.
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
try
:
eps
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
try
:
running_average_factor
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
running_average_factor
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
not
dnn_available
():
return
None
x
=
as_cuda_ndarray_variable
(
x
)
scale
=
as_cuda_ndarray_variable
(
scale
)
bias
=
as_cuda_ndarray_variable
(
bias
)
inputs
=
[
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
]
if
running_mean
is
not
None
and
running_var
is
not
None
:
inputs
.
append
(
running_mean
)
inputs
.
append
(
running_var
)
results
=
list
(
dnn_batch_normalization_train
(
*
inputs
))
# If the original output was on CPU, we have to transfer it
for
i
in
range
(
len
(
node
.
outputs
)):
if
isinstance
(
node
.
outputs
[
i
]
.
type
,
tensor
.
TensorType
):
results
[
i
]
=
tensor
.
as_tensor_variable
(
results
[
i
])
# TODO copy_stack_trace?
return
results
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_gpu_batch_norm_inplace_output
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
not
node
.
op
.
inplace_output
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
running_average_factor
=
node
.
op
.
running_average_factor
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
node
.
op
.
inplace_running_mean
,
inplace_running_var
=
node
.
op
.
inplace_running_var
,
inplace_output
=
True
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_gpu_batch_norm_inplace_running_mean
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
node
.
op
.
running_averages
and
not
node
.
op
.
inplace_running_mean
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
running_average_factor
=
node
.
op
.
running_average_factor
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
True
,
inplace_running_var
=
node
.
op
.
inplace_running_var
,
inplace_output
=
node
.
op
.
inplace_output
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_gpu_batch_norm_inplace_running_var
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
node
.
op
.
running_averages
and
not
node
.
op
.
inplace_running_var
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
running_average_factor
=
node
.
op
.
running_average_factor
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
node
.
op
.
inplace_running_mean
,
inplace_running_var
=
True
,
inplace_output
=
node
.
op
.
inplace_output
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNormInference
],
inplace
=
True
)
def
local_gpu_batch_norm_inference_inplace
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNormInference
)
and
not
node
.
op
.
inplace
:
return
[
GpuDnnBatchNormInference
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
inplace
=
True
)(
*
node
.
inputs
)]
@local_optimizer
([
bn
.
AbstractBatchNormTrainGrad
])
def
local_abstract_batch_norm_train_grad_cudnn
(
node
):
if
not
isinstance
(
node
.
op
,
bn
.
AbstractBatchNormTrainGrad
):
return
None
x
,
dy
,
scale
,
x_mean
,
x_invstd
,
epsilon
=
node
.
inputs
# input on gpu? TODO what about the output?
x_on_gpu
=
(
isinstance
(
x
.
type
,
CudaNdarrayType
)
or
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)))
dy_on_gpu
=
(
isinstance
(
dy
.
type
,
CudaNdarrayType
)
or
(
dy
.
owner
and
isinstance
(
dy
.
owner
.
op
,
HostFromGpu
)))
if
not
(
x_on_gpu
or
dy_on_gpu
):
return
None
# convert axes to cuDNN mode
axes
=
tuple
(
node
.
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
ndim
=
x
.
ndim
if
ndim
<
4
:
x
=
theano
.
tensor
.
shape_padright
(
x
,
4
-
ndim
)
dy
=
theano
.
tensor
.
shape_padright
(
dy
,
4
-
ndim
)
scale
=
theano
.
tensor
.
shape_padright
(
scale
,
4
-
ndim
)
x_mean
=
theano
.
tensor
.
shape_padright
(
x_mean
,
4
-
ndim
)
x_invstd
=
theano
.
tensor
.
shape_padright
(
x_invstd
,
4
-
ndim
)
elif
ndim
>
5
:
x_shape
=
x
.
shape
params_shape
=
scale
.
shape
x
=
theano
.
tensor
.
flatten
(
x
,
5
)
dy
=
theano
.
tensor
.
flatten
(
dy
,
5
)
scale
=
theano
.
tensor
.
flatten
(
scale
,
5
)
x_mean
=
theano
.
tensor
.
flatten
(
x_mean
,
5
)
x_invstd
=
theano
.
tensor
.
flatten
(
x_invstd
,
5
)
try
:
eps
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
if
not
dnn_available
():
return
None
x
=
as_cuda_ndarray_variable
(
x
)
dy
=
as_cuda_ndarray_variable
(
dy
)
scale
=
as_cuda_ndarray_variable
(
scale
)
x_mean
=
as_cuda_ndarray_variable
(
x_mean
)
x_invstd
=
as_cuda_ndarray_variable
(
x_invstd
)
g_wrt_inputs
,
g_wrt_scale
,
g_wrt_bias
=
\
GpuDnnBatchNormGrad
(
mode
,
epsilon
=
eps
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
)
if
ndim
<
4
:
g_wrt_inputs
=
theano
.
tensor
.
flatten
(
g_wrt_inputs
,
ndim
)
g_wrt_scale
=
theano
.
tensor
.
flatten
(
g_wrt_scale
,
ndim
)
g_wrt_bias
=
theano
.
tensor
.
flatten
(
g_wrt_bias
,
ndim
)
elif
ndim
>
5
:
g_wrt_inputs
=
theano
.
tensor
.
reshape
(
g_wrt_inputs
,
x_shape
)
g_wrt_scale
=
theano
.
tensor
.
reshape
(
g_wrt_scale
,
params_shape
)
g_wrt_bias
=
theano
.
tensor
.
reshape
(
g_wrt_bias
,
params_shape
)
# If the original output was on CPU, we have to transfer it
if
isinstance
(
node
.
outputs
[
0
]
.
type
,
tensor
.
TensorType
):
g_wrt_inputs
=
tensor
.
as_tensor_variable
(
g_wrt_inputs
)
if
isinstance
(
node
.
outputs
[
1
]
.
type
,
tensor
.
TensorType
):
g_wrt_scale
=
tensor
.
as_tensor_variable
(
g_wrt_scale
)
if
isinstance
(
node
.
outputs
[
2
]
.
type
,
tensor
.
TensorType
):
g_wrt_bias
=
tensor
.
as_tensor_variable
(
g_wrt_bias
)
# TODO copy_stack_trace?
return
[
g_wrt_inputs
,
g_wrt_scale
,
g_wrt_bias
]
@local_optimizer
([
bn
.
AbstractBatchNormInference
])
def
local_abstract_batch_norm_inference_cudnn
(
node
):
if
not
isinstance
(
node
.
op
,
bn
.
AbstractBatchNormInference
):
return
None
x
,
scale
,
bias
,
estimated_mean
,
estimated_variance
,
epsilon
=
node
.
inputs
axes
=
tuple
(
node
.
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
# input on gpu? TODO what about the output?
x_on_gpu
=
(
isinstance
(
x
.
type
,
CudaNdarrayType
)
or
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)))
if
not
x_on_gpu
:
return
None
try
:
eps
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
if
not
dnn_available
():
return
None
x
=
as_cuda_ndarray_variable
(
x
)
scale
=
as_cuda_ndarray_variable
(
scale
)
bias
=
as_cuda_ndarray_variable
(
bias
)
estimated_mean
=
as_cuda_ndarray_variable
(
estimated_mean
)
estimated_variance
=
as_cuda_ndarray_variable
(
estimated_variance
)
out
=
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
estimated_mean
,
estimated_variance
,
mode
,
eps
)
# If the original output was on CPU, we have to transfer it
# TODO copy_stack_trace?
if
isinstance
(
node
.
outputs
[
0
]
.
type
,
tensor
.
TensorType
):
return
[
tensor
.
as_tensor_variable
(
out
)]
else
:
return
[
out
]
theano/sandbox/cuda/opt.py
浏览文件 @
43411345
...
...
@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm
,
30
,
'conv_gemm'
,
'gpu'
,
'fast_compile'
,
'fast_run'
)
# Register cuDNN batch normalization implementation
abstract_batch_norm_groupopt
=
theano
.
gof
.
optdb
.
LocalGroupDB
()
abstract_batch_norm_groupopt
.
__name__
=
"gpu_batchnorm_opts"
register_opt
(
'fast_compile'
)(
abstract_batch_norm_groupopt
)
# cuDNN optimizations are only registered if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from
.dnn
import
(
local_abstract_batch_norm_train_cudnn
,
local_abstract_batch_norm_train_grad_cudnn
,
local_abstract_batch_norm_inference_cudnn
)
# noqa: 402
abstract_batch_norm_groupopt
.
register
(
'local_abstract_batch_norm_train_dnn'
,
local_abstract_batch_norm_train_cudnn
,
20
,
'batchnorm_dnn'
,
'gpu'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
abstract_batch_norm_groupopt
.
register
(
'local_abstract_batch_norm_train_grad_dnn'
,
local_abstract_batch_norm_train_grad_cudnn
,
20
,
'batchnorm_dnn'
,
'gpu'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
abstract_batch_norm_groupopt
.
register
(
'local_abstract_batch_norm_inference_dnn'
,
local_abstract_batch_norm_inference_cudnn
,
20
,
'batchnorm_dnn'
,
'gpu'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
theano/sandbox/cuda/tests/test_dnn.py
浏览文件 @
43411345
from
__future__
import
absolute_import
,
print_function
,
division
from
collections
import
OrderedDict
import
logging
import
os
import
sys
...
...
@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt
from
theano.tensor.signal.pool
import
pool_2d
,
pool_3d
from
theano.tensor.signal.pool
import
Pool
,
MaxPoolGrad
,
AveragePoolGrad
from
theano.tensor.nnet.abstract_conv
import
get_conv_output_shape
from
theano.tensor.nnet
import
bn
import
theano.sandbox.cuda.dnn
as
dnn
from
theano.sandbox.cuda.basic_ops
import
GpuAllocEmpty
,
gpu_alloc_empty
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
...
...
@@ -730,52 +732,201 @@ def test_batchnorm_train():
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
tensor6
=
T
.
TensorType
(
theano
.
config
.
floatX
,
(
False
,)
*
6
)
for
mode
in
(
'per-activation'
,
'spatial'
):
for
vartype
in
(
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
x
,
scale
,
bias
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
))
for
vartype
in
(
tensor6
,
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
x
,
scale
,
bias
,
running_mean
,
running_var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'running_mean'
,
'running_var'
))
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
# forward pass
out
,
x_mean
,
x_invstd
=
cuda
.
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
)
running_average_factor
=
0.3
# forward pass, direct interface
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
,
\
out_running_mean_gpu
,
out_running_var_gpu
=
\
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# forward pass, abstract interface
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
\
out_running_mean_abstract
,
out_running_var_abstract
=
\
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# reference forward pass
if
mode
==
'per-activation'
:
axes
=
(
0
,)
elif
mode
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
x_mean2
=
x
.
mean
(
axis
=
axes
,
keepdims
=
True
)
x_invstd2
=
T
.
inv
(
T
.
sqrt
(
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
+
eps
))
scale2
=
T
.
addbroadcast
(
scale
,
*
axes
)
bias2
=
T
.
addbroadcast
(
bias
,
*
axes
)
out2
=
(
x
-
x_mean2
)
*
(
scale2
*
x_invstd2
)
+
bias2
x_mean_ref
=
x
.
mean
(
axis
=
axes
,
keepdims
=
True
)
x_var_ref
=
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
x_invstd_ref
=
T
.
inv
(
T
.
sqrt
(
x_var_ref
+
eps
))
scale_ref
=
T
.
addbroadcast
(
scale
,
*
axes
)
bias_ref
=
T
.
addbroadcast
(
bias
,
*
axes
)
m
=
T
.
cast
(
T
.
prod
(
x
.
shape
)
/
T
.
prod
(
scale
.
shape
),
theano
.
config
.
floatX
)
out_ref
=
(
x
-
x_mean_ref
)
*
(
scale_ref
*
x_invstd_ref
)
+
bias_ref
out_running_mean_ref
=
running_mean
*
(
1
-
running_average_factor
)
+
\
x_mean_ref
*
running_average_factor
out_running_var_ref
=
running_var
*
(
1
-
running_average_factor
)
+
\
(
m
/
(
m
-
1
))
*
x_var_ref
*
running_average_factor
# backward pass
dy
=
vartype
(
'dy'
)
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out
:
dy
})
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_abstract
:
dy
})
# reference backward pass
grads
2
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out2
:
dy
})
grads
_ref
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_ref
:
dy
})
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out
,
x_mean
,
x_invstd
,
out2
,
x_mean2
,
x_invstd2
]
+
grads
+
grads2
,
mode
=
mode_with_gpu
)
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
,
out_running_mean_gpu
,
out_running_var_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
out_running_mean_abstract
,
out_running_var_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_ref
,
x_mean_ref
,
x_invstd_ref
,
out_running_mean_ref
,
out_running_var_ref
]
+
grads_ref
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormGrad
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
),
(
4
,
3
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
)):
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
,
5
),
(
4
,
3
,
1
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
,
5
)):
data_shape
=
data_shape
[:
ndim
]
param_shape
=
tuple
(
1
if
d
in
axes
else
s
for
d
,
s
in
enumerate
(
data_shape
))
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Dy
)
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Running_mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Running_var
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs_gpu
=
f_gpu
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
# compare outputs
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
0
+
3
])
# out
utt
.
assert_allclose
(
outputs
[
1
],
outputs
[
1
+
3
])
# mean
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
3
])
# invstd
utt
.
assert_allclose
(
outputs_gpu
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_gpu
[
1
],
outputs_ref
[
1
])
# mean
utt
.
assert_allclose
(
outputs_gpu
[
2
],
outputs_ref
[
2
])
# invstd
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
3
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs_gpu
[
4
]),
numpy
.
nan_to_num
(
outputs_ref
[
4
]))
# running_var
utt
.
assert_allclose
(
outputs_abstract
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_abstract
[
1
],
outputs_ref
[
1
])
# mean
utt
.
assert_allclose
(
outputs_abstract
[
2
],
outputs_ref
[
2
])
# invstd
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs_abstract
[
4
]),
numpy
.
nan_to_num
(
outputs_ref
[
4
]))
# running_var
# compare gradients
utt
.
assert_allclose
(
outputs
[
6
],
outputs
[
6
+
3
],
atol
=
1e-4
)
# dx
utt
.
assert_allclose
(
outputs
[
7
],
outputs
[
7
+
3
],
rtol
=
2e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs
[
8
],
outputs
[
8
+
3
])
# dbias
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
6
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
7
],
outputs_ref
[
7
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
6
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
7
],
outputs_ref
[
7
])
# dbias
def test_dnn_batchnorm_train_without_running_averages():
    """Compile and run batch_normalization_train without running averages.

    Builds the forward and backward graphs twice -- once through the direct
    ``dnn`` interface and once through the abstract ``bn`` interface -- and
    checks that the abstract Ops are replaced by the cuDNN Ops.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), \
        T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass
    out_gpu, x_mean_gpu, x_invstd_gpu = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
    # BUGFIX: the abstract gradients must be taken through the abstract
    # output (out_abstract), not out_gpu; otherwise f_abstract's gradient
    # graph would silently come from the direct-interface graph instead.
    grads_abstract = T.grad(None, wrt=[x, scale, bias],
                            known_grads={out_abstract: dy})
    # compile
    f_gpu = theano.function([x, scale, bias, dy],
                            [out_gpu, x_mean_gpu, x_invstd_gpu] + grads_gpu,
                            mode=mode_with_gpu)
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract,
                                  x_invstd_abstract] + grads_abstract,
                                 mode=mode_with_gpu)
    # check if the abstract Ops have been replaced
    assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                for n in f_abstract.maker.fgraph.toposort()])
    assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                for n in f_abstract.maker.fgraph.toposort()])
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f_abstract.maker.fgraph.toposort()])
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_gpu(X, Scale, Bias, Dy)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
    """Test that inplace_running_mean, inplace_running_var and inplace_output
    are enabled on GpuDnnBatchNorm when the running averages are shared
    variables updated through the function's ``updates``."""
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # shared running averages; broadcastable over the batch axis only,
    # matching the 'per-activation' parameter shape
    running_mean = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=(True, False, False, False))
    running_var = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=(True, False, False, False))

    # forward pass (epsilon=5e-3 and factor=0.3 are arbitrary non-defaults)
    out, x_mean, x_invstd, new_running_mean, new_running_var = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation',
                                          epsilon=5e-3,
                                          running_average_factor=0.3,
                                          running_mean=running_mean,
                                          running_var=running_var)
    # update running averages
    updates = OrderedDict()
    updates[running_mean] = new_running_mean
    updates[running_var] = new_running_var
    # compile
    f = theano.function([x, scale, bias], [out, x_mean, x_invstd],
                        updates=updates, mode=mode_with_gpu)
    # check for the inplace settings
    nodes = [n for n in f.maker.fgraph.toposort()
             if isinstance(n.op, dnn.GpuDnnBatchNorm)]
    assert len(nodes) == 1
    assert nodes[0].op.inplace_running_mean
    assert nodes[0].op.inplace_running_var
    assert nodes[0].op.inplace_output
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias)
def
test_batchnorm_inference
():
...
...
@@ -785,53 +936,160 @@ def test_batchnorm_inference():
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
tensor6
=
T
.
TensorType
(
theano
.
config
.
floatX
,
(
False
,)
*
6
)
for
mode
in
(
'per-activation'
,
'spatial'
):
for
vartype
in
(
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
x
,
scale
,
bias
,
mean
,
var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'mean'
,
'var'
))
for
vartype
in
(
tensor6
,
T
.
tensor5
,
T
.
tensor4
,
T
.
tensor3
,
T
.
matrix
,
T
.
vector
):
x
,
scale
,
bias
,
mean
,
var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'mean'
,
'var'
))
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
# forward pass
out
=
cuda
.
dnn
.
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# forward pass, direct interface
out_gpu
=
dnn
.
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# forward pass, abstract interface
out_abstract
=
bn
.
batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# reference forward pass
if
mode
==
'per-activation'
:
axes
=
(
0
,)
elif
mode
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
scale
2
,
bias2
,
mean2
,
var2
=
(
T
.
addbroadcast
(
t
,
*
axes
)
for
t
in
(
scale
,
bias
,
mean
,
var
))
out
2
=
(
x
-
mean2
)
*
(
scale2
/
T
.
sqrt
(
var2
+
eps
))
+
bias2
scale
_ref
,
bias_ref
,
mean_ref
,
var_ref
=
(
T
.
addbroadcast
(
t
,
*
axes
)
for
t
in
(
scale
,
bias
,
mean
,
var
))
out
_ref
=
(
x
-
mean_ref
)
*
(
scale_ref
/
T
.
sqrt
(
var_ref
+
eps
))
+
bias_ref
# backward pass
dy
=
vartype
(
'dy'
)
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out
:
dy
})
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_abstract
:
dy
})
# reference backward pass
grads
2
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out2
:
dy
})
grads
_ref
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_ref
:
dy
})
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out
,
out2
]
+
grads
+
grads2
,
mode
=
mode_with_gpu
)
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_ref
]
+
grads_ref
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormInference
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
),
(
4
,
3
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
)):
for
data_shape
in
((
10
,
20
,
30
,
40
,
10
,
5
),
(
4
,
3
,
1
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
,
5
)):
data_shape
=
data_shape
[:
ndim
]
param_shape
=
tuple
(
1
if
d
in
axes
else
s
for
d
,
s
in
enumerate
(
data_shape
))
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Var
=
numpy
.
random
.
rand
(
*
param_shape
)
.
astype
(
'float32'
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Var
=
numpy
.
random
.
rand
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs_gpu
=
f_gpu
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
# compare outputs
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
1
])
# out
utt
.
assert_allclose
(
outputs_gpu
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_abstract
[
0
],
outputs_ref
[
0
])
# out
# compare gradients
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
5
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs
[
3
],
outputs
[
3
+
5
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs
[
4
],
outputs
[
4
+
5
])
# dbias
utt
.
assert_allclose
(
outputs
[
5
],
outputs
[
5
+
5
])
# dmean
utt
.
assert_allclose
(
outputs
[
6
],
outputs
[
6
+
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
utt
.
assert_allclose
(
outputs_gpu
[
1
],
outputs_ref
[
1
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
2
],
outputs_ref
[
2
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
3
])
# dbias
utt
.
assert_allclose
(
outputs_gpu
[
4
],
outputs_ref
[
4
])
# dmean
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
utt
.
assert_allclose
(
outputs_abstract
[
1
],
outputs_ref
[
1
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
2
],
outputs_ref
[
2
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
4
],
outputs_ref
[
4
])
# dmean
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
def test_batchnorm_inference_inplace():
    """Test that GpuDnnBatchNormInference runs with its inplace flag set."""
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias, mean, var = (T.tensor4(n)
                                 for n in ('x', 'scale', 'bias',
                                           'mean', 'var'))
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function([x, scale, bias, mean, var], [out],
                        mode=mode_with_gpu)

    # check for the inplace settings
    nodes = [n for n in f.maker.fgraph.toposort()
             if isinstance(n.op, dnn.GpuDnnBatchNormInference)]
    assert len(nodes) == 1
    assert nodes[0].op.inplace

    # run (Var uses rand(), not randn(), so variances are non-negative)
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias, Mean, Var)
def test_dnn_batchnorm_valid_and_invalid_axes():
    """Check which normalization axes are lowered to cuDNN Ops.

    Valid axes ('per-activation' and 'spatial' patterns) must be replaced by
    the GpuDnnBatchNorm* Ops; other axes lists must still be replaced (no
    abstract Ops remain), just not by the cuDNN Ops.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")

    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        x, scale, bias, mean, var, dy = (vartype(n)
                                         for n in ('x', 'scale', 'bias',
                                                   'mean', 'var', 'dy'))
        ndim = x.ndim

        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = \
                bn.batch_normalization_train(x, scale, bias, axes)
            out_test = bn.batch_normalization_test(x, scale, bias,
                                                   mean, var, axes)
            # backward pass
            # NOTE(review): dy was already created above; this rebinding
            # looks redundant but is kept to preserve the graph exactly.
            dy = vartype('dy')
            grads_train = T.grad(None, wrt=[x, scale, bias],
                                 known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var],
                                known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)
            if axes in valid_axes_lists:
                # check if the abstract Ops have been replaced by the cuDNN Ops
                assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                            for n in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                            for n in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference)
                            for n in f.maker.fgraph.toposort()])
                assert not any([isinstance(n.op,
                                           (bn.AbstractBatchNormTrain,
                                            bn.AbstractBatchNormInference,
                                            bn.AbstractBatchNormTrainGrad))
                                for n in f.maker.fgraph.toposort()])
            else:
                # check if the abstract Ops have been replaced, but not by
                # the cuDNN Ops
                assert not any([isinstance(n.op,
                                           (dnn.GpuDnnBatchNorm,
                                            dnn.GpuDnnBatchNormGrad,
                                            bn.AbstractBatchNormTrain,
                                            bn.AbstractBatchNormInference,
                                            bn.AbstractBatchNormTrainGrad))
                                for n in f.maker.fgraph.toposort()])
def
test_dnn_tag
():
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论