Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
c4293e69
提交
c4293e69
authored
11月 09, 2016
作者:
Gijs van Tulder
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add running averages to batch norm (no cuDNN yet).
上级
4f291961
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
307 行增加
和
55 行删除
+307
-55
dnn.py
theano/gpuarray/dnn.py
+19
-8
test_dnn.py
theano/gpuarray/tests/test_dnn.py
+71
-16
bn.py
theano/tensor/nnet/bn.py
+159
-18
test_bn.py
theano/tensor/nnet/tests/test_bn.py
+58
-13
没有找到文件。
theano/gpuarray/dnn.py
浏览文件 @
c4293e69
...
...
@@ -2949,7 +2949,9 @@ def local_abstract_batch_norm_train_cudnn(node):
if
not
isinstance
(
node
.
op
,
bn
.
AbstractBatchNormTrain
):
return
None
x
,
scale
,
bias
,
epsilon
=
node
.
inputs
x
,
scale
,
bias
,
epsilon
,
running_average_factor
=
node
.
inputs
[:
5
]
running_mean
=
node
.
inputs
[
5
]
if
len
(
node
.
inputs
)
>
5
else
None
running_var
=
node
.
inputs
[
6
]
if
len
(
node
.
inputs
)
>
6
else
None
# input on gpu? TODO what about the output?
x_on_gpu
=
(
isinstance
(
x
.
type
,
GpuArrayType
)
or
...
...
@@ -2983,15 +2985,24 @@ def local_abstract_batch_norm_train_cudnn(node):
out
,
mean
,
invstd
=
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
)
results
=
[
out
,
mean
,
invstd
]
if
running_mean
is
not
None
:
running_mean
=
running_mean
*
(
1
-
running_average_factor
)
+
\
mean
*
running_average_factor
results
.
append
(
running_mean
)
if
running_var
is
not
None
:
var
=
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
m
=
tensor
.
cast
(
tensor
.
prod
(
x
.
shape
)
/
tensor
.
prod
(
scale
.
shape
),
theano
.
config
.
floatX
)
running_var
=
running_var
*
(
1
-
running_average_factor
)
+
\
(
m
/
(
m
-
1
))
*
var
*
running_average_factor
results
.
append
(
running_var
)
# If the original output was on CPU, we have to transfer it
if
isinstance
(
node
.
outputs
[
0
]
.
type
,
tensor
.
TensorType
):
out
=
tensor
.
as_tensor_variable
(
out
)
if
isinstance
(
node
.
outputs
[
1
]
.
type
,
tensor
.
TensorType
):
mean
=
tensor
.
as_tensor_variable
(
mean
)
if
isinstance
(
node
.
outputs
[
2
]
.
type
,
tensor
.
TensorType
):
invstd
=
tensor
.
as_tensor_variable
(
invstd
)
for
i
in
range
(
len
(
node
.
outputs
)):
if
isinstance
(
node
.
outputs
[
i
]
.
type
,
tensor
.
TensorType
):
results
[
i
]
=
tensor
.
as_tensor_variable
(
results
[
i
])
# TODO copy_stack_trace?
return
[
out
,
mean
,
invstd
]
return
results
@local_optimizer
([
bn
.
AbstractBatchNormTrainGrad
])
...
...
theano/gpuarray/tests/test_dnn.py
浏览文件 @
c4293e69
...
...
@@ -1384,26 +1384,39 @@ def test_dnn_batchnorm_train():
for
mode
in
(
'per-activation'
,
'spatial'
):
for
vartype
in
(
tensor6
,
T
.
tensor5
,
T
.
tensor4
,
T
.
tensor3
,
T
.
matrix
,
T
.
vector
):
x
,
scale
,
bias
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
))
x
,
scale
,
bias
,
running_mean
,
running_var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'running_mean'
,
'running_var'
))
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
running_average_factor
=
0.3
# forward pass, direct interface
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
=
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
)
# forward pass, abstract interface
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
=
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
)
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
\
out_running_mean_abstract
,
out_running_var_abstract
=
\
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# reference forward pass
if
mode
==
'per-activation'
:
axes
=
(
0
,)
elif
mode
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
x_mean_ref
=
x
.
mean
(
axis
=
axes
,
keepdims
=
True
)
x_invstd_ref
=
T
.
inv
(
T
.
sqrt
(
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
+
eps
))
x_var_ref
=
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
x_invstd_ref
=
T
.
inv
(
T
.
sqrt
(
x_var_ref
+
eps
))
scale_ref
=
T
.
addbroadcast
(
scale
,
*
axes
)
bias_ref
=
T
.
addbroadcast
(
bias
,
*
axes
)
m
=
T
.
cast
(
T
.
prod
(
x
.
shape
)
/
T
.
prod
(
scale
.
shape
),
theano
.
config
.
floatX
)
out_ref
=
(
x
-
x_mean_ref
)
*
(
scale_ref
*
x_invstd_ref
)
+
bias_ref
out_running_mean_ref
=
running_mean
*
(
1
-
running_average_factor
)
+
\
x_mean_ref
*
running_average_factor
out_running_var_ref
=
running_var
*
(
1
-
running_average_factor
)
+
\
(
m
/
(
m
-
1
))
*
x_var_ref
*
running_average_factor
# backward pass
dy
=
vartype
(
'dy'
)
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_gpu
:
dy
})
...
...
@@ -1414,12 +1427,14 @@ def test_dnn_batchnorm_train():
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
]
+
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
out_running_mean_abstract
,
out_running_var_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out_ref
,
x_mean_ref
,
x_invstd_ref
]
+
grads_ref
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_ref
,
x_mean_ref
,
x_invstd_ref
,
out_running_mean_ref
,
out_running_var_ref
]
+
grads_ref
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
...
...
@@ -1438,9 +1453,11 @@ def test_dnn_batchnorm_train():
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Running_mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Running_var
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs_gpu
=
f_gpu
(
X
,
Scale
,
Bias
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
# compare outputs
utt
.
assert_allclose
(
outputs_gpu
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_gpu
[
1
],
outputs_ref
[
1
])
# mean
...
...
@@ -1448,13 +1465,51 @@ def test_dnn_batchnorm_train():
utt
.
assert_allclose
(
outputs_abstract
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_abstract
[
1
],
outputs_ref
[
1
])
# mean
utt
.
assert_allclose
(
outputs_abstract
[
2
],
outputs_ref
[
2
])
# invstd
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs_abstract
[
4
]),
numpy
.
nan_to_num
(
outputs_ref
[
4
]))
# running_var
# compare gradients
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
3
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
4
],
outputs_ref
[
4
],
rtol
=
2e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
5
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
4
],
outputs_ref
[
4
],
rtol
=
2e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
])
# dbias
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
4
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
7
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
6
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
7
],
outputs_ref
[
7
])
# dbias
def
test_dnn_batchnorm_train_without_running_averages
():
# compile and run batch_normalization_train without running averages
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
if
dnn
.
version
(
raises
=
False
)
<
5000
:
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
x
,
scale
,
bias
,
dy
=
T
.
tensor4
(
'x'
),
T
.
tensor4
(
'scale'
),
T
.
tensor4
(
'bias'
),
T
.
tensor4
(
'dy'
)
data_shape
=
(
5
,
10
,
30
,
25
)
param_shape
=
(
1
,
10
,
30
,
25
)
# forward pass
out
,
x_mean
,
x_invstd
=
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
'per-activation'
)
# backward pass
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out
:
dy
})
# compile
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out
,
x_mean
,
x_invstd
]
+
grads
,
mode
=
mode_with_gpu
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormGrad
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
f_abstract
(
X
,
Scale
,
Bias
,
Dy
)
def
test_batchnorm_inference
():
...
...
theano/tensor/nnet/bn.py
浏览文件 @
c4293e69
...
...
@@ -84,7 +84,8 @@ def batch_normalization(inputs, gamma, beta, mean, std,
def
batch_normalization_train
(
inputs
,
gamma
,
beta
,
axes
=
'per-activation'
,
epsilon
=
1e-4
):
epsilon
=
1e-4
,
running_average_factor
=
0.1
,
running_mean
=
None
,
running_var
=
None
):
"""
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
...
...
@@ -107,6 +108,23 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly;
if the factor is close to zero, they will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns
-------
...
...
@@ -116,6 +134,12 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
Means of `inputs` across the normalization axes.
invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes
-----
...
...
@@ -131,14 +155,32 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
# for spatial normalization
axes = (0,) + tuple(range(2, inputs.ndim))
mean = inputs.mean(axes, keepdims=True)
invstd = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
(m / (m - 1)) * var * running_average_factor
"""
ndim
=
inputs
.
ndim
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
"as inputs; got
%
d and
%
d instead of
%
d"
%
(
gamma
.
ndim
,
beta
.
ndim
,
ndim
))
if
(
running_mean
is
None
)
!=
(
running_var
is
None
):
raise
ValueError
(
"running_mean and running_var must either both be "
"given or both be None"
)
if
running_mean
is
not
None
and
running_mean
.
ndim
!=
ndim
:
raise
ValueError
(
"running_mean must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_mean
.
ndim
,
ndim
))
if
running_var
is
not
None
and
running_var
.
ndim
!=
ndim
:
raise
ValueError
(
"running_var must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_var
.
ndim
,
ndim
))
if
epsilon
<
1e-5
:
raise
ValueError
(
"epsilon must be at least 1e-5, got
%
f"
%
epsilon
)
...
...
@@ -163,7 +205,23 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
beta
=
T
.
addbroadcast
(
beta
,
*
axes
)
batchnorm_op
=
AbstractBatchNormTrain
(
axes
=
axes
)
return
tuple
(
batchnorm_op
(
inputs
,
gamma
,
beta
,
epsilon
=
epsilon
))
if
running_mean
is
not
None
and
running_var
is
not
None
:
running_mean
=
as_tensor_variable
(
running_mean
)
running_var
=
as_tensor_variable
(
running_var
)
running_mean_bc
=
T
.
addbroadcast
(
running_mean
,
*
axes
)
running_var_bc
=
T
.
addbroadcast
(
running_var
,
*
axes
)
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
=
batchnorm_op
(
inputs
,
gamma
,
beta
,
epsilon
=
epsilon
,
running_average_factor
=
running_average_factor
,
running_mean
=
running_mean_bc
,
running_var
=
running_var_bc
)
if
new_running_mean
.
broadcastable
!=
running_mean
.
broadcastable
:
new_running_mean
=
T
.
patternbroadcast
(
new_running_mean
,
running_mean
.
broadcastable
)
if
new_running_var
.
broadcastable
!=
running_var
.
broadcastable
:
new_running_var
=
T
.
patternbroadcast
(
new_running_var
,
running_var
.
broadcastable
)
return
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
else
:
return
tuple
(
batchnorm_op
(
inputs
,
gamma
,
beta
,
epsilon
=
epsilon
))
def
batch_normalization_test
(
inputs
,
gamma
,
beta
,
mean
,
var
,
...
...
@@ -277,6 +335,23 @@ class AbstractBatchNormTrain(Op):
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly;
if the factor is close to zero, they will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - running_average_factor) + batch mean * running_average_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - running_average_factor) + (m / (m - 1)) * batch var * running_average_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
"""
__props__
=
(
'axes'
,)
...
...
@@ -288,40 +363,85 @@ class AbstractBatchNormTrain(Op):
self
.
axes
=
axes
def
infer_shape
(
self
,
node
,
shape
):
return
[
shape
[
0
]
,
shape
[
1
],
shape
[
1
]]
return
[
shape
[
0
]
]
+
[
shape
[
1
]]
*
(
len
(
node
.
outputs
)
-
1
)
def
make_node
(
self
,
x
,
scale
,
bias
,
epsilon
=
1e-4
):
def
make_node
(
self
,
x
,
scale
,
bias
,
epsilon
=
1e-4
,
running_average_factor
=
0.1
,
running_mean
=
None
,
running_var
=
None
):
assert
x
.
ndim
==
scale
.
ndim
==
bias
.
ndim
assert
((
running_mean
is
None
and
running_var
is
None
)
or
(
running_mean
is
not
None
and
running_var
is
not
None
))
assert
(
running_mean
is
None
or
running_mean
.
ndim
==
x
.
ndim
)
assert
(
running_var
is
None
or
running_var
.
ndim
==
x
.
ndim
)
if
not
isinstance
(
epsilon
,
theano
.
Variable
):
epsilon
=
as_tensor_variable
(
epsilon
)
return
Apply
(
self
,
[
x
,
scale
,
bias
,
epsilon
],
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()])
if
not
isinstance
(
running_average_factor
,
theano
.
Variable
):
running_average_factor
=
as_tensor_variable
(
running_average_factor
)
inputs
=
[
x
,
scale
,
bias
,
epsilon
,
running_average_factor
]
output_types
=
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()]
if
running_mean
is
not
None
and
running_var
is
not
None
:
inputs
.
append
(
running_mean
)
inputs
.
append
(
running_var
)
output_types
.
append
(
scale
.
type
())
output_types
.
append
(
scale
.
type
())
return
Apply
(
self
,
inputs
,
output_types
)
def
grad
(
self
,
inputs
,
grads
):
x
,
scale
,
bias
,
epsilon
=
inputs
x
,
scale
,
bias
,
epsilon
,
running_average_factor
=
inputs
[:
5
]
dy
=
grads
[
0
]
_
,
x_mean
,
x_invstd
=
self
(
x
,
scale
,
bias
,
epsilon
)
_
,
x_mean
,
x_invstd
=
self
(
*
inputs
)[:
3
]
disconnected_outputs
=
[
theano
.
gradient
.
DisconnectedType
()(),
# epsilon
theano
.
gradient
.
DisconnectedType
()()]
# running_average_factor
# Optional running_mean and running_var.
for
i
in
range
(
5
,
len
(
inputs
)):
disconnected_outputs
.
append
(
theano
.
gradient
.
DisconnectedType
()())
return
AbstractBatchNormTrainGrad
(
self
.
axes
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
,
epsilon
)
+
[
theano
.
gradient
.
DisconnectedType
()()]
x
,
dy
,
scale
,
x_mean
,
x_invstd
,
epsilon
)
+
disconnected_outputs
def
connection_pattern
(
self
,
node
):
# Specify that epsilon is not connected to outputs.
return
[[
True
,
True
,
True
],
[
True
,
True
,
True
],
[
True
,
True
,
True
],
[
False
,
False
,
False
]]
# Specify that epsilon and running_average_factor are not connected to outputs.
patterns
=
[[
True
,
True
,
True
],
# x
[
True
,
True
,
True
],
# scale
[
True
,
True
,
True
],
# bias
[
False
,
False
,
False
],
# epsilon
[
False
,
False
,
False
]]
# running_average_factor
# Optional running_mean and running_var are only
# connected to their new values.
for
i
in
range
(
5
,
len
(
node
.
inputs
)):
patterns
[
0
]
.
append
(
True
)
for
pattern
in
patterns
[
1
:]:
pattern
.
append
(
False
)
patterns
.
append
([
False
]
*
(
3
+
i
-
5
)
+
[
True
])
return
patterns
def
perform
(
self
,
node
,
inputs
,
output_storage
):
x
,
scale
,
bias
,
epsilon
=
inputs
x
,
scale
,
bias
,
epsilon
,
running_average_factor
=
inputs
[:
5
]
axes
=
self
.
axes
if
min
(
axes
)
<
0
or
max
(
axes
)
>=
x
.
ndim
:
raise
ValueError
(
'axes should be less than ndim (<
%
d), but
%
s given'
%
(
x
.
ndim
,
str
(
axes
)))
mean
=
x
.
mean
(
axes
,
keepdims
=
True
)
invstd
=
1.0
/
numpy
.
sqrt
(
x
.
var
(
axes
,
keepdims
=
True
)
+
epsilon
)
var
=
x
.
var
(
axes
,
keepdims
=
True
)
invstd
=
1.0
/
numpy
.
sqrt
(
var
+
epsilon
)
out
=
(
x
-
mean
)
*
(
scale
*
invstd
)
+
bias
output_storage
[
0
][
0
]
=
out
output_storage
[
1
][
0
]
=
mean
output_storage
[
2
][
0
]
=
invstd
if
len
(
inputs
)
>
5
:
running_mean
=
inputs
[
5
]
running_mean
=
running_mean
*
(
1.0
-
running_average_factor
)
+
\
mean
*
running_average_factor
output_storage
[
3
][
0
]
=
running_mean
if
len
(
inputs
)
>
6
:
m
=
float
(
numpy
.
prod
(
x
.
shape
)
/
numpy
.
prod
(
scale
.
shape
))
running_var
=
inputs
[
6
]
running_var
=
running_var
*
(
1.0
-
running_average_factor
)
+
\
(
m
/
(
m
-
1
))
*
var
*
running_average_factor
output_storage
[
4
][
0
]
=
running_var
class
AbstractBatchNormInference
(
Op
):
"""
...
...
@@ -429,21 +549,42 @@ def local_abstract_batch_norm_train(node):
if
not
isinstance
(
node
.
op
,
AbstractBatchNormTrain
):
return
None
x
,
scale
,
bias
,
epsilon
=
node
.
inputs
x
,
scale
,
bias
,
epsilon
,
running_average_factor
=
node
.
inputs
[:
5
]
axes
=
node
.
op
.
axes
if
min
(
axes
)
<
0
or
max
(
axes
)
>
x
.
ndim
:
return
None
if
not
isinstance
(
x
.
type
,
TensorType
)
or
\
not
isinstance
(
scale
.
type
,
TensorType
)
or
\
not
isinstance
(
bias
.
type
,
TensorType
)
or
\
not
isinstance
(
epsilon
.
type
,
TensorType
):
not
isinstance
(
epsilon
.
type
,
TensorType
)
or
\
not
isinstance
(
running_average_factor
.
type
,
TensorType
):
return
None
# optional running_mean and running_var
if
len
(
node
.
inputs
)
>
5
and
not
isinstance
(
node
.
inputs
[
5
]
.
type
,
TensorType
):
return
None
if
len
(
node
.
inputs
)
>
6
and
not
isinstance
(
node
.
inputs
[
6
]
.
type
,
TensorType
):
return
None
mean
=
x
.
mean
(
axes
,
keepdims
=
True
)
invstd
=
T
.
inv
(
T
.
sqrt
(
x
.
var
(
axes
,
keepdims
=
True
)
+
epsilon
))
var
=
x
.
var
(
axes
,
keepdims
=
True
)
invstd
=
T
.
inv
(
T
.
sqrt
(
var
+
epsilon
))
out
=
(
x
-
mean
)
*
(
scale
*
invstd
)
+
bias
results
=
[
out
,
mean
,
invstd
]
if
len
(
node
.
inputs
)
>
5
:
running_mean
=
node
.
inputs
[
5
]
running_mean
=
running_mean
*
(
1.0
-
running_average_factor
)
+
\
mean
*
running_average_factor
results
.
append
(
running_mean
)
if
len
(
node
.
inputs
)
>
6
:
m
=
T
.
cast
(
T
.
prod
(
x
.
shape
)
/
T
.
prod
(
scale
.
shape
),
theano
.
config
.
floatX
)
running_var
=
node
.
inputs
[
6
]
running_var
=
running_var
*
(
1.0
-
running_average_factor
)
+
\
(
m
/
(
m
-
1
))
*
var
*
running_average_factor
results
.
append
(
running_var
)
# TODO copy_stack_trace?
return
[
out
,
mean
,
invstd
]
return
results
@local_optimizer
([
AbstractBatchNormTrainGrad
])
...
...
theano/tensor/nnet/tests/test_bn.py
浏览文件 @
c4293e69
...
...
@@ -148,9 +148,13 @@ def test_batch_normalization_train():
for
axes
in
(
'per-activation'
,
'spatial'
,
(
1
,
2
,
3
,
4
)):
for
vartype
in
(
T
.
tensor5
,
T
.
tensor4
,
T
.
tensor3
,
T
.
matrix
,
T
.
vector
):
x
,
scale
,
bias
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
))
x
,
scale
,
bias
,
running_mean
,
running_var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'running_mean'
,
'running_var'
))
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
running_average_factor
=
0.3
# remove non-existing axes
if
isinstance
(
axes
,
tuple
):
...
...
@@ -159,8 +163,10 @@ def test_batch_normalization_train():
continue
# forward pass
out
,
x_mean
,
x_invstd
=
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
axes
,
eps
)
out
,
x_mean
,
x_invstd
,
out_running_mean
,
out_running_var
=
\
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
axes
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# reference forward pass
if
axes
==
'per-activation'
:
axes2
=
(
0
,)
...
...
@@ -169,18 +175,25 @@ def test_batch_normalization_train():
else
:
axes2
=
axes
x_mean2
=
x
.
mean
(
axis
=
axes2
,
keepdims
=
True
)
x_invstd2
=
T
.
inv
(
T
.
sqrt
(
x
.
var
(
axis
=
axes2
,
keepdims
=
True
)
+
eps
))
x_var2
=
x
.
var
(
axis
=
axes2
,
keepdims
=
True
)
x_invstd2
=
T
.
inv
(
T
.
sqrt
(
x_var2
+
eps
))
scale2
=
T
.
addbroadcast
(
scale
,
*
axes2
)
bias2
=
T
.
addbroadcast
(
bias
,
*
axes2
)
out2
=
(
x
-
x_mean2
)
*
(
scale2
*
x_invstd2
)
+
bias2
m
=
T
.
cast
(
T
.
prod
(
x
.
shape
)
/
T
.
prod
(
scale
.
shape
),
theano
.
config
.
floatX
)
out_running_mean2
=
running_mean
*
(
1
-
running_average_factor
)
+
\
x_mean2
*
running_average_factor
out_running_var2
=
running_var
*
(
1
-
running_average_factor
)
+
\
(
m
/
(
m
-
1
))
*
x_var2
*
running_average_factor
# backward pass
dy
=
vartype
(
'dy'
)
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out
:
dy
})
# reference backward pass
grads2
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out2
:
dy
})
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out
,
x_mean
,
x_invstd
,
out2
,
x_mean2
,
x_invstd2
]
+
f
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out
,
x_mean
,
x_invstd
,
out_running_mean
,
out_running_var
,
out2
,
x_mean2
,
x_invstd2
,
out_running_mean2
,
out_running_var2
]
+
grads
+
grads2
,
mode
=
'FAST_RUN'
)
# check if the abstract Ops have been replaced
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
...
...
@@ -196,15 +209,47 @@ def test_batch_normalization_train():
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Dy
)
Running_mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Running_var
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
# compare outputs
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
0
+
3
])
# out
utt
.
assert_allclose
(
outputs
[
1
],
outputs
[
1
+
3
])
# mean
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
3
])
# invstd
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
0
+
5
])
# out
utt
.
assert_allclose
(
outputs
[
1
],
outputs
[
1
+
5
])
# mean
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
5
])
# invstd
utt
.
assert_allclose
(
outputs
[
3
],
outputs
[
3
+
5
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs
[
4
]),
numpy
.
nan_to_num
(
outputs
[
4
+
5
]))
# running_var
# compare gradients
utt
.
assert_allclose
(
outputs
[
6
],
outputs
[
6
+
3
],
atol
=
1e-4
)
# dx
utt
.
assert_allclose
(
outputs
[
7
],
outputs
[
7
+
3
],
rtol
=
2e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs
[
8
],
outputs
[
8
+
3
])
# dbias
utt
.
assert_allclose
(
outputs
[
10
],
outputs
[
10
+
3
],
atol
=
1e-4
)
# dx
utt
.
assert_allclose
(
outputs
[
11
],
outputs
[
11
+
3
],
rtol
=
2e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs
[
12
],
outputs
[
12
+
3
])
# dbias
def
test_batch_normalization_train_without_running_averages
():
# compile and run batch_normalization_train without running averages
utt
.
seed_rng
()
x
,
scale
,
bias
,
dy
=
T
.
tensor4
(
'x'
),
T
.
tensor4
(
'scale'
),
T
.
tensor4
(
'bias'
),
T
.
tensor4
(
'dy'
)
data_shape
=
(
5
,
10
,
30
,
25
)
param_shape
=
(
1
,
10
,
30
,
25
)
# forward pass
out
,
x_mean
,
x_invstd
=
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
'per-activation'
)
# backward pass
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out
:
dy
})
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out
,
x_mean
,
x_invstd
]
+
grads
,
mode
=
'FAST_RUN'
)
# check if the abstract Ops have been replaced
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f
.
maker
.
fgraph
.
toposort
()])
# run
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
f
(
X
,
Scale
,
Bias
,
Dy
)
def
test_batch_normalization_test
():
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论