Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
8b9f7336
提交
8b9f7336
authored
1月 31, 2017
作者:
Frédéric Bastien
提交者:
GitHub
1月 31, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #5190 from gvtulder/f-batchnorm-abstract
Abstract Ops for batch normalization
上级
18f27c44
60238616
隐藏空白字符变更
内嵌
并排
正在显示
11 个修改的文件
包含
2584 行增加
和
198 行删除
+2584
-198
bn.txt
doc/library/tensor/nnet/bn.txt
+4
-1
dnn.py
theano/gpuarray/dnn.py
+401
-37
dnn_batchnorm.c
theano/gpuarray/dnn_batchnorm.c
+54
-2
dnn_batchnorm_inf.c
theano/gpuarray/dnn_batchnorm_inf.c
+6
-0
test_dnn.py
theano/gpuarray/tests/test_dnn.py
+303
-43
__init__.py
theano/sandbox/cuda/__init__.py
+12
-1
dnn.py
theano/sandbox/cuda/dnn.py
+529
-33
opt.py
theano/sandbox/cuda/opt.py
+25
-0
test_dnn.py
theano/sandbox/cuda/tests/test_dnn.py
+314
-56
bn.py
theano/tensor/nnet/bn.py
+636
-1
test_bn.py
theano/tensor/nnet/tests/test_bn.py
+300
-24
没有找到文件。
doc/library/tensor/nnet/bn.txt
浏览文件 @
8b9f7336
...
...
@@ -10,6 +10,9 @@
.. moduleauthor:: LISA
.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test>`. They must be added manually as they do not have the same user interface.
.. autofunction:: theano.tensor.nnet.bn.batch_normalization_train
.. autofunction:: theano.tensor.nnet.bn.batch_normalization_test
.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test>`.
.. autofunction:: theano.tensor.nnet.bn.batch_normalization
theano/gpuarray/dnn.py
浏览文件 @
8b9f7336
...
...
@@ -28,19 +28,20 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
assert_conv_shape
)
from
theano.tensor.signal.pool
import
(
Pool
,
MaxPoolGrad
,
AveragePoolGrad
)
from
theano.tensor.nnet
import
bn
from
.
import
pygpu
from
.type
import
(
get_context
,
gpu_context_type
,
list_contexts
,
GpuArraySharedVariable
)
from
.basic_ops
import
(
as_gpuarray_variable
,
infer_context_name
,
gpu_contiguous
,
gpu_alloc_empty
,
empty_like
,
GpuArrayType
)
empty_like
,
GpuArrayType
,
HostFromGpu
)
from
.elemwise
import
GpuElemwise
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from
.nnet
import
GpuSoftmax
from
.opt
import
(
gpu_seqopt
,
register_opt
,
pool_db
,
pool_db2
,
op_lifter
,
register_opt2
)
op_lifter
,
register_opt2
,
register_inplace
)
from
.opt_util
import
alpha_merge
,
output_merge
,
inplace_allocempty
,
pad_dims
,
unpad_dims
...
...
@@ -1389,13 +1390,13 @@ class GpuDnnPool(DnnBase):
res
.
append
((
shape
[
0
][
4
]
+
2
*
p
[
2
]
-
w
[
2
])
//
s
[
2
]
+
1
)
return
[
res
]
def
grad
(
self
,
inp
,
grads
):
def
L_op
(
self
,
inp
,
outputs
,
grads
):
img
,
ws
,
stride
,
pad
=
inp
grad
,
=
grads
grad
=
gpu_contiguous
(
grad
)
out
=
self
(
img
,
ws
,
stride
,
pad
)
out
,
=
outputs
g_out
=
GpuDnnPoolGrad
(
mode
=
self
.
mode
)(
img
,
out
,
grad
,
ws
,
stride
,
pad
)
...
...
@@ -1591,10 +1592,10 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
assert
x
.
ndim
==
4
return
Apply
(
self
,
[
x
],
[
x
.
type
()])
def
grad
(
self
,
inp
,
grads
):
def
L_op
(
self
,
inp
,
outputs
,
grads
):
x
,
=
inp
g_sm
,
=
grads
sm
=
self
(
x
)
sm
,
=
outputs
return
[
GpuDnnSoftmaxGrad
(
self
.
algo
,
self
.
mode
...
...
@@ -1646,48 +1647,131 @@ class GpuDnnBatchNorm(DnnBase):
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
"""
__props__
=
(
'mode'
,)
__props__
=
(
'mode'
,
'running_averages'
,
'inplace_running_mean'
,
'inplace_running_var'
,
'inplace_output'
)
def
__init__
(
self
,
mode
=
'per-activation'
):
def
__init__
(
self
,
mode
=
'per-activation'
,
running_averages
=
False
,
inplace_running_mean
=
False
,
inplace_running_var
=
False
,
inplace_output
=
False
):
DnnBase
.
__init__
(
self
,
[
'dnn_batchnorm_base.c'
,
'dnn_batchnorm.c'
],
'dnn_batchnorm_op'
)
assert
(
mode
in
(
'per-activation'
,
'spatial'
))
self
.
mode
=
mode
self
.
running_averages
=
running_averages
self
.
inplace_output
=
inplace_output
self
.
inplace_running_mean
=
inplace_running_mean
self
.
inplace_running_var
=
inplace_running_var
self
.
destroy_map
=
{}
if
self
.
inplace_output
:
self
.
destroy_map
[
0
]
=
[
0
]
if
self
.
running_averages
and
self
.
inplace_running_mean
:
self
.
destroy_map
[
3
]
=
[
5
]
if
self
.
running_averages
and
self
.
inplace_running_var
:
self
.
destroy_map
[
4
]
=
[
6
]
def
__setstate__
(
self
,
d
):
self
.
__dict__
.
update
(
d
)
if
not
hasattr
(
self
,
'running_average_factor'
):
self
.
running_average_factor
=
0
if
not
hasattr
(
self
,
'running_averages'
):
self
.
running_averages
=
False
if
not
(
hasattr
(
self
,
'inplace_running_mean'
)
and
hasattr
(
self
,
'inplace_running_var'
)
and
hasattr
(
self
,
'inplace_output'
)):
self
.
inplace_running_mean
=
False
self
.
inplace_running_var
=
False
self
.
inplace_output
=
False
self
.
destroy_map
=
{}
def
get_op_params
(
self
):
params
=
[]
if
self
.
inplace_output
:
params
.
append
((
'INPLACE_OUTPUT'
,
'1'
))
if
self
.
running_averages
:
params
.
append
((
'RUNNING_AVERAGES'
,
'1'
))
if
self
.
inplace_running_mean
:
params
.
append
((
'INPLACE_RUNNING_MEAN'
,
'1'
))
if
self
.
inplace_running_var
:
params
.
append
((
'INPLACE_RUNNING_VAR'
,
'1'
))
params
.
append
((
'MODE'
,
(
"CUDNN_BATCHNORM_SPATIAL"
if
self
.
mode
==
"spatial"
else
"CUDNN_BATCHNORM_PER_ACTIVATION"
)))
return
params
def
infer_shape
(
self
,
node
,
shape
):
return
[
shape
[
0
]
,
shape
[
1
],
shape
[
1
]]
return
[
shape
[
0
]
]
+
[
shape
[
1
]]
*
(
len
(
node
.
outputs
)
-
1
)
def
make_node
(
self
,
x
,
scale
,
bias
,
epsilon
=
1e-4
):
def
make_node
(
self
,
x
,
scale
,
bias
,
epsilon
=
1e-4
,
running_average_factor
=
0.1
,
running_mean
=
None
,
running_var
=
None
):
assert
x
.
ndim
==
scale
.
ndim
==
bias
.
ndim
assert
x
.
ndim
in
(
4
,
5
)
assert
self
.
running_averages
==
(
running_mean
is
not
None
)
==
(
running_var
is
not
None
)
assert
(
running_mean
is
None
or
running_mean
.
ndim
==
x
.
ndim
)
assert
(
running_var
is
None
or
running_var
.
ndim
==
x
.
ndim
)
ctx_name
=
infer_context_name
(
x
,
scale
,
bias
)
x
=
as_gpuarray_variable
(
x
,
ctx_name
)
scale
=
as_gpuarray_variable
(
scale
,
ctx_name
)
bias
=
as_gpuarray_variable
(
bias
,
ctx_name
)
epsilon
=
as_scalar
(
epsilon
)
.
astype
(
'float64'
)
assert
x
.
ndim
==
scale
.
ndim
==
bias
.
ndim
assert
x
.
ndim
in
(
4
,
5
)
return
Apply
(
self
,
[
x
,
scale
,
bias
,
epsilon
],
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()])
def
grad
(
self
,
inputs
,
grads
):
x
,
scale
,
bias
,
epsilon
=
inputs
running_average_factor
=
as_scalar
(
running_average_factor
)
.
astype
(
'float64'
)
inputs
=
[
x
,
scale
,
bias
,
epsilon
,
running_average_factor
]
output_types
=
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()]
if
running_mean
is
not
None
and
running_var
is
not
None
:
inputs
.
append
(
as_gpuarray_variable
(
running_mean
,
ctx_name
))
inputs
.
append
(
as_gpuarray_variable
(
running_var
,
ctx_name
))
output_types
.
append
(
scale
.
type
())
output_types
.
append
(
scale
.
type
())
return
Apply
(
self
,
inputs
,
output_types
)
def
L_op
(
self
,
inputs
,
outputs
,
grads
):
x
,
scale
,
bias
,
epsilon
,
running_average_factor
=
inputs
[:
5
]
dy
=
grads
[
0
]
_
,
x_mean
,
x_invstd
=
self
(
x
,
scale
,
bias
,
epsilon
)
return
GpuDnnBatchNormGrad
(
self
.
mode
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
,
epsilon
)
+
[
DisconnectedType
()()]
_
,
x_mean
,
x_invstd
=
outputs
[:
3
]
disconnected_outputs
=
[
DisconnectedType
()(),
# epsilon
DisconnectedType
()()]
# running_average_factor
# Optional running_mean and running_var.
for
i
in
range
(
5
,
len
(
inputs
)):
disconnected_outputs
.
append
(
DisconnectedType
()())
return
GpuDnnBatchNormGrad
(
self
.
mode
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
,
epsilon
)
+
disconnected_outputs
def
connection_pattern
(
self
,
node
):
# Specificy that epsilon is not connected to outputs.
return
[[
True
,
True
,
True
],
[
True
,
True
,
True
],
[
True
,
True
,
True
],
[
False
,
False
,
False
]]
# Specificy that epsilon and running_average_factor are not connected to outputs.
patterns
=
[[
True
,
True
,
True
],
# x
[
True
,
True
,
True
],
# scale
[
True
,
True
,
True
],
# bias
[
False
,
False
,
False
],
# epsilon
[
False
,
False
,
False
]]
# running_average_factor
# Optional running_mean and running_var are only
# connected to their new values.
for
i
in
range
(
5
,
len
(
node
.
inputs
)):
patterns
[
0
]
.
append
(
True
)
for
pattern
in
patterns
[
1
:]:
pattern
.
append
(
False
)
patterns
.
append
([
False
]
*
(
3
+
i
-
5
)
+
[
True
])
return
patterns
class
GpuDnnBatchNormInference
(
DnnBase
):
...
...
@@ -1706,17 +1790,27 @@ class GpuDnnBatchNormInference(DnnBase):
value is 1e-5 (imposed by cuDNN).
"""
__props__
=
(
'mode'
,)
__props__
=
(
'mode'
,
'inplace'
)
def
__init__
(
self
,
mode
=
'per-activation'
):
def
__init__
(
self
,
mode
=
'per-activation'
,
inplace
=
False
):
DnnBase
.
__init__
(
self
,
[
'dnn_batchnorm_base.c'
,
'dnn_batchnorm_inf.c'
],
'dnn_batchnorm_op'
)
assert
(
mode
in
(
'per-activation'
,
'spatial'
))
self
.
mode
=
mode
self
.
inplace
=
inplace
if
self
.
inplace
:
self
.
destroy_map
=
{
0
:
[
0
]}
def
__setstate__
(
self
,
d
):
self
.
__dict__
.
update
(
d
)
if
not
hasattr
(
self
,
'inplace'
):
self
.
inplace
=
False
def
get_op_params
(
self
):
params
=
[]
if
self
.
inplace
:
params
.
append
((
'INPLACE_OUTPUT'
,
'1'
))
params
.
append
((
'MODE'
,
(
"CUDNN_BATCHNORM_SPATIAL"
if
self
.
mode
==
"spatial"
else
"CUDNN_BATCHNORM_PER_ACTIVATION"
)))
...
...
@@ -2404,7 +2498,8 @@ class RNNBlock(object):
def
dnn_batch_normalization_train
(
inputs
,
gamma
,
beta
,
mode
=
'per-activation'
,
epsilon
=
1e-4
):
epsilon
=
1e-4
,
running_average_factor
=
0.1
,
running_mean
=
None
,
running_var
=
None
):
"""
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
...
...
@@ -2424,6 +2519,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns
-------
...
...
@@ -2431,8 +2543,14 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Batch-normalized inputs.
mean : tensor
Means of `inputs` across the normalization axes.
stdinv
: tensor
invstd
: tensor
Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes
-----
...
...
@@ -2444,31 +2562,77 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
stdinv = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
out = (inputs - mean) * gamma * stdinv + beta
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) +
\\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) +
\\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4).
"""
ndim
=
inputs
.
ndim
if
ndim
>
5
:
raise
ValueError
(
"dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got
%
d"
%
ndim
)
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
"as inputs; got
%
d and
%
d instead of
%
d"
%
(
gamma
.
ndim
,
beta
.
ndim
,
ndim
))
if
(
running_mean
is
None
)
!=
(
running_var
is
None
):
raise
ValueError
(
"running_mean and running_var must either both be "
"given or both be None"
)
if
running_mean
is
not
None
and
running_mean
.
ndim
!=
ndim
:
raise
ValueError
(
"running_mean must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_mean
.
ndim
,
ndim
))
if
running_var
is
not
None
and
running_var
.
ndim
!=
ndim
:
raise
ValueError
(
"running_var must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_var
.
ndim
,
ndim
))
if
epsilon
<
1e-5
:
raise
ValueError
(
"epsilon must be at least 1e-5, got
%
f"
%
epsilon
)
running_averages
=
(
running_mean
is
not
None
and
running_var
is
not
None
)
if
ndim
<
4
:
inputs
=
theano
.
tensor
.
shape_padright
(
inputs
,
4
-
ndim
)
gamma
=
theano
.
tensor
.
shape_padright
(
gamma
,
4
-
ndim
)
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
batchnorm_op
=
GpuDnnBatchNorm
(
mode
=
mode
)
result
=
tuple
(
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
epsilon
=
epsilon
))
if
running_averages
:
running_mean
=
theano
.
tensor
.
shape_padright
(
running_mean
,
4
-
ndim
)
running_var
=
theano
.
tensor
.
shape_padright
(
running_var
,
4
-
ndim
)
elif
ndim
>
5
:
inputs_shape
=
inputs
.
shape
params_shape
=
gamma
.
shape
inputs
=
theano
.
tensor
.
flatten
(
inputs
,
5
)
gamma
=
theano
.
tensor
.
flatten
(
gamma
,
5
)
beta
=
theano
.
tensor
.
flatten
(
beta
,
5
)
if
running_averages
:
running_mean
=
theano
.
tensor
.
flatten
(
running_mean
,
5
)
running_var
=
theano
.
tensor
.
flatten
(
running_var
,
5
)
batchnorm_op
=
GpuDnnBatchNorm
(
mode
=
mode
,
running_averages
=
running_averages
)
if
running_averages
:
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
epsilon
=
epsilon
,
running_average_factor
=
running_average_factor
,
running_mean
=
gpu_contiguous
(
running_mean
),
running_var
=
gpu_contiguous
(
running_var
))
if
new_running_mean
.
broadcastable
!=
running_mean
.
broadcastable
:
new_running_mean
=
tensor
.
patternbroadcast
(
new_running_mean
,
running_mean
.
broadcastable
)
if
new_running_var
.
broadcastable
!=
running_var
.
broadcastable
:
new_running_var
=
tensor
.
patternbroadcast
(
new_running_var
,
running_var
.
broadcastable
)
result
=
(
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
)
else
:
result
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
epsilon
=
epsilon
)
if
ndim
<
4
:
result
=
tuple
(
theano
.
tensor
.
flatten
(
r
,
ndim
)
for
r
in
result
)
elif
ndim
>
5
:
result
=
(
theano
.
tensor
.
reshape
(
result
[
0
],
inputs_shape
),)
+
tuple
(
theano
.
tensor
.
reshape
(
r
,
params_shape
)
for
r
in
result
[
1
:])
return
result
...
...
@@ -2521,9 +2685,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
ndim
=
inputs
.
ndim
if
ndim
>
5
:
raise
ValueError
(
"dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got
%
d"
%
ndim
)
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
"as inputs; got
%
d and
%
d instead of
%
d"
%
...
...
@@ -2541,12 +2702,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
mean
=
theano
.
tensor
.
shape_padright
(
mean
,
4
-
ndim
)
var
=
theano
.
tensor
.
shape_padright
(
var
,
4
-
ndim
)
elif
ndim
>
5
:
inputs_shape
=
inputs
.
shape
inputs
=
theano
.
tensor
.
flatten
(
inputs
,
5
)
gamma
=
theano
.
tensor
.
flatten
(
gamma
,
5
)
beta
=
theano
.
tensor
.
flatten
(
beta
,
5
)
mean
=
theano
.
tensor
.
flatten
(
mean
,
5
)
var
=
theano
.
tensor
.
flatten
(
var
,
5
)
batchnorm_op
=
GpuDnnBatchNormInference
(
mode
=
mode
)
result
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
gpu_contiguous
(
mean
),
gpu_contiguous
(
var
),
epsilon
=
epsilon
)
if
ndim
<
4
:
result
=
theano
.
tensor
.
flatten
(
result
,
ndim
)
elif
ndim
>
5
:
result
=
theano
.
tensor
.
reshape
(
result
,
inputs_shape
)
return
result
...
...
@@ -2928,3 +3098,197 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
out
=
GpuDnnSoftmaxGrad
(
'accurate'
,
'instance'
)(
gpu_contiguous
(
ins
[
0
]),
gpu_contiguous
(
ins
[
1
]))
return
[
out
.
dimshuffle
(
0
,
2
)]
@register_opt
(
'cudnn'
,
'fast_compile'
)
@op_lifter
([
bn
.
AbstractBatchNormTrain
])
@register_opt2
([
bn
.
AbstractBatchNormTrain
],
'cudnn'
,
'fast_compile'
)
def
local_abstract_batch_norm_train_cudnn
(
op
,
ctx_name
,
inputs
,
outputs
):
x
,
scale
,
bias
,
epsilon
,
running_average_factor
=
inputs
[:
5
]
running_mean
=
inputs
[
5
]
if
len
(
inputs
)
>
5
else
None
running_var
=
inputs
[
6
]
if
len
(
inputs
)
>
6
else
None
# convert axes to cuDNN mode
axes
=
tuple
(
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
try
:
eps
=
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
)
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
try
:
running_average_factor
=
theano
.
tensor
.
get_scalar_constant_value
(
running_average_factor
)
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
ctx
=
infer_context_name
(
*
inputs
)
if
not
dnn_available
(
ctx
):
# TODO should this raise_no_cudnn?
return
None
x
=
as_gpuarray_variable
(
x
,
context_name
=
ctx
)
scale
=
as_gpuarray_variable
(
scale
,
context_name
=
ctx
)
bias
=
as_gpuarray_variable
(
bias
,
context_name
=
ctx
)
inputs
=
[
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
]
if
running_mean
is
not
None
and
running_var
is
not
None
:
inputs
.
append
(
running_mean
)
inputs
.
append
(
running_var
)
results
=
list
(
dnn_batch_normalization_train
(
*
inputs
))
return
results
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_batch_norm_inplace_output
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
not
node
.
op
.
inplace_output
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
node
.
op
.
inplace_running_mean
,
inplace_running_var
=
node
.
op
.
inplace_running_var
,
inplace_output
=
True
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_batch_norm_inplace_running_mean
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
node
.
op
.
running_averages
and
not
node
.
op
.
inplace_running_mean
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
True
,
inplace_running_var
=
node
.
op
.
inplace_running_var
,
inplace_output
=
node
.
op
.
inplace_output
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_batch_norm_inplace_running_var
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
node
.
op
.
running_averages
and
not
node
.
op
.
inplace_running_var
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
node
.
op
.
inplace_running_mean
,
inplace_running_var
=
True
,
inplace_output
=
node
.
op
.
inplace_output
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNormInference
],
inplace
=
True
)
def
local_batch_norm_inference_inplace
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNormInference
)
and
not
node
.
op
.
inplace
:
return
[
GpuDnnBatchNormInference
(
mode
=
node
.
op
.
mode
,
inplace
=
True
)(
*
node
.
inputs
)]
@register_opt
(
'cudnn'
,
'fast_compile'
)
@op_lifter
([
bn
.
AbstractBatchNormTrainGrad
])
@register_opt2
([
bn
.
AbstractBatchNormTrainGrad
],
'cudnn'
,
'fast_compile'
)
def
local_abstract_batch_norm_train_grad_cudnn
(
op
,
ctx_name
,
inputs
,
outputs
):
x
,
dy
,
scale
,
x_mean
,
x_invstd
,
epsilon
=
inputs
# input on gpu? TODO what about the output?
x_on_gpu
=
(
isinstance
(
x
.
type
,
GpuArrayType
)
or
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)))
dy_on_gpu
=
(
isinstance
(
dy
.
type
,
GpuArrayType
)
or
(
dy
.
owner
and
isinstance
(
dy
.
owner
.
op
,
HostFromGpu
)))
if
not
(
x_on_gpu
or
dy_on_gpu
):
return
None
# convert axes to cuDNN mode
axes
=
tuple
(
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
ndim
=
x
.
ndim
if
ndim
<
4
:
x
=
theano
.
tensor
.
shape_padright
(
x
,
4
-
ndim
)
dy
=
theano
.
tensor
.
shape_padright
(
dy
,
4
-
ndim
)
scale
=
theano
.
tensor
.
shape_padright
(
scale
,
4
-
ndim
)
x_mean
=
theano
.
tensor
.
shape_padright
(
x_mean
,
4
-
ndim
)
x_invstd
=
theano
.
tensor
.
shape_padright
(
x_invstd
,
4
-
ndim
)
elif
ndim
>
5
:
x_shape
=
x
.
shape
params_shape
=
scale
.
shape
x
=
theano
.
tensor
.
flatten
(
x
,
5
)
dy
=
theano
.
tensor
.
flatten
(
dy
,
5
)
scale
=
theano
.
tensor
.
flatten
(
scale
,
5
)
x_mean
=
theano
.
tensor
.
flatten
(
x_mean
,
5
)
x_invstd
=
theano
.
tensor
.
flatten
(
x_invstd
,
5
)
try
:
eps
=
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
)
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
ctx
=
infer_context_name
(
*
inputs
)
if
not
dnn_available
(
ctx
):
# TODO should this raise_no_cudnn?
return
None
x
=
as_gpuarray_variable
(
x
,
context_name
=
ctx
)
dy
=
as_gpuarray_variable
(
dy
,
context_name
=
ctx
)
scale
=
as_gpuarray_variable
(
scale
,
context_name
=
ctx
)
x_mean
=
as_gpuarray_variable
(
x_mean
,
context_name
=
ctx
)
x_invstd
=
as_gpuarray_variable
(
x_invstd
,
context_name
=
ctx
)
g_wrt_inputs
,
g_wrt_scale
,
g_wrt_bias
=
\
GpuDnnBatchNormGrad
(
mode
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
,
eps
)
if
ndim
<
4
:
g_wrt_inputs
=
theano
.
tensor
.
flatten
(
g_wrt_inputs
,
ndim
)
g_wrt_scale
=
theano
.
tensor
.
flatten
(
g_wrt_scale
,
ndim
)
g_wrt_bias
=
theano
.
tensor
.
flatten
(
g_wrt_bias
,
ndim
)
elif
ndim
>
5
:
g_wrt_inputs
=
theano
.
tensor
.
reshape
(
g_wrt_inputs
,
x_shape
)
g_wrt_scale
=
theano
.
tensor
.
reshape
(
g_wrt_scale
,
params_shape
)
g_wrt_bias
=
theano
.
tensor
.
reshape
(
g_wrt_bias
,
params_shape
)
return
[
g_wrt_inputs
,
g_wrt_scale
,
g_wrt_bias
]
@register_opt
(
'cudnn'
,
'fast_compile'
)
@op_lifter
([
bn
.
AbstractBatchNormInference
])
@register_opt2
([
bn
.
AbstractBatchNormInference
],
'cudnn'
,
'fast_compile'
)
def
local_abstract_batch_norm_inference_cudnn
(
op
,
ctx_name
,
inputs
,
outputs
):
x
,
scale
,
bias
,
estimated_mean
,
estimated_variance
,
epsilon
=
inputs
axes
=
tuple
(
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
try
:
eps
=
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
)
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
ctx
=
infer_context_name
(
*
inputs
)
if
not
dnn_available
(
ctx
):
# TODO should this raise_no_cudnn?
return
None
x
=
as_gpuarray_variable
(
x
,
context_name
=
ctx
)
scale
=
as_gpuarray_variable
(
scale
,
context_name
=
ctx
)
bias
=
as_gpuarray_variable
(
bias
,
context_name
=
ctx
)
estimated_mean
=
as_gpuarray_variable
(
estimated_mean
,
context_name
=
ctx
)
estimated_variance
=
as_gpuarray_variable
(
estimated_variance
,
context_name
=
ctx
)
out
=
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
estimated_mean
,
estimated_variance
,
mode
,
eps
)
return
[
out
]
theano/gpuarray/dnn_batchnorm.c
浏览文件 @
8b9f7336
...
...
@@ -2,8 +2,19 @@
int
dnn_batchnorm_op
(
PyGpuArrayObject
*
inp
,
PyGpuArrayObject
*
scale
,
PyGpuArrayObject
*
bias
,
npy_float64
epsilon
,
PyGpuArrayObject
**
outp
,
PyGpuArrayObject
**
x_mean
,
PyGpuArrayObject
**
x_invstd
,
cudnnHandle_t
_handle
)
{
npy_float64
running_average_factor
,
#ifdef RUNNING_AVERAGES
PyGpuArrayObject
*
in_running_mean
,
PyGpuArrayObject
*
in_running_var
,
#endif
PyGpuArrayObject
**
outp
,
PyGpuArrayObject
**
x_mean
,
PyGpuArrayObject
**
x_invstd
,
#ifdef RUNNING_AVERAGES
PyGpuArrayObject
**
out_running_mean
,
PyGpuArrayObject
**
out_running_var
,
#endif
cudnnHandle_t
_handle
)
{
PyGpuContextObject
*
c
=
inp
->
context
;
if
(
c_set_tensorNd
(
inp
,
bn_input
)
!=
0
)
...
...
@@ -16,8 +27,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return
1
;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF
(
*
outp
);
*
outp
=
inp
;
Py_INCREF
(
*
outp
);
#else
if
(
theano_prep_output
(
outp
,
inp
->
ga
.
nd
,
inp
->
ga
.
dimensions
,
inp
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
return
1
;
#endif
if
(
theano_prep_output
(
x_mean
,
scale
->
ga
.
nd
,
scale
->
ga
.
dimensions
,
scale
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
return
1
;
if
(
theano_prep_output
(
x_invstd
,
scale
->
ga
.
nd
,
scale
->
ga
.
dimensions
,
scale
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
...
...
@@ -26,6 +43,31 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
if
(
c_set_tensorNd
(
*
outp
,
bn_output
)
!=
0
)
return
1
;
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF
(
out_running_mean
);
PyGpuArrayObject
*
running_mean
=
in_running_mean
;
Py_INCREF
(
running_mean
);
#else
PyGpuArrayObject
*
running_mean
=
*
out_running_mean
;
running_mean
=
theano_try_copy
(
running_mean
,
in_running_mean
);
if
(
running_mean
==
NULL
)
{
return
1
;
}
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF
(
out_running_var
);
PyGpuArrayObject
*
running_var
=
in_running_var
;
Py_INCREF
(
running_var
);
#else
PyGpuArrayObject
*
running_var
=
*
out_running_var
;
running_var
=
theano_try_copy
(
running_var
,
in_running_var
);
if
(
running_var
==
NULL
)
{
return
1
;
}
#endif
#endif
{
const
float
falpha
=
1
.;
const
float
fbeta
=
0
.;
...
...
@@ -52,9 +94,15 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
bn_params
,
PyGpuArray_DEV_DATA
(
scale
),
PyGpuArray_DEV_DATA
(
bias
),
#ifdef RUNNING_AVERAGES
running_average_factor
,
PyGpuArray_DEV_DATA
(
running_mean
),
PyGpuArray_DEV_DATA
(
running_var
),
#else
0
,
NULL
,
// running mean, deliberately unused
NULL
,
// running var, deliberately unused
#endif
epsilon
,
PyGpuArray_DEV_DATA
(
*
x_mean
),
PyGpuArray_DEV_DATA
(
*
x_invstd
)
...
...
@@ -64,6 +112,10 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
cudnnGetErrorString
(
err
));
return
1
;
}
#ifdef RUNNING_AVERAGES
*
out_running_mean
=
running_mean
;
*
out_running_var
=
running_var
;
#endif
}
return
0
;
}
theano/gpuarray/dnn_batchnorm_inf.c
浏览文件 @
8b9f7336
...
...
@@ -16,8 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return
1
;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF
(
*
outp
);
*
outp
=
inp
;
Py_INCREF
(
*
outp
);
#else
if
(
theano_prep_output
(
outp
,
inp
->
ga
.
nd
,
inp
->
ga
.
dimensions
,
inp
->
ga
.
typecode
,
GA_C_ORDER
,
c
)
!=
0
)
return
1
;
#endif
if
(
c_set_tensorNd
(
*
outp
,
bn_output
)
!=
0
)
return
1
;
...
...
theano/gpuarray/tests/test_dnn.py
浏览文件 @
8b9f7336
from
__future__
import
absolute_import
,
print_function
,
division
import
logging
from
collections
import
OrderedDict
from
nose.plugins.skip
import
SkipTest
from
nose_parameterized
import
parameterized
...
...
@@ -13,6 +14,7 @@ import theano.tests.unittest_tools as utt
from
theano.tensor.signal.pool
import
pool_2d
,
pool_3d
from
theano.tensor.signal.pool
import
Pool
,
MaxPoolGrad
,
AveragePoolGrad
from
theano.tensor.nnet.abstract_conv
import
get_conv_output_shape
from
theano.tensor.nnet
import
bn
from
..
import
dnn
from
..basic_ops
import
GpuAllocEmpty
...
...
@@ -1379,36 +1381,77 @@ def test_dnn_batchnorm_train():
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
utt
.
seed_rng
()
tensor6
=
T
.
TensorType
(
theano
.
config
.
floatX
,
(
False
,)
*
6
)
for
mode
in
(
'per-activation'
,
'spatial'
):
for
vartype
in
(
T
.
tensor5
,
T
.
tensor4
,
T
.
tensor3
,
T
.
matrix
,
T
.
vector
):
x
,
scale
,
bias
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
))
for
vartype
in
(
tensor6
,
T
.
tensor5
,
T
.
tensor4
,
T
.
tensor3
,
T
.
matrix
,
T
.
vector
):
x
,
scale
,
bias
,
running_mean
,
running_var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'running_mean'
,
'running_var'
))
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
# forward pass
out
,
x_mean
,
x_invstd
=
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
)
running_average_factor
=
0.3
# forward pass, direct interface
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
,
\
out_running_mean_gpu
,
out_running_var_gpu
=
\
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# forward pass, abstract interface
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
\
out_running_mean_abstract
,
out_running_var_abstract
=
\
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# reference forward pass
if
mode
==
'per-activation'
:
axes
=
(
0
,)
elif
mode
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
x_mean2
=
x
.
mean
(
axis
=
axes
,
keepdims
=
True
)
x_invstd2
=
T
.
inv
(
T
.
sqrt
(
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
+
eps
))
scale2
=
T
.
addbroadcast
(
scale
,
*
axes
)
bias2
=
T
.
addbroadcast
(
bias
,
*
axes
)
out2
=
(
x
-
x_mean2
)
*
(
scale2
*
x_invstd2
)
+
bias2
x_mean_ref
=
x
.
mean
(
axis
=
axes
,
keepdims
=
True
)
x_var_ref
=
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
x_invstd_ref
=
T
.
inv
(
T
.
sqrt
(
x_var_ref
+
eps
))
scale_ref
=
T
.
addbroadcast
(
scale
,
*
axes
)
bias_ref
=
T
.
addbroadcast
(
bias
,
*
axes
)
m
=
T
.
cast
(
T
.
prod
(
x
.
shape
)
/
T
.
prod
(
scale
.
shape
),
theano
.
config
.
floatX
)
out_ref
=
(
x
-
x_mean_ref
)
*
(
scale_ref
*
x_invstd_ref
)
+
bias_ref
out_running_mean_ref
=
running_mean
*
(
1
-
running_average_factor
)
+
\
x_mean_ref
*
running_average_factor
out_running_var_ref
=
running_var
*
(
1
-
running_average_factor
)
+
\
(
m
/
(
m
-
1
))
*
x_var_ref
*
running_average_factor
# backward pass
dy
=
vartype
(
'dy'
)
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out
:
dy
})
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_abstract
:
dy
})
# reference backward pass
grads
2
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out2
:
dy
})
grads
_ref
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_ref
:
dy
})
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out
,
x_mean
,
x_invstd
,
out2
,
x_mean2
,
x_invstd2
]
+
grads
+
grads2
,
mode
=
mode_with_gpu
)
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
,
out_running_mean_gpu
,
out_running_var_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
out_running_mean_abstract
,
out_running_var_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_ref
,
x_mean_ref
,
x_invstd_ref
,
out_running_mean_ref
,
out_running_var_ref
]
+
grads_ref
,
mode
=
mode_without_gpu
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormGrad
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
),
(
4
,
3
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
)):
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
,
5
),
(
4
,
3
,
1
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
,
5
)):
data_shape
=
data_shape
[:
ndim
]
param_shape
=
tuple
(
1
if
d
in
axes
else
s
for
d
,
s
in
enumerate
(
data_shape
))
...
...
@@ -1416,15 +1459,124 @@ def test_dnn_batchnorm_train():
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Dy
)
Running_mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Running_var
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs_gpu
=
f_gpu
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
# compare outputs
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
0
+
3
])
# out
utt
.
assert_allclose
(
outputs
[
1
],
outputs
[
1
+
3
])
# mean
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
3
])
# invstd
utt
.
assert_allclose
(
outputs_gpu
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_gpu
[
1
],
outputs_ref
[
1
])
# mean
utt
.
assert_allclose
(
outputs_gpu
[
2
],
outputs_ref
[
2
])
# invstd
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
3
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs_gpu
[
4
]),
numpy
.
nan_to_num
(
outputs_ref
[
4
]))
# running_var
utt
.
assert_allclose
(
outputs_abstract
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_abstract
[
1
],
outputs_ref
[
1
])
# mean
utt
.
assert_allclose
(
outputs_abstract
[
2
],
outputs_ref
[
2
])
# invstd
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs_abstract
[
4
]),
numpy
.
nan_to_num
(
outputs_ref
[
4
]))
# running_var
# compare gradients
utt
.
assert_allclose
(
outputs
[
6
],
outputs
[
6
+
3
],
atol
=
1e-4
)
# dx
utt
.
assert_allclose
(
outputs
[
7
],
outputs
[
7
+
3
],
rtol
=
2e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs
[
8
],
outputs
[
8
+
3
])
# dbias
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
6
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
7
],
outputs_ref
[
7
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
6
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
7
],
outputs_ref
[
7
])
# dbias
def
test_dnn_batchnorm_train_without_running_averages
():
# compile and run batch_normalization_train without running averages
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
if
dnn
.
version
(
raises
=
False
)
<
5000
:
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
x
,
scale
,
bias
,
dy
=
T
.
tensor4
(
'x'
),
T
.
tensor4
(
'scale'
),
T
.
tensor4
(
'bias'
),
T
.
tensor4
(
'dy'
)
data_shape
=
(
5
,
10
,
30
,
25
)
param_shape
=
(
1
,
10
,
30
,
25
)
# forward pass
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
=
\
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
'per-activation'
)
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
=
\
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
'per-activation'
)
# backward pass
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_gpu
:
dy
})
# compile
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormGrad
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
f_gpu
(
X
,
Scale
,
Bias
,
Dy
)
f_abstract
(
X
,
Scale
,
Bias
,
Dy
)
def
test_dnn_batchnorm_train_inplace
():
# test inplace_running_mean and inplace_running_var
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
if
dnn
.
version
(
raises
=
False
)
<
5000
:
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
x
,
scale
,
bias
=
T
.
tensor4
(
'x'
),
T
.
tensor4
(
'scale'
),
T
.
tensor4
(
'bias'
)
data_shape
=
(
5
,
10
,
30
,
25
)
param_shape
=
(
1
,
10
,
30
,
25
)
running_mean
=
gpuarray_shared_constructor
(
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
),
broadcastable
=
(
True
,
False
,
False
,
False
))
running_var
=
gpuarray_shared_constructor
(
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
),
broadcastable
=
(
True
,
False
,
False
,
False
))
# forward pass
out
,
x_mean
,
x_invstd
,
new_running_mean
,
new_running_var
=
\
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
'per-activation'
,
epsilon
=
5e-3
,
running_average_factor
=
0.3
,
running_mean
=
running_mean
,
running_var
=
running_var
)
# update running averages
updates
=
OrderedDict
()
updates
[
running_mean
]
=
new_running_mean
updates
[
running_var
]
=
new_running_var
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
],
[
out
,
x_mean
,
x_invstd
],
updates
=
updates
,
mode
=
mode_with_gpu
)
# check for the inplace settings
nodes
=
[
n
for
n
in
f
.
maker
.
fgraph
.
toposort
()
if
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)]
assert
len
(
nodes
)
==
1
assert
nodes
[
0
]
.
op
.
inplace_running_mean
assert
nodes
[
0
]
.
op
.
inplace_running_var
assert
nodes
[
0
]
.
op
.
inplace_output
# run
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
f
(
X
,
Scale
,
Bias
)
def
test_batchnorm_inference
():
...
...
@@ -1432,34 +1584,51 @@ def test_batchnorm_inference():
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
utt
.
seed_rng
()
tensor6
=
T
.
TensorType
(
theano
.
config
.
floatX
,
(
False
,)
*
6
)
for
mode
in
(
'per-activation'
,
'spatial'
):
for
vartype
in
(
T
.
tensor5
,
T
.
tensor4
,
T
.
tensor3
,
T
.
matrix
,
T
.
vector
):
for
vartype
in
(
tensor6
,
T
.
tensor5
,
T
.
tensor4
,
T
.
tensor3
,
T
.
matrix
,
T
.
vector
):
x
,
scale
,
bias
,
mean
,
var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'mean'
,
'var'
))
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
# forward pass
out
=
dnn
.
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# forward pass, direct interface
out_gpu
=
dnn
.
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# forward pass, abstract interface
out_abstract
=
bn
.
batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# reference forward pass
if
mode
==
'per-activation'
:
axes
=
(
0
,)
elif
mode
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
scale
2
,
bias2
,
mean2
,
var2
=
(
T
.
addbroadcast
(
t
,
*
axes
)
for
t
in
(
scale
,
bias
,
mean
,
var
))
out
2
=
(
x
-
mean2
)
*
(
scale2
/
T
.
sqrt
(
var2
+
eps
))
+
bias2
scale
_ref
,
bias_ref
,
mean_ref
,
var_ref
=
(
T
.
addbroadcast
(
t
,
*
axes
)
for
t
in
(
scale
,
bias
,
mean
,
var
))
out
_ref
=
(
x
-
mean_ref
)
*
(
scale_ref
/
T
.
sqrt
(
var_ref
+
eps
))
+
bias_ref
# backward pass
dy
=
vartype
(
'dy'
)
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out
:
dy
})
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_abstract
:
dy
})
# reference backward pass
grads
2
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out2
:
dy
})
grads
_ref
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_ref
:
dy
})
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out
,
out2
]
+
grads
+
grads2
,
mode
=
mode_with_gpu
)
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_ref
]
+
grads_ref
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormInference
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
for
data_shape
in
((
10
,
20
,
30
,
40
,
10
),
(
4
,
3
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
)):
for
data_shape
in
((
10
,
20
,
30
,
40
,
10
,
5
),
(
4
,
3
,
1
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
,
5
)):
data_shape
=
data_shape
[:
ndim
]
param_shape
=
tuple
(
1
if
d
in
axes
else
s
for
d
,
s
in
enumerate
(
data_shape
))
...
...
@@ -1469,15 +1638,106 @@ def test_batchnorm_inference():
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Var
=
numpy
.
random
.
rand
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_gpu
=
f_gpu
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
# compare outputs
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
1
])
# out
utt
.
assert_allclose
(
outputs_gpu
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_abstract
[
0
],
outputs_ref
[
0
])
# out
# compare gradients
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
5
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs
[
3
],
outputs
[
3
+
5
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs
[
4
],
outputs
[
4
+
5
])
# dbias
utt
.
assert_allclose
(
outputs
[
5
],
outputs
[
5
+
5
])
# dmean
utt
.
assert_allclose
(
outputs
[
6
],
outputs
[
6
+
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
utt
.
assert_allclose
(
outputs_gpu
[
1
],
outputs_ref
[
1
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
2
],
outputs_ref
[
2
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
3
])
# dbias
utt
.
assert_allclose
(
outputs_gpu
[
4
],
outputs_ref
[
4
])
# dmean
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
utt
.
assert_allclose
(
outputs_abstract
[
1
],
outputs_ref
[
1
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
2
],
outputs_ref
[
2
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
4
],
outputs_ref
[
4
])
# dmean
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
def
test_batchnorm_inference_inplace
():
# test inplace
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
if
dnn
.
version
(
raises
=
False
)
<
5000
:
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
x
,
scale
,
bias
,
mean
,
var
=
(
T
.
tensor4
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'mean'
,
'var'
))
data_shape
=
(
5
,
10
,
30
,
25
)
param_shape
=
(
1
,
10
,
30
,
25
)
out
=
dnn
.
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
)
f
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
],
[
out
],
mode
=
mode_with_gpu
)
# check for the inplace settings
nodes
=
[
n
for
n
in
f
.
maker
.
fgraph
.
toposort
()
if
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormInference
)]
assert
len
(
nodes
)
==
1
assert
nodes
[
0
]
.
op
.
inplace
# run
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Var
=
numpy
.
random
.
rand
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
f
(
X
,
Scale
,
Bias
,
Mean
,
Var
)
def
test_dnn_batchnorm_valid_and_invalid_axes
():
if
not
dnn
.
dnn_available
(
test_ctx_name
):
raise
SkipTest
(
dnn
.
dnn_available
.
msg
)
if
dnn
.
version
(
raises
=
False
)
<
5000
:
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
for
vartype
in
(
T
.
tensor5
,
T
.
tensor4
,
T
.
tensor3
,
T
.
matrix
):
x
,
scale
,
bias
,
mean
,
var
,
dy
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'mean'
,
'var'
,
'dy'
))
ndim
=
x
.
ndim
# supported: per-activation and spatial
valid_axes_lists
=
((
0
,),
(
0
,)
+
tuple
(
range
(
2
,
ndim
)))
# not supported: an axes list without 0 and including 1
invalid_axes_lists
=
(
tuple
(
range
(
1
,
ndim
)),)
for
axes
in
valid_axes_lists
+
invalid_axes_lists
:
# forward pass, abstract interface
out_train
,
x_mean
,
x_invstd
=
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
axes
)
out_test
=
bn
.
batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
axes
)
# backward pass
dy
=
vartype
(
'dy'
)
grads_train
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_train
:
dy
})
grads_test
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_test
:
dy
})
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_train
,
x_mean
,
x_invstd
,
out_test
]
+
grads_train
+
grads_test
,
mode
=
mode_with_gpu
)
if
axes
in
valid_axes_lists
:
# check if the abstract Ops have been replaced by the cuDNN Ops
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)
for
n
in
f
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormGrad
)
for
n
in
f
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormInference
)
for
n
in
f
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f
.
maker
.
fgraph
.
toposort
()])
else
:
# check if the abstract Ops have been replaced, but not by the cuDNN Ops
assert
not
any
([
isinstance
(
n
.
op
,
(
dnn
.
GpuDnnBatchNorm
,
dnn
.
GpuDnnBatchNormGrad
,
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f
.
maker
.
fgraph
.
toposort
()])
def
test_dnn_rnn_gru
():
...
...
theano/sandbox/cuda/__init__.py
浏览文件 @
8b9f7336
...
...
@@ -12,7 +12,7 @@ import warnings
import
theano
from
theano.compat
import
get_unbound_function
from
theano.compile
import
optdb
from
theano.gof
import
EquilibriumDB
,
SequenceDB
from
theano.gof
import
EquilibriumDB
,
SequenceDB
,
TopoOptimizer
from
theano.gof.cmodule
import
get_lib_extension
from
theano.gof.compilelock
import
get_lock
,
release_lock
from
theano
import
config
...
...
@@ -40,6 +40,17 @@ def register_opt(*tags, **kwargs):
return
f
def
register_inplace
(
*
tags
,
**
kwargs
):
def
f
(
local_opt
):
name
=
(
kwargs
and
kwargs
.
pop
(
'name'
))
or
local_opt
.
__name__
optdb
.
register
(
name
,
TopoOptimizer
(
local_opt
,
failure_callback
=
TopoOptimizer
.
warn_inplace
),
60
,
'fast_run'
,
'inplace'
,
'gpu'
,
*
tags
)
return
local_opt
return
f
_logger_name
=
'theano.sandbox.cuda'
_logger
=
logging
.
getLogger
(
_logger_name
)
...
...
theano/sandbox/cuda/dnn.py
浏览文件 @
8b9f7336
...
...
@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
assert_conv_shape
)
from
theano.tensor.signal.pool
import
(
Pool
,
MaxPoolGrad
,
AveragePoolGrad
)
from
theano.tensor.nnet
import
bn
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda
import
GpuOp
,
dnn_available
...
...
@@ -33,7 +34,7 @@ from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
from
theano.sandbox.cuda.nnet
import
GpuSoftmax
from
theano.sandbox.cuda.opt_util
import
(
alpha_merge
,
output_merge
,
pad_dims
,
unpad_dims
)
from
theano.sandbox.cuda
import
gpu_seqopt
,
register_opt
from
theano.sandbox.cuda
import
gpu_seqopt
,
register_opt
,
register_inplace
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
...
...
@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase):
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
"""
__props__
=
(
'mode'
,
'epsilon'
)
...
...
@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s;
result
=
"""
cudnnStatus_t err
%(name)
s;
cudnnBatchNormMode_t mode
%(name)
s =
%(mode)
s;
double exponentialAverageFactor
%(name)
s =
%(exp_avg_factor)
f;
double epsilon
%(name)
s =
%(epsilon)
e;
"""
%
dict
(
name
=
name
,
mode
=
mode
,
exp_avg_factor
=
0
,
# deliberately unused
epsilon
=
self
.
epsilon
)
return
result
def
c_code_cache_version
(
self
):
return
(
3
,
version
())
return
(
4
,
version
())
class
GpuDnnBatchNormInference
(
GpuDnnBatchNormBase
):
...
...
@@ -2422,8 +2438,26 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
Note: scale, bias, mean and variance must follow the same tensor layout!
"""
__props__
=
(
'mode'
,
'epsilon'
,
'inplace'
)
tensor_descs
=
[
'bn_input'
,
'bn_output'
,
'bn_params'
]
def
__init__
(
self
,
mode
=
'per-activation'
,
epsilon
=
1e-4
,
inplace
=
False
):
super
(
GpuDnnBatchNormInference
,
self
)
.
__init__
(
mode
=
mode
,
epsilon
=
epsilon
)
self
.
inplace
=
inplace
if
self
.
inplace
:
self
.
destroy_map
=
{
0
:
[
0
]}
def
__setstate__
(
self
,
d
):
self
.
__dict__
.
update
(
d
)
if
not
hasattr
(
self
,
'inplace'
):
self
.
inplace
=
False
def
get_op_params
(
self
):
params
=
[]
if
self
.
inplace
:
params
.
append
((
'INPLACE_OUTPUT'
,
'1'
))
return
params
def
infer_shape
(
self
,
node
,
shape
):
# output shape equals shape of x
return
[
shape
[
0
]]
...
...
@@ -2460,10 +2494,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0)
}
// build and prepare the output variable
#ifdef INPLACE_OUTPUT
Py_XDECREF(
%(outp)
s);
%(outp)
s =
%(inp)
s;
Py_INCREF(
%(outp)
s);
#else
if (CudaNdarray_prep_output(&
%(outp)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(inp)
s)) != 0)
{
%(fail)
s
}
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(
%(outp)
s, bn_output_
%(name)
s) != 0)
...
...
@@ -2494,6 +2534,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference(
"""
%
dict
(
name
=
name
,
inp
=
inp
,
scale
=
scale
,
bias
=
bias
,
est_mean
=
est_mean
,
est_var
=
est_var
,
outp
=
outp
,
fail
=
sub
[
'fail'
])
# add params
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
,
check_input
=
False
)
result
=
"""
%(define_macros)
s
{
%(code)
s
}
%(undef_macros)
s
"""
%
dict
(
code
=
result
,
define_macros
=
define_macros
,
undef_macros
=
undef_macros
)
return
result
def
grad
(
self
,
inputs
,
grads
):
...
...
@@ -2537,28 +2587,98 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase):
Note: scale and bias must follow the same tensor layout!
"""
__props__
=
(
'mode'
,
'epsilon'
,
'running_average_factor'
,
'running_averages'
,
'inplace_running_mean'
,
'inplace_running_var'
,
'inplace_output'
)
tensor_descs
=
[
'bn_input'
,
'bn_output'
,
'bn_params'
]
def
__init__
(
self
,
mode
=
'per-activation'
,
epsilon
=
1e-4
,
running_average_factor
=
0
,
running_averages
=
False
,
inplace_running_mean
=
False
,
inplace_running_var
=
False
,
inplace_output
=
False
):
super
(
GpuDnnBatchNorm
,
self
)
.
__init__
(
mode
=
mode
,
epsilon
=
epsilon
)
self
.
running_average_factor
=
running_average_factor
self
.
running_averages
=
running_averages
self
.
inplace_output
=
inplace_output
self
.
inplace_running_mean
=
inplace_running_mean
self
.
inplace_running_var
=
inplace_running_var
self
.
destroy_map
=
{}
if
self
.
inplace_output
:
self
.
destroy_map
[
0
]
=
[
0
]
if
self
.
running_averages
and
self
.
inplace_running_mean
:
self
.
destroy_map
[
3
]
=
[
3
]
if
self
.
running_averages
and
self
.
inplace_running_var
:
self
.
destroy_map
[
4
]
=
[
4
]
def
__setstate__
(
self
,
d
):
self
.
__dict__
.
update
(
d
)
if
not
hasattr
(
self
,
'running_average_factor'
):
self
.
running_average_factor
=
0
if
not
hasattr
(
self
,
'running_averages'
):
self
.
running_averages
=
False
if
not
(
hasattr
(
self
,
'inplace_running_mean'
)
and
hasattr
(
self
,
'inplace_running_var'
)
and
hasattr
(
self
,
'inplace_output'
)):
self
.
inplace_running_mean
=
False
self
.
inplace_running_var
=
False
self
.
inplace_output
=
False
self
.
destroy_map
=
{}
def
get_op_params
(
self
):
params
=
[]
if
self
.
inplace_output
:
params
.
append
((
'INPLACE_OUTPUT'
,
'1'
))
if
self
.
running_averages
:
params
.
append
((
'RUNNING_AVERAGES'
,
'1'
))
if
self
.
inplace_running_mean
:
params
.
append
((
'INPLACE_RUNNING_MEAN'
,
'1'
))
if
self
.
inplace_running_var
:
params
.
append
((
'INPLACE_RUNNING_VAR'
,
'1'
))
return
params
def
infer_shape
(
self
,
node
,
shape
):
# first output equals shape of x
#
second and third output
equal shape of scale
return
[
shape
[
0
]
,
shape
[
1
],
shape
[
1
]]
#
other outputs
equal shape of scale
return
[
shape
[
0
]
]
+
[
shape
[
1
]]
*
(
len
(
node
.
outputs
)
-
1
)
def
make_node
(
self
,
x
,
scale
,
bias
):
def
make_node
(
self
,
x
,
scale
,
bias
,
running_mean
=
None
,
running_var
=
None
):
assert
x
.
ndim
==
scale
.
ndim
==
bias
.
ndim
assert
x
.
ndim
in
(
4
,
5
)
assert
self
.
running_averages
==
(
running_mean
is
not
None
)
==
(
running_var
is
not
None
)
assert
(
running_mean
is
None
or
running_mean
.
ndim
==
x
.
ndim
)
assert
(
running_var
is
None
or
running_var
.
ndim
==
x
.
ndim
)
x
=
as_cuda_ndarray_variable
(
x
)
scale
=
as_cuda_ndarray_variable
(
scale
)
bias
=
as_cuda_ndarray_variable
(
bias
)
assert
x
.
ndim
==
scale
.
ndim
==
bias
.
ndim
assert
x
.
ndim
in
(
4
,
5
)
return
Apply
(
self
,
[
x
,
scale
,
bias
],
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()])
inputs
=
[
x
,
scale
,
bias
]
output_types
=
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()]
if
running_mean
is
not
None
and
running_var
is
not
None
:
inputs
.
append
(
as_cuda_ndarray_variable
(
running_mean
))
inputs
.
append
(
as_cuda_ndarray_variable
(
running_var
))
output_types
.
append
(
scale
.
type
())
output_types
.
append
(
scale
.
type
())
return
Apply
(
self
,
inputs
,
output_types
)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
# super call to prepare common configuration
result
=
super
(
GpuDnnBatchNorm
,
self
)
.
c_code
(
node
,
name
,
inputs
,
outputs
,
sub
)
# give sensible names to inputs and outputs
inp
,
scale
,
bias
=
inputs
outp
,
x_mean
,
x_invstd
=
outputs
inp
,
scale
,
bias
=
inputs
[:
3
]
outp
,
x_mean
,
x_invstd
=
outputs
[:
3
]
if
self
.
running_averages
:
running_average_factor
=
self
.
running_average_factor
in_running_mean
=
inputs
[
3
]
in_running_var
=
inputs
[
4
]
out_running_mean
=
outputs
[
3
]
out_running_var
=
outputs
[
4
]
else
:
running_average_factor
=
0.
in_running_mean
=
'NULL'
in_running_var
=
'NULL'
out_running_mean
=
'NULL'
out_running_var
=
'NULL'
# set input tensor descriptors from input tensors
result
+=
"""
...
...
@@ -2579,6 +2699,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp
{
%(fail)
s
}
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(
%(out_running_mean)
s);
CudaNdarray *running_mean
%(name)
s =
%(in_running_mean)
s;
Py_INCREF(running_mean
%(name)
s);
#else
if ((CudaNdarray_prep_output(&
%(out_running_mean)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(scale)
s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(
%(out_running_mean)
s,
%(in_running_mean)
s) != 0))
{
%(fail)
s
}
CudaNdarray *running_mean
%(name)
s =
%(out_running_mean)
s;
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(
%(out_running_var)
s);
CudaNdarray *running_var
%(name)
s =
%(in_running_var)
s;
Py_INCREF(running_var
%(name)
s);
#else
if ((CudaNdarray_prep_output(&
%(out_running_var)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(scale)
s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(
%(out_running_var)
s,
%(in_running_var)
s) != 0))
{
%(fail)
s
}
CudaNdarray *running_var
%(name)
s =
%(out_running_var)
s;
#endif
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(
%(outp)
s, bn_output_
%(name)
s) != 0)
...
...
@@ -2601,25 +2747,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining(
bn_params_
%(name)
s,
CudaNdarray_DEV_DATA(
%(scale)
s),
CudaNdarray_DEV_DATA(
%(bias)
s),
exponentialAverageFactor
%(name)
s,
NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused
#ifdef RUNNING_AVERAGES
%(running_average_factor)
f,
CudaNdarray_DEV_DATA(running_mean
%(name)
s),
CudaNdarray_DEV_DATA(running_var
%(name)
s),
#else
0,
NULL,
NULL,
#endif
epsilon
%(name)
s,
CudaNdarray_DEV_DATA(
%(x_mean)
s),
CudaNdarray_DEV_DATA(
%(x_invstd)
s)
);
}
#ifdef RUNNING_AVERAGES
%(out_running_mean)
s = running_mean
%(name)
s;
%(out_running_var)
s = running_var
%(name)
s;
#endif
"""
%
dict
(
name
=
name
,
inp
=
inp
,
scale
=
scale
,
bias
=
bias
,
outp
=
outp
,
x_mean
=
x_mean
,
x_invstd
=
x_invstd
,
fail
=
sub
[
'fail'
])
x_mean
=
x_mean
,
x_invstd
=
x_invstd
,
running_average_factor
=
running_average_factor
,
in_running_mean
=
in_running_mean
,
in_running_var
=
in_running_var
,
out_running_mean
=
out_running_mean
,
out_running_var
=
out_running_var
,
fail
=
sub
[
'fail'
])
# add params
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
,
check_input
=
False
)
result
=
"""
%(define_macros)
s
{
%(code)
s
}
%(undef_macros)
s
"""
%
dict
(
code
=
result
,
define_macros
=
define_macros
,
undef_macros
=
undef_macros
)
return
result
def
grad
(
self
,
inputs
,
grads
):
x
,
scale
,
bias
=
inputs
x
,
scale
,
bias
=
inputs
[:
3
]
dy
=
grads
[
0
]
_
,
x_mean
,
x_invstd
=
self
(
x
,
scale
,
bias
)
return
GpuDnnBatchNormGrad
(
self
.
mode
,
self
.
epsilon
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
)
_
,
x_mean
,
x_invstd
=
self
(
*
inputs
)[:
3
]
disconnected_outputs
=
[]
# Optional running_mean and running_var.
for
i
in
range
(
3
,
len
(
inputs
)):
disconnected_outputs
.
append
(
DisconnectedType
()())
return
GpuDnnBatchNormGrad
(
self
.
mode
,
self
.
epsilon
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
)
+
disconnected_outputs
def
connection_pattern
(
self
,
node
):
patterns
=
[[
True
,
True
,
True
],
# x
[
True
,
True
,
True
],
# scale
[
True
,
True
,
True
]]
# bias
# Optional running_mean and running_var are only
# connected to their new values.
for
i
in
range
(
3
,
len
(
node
.
inputs
)):
patterns
[
0
]
.
append
(
True
)
for
pattern
in
patterns
[
1
:]:
pattern
.
append
(
False
)
patterns
.
append
([
False
]
*
(
i
)
+
[
True
])
return
patterns
class
GpuDnnBatchNormGrad
(
GpuDnnBatchNormBase
):
...
...
@@ -2722,7 +2909,8 @@ err%(name)s = cudnnBatchNormalizationBackward(
def
dnn_batch_normalization_train
(
inputs
,
gamma
,
beta
,
mode
=
'per-activation'
,
epsilon
=
1e-4
):
epsilon
=
1e-4
,
running_average_factor
=
0.1
,
running_mean
=
None
,
running_var
=
None
):
"""
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
...
...
@@ -2742,6 +2930,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns
-------
...
...
@@ -2749,8 +2954,14 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Batch-normalized inputs.
mean : tensor
Means of `inputs` across the normalization axes.
stdinv
: tensor
invstd
: tensor
Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes
-----
...
...
@@ -2762,31 +2973,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
stdinv = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
out = (inputs - mean) * gamma * stdinv + beta
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) +
\\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) +
\\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4).
"""
ndim
=
inputs
.
ndim
if
ndim
>
5
:
raise
ValueError
(
"dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got
%
d"
%
ndim
)
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
"as inputs; got
%
d and
%
d instead of
%
d"
%
(
gamma
.
ndim
,
beta
.
ndim
,
ndim
))
if
(
running_mean
is
None
)
!=
(
running_var
is
None
):
raise
ValueError
(
"running_mean and running_var must either both be "
"given or both be None"
)
if
running_mean
is
not
None
and
running_mean
.
ndim
!=
ndim
:
raise
ValueError
(
"running_mean must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_mean
.
ndim
,
ndim
))
if
running_var
is
not
None
and
running_var
.
ndim
!=
ndim
:
raise
ValueError
(
"running_var must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_var
.
ndim
,
ndim
))
if
epsilon
<
1e-5
:
raise
ValueError
(
"epsilon must be at least 1e-5, got
%
f"
%
epsilon
)
running_averages
=
(
running_var
is
not
None
and
running_var
is
not
None
)
if
ndim
<
4
:
inputs
=
theano
.
tensor
.
shape_padright
(
inputs
,
4
-
ndim
)
gamma
=
theano
.
tensor
.
shape_padright
(
gamma
,
4
-
ndim
)
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
batchnorm_op
=
GpuDnnBatchNorm
(
mode
=
mode
,
epsilon
=
epsilon
)
result
=
tuple
(
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
)))
if
running_averages
:
running_mean
=
theano
.
tensor
.
shape_padright
(
running_mean
,
4
-
ndim
)
running_var
=
theano
.
tensor
.
shape_padright
(
running_var
,
4
-
ndim
)
elif
ndim
>
5
:
inputs_shape
=
inputs
.
shape
params_shape
=
gamma
.
shape
inputs
=
theano
.
tensor
.
flatten
(
inputs
,
5
)
gamma
=
theano
.
tensor
.
flatten
(
gamma
,
5
)
beta
=
theano
.
tensor
.
flatten
(
beta
,
5
)
if
running_averages
:
running_mean
=
theano
.
tensor
.
flatten
(
running_mean
,
5
)
running_var
=
theano
.
tensor
.
flatten
(
running_var
,
5
)
batchnorm_op
=
GpuDnnBatchNorm
(
mode
=
mode
,
epsilon
=
epsilon
,
running_average_factor
=
running_average_factor
,
running_averages
=
running_averages
)
if
running_averages
:
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
running_mean
=
gpu_contiguous
(
running_mean
),
running_var
=
gpu_contiguous
(
running_var
))
if
new_running_mean
.
broadcastable
!=
running_mean
.
broadcastable
:
new_running_mean
=
tensor
.
patternbroadcast
(
new_running_mean
,
running_mean
.
broadcastable
)
if
new_running_var
.
broadcastable
!=
running_var
.
broadcastable
:
new_running_var
=
tensor
.
patternbroadcast
(
new_running_var
,
running_var
.
broadcastable
)
result
=
(
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
)
else
:
result
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
))
if
ndim
<
4
:
result
=
tuple
(
theano
.
tensor
.
flatten
(
r
,
ndim
)
for
r
in
result
)
elif
ndim
>
5
:
result
=
(
theano
.
tensor
.
reshape
(
result
[
0
],
inputs_shape
),)
+
tuple
(
theano
.
tensor
.
reshape
(
r
,
params_shape
)
for
r
in
result
[
1
:])
return
result
...
...
@@ -2839,9 +3097,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
ndim
=
inputs
.
ndim
if
ndim
>
5
:
raise
ValueError
(
"dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got
%
d"
%
ndim
)
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
"as inputs; got
%
d and
%
d instead of
%
d"
%
...
...
@@ -2859,12 +3114,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
mean
=
theano
.
tensor
.
shape_padright
(
mean
,
4
-
ndim
)
var
=
theano
.
tensor
.
shape_padright
(
var
,
4
-
ndim
)
elif
ndim
>
5
:
inputs_shape
=
inputs
.
shape
inputs
=
theano
.
tensor
.
flatten
(
inputs
,
5
)
gamma
=
theano
.
tensor
.
flatten
(
gamma
,
5
)
beta
=
theano
.
tensor
.
flatten
(
beta
,
5
)
mean
=
theano
.
tensor
.
flatten
(
mean
,
5
)
var
=
theano
.
tensor
.
flatten
(
var
,
5
)
batchnorm_op
=
GpuDnnBatchNormInference
(
mode
=
mode
,
epsilon
=
epsilon
)
result
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
gpu_contiguous
(
mean
),
gpu_contiguous
(
var
))
if
ndim
<
4
:
result
=
theano
.
tensor
.
flatten
(
result
,
ndim
)
elif
ndim
>
5
:
result
=
theano
.
tensor
.
reshape
(
result
,
inputs_shape
)
return
result
...
...
@@ -3334,3 +3598,235 @@ def local_abstractconv3d_cudnn(node):
subsample
=
node
.
op
.
subsample
,
conv_mode
=
conv_mode
)
return
[
rval
]
@local_optimizer
([
bn
.
AbstractBatchNormTrain
])
def
local_abstract_batch_norm_train_cudnn
(
node
):
if
not
isinstance
(
node
.
op
,
bn
.
AbstractBatchNormTrain
):
return
None
x
,
scale
,
bias
,
epsilon
,
running_average_factor
=
node
.
inputs
[:
5
]
running_mean
=
node
.
inputs
[
5
]
if
len
(
node
.
inputs
)
>
5
else
None
running_var
=
node
.
inputs
[
6
]
if
len
(
node
.
inputs
)
>
6
else
None
# input on gpu? TODO what about the output?
x_on_gpu
=
(
isinstance
(
x
.
type
,
CudaNdarrayType
)
or
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)))
if
not
x_on_gpu
:
return
None
# convert axes to cuDNN mode
axes
=
tuple
(
node
.
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
try
:
eps
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
try
:
running_average_factor
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
running_average_factor
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
not
dnn_available
():
return
None
x
=
as_cuda_ndarray_variable
(
x
)
scale
=
as_cuda_ndarray_variable
(
scale
)
bias
=
as_cuda_ndarray_variable
(
bias
)
inputs
=
[
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
]
if
running_mean
is
not
None
and
running_var
is
not
None
:
inputs
.
append
(
running_mean
)
inputs
.
append
(
running_var
)
results
=
list
(
dnn_batch_normalization_train
(
*
inputs
))
# If the original output was on CPU, we have to transfer it
for
i
in
range
(
len
(
node
.
outputs
)):
if
isinstance
(
node
.
outputs
[
i
]
.
type
,
tensor
.
TensorType
):
results
[
i
]
=
tensor
.
as_tensor_variable
(
results
[
i
])
# TODO copy_stack_trace?
return
results
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_gpu_batch_norm_inplace_output
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
not
node
.
op
.
inplace_output
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
running_average_factor
=
node
.
op
.
running_average_factor
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
node
.
op
.
inplace_running_mean
,
inplace_running_var
=
node
.
op
.
inplace_running_var
,
inplace_output
=
True
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_gpu_batch_norm_inplace_running_mean
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
node
.
op
.
running_averages
and
not
node
.
op
.
inplace_running_mean
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
running_average_factor
=
node
.
op
.
running_average_factor
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
True
,
inplace_running_var
=
node
.
op
.
inplace_running_var
,
inplace_output
=
node
.
op
.
inplace_output
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_gpu_batch_norm_inplace_running_var
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
node
.
op
.
running_averages
and
not
node
.
op
.
inplace_running_var
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
running_average_factor
=
node
.
op
.
running_average_factor
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
node
.
op
.
inplace_running_mean
,
inplace_running_var
=
True
,
inplace_output
=
node
.
op
.
inplace_output
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNormInference
],
inplace
=
True
)
def
local_gpu_batch_norm_inference_inplace
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNormInference
)
and
not
node
.
op
.
inplace
:
return
[
GpuDnnBatchNormInference
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
inplace
=
True
)(
*
node
.
inputs
)]
@local_optimizer
([
bn
.
AbstractBatchNormTrainGrad
])
def
local_abstract_batch_norm_train_grad_cudnn
(
node
):
if
not
isinstance
(
node
.
op
,
bn
.
AbstractBatchNormTrainGrad
):
return
None
x
,
dy
,
scale
,
x_mean
,
x_invstd
,
epsilon
=
node
.
inputs
# input on gpu? TODO what about the output?
x_on_gpu
=
(
isinstance
(
x
.
type
,
CudaNdarrayType
)
or
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)))
dy_on_gpu
=
(
isinstance
(
dy
.
type
,
CudaNdarrayType
)
or
(
dy
.
owner
and
isinstance
(
dy
.
owner
.
op
,
HostFromGpu
)))
if
not
(
x_on_gpu
or
dy_on_gpu
):
return
None
# convert axes to cuDNN mode
axes
=
tuple
(
node
.
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
ndim
=
x
.
ndim
if
ndim
<
4
:
x
=
theano
.
tensor
.
shape_padright
(
x
,
4
-
ndim
)
dy
=
theano
.
tensor
.
shape_padright
(
dy
,
4
-
ndim
)
scale
=
theano
.
tensor
.
shape_padright
(
scale
,
4
-
ndim
)
x_mean
=
theano
.
tensor
.
shape_padright
(
x_mean
,
4
-
ndim
)
x_invstd
=
theano
.
tensor
.
shape_padright
(
x_invstd
,
4
-
ndim
)
elif
ndim
>
5
:
x_shape
=
x
.
shape
params_shape
=
scale
.
shape
x
=
theano
.
tensor
.
flatten
(
x
,
5
)
dy
=
theano
.
tensor
.
flatten
(
dy
,
5
)
scale
=
theano
.
tensor
.
flatten
(
scale
,
5
)
x_mean
=
theano
.
tensor
.
flatten
(
x_mean
,
5
)
x_invstd
=
theano
.
tensor
.
flatten
(
x_invstd
,
5
)
try
:
eps
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
if
not
dnn_available
():
return
None
x
=
as_cuda_ndarray_variable
(
x
)
dy
=
as_cuda_ndarray_variable
(
dy
)
scale
=
as_cuda_ndarray_variable
(
scale
)
x_mean
=
as_cuda_ndarray_variable
(
x_mean
)
x_invstd
=
as_cuda_ndarray_variable
(
x_invstd
)
g_wrt_inputs
,
g_wrt_scale
,
g_wrt_bias
=
\
GpuDnnBatchNormGrad
(
mode
,
epsilon
=
eps
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
)
if
ndim
<
4
:
g_wrt_inputs
=
theano
.
tensor
.
flatten
(
g_wrt_inputs
,
ndim
)
g_wrt_scale
=
theano
.
tensor
.
flatten
(
g_wrt_scale
,
ndim
)
g_wrt_bias
=
theano
.
tensor
.
flatten
(
g_wrt_bias
,
ndim
)
elif
ndim
>
5
:
g_wrt_inputs
=
theano
.
tensor
.
reshape
(
g_wrt_inputs
,
x_shape
)
g_wrt_scale
=
theano
.
tensor
.
reshape
(
g_wrt_scale
,
params_shape
)
g_wrt_bias
=
theano
.
tensor
.
reshape
(
g_wrt_bias
,
params_shape
)
# If the original output was on CPU, we have to transfer it
if
isinstance
(
node
.
outputs
[
0
]
.
type
,
tensor
.
TensorType
):
g_wrt_inputs
=
tensor
.
as_tensor_variable
(
g_wrt_inputs
)
if
isinstance
(
node
.
outputs
[
1
]
.
type
,
tensor
.
TensorType
):
g_wrt_scale
=
tensor
.
as_tensor_variable
(
g_wrt_scale
)
if
isinstance
(
node
.
outputs
[
2
]
.
type
,
tensor
.
TensorType
):
g_wrt_bias
=
tensor
.
as_tensor_variable
(
g_wrt_bias
)
# TODO copy_stack_trace?
return
[
g_wrt_inputs
,
g_wrt_scale
,
g_wrt_bias
]
@local_optimizer
([
bn
.
AbstractBatchNormInference
])
def
local_abstract_batch_norm_inference_cudnn
(
node
):
if
not
isinstance
(
node
.
op
,
bn
.
AbstractBatchNormInference
):
return
None
x
,
scale
,
bias
,
estimated_mean
,
estimated_variance
,
epsilon
=
node
.
inputs
axes
=
tuple
(
node
.
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
# input on gpu? TODO what about the output?
x_on_gpu
=
(
isinstance
(
x
.
type
,
CudaNdarrayType
)
or
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)))
if
not
x_on_gpu
:
return
None
try
:
eps
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
if
not
dnn_available
():
return
None
x
=
as_cuda_ndarray_variable
(
x
)
scale
=
as_cuda_ndarray_variable
(
scale
)
bias
=
as_cuda_ndarray_variable
(
bias
)
estimated_mean
=
as_cuda_ndarray_variable
(
estimated_mean
)
estimated_variance
=
as_cuda_ndarray_variable
(
estimated_variance
)
out
=
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
estimated_mean
,
estimated_variance
,
mode
,
eps
)
# If the original output was on CPU, we have to transfer it
# TODO copy_stack_trace?
if
isinstance
(
node
.
outputs
[
0
]
.
type
,
tensor
.
TensorType
):
return
[
tensor
.
as_tensor_variable
(
out
)]
else
:
return
[
out
]
theano/sandbox/cuda/opt.py
浏览文件 @
8b9f7336
...
...
@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm
,
30
,
'conv_gemm'
,
'gpu'
,
'fast_compile'
,
'fast_run'
)
# Register cuDNN batch normalization implementation
abstract_batch_norm_groupopt
=
theano
.
gof
.
optdb
.
LocalGroupDB
()
abstract_batch_norm_groupopt
.
__name__
=
"gpu_batchnorm_opts"
register_opt
(
'fast_compile'
)(
abstract_batch_norm_groupopt
)
# cuDNN optimizations are only registered if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from
.dnn
import
(
local_abstract_batch_norm_train_cudnn
,
local_abstract_batch_norm_train_grad_cudnn
,
local_abstract_batch_norm_inference_cudnn
)
# noqa: 402
abstract_batch_norm_groupopt
.
register
(
'local_abstract_batch_norm_train_dnn'
,
local_abstract_batch_norm_train_cudnn
,
20
,
'batchnorm_dnn'
,
'gpu'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
abstract_batch_norm_groupopt
.
register
(
'local_abstract_batch_norm_train_grad_dnn'
,
local_abstract_batch_norm_train_grad_cudnn
,
20
,
'batchnorm_dnn'
,
'gpu'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
abstract_batch_norm_groupopt
.
register
(
'local_abstract_batch_norm_inference_dnn'
,
local_abstract_batch_norm_inference_cudnn
,
20
,
'batchnorm_dnn'
,
'gpu'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
theano/sandbox/cuda/tests/test_dnn.py
浏览文件 @
8b9f7336
from
__future__
import
absolute_import
,
print_function
,
division
from
collections
import
OrderedDict
import
logging
import
os
import
sys
...
...
@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt
from
theano.tensor.signal.pool
import
pool_2d
,
pool_3d
from
theano.tensor.signal.pool
import
Pool
,
MaxPoolGrad
,
AveragePoolGrad
from
theano.tensor.nnet.abstract_conv
import
get_conv_output_shape
from
theano.tensor.nnet
import
bn
import
theano.sandbox.cuda.dnn
as
dnn
from
theano.sandbox.cuda.basic_ops
import
GpuAllocEmpty
,
gpu_alloc_empty
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
...
...
@@ -730,52 +732,201 @@ def test_batchnorm_train():
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
tensor6
=
T
.
TensorType
(
theano
.
config
.
floatX
,
(
False
,)
*
6
)
for
mode
in
(
'per-activation'
,
'spatial'
):
for
vartype
in
(
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
x
,
scale
,
bias
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
))
for
vartype
in
(
tensor6
,
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
x
,
scale
,
bias
,
running_mean
,
running_var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'running_mean'
,
'running_var'
))
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
# forward pass
out
,
x_mean
,
x_invstd
=
cuda
.
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
)
running_average_factor
=
0.3
# forward pass, direct interface
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
,
\
out_running_mean_gpu
,
out_running_var_gpu
=
\
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# forward pass, abstract interface
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
\
out_running_mean_abstract
,
out_running_var_abstract
=
\
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# reference forward pass
if
mode
==
'per-activation'
:
axes
=
(
0
,)
elif
mode
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
x_mean2
=
x
.
mean
(
axis
=
axes
,
keepdims
=
True
)
x_invstd2
=
T
.
inv
(
T
.
sqrt
(
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
+
eps
))
scale2
=
T
.
addbroadcast
(
scale
,
*
axes
)
bias2
=
T
.
addbroadcast
(
bias
,
*
axes
)
out2
=
(
x
-
x_mean2
)
*
(
scale2
*
x_invstd2
)
+
bias2
x_mean_ref
=
x
.
mean
(
axis
=
axes
,
keepdims
=
True
)
x_var_ref
=
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
x_invstd_ref
=
T
.
inv
(
T
.
sqrt
(
x_var_ref
+
eps
))
scale_ref
=
T
.
addbroadcast
(
scale
,
*
axes
)
bias_ref
=
T
.
addbroadcast
(
bias
,
*
axes
)
m
=
T
.
cast
(
T
.
prod
(
x
.
shape
)
/
T
.
prod
(
scale
.
shape
),
theano
.
config
.
floatX
)
out_ref
=
(
x
-
x_mean_ref
)
*
(
scale_ref
*
x_invstd_ref
)
+
bias_ref
out_running_mean_ref
=
running_mean
*
(
1
-
running_average_factor
)
+
\
x_mean_ref
*
running_average_factor
out_running_var_ref
=
running_var
*
(
1
-
running_average_factor
)
+
\
(
m
/
(
m
-
1
))
*
x_var_ref
*
running_average_factor
# backward pass
dy
=
vartype
(
'dy'
)
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out
:
dy
})
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_abstract
:
dy
})
# reference backward pass
grads
2
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out2
:
dy
})
grads
_ref
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_ref
:
dy
})
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out
,
x_mean
,
x_invstd
,
out2
,
x_mean2
,
x_invstd2
]
+
grads
+
grads2
,
mode
=
mode_with_gpu
)
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
,
out_running_mean_gpu
,
out_running_var_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
out_running_mean_abstract
,
out_running_var_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_ref
,
x_mean_ref
,
x_invstd_ref
,
out_running_mean_ref
,
out_running_var_ref
]
+
grads_ref
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormGrad
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
),
(
4
,
3
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
)):
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
,
5
),
(
4
,
3
,
1
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
,
5
)):
data_shape
=
data_shape
[:
ndim
]
param_shape
=
tuple
(
1
if
d
in
axes
else
s
for
d
,
s
in
enumerate
(
data_shape
))
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Dy
)
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Running_mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Running_var
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs_gpu
=
f_gpu
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
# compare outputs
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
0
+
3
])
# out
utt
.
assert_allclose
(
outputs
[
1
],
outputs
[
1
+
3
])
# mean
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
3
])
# invstd
utt
.
assert_allclose
(
outputs_gpu
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_gpu
[
1
],
outputs_ref
[
1
])
# mean
utt
.
assert_allclose
(
outputs_gpu
[
2
],
outputs_ref
[
2
])
# invstd
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
3
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs_gpu
[
4
]),
numpy
.
nan_to_num
(
outputs_ref
[
4
]))
# running_var
utt
.
assert_allclose
(
outputs_abstract
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_abstract
[
1
],
outputs_ref
[
1
])
# mean
utt
.
assert_allclose
(
outputs_abstract
[
2
],
outputs_ref
[
2
])
# invstd
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs_abstract
[
4
]),
numpy
.
nan_to_num
(
outputs_ref
[
4
]))
# running_var
# compare gradients
utt
.
assert_allclose
(
outputs
[
6
],
outputs
[
6
+
3
],
atol
=
1e-4
)
# dx
utt
.
assert_allclose
(
outputs
[
7
],
outputs
[
7
+
3
],
rtol
=
2e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs
[
8
],
outputs
[
8
+
3
])
# dbias
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
6
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
7
],
outputs_ref
[
7
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
6
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
7
],
outputs_ref
[
7
])
# dbias
def
test_dnn_batchnorm_train_without_running_averages
():
# compile and run batch_normalization_train without running averages
if
not
cuda
.
dnn
.
dnn_available
():
raise
SkipTest
(
cuda
.
dnn
.
dnn_available
.
msg
)
if
cuda
.
dnn
.
version
()
<
(
5000
,
5000
):
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
x
,
scale
,
bias
,
dy
=
T
.
tensor4
(
'x'
),
T
.
tensor4
(
'scale'
),
T
.
tensor4
(
'bias'
),
T
.
tensor4
(
'dy'
)
data_shape
=
(
5
,
10
,
30
,
25
)
param_shape
=
(
1
,
10
,
30
,
25
)
# forward pass
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
=
\
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
'per-activation'
)
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
=
\
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
'per-activation'
)
# backward pass
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_gpu
:
dy
})
# compile
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormGrad
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
f_gpu
(
X
,
Scale
,
Bias
,
Dy
)
f_abstract
(
X
,
Scale
,
Bias
,
Dy
)
def
test_dnn_batchnorm_train_inplace
():
# test inplace_running_mean and inplace_running_var
if
not
cuda
.
dnn
.
dnn_available
():
raise
SkipTest
(
cuda
.
dnn
.
dnn_available
.
msg
)
if
cuda
.
dnn
.
version
()
<
(
5000
,
5000
):
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
x
,
scale
,
bias
=
T
.
tensor4
(
'x'
),
T
.
tensor4
(
'scale'
),
T
.
tensor4
(
'bias'
)
data_shape
=
(
5
,
10
,
30
,
25
)
param_shape
=
(
1
,
10
,
30
,
25
)
running_mean
=
shared
(
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
),
broadcastable
=
(
True
,
False
,
False
,
False
))
running_var
=
shared
(
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
),
broadcastable
=
(
True
,
False
,
False
,
False
))
# forward pass
out
,
x_mean
,
x_invstd
,
new_running_mean
,
new_running_var
=
\
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
'per-activation'
,
epsilon
=
5e-3
,
running_average_factor
=
0.3
,
running_mean
=
running_mean
,
running_var
=
running_var
)
# update running averages
updates
=
OrderedDict
()
updates
[
running_mean
]
=
new_running_mean
updates
[
running_var
]
=
new_running_var
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
],
[
out
,
x_mean
,
x_invstd
],
updates
=
updates
,
mode
=
mode_with_gpu
)
# check for the inplace settings
nodes
=
[
n
for
n
in
f
.
maker
.
fgraph
.
toposort
()
if
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)]
assert
len
(
nodes
)
==
1
assert
nodes
[
0
]
.
op
.
inplace_running_mean
assert
nodes
[
0
]
.
op
.
inplace_running_var
assert
nodes
[
0
]
.
op
.
inplace_output
# run
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
f
(
X
,
Scale
,
Bias
)
def
test_batchnorm_inference
():
...
...
@@ -785,53 +936,160 @@ def test_batchnorm_inference():
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
tensor6
=
T
.
TensorType
(
theano
.
config
.
floatX
,
(
False
,)
*
6
)
for
mode
in
(
'per-activation'
,
'spatial'
):
for
vartype
in
(
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
x
,
scale
,
bias
,
mean
,
var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'mean'
,
'var'
))
for
vartype
in
(
tensor6
,
T
.
tensor5
,
T
.
tensor4
,
T
.
tensor3
,
T
.
matrix
,
T
.
vector
):
x
,
scale
,
bias
,
mean
,
var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'mean'
,
'var'
))
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
# forward pass
out
=
cuda
.
dnn
.
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# forward pass, direct interface
out_gpu
=
dnn
.
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# forward pass, abstract interface
out_abstract
=
bn
.
batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# reference forward pass
if
mode
==
'per-activation'
:
axes
=
(
0
,)
elif
mode
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
scale
2
,
bias2
,
mean2
,
var2
=
(
T
.
addbroadcast
(
t
,
*
axes
)
for
t
in
(
scale
,
bias
,
mean
,
var
))
out
2
=
(
x
-
mean2
)
*
(
scale2
/
T
.
sqrt
(
var2
+
eps
))
+
bias2
scale
_ref
,
bias_ref
,
mean_ref
,
var_ref
=
(
T
.
addbroadcast
(
t
,
*
axes
)
for
t
in
(
scale
,
bias
,
mean
,
var
))
out
_ref
=
(
x
-
mean_ref
)
*
(
scale_ref
/
T
.
sqrt
(
var_ref
+
eps
))
+
bias_ref
# backward pass
dy
=
vartype
(
'dy'
)
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out
:
dy
})
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_abstract
:
dy
})
# reference backward pass
grads
2
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out2
:
dy
})
grads
_ref
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_ref
:
dy
})
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out
,
out2
]
+
grads
+
grads2
,
mode
=
mode_with_gpu
)
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_ref
]
+
grads_ref
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormInference
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
),
(
4
,
3
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
)):
for
data_shape
in
((
10
,
20
,
30
,
40
,
10
,
5
),
(
4
,
3
,
1
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
,
5
)):
data_shape
=
data_shape
[:
ndim
]
param_shape
=
tuple
(
1
if
d
in
axes
else
s
for
d
,
s
in
enumerate
(
data_shape
))
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Var
=
numpy
.
random
.
rand
(
*
param_shape
)
.
astype
(
'float32'
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Var
=
numpy
.
random
.
rand
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs_gpu
=
f_gpu
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
# compare outputs
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
1
])
# out
utt
.
assert_allclose
(
outputs_gpu
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_abstract
[
0
],
outputs_ref
[
0
])
# out
# compare gradients
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
5
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs
[
3
],
outputs
[
3
+
5
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs
[
4
],
outputs
[
4
+
5
])
# dbias
utt
.
assert_allclose
(
outputs
[
5
],
outputs
[
5
+
5
])
# dmean
utt
.
assert_allclose
(
outputs
[
6
],
outputs
[
6
+
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
utt
.
assert_allclose
(
outputs_gpu
[
1
],
outputs_ref
[
1
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
2
],
outputs_ref
[
2
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
3
])
# dbias
utt
.
assert_allclose
(
outputs_gpu
[
4
],
outputs_ref
[
4
])
# dmean
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
utt
.
assert_allclose
(
outputs_abstract
[
1
],
outputs_ref
[
1
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
2
],
outputs_ref
[
2
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
4
],
outputs_ref
[
4
])
# dmean
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
def test_batchnorm_inference_inplace():
    # test inplace
    # Checks that dnn_batch_normalization_test compiles down to a single
    # GpuDnnBatchNormInference node with its inplace flag set, and that the
    # compiled function runs without error on random data.
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")

    utt.seed_rng()

    x, scale, bias, mean, var = (T.tensor4(n) for n in ('x', 'scale',
                                                        'bias', 'mean', 'var'))
    data_shape = (5, 10, 30, 25)
    # per-activation layout: parameters broadcast only along the batch axis
    param_shape = (1, 10, 30, 25)

    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function([x, scale, bias, mean, var], [out],
                        mode=mode_with_gpu)

    # check for the inplace settings
    nodes = [n for n in f.maker.fgraph.toposort()
             if isinstance(n.op, dnn.GpuDnnBatchNormInference)]
    assert len(nodes) == 1
    assert nodes[0].op.inplace

    # run (only checks that execution succeeds; output values are not compared)
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    # variance must be non-negative, hence rand() rather than randn()
    Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias, Mean, Var)
def test_dnn_batchnorm_valid_and_invalid_axes():
    # Checks that the abstract batch-normalization Ops are replaced by the
    # cuDNN Ops for supported axes configurations (per-activation, spatial)
    # and by some non-cuDNN implementation for unsupported ones.
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")

    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        x, scale, bias, mean, var, dy = (vartype(n) for n in ('x', 'scale',
                                                              'bias', 'mean',
                                                              'var', 'dy'))
        ndim = x.ndim

        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = bn.batch_normalization_train(
                x, scale, bias, axes)
            out_test = bn.batch_normalization_test(x, scale, bias, mean,
                                                   var, axes)
            # backward pass
            dy = vartype('dy')
            grads_train = T.grad(None, wrt=[x, scale, bias],
                                 known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var],
                                known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)
            if axes in valid_axes_lists:
                # check if the abstract Ops have been replaced by the cuDNN Ops
                assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                            for n in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                            for n in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference)
                            for n in f.maker.fgraph.toposort()])
                assert not any([isinstance(n.op,
                                           (bn.AbstractBatchNormTrain,
                                            bn.AbstractBatchNormInference,
                                            bn.AbstractBatchNormTrainGrad))
                                for n in f.maker.fgraph.toposort()])
            else:
                # check if the abstract Ops have been replaced, but not by the cuDNN Ops
                assert not any([isinstance(n.op,
                                           (dnn.GpuDnnBatchNorm,
                                            dnn.GpuDnnBatchNormGrad,
                                            bn.AbstractBatchNormTrain,
                                            bn.AbstractBatchNormInference,
                                            bn.AbstractBatchNormTrainGrad))
                                for n in f.maker.fgraph.toposort()])
def
test_dnn_tag
():
...
...
theano/tensor/nnet/bn.py
浏览文件 @
8b9f7336
from
__future__
import
absolute_import
,
print_function
,
division
import
numpy
import
theano
from
theano
import
Apply
,
Op
from
theano.gof
import
local_optimizer
from
theano.gof.opt
import
copy_stack_trace
from
theano.tensor
import
as_tensor_variable
,
TensorType
from
theano.tensor
import
basic
as
T
from
theano.tensor.opt
import
register_specialize_device
from
theano.scalar
import
Composite
from
theano.scalar
import
add
,
sub
,
true_div
,
mul
...
...
@@ -37,7 +44,7 @@ def batch_normalization(inputs, gamma, beta, mean, std,
"""
This function will build the symbolic graph for applying batch normalization
to a set of activations.
Also works on GPUs
Also works on GPUs
, but is not optimized using cuDNN.
.. versionadded:: 0.7.1
...
...
@@ -75,3 +82,631 @@ def batch_normalization(inputs, gamma, beta, mean, std,
raise
ValueError
(
'mode must be either "low_mem", "high_mem"'
)
return
rval
def
_prepare_batch_normalization_axes
(
axes
,
ndim
):
if
axes
==
'per-activation'
:
axes
=
(
0
,)
elif
axes
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
elif
isinstance
(
axes
,
(
tuple
,
list
,
numpy
.
ndarray
)):
axes
=
tuple
(
int
(
a
)
for
a
in
axes
)
else
:
raise
ValueError
(
'invalid axes:
%
s'
,
str
(
axes
))
axes
=
tuple
(
sorted
(
axes
))
if
len
(
axes
)
==
0
:
raise
ValueError
(
'there should be at least one normalization axis'
)
if
min
(
axes
)
<
0
or
max
(
axes
)
>=
ndim
:
raise
ValueError
(
'axes should be less than ndim (<
%
d), but
%
s given'
%
(
ndim
,
str
(
axes
)))
non_bc_axes
=
tuple
(
i
for
i
in
range
(
ndim
)
if
i
not
in
axes
)
return
axes
,
non_bc_axes
def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
                              epsilon=1e-4, running_average_factor=0.1,
                              running_mean=None, running_var=None):
    """
    Performs batch normalization of the given inputs, using the mean and
    variance of the inputs.

    Parameters
    ----------
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Learnable scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Learnable biases. Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values or `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly,
        if the factor is close to zero it will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.

    Returns
    -------
    out : tensor
        Batch-normalized inputs.
    mean : tensor
        Means of `inputs` across the normalization axes.
    invstd : tensor
        Inverse standard deviations of `inputs` across the normalization axes.
    new_running_mean : tensor
        New value of the running mean (only if both `running_mean` and
        `running_var` were given).
    new_running_var : tensor
        New value of the running variance (only if both `running_var` and
        `running_mean` were given).

    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)

    The returned values are equivalent to:

    .. code-block:: python

        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        mean = inputs.mean(axes, keepdims=True)
        var = inputs.var(axes, keepdims=True)
        invstd = T.inv(T.sqrt(var + epsilon))
        out = (inputs - mean) * gamma * invstd + beta

        m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
        running_mean = running_mean * (1 - running_average_factor) + \\
                       mean * running_average_factor
        running_var = running_var * (1 - running_average_factor) + \\
                      (m / (m - 1)) * var * running_average_factor
    """
    ndim = inputs.ndim
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)

    # have the parameter tensors been broadcasted yet?
    # If the parameters have fewer dims than the input, build a dimshuffle
    # pattern that inserts broadcastable ('x') dims at the normalized axes.
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ['x'] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i

    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError("gamma and beta dimensionality must match the "
                         "number of non-normalized axes, or have the "
                         "same number of dimensions as the inputs; "
                         "got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, params_ndim))
    if (running_mean is None) != (running_var is None):
        raise ValueError("running_mean and running_var must either both be "
                         "given or both be None")
    if running_mean is not None and running_mean.ndim != params_ndim:
        raise ValueError("running_mean must be of the same dimensionality "
                         "as gamma and beta; got %d instead of %d" %
                         (running_mean.ndim, params_ndim))
    if running_var is not None and running_var.ndim != params_ndim:
        raise ValueError("running_var must be of the same dimensionality "
                         "as gamma and beta; got %d instead of %d" %
                         (running_var.ndim, params_ndim))

    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = numpy.cast[theano.config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))

    inputs = as_tensor_variable(inputs)
    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)

    if params_ndim != ndim:
        # broadcast the parameters up to the input's dimensionality
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
    else:
        # parameters already have full rank; just mark the normalized
        # axes as broadcastable
        gamma = T.addbroadcast(gamma, *axes)
        beta = T.addbroadcast(beta, *axes)

    batchnorm_op = AbstractBatchNormTrain(axes=axes)
    if running_mean is not None and running_var is not None:
        running_mean = as_tensor_variable(running_mean)
        running_var = as_tensor_variable(running_var)
        if params_ndim != ndim:
            running_mean = running_mean.dimshuffle(params_dimshuffle_pattern)
            running_var = running_var.dimshuffle(params_dimshuffle_pattern)
        else:
            running_mean = T.addbroadcast(running_mean, *axes)
            running_var = T.addbroadcast(running_var, *axes)
        out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
            inputs, gamma, beta, epsilon=epsilon,
            running_average_factor=running_average_factor,
            running_mean=running_mean, running_var=running_var)
        # keep the running averages' broadcastable pattern identical to
        # their inputs', so the caller can reuse them for updates
        if new_running_mean.broadcastable != running_mean.broadcastable:
            new_running_mean = T.patternbroadcast(
                new_running_mean, running_mean.broadcastable)
        if new_running_var.broadcastable != running_var.broadcastable:
            new_running_var = T.patternbroadcast(
                new_running_var, running_var.broadcastable)
        results = (out, mean, invstd, new_running_mean, new_running_var)
    else:
        results = batchnorm_op(inputs, gamma, beta, epsilon=epsilon)

    if params_ndim != ndim:
        # remove the broadcasted dimensions (except from the output)
        results = ([results[0]] +
                   [r.dimshuffle(non_bc_axes) for r in results[1:]])
    return tuple(results)
def batch_normalization_test(inputs, gamma, beta, mean, var,
                             axes='per-activation', epsilon=1e-4):
    """
    Performs batch normalization of the given inputs, using the given mean and
    variance.

    Parameters
    ----------
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Biases. Must match the tensor layout of `gamma`.
    mean : tensor
        Means. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    var : tensor
        Variances. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).

    Returns
    -------
    out : tensor
        Batch-normalized inputs.

    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)

    The returned value is equivalent to:

    .. code-block:: python

        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        gamma, beta, mean, var = (T.addbroadcast(t, *axes)
                                  for t in (gamma, beta, mean, var))
        out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
    """
    ndim = inputs.ndim
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)

    # have the parameter tensors been broadcasted yet?
    # If not, build a dimshuffle pattern inserting broadcastable ('x')
    # dims at the normalized axes.
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ['x'] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i

    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError("gamma and beta dimensionality must match the "
                         "number of non-normalized axes, or have the "
                         "same number of dimensions as the inputs; "
                         "got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, params_ndim))
    if mean.ndim != params_ndim or var.ndim != params_ndim:
        raise ValueError("mean and var must be of the same dimensionality "
                         "as gamma and beta; got %d and %d instead of %d" %
                         (mean.ndim, var.ndim, params_ndim))

    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = numpy.cast[theano.config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))

    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)
    mean = as_tensor_variable(mean)
    var = as_tensor_variable(var)

    if params_ndim != ndim:
        # broadcast parameters up to the input's dimensionality
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
        mean = mean.dimshuffle(params_dimshuffle_pattern)
        var = var.dimshuffle(params_dimshuffle_pattern)
    else:
        # full-rank parameters; mark the normalized axes as broadcastable
        gamma = T.addbroadcast(gamma, *axes)
        beta = T.addbroadcast(beta, *axes)
        mean = T.addbroadcast(mean, *axes)
        var = T.addbroadcast(var, *axes)

    batchnorm_op = AbstractBatchNormInference(axes=axes)
    return batchnorm_op(inputs, gamma, beta, mean, var, epsilon=epsilon)
class AbstractBatchNormTrain(Op):
    """
    Abstract Op for Batch Normalization.

    This Op computes the normalized output plus the batch mean and inverse
    standard deviation; device-specific optimizations replace it with a
    concrete implementation (e.g. cuDNN) or with the pure-Theano expression
    graph in `local_abstract_batch_norm_train`.

    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input should be normalized.
    x : tensor
        The input to be normalized along `axes`.
    scale : tensor
        `scale` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    bias : tensor
        `bias` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values or `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly,
        if the factor is close to zero it will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - running_average_factor) + batch mean * running_average_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - running_average_factor) + (m / (m - 1)) * batch var * running_average_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None.
    """

    __props__ = ('axes',)

    def __init__(self, axes=(0,)):
        # Normalize axes to a canonical tuple of ints (it is part of
        # __props__, hence of Op equality/hashing).
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes

    def infer_shape(self, node, shape):
        # Output 0 has the input's shape; all remaining outputs (mean,
        # invstd, and optionally the running averages) have scale's shape.
        return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)

    def make_node(self, x, scale, bias, epsilon=1e-4,
                  running_average_factor=0.1,
                  running_mean=None, running_var=None):
        assert x.ndim == scale.ndim == bias.ndim
        # running_mean and running_var must be given together or not at all
        assert ((running_mean is None and running_var is None) or
                (running_mean is not None and running_var is not None))
        assert (running_mean is None or running_mean.ndim == x.ndim)
        assert (running_var is None or running_var.ndim == x.ndim)
        # epsilon and running_average_factor become symbolic inputs so that
        # they can be changed without recompiling
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        if not isinstance(running_average_factor, theano.Variable):
            running_average_factor = as_tensor_variable(running_average_factor)
        inputs = [x, scale, bias, epsilon, running_average_factor]
        output_types = [x.type(), scale.type(), scale.type()]
        if running_mean is not None and running_var is not None:
            inputs.append(running_mean)
            inputs.append(running_var)
            output_types.append(scale.type())
            output_types.append(scale.type())
        return Apply(self, inputs, output_types)

    def L_op(self, inputs, outputs, grads):
        # Only the gradient through the normalized output (grads[0]) is
        # propagated; the mean/invstd outputs reuse the forward results.
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        dy = grads[0]
        _, x_mean, x_invstd = outputs[:3]
        disconnected_outputs = [
            theano.gradient.DisconnectedType()(),   # epsilon
            theano.gradient.DisconnectedType()()]   # running_average_factor
        # Optional running_mean and running_var.
        for i in range(5, len(inputs)):
            disconnected_outputs.append(theano.gradient.DisconnectedType()())
        return AbstractBatchNormTrainGrad(self.axes)(
            x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs

    def connection_pattern(self, node):
        # Specificy that epsilon and running_average_factor are not connected to outputs.
        patterns = [[True, True, True],     # x
                    [True, True, True],     # scale
                    [True, True, True],     # bias
                    [False, False, False],  # epsilon
                    [False, False, False]]  # running_average_factor
        # Optional running_mean and running_var are only
        # connected to their new values.
        # Each iteration adds one output column (for the new running value)
        # and one input row (for the old running value); only x and the
        # matching old running value connect to that new output.
        for i in range(5, len(node.inputs)):
            patterns[0].append(True)
            for pattern in patterns[1:]:
                pattern.append(False)
            patterns.append([False] * (3 + i - 5) + [True])
        return patterns

    def perform(self, node, inputs, output_storage):
        # Reference (NumPy) implementation; optimizations normally replace
        # this Op before execution.
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' %
                             (x.ndim, str(axes)))

        mean = x.mean(axes, keepdims=True)
        var = x.var(axes, keepdims=True)
        invstd = 1.0 / numpy.sqrt(var + epsilon)
        out = (x - mean) * (scale * invstd) + bias

        output_storage[0][0] = out
        output_storage[1][0] = mean
        output_storage[2][0] = invstd

        if len(inputs) > 5:
            running_mean = inputs[5]
            running_mean = running_mean * (1.0 - running_average_factor) + \
                mean * running_average_factor
            output_storage[3][0] = running_mean
        if len(inputs) > 6:
            # m: number of elements averaged over; the (m / (m - 1)) factor
            # converts the biased batch variance to an unbiased estimate
            m = float(numpy.prod(x.shape) / numpy.prod(scale.shape))
            running_var = inputs[6]
            running_var = running_var * (1.0 - running_average_factor) + \
                (m / (m - 1)) * var * running_average_factor
            output_storage[4][0] = running_var
class AbstractBatchNormInference(Op):
    """
    Abstract Op for Batch Normalization.

    Normalizes the input using externally supplied (e.g. running) mean and
    variance instead of batch statistics; device-specific optimizations
    replace it with a concrete implementation.

    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input is normalized.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    """

    __props__ = ('axes',)

    def __init__(self, axes=(0,)):
        # Normalize axes to a canonical tuple of ints (part of __props__).
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes

    def infer_shape(self, node, shape):
        # The single output has the same shape as the input x.
        return [shape[0]]

    def make_node(self, x, scale, bias, estimated_mean, estimated_variance,
                  epsilon=1e-4):
        assert x.ndim == scale.ndim == bias.ndim == estimated_mean.ndim == \
            estimated_variance.ndim
        # epsilon becomes a symbolic input so it can change without recompiling
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        return Apply(self, [x, scale, bias, estimated_mean,
                            estimated_variance, epsilon], [x.type()])

    def grad(self, inputs, grads):
        x, scale, bias, est_mean, est_var, epsilon = inputs
        dy = grads[0]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' %
                             (x.ndim, str(axes)))

        scale, bias, est_mean, est_var = (theano.tensor.addbroadcast(t, *axes)
                                          for t in (scale, bias, est_mean, est_var))

        # define helper expressions
        est_var_eps = est_var + epsilon
        est_std = theano.tensor.sqrt(est_var_eps)
        two = theano.tensor.constant(2.)

        # define and return gradients
        dx = dy * (scale / est_std)
        dscale = (dy * (x - est_mean)).sum(axes, keepdims=True) / est_std
        dbias = dy.sum(axes, keepdims=True)
        dmean = -dy.sum(axes, keepdims=True) * (scale / est_std)
        dvar = -(dy * (x - est_mean)).sum(axes, keepdims=True) * \
            (scale / (two * est_var_eps * est_std))
        # epsilon is disconnected from the output
        return [dx, dscale, dbias, dmean, dvar,
                theano.gradient.DisconnectedType()()]

    def connection_pattern(self, node):
        # Specificy that epsilon is not connected to outputs.
        return [[True], [True], [True], [True], [True], [False]]

    def perform(self, node, inputs, output_storage):
        # Reference (NumPy) implementation:
        # out = (x - mean) * scale / sqrt(var + eps) + bias
        x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
        out = (x - estimated_mean) * \
            (scale / numpy.sqrt(estimated_variance + epsilon)) + bias
        output_storage[0][0] = out
class AbstractBatchNormTrainGrad(Op):
    """
    Abstract Op computing the gradients of AbstractBatchNormTrain with
    respect to x, scale and bias, given the upstream gradient dy and the
    forward pass's batch mean and inverse standard deviation.
    """

    __props__ = ('axes',)

    def __init__(self, axes=(0,)):
        # Normalize axes to a canonical tuple of ints (part of __props__).
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes

    def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):
        assert x.ndim == dy.ndim == scale.ndim == x_mean.ndim == x_invstd.ndim
        # epsilon becomes a symbolic input so it can change without recompiling
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        # outputs: d_x (shape of x), d_scale and d_bias (shape of scale)
        return Apply(self, [x, dy, scale, x_mean, x_invstd, epsilon],
                     [x.type(), scale.type(), scale.type()])

    def infer_shape(self, node, shape):
        return [shape[0], shape[2], shape[2]]

    def perform(self, node, inputs, output_storage):
        # Reference (NumPy) implementation of the batch-norm backward pass.
        x, dy, scale, x_mean, x_invstd, epsilon = inputs
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' %
                             (x.ndim, str(axes)))

        x_diff = x - x_mean
        mean_dy_x_diff = numpy.mean(dy * x_diff, axis=axes, keepdims=True)
        # c is dy propagated through the normalization, minus the term from
        # the variance's dependence on x
        c = (dy * x_invstd) - (x_diff * mean_dy_x_diff * (x_invstd ** 3))

        g_wrt_inputs = scale * (c - numpy.mean(c, axis=axes, keepdims=True))
        g_wrt_scale = numpy.sum(dy * x_invstd * x_diff, axis=axes,
                                keepdims=True)
        g_wrt_bias = numpy.sum(dy, axis=axes, keepdims=True)

        output_storage[0][0] = g_wrt_inputs
        output_storage[1][0] = g_wrt_scale
        output_storage[2][0] = g_wrt_bias
@local_optimizer([AbstractBatchNormTrain])
def local_abstract_batch_norm_train(node):
    # CPU fallback: replace AbstractBatchNormTrain by an equivalent graph of
    # elementwise/reduction Ops when all inputs are plain TensorTypes.
    if not isinstance(node.op, AbstractBatchNormTrain):
        return None

    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    axes = node.op.axes
    # NOTE(review): this bound check uses `> x.ndim` where the Op's perform()
    # uses `>= x.ndim` — confirm whether the asymmetry is intentional.
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if not isinstance(x.type, TensorType) or \
       not isinstance(scale.type, TensorType) or \
       not isinstance(bias.type, TensorType) or \
       not isinstance(epsilon.type, TensorType) or \
       not isinstance(running_average_factor.type, TensorType):
        return None
    # optional running_mean and running_var
    if len(node.inputs) > 5 and not isinstance(node.inputs[5].type, TensorType):
        return None
    if len(node.inputs) > 6 and not isinstance(node.inputs[6].type, TensorType):
        return None

    # symbolic equivalent of AbstractBatchNormTrain.perform()
    mean = x.mean(axes, keepdims=True)
    var = x.var(axes, keepdims=True)
    invstd = T.inv(T.sqrt(var + epsilon))
    out = (x - mean) * (scale * invstd) + bias
    results = [out, mean, invstd]

    if len(node.inputs) > 5:
        running_mean = node.inputs[5]
        running_mean = running_mean * (1.0 - running_average_factor) + \
            mean * running_average_factor
        results.append(running_mean)
    if len(node.inputs) > 6:
        # m: element count averaged over; (m / (m - 1)) gives the unbiased
        # variance estimate for the running average
        m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
        running_var = node.inputs[6]
        running_var = running_var * (1.0 - running_average_factor) + \
            (m / (m - 1)) * var * running_average_factor
        results.append(running_var)

    # match the broadcastable patterns of the outputs being replaced
    results = [T.patternbroadcast(r, r_orig.broadcastable)
               for (r, r_orig) in zip(results, node.outputs)]

    # propagate the stack trace to every newly introduced variable
    for var in theano.gof.graph.variables(node.inputs, results):
        if var not in node.inputs:
            copy_stack_trace(node.outputs[0], var)
    return results
@local_optimizer([AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad(node):
    # CPU fallback: replace AbstractBatchNormTrainGrad by the equivalent
    # elementwise/reduction graph when all inputs are plain TensorTypes.
    if not isinstance(node.op, AbstractBatchNormTrainGrad):
        return None

    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    axes = node.op.axes
    # NOTE(review): this bound check uses `> x.ndim` where the Op's perform()
    # uses `>= x.ndim` — confirm whether the asymmetry is intentional.
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if not isinstance(x.type, TensorType) or \
       not isinstance(dy.type, TensorType) or \
       not isinstance(scale.type, TensorType) or \
       not isinstance(x_mean.type, TensorType) or \
       not isinstance(x_invstd.type, TensorType) or \
       not isinstance(epsilon.type, TensorType):
        return None

    # symbolic equivalent of AbstractBatchNormTrainGrad.perform()
    x_diff = x - x_mean
    mean_dy_x_diff = T.mean(dy * x_diff, axis=axes, keepdims=True)
    c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd ** 3))

    g_wrt_inputs = scale * (c - T.mean(c, axis=axes, keepdims=True))
    g_wrt_scale = T.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
    g_wrt_bias = T.sum(dy, axis=axes, keepdims=True)
    results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]

    # match the broadcastable patterns of the outputs being replaced
    results = [T.patternbroadcast(r, r_orig.broadcastable)
               for (r, r_orig) in zip(results, node.outputs)]

    # propagate the stack trace to every newly introduced variable
    for var in theano.gof.graph.variables(node.inputs, results):
        if var not in node.inputs:
            copy_stack_trace(node.outputs[0], var)
    return results
@local_optimizer([AbstractBatchNormInference])
def local_abstract_batch_norm_inference(node):
    # CPU fallback: replace AbstractBatchNormInference by the equivalent
    # elementwise expression when all inputs are plain TensorTypes.
    if not isinstance(node.op, AbstractBatchNormInference):
        return None

    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs

    if not isinstance(x.type, TensorType) or \
       not isinstance(scale.type, TensorType) or \
       not isinstance(bias.type, TensorType) or \
       not isinstance(estimated_mean.type, TensorType) or \
       not isinstance(estimated_variance.type, TensorType) or \
       not isinstance(epsilon.type, TensorType):
        return None

    # symbolic equivalent of AbstractBatchNormInference.perform()
    result = (x - estimated_mean) * \
        (scale / T.sqrt(estimated_variance + epsilon)) + bias
    # match the broadcastable pattern of the output being replaced
    result = T.patternbroadcast(result, node.outputs[0].broadcastable)

    # propagate the stack trace to every newly introduced variable
    for var in theano.gof.graph.variables(node.inputs, [result]):
        if var not in node.inputs:
            copy_stack_trace(node.outputs[0], var)
    return [result]
# Register Cpu Optmization
# Group the three fallback optimizers into one LocalGroupDB and register it
# for both fast_compile and fast_run, so the abstract Ops are always
# replaced during compilation (device-specific DBs can override these).
bn_groupopt = theano.gof.optdb.LocalGroupDB()
bn_groupopt.__name__ = 'batchnorm_opts'
register_specialize_device(bn_groupopt, 'fast_compile', 'fast_run')

bn_groupopt.register('local_abstract_batch_norm_train',
                     local_abstract_batch_norm_train, 30,
                     'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_train_grad',
                     local_abstract_batch_norm_train_grad, 30,
                     'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_inference',
                     local_abstract_batch_norm_inference, 30,
                     'fast_compile', 'fast_run')
theano/tensor/nnet/tests/test_bn.py
浏览文件 @
8b9f7336
from
__future__
import
absolute_import
,
print_function
,
division
import
theano
import
theano.tensor
as
T
from
theano.tests
import
unittest_tools
as
utt
import
numpy
from
theano.tensor.nnet
.bn
import
batch_normalizatio
n
from
theano.tensor.nnet
import
b
n
def
test_BNComposite
():
...
...
@@ -39,7 +40,7 @@ def test_BNComposite():
f_ref
=
theano
.
function
([
x
,
b
,
g
,
m
,
v
],
[
bn_ref_op
])
res_ref
=
f_ref
(
X
,
G
,
B
,
M
,
V
)
for
mode
in
[
'low_mem'
,
'high_mem'
]:
bn_op
=
batch_normalization
(
x
,
g
,
b
,
m
,
v
,
mode
=
mode
)
bn_op
=
b
n
.
b
atch_normalization
(
x
,
g
,
b
,
m
,
v
,
mode
=
mode
)
f
=
theano
.
function
([
x
,
b
,
g
,
m
,
v
],
[
bn_op
])
res
=
f
(
X
,
G
,
B
,
M
,
V
)
utt
.
assert_allclose
(
res_ref
,
res
)
...
...
@@ -47,7 +48,7 @@ def test_BNComposite():
theano
.
config
.
compute_test_value
=
orig
def
test_bn
():
def
test_b
atch_normalizatio
n
():
def
bn_ref
(
x
,
G
,
B
,
M
,
V
):
n
=
(
x
-
M
)
/
V
...
...
@@ -70,28 +71,28 @@ def test_bn():
f_ref
=
theano
.
function
([
x
,
b
,
g
,
m
,
v
],
[
bn_ref_op
])
res_ref
=
f_ref
(
X
,
G
,
B
,
M
,
V
)
for
mode
in
[
'low_mem'
,
'high_mem'
]:
bn_op
=
batch_normalization
(
x
,
g
,
b
,
m
,
v
,
mode
=
mode
)
bn_op
=
b
n
.
b
atch_normalization
(
x
,
g
,
b
,
m
,
v
,
mode
=
mode
)
f
=
theano
.
function
([
x
,
b
,
g
,
m
,
v
],
[
bn_op
])
res
=
f
(
X
,
G
,
B
,
M
,
V
)
utt
.
assert_allclose
(
res_ref
,
res
)
def
bn
(
inputs
,
gamma
,
beta
,
mean
,
std
):
return
batch_normalization
(
inputs
,
gamma
,
beta
,
mean
,
std
,
mode
=
mode
)
utt
.
verify_grad
(
bn
,
[
X
,
G
,
B
,
M
,
V
])
def
bn
_f
(
inputs
,
gamma
,
beta
,
mean
,
std
):
return
b
n
.
b
atch_normalization
(
inputs
,
gamma
,
beta
,
mean
,
std
,
mode
=
mode
)
utt
.
verify_grad
(
bn
_f
,
[
X
,
G
,
B
,
M
,
V
])
bn_ref_op
=
bn_ref
(
x
,
g
,
b
,
x
.
mean
(
axis
=
0
,
keepdims
=
True
),
x
.
std
(
axis
=
0
,
keepdims
=
True
))
f_ref
=
theano
.
function
([
x
,
b
,
g
],
[
bn_ref_op
])
res_ref
=
f_ref
(
X
,
G
,
B
)
for
mode
in
[
'low_mem'
,
'high_mem'
]:
bn_op
=
batch_normalization
(
x
,
g
,
b
,
x
.
mean
(
axis
=
0
,
keepdims
=
True
),
x
.
std
(
axis
=
0
,
keepdims
=
True
),
mode
=
mode
)
bn_op
=
b
n
.
b
atch_normalization
(
x
,
g
,
b
,
x
.
mean
(
axis
=
0
,
keepdims
=
True
),
x
.
std
(
axis
=
0
,
keepdims
=
True
),
mode
=
mode
)
f
=
theano
.
function
([
x
,
b
,
g
],
[
bn_op
])
res
=
f
(
X
,
G
,
B
)
utt
.
assert_allclose
(
res_ref
,
res
)
def
bn
(
inputs
,
gamma
,
beta
,
mean
,
std
):
return
batch_normalization
(
inputs
,
gamma
,
beta
,
mean
,
std
,
mode
=
mode
)
utt
.
verify_grad
(
b
atch_normalization
,
[
X
,
G
,
B
,
X
.
mean
(
axis
=
0
)[
numpy
.
newaxis
],
X
.
std
(
axis
=
0
)[
numpy
.
newaxis
]])
def
bn
_f
(
inputs
,
gamma
,
beta
,
mean
,
std
):
return
b
n
.
b
atch_normalization
(
inputs
,
gamma
,
beta
,
mean
,
std
,
mode
=
mode
)
utt
.
verify_grad
(
b
n_f
,
[
X
,
G
,
B
,
X
.
mean
(
axis
=
0
)[
numpy
.
newaxis
],
X
.
std
(
axis
=
0
)[
numpy
.
newaxis
]])
def
test_bn_feature_maps
():
...
...
@@ -122,21 +123,296 @@ def test_bn_feature_maps():
res_ref
=
f_ref
(
X
,
G
,
B
,
M
,
V
)
for
mode
in
[
'low_mem'
,
'high_mem'
]:
bn_op
=
batch_normalization
(
x
,
g
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
b
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
m
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
v
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
mode
=
mode
)
bn_op
=
b
n
.
b
atch_normalization
(
x
,
g
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
b
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
m
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
v
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
mode
=
mode
)
f
=
theano
.
function
([
x
,
b
,
g
,
m
,
v
],
[
bn_op
])
res
=
f
(
X
,
G
,
B
,
M
,
V
)
utt
.
assert_allclose
(
res_ref
,
res
)
def
conv_bn
(
inputs
,
gamma
,
beta
,
mean
,
std
):
return
batch_normalization
(
inputs
,
gamma
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
beta
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
mean
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
std
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
mode
=
mode
)
return
b
n
.
b
atch_normalization
(
inputs
,
gamma
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
beta
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
mean
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
std
.
dimshuffle
(
'x'
,
0
,
'x'
,
'x'
),
mode
=
mode
)
utt
.
verify_grad
(
conv_bn
,
[
X
,
G
,
B
,
M
,
V
])
def test_batch_normalization_train():
    """Compare batch_normalization_train (and its gradients) against a
    reference implementation built from elementwise Theano ops, over all
    supported axes specifications and input ranks 1-5."""
    utt.seed_rng()

    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, running_mean, running_var = (vartype(n)
                                                         for n in ('x', 'scale', 'bias',
                                                                   'running_mean',
                                                                   'running_var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3
            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue

            # forward pass
            out, x_mean, x_invstd, out_running_mean, out_running_var = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # reference forward pass: translate the symbolic axes spec into an
            # explicit tuple of reduction axes
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # m is the number of elements averaged per statistic; the running
            # variance uses the unbiased (m / (m - 1)) correction
            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_running_mean2 = running_mean * (1 - running_average_factor) + \
                x_mean2 * running_average_factor
            out_running_var2 = running_var * (1 - running_average_factor) + \
                (m / (m - 1)) * x_var2 * running_average_factor
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, running_mean, running_var, dy],
                                [out, x_mean, x_invstd,
                                 out_running_mean, out_running_var,
                                 out2, x_mean2, x_invstd2,
                                 out_running_mean2, out_running_var2] +
                                grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run over shapes exercising degenerate (size-1) dimensions too
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)
                # compare outputs (indices 0-4 are the tested impl, 5-9 the reference)
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                # nan_to_num: when m == 1 the unbiased correction divides by zero
                utt.assert_allclose(numpy.nan_to_num(outputs[4]),
                                    numpy.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients (indices 10-12 tested, 13-15 reference)
                utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4)  # dx
                utt.assert_allclose(outputs[11], outputs[11 + 3],
                                    rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
def test_batch_normalization_train_without_running_averages():
    """Compile and run batch_normalization_train when no running averages
    are requested (only out, mean and inverse stddev are returned)."""
    utt.seed_rng()

    x = T.tensor4('x')
    scale = T.tensor4('scale')
    bias = T.tensor4('bias')
    dy = T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass
    out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias,
                                                         'per-activation')
    # backward pass
    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
    # compile
    f = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads)
    # the abstract Ops must have been replaced by the optimizer
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    assert all(not isinstance(node.op, abstract_ops)
               for node in f.maker.fgraph.toposort())
    # run on random data; we only check that execution succeeds
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias, Dy)
def test_batch_normalization_train_broadcast():
    """Check that batch_normalization_train/test give identical results when
    the parameter tensors are passed pre-broadcasted (via dimshuffle) versus
    non-broadcasted, for all axes specifications and input ranks 1-5."""
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x = vartype('x')
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue

            # convert axes to explicit list
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes

            # compute axes for parameter tensors: non_bc_axes are the data
            # dimensions the (lower-rank) parameters actually span
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ['x'] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i

            # construct non-broadcasted parameter variables
            param_type = T.TensorType(x.dtype, (False,) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (param_type(n)
                                                      for n in ('scale', 'bias',
                                                                'running_mean',
                                                                'running_var'))

            # broadcast parameter variables up to the rank of x
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)

            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # batch_normalization_train with broadcasted variables
            train_bc = \
                bn.batch_normalization_train(
                    x, scale_bc, bias_bc, axes, eps,
                    running_average_factor, running_mean_bc, running_var_bc)
            # drop the broadcast dimensions again so the two result sets
            # have matching ranks
            train_bc = tuple([train_bc[0]] +  # out
                             [r.dimshuffle(non_bc_axes) for r in train_bc[1:]])

            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = \
                bn.batch_normalization_test(
                    x, scale, bias, running_mean, running_var, axes, eps)
            # batch_normalization_test with broadcasted variables
            test_bc = \
                bn.batch_normalization_test(
                    x, scale_bc, bias_bc, running_mean_bc, running_var_bc,
                    axes, eps)

            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc,)
            results_bc = train_bc + (test_bc,)
            results = [abs(r - r_bc) for (r, r_bc) in zip(results_non_bc,
                                                          results_bc)]

            # compile to compute all differences
            f = theano.function([x, scale, bias, running_mean, running_var],
                                T.sum(sum(results)))

            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if theano.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, theano.compile.DeepCopyOp)
            inputs = [numpy.asarray(numpy.random.rand(*((4,) * n)), x.dtype)
                      for n in [x.ndim, scale.ndim, bias.ndim,
                                running_mean.ndim, running_var.ndim]]
            assert 0.0 == f(*inputs)
def test_batch_normalization_test():
    """Compare batch_normalization_test (inference mode) and its gradients
    against a reference elementwise implementation, over all supported axes
    specifications and input ranks 1-5."""
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, mean, var = (vartype(n)
                                         for n in ('x', 'scale', 'bias',
                                                   'mean', 'var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue

            # forward pass
            out = bn.batch_normalization_test(x, scale, bias, mean, var,
                                              axes, eps)
            # reference forward pass: translate the symbolic axes spec into an
            # explicit tuple of reduction axes
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes2)
                                          for t in (scale, bias, mean, var))
            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias, mean, var],
                           known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias, mean, var],
                            known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, out2] + grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run over shapes exercising degenerate (size-1) dimensions too
            for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1),
                               (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                # variance drawn from rand() so it is non-negative
                Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Mean, Var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients (indices 2-6 tested, 7-11 reference)
                utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5)  # dx
                utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5)  # dscale
                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
                utt.assert_allclose(outputs[6], outputs[6 + 5],
                                    rtol=2e-3, atol=4e-5)  # dvar
def test_batch_normalization_broadcastable():
    """Check that the broadcastable pattern is preserved by the optimizations:
    compiling with fully broadcastable inputs must still eliminate every
    abstract batch-norm Op from the graph."""
    names = ('x', 'dy', 'scale', 'bias', 'mean', 'var')
    # fully broadcastable 5-d variables built from scalars
    x, dy, scale, bias, mean, var = [T.scalar(name).dimshuffle(['x'] * 5)
                                     for name in names]

    # forward pass
    out_train, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias,
                                                               'spatial')
    out_test = bn.batch_normalization_test(x, scale, bias, mean, var,
                                           'spatial')
    # backward pass
    grads_train = T.grad(None, wrt=[x, scale, bias],
                         known_grads={out_train: dy})
    grads_test = T.grad(None, wrt=[x, scale, bias],
                        known_grads={out_test: dy})
    # compile
    f = theano.function([x, scale, bias, mean, var, dy],
                        [out_train, x_mean, x_invstd, out_test] +
                        grads_train + grads_test)
    # no abstract Op may survive optimization
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    assert not any(isinstance(node.op, abstract_ops)
                   for node in f.maker.fgraph.toposort())
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论