Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
43411345
提交
43411345
authored
12月 03, 2016
作者:
Gijs van Tulder
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Batch normalization optimizations for old gpu backend.
上级
ef21cb58
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
846 行增加
和
86 行删除
+846
-86
dnn.py
theano/sandbox/cuda/dnn.py
+507
-30
opt.py
theano/sandbox/cuda/opt.py
+25
-0
test_dnn.py
theano/sandbox/cuda/tests/test_dnn.py
+314
-56
没有找到文件。
theano/sandbox/cuda/dnn.py
浏览文件 @
43411345
...
@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
...
@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
assert_conv_shape
)
assert_conv_shape
)
from
theano.tensor.signal.pool
import
(
from
theano.tensor.signal.pool
import
(
Pool
,
MaxPoolGrad
,
AveragePoolGrad
)
Pool
,
MaxPoolGrad
,
AveragePoolGrad
)
from
theano.tensor.nnet
import
bn
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda
import
GpuOp
,
dnn_available
from
theano.sandbox.cuda
import
GpuOp
,
dnn_available
...
@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase):
...
@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase):
epsilon
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly;
if the factor is close to zero, they will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
"""
"""
__props__
=
(
'mode'
,
'epsilon'
)
__props__
=
(
'mode'
,
'epsilon'
)
...
@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s;
...
@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s;
result
=
"""
result
=
"""
cudnnStatus_t err
%(name)
s;
cudnnStatus_t err
%(name)
s;
cudnnBatchNormMode_t mode
%(name)
s =
%(mode)
s;
cudnnBatchNormMode_t mode
%(name)
s =
%(mode)
s;
double exponentialAverageFactor
%(name)
s =
%(exp_avg_factor)
f;
double epsilon
%(name)
s =
%(epsilon)
e;
double epsilon
%(name)
s =
%(epsilon)
e;
"""
%
dict
(
name
=
name
,
"""
%
dict
(
name
=
name
,
mode
=
mode
,
mode
=
mode
,
exp_avg_factor
=
0
,
# deliberately unused
epsilon
=
self
.
epsilon
)
epsilon
=
self
.
epsilon
)
return
result
return
result
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
3
,
version
())
return
(
4
,
version
())
class
GpuDnnBatchNormInference
(
GpuDnnBatchNormBase
):
class
GpuDnnBatchNormInference
(
GpuDnnBatchNormBase
):
...
@@ -2422,8 +2438,21 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
...
@@ -2422,8 +2438,21 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
Note: scale, bias, mean and variance must follow the same tensor layout!
Note: scale, bias, mean and variance must follow the same tensor layout!
"""
"""
__props__
=
(
'mode'
,
'epsilon'
,
'inplace'
)
tensor_descs
=
[
'bn_input'
,
'bn_output'
,
'bn_params'
]
tensor_descs
=
[
'bn_input'
,
'bn_output'
,
'bn_params'
]
def
__init__
(
self
,
mode
=
'per-activation'
,
epsilon
=
1e-4
,
inplace
=
False
):
super
(
GpuDnnBatchNormInference
,
self
)
.
__init__
(
mode
=
mode
,
epsilon
=
epsilon
)
self
.
inplace
=
inplace
if
self
.
inplace
:
self
.
destroy_map
=
{
0
:
[
0
]}
def
get_op_params
(
self
):
params
=
[]
if
self
.
inplace
:
params
.
append
((
'INPLACE_OUTPUT'
,
'1'
))
return
params
def
infer_shape
(
self
,
node
,
shape
):
def
infer_shape
(
self
,
node
,
shape
):
# output shape equals shape of x
# output shape equals shape of x
return
[
shape
[
0
]]
return
[
shape
[
0
]]
...
@@ -2460,10 +2489,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0)
...
@@ -2460,10 +2489,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0)
}
}
// build and prepare the output variable
// build and prepare the output variable
#ifdef INPLACE_OUTPUT
Py_XDECREF(
%(outp)
s);
%(outp)
s =
%(inp)
s;
Py_INCREF(
%(outp)
s);
#else
if (CudaNdarray_prep_output(&
%(outp)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(inp)
s)) != 0)
if (CudaNdarray_prep_output(&
%(outp)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(inp)
s)) != 0)
{
{
%(fail)
s
%(fail)
s
}
}
#endif
// set output tensor descriptor from output tensor
// set output tensor descriptor from output tensor
if (c_set_tensorNd(
%(outp)
s, bn_output_
%(name)
s) != 0)
if (c_set_tensorNd(
%(outp)
s, bn_output_
%(name)
s) != 0)
...
@@ -2494,6 +2529,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference(
...
@@ -2494,6 +2529,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference(
"""
%
dict
(
name
=
name
,
inp
=
inp
,
scale
=
scale
,
bias
=
bias
,
est_mean
=
est_mean
,
"""
%
dict
(
name
=
name
,
inp
=
inp
,
scale
=
scale
,
bias
=
bias
,
est_mean
=
est_mean
,
est_var
=
est_var
,
outp
=
outp
,
fail
=
sub
[
'fail'
])
est_var
=
est_var
,
outp
=
outp
,
fail
=
sub
[
'fail'
])
# add params
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
,
check_input
=
False
)
result
=
"""
%(define_macros)
s
{
%(code)
s
}
%(undef_macros)
s
"""
%
dict
(
code
=
result
,
define_macros
=
define_macros
,
undef_macros
=
undef_macros
)
return
result
return
result
def
grad
(
self
,
inputs
,
grads
):
def
grad
(
self
,
inputs
,
grads
):
...
@@ -2537,28 +2582,84 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase):
...
@@ -2537,28 +2582,84 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase):
Note: scale and bias must follow the same tensor layout!
Note: scale and bias must follow the same tensor layout!
"""
"""
__props__
=
(
'mode'
,
'epsilon'
,
'running_averages'
,
'inplace_running_mean'
,
'inplace_running_var'
,
'inplace_output'
)
tensor_descs
=
[
'bn_input'
,
'bn_output'
,
'bn_params'
]
tensor_descs
=
[
'bn_input'
,
'bn_output'
,
'bn_params'
]
def
__init__
(
self
,
mode
=
'per-activation'
,
epsilon
=
1e-4
,
running_average_factor
=
0
,
running_averages
=
False
,
inplace_running_mean
=
False
,
inplace_running_var
=
False
,
inplace_output
=
False
):
super
(
GpuDnnBatchNorm
,
self
)
.
__init__
(
mode
=
mode
,
epsilon
=
epsilon
)
self
.
running_average_factor
=
running_average_factor
self
.
running_averages
=
running_averages
self
.
inplace_output
=
inplace_output
self
.
inplace_running_mean
=
inplace_running_mean
self
.
inplace_running_var
=
inplace_running_var
self
.
destroy_map
=
{}
if
self
.
inplace_output
:
self
.
destroy_map
[
0
]
=
[
0
]
if
self
.
running_averages
and
self
.
inplace_running_mean
:
self
.
destroy_map
[
3
]
=
[
3
]
if
self
.
running_averages
and
self
.
inplace_running_var
:
self
.
destroy_map
[
4
]
=
[
4
]
def
get_op_params
(
self
):
params
=
[]
if
self
.
inplace_output
:
params
.
append
((
'INPLACE_OUTPUT'
,
'1'
))
if
self
.
running_averages
:
params
.
append
((
'RUNNING_AVERAGES'
,
'1'
))
if
self
.
inplace_running_mean
:
params
.
append
((
'INPLACE_RUNNING_MEAN'
,
'1'
))
if
self
.
inplace_running_var
:
params
.
append
((
'INPLACE_RUNNING_VAR'
,
'1'
))
return
params
def
infer_shape
(
self
,
node
,
shape
):
def
infer_shape
(
self
,
node
,
shape
):
# first output equals shape of x
# first output equals shape of x
#
second and third output
equal shape of scale
#
other outputs
equal shape of scale
return
[
shape
[
0
]
,
shape
[
1
],
shape
[
1
]]
return
[
shape
[
0
]
]
+
[
shape
[
1
]]
*
(
len
(
node
.
outputs
)
-
1
)
def
make_node
(
self
,
x
,
scale
,
bias
):
def
make_node
(
self
,
x
,
scale
,
bias
,
running_mean
=
None
,
running_var
=
None
):
assert
x
.
ndim
==
scale
.
ndim
==
bias
.
ndim
assert
x
.
ndim
in
(
4
,
5
)
assert
self
.
running_averages
==
(
running_mean
is
not
None
)
==
(
running_var
is
not
None
)
assert
(
running_mean
is
None
or
running_mean
.
ndim
==
x
.
ndim
)
assert
(
running_var
is
None
or
running_var
.
ndim
==
x
.
ndim
)
x
=
as_cuda_ndarray_variable
(
x
)
x
=
as_cuda_ndarray_variable
(
x
)
scale
=
as_cuda_ndarray_variable
(
scale
)
scale
=
as_cuda_ndarray_variable
(
scale
)
bias
=
as_cuda_ndarray_variable
(
bias
)
bias
=
as_cuda_ndarray_variable
(
bias
)
assert
x
.
ndim
==
scale
.
ndim
==
bias
.
ndim
inputs
=
[
x
,
scale
,
bias
]
assert
x
.
ndim
in
(
4
,
5
)
output_types
=
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()]
return
Apply
(
self
,
[
x
,
scale
,
bias
],
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()])
if
running_mean
is
not
None
and
running_var
is
not
None
:
inputs
.
append
(
as_cuda_ndarray_variable
(
running_mean
))
inputs
.
append
(
as_cuda_ndarray_variable
(
running_var
))
output_types
.
append
(
scale
.
type
())
output_types
.
append
(
scale
.
type
())
return
Apply
(
self
,
inputs
,
output_types
)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
# super call to prepare common configuration
# super call to prepare common configuration
result
=
super
(
GpuDnnBatchNorm
,
self
)
.
c_code
(
node
,
name
,
inputs
,
outputs
,
sub
)
result
=
super
(
GpuDnnBatchNorm
,
self
)
.
c_code
(
node
,
name
,
inputs
,
outputs
,
sub
)
# give sensible names to inputs and outputs
# give sensible names to inputs and outputs
inp
,
scale
,
bias
=
inputs
inp
,
scale
,
bias
=
inputs
[:
3
]
outp
,
x_mean
,
x_invstd
=
outputs
outp
,
x_mean
,
x_invstd
=
outputs
[:
3
]
if
self
.
running_averages
:
running_average_factor
=
self
.
running_average_factor
in_running_mean
=
inputs
[
3
]
in_running_var
=
inputs
[
4
]
out_running_mean
=
outputs
[
3
]
out_running_var
=
outputs
[
4
]
else
:
running_average_factor
=
0.
in_running_mean
=
'NULL'
in_running_var
=
'NULL'
out_running_mean
=
'NULL'
out_running_var
=
'NULL'
# set input tensor descriptors from input tensors
# set input tensor descriptors from input tensors
result
+=
"""
result
+=
"""
...
@@ -2579,6 +2680,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp
...
@@ -2579,6 +2680,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp
{
{
%(fail)
s
%(fail)
s
}
}
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(
%(out_running_mean)
s);
CudaNdarray *running_mean
%(name)
s =
%(in_running_mean)
s;
Py_INCREF(running_mean
%(name)
s);
#else
if ((CudaNdarray_prep_output(&
%(out_running_mean)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(scale)
s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(
%(out_running_mean)
s,
%(in_running_mean)
s) != 0))
{
%(fail)
s
}
CudaNdarray *running_mean
%(name)
s =
%(out_running_mean)
s;
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(
%(out_running_var)
s);
CudaNdarray *running_var
%(name)
s =
%(in_running_var)
s;
Py_INCREF(running_var
%(name)
s);
#else
if ((CudaNdarray_prep_output(&
%(out_running_var)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(scale)
s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(
%(out_running_var)
s,
%(in_running_var)
s) != 0))
{
%(fail)
s
}
CudaNdarray *running_var
%(name)
s =
%(out_running_var)
s;
#endif
#endif
// set output tensor descriptor from output tensor
// set output tensor descriptor from output tensor
if (c_set_tensorNd(
%(outp)
s, bn_output_
%(name)
s) != 0)
if (c_set_tensorNd(
%(outp)
s, bn_output_
%(name)
s) != 0)
...
@@ -2601,25 +2728,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining(
...
@@ -2601,25 +2728,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining(
bn_params_
%(name)
s,
bn_params_
%(name)
s,
CudaNdarray_DEV_DATA(
%(scale)
s),
CudaNdarray_DEV_DATA(
%(scale)
s),
CudaNdarray_DEV_DATA(
%(bias)
s),
CudaNdarray_DEV_DATA(
%(bias)
s),
exponentialAverageFactor
%(name)
s,
#ifdef RUNNING_AVERAGES
NULL, // running mean, deliberately unused
%(running_average_factor)
f,
NULL, // running var, deliberately unused
CudaNdarray_DEV_DATA(running_mean
%(name)
s),
CudaNdarray_DEV_DATA(running_var
%(name)
s),
#else
0,
NULL,
NULL,
#endif
epsilon
%(name)
s,
epsilon
%(name)
s,
CudaNdarray_DEV_DATA(
%(x_mean)
s),
CudaNdarray_DEV_DATA(
%(x_mean)
s),
CudaNdarray_DEV_DATA(
%(x_invstd)
s)
CudaNdarray_DEV_DATA(
%(x_invstd)
s)
);
);
}
}
#ifdef RUNNING_AVERAGES
%(out_running_mean)
s = running_mean
%(name)
s;
%(out_running_var)
s = running_var
%(name)
s;
#endif
"""
%
dict
(
name
=
name
,
inp
=
inp
,
scale
=
scale
,
bias
=
bias
,
outp
=
outp
,
"""
%
dict
(
name
=
name
,
inp
=
inp
,
scale
=
scale
,
bias
=
bias
,
outp
=
outp
,
x_mean
=
x_mean
,
x_invstd
=
x_invstd
,
fail
=
sub
[
'fail'
])
x_mean
=
x_mean
,
x_invstd
=
x_invstd
,
running_average_factor
=
running_average_factor
,
in_running_mean
=
in_running_mean
,
in_running_var
=
in_running_var
,
out_running_mean
=
out_running_mean
,
out_running_var
=
out_running_var
,
fail
=
sub
[
'fail'
])
# add params
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
,
check_input
=
False
)
result
=
"""
%(define_macros)
s
{
%(code)
s
}
%(undef_macros)
s
"""
%
dict
(
code
=
result
,
define_macros
=
define_macros
,
undef_macros
=
undef_macros
)
return
result
return
result
def
grad
(
self
,
inputs
,
grads
):
def
grad
(
self
,
inputs
,
grads
):
x
,
scale
,
bias
=
inputs
x
,
scale
,
bias
=
inputs
[:
3
]
dy
=
grads
[
0
]
dy
=
grads
[
0
]
_
,
x_mean
,
x_invstd
=
self
(
x
,
scale
,
bias
)
_
,
x_mean
,
x_invstd
=
self
(
*
inputs
)[:
3
]
return
GpuDnnBatchNormGrad
(
self
.
mode
,
self
.
epsilon
)(
x
,
dy
,
scale
,
disconnected_outputs
=
[]
x_mean
,
x_invstd
)
# Optional running_mean and running_var.
for
i
in
range
(
3
,
len
(
inputs
)):
disconnected_outputs
.
append
(
DisconnectedType
()())
return
GpuDnnBatchNormGrad
(
self
.
mode
,
self
.
epsilon
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
)
+
disconnected_outputs
def
connection_pattern
(
self
,
node
):
patterns
=
[[
True
,
True
,
True
],
# x
[
True
,
True
,
True
],
# scale
[
True
,
True
,
True
]]
# bias
# Optional running_mean and running_var are only
# connected to their new values.
for
i
in
range
(
3
,
len
(
node
.
inputs
)):
patterns
[
0
]
.
append
(
True
)
for
pattern
in
patterns
[
1
:]:
pattern
.
append
(
False
)
patterns
.
append
([
False
]
*
(
i
)
+
[
True
])
return
patterns
class
GpuDnnBatchNormGrad
(
GpuDnnBatchNormBase
):
class
GpuDnnBatchNormGrad
(
GpuDnnBatchNormBase
):
...
@@ -2722,7 +2890,8 @@ err%(name)s = cudnnBatchNormalizationBackward(
...
@@ -2722,7 +2890,8 @@ err%(name)s = cudnnBatchNormalizationBackward(
def
dnn_batch_normalization_train
(
inputs
,
gamma
,
beta
,
mode
=
'per-activation'
,
def
dnn_batch_normalization_train
(
inputs
,
gamma
,
beta
,
mode
=
'per-activation'
,
epsilon
=
1e-4
):
epsilon
=
1e-4
,
running_average_factor
=
0.1
,
running_mean
=
None
,
running_var
=
None
):
"""
"""
Performs batch normalization of the given inputs, using the mean and
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
variance of the inputs.
...
@@ -2742,6 +2911,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
...
@@ -2742,6 +2911,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly;
if the factor is close to zero, they will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns
Returns
-------
-------
...
@@ -2751,6 +2937,12 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
...
@@ -2751,6 +2937,12 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Means of `inputs` across the normalization axes.
Means of `inputs` across the normalization axes.
invstd : tensor
invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes.
Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes
Notes
-----
-----
...
@@ -2762,31 +2954,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
...
@@ -2762,31 +2954,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3)
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
mean = inputs.mean(axes, keepdims=True)
invstd = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) +
\\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) +
\\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4).
For 5d tensors, the axes are (0, 2, 3, 4).
"""
"""
ndim
=
inputs
.
ndim
ndim
=
inputs
.
ndim
if
ndim
>
5
:
raise
ValueError
(
"dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got
%
d"
%
ndim
)
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
"as inputs; got
%
d and
%
d instead of
%
d"
%
"as inputs; got
%
d and
%
d instead of
%
d"
%
(
gamma
.
ndim
,
beta
.
ndim
,
ndim
))
(
gamma
.
ndim
,
beta
.
ndim
,
ndim
))
if
(
running_mean
is
None
)
!=
(
running_var
is
None
):
raise
ValueError
(
"running_mean and running_var must either both be "
"given or both be None"
)
if
running_mean
is
not
None
and
running_mean
.
ndim
!=
ndim
:
raise
ValueError
(
"running_mean must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_mean
.
ndim
,
ndim
))
if
running_var
is
not
None
and
running_var
.
ndim
!=
ndim
:
raise
ValueError
(
"running_var must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_var
.
ndim
,
ndim
))
if
epsilon
<
1e-5
:
if
epsilon
<
1e-5
:
raise
ValueError
(
"epsilon must be at least 1e-5, got
%
f"
%
epsilon
)
raise
ValueError
(
"epsilon must be at least 1e-5, got
%
f"
%
epsilon
)
running_averages
=
(
running_var
is
not
None
and
running_var
is
not
None
)
if
ndim
<
4
:
if
ndim
<
4
:
inputs
=
theano
.
tensor
.
shape_padright
(
inputs
,
4
-
ndim
)
inputs
=
theano
.
tensor
.
shape_padright
(
inputs
,
4
-
ndim
)
gamma
=
theano
.
tensor
.
shape_padright
(
gamma
,
4
-
ndim
)
gamma
=
theano
.
tensor
.
shape_padright
(
gamma
,
4
-
ndim
)
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
batchnorm_op
=
GpuDnnBatchNorm
(
mode
=
mode
,
epsilon
=
epsilon
)
if
running_averages
:
result
=
tuple
(
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
running_mean
=
theano
.
tensor
.
shape_padright
(
running_mean
,
4
-
ndim
)
gpu_contiguous
(
beta
)))
running_var
=
theano
.
tensor
.
shape_padright
(
running_var
,
4
-
ndim
)
elif
ndim
>
5
:
inputs_shape
=
inputs
.
shape
params_shape
=
gamma
.
shape
inputs
=
theano
.
tensor
.
flatten
(
inputs
,
5
)
gamma
=
theano
.
tensor
.
flatten
(
gamma
,
5
)
beta
=
theano
.
tensor
.
flatten
(
beta
,
5
)
if
running_averages
:
running_mean
=
theano
.
tensor
.
flatten
(
running_mean
,
5
)
running_var
=
theano
.
tensor
.
flatten
(
running_var
,
5
)
batchnorm_op
=
GpuDnnBatchNorm
(
mode
=
mode
,
epsilon
=
epsilon
,
running_average_factor
=
running_average_factor
,
running_averages
=
running_averages
)
if
running_averages
:
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
running_mean
=
gpu_contiguous
(
running_mean
),
running_var
=
gpu_contiguous
(
running_var
))
if
new_running_mean
.
broadcastable
!=
running_mean
.
broadcastable
:
new_running_mean
=
tensor
.
patternbroadcast
(
new_running_mean
,
running_mean
.
broadcastable
)
if
new_running_var
.
broadcastable
!=
running_var
.
broadcastable
:
new_running_var
=
tensor
.
patternbroadcast
(
new_running_var
,
running_var
.
broadcastable
)
result
=
(
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
)
else
:
result
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
))
if
ndim
<
4
:
if
ndim
<
4
:
result
=
tuple
(
theano
.
tensor
.
flatten
(
r
,
ndim
)
for
r
in
result
)
result
=
tuple
(
theano
.
tensor
.
flatten
(
r
,
ndim
)
for
r
in
result
)
elif
ndim
>
5
:
result
=
(
theano
.
tensor
.
reshape
(
result
[
0
],
inputs_shape
),)
+
tuple
(
theano
.
tensor
.
reshape
(
r
,
params_shape
)
for
r
in
result
[
1
:])
return
result
return
result
...
@@ -2839,9 +3078,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
...
@@ -2839,9 +3078,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4).
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
"""
ndim
=
inputs
.
ndim
ndim
=
inputs
.
ndim
if
ndim
>
5
:
raise
ValueError
(
"dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got
%
d"
%
ndim
)
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
"as inputs; got
%
d and
%
d instead of
%
d"
%
"as inputs; got
%
d and
%
d instead of
%
d"
%
...
@@ -2859,12 +3095,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
...
@@ -2859,12 +3095,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
mean
=
theano
.
tensor
.
shape_padright
(
mean
,
4
-
ndim
)
mean
=
theano
.
tensor
.
shape_padright
(
mean
,
4
-
ndim
)
var
=
theano
.
tensor
.
shape_padright
(
var
,
4
-
ndim
)
var
=
theano
.
tensor
.
shape_padright
(
var
,
4
-
ndim
)
elif
ndim
>
5
:
inputs_shape
=
inputs
.
shape
inputs
=
theano
.
tensor
.
flatten
(
inputs
,
5
)
gamma
=
theano
.
tensor
.
flatten
(
gamma
,
5
)
beta
=
theano
.
tensor
.
flatten
(
beta
,
5
)
mean
=
theano
.
tensor
.
flatten
(
mean
,
5
)
var
=
theano
.
tensor
.
flatten
(
var
,
5
)
batchnorm_op
=
GpuDnnBatchNormInference
(
mode
=
mode
,
epsilon
=
epsilon
)
batchnorm_op
=
GpuDnnBatchNormInference
(
mode
=
mode
,
epsilon
=
epsilon
)
result
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
result
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
gpu_contiguous
(
mean
),
gpu_contiguous
(
beta
),
gpu_contiguous
(
mean
),
gpu_contiguous
(
var
))
gpu_contiguous
(
var
))
if
ndim
<
4
:
if
ndim
<
4
:
result
=
theano
.
tensor
.
flatten
(
result
,
ndim
)
result
=
theano
.
tensor
.
flatten
(
result
,
ndim
)
elif
ndim
>
5
:
result
=
theano
.
tensor
.
reshape
(
result
,
inputs_shape
)
return
result
return
result
...
@@ -3334,3 +3579,235 @@ def local_abstractconv3d_cudnn(node):
...
@@ -3334,3 +3579,235 @@ def local_abstractconv3d_cudnn(node):
subsample
=
node
.
op
.
subsample
,
subsample
=
node
.
op
.
subsample
,
conv_mode
=
conv_mode
)
conv_mode
=
conv_mode
)
return
[
rval
]
return
[
rval
]
@local_optimizer([bn.AbstractBatchNormTrain])
def local_abstract_batch_norm_train_cudnn(node):
    """
    Rewrite an `AbstractBatchNormTrain` node using
    `dnn_batch_normalization_train`.

    Returns None (no rewrite) when the node is not an
    `AbstractBatchNormTrain`, when `x` is neither a `CudaNdarrayType`
    variable nor the output of `HostFromGpu`, when the op's axes match
    neither cuDNN mode, when `epsilon` or `running_average_factor` is not a
    scalar constant, when `epsilon` is below 1e-5, or when
    `dnn_available()` is false. Otherwise returns the list of replacement
    outputs.
    """
    if not isinstance(node.op, bn.AbstractBatchNormTrain):
        return None

    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    # The running averages are optional trailing inputs.
    n_inputs = len(node.inputs)
    running_mean = None
    running_var = None
    if n_inputs > 5:
        running_mean = node.inputs[5]
    if n_inputs > 6:
        running_var = node.inputs[6]

    # input on gpu?  TODO what about the output?
    if not isinstance(x.type, CudaNdarrayType):
        if not (x.owner and isinstance(x.owner.op, HostFromGpu)):
            return None

    # convert axes to cuDNN mode
    axes = tuple(node.op.axes)
    spatial_axes = (0,) + tuple(range(2, x.ndim))
    if axes == (0,):
        mode = 'per-activation'
    elif axes == spatial_axes:
        mode = 'spatial'
    else:
        return None

    # Both epsilon and the running average factor must be compile-time
    # scalar constants, because they are passed on as Python floats.
    get_constant = theano.tensor.get_scalar_constant_value
    try:
        eps = float(get_constant(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None
    try:
        running_average_factor = float(get_constant(running_average_factor))
    except theano.tensor.NotScalarConstantError:
        return None

    if not dnn_available():
        return None

    call_args = [as_cuda_ndarray_variable(x),
                 as_cuda_ndarray_variable(scale),
                 as_cuda_ndarray_variable(bias),
                 mode,
                 eps,
                 running_average_factor]
    if running_mean is not None and running_var is not None:
        call_args.extend([running_mean, running_var])

    results = list(dnn_batch_normalization_train(*call_args))

    # If the original output was on CPU, we have to transfer it
    for idx, original_out in enumerate(node.outputs):
        if isinstance(original_out.type, tensor.TensorType):
            results[idx] = tensor.as_tensor_variable(results[idx])
    # TODO copy_stack_trace?
    return results
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_output(node):
    """
    Rebuild a `GpuDnnBatchNorm` node with `inplace_output=True`.

    All other op properties are copied from the existing op and the node's
    inputs are reused. Returns None when the node is not a
    `GpuDnnBatchNorm` or already has `inplace_output` set.
    """
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm) or op.inplace_output:
        return None
    inplace_op = GpuDnnBatchNorm(
        mode=op.mode,
        epsilon=op.epsilon,
        running_average_factor=op.running_average_factor,
        running_averages=op.running_averages,
        inplace_running_mean=op.inplace_running_mean,
        inplace_running_var=op.inplace_running_var,
        inplace_output=True)
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_mean(node):
    """
    Rebuild a `GpuDnnBatchNorm` node with `inplace_running_mean=True`.

    Applies only to ops that have `running_averages` set and do not have
    `inplace_running_mean` set yet; otherwise returns None. All other op
    properties are copied and the node's inputs are reused.
    """
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return None
    if not op.running_averages or op.inplace_running_mean:
        return None
    inplace_op = GpuDnnBatchNorm(
        mode=op.mode,
        epsilon=op.epsilon,
        running_average_factor=op.running_average_factor,
        running_averages=op.running_averages,
        inplace_running_mean=True,
        inplace_running_var=op.inplace_running_var,
        inplace_output=op.inplace_output)
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_var(node):
    """
    Rebuild a `GpuDnnBatchNorm` node with `inplace_running_var=True`.

    Applies only to ops that have `running_averages` set and do not have
    `inplace_running_var` set yet; otherwise returns None. All other op
    properties are copied and the node's inputs are reused.
    """
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return None
    if not op.running_averages or op.inplace_running_var:
        return None
    inplace_op = GpuDnnBatchNorm(
        mode=op.mode,
        epsilon=op.epsilon,
        running_average_factor=op.running_average_factor,
        running_averages=op.running_averages,
        inplace_running_mean=op.inplace_running_mean,
        inplace_running_var=True,
        inplace_output=op.inplace_output)
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_gpu_batch_norm_inference_inplace(node):
    """
    Rebuild a `GpuDnnBatchNormInference` node with `inplace=True`.

    `mode` and `epsilon` are copied from the existing op and the node's
    inputs are reused. Returns a one-element list holding the new output,
    or None when the node is not a `GpuDnnBatchNormInference` or already
    has `inplace` set.
    """
    op = node.op
    if not isinstance(op, GpuDnnBatchNormInference) or op.inplace:
        return None
    inplace_op = GpuDnnBatchNormInference(mode=op.mode,
                                          epsilon=op.epsilon,
                                          inplace=True)
    return [inplace_op(*node.inputs)]
@local_optimizer([bn.AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad_cudnn(node):
    """
    Rewrite an `AbstractBatchNormTrainGrad` node using `GpuDnnBatchNormGrad`.

    Returns None (no rewrite) when the node is not an
    `AbstractBatchNormTrainGrad`, when neither `x` nor `dy` is a
    `CudaNdarrayType` variable or the output of `HostFromGpu`, when the op's
    axes match neither cuDNN mode, when `epsilon` is not a scalar constant
    or is below 1e-5, or when `dnn_available()` is false. Otherwise returns
    `[g_wrt_inputs, g_wrt_scale, g_wrt_bias]`.
    """
    if not isinstance(node.op, bn.AbstractBatchNormTrainGrad):
        return None

    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs

    # input on gpu?  TODO what about the output?
    def _on_gpu(var):
        return (isinstance(var.type, CudaNdarrayType) or
                (var.owner and isinstance(var.owner.op, HostFromGpu)))

    if not (_on_gpu(x) or _on_gpu(dy)):
        return None

    # convert axes to cuDNN mode
    axes = tuple(node.op.axes)
    spatial_axes = (0,) + tuple(range(2, x.ndim))
    if axes == (0,):
        mode = 'per-activation'
    elif axes == spatial_axes:
        mode = 'spatial'
    else:
        return None

    # Bring all operands to 4 or 5 dimensions: pad on the right when there
    # are fewer than 4, flatten down to 5 when there are more than 5.
    ndim = x.ndim
    operands = [x, dy, scale, x_mean, x_invstd]
    if ndim < 4:
        operands = [theano.tensor.shape_padright(operand, 4 - ndim)
                    for operand in operands]
    elif ndim > 5:
        # Remember the original shapes so the gradients can be restored.
        x_shape = x.shape
        params_shape = scale.shape
        operands = [theano.tensor.flatten(operand, 5)
                    for operand in operands]

    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None

    if not dnn_available():
        return None

    operands = [as_cuda_ndarray_variable(operand) for operand in operands]

    g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
        GpuDnnBatchNormGrad(mode, epsilon=eps)(*operands)
    grads = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]

    # Undo the dimensionality change applied to the operands above.
    if ndim < 4:
        grads = [theano.tensor.flatten(grad, ndim) for grad in grads]
    elif ndim > 5:
        target_shapes = (x_shape, params_shape, params_shape)
        grads = [theano.tensor.reshape(grad, target_shape)
                 for grad, target_shape in zip(grads, target_shapes)]

    # If the original output was on CPU, we have to transfer it
    for idx in range(3):
        if isinstance(node.outputs[idx].type, tensor.TensorType):
            grads[idx] = tensor.as_tensor_variable(grads[idx])
    # TODO copy_stack_trace?
    return grads
@local_optimizer([bn.AbstractBatchNormInference])
def local_abstract_batch_norm_inference_cudnn(node):
    """
    Replace an `AbstractBatchNormInference` node by the cuDNN implementation.

    The optimizer declines (returns None) when any of these holds:

    * the node's op is not `AbstractBatchNormInference`;
    * the normalization axes are neither ``(0,)`` ('per-activation')
      nor ``(0, 2, ..., ndim - 1)`` ('spatial');
    * `x` is neither a `CudaNdarrayType` variable nor the output of a
      `HostFromGpu` op;
    * `epsilon` is not a scalar constant, or is smaller than 1e-5;
    * `dnn_available()` is false.

    Otherwise it returns a one-element list with the replacement output,
    converted with `tensor.as_tensor_variable` if the original output
    was a `TensorType`.
    """
    if not isinstance(node.op, bn.AbstractBatchNormInference):
        return None

    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs

    # Translate the normalization axes into a cuDNN mode.  The
    # per-activation test comes first: for 2D inputs both patterns
    # are equal to (0,).
    norm_axes = tuple(node.op.axes)
    if norm_axes == (0,):
        mode = 'per-activation'
    elif norm_axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None

    # input on gpu? TODO what about the output?
    if not isinstance(x.type, CudaNdarrayType):
        producer = x.owner
        if not (producer and isinstance(producer.op, HostFromGpu)):
            return None

    # epsilon has to be known when the graph is optimized
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    # values below 1e-5 are rejected; dnn_available() is only queried
    # once all the cheaper checks have passed
    if eps < 1e-5 or not dnn_available():
        return None

    gpu_args = [as_cuda_ndarray_variable(v)
                for v in (x, scale, bias,
                          estimated_mean, estimated_variance)]
    out = dnn_batch_normalization_test(*(gpu_args + [mode, eps]))

    # If the original output was on CPU, we have to transfer it
    # TODO copy_stack_trace?
    if isinstance(node.outputs[0].type, tensor.TensorType):
        out = tensor.as_tensor_variable(out)
    return [out]
theano/sandbox/cuda/opt.py
浏览文件 @
43411345
...
@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
...
@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm
,
30
,
local_abstractconv3d_gradinputs_gemm
,
30
,
'conv_gemm'
,
'conv_gemm'
,
'gpu'
,
'fast_compile'
,
'fast_run'
)
'gpu'
,
'fast_compile'
,
'fast_run'
)
# Register cuDNN batch normalization implementation
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpu_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)

# The cuDNN optimizers are registered unconditionally here; the ones
# visible in dnn.py return None themselves when dnn_available() is false.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from .dnn import (local_abstract_batch_norm_train_cudnn,
                  local_abstract_batch_norm_train_grad_cudnn,
                  local_abstract_batch_norm_inference_cudnn)  # noqa: 402

# All three optimizers share the same position (20) and the same tags.
for _bn_opt_name, _bn_opt in (
        ('local_abstract_batch_norm_train_dnn',
         local_abstract_batch_norm_train_cudnn),
        ('local_abstract_batch_norm_train_grad_dnn',
         local_abstract_batch_norm_train_grad_cudnn),
        ('local_abstract_batch_norm_inference_dnn',
         local_abstract_batch_norm_inference_cudnn)):
    abstract_batch_norm_groupopt.register(
        _bn_opt_name, _bn_opt, 20,
        'batchnorm_dnn', 'gpu', 'fast_compile', 'fast_run', 'cudnn')
# do not leave the loop variables behind in the module namespace
del _bn_opt_name, _bn_opt
theano/sandbox/cuda/tests/test_dnn.py
浏览文件 @
43411345
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
from
collections
import
OrderedDict
import
logging
import
logging
import
os
import
os
import
sys
import
sys
...
@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt
...
@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt
from
theano.tensor.signal.pool
import
pool_2d
,
pool_3d
from
theano.tensor.signal.pool
import
pool_2d
,
pool_3d
from
theano.tensor.signal.pool
import
Pool
,
MaxPoolGrad
,
AveragePoolGrad
from
theano.tensor.signal.pool
import
Pool
,
MaxPoolGrad
,
AveragePoolGrad
from
theano.tensor.nnet.abstract_conv
import
get_conv_output_shape
from
theano.tensor.nnet.abstract_conv
import
get_conv_output_shape
from
theano.tensor.nnet
import
bn
import
theano.sandbox.cuda.dnn
as
dnn
import
theano.sandbox.cuda.dnn
as
dnn
from
theano.sandbox.cuda.basic_ops
import
GpuAllocEmpty
,
gpu_alloc_empty
from
theano.sandbox.cuda.basic_ops
import
GpuAllocEmpty
,
gpu_alloc_empty
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
...
@@ -730,52 +732,201 @@ def test_batchnorm_train():
...
@@ -730,52 +732,201 @@ def test_batchnorm_train():
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
utt
.
seed_rng
()
tensor6
=
T
.
TensorType
(
theano
.
config
.
floatX
,
(
False
,)
*
6
)
for
mode
in
(
'per-activation'
,
'spatial'
):
for
mode
in
(
'per-activation'
,
'spatial'
):
for
vartype
in
(
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
for
vartype
in
(
tensor6
,
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
x
,
scale
,
bias
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
))
x
,
scale
,
bias
,
running_mean
,
running_var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'running_mean'
,
'running_var'
))
ndim
=
x
.
ndim
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
eps
=
5e-3
# some non-standard value to test if it's used
running_average_factor
=
0.3
# forward pass
out
,
x_mean
,
x_invstd
=
cuda
.
dnn
.
dnn_batch_normalization_train
(
# forward pass, direct interface
x
,
scale
,
bias
,
mode
,
eps
)
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
,
\
out_running_mean_gpu
,
out_running_var_gpu
=
\
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# forward pass, abstract interface
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
\
out_running_mean_abstract
,
out_running_var_abstract
=
\
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# reference forward pass
# reference forward pass
if
mode
==
'per-activation'
:
if
mode
==
'per-activation'
:
axes
=
(
0
,)
axes
=
(
0
,)
elif
mode
==
'spatial'
:
elif
mode
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
x_mean2
=
x
.
mean
(
axis
=
axes
,
keepdims
=
True
)
x_mean_ref
=
x
.
mean
(
axis
=
axes
,
keepdims
=
True
)
x_invstd2
=
T
.
inv
(
T
.
sqrt
(
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
+
eps
))
x_var_ref
=
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
scale2
=
T
.
addbroadcast
(
scale
,
*
axes
)
x_invstd_ref
=
T
.
inv
(
T
.
sqrt
(
x_var_ref
+
eps
))
bias2
=
T
.
addbroadcast
(
bias
,
*
axes
)
scale_ref
=
T
.
addbroadcast
(
scale
,
*
axes
)
out2
=
(
x
-
x_mean2
)
*
(
scale2
*
x_invstd2
)
+
bias2
bias_ref
=
T
.
addbroadcast
(
bias
,
*
axes
)
m
=
T
.
cast
(
T
.
prod
(
x
.
shape
)
/
T
.
prod
(
scale
.
shape
),
theano
.
config
.
floatX
)
out_ref
=
(
x
-
x_mean_ref
)
*
(
scale_ref
*
x_invstd_ref
)
+
bias_ref
out_running_mean_ref
=
running_mean
*
(
1
-
running_average_factor
)
+
\
x_mean_ref
*
running_average_factor
out_running_var_ref
=
running_var
*
(
1
-
running_average_factor
)
+
\
(
m
/
(
m
-
1
))
*
x_var_ref
*
running_average_factor
# backward pass
# backward pass
dy
=
vartype
(
'dy'
)
dy
=
vartype
(
'dy'
)
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out
:
dy
})
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_abstract
:
dy
})
# reference backward pass
# reference backward pass
grads
2
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out2
:
dy
})
grads
_ref
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_ref
:
dy
})
# compile
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out
,
x_mean
,
x_invstd
,
out2
,
x_mean2
,
x_invstd2
]
+
[
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
,
grads
+
grads2
,
mode
=
mode_with_gpu
)
out_running_mean_gpu
,
out_running_var_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
out_running_mean_abstract
,
out_running_var_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_ref
,
x_mean_ref
,
x_invstd_ref
,
out_running_mean_ref
,
out_running_var_ref
]
+
grads_ref
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormGrad
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
# run
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
),
(
4
,
3
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
)):
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
,
5
),
(
4
,
3
,
1
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
,
5
)):
data_shape
=
data_shape
[:
ndim
]
data_shape
=
data_shape
[:
ndim
]
param_shape
=
tuple
(
1
if
d
in
axes
else
s
param_shape
=
tuple
(
1
if
d
in
axes
else
s
for
d
,
s
in
enumerate
(
data_shape
))
for
d
,
s
in
enumerate
(
data_shape
))
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Dy
)
Running_mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Running_var
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs_gpu
=
f_gpu
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
# compare outputs
# compare outputs
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
0
+
3
])
# out
utt
.
assert_allclose
(
outputs_gpu
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs
[
1
],
outputs
[
1
+
3
])
# mean
utt
.
assert_allclose
(
outputs_gpu
[
1
],
outputs_ref
[
1
])
# mean
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
3
])
# invstd
utt
.
assert_allclose
(
outputs_gpu
[
2
],
outputs_ref
[
2
])
# invstd
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
3
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs_gpu
[
4
]),
numpy
.
nan_to_num
(
outputs_ref
[
4
]))
# running_var
utt
.
assert_allclose
(
outputs_abstract
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_abstract
[
1
],
outputs_ref
[
1
])
# mean
utt
.
assert_allclose
(
outputs_abstract
[
2
],
outputs_ref
[
2
])
# invstd
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs_abstract
[
4
]),
numpy
.
nan_to_num
(
outputs_ref
[
4
]))
# running_var
# compare gradients
# compare gradients
utt
.
assert_allclose
(
outputs
[
6
],
outputs
[
6
+
3
],
atol
=
1e-4
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs
[
7
],
outputs
[
7
+
3
],
rtol
=
2e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
6
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs
[
8
],
outputs
[
8
+
3
])
# dbias
utt
.
assert_allclose
(
outputs_gpu
[
7
],
outputs_ref
[
7
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
6
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
7
],
outputs_ref
[
7
])
# dbias
def test_dnn_batchnorm_train_without_running_averages():
    """
    Compile and run batch_normalization_train without running averages.

    Two functions are built in 'per-activation' mode, one through the
    direct cuDNN interface (`dnn.dnn_batch_normalization_train`) and one
    through the abstract interface (`bn.batch_normalization_train`).
    The test checks that, in the function built from the abstract
    interface, the abstract Ops were replaced by the cuDNN Ops, and that
    both functions run.

    Raises SkipTest if cuDNN is not available or older than v5.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias, dy = (T.tensor4('x'), T.tensor4('scale'),
                          T.tensor4('bias'), T.tensor4('dy'))
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass
    out_gpu, x_mean_gpu, x_invstd_gpu = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads_gpu = T.grad(None, wrt=[x, scale, bias],
                       known_grads={out_gpu: dy})
    # The gradient has to flow through the *abstract* output: otherwise
    # the graph of f_abstract contains no abstract gradient Op and the
    # GpuDnnBatchNormGrad check below passes without testing the
    # gradient optimization.
    grads_abstract = T.grad(None, wrt=[x, scale, bias],
                            known_grads={out_abstract: dy})
    # compile
    f_gpu = theano.function([x, scale, bias, dy],
                            [out_gpu, x_mean_gpu, x_invstd_gpu] +
                            grads_gpu,
                            mode=mode_with_gpu)
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract,
                                  x_invstd_abstract] +
                                 grads_abstract,
                                 mode=mode_with_gpu)
    # check if the abstract Ops have been replaced
    topo = f_abstract.maker.fgraph.toposort()
    assert any(isinstance(n.op, dnn.GpuDnnBatchNorm) for n in topo)
    assert any(isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n in topo)
    assert not any(isinstance(n.op, (bn.AbstractBatchNormTrain,
                                     bn.AbstractBatchNormInference,
                                     bn.AbstractBatchNormTrainGrad))
                   for n in topo)
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_gpu(X, Scale, Bias, Dy)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
    """
    Test inplace_running_mean and inplace_running_var.

    The running averages are shared variables that are updated with the
    new running averages returned by `dnn_batch_normalization_train`.
    After compilation the graph must contain exactly one
    `GpuDnnBatchNorm` node whose op has `inplace_running_mean`,
    `inplace_running_var` and `inplace_output` set.

    Raises SkipTest if cuDNN is not available or older than v5.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias = (T.tensor4(name) for name in ('x', 'scale', 'bias'))
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    floatX = theano.config.floatX
    param_broadcast = (True, False, False, False)

    # initial running averages (mean is drawn first, then var)
    running_mean = shared(
        numpy.random.randn(*param_shape).astype(floatX),
        broadcastable=param_broadcast)
    running_var = shared(
        numpy.random.randn(*param_shape).astype(floatX),
        broadcastable=param_broadcast)

    # forward pass
    bn_outputs = dnn.dnn_batch_normalization_train(
        x, scale, bias, 'per-activation',
        epsilon=5e-3, running_average_factor=0.3,
        running_mean=running_mean, running_var=running_var)
    out, x_mean, x_invstd, new_running_mean, new_running_var = bn_outputs

    # update running averages
    updates = OrderedDict([(running_mean, new_running_mean),
                           (running_var, new_running_var)])

    # compile
    f = theano.function([x, scale, bias],
                        [out, x_mean, x_invstd],
                        updates=updates,
                        mode=mode_with_gpu)

    # check for the inplace settings
    bn_nodes = [node for node in f.maker.fgraph.toposort()
                if isinstance(node.op, dnn.GpuDnnBatchNorm)]
    assert len(bn_nodes) == 1
    bn_op = bn_nodes[0].op
    assert bn_op.inplace_running_mean
    assert bn_op.inplace_running_var
    assert bn_op.inplace_output

    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(floatX)
    Scale = numpy.random.randn(*param_shape).astype(floatX)
    Bias = numpy.random.randn(*param_shape).astype(floatX)
    f(X, Scale, Bias)
def
test_batchnorm_inference
():
def
test_batchnorm_inference
():
...
@@ -785,53 +936,160 @@ def test_batchnorm_inference():
...
@@ -785,53 +936,160 @@ def test_batchnorm_inference():
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
utt
.
seed_rng
()
tensor6
=
T
.
TensorType
(
theano
.
config
.
floatX
,
(
False
,)
*
6
)
for
mode
in
(
'per-activation'
,
'spatial'
):
for
mode
in
(
'per-activation'
,
'spatial'
):
for
vartype
in
(
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
for
vartype
in
(
tensor6
,
T
.
tensor5
,
T
.
tensor4
,
T
.
tensor3
,
T
.
matrix
,
T
.
vector
):
x
,
scale
,
bias
,
mean
,
var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
x
,
scale
,
bias
,
mean
,
var
=
(
vartype
(
n
)
'bias'
,
'mean'
,
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'mean'
,
'var'
))
'var'
))
ndim
=
x
.
ndim
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
eps
=
5e-3
# some non-standard value to test if it's used
# forward pass
# forward pass, direct interface
out
=
cuda
.
dnn
.
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
out_gpu
=
dnn
.
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
var
,
mode
,
eps
)
# forward pass, abstract interface
out_abstract
=
bn
.
batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# reference forward pass
# reference forward pass
if
mode
==
'per-activation'
:
if
mode
==
'per-activation'
:
axes
=
(
0
,)
axes
=
(
0
,)
elif
mode
==
'spatial'
:
elif
mode
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
scale
2
,
bias2
,
mean2
,
var2
=
(
T
.
addbroadcast
(
t
,
*
axes
)
scale
_ref
,
bias_ref
,
mean_ref
,
var_ref
=
(
T
.
addbroadcast
(
t
,
*
axes
)
for
t
in
(
scale
,
bias
,
mean
,
var
))
for
t
in
(
scale
,
bias
,
mean
,
var
))
out
2
=
(
x
-
mean2
)
*
(
scale2
/
T
.
sqrt
(
var2
+
eps
))
+
bias2
out
_ref
=
(
x
-
mean_ref
)
*
(
scale_ref
/
T
.
sqrt
(
var_ref
+
eps
))
+
bias_ref
# backward pass
# backward pass
dy
=
vartype
(
'dy'
)
dy
=
vartype
(
'dy'
)
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out
:
dy
})
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_abstract
:
dy
})
# reference backward pass
# reference backward pass
grads
2
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out2
:
dy
})
grads
_ref
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_ref
:
dy
})
# compile
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out
,
out2
]
+
grads
+
grads2
,
mode
=
mode_with_gpu
)
[
out_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_ref
]
+
grads_ref
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormInference
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
# run
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
),
(
4
,
3
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
)):
for
data_shape
in
((
10
,
20
,
30
,
40
,
10
,
5
),
(
4
,
3
,
1
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
,
5
)):
data_shape
=
data_shape
[:
ndim
]
data_shape
=
data_shape
[:
ndim
]
param_shape
=
tuple
(
1
if
d
in
axes
else
s
param_shape
=
tuple
(
1
if
d
in
axes
else
s
for
d
,
s
in
enumerate
(
data_shape
))
for
d
,
s
in
enumerate
(
data_shape
))
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Var
=
numpy
.
random
.
rand
(
*
param_shape
)
.
astype
(
'float32'
)
Var
=
numpy
.
random
.
rand
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_gpu
=
f_gpu
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
# compare outputs
# compare outputs
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
1
])
# out
utt
.
assert_allclose
(
outputs_gpu
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_abstract
[
0
],
outputs_ref
[
0
])
# out
# compare gradients
# compare gradients
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
5
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
1
],
outputs_ref
[
1
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs
[
3
],
outputs
[
3
+
5
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
2
],
outputs_ref
[
2
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs
[
4
],
outputs
[
4
+
5
])
# dbias
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
3
])
# dbias
utt
.
assert_allclose
(
outputs
[
5
],
outputs
[
5
+
5
])
# dmean
utt
.
assert_allclose
(
outputs_gpu
[
4
],
outputs_ref
[
4
])
# dmean
utt
.
assert_allclose
(
outputs
[
6
],
outputs
[
6
+
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
utt
.
assert_allclose
(
outputs_abstract
[
1
],
outputs_ref
[
1
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
2
],
outputs_ref
[
2
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
4
],
outputs_ref
[
4
])
# dmean
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
def test_batchnorm_inference_inplace():
    """
    Test the inplace setting of the cuDNN batch normalization inference Op.

    A function computing `dnn_batch_normalization_test` with its default
    mode and epsilon is compiled; its graph must contain exactly one
    `GpuDnnBatchNormInference` node whose op has `inplace` set.  The
    function is then run once on random data.

    Raises SkipTest if cuDNN is not available or older than v5.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias, mean, var = (
        T.tensor4(name) for name in ('x', 'scale', 'bias', 'mean', 'var'))
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    floatX = theano.config.floatX

    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function([x, scale, bias, mean, var], [out],
                        mode=mode_with_gpu)

    # check for the inplace settings
    inference_nodes = [node for node in f.maker.fgraph.toposort()
                       if isinstance(node.op, dnn.GpuDnnBatchNormInference)]
    assert len(inference_nodes) == 1
    assert inference_nodes[0].op.inplace

    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(floatX)
    Scale = numpy.random.randn(*param_shape).astype(floatX)
    Bias = numpy.random.randn(*param_shape).astype(floatX)
    Mean = numpy.random.randn(*param_shape).astype(floatX)
    # rand, not randn: the variance is drawn from [0, 1)
    Var = numpy.random.rand(*param_shape).astype(floatX)
    f(X, Scale, Bias, Mean, Var)
def test_dnn_batchnorm_valid_and_invalid_axes():
    """
    Check which axes lists are handed over to cuDNN by the optimizers.

    For 5D, 4D, 3D and 2D inputs, the abstract training and inference
    batch normalization (and their gradients) are compiled with
    `mode_with_gpu`.  For the axes lists matching the 'per-activation'
    or 'spatial' pattern the compiled graph must contain the cuDNN Ops;
    for the other axes list it must contain none of them.  In both cases
    no abstract batch normalization Op may be left in the graph.

    Raises SkipTest if cuDNN is not available or older than v5.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")

    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    cudnn_ops = (dnn.GpuDnnBatchNorm,
                 dnn.GpuDnnBatchNormGrad,
                 dnn.GpuDnnBatchNormInference)

    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        # dy is created once here; it is not re-created in the axes loop
        x, scale, bias, mean, var, dy = (vartype(n)
                                         for n in ('x', 'scale', 'bias',
                                                   'mean', 'var', 'dy'))
        ndim = x.ndim

        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = bn.batch_normalization_train(
                x, scale, bias, axes)
            out_test = bn.batch_normalization_test(
                x, scale, bias, mean, var, axes)
            # backward pass
            grads_train = T.grad(None, wrt=[x, scale, bias],
                                 known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var],
                                known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)

            # the graph is sorted once and reused by all the checks
            ops = [n.op for n in f.maker.fgraph.toposort()]

            # the abstract Ops must have been replaced in every case
            assert not any(isinstance(op, abstract_ops) for op in ops)
            if axes in valid_axes_lists:
                # ... by each of the cuDNN Ops
                for cudnn_op in cudnn_ops:
                    assert any(isinstance(op, cudnn_op) for op in ops)
            else:
                # ... but not by any cuDNN Op, the inference Op included
                assert not any(isinstance(op, cudnn_ops) for op in ops)
def
test_dnn_tag
():
def
test_dnn_tag
():
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论