Commit c4293e69 authored by Gijs van Tulder

Add running averages to batch norm (no cuDNN yet).

Parent 4f291961
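
The running averages added here follow the standard exponential-moving-average update, with the batch variance rescaled to an unbiased estimate before it enters the average. A minimal NumPy sketch of the update rule (the helper name is illustrative, not part of the commit):

import numpy as np

def update_running_averages(running_mean, running_var,
                            batch_mean, batch_var, factor, m):
    # Exponential moving average of the batch mean.
    new_mean = running_mean * (1 - factor) + batch_mean * factor
    # batch_var is the biased (ddof=0) variance; m / (m - 1) converts it
    # to the unbiased estimate. m is the number of elements averaged per
    # statistic, i.e. prod(x.shape) / prod(scale.shape).
    new_var = running_var * (1 - factor) + (m / (m - 1)) * batch_var * factor
    return new_mean, new_var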
@@ -2949,7 +2949,9 @@ def local_abstract_batch_norm_train_cudnn(node):
if not isinstance(node.op, bn.AbstractBatchNormTrain):
return None
x, scale, bias, epsilon = node.inputs
x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
running_mean = node.inputs[5] if len(node.inputs) > 5 else None
running_var = node.inputs[6] if len(node.inputs) > 6 else None
# input on gpu? TODO what about the output?
x_on_gpu = (isinstance(x.type, GpuArrayType) or
@@ -2983,15 +2985,24 @@ def local_abstract_batch_norm_train_cudnn(node):
out, mean, invstd = dnn_batch_normalization_train(x, scale, bias, mode, eps)
results = [out, mean, invstd]
if running_mean is not None:
running_mean = running_mean * (1 - running_average_factor) + \
mean * running_average_factor
results.append(running_mean)
if running_var is not None:
var = x.var(axis=axes, keepdims=True)
m = tensor.cast(tensor.prod(x.shape) / tensor.prod(scale.shape), theano.config.floatX)
running_var = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * var * running_average_factor
results.append(running_var)
# If the original output was on CPU, we have to transfer it
if isinstance(node.outputs[0].type, tensor.TensorType):
out = tensor.as_tensor_variable(out)
if isinstance(node.outputs[1].type, tensor.TensorType):
mean = tensor.as_tensor_variable(mean)
if isinstance(node.outputs[2].type, tensor.TensorType):
invstd = tensor.as_tensor_variable(invstd)
for i in range(len(node.outputs)):
if isinstance(node.outputs[i].type, tensor.TensorType):
results[i] = tensor.as_tensor_variable(results[i])
# TODO copy_stack_trace?
return [out, mean, invstd]
return results
@local_optimizer([bn.AbstractBatchNormTrainGrad])
......
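
In the optimizer above, m = prod(x.shape) / prod(scale.shape) is the number of elements each statistic averages over, and m / (m - 1) is the Bessel correction that turns the biased variance returned by x.var() into an unbiased estimate. A quick NumPy check of that identity:

import numpy as np

x = np.random.randn(5, 10)
m = x.shape[0]                    # elements averaged per per-activation statistic
biased = x.var(axis=0)            # ddof=0, what x.var(...) computes in the graph
unbiased = x.var(axis=0, ddof=1)  # Bessel-corrected sample variance
assert np.allclose(biased * m / (m - 1), unbiased)

The correction is undefined at m = 1, which is presumably why the tests below compare the running variance through numpy.nan_to_num.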
@@ -1384,26 +1384,39 @@ def test_dnn_batchnorm_train():
for mode in ('per-activation', 'spatial'):
for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
running_average_factor = 0.3
# forward pass, direct interface
out_gpu, x_mean_gpu, x_invstd_gpu = dnn.dnn_batch_normalization_train(
x, scale, bias, mode, eps)
# forward pass, abstract interface
out_abstract, x_mean_abstract, x_invstd_abstract = bn.batch_normalization_train(
x, scale, bias, mode, eps)
out_abstract, x_mean_abstract, x_invstd_abstract, \
out_running_mean_abstract, out_running_var_abstract = \
bn.batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
x_mean_ref = x.mean(axis=axes, keepdims=True)
x_invstd_ref = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
x_var_ref = x.var(axis=axes, keepdims=True)
x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
scale_ref = T.addbroadcast(scale, *axes)
bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
out_running_var_ref = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var_ref * running_average_factor
# backward pass
dy = vartype('dy')
grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
@@ -1414,12 +1427,14 @@ def test_dnn_batchnorm_train():
f_gpu = theano.function([x, scale, bias, dy],
[out_gpu, x_mean_gpu, x_invstd_gpu] + grads_gpu,
mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract] +
f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract,
out_running_mean_abstract, out_running_var_abstract] +
grads_abstract,
mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, dy],
[out_ref, x_mean_ref, x_invstd_ref] + grads_ref)
f_ref = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_ref, x_mean_ref, x_invstd_ref,
out_running_mean_ref, out_running_var_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
in f_abstract.maker.fgraph.toposort()])
@@ -1438,9 +1453,11 @@ def test_dnn_batchnorm_train():
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Dy)
outputs_ref = f_ref(X, Scale, Bias, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean
@@ -1448,13 +1465,51 @@ def test_dnn_batchnorm_train():
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
# compare gradients
utt.assert_allclose(outputs_gpu[3], outputs_ref[3], atol=2e-4) # dx
utt.assert_allclose(outputs_gpu[4], outputs_ref[4], rtol=2e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_gpu[5], outputs_ref[5]) # dbias
utt.assert_allclose(outputs_abstract[3], outputs_ref[3], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[4], outputs_ref[4], rtol=2e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[5], outputs_ref[5]) # dbias
utt.assert_allclose(outputs_gpu[3], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_gpu[4], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_gpu[5], outputs_ref[7]) # dbias
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias
def test_dnn_batchnorm_train_without_running_averages():
# compile and run batch_normalization_train without running averages
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
if dnn.version(raises=False) < 5000:
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
data_shape = (5, 10, 30, 25)
param_shape = (1, 10, 30, 25)
# forward pass
out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation')
# backward pass
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
# compile
f_abstract = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads, mode=mode_with_gpu)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
for n in f_abstract.maker.fgraph.toposort()])
assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
for n in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad))
for n in f_abstract.maker.fgraph.toposort()])
# run
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
f_abstract(X, Scale, Bias, Dy)
def test_batchnorm_inference():
......
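
Taken together, the tests exercise the extended interface: batch_normalization_train now optionally accepts a running_average_factor plus the current running_mean and running_var, and returns the updated averages as two extra outputs; called without them, it keeps its original three-output form. A hedged usage sketch, assuming the tests' from theano.tensor.nnet import bn:

import theano.tensor as T
from theano.tensor.nnet import bn

x, scale, bias = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias')
running_mean, running_var = T.tensor4('running_mean'), T.tensor4('running_var')

# With running averages: five outputs, the last two being the updated averages.
out, mean, invstd, new_mean, new_var = bn.batch_normalization_train(
    x, scale, bias, 'spatial', 1e-4, 0.3, running_mean, running_var)

# Without them: the original three-output interface, unchanged by this commit.
out, mean, invstd = bn.batch_normalization_train(x, scale, bias, 'spatial')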
This diff is collapsed.
@@ -148,9 +148,13 @@ def test_batch_normalization_train():
for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
running_average_factor = 0.3
# remove non-existing axes
if isinstance(axes, tuple):
@@ -159,8 +163,10 @@ def test_batch_normalization_train():
continue
# forward pass
out, x_mean, x_invstd = bn.batch_normalization_train(
x, scale, bias, axes, eps)
out, x_mean, x_invstd, out_running_mean, out_running_var = \
bn.batch_normalization_train(
x, scale, bias, axes, eps,
running_average_factor, running_mean, running_var)
# reference forward pass
if axes == 'per-activation':
axes2 = (0,)
@@ -169,18 +175,25 @@ def test_batch_normalization_train():
else:
axes2 = axes
x_mean2 = x.mean(axis=axes2, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x.var(axis=axes2, keepdims=True) + eps))
x_var2 = x.var(axis=axes2, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
scale2 = T.addbroadcast(scale, *axes2)
bias2 = T.addbroadcast(bias, *axes2)
out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_running_mean2 = running_mean * (1 - running_average_factor) + \
x_mean2 * running_average_factor
out_running_var2 = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var2 * running_average_factor
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
# compile
f = theano.function([x, scale, bias, dy],
[out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
f = theano.function([x, scale, bias, running_mean, running_var, dy],
[out, x_mean, x_invstd, out_running_mean, out_running_var,
out2, x_mean2, x_invstd2, out_running_mean2, out_running_var2] +
grads + grads2, mode='FAST_RUN')
# check if the abstract Ops have been replaced
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
@@ -196,15 +209,47 @@ def test_batch_normalization_train():
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Dy)
Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[0 + 3]) # out
utt.assert_allclose(outputs[1], outputs[1 + 3]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 3]) # invstd
utt.assert_allclose(outputs[0], outputs[0 + 5]) # out
utt.assert_allclose(outputs[1], outputs[1 + 5]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 5]) # invstd
utt.assert_allclose(outputs[3], outputs[3 + 5]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs[4]),
numpy.nan_to_num(outputs[4 + 5])) # running_var
# compare gradients
utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4) # dx
utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs[8], outputs[8 + 3]) # dbias
utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4) # dx
utt.assert_allclose(outputs[11], outputs[11 + 3], rtol=2e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs[12], outputs[12 + 3]) # dbias
def test_batch_normalization_train_without_running_averages():
# compile and run batch_normalization_train without running averages
utt.seed_rng()
x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
data_shape = (5, 10, 30, 25)
param_shape = (1, 10, 30, 25)
# forward pass
out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation')
# backward pass
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
# compile
f = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads, mode='FAST_RUN')
# check if the abstract Ops have been replaced
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad))
for n in f.maker.fgraph.toposort()])
# run
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
f(X, Scale, Bias, Dy)
def test_batch_normalization_test():
......
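
In practice the running averages would typically live in shared variables and be written back through the compiled function's updates. A sketch under that assumption (none of this wiring is part of the commit):

import numpy
import theano
import theano.tensor as T
from theano.tensor.nnet import bn

floatX = theano.config.floatX
param_shape = (1, 10, 30, 25)

x = T.tensor4('x')
scale = theano.shared(numpy.ones(param_shape, dtype=floatX))
bias = theano.shared(numpy.zeros(param_shape, dtype=floatX))
running_mean = theano.shared(numpy.zeros(param_shape, dtype=floatX))
running_var = theano.shared(numpy.ones(param_shape, dtype=floatX))

out, _, _, new_mean, new_var = bn.batch_normalization_train(
    x, scale, bias, 'per-activation', 1e-4, 0.1, running_mean, running_var)

# Fold the running-average updates into every training step.
train = theano.function([x], out,
                        updates=[(running_mean, new_mean),
                                 (running_var, new_var)])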