提交 c4293e69 authored 作者: Gijs van Tulder's avatar Gijs van Tulder

Add running averages to batch norm (no cuDNN yet).

上级 4f291961
...@@ -2949,7 +2949,9 @@ def local_abstract_batch_norm_train_cudnn(node): ...@@ -2949,7 +2949,9 @@ def local_abstract_batch_norm_train_cudnn(node):
if not isinstance(node.op, bn.AbstractBatchNormTrain): if not isinstance(node.op, bn.AbstractBatchNormTrain):
return None return None
x, scale, bias, epsilon = node.inputs x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
running_mean = node.inputs[5] if len(node.inputs) > 5 else None
running_var = node.inputs[6] if len(node.inputs) > 6 else None
# input on gpu? TODO what about the output? # input on gpu? TODO what about the output?
x_on_gpu = (isinstance(x.type, GpuArrayType) or x_on_gpu = (isinstance(x.type, GpuArrayType) or
...@@ -2983,15 +2985,24 @@ def local_abstract_batch_norm_train_cudnn(node): ...@@ -2983,15 +2985,24 @@ def local_abstract_batch_norm_train_cudnn(node):
out, mean, invstd = dnn_batch_normalization_train(x, scale, bias, mode, eps) out, mean, invstd = dnn_batch_normalization_train(x, scale, bias, mode, eps)
results = [out, mean, invstd]
if running_mean is not None:
running_mean = running_mean * (1 - running_average_factor) + \
mean * running_average_factor
results.append(running_mean)
if running_var is not None:
var = x.var(axis=axes, keepdims=True)
m = tensor.cast(tensor.prod(x.shape) / tensor.prod(scale.shape), theano.config.floatX)
running_var = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * var * running_average_factor
results.append(running_var)
# If the original output was on CPU, we have to transfer it # If the original output was on CPU, we have to transfer it
if isinstance(node.outputs[0].type, tensor.TensorType): for i in range(len(node.outputs)):
out = tensor.as_tensor_variable(out) if isinstance(node.outputs[i].type, tensor.TensorType):
if isinstance(node.outputs[1].type, tensor.TensorType): results[i] = tensor.as_tensor_variable(results[i])
mean = tensor.as_tensor_variable(mean)
if isinstance(node.outputs[2].type, tensor.TensorType):
invstd = tensor.as_tensor_variable(invstd)
# TODO copy_stack_trace? # TODO copy_stack_trace?
return [out, mean, invstd] return results
@local_optimizer([bn.AbstractBatchNormTrainGrad]) @local_optimizer([bn.AbstractBatchNormTrainGrad])
......
...@@ -1384,26 +1384,39 @@ def test_dnn_batchnorm_train(): ...@@ -1384,26 +1384,39 @@ def test_dnn_batchnorm_train():
for mode in ('per-activation', 'spatial'): for mode in ('per-activation', 'spatial'):
for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector): for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias')) x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used eps = 5e-3 # some non-standard value to test if it's used
running_average_factor = 0.3
# forward pass, direct interface # forward pass, direct interface
out_gpu, x_mean_gpu, x_invstd_gpu = dnn.dnn_batch_normalization_train( out_gpu, x_mean_gpu, x_invstd_gpu = dnn.dnn_batch_normalization_train(
x, scale, bias, mode, eps) x, scale, bias, mode, eps)
# forward pass, abstract interface # forward pass, abstract interface
out_abstract, x_mean_abstract, x_invstd_abstract = bn.batch_normalization_train( out_abstract, x_mean_abstract, x_invstd_abstract, \
x, scale, bias, mode, eps) out_running_mean_abstract, out_running_var_abstract = \
bn.batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# reference forward pass # reference forward pass
if mode == 'per-activation': if mode == 'per-activation':
axes = (0,) axes = (0,)
elif mode == 'spatial': elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim)) axes = (0,) + tuple(range(2, ndim))
x_mean_ref = x.mean(axis=axes, keepdims=True) x_mean_ref = x.mean(axis=axes, keepdims=True)
x_invstd_ref = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps)) x_var_ref = x.var(axis=axes, keepdims=True)
x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
scale_ref = T.addbroadcast(scale, *axes) scale_ref = T.addbroadcast(scale, *axes)
bias_ref = T.addbroadcast(bias, *axes) bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
out_running_var_ref = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var_ref * running_average_factor
# backward pass # backward pass
dy = vartype('dy') dy = vartype('dy')
grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy}) grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
...@@ -1414,12 +1427,14 @@ def test_dnn_batchnorm_train(): ...@@ -1414,12 +1427,14 @@ def test_dnn_batchnorm_train():
f_gpu = theano.function([x, scale, bias, dy], f_gpu = theano.function([x, scale, bias, dy],
[out_gpu, x_mean_gpu, x_invstd_gpu] + grads_gpu, [out_gpu, x_mean_gpu, x_invstd_gpu] + grads_gpu,
mode=mode_with_gpu) mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, dy], f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract] + [out_abstract, x_mean_abstract, x_invstd_abstract,
out_running_mean_abstract, out_running_var_abstract] +
grads_abstract, grads_abstract,
mode=mode_with_gpu) mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, dy], f_ref = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_ref, x_mean_ref, x_invstd_ref] + grads_ref) [out_ref, x_mean_ref, x_invstd_ref,
out_running_mean_ref, out_running_var_ref] + grads_ref)
# check if the abstract Ops have been replaced # check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
in f_abstract.maker.fgraph.toposort()]) in f_abstract.maker.fgraph.toposort()])
...@@ -1438,9 +1453,11 @@ def test_dnn_batchnorm_train(): ...@@ -1438,9 +1453,11 @@ def test_dnn_batchnorm_train():
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX) Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX) Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX) Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Dy) outputs_gpu = f_gpu(X, Scale, Bias, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Dy) outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Dy) outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs # compare outputs
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean
...@@ -1448,13 +1465,51 @@ def test_dnn_batchnorm_train(): ...@@ -1448,13 +1465,51 @@ def test_dnn_batchnorm_train():
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
# compare gradients # compare gradients
utt.assert_allclose(outputs_gpu[3], outputs_ref[3], atol=2e-4) # dx utt.assert_allclose(outputs_gpu[3], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_gpu[4], outputs_ref[4], rtol=2e-4, atol=1e-4) # dscale utt.assert_allclose(outputs_gpu[4], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_gpu[5], outputs_ref[5]) # dbias utt.assert_allclose(outputs_gpu[5], outputs_ref[7]) # dbias
utt.assert_allclose(outputs_abstract[3], outputs_ref[3], atol=2e-4) # dx utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[4], outputs_ref[4], rtol=2e-4, atol=1e-4) # dscale utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[5], outputs_ref[5]) # dbias utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias
def test_dnn_batchnorm_train_without_running_averages():
    """Compile and run batch_normalization_train with no running averages;
    the cuDNN batch-norm Ops must still replace the abstract Ops."""
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # forward pass without the optional running_mean / running_var inputs
    out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
    # compile
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out, x_mean, x_invstd] + grads,
                                 mode=mode_with_gpu)
    # the abstract Ops must have been replaced by their cuDNN counterparts
    topo = f_abstract.maker.fgraph.toposort()
    assert any(isinstance(node.op, dnn.GpuDnnBatchNorm) for node in topo)
    assert any(isinstance(node.op, dnn.GpuDnnBatchNormGrad) for node in topo)
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    assert not any(isinstance(node.op, abstract_ops) for node in topo)
    # run once on random data to make sure the compiled function executes
    inputs_val = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    dy_val = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    scale_val = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    bias_val = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_abstract(inputs_val, scale_val, bias_val, dy_val)
def test_batchnorm_inference(): def test_batchnorm_inference():
......
...@@ -84,7 +84,8 @@ def batch_normalization(inputs, gamma, beta, mean, std, ...@@ -84,7 +84,8 @@ def batch_normalization(inputs, gamma, beta, mean, std,
def batch_normalization_train(inputs, gamma, beta, axes='per-activation', def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
epsilon=1e-4): epsilon=1e-4, running_average_factor=0.1,
running_mean=None, running_var=None):
""" """
Performs batch normalization of the given inputs, using the mean and Performs batch normalization of the given inputs, using the mean and
variance of the inputs. variance of the inputs.
...@@ -107,6 +108,23 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation', ...@@ -107,6 +108,23 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
epsilon : float epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN). value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns Returns
------- -------
...@@ -116,6 +134,12 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation', ...@@ -116,6 +134,12 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
Means of `inputs` across the normalization axes. Means of `inputs` across the normalization axes.
invstd : tensor invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes. Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes Notes
----- -----
...@@ -131,14 +155,32 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation', ...@@ -131,14 +155,32 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
# for spatial normalization # for spatial normalization
axes = (0,) + tuple(range(2, inputs.ndim)) axes = (0,) + tuple(range(2, inputs.ndim))
mean = inputs.mean(axes, keepdims=True) mean = inputs.mean(axes, keepdims=True)
invstd = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon)) var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
(m / (m - 1)) * var * running_average_factor
""" """
ndim = inputs.ndim ndim = inputs.ndim
if gamma.ndim != ndim or beta.ndim != ndim: if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality " raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" % "as inputs; got %d and %d instead of %d" %
(gamma.ndim, beta.ndim, ndim)) (gamma.ndim, beta.ndim, ndim))
if (running_mean is None) != (running_var is None):
raise ValueError("running_mean and running_var must either both be "
"given or both be None")
if running_mean is not None and running_mean.ndim != ndim:
raise ValueError("running_mean must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_mean.ndim, ndim))
if running_var is not None and running_var.ndim != ndim:
raise ValueError("running_var must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_var.ndim, ndim))
if epsilon < 1e-5: if epsilon < 1e-5:
raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon) raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
...@@ -163,7 +205,23 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation', ...@@ -163,7 +205,23 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
beta = T.addbroadcast(beta, *axes) beta = T.addbroadcast(beta, *axes)
batchnorm_op = AbstractBatchNormTrain(axes=axes) batchnorm_op = AbstractBatchNormTrain(axes=axes)
return tuple(batchnorm_op(inputs, gamma, beta, epsilon=epsilon))
if running_mean is not None and running_var is not None:
running_mean = as_tensor_variable(running_mean)
running_var = as_tensor_variable(running_var)
running_mean_bc = T.addbroadcast(running_mean, *axes)
running_var_bc = T.addbroadcast(running_var, *axes)
out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
inputs, gamma, beta, epsilon=epsilon,
running_average_factor=running_average_factor,
running_mean=running_mean_bc, running_var=running_var_bc)
if new_running_mean.broadcastable != running_mean.broadcastable:
new_running_mean = T.patternbroadcast(new_running_mean, running_mean.broadcastable)
if new_running_var.broadcastable != running_var.broadcastable:
new_running_var = T.patternbroadcast(new_running_var, running_var.broadcastable)
return out, mean, invstd, new_running_mean, new_running_var
else:
return tuple(batchnorm_op(inputs, gamma, beta, epsilon=epsilon))
def batch_normalization_test(inputs, gamma, beta, mean, var, def batch_normalization_test(inputs, gamma, beta, mean, var,
...@@ -277,6 +335,23 @@ class AbstractBatchNormTrain(Op): ...@@ -277,6 +335,23 @@ class AbstractBatchNormTrain(Op):
epsilon epsilon
Epsilon value used in the batch normalization formula. Minimum allowed Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN). value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - running_average_factor) + batch mean * running_average_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - running_average_factor) + (m / (m - 1)) * batch var * running_average_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
""" """
__props__ = ('axes',) __props__ = ('axes',)
...@@ -288,40 +363,85 @@ class AbstractBatchNormTrain(Op): ...@@ -288,40 +363,85 @@ class AbstractBatchNormTrain(Op):
self.axes = axes self.axes = axes
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0], shape[1], shape[1]] return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
def make_node(self, x, scale, bias, epsilon=1e-4): def make_node(self, x, scale, bias, epsilon=1e-4,
running_average_factor=0.1,
running_mean=None, running_var=None):
assert x.ndim == scale.ndim == bias.ndim assert x.ndim == scale.ndim == bias.ndim
assert ((running_mean is None and running_var is None) or
(running_mean is not None and running_var is not None))
assert (running_mean is None or running_mean.ndim == x.ndim)
assert (running_var is None or running_var.ndim == x.ndim)
if not isinstance(epsilon, theano.Variable): if not isinstance(epsilon, theano.Variable):
epsilon = as_tensor_variable(epsilon) epsilon = as_tensor_variable(epsilon)
return Apply(self, [x, scale, bias, epsilon], [x.type(), scale.type(), scale.type()]) if not isinstance(running_average_factor, theano.Variable):
running_average_factor = as_tensor_variable(running_average_factor)
inputs = [x, scale, bias, epsilon, running_average_factor]
output_types = [x.type(), scale.type(), scale.type()]
if running_mean is not None and running_var is not None:
inputs.append(running_mean)
inputs.append(running_var)
output_types.append(scale.type())
output_types.append(scale.type())
return Apply(self, inputs, output_types)
def grad(self, inputs, grads): def grad(self, inputs, grads):
x, scale, bias, epsilon = inputs x, scale, bias, epsilon, running_average_factor = inputs[:5]
dy = grads[0] dy = grads[0]
_, x_mean, x_invstd = self(x, scale, bias, epsilon) _, x_mean, x_invstd = self(*inputs)[:3]
disconnected_outputs = [
theano.gradient.DisconnectedType()(), # epsilon
theano.gradient.DisconnectedType()()] # running_average_factor
# Optional running_mean and running_var.
for i in range(5, len(inputs)):
disconnected_outputs.append(theano.gradient.DisconnectedType()())
return AbstractBatchNormTrainGrad(self.axes)( return AbstractBatchNormTrainGrad(self.axes)(
x, dy, scale, x_mean, x_invstd, epsilon) + [theano.gradient.DisconnectedType()()] x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs
def connection_pattern(self, node): def connection_pattern(self, node):
# Specificy that epsilon is not connected to outputs. # Specificy that epsilon and running_average_factor are not connected to outputs.
return [[True, True, True], [True, True, True], [True, True, True], patterns = [[True, True, True], # x
[False, False, False]] [True, True, True], # scale
[True, True, True], # bias
[False, False, False], # epsilon
[False, False, False]] # running_average_factor
# Optional running_mean and running_var are only
# connected to their new values.
for i in range(5, len(node.inputs)):
patterns[0].append(True)
for pattern in patterns[1:]:
pattern.append(False)
patterns.append([False] * (3 + i - 5) + [True])
return patterns
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
x, scale, bias, epsilon = inputs x, scale, bias, epsilon, running_average_factor = inputs[:5]
axes = self.axes axes = self.axes
if min(axes) < 0 or max(axes) >= x.ndim: if min(axes) < 0 or max(axes) >= x.ndim:
raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes))) raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))
mean = x.mean(axes, keepdims=True) mean = x.mean(axes, keepdims=True)
invstd = 1.0 / numpy.sqrt(x.var(axes, keepdims=True) + epsilon) var = x.var(axes, keepdims=True)
invstd = 1.0 / numpy.sqrt(var + epsilon)
out = (x - mean) * (scale * invstd) + bias out = (x - mean) * (scale * invstd) + bias
output_storage[0][0] = out output_storage[0][0] = out
output_storage[1][0] = mean output_storage[1][0] = mean
output_storage[2][0] = invstd output_storage[2][0] = invstd
if len(inputs) > 5:
running_mean = inputs[5]
running_mean = running_mean * (1.0 - running_average_factor) + \
mean * running_average_factor
output_storage[3][0] = running_mean
if len(inputs) > 6:
m = float(numpy.prod(x.shape) / numpy.prod(scale.shape))
running_var = inputs[6]
running_var = running_var * (1.0 - running_average_factor) + \
(m / (m - 1)) * var * running_average_factor
output_storage[4][0] = running_var
class AbstractBatchNormInference(Op): class AbstractBatchNormInference(Op):
""" """
...@@ -429,21 +549,42 @@ def local_abstract_batch_norm_train(node): ...@@ -429,21 +549,42 @@ def local_abstract_batch_norm_train(node):
if not isinstance(node.op, AbstractBatchNormTrain): if not isinstance(node.op, AbstractBatchNormTrain):
return None return None
x, scale, bias, epsilon = node.inputs x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
axes = node.op.axes axes = node.op.axes
if min(axes) < 0 or max(axes) > x.ndim: if min(axes) < 0 or max(axes) > x.ndim:
return None return None
if not isinstance(x.type, TensorType) or \ if not isinstance(x.type, TensorType) or \
not isinstance(scale.type, TensorType) or \ not isinstance(scale.type, TensorType) or \
not isinstance(bias.type, TensorType) or \ not isinstance(bias.type, TensorType) or \
not isinstance(epsilon.type, TensorType): not isinstance(epsilon.type, TensorType) or \
not isinstance(running_average_factor.type, TensorType):
return None
# optional running_mean and running_var
if len(node.inputs) > 5 and not isinstance(node.inputs[5].type, TensorType):
return None
if len(node.inputs) > 6 and not isinstance(node.inputs[6].type, TensorType):
return None return None
mean = x.mean(axes, keepdims=True) mean = x.mean(axes, keepdims=True)
invstd = T.inv(T.sqrt(x.var(axes, keepdims=True) + epsilon)) var = x.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (x - mean) * (scale * invstd) + bias out = (x - mean) * (scale * invstd) + bias
results = [out, mean, invstd]
if len(node.inputs) > 5:
running_mean = node.inputs[5]
running_mean = running_mean * (1.0 - running_average_factor) + \
mean * running_average_factor
results.append(running_mean)
if len(node.inputs) > 6:
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
running_var = node.inputs[6]
running_var = running_var * (1.0 - running_average_factor) + \
(m / (m - 1)) * var * running_average_factor
results.append(running_var)
# TODO copy_stack_trace? # TODO copy_stack_trace?
return [out, mean, invstd] return results
@local_optimizer([AbstractBatchNormTrainGrad]) @local_optimizer([AbstractBatchNormTrainGrad])
......
...@@ -148,9 +148,13 @@ def test_batch_normalization_train(): ...@@ -148,9 +148,13 @@ def test_batch_normalization_train():
for axes in ('per-activation', 'spatial', (1, 2, 3, 4)): for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector): for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias')) x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used eps = 5e-3 # some non-standard value to test if it's used
running_average_factor = 0.3
# remove non-existing axes # remove non-existing axes
if isinstance(axes, tuple): if isinstance(axes, tuple):
...@@ -159,8 +163,10 @@ def test_batch_normalization_train(): ...@@ -159,8 +163,10 @@ def test_batch_normalization_train():
continue continue
# forward pass # forward pass
out, x_mean, x_invstd = bn.batch_normalization_train( out, x_mean, x_invstd, out_running_mean, out_running_var = \
x, scale, bias, axes, eps) bn.batch_normalization_train(
x, scale, bias, axes, eps,
running_average_factor, running_mean, running_var)
# reference forward pass # reference forward pass
if axes == 'per-activation': if axes == 'per-activation':
axes2 = (0,) axes2 = (0,)
...@@ -169,18 +175,25 @@ def test_batch_normalization_train(): ...@@ -169,18 +175,25 @@ def test_batch_normalization_train():
else: else:
axes2 = axes axes2 = axes
x_mean2 = x.mean(axis=axes2, keepdims=True) x_mean2 = x.mean(axis=axes2, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x.var(axis=axes2, keepdims=True) + eps)) x_var2 = x.var(axis=axes2, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
scale2 = T.addbroadcast(scale, *axes2) scale2 = T.addbroadcast(scale, *axes2)
bias2 = T.addbroadcast(bias, *axes2) bias2 = T.addbroadcast(bias, *axes2)
out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2 out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_running_mean2 = running_mean * (1 - running_average_factor) + \
x_mean2 * running_average_factor
out_running_var2 = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var2 * running_average_factor
# backward pass # backward pass
dy = vartype('dy') dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy}) grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
# reference backward pass # reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy}) grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
# compile # compile
f = theano.function([x, scale, bias, dy], f = theano.function([x, scale, bias, running_mean, running_var, dy],
[out, x_mean, x_invstd, out2, x_mean2, x_invstd2] + [out, x_mean, x_invstd, out_running_mean, out_running_var,
out2, x_mean2, x_invstd2, out_running_mean2, out_running_var2] +
grads + grads2, mode='FAST_RUN') grads + grads2, mode='FAST_RUN')
# check if the abstract Ops have been replaced # check if the abstract Ops have been replaced
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain, assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
...@@ -196,15 +209,47 @@ def test_batch_normalization_train(): ...@@ -196,15 +209,47 @@ def test_batch_normalization_train():
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX) Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX) Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX) Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Dy) Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs # compare outputs
utt.assert_allclose(outputs[0], outputs[0 + 3]) # out utt.assert_allclose(outputs[0], outputs[0 + 5]) # out
utt.assert_allclose(outputs[1], outputs[1 + 3]) # mean utt.assert_allclose(outputs[1], outputs[1 + 5]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 3]) # invstd utt.assert_allclose(outputs[2], outputs[2 + 5]) # invstd
utt.assert_allclose(outputs[3], outputs[3 + 5]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs[4]),
numpy.nan_to_num(outputs[4 + 5])) # running_var
# compare gradients # compare gradients
utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4) # dx utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4) # dx
utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4) # dscale utt.assert_allclose(outputs[11], outputs[11 + 3], rtol=2e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs[8], outputs[8 + 3]) # dbias utt.assert_allclose(outputs[12], outputs[12 + 3]) # dbias
def test_batch_normalization_train_without_running_averages():
    """Compile and run batch_normalization_train with no running averages
    on the CPU path; the abstract Ops must all be optimized away."""
    utt.seed_rng()
    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # forward pass without the optional running_mean / running_var inputs
    out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
    # compile
    f = theano.function([x, scale, bias, dy],
                        [out, x_mean, x_invstd] + grads, mode='FAST_RUN')
    # none of the abstract Ops may survive optimization
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    assert not any(isinstance(node.op, abstract_ops)
                   for node in f.maker.fgraph.toposort())
    # run once on random data to make sure the compiled function executes
    inputs_val = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    dy_val = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    scale_val = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    bias_val = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(inputs_val, scale_val, bias_val, dy_val)
def test_batch_normalization_test(): def test_batch_normalization_test():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论