Commit 681c67fa authored by Frédéric Bastien

Merge pull request #3410 from ballasn/batchnormalization

Add batch normalization op
import theano
from theano.scalar import Composite
from theano.scalar import add, sub, true_div, mul


class BNComposite(Composite):

    def __init__(self, dtype):
        x = theano.scalar.Scalar(dtype=dtype).make_variable()
        mean = theano.scalar.Scalar(dtype=dtype).make_variable()
        std = theano.scalar.Scalar(dtype=dtype).make_variable()
        gamma = theano.scalar.Scalar(dtype=dtype).make_variable()
        beta = theano.scalar.Scalar(dtype=dtype).make_variable()
        # o = (x - mean) / std * gamma + beta, built from scalar ops
        o = add(mul(true_div(sub(x, mean), std), gamma), beta)
        inputs = [x, mean, std, gamma, beta]
        outputs = [o]
        super(BNComposite, self).__init__(inputs, outputs)

    def grad(self, inps, grads):
        x, mean, std, gamma, beta = inps
        top, = grads
        dx = (top * gamma) / std
        dmean = -(top * gamma) / std
        dstd = -(top * gamma * (x - mean)) / (std * std)
        dgamma = top * (x - mean) / std
        # d(o)/d(beta) = 1, so the gradient w.r.t. beta is just `top`
        return [dx, dmean, dstd, dgamma, top]
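These hand-written gradients follow directly from o = (x - mean) / std * gamma + beta: for example, d(o)/d(x) = gamma / std and d(o)/d(beta) = 1. As a sanity check, here is a small standalone numpy sketch (not part of the pull request; names are illustrative) comparing the analytic formulas against central finite differences at an arbitrary point:

import numpy

def bn_scalar(x, mean, std, gamma, beta):
    return (x - mean) / std * gamma + beta

# Arbitrary point at which to check the gradients.
pt = dict(x=2.0, mean=0.5, std=1.5, gamma=1.2, beta=0.3)
top = 1.0  # upstream gradient

# Analytic gradients, mirroring BNComposite.grad
x, mean, std, gamma, beta = pt['x'], pt['mean'], pt['std'], pt['gamma'], pt['beta']
analytic = [
    (top * gamma) / std,                        # dx
    -(top * gamma) / std,                       # dmean
    -(top * gamma * (x - mean)) / (std * std),  # dstd
    top * (x - mean) / std,                     # dgamma
    top,                                        # dbeta
]

# Central finite differences for each input in turn
eps = 1e-6
for name, a in zip(['x', 'mean', 'std', 'gamma', 'beta'], analytic):
    hi, lo = dict(pt), dict(pt)
    hi[name] += eps
    lo[name] -= eps
    fd = (bn_scalar(**hi) - bn_scalar(**lo)) / (2 * eps)
    assert numpy.allclose(fd, a, atol=1e-4), name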
def batch_normalization(inputs, gamma, beta, mean, std,
                        mode='low_mem'):
    """
    This function will build the symbolic graph for applying batch
    normalization to a set of activations.

    Also works on the GPU.

    Parameters
    ----------
    inputs : symbolic tensor
        Mini-batch of activations
    gamma : symbolic tensor
        BN scale parameter, must be of same dimensionality as
        inputs and broadcastable against it
    beta : symbolic tensor
        BN shift parameter, must be of same dimensionality as
        inputs and broadcastable against it
    mean : symbolic tensor
        inputs means, must be of same dimensionality as
        inputs and broadcastable against it
    std : symbolic tensor
        inputs standard deviations, must be of same dimensionality as
        inputs and broadcastable against it
    mode : 'low_mem' or 'high_mem'
        Specifies which batch_normalization implementation is used.
        Because no intermediate representations are stored for
        back-propagation, the 'low_mem' implementation lowers memory
        usage; however, it is 5-10% slower than the 'high_mem'
        implementation. Note that this 5-10% difference concerns the
        batch_normalization operation alone; the time difference
        between the implementations is likely to matter less over a
        full model's fprop/bprop.
    """
    if mode == 'low_mem':
        elm_bn = theano.tensor.elemwise.Elemwise(
            scalar_op=BNComposite(dtype=inputs.dtype))
        rval = elm_bn(inputs, mean, std, gamma, beta)
    elif mode == 'high_mem':
        rval = (inputs - mean) / std
        rval = rval * gamma + beta
    else:
        raise ValueError(
            'mode must be either "low_mem" or "high_mem"')
    return rval
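For reference, a minimal usage sketch (not from the pull request; names are illustrative): normalizing each column of a 2-D minibatch with the batch's own statistics, with gamma fixed to ones and beta to zeros:

import numpy
import theano
import theano.tensor as T
from theano.tensor.nnet.bn import batch_normalization

x = T.matrix('x')
g = T.vector('g')  # gamma, one scale per column
b = T.vector('b')  # beta, one shift per column

# gamma/beta must match the input's dimensionality,
# so add a broadcastable leading axis.
out = batch_normalization(x, g.dimshuffle('x', 0), b.dimshuffle('x', 0),
                          x.mean(axis=0, keepdims=True),
                          x.std(axis=0, keepdims=True),
                          mode='high_mem')
f = theano.function([x, g, b], out)

X = numpy.random.random((10, 20)).astype('float32')
print(f(X, numpy.ones(20, 'float32'), numpy.zeros(20, 'float32')))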
import theano
from theano.tests import unittest_tools as utt
import numpy
from theano.tensor.nnet.bn import batch_normalization
def test_bn():

    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    numpy.random.seed(1234)
    X = 1 + numpy.random.random([10, 20]).astype('float32')
    B = 1 + numpy.random.random([20]).astype('float32')
    G = 1 + numpy.random.random([20]).astype('float32')
    M = 1 + numpy.random.random([20]).astype('float32')
    V = 1 + numpy.random.random([20]).astype('float32')

    x = theano.tensor.matrix('x')
    b = theano.tensor.vector('b')
    g = theano.tensor.vector('g')
    m = theano.tensor.vector('m')
    v = theano.tensor.vector('v')

    bn_ref_op = bn_ref(x, g, b, m, v)
    f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ['low_mem', 'high_mem']:
        bn_op = batch_normalization(x, g, b, m, v, mode=mode)
        f = theano.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def bn(inputs, gamma, beta, mean, std):
            return batch_normalization(inputs, gamma, beta, mean, std,
                                       mode=mode)
        utt.verify_grad(bn, [X, G, B, M, V])

    bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True),
                       x.std(axis=0, keepdims=True))
    f_ref = theano.function([x, b, g], [bn_ref_op])
    res_ref = f_ref(X, G, B)
    for mode in ['low_mem', 'high_mem']:
        bn_op = batch_normalization(x, g, b,
                                    x.mean(axis=0, keepdims=True),
                                    x.std(axis=0, keepdims=True),
                                    mode=mode)
        f = theano.function([x, b, g], [bn_op])
        res = f(X, G, B)
        utt.assert_allclose(res_ref, res)

        def bn(inputs, gamma, beta, mean, std):
            return batch_normalization(inputs, gamma, beta, mean, std,
                                       mode=mode)
        # Pass the mode-capturing closure so both modes are checked.
        utt.verify_grad(bn, [X, G, B,
                             X.mean(axis=0)[numpy.newaxis],
                             X.std(axis=0)[numpy.newaxis]])
def test_bn_feature_maps():

    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    numpy.random.seed(1234)
    X = 1 + numpy.random.random([10, 20, 4, 4]).astype('float32')
    B = 1 + numpy.random.random([20]).astype('float32')
    G = 1 + numpy.random.random([20]).astype('float32')
    M = 1 + numpy.random.random([20]).astype('float32')
    V = 1 + numpy.random.random([20]).astype('float32')

    x = theano.tensor.tensor4('x')
    b = theano.tensor.vector('b')
    g = theano.tensor.vector('g')
    m = theano.tensor.vector('m')
    v = theano.tensor.vector('v')

    bn_ref_op = bn_ref(x,
                       g.dimshuffle('x', 0, 'x', 'x'),
                       b.dimshuffle('x', 0, 'x', 'x'),
                       m.dimshuffle('x', 0, 'x', 'x'),
                       v.dimshuffle('x', 0, 'x', 'x'))
    f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ['low_mem', 'high_mem']:
        bn_op = batch_normalization(x,
                                    g.dimshuffle('x', 0, 'x', 'x'),
                                    b.dimshuffle('x', 0, 'x', 'x'),
                                    m.dimshuffle('x', 0, 'x', 'x'),
                                    v.dimshuffle('x', 0, 'x', 'x'),
                                    mode=mode)
        f = theano.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def conv_bn(inputs, gamma, beta, mean, std):
            return batch_normalization(inputs,
                                       gamma.dimshuffle('x', 0, 'x', 'x'),
                                       beta.dimshuffle('x', 0, 'x', 'x'),
                                       mean.dimshuffle('x', 0, 'x', 'x'),
                                       std.dimshuffle('x', 0, 'x', 'x'),
                                       mode=mode)
        utt.verify_grad(conv_bn, [X, G, B, M, V])
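In these tests, dimshuffle('x', 0, 'x', 'x') turns each per-channel vector into a 4-D tensor with broadcastable batch and spatial axes. A standalone numpy analogue of that broadcast (illustrative only):

import numpy

X = numpy.random.random((10, 20, 4, 4)).astype('float32')  # (batch, channel, h, w)
G = numpy.ones(20, 'float32')                               # one scale per channel
# dimshuffle('x', 0, 'x', 'x') is equivalent to inserting broadcast axes:
G4 = G[numpy.newaxis, :, numpy.newaxis, numpy.newaxis]      # shape (1, 20, 1, 1)
print((X * G4).shape)  # (10, 20, 4, 4): G broadcasts across batch and space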