提交 36de8dd2 authored 作者: Nicolas Ballas's avatar Nicolas Ballas

Minor updates

上级 6ad3fded
...@@ -8,49 +8,49 @@ class BNComposite(Composite): ...@@ -8,49 +8,49 @@ class BNComposite(Composite):
def __init__(self, dtype): def __init__(self, dtype):
x = theano.scalar.Scalar(dtype=dtype).make_variable() x = theano.scalar.Scalar(dtype=dtype).make_variable()
mean = theano.scalar.Scalar(dtype=dtype).make_variable() mean = theano.scalar.Scalar(dtype=dtype).make_variable()
var = theano.scalar.Scalar(dtype=dtype).make_variable() std = theano.scalar.Scalar(dtype=dtype).make_variable()
gamma = theano.scalar.Scalar(dtype=dtype).make_variable() gamma = theano.scalar.Scalar(dtype=dtype).make_variable()
beta = theano.scalar.Scalar(dtype=dtype).make_variable() beta = theano.scalar.Scalar(dtype=dtype).make_variable()
o = add(mul(true_div(sub(x, mean), var), gamma), beta) o = add(mul(true_div(sub(x, mean), std), gamma), beta)
inputs = [x, mean, var, gamma, beta] inputs = [x, mean, std, gamma, beta]
outputs= [o] outputs = [o]
super(BNComposite, self).__init__(inputs, outputs) super(BNComposite, self).__init__(inputs, outputs)
def grad(self, inps, grads): def grad(self, inps, grads):
x, mean, var, gamma, beta = inps x, mean, std, gamma, beta = inps
top, = grads top, = grads
dx = (top*gamma) / var dx = (top * gamma) / std
dmean = -(top*gamma) / var dmean = -(top * gamma) / std
dvar = -(top * gamma * (x - mean)) / (var*var) dstd = -(top * gamma * (x - mean)) / (std * std)
dgamma = top*(x - mean) / var dgamma = top * (x - mean) / std
return [dx, dmean, dvar, dgamma, top] return [dx, dmean, dstd, dgamma, top]
def batch_normalization(inputs, gamma, beta, mean, variance, axis=0): def batch_normalization(inputs, gamma, beta, mean, std):
""" """
This function will build the symbolic graph for applying batch normalization This function will build the symbolic graph for applying batch normalization
to a set of activations. to a set of activations. As no intermediate representations are stored for the
back-propagation, this implementation lowers the memory usage; however,
it is 5-10% slower than a naive theano implementation, as it redoes
some forward computations for the backprop.
Parameters Parameters
---------- ----------
inputs : symbolic tensor inputs : symbolic tensor
Mini-batch of examples Mini-batch of activations
gamma: symbolic vector gamma: symbolic tensor
BN scale parameter, must be of same dimension that BN scale parameter, must be of same dimensionality as
the number of inputs channel inputs and broadcastable against it
beta: symbolic vector beta: symbolic tensor
BN shift parameter, must be of same dimension that BN shift parameter, must be of same dimensionality as
the number of inputs channel inputs and broadcastable against it
mean: symbolic tensor mean: symbolic tensor
inputs means inputs means, must be of same dimensionality as
variance: symbolic tensor inputs and broadcastable against it
inputs variance std: symbolic tensor
axis: int inputs standard deviation, must be of same dimensionality as
channel axis inputs and broadcastable against it
""" """
elm_bn = theano.tensor.elemwise.Elemwise(scalar_op=BNComposite(dtype=inputs.dtype)) elm_bn = theano.tensor.elemwise.Elemwise(scalar_op=BNComposite(dtype=inputs.dtype))
rval = elm_bn(inputs, mean, variance, gamma, beta) rval = elm_bn(inputs, mean, std, gamma, beta)
return rval return rval
...@@ -8,8 +8,8 @@ from theano.tensor.nnet.bn import batch_normalization ...@@ -8,8 +8,8 @@ from theano.tensor.nnet.bn import batch_normalization
def test_bn(): def test_bn():
def bn_ref(x, G, B, M, V): def bn_ref(x, G, B, M, V):
n = (x-M)/V n = (x - M) / V
return n*G+B return n * G + B
numpy.random.seed(1234) numpy.random.seed(1234)
X = 1 + numpy.random.random([10, 20]).astype('float32') X = 1 + numpy.random.random([10, 20]).astype('float32')
...@@ -26,28 +26,28 @@ def test_bn(): ...@@ -26,28 +26,28 @@ def test_bn():
bn_op = batch_normalization(x, g, b, m, v) bn_op = batch_normalization(x, g, b, m, v)
bn_ref_op = bn_ref(x, g, b, m, v) bn_ref_op = bn_ref(x, g, b, m, v)
f = theano.function([x, b, g, m ,v], [bn_op]) f = theano.function([x, b, g, m, v], [bn_op])
f_ref = theano.function([x, b, g, m ,v], [bn_ref_op]) f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
res = f(X, G, B, M, V) res = f(X, G, B, M, V)
res_ref = f_ref(X, G, B, M, V) res_ref = f_ref(X, G, B, M, V)
utt.assert_allclose(res_ref, res) utt.assert_allclose(res_ref, res)
utt.verify_grad(batch_normalization, [X, G, B, M, V]) utt.verify_grad(batch_normalization, [X, G, B, M, V])
bn_op = batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.var(axis=0, keepdims=True)) bn_op = batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True))
bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.var(axis=0, keepdims=True)) bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.var(axis=0, keepdims=True))
f = theano.function([x, b, g], [bn_op]) f = theano.function([x, b, g], [bn_op])
f_ref = theano.function([x, b, g], [bn_ref_op]) f_ref = theano.function([x, b, g], [bn_ref_op])
res = f(X, G, B) res = f(X, G, B)
res_ref = f_ref(X, G, B) res_ref = f_ref(X, G, B)
utt.assert_allclose(res_ref, res) utt.assert_allclose(res_ref, res)
utt.verify_grad(batch_normalization, [X, G, B, X.mean(axis=0, keepdims=True), X.var(axis=0, keepdims=True)]) utt.verify_grad(batch_normalization, [X, G, B, X.mean(axis=0, keepdims=True), X.std(axis=0, keepdims=True)])
def test_bn_feature_maps(): def test_bn_feature_maps():
def bn_ref(x, G, B, M, V): def bn_ref(x, G, B, M, V):
n = (x-M)/V n = (x - M) / V
return n*G+B return n * G + B
numpy.random.seed(1234) numpy.random.seed(1234)
X = 1 + numpy.random.random([10, 20, 4, 4]).astype('float32') X = 1 + numpy.random.random([10, 20, 4, 4]).astype('float32')
...@@ -62,7 +62,6 @@ def test_bn_feature_maps(): ...@@ -62,7 +62,6 @@ def test_bn_feature_maps():
m = theano.tensor.vector('m') m = theano.tensor.vector('m')
v = theano.tensor.vector('v') v = theano.tensor.vector('v')
### Provide mean/var
bn_op = batch_normalization(x, bn_op = batch_normalization(x,
g.dimshuffle('x', 0, 'x', 'x'), g.dimshuffle('x', 0, 'x', 'x'),
b.dimshuffle('x', 0, 'x', 'x'), b.dimshuffle('x', 0, 'x', 'x'),
...@@ -73,8 +72,8 @@ def test_bn_feature_maps(): ...@@ -73,8 +72,8 @@ def test_bn_feature_maps():
b.dimshuffle('x', 0, 'x', 'x'), b.dimshuffle('x', 0, 'x', 'x'),
m.dimshuffle('x', 0, 'x', 'x'), m.dimshuffle('x', 0, 'x', 'x'),
v.dimshuffle('x', 0, 'x', 'x')) v.dimshuffle('x', 0, 'x', 'x'))
f = theano.function([x, b, g, m ,v], [bn_op]) f = theano.function([x, b, g, m, v], [bn_op])
f_ref = theano.function([x, b, g, m ,v], [bn_ref_op]) f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
res = f(X, G, B, M, V) res = f(X, G, B, M, V)
res_ref = f_ref(X, G, B, M, V) res_ref = f_ref(X, G, B, M, V)
utt.assert_allclose(res_ref, res) utt.assert_allclose(res_ref, res)
...@@ -87,4 +86,3 @@ def test_bn_feature_maps(): ...@@ -87,4 +86,3 @@ def test_bn_feature_maps():
variance.dimshuffle('x', 0, 'x', 'x'), variance.dimshuffle('x', 0, 'x', 'x'),
axis=1) axis=1)
utt.verify_grad(conv_bn, [X, G, B, M, V]) utt.verify_grad(conv_bn, [X, G, B, M, V])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论