Commit 681c67fa authored by Frédéric Bastien

Merge pull request #3410 from ballasn/batchnormalization

Add batch normalization op
import theano
from theano.scalar import Composite
from theano.scalar import add, sub, true_div, mul


class BNComposite(Composite):

    def __init__(self, dtype):
        x = theano.scalar.Scalar(dtype=dtype).make_variable()
        mean = theano.scalar.Scalar(dtype=dtype).make_variable()
        std = theano.scalar.Scalar(dtype=dtype).make_variable()
        gamma = theano.scalar.Scalar(dtype=dtype).make_variable()
        beta = theano.scalar.Scalar(dtype=dtype).make_variable()
        # o = (x - mean) / std * gamma + beta, built from scalar ops
        o = add(mul(true_div(sub(x, mean), std), gamma), beta)
        inputs = [x, mean, std, gamma, beta]
        outputs = [o]
        super(BNComposite, self).__init__(inputs, outputs)

    def grad(self, inps, grads):
        x, mean, std, gamma, beta = inps
        top, = grads
        dx = (top * gamma) / std
        dmean = -(top * gamma) / std
        dstd = -(top * gamma * (x - mean)) / (std * std)
        dgamma = top * (x - mean) / std
        # d(o)/d(beta) = 1, so the gradient w.r.t. beta is just `top`
        return [dx, dmean, dstd, dgamma, top]
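These hand-written gradients follow directly from o = (x - mean) / std * gamma + beta: for example, d(o)/d(x) = gamma / std and d(o)/d(beta) = 1. As a sanity check, here is a small standalone numpy sketch (not part of the pull request; names are illustrative) comparing the analytic formulas against central finite differences at an arbitrary point:

import numpy

def bn_scalar(x, mean, std, gamma, beta):
    return (x - mean) / std * gamma + beta

# Arbitrary point at which to check the gradients.
pt = dict(x=2.0, mean=0.5, std=1.5, gamma=1.2, beta=0.3)
top = 1.0  # upstream gradient

# Analytic gradients, mirroring BNComposite.grad
x, mean, std, gamma, beta = pt['x'], pt['mean'], pt['std'], pt['gamma'], pt['beta']
analytic = [
    (top * gamma) / std,                        # dx
    -(top * gamma) / std,                       # dmean
    -(top * gamma * (x - mean)) / (std * std),  # dstd
    top * (x - mean) / std,                     # dgamma
    top,                                        # dbeta
]

# Central finite differences for each input in turn
eps = 1e-6
for name, a in zip(['x', 'mean', 'std', 'gamma', 'beta'], analytic):
    hi, lo = dict(pt), dict(pt)
    hi[name] += eps
    lo[name] -= eps
    fd = (bn_scalar(**hi) - bn_scalar(**lo)) / (2 * eps)
    assert numpy.allclose(fd, a, atol=1e-4), name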
def batch_normalization(inputs, gamma, beta, mean, std,
                        mode='low_mem'):
    """
    This function will build the symbolic graph for applying batch
    normalization to a set of activations.

    Also works on the GPU.

    Parameters
    ----------
    inputs : symbolic tensor
        Mini-batch of activations
    gamma : symbolic tensor
        BN scale parameter, must be of same dimensionality as
        inputs and broadcastable against it
    beta : symbolic tensor
        BN shift parameter, must be of same dimensionality as
        inputs and broadcastable against it
    mean : symbolic tensor
        inputs means, must be of same dimensionality as
        inputs and broadcastable against it
    std : symbolic tensor
        inputs standard deviations, must be of same dimensionality as
        inputs and broadcastable against it
    mode : 'low_mem' or 'high_mem'
        Specifies which batch_normalization implementation is used.
        Because no intermediate representations are stored for
        back-propagation, the 'low_mem' implementation lowers memory
        usage; however, it is 5-10% slower than the 'high_mem'
        implementation. Note that this 5-10% difference concerns the
        batch_normalization operation alone; the time difference
        between the implementations is likely to matter less over a
        full model's fprop/bprop.
    """
    if mode == 'low_mem':
        elm_bn = theano.tensor.elemwise.Elemwise(
            scalar_op=BNComposite(dtype=inputs.dtype))
        rval = elm_bn(inputs, mean, std, gamma, beta)
    elif mode == 'high_mem':
        rval = (inputs - mean) / std
        rval = rval * gamma + beta
    else:
        raise ValueError(
            'mode must be either "low_mem" or "high_mem"')
    return rval
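For reference, a minimal usage sketch (not from the pull request; names are illustrative): normalizing each column of a 2-D minibatch with the batch's own statistics, with gamma fixed to ones and beta to zeros:

import numpy
import theano
import theano.tensor as T
from theano.tensor.nnet.bn import batch_normalization

x = T.matrix('x')
g = T.vector('g')  # gamma, one scale per column
b = T.vector('b')  # beta, one shift per column

# gamma/beta must match the input's dimensionality,
# so add a broadcastable leading axis.
out = batch_normalization(x, g.dimshuffle('x', 0), b.dimshuffle('x', 0),
                          x.mean(axis=0, keepdims=True),
                          x.std(axis=0, keepdims=True),
                          mode='high_mem')
f = theano.function([x, g, b], out)

X = numpy.random.random((10, 20)).astype('float32')
print(f(X, numpy.ones(20, 'float32'), numpy.zeros(20, 'float32')))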
import theano
from theano.tests import unittest_tools as utt
import numpy
from theano.tensor.nnet.bn import batch_normalization
def test_bn():

    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    numpy.random.seed(1234)
    X = 1 + numpy.random.random([10, 20]).astype('float32')
    B = 1 + numpy.random.random([20]).astype('float32')
    G = 1 + numpy.random.random([20]).astype('float32')
    M = 1 + numpy.random.random([20]).astype('float32')
    V = 1 + numpy.random.random([20]).astype('float32')

    x = theano.tensor.matrix('x')
    b = theano.tensor.vector('b')
    g = theano.tensor.vector('g')
    m = theano.tensor.vector('m')
    v = theano.tensor.vector('v')

    bn_ref_op = bn_ref(x, g, b, m, v)
    f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ['low_mem', 'high_mem']:
        bn_op = batch_normalization(x, g, b, m, v, mode=mode)
        f = theano.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def bn(inputs, gamma, beta, mean, std):
            return batch_normalization(inputs, gamma, beta, mean, std,
                                       mode=mode)
        utt.verify_grad(bn, [X, G, B, M, V])

    bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True),
                       x.std(axis=0, keepdims=True))
    f_ref = theano.function([x, b, g], [bn_ref_op])
    res_ref = f_ref(X, G, B)
    for mode in ['low_mem', 'high_mem']:
        bn_op = batch_normalization(x, g, b,
                                    x.mean(axis=0, keepdims=True),
                                    x.std(axis=0, keepdims=True),
                                    mode=mode)
        f = theano.function([x, b, g], [bn_op])
        res = f(X, G, B)
        utt.assert_allclose(res_ref, res)

        def bn(inputs, gamma, beta, mean, std):
            return batch_normalization(inputs, gamma, beta, mean, std,
                                       mode=mode)
        # Pass the mode-capturing closure so both modes are checked.
        utt.verify_grad(bn, [X, G, B,
                             X.mean(axis=0)[numpy.newaxis],
                             X.std(axis=0)[numpy.newaxis]])
def test_bn_feature_maps():

    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    numpy.random.seed(1234)
    X = 1 + numpy.random.random([10, 20, 4, 4]).astype('float32')
    B = 1 + numpy.random.random([20]).astype('float32')
    G = 1 + numpy.random.random([20]).astype('float32')
    M = 1 + numpy.random.random([20]).astype('float32')
    V = 1 + numpy.random.random([20]).astype('float32')

    x = theano.tensor.tensor4('x')
    b = theano.tensor.vector('b')
    g = theano.tensor.vector('g')
    m = theano.tensor.vector('m')
    v = theano.tensor.vector('v')

    bn_ref_op = bn_ref(x,
                       g.dimshuffle('x', 0, 'x', 'x'),
                       b.dimshuffle('x', 0, 'x', 'x'),
                       m.dimshuffle('x', 0, 'x', 'x'),
                       v.dimshuffle('x', 0, 'x', 'x'))
    f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ['low_mem', 'high_mem']:
        bn_op = batch_normalization(x,
                                    g.dimshuffle('x', 0, 'x', 'x'),
                                    b.dimshuffle('x', 0, 'x', 'x'),
                                    m.dimshuffle('x', 0, 'x', 'x'),
                                    v.dimshuffle('x', 0, 'x', 'x'),
                                    mode=mode)
        f = theano.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def conv_bn(inputs, gamma, beta, mean, std):
            return batch_normalization(inputs,
                                       gamma.dimshuffle('x', 0, 'x', 'x'),
                                       beta.dimshuffle('x', 0, 'x', 'x'),
                                       mean.dimshuffle('x', 0, 'x', 'x'),
                                       std.dimshuffle('x', 0, 'x', 'x'),
                                       mode=mode)
        utt.verify_grad(conv_bn, [X, G, B, M, V])
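In these tests, dimshuffle('x', 0, 'x', 'x') turns each per-channel vector into a 4-D tensor with broadcastable batch and spatial axes. A standalone numpy analogue of that broadcast (illustrative only):

import numpy

X = numpy.random.random((10, 20, 4, 4)).astype('float32')  # (batch, channel, h, w)
G = numpy.ones(20, 'float32')                               # one scale per channel
# dimshuffle('x', 0, 'x', 'x') is equivalent to inserting broadcast axes:
G4 = G[numpy.newaxis, :, numpy.newaxis, numpy.newaxis]      # shape (1, 20, 1, 1)
print((X * G4).shape)  # (10, 20, 4, 4): G broadcasts across batch and space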