提交 36de8dd2 authored 作者: Nicolas Ballas's avatar Nicolas Ballas

Minor updates

上级 6ad3fded
......@@ -8,49 +8,49 @@ class BNComposite(Composite):
def __init__(self, dtype):
x = theano.scalar.Scalar(dtype=dtype).make_variable()
mean = theano.scalar.Scalar(dtype=dtype).make_variable()
var = theano.scalar.Scalar(dtype=dtype).make_variable()
std = theano.scalar.Scalar(dtype=dtype).make_variable()
gamma = theano.scalar.Scalar(dtype=dtype).make_variable()
beta = theano.scalar.Scalar(dtype=dtype).make_variable()
o = add(mul(true_div(sub(x, mean), var), gamma), beta)
inputs = [x, mean, var, gamma, beta]
outputs= [o]
o = add(mul(true_div(sub(x, mean), std), gamma), beta)
inputs = [x, mean, std, gamma, beta]
outputs = [o]
super(BNComposite, self).__init__(inputs, outputs)
def grad(self, inps, grads):
x, mean, var, gamma, beta = inps
x, mean, std, gamma, beta = inps
top, = grads
dx = (top*gamma) / var
dmean = -(top*gamma) / var
dvar = -(top * gamma * (x - mean)) / (var*var)
dgamma = top*(x - mean) / var
return [dx, dmean, dvar, dgamma, top]
dx = (top * gamma) / std
dmean = -(top * gamma) / std
dstd = -(top * gamma * (x - mean)) / (std * std)
dgamma = top * (x - mean) / std
return [dx, dmean, dstd, dgamma, top]
def batch_normalization(inputs, gamma, beta, mean, variance, axis=0):
def batch_normalization(inputs, gamma, beta, mean, std):
"""
This function will build the symbolic graph for applying batch normalization
to a set of activations.
to a set of activations. As no intermediate representations are stored for the
back-propagation, this implementation lowers the memory usage; however,
it is 5-10% slower than a naive Theano implementation, as it redoes
some forward computations for the backprop.
Parameters
----------
inputs : symbolic tensor
Mini-batch of examples
gamma: symbolic vector
BN scale parameter, must be of same dimension that
the number of inputs channel
beta: symbolic vector
BN shift parameter, must be of same dimension that
the number of inputs channel
Mini-batch of activations
gamma: symbolic tensor
BN scale parameter, must be of same dimensionality as
inputs and broadcastable against it
beta: symbolic tensor
BN shift parameter, must be of same dimensionality as
inputs and broadcastable against it
mean: symbolic tensor
inputs means
variance: symbolic tensor
inputs variance
axis: int
channel axis
inputs means, must be of same dimensionality as
inputs and broadcastable against it
std: symbolic tensor
inputs standard deviation, must be of same dimensionality as
inputs and broadcastable against it
"""
elm_bn = theano.tensor.elemwise.Elemwise(scalar_op=BNComposite(dtype=inputs.dtype))
rval = elm_bn(inputs, mean, variance, gamma, beta)
rval = elm_bn(inputs, mean, std, gamma, beta)
return rval
......@@ -8,8 +8,8 @@ from theano.tensor.nnet.bn import batch_normalization
def test_bn():
def bn_ref(x, G, B, M, V):
n = (x-M)/V
return n*G+B
n = (x - M) / V
return n * G + B
numpy.random.seed(1234)
X = 1 + numpy.random.random([10, 20]).astype('float32')
......@@ -26,28 +26,28 @@ def test_bn():
bn_op = batch_normalization(x, g, b, m, v)
bn_ref_op = bn_ref(x, g, b, m, v)
f = theano.function([x, b, g, m ,v], [bn_op])
f_ref = theano.function([x, b, g, m ,v], [bn_ref_op])
f = theano.function([x, b, g, m, v], [bn_op])
f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
res = f(X, G, B, M, V)
res_ref = f_ref(X, G, B, M, V)
utt.assert_allclose(res_ref, res)
utt.verify_grad(batch_normalization, [X, G, B, M, V])
bn_op = batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.var(axis=0, keepdims=True))
bn_op = batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True))
bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.var(axis=0, keepdims=True))
f = theano.function([x, b, g], [bn_op])
f_ref = theano.function([x, b, g], [bn_ref_op])
res = f(X, G, B)
res_ref = f_ref(X, G, B)
utt.assert_allclose(res_ref, res)
utt.verify_grad(batch_normalization, [X, G, B, X.mean(axis=0, keepdims=True), X.var(axis=0, keepdims=True)])
utt.verify_grad(batch_normalization, [X, G, B, X.mean(axis=0, keepdims=True), X.std(axis=0, keepdims=True)])
def test_bn_feature_maps():
def bn_ref(x, G, B, M, V):
n = (x-M)/V
return n*G+B
n = (x - M) / V
return n * G + B
numpy.random.seed(1234)
X = 1 + numpy.random.random([10, 20, 4, 4]).astype('float32')
......@@ -62,7 +62,6 @@ def test_bn_feature_maps():
m = theano.tensor.vector('m')
v = theano.tensor.vector('v')
### Provide mean/var
bn_op = batch_normalization(x,
g.dimshuffle('x', 0, 'x', 'x'),
b.dimshuffle('x', 0, 'x', 'x'),
......@@ -73,8 +72,8 @@ def test_bn_feature_maps():
b.dimshuffle('x', 0, 'x', 'x'),
m.dimshuffle('x', 0, 'x', 'x'),
v.dimshuffle('x', 0, 'x', 'x'))
f = theano.function([x, b, g, m ,v], [bn_op])
f_ref = theano.function([x, b, g, m ,v], [bn_ref_op])
f = theano.function([x, b, g, m, v], [bn_op])
f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
res = f(X, G, B, M, V)
res_ref = f_ref(X, G, B, M, V)
utt.assert_allclose(res_ref, res)
......@@ -87,4 +86,3 @@ def test_bn_feature_maps():
variance.dimshuffle('x', 0, 'x', 'x'),
axis=1)
utt.verify_grad(conv_bn, [X, G, B, M, V])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论