Add running averages to batch norm (no cuDNN yet).

c4293e69 · Gijs van Tulder · 4f291961 · c4293e69 · c4293e69 · c4293e69
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -2949,7 +2949,9 @@ def local_abstract_batch_norm_train_cudnn(node):
    if not isinstance(node.op, bn.AbstractBatchNormTrain):
        return None

-    x, scale, bias, epsilon = node.inputs
+    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
+    running_mean = node.inputs[5] if len(node.inputs) > 5 else None
+    running_var = node.inputs[6] if len(node.inputs) > 6 else None

    # input on gpu?  TODO what about the output?
    x_on_gpu = (isinstance(x.type, GpuArrayType) or
@@ -2983,15 +2985,24 @@ def local_abstract_batch_norm_train_cudnn(node):

    out, mean, invstd = dnn_batch_normalization_train(x, scale, bias, mode, eps)

+    results = [out, mean, invstd]
+    if running_mean is not None:
+        running_mean = running_mean * (1 - running_average_factor) + \
+            mean * running_average_factor
+        results.append(running_mean)
+    if running_var is not None:
+        var = x.var(axis=axes, keepdims=True)
+        m = tensor.cast(tensor.prod(x.shape) / tensor.prod(scale.shape), theano.config.floatX)
+        running_var = running_var * (1 - running_average_factor) + \
+            (m / (m - 1)) * var * running_average_factor
+        results.append(running_var)
+
    # If the original output was on CPU, we have to transfer it
-    if isinstance(node.outputs[0].type, tensor.TensorType):
-        out = tensor.as_tensor_variable(out)
-    if isinstance(node.outputs[1].type, tensor.TensorType):
-        mean = tensor.as_tensor_variable(mean)
-    if isinstance(node.outputs[2].type, tensor.TensorType):
-        invstd = tensor.as_tensor_variable(invstd)
+    for i in range(len(node.outputs)):
+        if isinstance(node.outputs[i].type, tensor.TensorType):
+            results[i] = tensor.as_tensor_variable(results[i])
    # TODO copy_stack_trace?
-    return [out, mean, invstd]
+    return results


 @local_optimizer([bn.AbstractBatchNormTrainGrad])

--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -1384,26 +1384,39 @@ def test_dnn_batchnorm_train():

    for mode in ('per-activation', 'spatial'):
        for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
-            x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
+            x, scale, bias, running_mean, running_var = (vartype(n)
+                                                         for n in ('x', 'scale', 'bias',
+                                                                   'running_mean',
+                                                                   'running_var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
+            running_average_factor = 0.3

            # forward pass, direct interface
            out_gpu, x_mean_gpu, x_invstd_gpu = dnn.dnn_batch_normalization_train(
                x, scale, bias, mode, eps)
            # forward pass, abstract interface
-            out_abstract, x_mean_abstract, x_invstd_abstract = bn.batch_normalization_train(
-                x, scale, bias, mode, eps)
+            out_abstract, x_mean_abstract, x_invstd_abstract, \
+                out_running_mean_abstract, out_running_var_abstract = \
+                bn.batch_normalization_train(x, scale, bias, mode, eps,
+                                             running_average_factor,
+                                             running_mean, running_var)
            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            x_mean_ref = x.mean(axis=axes, keepdims=True)
-            x_invstd_ref = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
+            x_var_ref = x.var(axis=axes, keepdims=True)
+            x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
            scale_ref = T.addbroadcast(scale, *axes)
            bias_ref = T.addbroadcast(bias, *axes)
+            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
+            out_running_mean_ref = running_mean * (1 - running_average_factor) + \
+                x_mean_ref * running_average_factor
+            out_running_var_ref = running_var * (1 - running_average_factor) + \
+                (m / (m - 1)) * x_var_ref * running_average_factor
            # backward pass
            dy = vartype('dy')
            grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
@@ -1414,12 +1427,14 @@ def test_dnn_batchnorm_train():
            f_gpu = theano.function([x, scale, bias, dy],
                                    [out_gpu, x_mean_gpu, x_invstd_gpu] + grads_gpu,
                                    mode=mode_with_gpu)
-            f_abstract = theano.function([x, scale, bias, dy],
-                                         [out_abstract, x_mean_abstract, x_invstd_abstract] +
+            f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
+                                         [out_abstract, x_mean_abstract, x_invstd_abstract,
+                                          out_running_mean_abstract, out_running_var_abstract] +
                                         grads_abstract,
                                         mode=mode_with_gpu)
-            f_ref = theano.function([x, scale, bias, dy],
-                                    [out_ref, x_mean_ref, x_invstd_ref] + grads_ref)
+            f_ref = theano.function([x, scale, bias, running_mean, running_var, dy],
+                                    [out_ref, x_mean_ref, x_invstd_ref,
+                                     out_running_mean_ref, out_running_var_ref] + grads_ref)
            # check if the abstract Ops have been replaced
            assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
                        in f_abstract.maker.fgraph.toposort()])
@@ -1438,9 +1453,11 @@ def test_dnn_batchnorm_train():
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
+                Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
+                Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                outputs_gpu = f_gpu(X, Scale, Bias, Dy)
-                outputs_abstract = f_abstract(X, Scale, Bias, Dy)
-                outputs_ref = f_ref(X, Scale, Bias, Dy)
+                outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
+                outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
                # compare outputs
                utt.assert_allclose(outputs_gpu[0], outputs_ref[0])  # out
                utt.assert_allclose(outputs_gpu[1], outputs_ref[1])  # mean
@@ -1448,13 +1465,51 @@ def test_dnn_batchnorm_train():
                utt.assert_allclose(outputs_abstract[0], outputs_ref[0])  # out
                utt.assert_allclose(outputs_abstract[1], outputs_ref[1])  # mean
                utt.assert_allclose(outputs_abstract[2], outputs_ref[2])  # invstd
+                utt.assert_allclose(outputs_abstract[3], outputs_ref[3])  # running_mean
+                utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
+                                    numpy.nan_to_num(outputs_ref[4]))  # running_var
                # compare gradients
-                utt.assert_allclose(outputs_gpu[3], outputs_ref[3], atol=2e-4)  # dx
-                utt.assert_allclose(outputs_gpu[4], outputs_ref[4], rtol=2e-4, atol=1e-4)  # dscale
-                utt.assert_allclose(outputs_gpu[5], outputs_ref[5])  # dbias
-                utt.assert_allclose(outputs_abstract[3], outputs_ref[3], atol=2e-4)  # dx
-                utt.assert_allclose(outputs_abstract[4], outputs_ref[4], rtol=2e-4, atol=1e-4)  # dscale
-                utt.assert_allclose(outputs_abstract[5], outputs_ref[5])  # dbias
+                utt.assert_allclose(outputs_gpu[3], outputs_ref[5], atol=2e-4)  # dx
+                utt.assert_allclose(outputs_gpu[4], outputs_ref[6], rtol=4e-4, atol=1e-4)  # dscale
+                utt.assert_allclose(outputs_gpu[5], outputs_ref[7])  # dbias
+                utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4)  # dx
+                utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4)  # dscale
+                utt.assert_allclose(outputs_abstract[7], outputs_ref[7])  # dbias
+
+
+def test_dnn_batchnorm_train_without_running_averages():
+    # compile and run batch_normalization_train without running averages
+    if not dnn.dnn_available(test_ctx_name):
+        raise SkipTest(dnn.dnn_available.msg)
+    if dnn.version(raises=False) < 5000:
+        raise SkipTest("batch normalization requires cudnn v5+")
+    utt.seed_rng()
+
+    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
+    data_shape = (5, 10, 30, 25)
+    param_shape = (1, 10, 30, 25)
+
+    # forward pass
+    out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation')
+    # backward pass
+    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
+    # compile
+    f_abstract = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads, mode=mode_with_gpu)
+    # check if the abstract Ops have been replaced
+    assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
+                for n in f_abstract.maker.fgraph.toposort()])
+    assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
+                for n in f_abstract.maker.fgraph.toposort()])
+    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
+                                      bn.AbstractBatchNormInference,
+                                      bn.AbstractBatchNormTrainGrad))
+                    for n in f_abstract.maker.fgraph.toposort()])
+    # run
+    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
+    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
+    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
+    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
+    f_abstract(X, Scale, Bias, Dy)


 def test_batchnorm_inference():

--- a/theano/tensor/nnet/bn.py
+++ b/theano/tensor/nnet/bn.py
@@ -84,7 +84,8 @@ def batch_normalization(inputs, gamma, beta, mean, std,


 def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
-                              epsilon=1e-4):
+                              epsilon=1e-4, running_average_factor=0.1,
+                              running_mean=None, running_var=None):
    """
    Performs batch normalization of the given inputs, using the mean and
    variance of the inputs.
@@ -107,6 +108,23 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
+    running_average_factor : float
+        Factor for updating the values of `running_mean` and `running_var`.
+        If the factor is close to one, the running averages will update quickly;
+        if the factor is close to zero, they will update slowly.
+    running_mean : tensor or None
+        Previous value of the running mean. If this is given, the new value
+        ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
+        will be returned as one of the outputs of this function.
+        `running_mean` and `running_var` should either both be given or
+        both be None.
+    running_var : tensor or None
+        Previous value of the running variance. If this is given, the new value
+        ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
+        will be returned as one of the outputs of this function,
+        where `m` is the product of lengths of the averaged-over dimensions.
+        `running_mean` and `running_var` should either both be given or
+        both be None.

    Returns
    -------
@@ -116,6 +134,12 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
        Means of `inputs` across the normalization axes.
    invstd : tensor
        Inverse standard deviations of `inputs` across the normalization axes.
+    new_running_mean : tensor
+        New value of the running mean (only if both `running_mean` and
+        `running_var` were given).
+    new_running_var : tensor
+        New value of the running variance (only if both `running_var` and
+        `running_mean` were given).

    Notes
    -----
@@ -131,14 +155,32 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        mean = inputs.mean(axes, keepdims=True)
-        invstd = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
+        var = inputs.var(axes, keepdims=True)
+        invstd = T.inv(T.sqrt(var + epsilon))
        out = (inputs - mean) * gamma * invstd + beta
+
+        m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
+        running_mean = running_mean * (1 - running_average_factor) + \\
+                       mean * running_average_factor
+        running_var = running_var * (1 - running_average_factor) + \\
+                      (m / (m - 1)) * var * running_average_factor
    """
    ndim = inputs.ndim
    if gamma.ndim != ndim or beta.ndim != ndim:
        raise ValueError("gamma and beta must be of the same dimensionality "
                         "as inputs; got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, ndim))
+    if (running_mean is None) != (running_var is None):
+        raise ValueError("running_mean and running_var must either both be "
+                         "given or both be None")
+    if running_mean is not None and running_mean.ndim != ndim:
+        raise ValueError("running_mean must be of the same dimensionality "
+                         "as inputs; got %d instead of %d" %
+                         (running_mean.ndim, ndim))
+    if running_var is not None and running_var.ndim != ndim:
+        raise ValueError("running_var must be of the same dimensionality "
+                         "as inputs; got %d instead of %d" %
+                         (running_var.ndim, ndim))
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)

@@ -163,7 +205,23 @@ def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
    beta = T.addbroadcast(beta, *axes)

    batchnorm_op = AbstractBatchNormTrain(axes=axes)
-    return tuple(batchnorm_op(inputs, gamma, beta, epsilon=epsilon))
+
+    if running_mean is not None and running_var is not None:
+        running_mean = as_tensor_variable(running_mean)
+        running_var = as_tensor_variable(running_var)
+        running_mean_bc = T.addbroadcast(running_mean, *axes)
+        running_var_bc = T.addbroadcast(running_var, *axes)
+        out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
+            inputs, gamma, beta, epsilon=epsilon,
+            running_average_factor=running_average_factor,
+            running_mean=running_mean_bc, running_var=running_var_bc)
+        if new_running_mean.broadcastable != running_mean.broadcastable:
+            new_running_mean = T.patternbroadcast(new_running_mean, running_mean.broadcastable)
+        if new_running_var.broadcastable != running_var.broadcastable:
+            new_running_var = T.patternbroadcast(new_running_var, running_var.broadcastable)
+        return out, mean, invstd, new_running_mean, new_running_var
+    else:
+        return tuple(batchnorm_op(inputs, gamma, beta, epsilon=epsilon))


 def batch_normalization_test(inputs, gamma, beta, mean, var,
@@ -277,6 +335,23 @@ class AbstractBatchNormTrain(Op):
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
+    running_average_factor : float
+        Factor for updating the values of `running_mean` and `running_var`.
+        If the factor is close to one, the running averages will update quickly;
+        if the factor is close to zero, they will update slowly.
+    running_mean : tensor or None
+        Previous value of the running mean. If this is given, the new value
+        ``running_mean * (1 - running_average_factor) + batch mean * running_average_factor``
+        will be returned as one of the outputs of this function.
+        `running_mean` and `running_var` should either both be given or
+        both be None.
+    running_var : tensor or None
+        Previous value of the running variance. If this is given, the new value
+        ``running_var * (1 - running_average_factor) + (m / (m - 1)) * batch var * running_average_factor``
+        will be returned as one of the outputs of this function,
+        where `m` is the product of lengths of the averaged-over dimensions.
+        `running_mean` and `running_var` should either both be given or
+        both be None.
    """

    __props__ = ('axes',)
@@ -288,40 +363,85 @@ class AbstractBatchNormTrain(Op):
        self.axes = axes

    def infer_shape(self, node, shape):
-        return [shape[0], shape[1], shape[1]]
+        return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)

-    def make_node(self, x, scale, bias, epsilon=1e-4):
+    def make_node(self, x, scale, bias, epsilon=1e-4,
+                  running_average_factor=0.1,
+                  running_mean=None, running_var=None):
        assert x.ndim == scale.ndim == bias.ndim
+        assert ((running_mean is None and running_var is None) or
+                (running_mean is not None and running_var is not None))
+        assert (running_mean is None or running_mean.ndim == x.ndim)
+        assert (running_var is None or running_var.ndim == x.ndim)
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
-        return Apply(self, [x, scale, bias, epsilon], [x.type(), scale.type(), scale.type()])
+        if not isinstance(running_average_factor, theano.Variable):
+            running_average_factor = as_tensor_variable(running_average_factor)
+        inputs = [x, scale, bias, epsilon, running_average_factor]
+        output_types = [x.type(), scale.type(), scale.type()]
+        if running_mean is not None and running_var is not None:
+            inputs.append(running_mean)
+            inputs.append(running_var)
+            output_types.append(scale.type())
+            output_types.append(scale.type())
+        return Apply(self, inputs, output_types)

    def grad(self, inputs, grads):
-        x, scale, bias, epsilon = inputs
+        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        dy = grads[0]
-        _, x_mean, x_invstd = self(x, scale, bias, epsilon)
+        _, x_mean, x_invstd = self(*inputs)[:3]
+        disconnected_outputs = [
+            theano.gradient.DisconnectedType()(),  # epsilon
+            theano.gradient.DisconnectedType()()]  # running_average_factor
+        # Optional running_mean and running_var.
+        for i in range(5, len(inputs)):
+            disconnected_outputs.append(theano.gradient.DisconnectedType()())
        return AbstractBatchNormTrainGrad(self.axes)(
-            x, dy, scale, x_mean, x_invstd, epsilon) + [theano.gradient.DisconnectedType()()]
+            x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs

    def connection_pattern(self, node):
-        # Specificy that epsilon is not connected to outputs.
-        return [[True, True, True], [True, True, True], [True, True, True],
-                [False, False, False]]
+        # Specify that epsilon and running_average_factor are not connected to outputs.
+        patterns = [[True, True, True],     # x
+                    [True, True, True],     # scale
+                    [True, True, True],     # bias
+                    [False, False, False],  # epsilon
+                    [False, False, False]]  # running_average_factor
+        # Optional running_mean and running_var are only
+        # connected to their new values.
+        for i in range(5, len(node.inputs)):
+            patterns[0].append(True)
+            for pattern in patterns[1:]:
+                pattern.append(False)
+            patterns.append([False] * (3 + i - 5) + [True])
+        return patterns

    def perform(self, node, inputs, output_storage):
-        x, scale, bias, epsilon = inputs
+        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))

        mean = x.mean(axes, keepdims=True)
-        invstd = 1.0 / numpy.sqrt(x.var(axes, keepdims=True) + epsilon)
+        var = x.var(axes, keepdims=True)
+        invstd = 1.0 / numpy.sqrt(var + epsilon)
        out = (x - mean) * (scale * invstd) + bias

        output_storage[0][0] = out
        output_storage[1][0] = mean
        output_storage[2][0] = invstd

+        if len(inputs) > 5:
+            running_mean = inputs[5]
+            running_mean = running_mean * (1.0 - running_average_factor) + \
+                mean * running_average_factor
+            output_storage[3][0] = running_mean
+        if len(inputs) > 6:
+            m = float(numpy.prod(x.shape) / numpy.prod(scale.shape))
+            running_var = inputs[6]
+            running_var = running_var * (1.0 - running_average_factor) + \
+                (m / (m - 1)) * var * running_average_factor
+            output_storage[4][0] = running_var
+

 class AbstractBatchNormInference(Op):
    """
@@ -429,21 +549,42 @@ def local_abstract_batch_norm_train(node):
    if not isinstance(node.op, AbstractBatchNormTrain):
        return None

-    x, scale, bias, epsilon = node.inputs
+    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if not isinstance(x.type, TensorType) or \
       not isinstance(scale.type, TensorType) or \
       not isinstance(bias.type, TensorType) or \
-       not isinstance(epsilon.type, TensorType):
+       not isinstance(epsilon.type, TensorType) or \
+       not isinstance(running_average_factor.type, TensorType):
+        return None
+    # optional running_mean and running_var
+    if len(node.inputs) > 5 and not isinstance(node.inputs[5].type, TensorType):
+        return None
+    if len(node.inputs) > 6 and not isinstance(node.inputs[6].type, TensorType):
        return None

    mean = x.mean(axes, keepdims=True)
-    invstd = T.inv(T.sqrt(x.var(axes, keepdims=True) + epsilon))
+    var = x.var(axes, keepdims=True)
+    invstd = T.inv(T.sqrt(var + epsilon))
    out = (x - mean) * (scale * invstd) + bias
+    results = [out, mean, invstd]
+
+    if len(node.inputs) > 5:
+        running_mean = node.inputs[5]
+        running_mean = running_mean * (1.0 - running_average_factor) + \
+            mean * running_average_factor
+        results.append(running_mean)
+    if len(node.inputs) > 6:
+        m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
+        running_var = node.inputs[6]
+        running_var = running_var * (1.0 - running_average_factor) + \
+            (m / (m - 1)) * var * running_average_factor
+        results.append(running_var)
+
    # TODO copy_stack_trace?
-    return [out, mean, invstd]
+    return results


 @local_optimizer([AbstractBatchNormTrainGrad])

--- a/theano/tensor/nnet/tests/test_bn.py
+++ b/theano/tensor/nnet/tests/test_bn.py
@@ -148,9 +148,13 @@ def test_batch_normalization_train():

    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
-            x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
+            x, scale, bias, running_mean, running_var = (vartype(n)
+                                                         for n in ('x', 'scale', 'bias',
+                                                                   'running_mean',
+                                                                   'running_var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
+            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
@@ -159,8 +163,10 @@ def test_batch_normalization_train():
                continue

            # forward pass
-            out, x_mean, x_invstd = bn.batch_normalization_train(
-                x, scale, bias, axes, eps)
+            out, x_mean, x_invstd, out_running_mean, out_running_var = \
+                bn.batch_normalization_train(
+                    x, scale, bias, axes, eps,
+                    running_average_factor, running_mean, running_var)
            # reference forward pass
            if axes == 'per-activation':
                axes2 = (0,)
@@ -169,18 +175,25 @@ def test_batch_normalization_train():
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
-            x_invstd2 = T.inv(T.sqrt(x.var(axis=axes2, keepdims=True) + eps))
+            x_var2 = x.var(axis=axes2, keepdims=True)
+            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
+            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
+            out_running_mean2 = running_mean * (1 - running_average_factor) + \
+                x_mean2 * running_average_factor
+            out_running_var2 = running_var * (1 - running_average_factor) + \
+                (m / (m - 1)) * x_var2 * running_average_factor
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
-            f = theano.function([x, scale, bias, dy],
-                                [out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
+            f = theano.function([x, scale, bias, running_mean, running_var, dy],
+                                [out, x_mean, x_invstd, out_running_mean, out_running_var,
+                                 out2, x_mean2, x_invstd2, out_running_mean2, out_running_var2] +
                                grads + grads2, mode='FAST_RUN')
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
@@ -196,15 +209,47 @@ def test_batch_normalization_train():
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
-                outputs = f(X, Scale, Bias, Dy)
+                Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
+                Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
+                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)
                # compare outputs
-                utt.assert_allclose(outputs[0], outputs[0 + 3])  # out
-                utt.assert_allclose(outputs[1], outputs[1 + 3])  # mean
-                utt.assert_allclose(outputs[2], outputs[2 + 3])  # invstd
+                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
+                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
+                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
+                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
+                utt.assert_allclose(numpy.nan_to_num(outputs[4]),
+                                    numpy.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients
-                utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4)  # dx
-                utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4)  # dscale
-                utt.assert_allclose(outputs[8], outputs[8 + 3])  # dbias
+                utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4)  # dx
+                utt.assert_allclose(outputs[11], outputs[11 + 3], rtol=2e-4, atol=1e-4)  # dscale
+                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
+
+
+def test_batch_normalization_train_without_running_averages():
+    # compile and run batch_normalization_train without running averages
+    utt.seed_rng()
+
+    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
+    data_shape = (5, 10, 30, 25)
+    param_shape = (1, 10, 30, 25)
+
+    # forward pass
+    out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation')
+    # backward pass
+    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
+    # compile
+    f = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads, mode='FAST_RUN')
+    # check if the abstract Ops have been replaced
+    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
+                                      bn.AbstractBatchNormInference,
+                                      bn.AbstractBatchNormTrainGrad))
+                    for n in f.maker.fgraph.toposort()])
+    # run
+    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
+    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
+    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
+    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
+    f(X, Scale, Bias, Dy)


 def test_batch_normalization_test():