提交 8b9f7336 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5190 from gvtulder/f-batchnorm-abstract

Abstract Ops for batch normalization
......@@ -10,6 +10,9 @@
.. moduleauthor:: LISA
.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test`. They must be added manually as they do not have the same user interface.
.. autofunction:: theano.tensor.nnet.bn.batch_normalization_train
.. autofunction:: theano.tensor.nnet.bn.batch_normalization_test
.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test`.
.. autofunction:: theano.tensor.nnet.bn.batch_normalization
......@@ -28,19 +28,20 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
assert_conv_shape)
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet import bn
from . import pygpu
from .type import (get_context, gpu_context_type, list_contexts,
GpuArraySharedVariable)
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, gpu_alloc_empty,
empty_like, GpuArrayType)
empty_like, GpuArrayType, HostFromGpu)
from .elemwise import GpuElemwise
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import (gpu_seqopt, register_opt, pool_db, pool_db2,
op_lifter, register_opt2)
op_lifter, register_opt2, register_inplace)
from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
......@@ -1389,13 +1390,13 @@ class GpuDnnPool(DnnBase):
res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
return [res]
def grad(self, inp, grads):
def L_op(self, inp, outputs, grads):
img, ws, stride, pad = inp
grad, = grads
grad = gpu_contiguous(grad)
out = self(img, ws, stride, pad)
out, = outputs
g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
......@@ -1591,10 +1592,10 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
assert x.ndim == 4
return Apply(self, [x], [x.type()])
def grad(self, inp, grads):
def L_op(self, inp, outputs, grads):
x, = inp
g_sm, = grads
sm = self(x)
sm, = outputs
return [GpuDnnSoftmaxGrad(
self.algo,
self.mode
......@@ -1646,48 +1647,131 @@ class GpuDnnBatchNorm(DnnBase):
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
"""
__props__ = ('mode',)
__props__ = ('mode', 'running_averages', 'inplace_running_mean',
'inplace_running_var', 'inplace_output')
def __init__(self, mode='per-activation', running_averages=False,
             inplace_running_mean=False, inplace_running_var=False,
             inplace_output=False):
    """Build the cuDNN batch-normalization training Op.

    Parameters
    ----------
    mode : {'per-activation', 'spatial'}
        Normalization mode forwarded to cuDNN.
    running_averages : bool
        If True, the Op also takes previous running mean/variance as
        inputs and returns their updated values as extra outputs.
    inplace_running_mean, inplace_running_var, inplace_output : bool
        Allow the corresponding output to reuse its input buffer.
    """
    # NOTE(review): the stale pre-merge header line
    # ``def __init__(self, mode='per-activation'):`` above this def was a
    # merge artifact and has been removed.
    DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm.c'],
                     'dnn_batchnorm_op')
    assert (mode in ('per-activation', 'spatial'))
    self.mode = mode
    self.running_averages = running_averages
    self.inplace_output = inplace_output
    self.inplace_running_mean = inplace_running_mean
    self.inplace_running_var = inplace_running_var
    # destroy_map maps output index -> list of input indices it
    # overwrites; running mean/var are inputs 5 and 6 (see make_node)
    # and their updated values are outputs 3 and 4.
    self.destroy_map = {}
    if self.inplace_output:
        self.destroy_map[0] = [0]
    if self.running_averages and self.inplace_running_mean:
        self.destroy_map[3] = [5]
    if self.running_averages and self.inplace_running_var:
        self.destroy_map[4] = [6]
def __setstate__(self, d):
    """Restore pickled state, defaulting attributes added in newer
    versions of this Op so old pickles keep working."""
    self.__dict__.update(d)
    if not hasattr(self, 'running_average_factor'):
        # Compatibility with pickles from a transitional version that
        # stored the factor on the Op instead of as a graph input.
        self.running_average_factor = 0
    if not hasattr(self, 'running_averages'):
        self.running_averages = False
    if not (hasattr(self, 'inplace_running_mean') and
            hasattr(self, 'inplace_running_var') and
            hasattr(self, 'inplace_output')):
        # Pre-inplace pickles: disable all inplace behavior and clear
        # the destroy map (nothing is overwritten).
        self.inplace_running_mean = False
        self.inplace_running_var = False
        self.inplace_output = False
        self.destroy_map = {}
def get_op_params(self):
    """Return the C compile-time ``#define`` pairs for this Op instance.

    Each enabled boolean flag contributes a ``('NAME', '1')`` pair, and
    the cuDNN mode constant is always appended last.
    """
    flag_table = (('INPLACE_OUTPUT', self.inplace_output),
                  ('RUNNING_AVERAGES', self.running_averages),
                  ('INPLACE_RUNNING_MEAN', self.inplace_running_mean),
                  ('INPLACE_RUNNING_VAR', self.inplace_running_var))
    params = [(name, '1') for name, enabled in flag_table if enabled]
    if self.mode == "spatial":
        mode_const = "CUDNN_BATCHNORM_SPATIAL"
    else:
        mode_const = "CUDNN_BATCHNORM_PER_ACTIVATION"
    params.append(('MODE', mode_const))
    return params
def infer_shape(self, node, shape):
    """Output shapes: the normalized output matches x (input 0); every
    other output (mean, invstd, and optional new running mean/var)
    matches the scale parameter (input 1).

    The diff left both the old 3-output return and the generalized one
    in place; only the generalized form is kept, which handles both the
    3-output and 5-output (running averages) variants.
    """
    return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
def make_node(self, x, scale, bias, epsilon=1e-4,
              running_average_factor=0.1,
              running_mean=None, running_var=None):
    """Build the Apply node.

    Inputs are ``[x, scale, bias, epsilon, running_average_factor]``
    plus, when running averages are enabled, ``running_mean`` and
    ``running_var``.  Outputs are ``[out, mean, invstd]`` plus the two
    updated running averages in that case.

    Note: the stale pre-merge ``make_node``/``grad`` fragments that the
    diff interleaved here (duplicate asserts, the old 3-output return
    and the removed ``grad`` header) have been dropped.
    """
    assert x.ndim == scale.ndim == bias.ndim
    assert x.ndim in (4, 5)
    # running averages must be all-or-nothing and consistent with the
    # Op's configuration, since the output count depends on it.
    assert self.running_averages == (running_mean is not None) == (running_var is not None)
    assert (running_mean is None or running_mean.ndim == x.ndim)
    assert (running_var is None or running_var.ndim == x.ndim)
    ctx_name = infer_context_name(x, scale, bias)
    x = as_gpuarray_variable(x, ctx_name)
    scale = as_gpuarray_variable(scale, ctx_name)
    bias = as_gpuarray_variable(bias, ctx_name)
    # cuDNN takes both scalars as double-precision host scalars.
    epsilon = as_scalar(epsilon).astype('float64')
    running_average_factor = as_scalar(running_average_factor).astype('float64')
    inputs = [x, scale, bias, epsilon, running_average_factor]
    output_types = [x.type(), scale.type(), scale.type()]
    if running_mean is not None and running_var is not None:
        inputs.append(as_gpuarray_variable(running_mean, ctx_name))
        inputs.append(as_gpuarray_variable(running_var, ctx_name))
        output_types.append(scale.type())
        output_types.append(scale.type())
    return Apply(self, inputs, output_types)
def L_op(self, inputs, outputs, grads):
    """Gradient using the node's own outputs for mean/invstd.

    The diff left the old ``grad``-style lines (recomputing the Op and
    returning early) above the new body, which made the new code dead;
    only the ``L_op`` version is kept.
    """
    x, scale, bias, epsilon, running_average_factor = inputs[:5]
    dy = grads[0]
    # Reuse the forward node's mean/invstd outputs instead of
    # re-applying the Op (that is the point of L_op over grad).
    _, x_mean, x_invstd = outputs[:3]
    disconnected_outputs = [
        DisconnectedType()(),  # epsilon
        DisconnectedType()()]  # running_average_factor
    # Optional running_mean and running_var inputs carry no gradient.
    for i in range(5, len(inputs)):
        disconnected_outputs.append(DisconnectedType()())
    return GpuDnnBatchNormGrad(self.mode)(
        x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs
def connection_pattern(self, node):
    """Specify which inputs are connected (differentiable) to which
    outputs: epsilon and running_average_factor are disconnected, and
    each optional running input feeds only its own updated output.

    (The stale pre-merge 4x3 pattern the diff interleaved here has been
    removed; the typo "Specificy" is also fixed.)
    """
    patterns = [[True, True, True],     # x
                [True, True, True],     # scale
                [True, True, True],     # bias
                [False, False, False],  # epsilon
                [False, False, False]]  # running_average_factor
    # Optional running_mean and running_var (inputs 5, 6) are only
    # connected to their own new values (outputs 3, 4); x also feeds
    # the new running statistics through the batch mean/variance.
    for i in range(5, len(node.inputs)):
        patterns[0].append(True)
        for pattern in patterns[1:]:
            pattern.append(False)
        patterns.append([False] * (3 + i - 5) + [True])
    return patterns
class GpuDnnBatchNormInference(DnnBase):
......@@ -1706,17 +1790,27 @@ class GpuDnnBatchNormInference(DnnBase):
value is 1e-5 (imposed by cuDNN).
"""
__props__ = ('mode',)
__props__ = ('mode', 'inplace')
def __init__(self, mode='per-activation', inplace=False):
    """Build the cuDNN batch-normalization inference Op.

    Parameters
    ----------
    mode : {'per-activation', 'spatial'}
        Normalization mode forwarded to cuDNN.
    inplace : bool
        If True, the output reuses the input buffer.
    """
    # NOTE(review): the stale pre-merge header line
    # ``def __init__(self, mode='per-activation'):`` above this def was
    # a merge artifact and has been removed.
    DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'],
                     'dnn_batchnorm_op')
    assert (mode in ('per-activation', 'spatial'))
    self.mode = mode
    self.inplace = inplace
    if self.inplace:
        # Output 0 overwrites input 0 when running in-place.
        self.destroy_map = {0: [0]}
def __setstate__(self, d):
    """Restore pickled state; older pickles predate the ``inplace``
    attribute, so default it to False for backward compatibility."""
    self.__dict__.update(d)
    if not hasattr(self, 'inplace'):
        self.inplace = False
def get_op_params(self):
params = []
if self.inplace:
params.append(('INPLACE_OUTPUT', '1'))
params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL"
if self.mode == "spatial"
else "CUDNN_BATCHNORM_PER_ACTIVATION")))
......@@ -2404,7 +2498,8 @@ class RNNBlock(object):
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4):
epsilon=1e-4, running_average_factor=0.1,
running_mean=None, running_var=None):
"""
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
......@@ -2424,6 +2519,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns
-------
......@@ -2431,8 +2543,14 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Batch-normalized inputs.
mean : tensor
Means of `inputs` across the normalization axes.
stdinv : tensor
invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes
-----
......@@ -2444,31 +2562,77 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
stdinv = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
out = (inputs - mean) * gamma * stdinv + beta
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4).
"""
ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
(gamma.ndim, beta.ndim, ndim))
if (running_mean is None) != (running_var is None):
raise ValueError("running_mean and running_var must either both be "
"given or both be None")
if running_mean is not None and running_mean.ndim != ndim:
raise ValueError("running_mean must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_mean.ndim, ndim))
if running_var is not None and running_var.ndim != ndim:
raise ValueError("running_var must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_var.ndim, ndim))
if epsilon < 1e-5:
raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
running_averages = (running_mean is not None and running_var is not None)
if ndim < 4:
inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
beta = theano.tensor.shape_padright(beta, 4 - ndim)
batchnorm_op = GpuDnnBatchNorm(mode=mode)
result = tuple(batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), epsilon=epsilon))
if running_averages:
running_mean = theano.tensor.shape_padright(running_mean, 4 - ndim)
running_var = theano.tensor.shape_padright(running_var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
params_shape = gamma.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
if running_averages:
running_mean = theano.tensor.flatten(running_mean, 5)
running_var = theano.tensor.flatten(running_var, 5)
batchnorm_op = GpuDnnBatchNorm(mode=mode, running_averages=running_averages)
if running_averages:
out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), epsilon=epsilon,
running_average_factor=running_average_factor,
running_mean=gpu_contiguous(running_mean),
running_var=gpu_contiguous(running_var))
if new_running_mean.broadcastable != running_mean.broadcastable:
new_running_mean = tensor.patternbroadcast(new_running_mean, running_mean.broadcastable)
if new_running_var.broadcastable != running_var.broadcastable:
new_running_var = tensor.patternbroadcast(new_running_var, running_var.broadcastable)
result = (out, mean, invstd, new_running_mean, new_running_var)
else:
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), epsilon=epsilon)
if ndim < 4:
result = tuple(theano.tensor.flatten(r, ndim) for r in result)
elif ndim > 5:
result = (theano.tensor.reshape(result[0], inputs_shape),) + tuple(
theano.tensor.reshape(r, params_shape) for r in result[1:])
return result
......@@ -2521,9 +2685,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
......@@ -2541,12 +2702,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta = theano.tensor.shape_padright(beta, 4 - ndim)
mean = theano.tensor.shape_padright(mean, 4 - ndim)
var = theano.tensor.shape_padright(var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
mean = theano.tensor.flatten(mean, 5)
var = theano.tensor.flatten(var, 5)
batchnorm_op = GpuDnnBatchNormInference(mode=mode)
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), gpu_contiguous(mean),
gpu_contiguous(var), epsilon=epsilon)
if ndim < 4:
result = theano.tensor.flatten(result, ndim)
elif ndim > 5:
result = theano.tensor.reshape(result, inputs_shape)
return result
......@@ -2928,3 +3098,197 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
out = GpuDnnSoftmaxGrad('accurate', 'instance')(
gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
return [out.dimshuffle(0, 2)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrain])
@register_opt2([bn.AbstractBatchNormTrain], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
    """Lift AbstractBatchNormTrain to the cuDNN training implementation,
    or return None when cuDNN cannot handle this configuration."""
    x, scale, bias, epsilon, running_average_factor = inputs[:5]
    running_mean = inputs[5] if len(inputs) > 5 else None
    running_var = inputs[6] if len(inputs) > 6 else None

    # Map the normalized axes onto a cuDNN mode, if possible.
    axes = tuple(op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None

    # cuDNN needs compile-time constants for both scalars; bail out if
    # either is symbolic, or if epsilon is below cuDNN's minimum.
    try:
        eps = theano.tensor.get_scalar_constant_value(epsilon)
        running_average_factor = theano.tensor.get_scalar_constant_value(
            running_average_factor)
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None

    ctx = infer_context_name(*inputs)
    if not dnn_available(ctx):
        # TODO should this raise_no_cudnn?
        return None

    call_args = [as_gpuarray_variable(x, context_name=ctx),
                 as_gpuarray_variable(scale, context_name=ctx),
                 as_gpuarray_variable(bias, context_name=ctx),
                 mode, eps, running_average_factor]
    if running_mean is not None and running_var is not None:
        call_args.append(running_mean)
        call_args.append(running_var)
    return list(dnn_batch_normalization_train(*call_args))
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_output(node):
    """Replace a GpuDnnBatchNorm with an equivalent Op whose main output
    overwrites its input buffer."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm) or op.inplace_output:
        return
    new_op = GpuDnnBatchNorm(mode=op.mode,
                             running_averages=op.running_averages,
                             inplace_running_mean=op.inplace_running_mean,
                             inplace_running_var=op.inplace_running_var,
                             inplace_output=True)
    return new_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_mean(node):
    """Make the running-mean update of a GpuDnnBatchNorm in-place."""
    op = node.op
    if (not isinstance(op, GpuDnnBatchNorm) or not op.running_averages or
            op.inplace_running_mean):
        return
    new_op = GpuDnnBatchNorm(mode=op.mode,
                             running_averages=op.running_averages,
                             inplace_running_mean=True,
                             inplace_running_var=op.inplace_running_var,
                             inplace_output=op.inplace_output)
    return new_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_var(node):
    """Make the running-variance update of a GpuDnnBatchNorm in-place."""
    op = node.op
    if (not isinstance(op, GpuDnnBatchNorm) or not op.running_averages or
            op.inplace_running_var):
        return
    new_op = GpuDnnBatchNorm(mode=op.mode,
                             running_averages=op.running_averages,
                             inplace_running_mean=op.inplace_running_mean,
                             inplace_running_var=True,
                             inplace_output=op.inplace_output)
    return new_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_batch_norm_inference_inplace(node):
    """Make the output of GpuDnnBatchNormInference overwrite its input."""
    op = node.op
    if isinstance(op, GpuDnnBatchNormInference) and not op.inplace:
        replacement = GpuDnnBatchNormInference(mode=op.mode, inplace=True)
        return [replacement(*node.inputs)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrainGrad])
@register_opt2([bn.AbstractBatchNormTrainGrad], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
    # Lift AbstractBatchNormTrainGrad to GpuDnnBatchNormGrad when the
    # data already lives on (or comes from) the GPU and cuDNN supports
    # the normalized axes.  Returns None to decline the rewrite.
    x, dy, scale, x_mean, x_invstd, epsilon = inputs
    # input on gpu? TODO what about the output?
    x_on_gpu = (isinstance(x.type, GpuArrayType) or
                (x.owner and isinstance(x.owner.op, HostFromGpu)))
    dy_on_gpu = (isinstance(dy.type, GpuArrayType) or
                 (dy.owner and isinstance(dy.owner.op, HostFromGpu)))
    if not (x_on_gpu or dy_on_gpu):
        return None
    # convert axes to cuDNN mode
    axes = tuple(op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None
    ndim = x.ndim
    if ndim < 4:
        # cuDNN requires 4d or 5d tensors: pad trailing broadcast dims.
        x = theano.tensor.shape_padright(x, 4 - ndim)
        dy = theano.tensor.shape_padright(dy, 4 - ndim)
        scale = theano.tensor.shape_padright(scale, 4 - ndim)
        x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim)
        x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim)
    elif ndim > 5:
        # Collapse trailing dims into the 5th; remember the original
        # shapes so the gradients can be reshaped back below.
        x_shape = x.shape
        params_shape = scale.shape
        x = theano.tensor.flatten(x, 5)
        dy = theano.tensor.flatten(dy, 5)
        scale = theano.tensor.flatten(scale, 5)
        x_mean = theano.tensor.flatten(x_mean, 5)
        x_invstd = theano.tensor.flatten(x_invstd, 5)
    # epsilon must be a compile-time constant >= cuDNN's minimum.
    try:
        eps = theano.tensor.get_scalar_constant_value(epsilon)
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None
    ctx = infer_context_name(*inputs)
    if not dnn_available(ctx):
        # TODO should this raise_no_cudnn?
        return None
    x = as_gpuarray_variable(x, context_name=ctx)
    dy = as_gpuarray_variable(dy, context_name=ctx)
    scale = as_gpuarray_variable(scale, context_name=ctx)
    x_mean = as_gpuarray_variable(x_mean, context_name=ctx)
    x_invstd = as_gpuarray_variable(x_invstd, context_name=ctx)
    g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
        GpuDnnBatchNormGrad(mode)(x, dy, scale, x_mean, x_invstd, eps)
    # Undo the dimension padding/flattening applied above.
    if ndim < 4:
        g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim)
        g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim)
        g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim)
    elif ndim > 5:
        g_wrt_inputs = theano.tensor.reshape(g_wrt_inputs, x_shape)
        g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape)
        g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape)
    return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormInference])
@register_opt2([bn.AbstractBatchNormInference], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_inference_cudnn(op, ctx_name, inputs, outputs):
    """Lift AbstractBatchNormInference to the cuDNN inference path, or
    return None when the configuration is unsupported."""
    x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs

    # Map the normalized axes onto a cuDNN mode, if possible.
    axes = tuple(op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None

    # epsilon must be a compile-time constant >= cuDNN's minimum.
    try:
        eps = theano.tensor.get_scalar_constant_value(epsilon)
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None

    ctx = infer_context_name(*inputs)
    if not dnn_available(ctx):
        # TODO should this raise_no_cudnn?
        return None

    tensors = [as_gpuarray_variable(v, context_name=ctx)
               for v in (x, scale, bias, estimated_mean, estimated_variance)]
    out = dnn_batch_normalization_test(tensors[0], tensors[1], tensors[2],
                                       tensors[3], tensors[4], mode, eps)
    return [out]
......@@ -2,8 +2,19 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) {
npy_float64 running_average_factor,
#ifdef RUNNING_AVERAGES
PyGpuArrayObject *in_running_mean,
PyGpuArrayObject *in_running_var,
#endif
PyGpuArrayObject **outp,
PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd,
#ifdef RUNNING_AVERAGES
PyGpuArrayObject **out_running_mean,
PyGpuArrayObject **out_running_var,
#endif
cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
......@@ -16,8 +27,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
#endif
if (theano_prep_output(x_mean, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (theano_prep_output(x_invstd, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
......@@ -26,6 +43,31 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
if (c_set_tensorNd(*outp, bn_output) != 0)
return 1;
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(out_running_mean);
PyGpuArrayObject *running_mean = in_running_mean;
Py_INCREF(running_mean);
#else
PyGpuArrayObject *running_mean = *out_running_mean;
running_mean = theano_try_copy(running_mean, in_running_mean);
if (running_mean == NULL) {
return 1;
}
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(out_running_var);
PyGpuArrayObject *running_var = in_running_var;
Py_INCREF(running_var);
#else
PyGpuArrayObject *running_var = *out_running_var;
running_var = theano_try_copy(running_var, in_running_var);
if (running_var == NULL) {
return 1;
}
#endif
#endif
{
const float falpha = 1.;
const float fbeta = 0.;
......@@ -52,9 +94,15 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
bn_params,
PyGpuArray_DEV_DATA(scale),
PyGpuArray_DEV_DATA(bias),
#ifdef RUNNING_AVERAGES
running_average_factor,
PyGpuArray_DEV_DATA(running_mean),
PyGpuArray_DEV_DATA(running_var),
#else
0,
NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused
#endif
epsilon,
PyGpuArray_DEV_DATA(*x_mean),
PyGpuArray_DEV_DATA(*x_invstd)
......@@ -64,6 +112,10 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
cudnnGetErrorString(err));
return 1;
}
#ifdef RUNNING_AVERAGES
*out_running_mean = running_mean;
*out_running_var = running_var;
#endif
}
return 0;
}
......@@ -16,8 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
#endif
if (c_set_tensorNd(*outp, bn_output) != 0)
return 1;
......
from __future__ import absolute_import, print_function, division
import logging
from collections import OrderedDict
from nose.plugins.skip import SkipTest
from nose_parameterized import parameterized
......@@ -13,6 +14,7 @@ import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.nnet import bn
from .. import dnn
from ..basic_ops import GpuAllocEmpty
......@@ -1379,36 +1381,77 @@ def test_dnn_batchnorm_train():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out, x_mean, x_invstd = dnn.dnn_batch_normalization_train(
x, scale, bias, mode, eps)
running_average_factor = 0.3
# forward pass, direct interface
out_gpu, x_mean_gpu, x_invstd_gpu, \
out_running_mean_gpu, out_running_var_gpu = \
dnn.dnn_batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# forward pass, abstract interface
out_abstract, x_mean_abstract, x_invstd_abstract, \
out_running_mean_abstract, out_running_var_abstract = \
bn.batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
x_mean2 = x.mean(axis=axes, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
scale2 = T.addbroadcast(scale, *axes)
bias2 = T.addbroadcast(bias, *axes)
out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
x_mean_ref = x.mean(axis=axes, keepdims=True)
x_var_ref = x.var(axis=axes, keepdims=True)
x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
scale_ref = T.addbroadcast(scale, *axes)
bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
out_running_var_ref = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var_ref * running_average_factor
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
grads_ref = T.grad(None, wrt=[x, scale, bias], known_grads={out_ref: dy})
# compile
f = theano.function([x, scale, bias, dy],
[out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
grads + grads2, mode=mode_with_gpu)
f_gpu = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_gpu, x_mean_gpu, x_invstd_gpu,
out_running_mean_gpu, out_running_var_gpu] + grads_gpu,
mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract,
out_running_mean_abstract, out_running_var_abstract] +
grads_abstract,
mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_ref, x_mean_ref, x_invstd_ref,
out_running_mean_ref, out_running_var_ref] + grads_ref,
mode=mode_without_gpu)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
in f_abstract.maker.fgraph.toposort()])
assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
......@@ -1416,15 +1459,124 @@ def test_dnn_batchnorm_train():
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Dy)
Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[0 + 3]) # out
utt.assert_allclose(outputs[1], outputs[1 + 3]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 3]) # invstd
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_gpu[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_gpu[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
# compare gradients
utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4) # dx
utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs[8], outputs[8 + 3]) # dbias
utt.assert_allclose(outputs_gpu[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_gpu[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_gpu[7], outputs_ref[7]) # dbias
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias
def test_dnn_batchnorm_train_without_running_averages():
    """Compile and run batch_normalization_train without running averages.

    Builds the forward and backward graphs through both the direct cuDNN
    interface (``dnn.dnn_batch_normalization_train``) and the abstract
    interface (``bn.batch_normalization_train``), checks that the abstract
    Ops get replaced by their cuDNN counterparts, and runs both compiled
    functions once on random data.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # forward pass
    out_gpu, x_mean_gpu, x_invstd_gpu = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
    # BUG FIX: the abstract gradients must be taken through the abstract
    # output. The previous code used `known_grads={out_gpu: dy}` here, which
    # made this graph a duplicate of grads_gpu and left the abstract
    # backward pass untested.
    grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
    # compile
    f_gpu = theano.function([x, scale, bias, dy],
                            [out_gpu, x_mean_gpu, x_invstd_gpu] +
                            grads_gpu,
                            mode=mode_with_gpu)
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract, x_invstd_abstract] +
                                 grads_abstract,
                                 mode=mode_with_gpu)
    # check if the abstract Ops have been replaced
    assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                for n in f_abstract.maker.fgraph.toposort()])
    assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                for n in f_abstract.maker.fgraph.toposort()])
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f_abstract.maker.fgraph.toposort()])
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_gpu(X, Scale, Bias, Dy)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
    """Check that inplace_running_mean, inplace_running_var and
    inplace_output all get enabled by the inplace optimizations when the
    running averages are shared variables updated through `updates`."""
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias')
    shape_of_data = (5, 10, 30, 25)
    shape_of_params = (1, 10, 30, 25)
    bcast = (True, False, False, False)

    def make_running_average():
        # shared GPU tensor holding one value per activation
        return gpuarray_shared_constructor(
            numpy.random.randn(*shape_of_params).astype(theano.config.floatX),
            broadcastable=bcast)

    running_mean = make_running_average()
    running_var = make_running_average()
    # forward pass with running-average tracking
    (out, x_mean, x_invstd,
     new_running_mean, new_running_var) = dnn.dnn_batch_normalization_train(
        x, scale, bias, 'per-activation',
        epsilon=5e-3, running_average_factor=0.3,
        running_mean=running_mean, running_var=running_var)
    # write the new running averages back into the shared variables
    updates = OrderedDict([(running_mean, new_running_mean),
                           (running_var, new_running_var)])
    f = theano.function([x, scale, bias],
                        [out, x_mean, x_invstd],
                        updates=updates,
                        mode=mode_with_gpu)
    # exactly one batch-norm node, and all its inplace flags must be set
    bn_nodes = [node for node in f.maker.fgraph.toposort()
                if isinstance(node.op, dnn.GpuDnnBatchNorm)]
    assert len(bn_nodes) == 1
    assert bn_nodes[0].op.inplace_running_mean
    assert bn_nodes[0].op.inplace_running_var
    assert bn_nodes[0].op.inplace_output
    # run once on random data
    data_val = 4 + 3 * numpy.random.randn(*shape_of_data).astype(theano.config.floatX)
    scale_val = numpy.random.randn(*shape_of_params).astype(theano.config.floatX)
    bias_val = numpy.random.randn(*shape_of_params).astype(theano.config.floatX)
    f(data_val, scale_val, bias_val)
def test_batchnorm_inference():
......@@ -1432,34 +1584,51 @@ def test_batchnorm_inference():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias, mean, var = (vartype(n)
for n in ('x', 'scale', 'bias', 'mean', 'var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# forward pass, direct interface
out_gpu = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# forward pass, abstract interface
out_abstract = bn.batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var))
out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
scale_ref, bias_ref, mean_ref, var_ref = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var))
out_ref = (x - mean_ref) * (scale_ref / T.sqrt(var_ref + eps)) + bias_ref
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy})
grads_gpu = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_abstract: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy})
grads_ref = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_ref: dy})
# compile
f = theano.function([x, scale, bias, mean, var, dy],
[out, out2] + grads + grads2, mode=mode_with_gpu)
f_gpu = theano.function([x, scale, bias, mean, var, dy],
[out_gpu] + grads_gpu, mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, mean, var, dy],
[out_abstract] + grads_abstract, mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, mean, var, dy],
[out_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
......@@ -1469,15 +1638,106 @@ def test_batchnorm_inference():
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Mean, Var, Dy)
outputs_gpu = f_gpu(X, Scale, Bias, Mean, Var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Mean, Var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Mean, Var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[1]) # out
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
# compare gradients
utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5) # dx
utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5) # dscale
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_gpu[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_gpu[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_gpu[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_gpu[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_abstract[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_abstract[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_abstract[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
def test_batchnorm_inference_inplace():
    """Check that GpuDnnBatchNormInference is turned inplace by the
    inplace optimization and that the compiled function runs."""
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias, mean, var = (T.tensor4(n) for n in ('x', 'scale', 'bias', 'mean', 'var'))
    shp_data = (5, 10, 30, 25)
    shp_param = (1, 10, 30, 25)
    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function([x, scale, bias, mean, var], [out], mode=mode_with_gpu)
    # exactly one inference node, and it must have inplace enabled
    matches = [node for node in f.maker.fgraph.toposort()
               if isinstance(node.op, dnn.GpuDnnBatchNormInference)]
    assert len(matches) == 1
    assert matches[0].op.inplace
    # run once on random data (variance drawn non-negative via rand)
    data_val = 4 + 3 * numpy.random.randn(*shp_data).astype(theano.config.floatX)
    scale_val = numpy.random.randn(*shp_param).astype(theano.config.floatX)
    bias_val = numpy.random.randn(*shp_param).astype(theano.config.floatX)
    mean_val = numpy.random.randn(*shp_param).astype(theano.config.floatX)
    var_val = numpy.random.rand(*shp_param).astype(theano.config.floatX)
    f(data_val, scale_val, bias_val, mean_val, var_val)
def test_dnn_batchnorm_valid_and_invalid_axes():
    """Check cuDNN lifting of the abstract batch-norm Ops per axes list.

    For axes lists cuDNN supports ('per-activation' and 'spatial'), the
    abstract Ops must be replaced by the GpuDnnBatchNorm* Ops; for an
    unsupported axes list they must still be replaced (by a non-cuDNN
    implementation), just not by the cuDNN Ops.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        x, scale, bias, mean, var, dy = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var', 'dy'))
        ndim = x.ndim
        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = bn.batch_normalization_train(
                x, scale, bias, axes)
            out_test = bn.batch_normalization_test(
                x, scale, bias, mean, var, axes)
            # backward pass
            # (fixed: `dy` was redundantly re-created here although it is
            # already built together with the other variables above)
            grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)
            if axes in valid_axes_lists:
                # check if the abstract Ops have been replaced by the cuDNN Ops
                assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
                            in f.maker.fgraph.toposort()])
                assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
            else:
                # check if the abstract Ops have been replaced, but not by the cuDNN Ops
                assert not any([isinstance(n.op, (dnn.GpuDnnBatchNorm,
                                                  dnn.GpuDnnBatchNormGrad,
                                                  bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
def test_dnn_rnn_gru():
......
......@@ -12,7 +12,7 @@ import warnings
import theano
from theano.compat import get_unbound_function
from theano.compile import optdb
from theano.gof import EquilibriumDB, SequenceDB
from theano.gof import EquilibriumDB, SequenceDB, TopoOptimizer
from theano.gof.cmodule import get_lib_extension
from theano.gof.compilelock import get_lock, release_lock
from theano import config
......@@ -40,6 +40,17 @@ def register_opt(*tags, **kwargs):
return f
def register_inplace(*tags, **kwargs):
    """Decorator factory registering a local optimizer as an inplace
    optimization in the global optimizer database.

    The optimizer is wrapped in a TopoOptimizer (with the standard
    inplace-failure warning callback) and registered at position 60 with
    the 'fast_run', 'inplace' and 'gpu' tags plus any extra `tags`.

    Parameters
    ----------
    *tags
        Additional tags passed to ``optdb.register``.
    name : str, optional
        Registration name; defaults to the decorated optimizer's
        ``__name__``.
    """
    def f(local_opt):
        # BUG FIX: the previous expression `(kwargs and kwargs.pop('name'))`
        # raised KeyError whenever kwargs was non-empty but did not contain
        # 'name'. Popping with a default handles all cases.
        name = kwargs.pop('name', None) or local_opt.__name__
        optdb.register(
            name, TopoOptimizer(
                local_opt, failure_callback=TopoOptimizer.warn_inplace),
            60, 'fast_run', 'inplace', 'gpu', *tags)
        return local_opt
    return f
_logger_name = 'theano.sandbox.cuda'
_logger = logging.getLogger(_logger_name)
......
......@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
assert_conv_shape)
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet import bn
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp, dnn_available
......@@ -33,7 +34,7 @@ from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
from theano.sandbox.cuda.nnet import GpuSoftmax
from theano.sandbox.cuda.opt_util import (alpha_merge, output_merge,
pad_dims, unpad_dims)
from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda import gpu_seqopt, register_opt, register_inplace
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
......@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase):
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
"""
__props__ = ('mode', 'epsilon')
......@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s;
result = """
cudnnStatus_t err%(name)s;
cudnnBatchNormMode_t mode%(name)s = %(mode)s;
double exponentialAverageFactor%(name)s = %(exp_avg_factor)f;
double epsilon%(name)s = %(epsilon)e;
""" % dict(name=name,
mode=mode,
exp_avg_factor=0, # deliberately unused
epsilon=self.epsilon)
return result
def c_code_cache_version(self):
return (3, version())
return (4, version())
class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
......@@ -2422,8 +2438,26 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
Note: scale, bias, mean and variance must follow the same tensor layout!
"""
__props__ = ('mode', 'epsilon', 'inplace')
tensor_descs = ['bn_input', 'bn_output', 'bn_params']
def __init__(self, mode='per-activation', epsilon=1e-4, inplace=False):
super(GpuDnnBatchNormInference, self).__init__(mode=mode, epsilon=epsilon)
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'inplace'):
self.inplace = False
def get_op_params(self):
params = []
if self.inplace:
params.append(('INPLACE_OUTPUT', '1'))
return params
def infer_shape(self, node, shape):
# output shape equals shape of x
return [shape[0]]
......@@ -2460,10 +2494,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0)
}
// build and prepare the output variable
#ifdef INPLACE_OUTPUT
Py_XDECREF(%(outp)s);
%(outp)s = %(inp)s;
Py_INCREF(%(outp)s);
#else
if (CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp)s)) != 0)
{
%(fail)s
}
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0)
......@@ -2494,6 +2534,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference(
""" % dict(name=name, inp=inp, scale=scale, bias=bias, est_mean=est_mean,
est_var=est_var, outp=outp, fail=sub['fail'])
# add params
define_macros, undef_macros = self.get_c_macros(node, name, check_input=False)
result = """
%(define_macros)s
{
%(code)s
}
%(undef_macros)s
""" % dict(code=result, define_macros=define_macros, undef_macros=undef_macros)
return result
def grad(self, inputs, grads):
......@@ -2537,28 +2587,98 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase):
Note: scale and bias must follow the same tensor layout!
"""
__props__ = ('mode', 'epsilon', 'running_average_factor',
'running_averages', 'inplace_running_mean',
'inplace_running_var', 'inplace_output')
tensor_descs = ['bn_input', 'bn_output', 'bn_params']
def __init__(self, mode='per-activation', epsilon=1e-4,
running_average_factor=0,
running_averages=False, inplace_running_mean=False,
inplace_running_var=False, inplace_output=False):
super(GpuDnnBatchNorm, self).__init__(mode=mode, epsilon=epsilon)
self.running_average_factor = running_average_factor
self.running_averages = running_averages
self.inplace_output = inplace_output
self.inplace_running_mean = inplace_running_mean
self.inplace_running_var = inplace_running_var
self.destroy_map = {}
if self.inplace_output:
self.destroy_map[0] = [0]
if self.running_averages and self.inplace_running_mean:
self.destroy_map[3] = [3]
if self.running_averages and self.inplace_running_var:
self.destroy_map[4] = [4]
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'running_average_factor'):
self.running_average_factor = 0
if not hasattr(self, 'running_averages'):
self.running_averages = False
if not (hasattr(self, 'inplace_running_mean') and
hasattr(self, 'inplace_running_var') and
hasattr(self, 'inplace_output')):
self.inplace_running_mean = False
self.inplace_running_var = False
self.inplace_output = False
self.destroy_map = {}
def get_op_params(self):
params = []
if self.inplace_output:
params.append(('INPLACE_OUTPUT', '1'))
if self.running_averages:
params.append(('RUNNING_AVERAGES', '1'))
if self.inplace_running_mean:
params.append(('INPLACE_RUNNING_MEAN', '1'))
if self.inplace_running_var:
params.append(('INPLACE_RUNNING_VAR', '1'))
return params
def infer_shape(self, node, shape):
# first output equals shape of x
# second and third output equal shape of scale
return [shape[0], shape[1], shape[1]]
# other outputs equal shape of scale
return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
def make_node(self, x, scale, bias):
def make_node(self, x, scale, bias,
running_mean=None, running_var=None):
assert x.ndim == scale.ndim == bias.ndim
assert x.ndim in (4, 5)
assert self.running_averages == (running_mean is not None) == (running_var is not None)
assert (running_mean is None or running_mean.ndim == x.ndim)
assert (running_var is None or running_var.ndim == x.ndim)
x = as_cuda_ndarray_variable(x)
scale = as_cuda_ndarray_variable(scale)
bias = as_cuda_ndarray_variable(bias)
assert x.ndim == scale.ndim == bias.ndim
assert x.ndim in (4, 5)
return Apply(self, [x, scale, bias], [x.type(), scale.type(), scale.type()])
inputs = [x, scale, bias]
output_types = [x.type(), scale.type(), scale.type()]
if running_mean is not None and running_var is not None:
inputs.append(as_cuda_ndarray_variable(running_mean))
inputs.append(as_cuda_ndarray_variable(running_var))
output_types.append(scale.type())
output_types.append(scale.type())
return Apply(self, inputs, output_types)
def c_code(self, node, name, inputs, outputs, sub):
# super call to prepare common configuration
result = super(GpuDnnBatchNorm, self).c_code(node, name, inputs, outputs, sub)
# give sensible names to inputs and outputs
inp, scale, bias = inputs
outp, x_mean, x_invstd = outputs
inp, scale, bias = inputs[:3]
outp, x_mean, x_invstd = outputs[:3]
if self.running_averages:
running_average_factor = self.running_average_factor
in_running_mean = inputs[3]
in_running_var = inputs[4]
out_running_mean = outputs[3]
out_running_var = outputs[4]
else:
running_average_factor = 0.
in_running_mean = 'NULL'
in_running_var = 'NULL'
out_running_mean = 'NULL'
out_running_var = 'NULL'
# set input tensor descriptors from input tensors
result += """
......@@ -2579,6 +2699,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp
{
%(fail)s
}
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(%(out_running_mean)s);
CudaNdarray *running_mean%(name)s = %(in_running_mean)s;
Py_INCREF(running_mean%(name)s);
#else
if ((CudaNdarray_prep_output(&%(out_running_mean)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(%(out_running_mean)s, %(in_running_mean)s) != 0))
{
%(fail)s
}
CudaNdarray *running_mean%(name)s = %(out_running_mean)s;
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(%(out_running_var)s);
CudaNdarray *running_var%(name)s = %(in_running_var)s;
Py_INCREF(running_var%(name)s);
#else
if ((CudaNdarray_prep_output(&%(out_running_var)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(%(out_running_var)s, %(in_running_var)s) != 0))
{
%(fail)s
}
CudaNdarray *running_var%(name)s = %(out_running_var)s;
#endif
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0)
......@@ -2601,25 +2747,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining(
bn_params_%(name)s,
CudaNdarray_DEV_DATA(%(scale)s),
CudaNdarray_DEV_DATA(%(bias)s),
exponentialAverageFactor%(name)s,
NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused
#ifdef RUNNING_AVERAGES
%(running_average_factor)f,
CudaNdarray_DEV_DATA(running_mean%(name)s),
CudaNdarray_DEV_DATA(running_var%(name)s),
#else
0,
NULL,
NULL,
#endif
epsilon%(name)s,
CudaNdarray_DEV_DATA(%(x_mean)s),
CudaNdarray_DEV_DATA(%(x_invstd)s)
);
}
#ifdef RUNNING_AVERAGES
%(out_running_mean)s = running_mean%(name)s;
%(out_running_var)s = running_var%(name)s;
#endif
""" % dict(name=name, inp=inp, scale=scale, bias=bias, outp=outp,
x_mean=x_mean, x_invstd=x_invstd, fail=sub['fail'])
x_mean=x_mean, x_invstd=x_invstd,
running_average_factor=running_average_factor,
in_running_mean=in_running_mean, in_running_var=in_running_var,
out_running_mean=out_running_mean, out_running_var=out_running_var,
fail=sub['fail'])
# add params
define_macros, undef_macros = self.get_c_macros(node, name, check_input=False)
result = """
%(define_macros)s
{
%(code)s
}
%(undef_macros)s
""" % dict(code=result, define_macros=define_macros, undef_macros=undef_macros)
return result
def grad(self, inputs, grads):
x, scale, bias = inputs
x, scale, bias = inputs[:3]
dy = grads[0]
_, x_mean, x_invstd = self(x, scale, bias)
return GpuDnnBatchNormGrad(self.mode, self.epsilon)(x, dy, scale,
x_mean, x_invstd)
_, x_mean, x_invstd = self(*inputs)[:3]
disconnected_outputs = []
# Optional running_mean and running_var.
for i in range(3, len(inputs)):
disconnected_outputs.append(DisconnectedType()())
return GpuDnnBatchNormGrad(self.mode, self.epsilon)(
x, dy, scale, x_mean, x_invstd) + disconnected_outputs
def connection_pattern(self, node):
patterns = [[True, True, True], # x
[True, True, True], # scale
[True, True, True]] # bias
# Optional running_mean and running_var are only
# connected to their new values.
for i in range(3, len(node.inputs)):
patterns[0].append(True)
for pattern in patterns[1:]:
pattern.append(False)
patterns.append([False] * (i) + [True])
return patterns
class GpuDnnBatchNormGrad(GpuDnnBatchNormBase):
......@@ -2722,7 +2909,8 @@ err%(name)s = cudnnBatchNormalizationBackward(
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4):
epsilon=1e-4, running_average_factor=0.1,
running_mean=None, running_var=None):
"""
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
......@@ -2742,6 +2930,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns
-------
......@@ -2749,8 +2954,14 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Batch-normalized inputs.
mean : tensor
Means of `inputs` across the normalization axes.
stdinv : tensor
invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes
-----
......@@ -2762,31 +2973,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
stdinv = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
out = (inputs - mean) * gamma * stdinv + beta
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4).
"""
ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
(gamma.ndim, beta.ndim, ndim))
if (running_mean is None) != (running_var is None):
raise ValueError("running_mean and running_var must either both be "
"given or both be None")
if running_mean is not None and running_mean.ndim != ndim:
raise ValueError("running_mean must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_mean.ndim, ndim))
if running_var is not None and running_var.ndim != ndim:
raise ValueError("running_var must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_var.ndim, ndim))
if epsilon < 1e-5:
raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
running_averages = (running_var is not None and running_var is not None)
if ndim < 4:
inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
beta = theano.tensor.shape_padright(beta, 4 - ndim)
batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon)
result = tuple(batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta)))
if running_averages:
running_mean = theano.tensor.shape_padright(running_mean, 4 - ndim)
running_var = theano.tensor.shape_padright(running_var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
params_shape = gamma.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
if running_averages:
running_mean = theano.tensor.flatten(running_mean, 5)
running_var = theano.tensor.flatten(running_var, 5)
batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon,
running_average_factor=running_average_factor,
running_averages=running_averages)
if running_averages:
out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta),
running_mean=gpu_contiguous(running_mean),
running_var=gpu_contiguous(running_var))
if new_running_mean.broadcastable != running_mean.broadcastable:
new_running_mean = tensor.patternbroadcast(new_running_mean, running_mean.broadcastable)
if new_running_var.broadcastable != running_var.broadcastable:
new_running_var = tensor.patternbroadcast(new_running_var, running_var.broadcastable)
result = (out, mean, invstd, new_running_mean, new_running_var)
else:
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta))
if ndim < 4:
result = tuple(theano.tensor.flatten(r, ndim) for r in result)
elif ndim > 5:
result = (theano.tensor.reshape(result[0], inputs_shape),) + tuple(
theano.tensor.reshape(r, params_shape) for r in result[1:])
return result
......@@ -2839,9 +3097,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
......@@ -2859,12 +3114,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta = theano.tensor.shape_padright(beta, 4 - ndim)
mean = theano.tensor.shape_padright(mean, 4 - ndim)
var = theano.tensor.shape_padright(var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
mean = theano.tensor.flatten(mean, 5)
var = theano.tensor.flatten(var, 5)
batchnorm_op = GpuDnnBatchNormInference(mode=mode, epsilon=epsilon)
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), gpu_contiguous(mean),
gpu_contiguous(var))
if ndim < 4:
result = theano.tensor.flatten(result, ndim)
elif ndim > 5:
result = theano.tensor.reshape(result, inputs_shape)
return result
......@@ -3334,3 +3598,235 @@ def local_abstractconv3d_cudnn(node):
subsample=node.op.subsample,
conv_mode=conv_mode)
return [rval]
@local_optimizer([bn.AbstractBatchNormTrain])
def local_abstract_batch_norm_train_cudnn(node):
    """Replace an AbstractBatchNormTrain node by the cuDNN training Op
    when the input lives on the GPU and the parameters are supported."""
    if not isinstance(node.op, bn.AbstractBatchNormTrain):
        return None
    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    running_mean = node.inputs[5] if len(node.inputs) > 5 else None
    running_var = node.inputs[6] if len(node.inputs) > 6 else None
    # only lift when the input already lives on the GPU
    # TODO what about the output?
    if not (isinstance(x.type, CudaNdarrayType) or
            (x.owner and isinstance(x.owner.op, HostFromGpu))):
        return None
    # map the normalization axes onto a cuDNN mode, if possible
    norm_axes = tuple(node.op.axes)
    if norm_axes == (0,):
        mode = 'per-activation'
    elif norm_axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None
    # epsilon must be a known constant and satisfy cuDNN's minimum
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None
    # the running-average factor must also be a known constant
    try:
        factor = float(theano.tensor.get_scalar_constant_value(running_average_factor))
    except theano.tensor.NotScalarConstantError:
        return None
    if not dnn_available():
        return None
    gpu_x = as_cuda_ndarray_variable(x)
    gpu_scale = as_cuda_ndarray_variable(scale)
    gpu_bias = as_cuda_ndarray_variable(bias)
    call_args = [gpu_x, gpu_scale, gpu_bias, mode, eps, factor]
    if running_mean is not None and running_var is not None:
        call_args.append(running_mean)
        call_args.append(running_var)
    results = list(dnn_batch_normalization_train(*call_args))
    # transfer back any output that the original graph expected on the CPU
    for idx in range(len(node.outputs)):
        if isinstance(node.outputs[idx].type, tensor.TensorType):
            results[idx] = tensor.as_tensor_variable(results[idx])
    # TODO copy_stack_trace?
    return results
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_output(node):
    """Rebuild a GpuDnnBatchNorm so that it writes its main output
    inplace, keeping all other flags unchanged."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm) or op.inplace_output:
        return
    inplace_op = GpuDnnBatchNorm(mode=op.mode,
                                 epsilon=op.epsilon,
                                 running_average_factor=op.running_average_factor,
                                 running_averages=op.running_averages,
                                 inplace_running_mean=op.inplace_running_mean,
                                 inplace_running_var=op.inplace_running_var,
                                 inplace_output=True)
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_mean(node):
    """Let GpuDnnBatchNorm overwrite its running-mean input in place."""
    op = node.op
    # Only applies when running averages are computed and the mean
    # update is not already in-place.
    if (not isinstance(op, GpuDnnBatchNorm) or
            not op.running_averages or op.inplace_running_mean):
        return
    replacement = GpuDnnBatchNorm(mode=op.mode,
                                  epsilon=op.epsilon,
                                  running_average_factor=op.running_average_factor,
                                  running_averages=op.running_averages,
                                  inplace_running_mean=True,
                                  inplace_running_var=op.inplace_running_var,
                                  inplace_output=op.inplace_output)
    return replacement(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_var(node):
    """Let GpuDnnBatchNorm overwrite its running-variance input in place."""
    op = node.op
    # Only applies when running averages are computed and the variance
    # update is not already in-place.
    if (not isinstance(op, GpuDnnBatchNorm) or
            not op.running_averages or op.inplace_running_var):
        return
    replacement = GpuDnnBatchNorm(mode=op.mode,
                                  epsilon=op.epsilon,
                                  running_average_factor=op.running_average_factor,
                                  running_averages=op.running_averages,
                                  inplace_running_mean=op.inplace_running_mean,
                                  inplace_running_var=True,
                                  inplace_output=op.inplace_output)
    return replacement(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_gpu_batch_norm_inference_inplace(node):
    """Make GpuDnnBatchNormInference compute its output in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNormInference) or op.inplace:
        return
    inplace_op = GpuDnnBatchNormInference(mode=op.mode,
                                          epsilon=op.epsilon,
                                          inplace=True)
    return [inplace_op(*node.inputs)]
@local_optimizer([bn.AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad_cudnn(node):
    """Lift AbstractBatchNormTrainGrad to cuDNN's batch-norm gradient Op.

    Returns [g_wrt_inputs, g_wrt_scale, g_wrt_bias] on success, or None
    to leave the node unchanged.
    """
    if not isinstance(node.op, bn.AbstractBatchNormTrainGrad):
        return None
    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    # input on gpu? TODO what about the output?
    x_on_gpu = (isinstance(x.type, CudaNdarrayType) or
                (x.owner and isinstance(x.owner.op, HostFromGpu)))
    dy_on_gpu = (isinstance(dy.type, CudaNdarrayType) or
                 (dy.owner and isinstance(dy.owner.op, HostFromGpu)))
    if not (x_on_gpu or dy_on_gpu):
        return None
    # convert axes to cuDNN mode
    axes = tuple(node.op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        # cuDNN implements only the two modes above.
        return None
    # cuDNN handles 4D/5D tensors only: pad lower-dimensional inputs with
    # broadcastable trailing axes, or flatten the trailing dimensions of
    # higher-dimensional inputs into the fifth axis.
    ndim = x.ndim
    if ndim < 4:
        x = theano.tensor.shape_padright(x, 4 - ndim)
        dy = theano.tensor.shape_padright(dy, 4 - ndim)
        scale = theano.tensor.shape_padright(scale, 4 - ndim)
        x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim)
        x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim)
    elif ndim > 5:
        # remember the original shapes so the results can be reshaped back
        x_shape = x.shape
        params_shape = scale.shape
        x = theano.tensor.flatten(x, 5)
        dy = theano.tensor.flatten(dy, 5)
        scale = theano.tensor.flatten(scale, 5)
        x_mean = theano.tensor.flatten(x_mean, 5)
        x_invstd = theano.tensor.flatten(x_invstd, 5)
    # epsilon must be a compile-time constant for the cuDNN interface.
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        # cuDNN imposes a minimum epsilon of 1e-5.
        return None
    if not dnn_available():
        return None
    x = as_cuda_ndarray_variable(x)
    dy = as_cuda_ndarray_variable(dy)
    scale = as_cuda_ndarray_variable(scale)
    x_mean = as_cuda_ndarray_variable(x_mean)
    x_invstd = as_cuda_ndarray_variable(x_invstd)
    g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
        GpuDnnBatchNormGrad(mode, epsilon=eps)(x, dy, scale, x_mean, x_invstd)
    # undo the dimension padding / flattening applied above
    if ndim < 4:
        g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim)
        g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim)
        g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim)
    elif ndim > 5:
        g_wrt_inputs = theano.tensor.reshape(g_wrt_inputs, x_shape)
        g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape)
        g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape)
    # If the original output was on CPU, we have to transfer it
    if isinstance(node.outputs[0].type, tensor.TensorType):
        g_wrt_inputs = tensor.as_tensor_variable(g_wrt_inputs)
    if isinstance(node.outputs[1].type, tensor.TensorType):
        g_wrt_scale = tensor.as_tensor_variable(g_wrt_scale)
    if isinstance(node.outputs[2].type, tensor.TensorType):
        g_wrt_bias = tensor.as_tensor_variable(g_wrt_bias)
    # TODO copy_stack_trace?
    return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@local_optimizer([bn.AbstractBatchNormInference])
def local_abstract_batch_norm_inference_cudnn(node):
    """Replace AbstractBatchNormInference with the cuDNN test-mode Op.

    Returns a one-element list with the replacement output, or None to
    leave the node unchanged.
    """
    if not isinstance(node.op, bn.AbstractBatchNormInference):
        return None
    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs
    # cuDNN supports only per-activation and spatial normalization.
    axes = tuple(node.op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None
    # input on gpu? TODO what about the output?
    on_gpu = (isinstance(x.type, CudaNdarrayType) or
              (x.owner is not None and isinstance(x.owner.op, HostFromGpu)))
    if not on_gpu:
        return None
    # epsilon must be a compile-time constant, at least cuDNN's minimum.
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5 or not dnn_available():
        return None
    gpu_args = [as_cuda_ndarray_variable(v)
                for v in (x, scale, bias, estimated_mean, estimated_variance)]
    out = dnn_batch_normalization_test(gpu_args[0], gpu_args[1], gpu_args[2],
                                       gpu_args[3], gpu_args[4], mode, eps)
    # If the original output was on CPU, we have to transfer it
    # TODO copy_stack_trace?
    if isinstance(node.outputs[0].type, tensor.TensorType):
        out = tensor.as_tensor_variable(out)
    return [out]
......@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
# Register cuDNN batch normalization implementation
# A dedicated local group DB collects the three abstract-batch-norm
# lifters so they run together during GPU graph optimization.
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpu_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)
# cuDNN optimizations are only registered if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from .dnn import (local_abstract_batch_norm_train_cudnn,
                  local_abstract_batch_norm_train_grad_cudnn,
                  local_abstract_batch_norm_inference_cudnn)  # noqa: 402
# All three lifters share priority 20 and the same tags, so they can be
# enabled/disabled as a unit via the 'batchnorm_dnn' or 'cudnn' tags.
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_dnn',
                                      local_abstract_batch_norm_train_cudnn, 20,
                                      'batchnorm_dnn',
                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_grad_dnn',
                                      local_abstract_batch_norm_train_grad_cudnn, 20,
                                      'batchnorm_dnn',
                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_inference_dnn',
                                      local_abstract_batch_norm_inference_cudnn, 20,
                                      'batchnorm_dnn',
                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
from __future__ import absolute_import, print_function, division
from collections import OrderedDict
import logging
import os
import sys
......@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.nnet import bn
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
......@@ -730,52 +732,201 @@ def test_batchnorm_train():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
for vartype in (tensor6, T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out, x_mean, x_invstd = cuda.dnn.dnn_batch_normalization_train(
x, scale, bias, mode, eps)
running_average_factor = 0.3
# forward pass, direct interface
out_gpu, x_mean_gpu, x_invstd_gpu, \
out_running_mean_gpu, out_running_var_gpu = \
dnn.dnn_batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# forward pass, abstract interface
out_abstract, x_mean_abstract, x_invstd_abstract, \
out_running_mean_abstract, out_running_var_abstract = \
bn.batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
x_mean2 = x.mean(axis=axes, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
scale2 = T.addbroadcast(scale, *axes)
bias2 = T.addbroadcast(bias, *axes)
out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
x_mean_ref = x.mean(axis=axes, keepdims=True)
x_var_ref = x.var(axis=axes, keepdims=True)
x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
scale_ref = T.addbroadcast(scale, *axes)
bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
out_running_var_ref = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var_ref * running_average_factor
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
grads_ref = T.grad(None, wrt=[x, scale, bias], known_grads={out_ref: dy})
# compile
f = theano.function([x, scale, bias, dy],
[out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
grads + grads2, mode=mode_with_gpu)
f_gpu = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_gpu, x_mean_gpu, x_invstd_gpu,
out_running_mean_gpu, out_running_var_gpu] + grads_gpu,
mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract,
out_running_mean_abstract, out_running_var_abstract] +
grads_abstract,
mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_ref, x_mean_ref, x_invstd_ref,
out_running_mean_ref, out_running_var_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
in f_abstract.maker.fgraph.toposort()])
assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
outputs = f(X, Scale, Bias, Dy)
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[0 + 3]) # out
utt.assert_allclose(outputs[1], outputs[1 + 3]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 3]) # invstd
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_gpu[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_gpu[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
# compare gradients
utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4) # dx
utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs[8], outputs[8 + 3]) # dbias
utt.assert_allclose(outputs_gpu[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_gpu[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_gpu[7], outputs_ref[7]) # dbias
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias
def test_dnn_batchnorm_train_without_running_averages():
    """Compile and run batch_normalization_train without running averages.

    Checks that both the direct cuDNN interface and the abstract interface
    build working graphs (forward and backward) when no running averages
    are requested, and that the abstract Ops get replaced by cuDNN Ops.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # forward pass
    out_gpu, x_mean_gpu, x_invstd_gpu = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
    # BUGFIX: differentiate through the abstract graph (out_abstract), not
    # the direct-interface graph, so f_abstract actually exercises the
    # gradient of the abstract Ops.
    grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
    # compile
    f_gpu = theano.function([x, scale, bias, dy],
                            [out_gpu, x_mean_gpu, x_invstd_gpu] +
                            grads_gpu,
                            mode=mode_with_gpu)
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract, x_invstd_abstract] +
                                 grads_abstract,
                                 mode=mode_with_gpu)
    # check if the abstract Ops have been replaced
    assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                for n in f_abstract.maker.fgraph.toposort()])
    assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                for n in f_abstract.maker.fgraph.toposort()])
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f_abstract.maker.fgraph.toposort()])
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_gpu(X, Scale, Bias, Dy)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
    """Check the inplace optimizations on GpuDnnBatchNorm.

    When the running averages live in shared variables that are updated
    by the function, the inplace optimizers should flag the batch-norm
    node to overwrite its output and both running-average inputs.
    """
    # test inplace_running_mean and inplace_running_var
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # shared variables allow in-place destruction of the running averages
    running_mean = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=(True, False, False, False))
    running_var = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=(True, False, False, False))
    # forward pass
    out, x_mean, x_invstd, new_running_mean, new_running_var = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation',
                                          epsilon=5e-3, running_average_factor=0.3,
                                          running_mean=running_mean, running_var=running_var)
    # update running averages
    updates = OrderedDict()
    updates[running_mean] = new_running_mean
    updates[running_var] = new_running_var
    # compile
    f = theano.function([x, scale, bias],
                        [out, x_mean, x_invstd],
                        updates=updates,
                        mode=mode_with_gpu)
    # check for the inplace settings
    nodes = [n for n in f.maker.fgraph.toposort()
             if isinstance(n.op, dnn.GpuDnnBatchNorm)]
    assert len(nodes) == 1
    assert nodes[0].op.inplace_running_mean
    assert nodes[0].op.inplace_running_var
    assert nodes[0].op.inplace_output
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias)
def test_batchnorm_inference():
......@@ -785,53 +936,160 @@ def test_batchnorm_inference():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias, mean, var = (vartype(n) for n in ('x', 'scale',
'bias', 'mean',
'var'))
for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias, mean, var = (vartype(n)
for n in ('x', 'scale', 'bias', 'mean', 'var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out = cuda.dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# forward pass, direct interface
out_gpu = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# forward pass, abstract interface
out_abstract = bn.batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var))
out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
scale_ref, bias_ref, mean_ref, var_ref = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var))
out_ref = (x - mean_ref) * (scale_ref / T.sqrt(var_ref + eps)) + bias_ref
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy})
grads_gpu = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_abstract: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy})
grads_ref = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_ref: dy})
# compile
f = theano.function([x, scale, bias, mean, var, dy],
[out, out2] + grads + grads2, mode=mode_with_gpu)
f_gpu = theano.function([x, scale, bias, mean, var, dy],
[out_gpu] + grads_gpu, mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, mean, var, dy],
[out_abstract] + grads_abstract, mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, mean, var, dy],
[out_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
Mean = numpy.random.randn(*param_shape).astype('float32')
Var = numpy.random.rand(*param_shape).astype('float32')
outputs = f(X, Scale, Bias, Mean, Var, Dy)
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Mean, Var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Mean, Var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Mean, Var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[1]) # out
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
# compare gradients
utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5) # dx
utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5) # dscale
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_gpu[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_gpu[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_gpu[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_gpu[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_abstract[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_abstract[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_abstract[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
def test_batchnorm_inference_inplace():
    """Check that GpuDnnBatchNormInference is flagged in-place by the optimizer."""
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias, mean, var = (T.tensor4(name)
                                 for name in ('x', 'scale', 'bias', 'mean', 'var'))
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function([x, scale, bias, mean, var], [out], mode=mode_with_gpu)
    # the optimized graph should hold exactly one inference node, in-place
    bn_nodes = [node for node in f.maker.fgraph.toposort()
                if isinstance(node.op, dnn.GpuDnnBatchNormInference)]
    assert len(bn_nodes) == 1
    assert bn_nodes[0].op.inplace
    # run once to make sure the compiled function executes
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias, Mean, Var)
def test_dnn_batchnorm_valid_and_invalid_axes():
    """Check which normalization axes are lifted to cuDNN Ops.

    Valid axes (per-activation and spatial) must be replaced by cuDNN
    Ops; other axes must still be replaced by some non-abstract,
    non-cuDNN implementation.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        x, scale, bias, mean, var, dy = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var', 'dy'))
        ndim = x.ndim
        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = bn.batch_normalization_train(
                x, scale, bias, axes)
            out_test = bn.batch_normalization_test(
                x, scale, bias, mean, var, axes)
            # backward pass
            # NOTE(review): dy is re-created here although it was already
            # built above; harmless but redundant.
            dy = vartype('dy')
            grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)
            if axes in valid_axes_lists:
                # check if the abstract Ops have been replaced by the cuDNN Ops
                assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
                            in f.maker.fgraph.toposort()])
                assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
            else:
                # check if the abstract Ops have been replaced, but not by the cuDNN Ops
                assert not any([isinstance(n.op, (dnn.GpuDnnBatchNorm,
                                                  dnn.GpuDnnBatchNormGrad,
                                                  bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
def test_dnn_tag():
......
from __future__ import absolute_import, print_function, division
import numpy
import theano
from theano import Apply, Op
from theano.gof import local_optimizer
from theano.gof.opt import copy_stack_trace
from theano.tensor import as_tensor_variable, TensorType
from theano.tensor import basic as T
from theano.tensor.opt import register_specialize_device
from theano.scalar import Composite
from theano.scalar import add, sub, true_div, mul
......@@ -37,7 +44,7 @@ def batch_normalization(inputs, gamma, beta, mean, std,
"""
This function will build the symbolic graph for applying batch normalization
to a set of activations.
Also works on GPUs
Also works on GPUs, but is not optimized using cuDNN.
.. versionadded:: 0.7.1
......@@ -75,3 +82,631 @@ def batch_normalization(inputs, gamma, beta, mean, std,
raise ValueError(
'mode must be either "low_mem", "high_mem"')
return rval
def _prepare_batch_normalization_axes(axes, ndim):
if axes == 'per-activation':
axes = (0,)
elif axes == 'spatial':
axes = (0,) + tuple(range(2, ndim))
elif isinstance(axes, (tuple, list, numpy.ndarray)):
axes = tuple(int(a) for a in axes)
else:
raise ValueError('invalid axes: %s', str(axes))
axes = tuple(sorted(axes))
if len(axes) == 0:
raise ValueError('there should be at least one normalization axis')
if min(axes) < 0 or max(axes) >= ndim:
raise ValueError('axes should be less than ndim (<%d), but %s given' % (ndim, str(axes)))
non_bc_axes = tuple(i for i in range(ndim) if i not in axes)
return axes, non_bc_axes
def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
                              epsilon=1e-4, running_average_factor=0.1,
                              running_mean=None, running_var=None):
    """
    Performs batch normalization of the given inputs, using the mean and
    variance of the inputs.

    Parameters
    ----------
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Learnable scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Learnable biases. Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values or `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly,
        if the factor is close to zero it will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.

    Returns
    -------
    out : tensor
        Batch-normalized inputs.
    mean : tensor
        Means of `inputs` across the normalization axes.
    invstd : tensor
        Inverse standard deviations of `inputs` across the normalization axes.
    new_running_mean : tensor
        New value of the running mean (only if both `running_mean` and
        `running_var` were given).
    new_running_var : tensor
        New value of the running variance (only if both `running_var` and
        `running_mean` were given).

    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)

    The returned values are equivalent to:

    .. code-block:: python

        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        mean = inputs.mean(axes, keepdims=True)
        var = inputs.var(axes, keepdims=True)
        invstd = T.inv(T.sqrt(var + epsilon))
        out = (inputs - mean) * gamma * invstd + beta

        m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
        running_mean = running_mean * (1 - running_average_factor) + \\
                       mean * running_average_factor
        running_var = running_var * (1 - running_average_factor) + \\
                      (m / (m - 1)) * var * running_average_factor
    """
    ndim = inputs.ndim
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)
    # have the parameter tensors been broadcasted yet?
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        # parameters are given without the normalized axes: build a
        # dimshuffle pattern that re-inserts them as broadcastable dims
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ['x'] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i
    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError("gamma and beta dimensionality must match the "
                         "number of non-normalized axes, or have the "
                         "same number of dimensions as the inputs; "
                         "got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, params_ndim))
    if (running_mean is None) != (running_var is None):
        raise ValueError("running_mean and running_var must either both be "
                         "given or both be None")
    if running_mean is not None and running_mean.ndim != params_ndim:
        raise ValueError("running_mean must be of the same dimensionality "
                         "as gamma and beta; got %d instead of %d" %
                         (running_mean.ndim, params_ndim))
    if running_var is not None and running_var.ndim != params_ndim:
        raise ValueError("running_var must be of the same dimensionality "
                         "as gamma and beta; got %d instead of %d" %
                         (running_var.ndim, params_ndim))
    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = numpy.cast[theano.config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))
    inputs = as_tensor_variable(inputs)
    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)
    # broadcast the parameters to the full input dimensionality
    if params_ndim != ndim:
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
    else:
        gamma = T.addbroadcast(gamma, *axes)
        beta = T.addbroadcast(beta, *axes)
    batchnorm_op = AbstractBatchNormTrain(axes=axes)
    if running_mean is not None and running_var is not None:
        running_mean = as_tensor_variable(running_mean)
        running_var = as_tensor_variable(running_var)
        if params_ndim != ndim:
            running_mean = running_mean.dimshuffle(params_dimshuffle_pattern)
            running_var = running_var.dimshuffle(params_dimshuffle_pattern)
        else:
            running_mean = T.addbroadcast(running_mean, *axes)
            running_var = T.addbroadcast(running_var, *axes)
        out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
            inputs, gamma, beta, epsilon=epsilon,
            running_average_factor=running_average_factor,
            running_mean=running_mean, running_var=running_var)
        # the Op may lose the broadcast pattern; restore it to match the inputs
        if new_running_mean.broadcastable != running_mean.broadcastable:
            new_running_mean = T.patternbroadcast(new_running_mean, running_mean.broadcastable)
        if new_running_var.broadcastable != running_var.broadcastable:
            new_running_var = T.patternbroadcast(new_running_var, running_var.broadcastable)
        results = (out, mean, invstd, new_running_mean, new_running_var)
    else:
        results = batchnorm_op(inputs, gamma, beta, epsilon=epsilon)
    if params_ndim != ndim:
        # remove the broadcasted dimensions (except from the output)
        results = ([results[0]] +
                   [r.dimshuffle(non_bc_axes) for r in results[1:]])
    return tuple(results)
def batch_normalization_test(inputs, gamma, beta, mean, var,
                             axes='per-activation', epsilon=1e-4):
    """
    Performs batch normalization of the given inputs, using the given mean and
    variance.

    Parameters
    ----------
    inputs : tensor
        The tensor to be normalized along `axes`.
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Biases. Must match the tensor layout of `gamma`.
    mean : tensor
        Means. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    var : tensor
        Variances. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).

    Returns
    -------
    out : tensor
        Batch-normalized inputs.

    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)

    The returned value is equivalent to:

    .. code-block:: python

        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        gamma, beta, mean, var = (T.addbroadcast(t, *axes)
                                  for t in (gamma, beta, mean, var))
        out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
    """
    ndim = inputs.ndim
    # Resolve 'per-activation' / 'spatial' into a concrete tuple of axes;
    # non_bc_axes are the complementary axes (the parameters' real dims).
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)
    # have the parameter tensors been broadcasted yet?
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        # Parameters are given in compact form; build the dimshuffle pattern
        # that re-inserts broadcastable dims at the normalized axes.
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ['x'] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i
    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError("gamma and beta dimensionality must match the "
                         "number of non-normalized axes, or have the "
                         "same number of dimensions as the inputs; "
                         "got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, params_ndim))
    if mean.ndim != params_ndim or var.ndim != params_ndim:
        raise ValueError("mean and var must be of the same dimensionality "
                         "as gamma and beta; got %d and %d instead of %d" %
                         (mean.ndim, var.ndim, params_ndim))
    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = numpy.cast[theano.config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))
    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)
    mean = as_tensor_variable(mean)
    var = as_tensor_variable(var)
    # Broadcast all parameters to the full dimensionality of `inputs`,
    # as required by AbstractBatchNormInference.make_node.
    if params_ndim != ndim:
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
        mean = mean.dimshuffle(params_dimshuffle_pattern)
        var = var.dimshuffle(params_dimshuffle_pattern)
    else:
        gamma = T.addbroadcast(gamma, *axes)
        beta = T.addbroadcast(beta, *axes)
        mean = T.addbroadcast(mean, *axes)
        var = T.addbroadcast(var, *axes)
    batchnorm_op = AbstractBatchNormInference(axes=axes)
    return batchnorm_op(inputs, gamma, beta, mean, var, epsilon=epsilon)
class AbstractBatchNormTrain(Op):
    """
    Abstract Op for Batch Normalization.

    This Op is intended to be replaced during graph optimization (see
    `local_abstract_batch_norm_train` below); `perform` provides a plain
    NumPy reference implementation.

    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input should be normalized.
    x : tensor
        The input to be normalized along `axes`.
    scale : tensor
        `scale` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    bias : tensor
        `bias` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values or `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly,
        if the factor is close to zero it will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - running_average_factor) + batch mean * running_average_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - running_average_factor) + (m / (m - 1)) * batch var * running_average_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None.

    Outputs: the normalized `x`, the batch mean, the batch inverse standard
    deviation, and — only when `running_mean`/`running_var` are given — the
    updated running mean and running variance.
    """
    __props__ = ('axes',)
    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        # Canonicalize to a tuple of ints so __props__-based equality/hashing
        # treats equivalent axis specs as the same Op.
        axes = tuple(int(a) for a in axes)
        self.axes = axes
    def infer_shape(self, node, shape):
        # Output 0 (the normalized x) has x's shape; all remaining outputs
        # (mean, invstd, optional running averages) have scale's shape.
        return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
    def make_node(self, x, scale, bias, epsilon=1e-4,
                  running_average_factor=0.1,
                  running_mean=None, running_var=None):
        # All tensor inputs must already be broadcast to x's dimensionality;
        # the batch_normalization_train wrapper takes care of that.
        assert x.ndim == scale.ndim == bias.ndim
        assert ((running_mean is None and running_var is None) or
                (running_mean is not None and running_var is not None))
        assert (running_mean is None or running_mean.ndim == x.ndim)
        assert (running_var is None or running_var.ndim == x.ndim)
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        if not isinstance(running_average_factor, theano.Variable):
            running_average_factor = as_tensor_variable(running_average_factor)
        inputs = [x, scale, bias, epsilon, running_average_factor]
        output_types = [x.type(), scale.type(), scale.type()]
        if running_mean is not None and running_var is not None:
            inputs.append(running_mean)
            inputs.append(running_var)
            output_types.append(scale.type())
            output_types.append(scale.type())
        return Apply(self, inputs, output_types)
    def L_op(self, inputs, outputs, grads):
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        dy = grads[0]
        # Reuse the mean and inverse standard deviation already computed in
        # the forward pass instead of recomputing them.
        _, x_mean, x_invstd = outputs[:3]
        disconnected_outputs = [
            theano.gradient.DisconnectedType()(), # epsilon
            theano.gradient.DisconnectedType()()] # running_average_factor
        # Optional running_mean and running_var.
        for i in range(5, len(inputs)):
            disconnected_outputs.append(theano.gradient.DisconnectedType()())
        return AbstractBatchNormTrainGrad(self.axes)(
            x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs
    def connection_pattern(self, node):
        # Specify that epsilon and running_average_factor are not connected to outputs.
        patterns = [[True, True, True],     # x
                    [True, True, True],     # scale
                    [True, True, True],     # bias
                    [False, False, False],  # epsilon
                    [False, False, False]]  # running_average_factor
        # Optional running_mean and running_var are only
        # connected to their new values.
        for i in range(5, len(node.inputs)):
            patterns[0].append(True)
            for pattern in patterns[1:]:
                pattern.append(False)
            # Row for running_mean (i=5) / running_var (i=6): connected
            # solely to its own updated-value output.
            patterns.append([False] * (3 + i - 5) + [True])
        return patterns
    def perform(self, node, inputs, output_storage):
        # NumPy reference implementation of the forward pass.
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))
        mean = x.mean(axes, keepdims=True)
        var = x.var(axes, keepdims=True)
        invstd = 1.0 / numpy.sqrt(var + epsilon)
        out = (x - mean) * (scale * invstd) + bias
        output_storage[0][0] = out
        output_storage[1][0] = mean
        output_storage[2][0] = invstd
        if len(inputs) > 5:
            running_mean = inputs[5]
            running_mean = running_mean * (1.0 - running_average_factor) + \
                mean * running_average_factor
            output_storage[3][0] = running_mean
        if len(inputs) > 6:
            # m / (m - 1) converts the biased batch variance into the
            # unbiased estimate used for the running average (see docstring).
            m = float(numpy.prod(x.shape) / numpy.prod(scale.shape))
            running_var = inputs[6]
            running_var = running_var * (1.0 - running_average_factor) + \
                (m / (m - 1)) * var * running_average_factor
            output_storage[4][0] = running_var
class AbstractBatchNormInference(Op):
    """
    Abstract Op for Batch Normalization.

    Normalizes its input with a fixed (estimated) mean and variance, as used
    at inference time. Intended to be replaced during graph optimization
    (see `local_abstract_batch_norm_inference` below); `perform` provides a
    NumPy reference implementation.

    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input is normalized.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    """
    __props__ = ('axes',)
    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        # Canonicalize to a tuple of ints for __props__ equality/hashing.
        axes = tuple(int(a) for a in axes)
        self.axes = axes
    def infer_shape(self, node, shape):
        # The single output has the shape of x.
        return [shape[0]]
    def make_node(self, x, scale, bias, estimated_mean, estimated_variance, epsilon=1e-4):
        # All tensor inputs must already be broadcast to x's dimensionality.
        assert x.ndim == scale.ndim == bias.ndim == estimated_mean.ndim == estimated_variance.ndim
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        return Apply(self, [x, scale, bias, estimated_mean, estimated_variance, epsilon], [x.type()])
    def grad(self, inputs, grads):
        # Symbolic gradients of
        #   out = (x - est_mean) * scale / sqrt(est_var + epsilon) + bias
        # w.r.t. every input except epsilon (disconnected).
        x, scale, bias, est_mean, est_var, epsilon = inputs
        dy = grads[0]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))
        scale, bias, est_mean, est_var = (theano.tensor.addbroadcast(t, *axes)
                                          for t in (scale, bias, est_mean, est_var))
        # define helper expressions
        est_var_eps = est_var + epsilon
        est_std = theano.tensor.sqrt(est_var_eps)
        two = theano.tensor.constant(2.)
        # define and return gradients
        dx = dy * (scale / est_std)
        dscale = (dy * (x - est_mean)).sum(axes, keepdims=True) / est_std
        dbias = dy.sum(axes, keepdims=True)
        dmean = -dy.sum(axes, keepdims=True) * (scale / est_std)
        dvar = -(dy * (x - est_mean)).sum(axes, keepdims=True) * (scale / (two * est_var_eps * est_std))
        return [dx, dscale, dbias, dmean, dvar, theano.gradient.DisconnectedType()()]
    def connection_pattern(self, node):
        # Specify that epsilon is not connected to outputs.
        return [[True], [True], [True], [True], [True], [False]]
    def perform(self, node, inputs, output_storage):
        # NumPy reference implementation of the inference formula.
        x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
        out = (x - estimated_mean) * (scale / numpy.sqrt(estimated_variance + epsilon)) + bias
        output_storage[0][0] = out
class AbstractBatchNormTrainGrad(Op):
    """
    Gradient Op for AbstractBatchNormTrain.

    Given the input `x`, the output gradient `dy`, the `scale`, and the batch
    mean / inverse standard deviation saved from the forward pass, computes
    the gradients w.r.t. `x`, `scale` and `bias`. Instantiated by
    `AbstractBatchNormTrain.L_op`.
    """
    __props__ = ('axes',)
    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        # Canonicalize to a tuple of ints for __props__ equality/hashing.
        axes = tuple(int(a) for a in axes)
        self.axes = axes
    def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):
        # All tensor inputs must already be broadcast to x's dimensionality.
        assert x.ndim == dy.ndim == scale.ndim == x_mean.ndim == x_invstd.ndim
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        return Apply(self, [x, dy, scale, x_mean, x_invstd, epsilon],
                     [x.type(), scale.type(), scale.type()])
    def infer_shape(self, node, shape):
        # g_wrt_inputs has x's shape; g_wrt_scale and g_wrt_bias have
        # scale's shape (input index 2).
        return [shape[0], shape[2], shape[2]]
    def perform(self, node, inputs, output_storage):
        # NumPy reference implementation of the batch-norm backward pass.
        x, dy, scale, x_mean, x_invstd, epsilon = inputs
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))
        x_diff = x - x_mean
        mean_dy_x_diff = numpy.mean(dy * x_diff, axis=axes, keepdims=True)
        c = (dy * x_invstd) - (x_diff * mean_dy_x_diff * (x_invstd ** 3))
        g_wrt_inputs = scale * (c - numpy.mean(c, axis=axes, keepdims=True))
        g_wrt_scale = numpy.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
        g_wrt_bias = numpy.sum(dy, axis=axes, keepdims=True)
        output_storage[0][0] = g_wrt_inputs
        output_storage[1][0] = g_wrt_scale
        output_storage[2][0] = g_wrt_bias
@local_optimizer([AbstractBatchNormTrain])
def local_abstract_batch_norm_train(node):
    """Replace AbstractBatchNormTrain by an equivalent graph of tensor Ops.

    Returns the replacement outputs (with the original broadcastable
    patterns preserved), or None to decline when the node is not
    applicable or any input is not a plain TensorType.
    """
    if not isinstance(node.op, AbstractBatchNormTrain):
        return None
    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    # Decline if any input (including the optional running_mean/running_var
    # at indices 5 and 6) is not a plain CPU tensor.
    if not all(isinstance(v.type, TensorType) for v in node.inputs):
        return None
    mean = x.mean(axes, keepdims=True)
    var = x.var(axes, keepdims=True)
    invstd = T.inv(T.sqrt(var + epsilon))
    out = (x - mean) * (scale * invstd) + bias
    results = [out, mean, invstd]
    if len(node.inputs) > 5:
        running_mean = node.inputs[5]
        running_mean = running_mean * (1.0 - running_average_factor) + \
            mean * running_average_factor
        results.append(running_mean)
    if len(node.inputs) > 6:
        # m / (m - 1): unbiased-variance correction for the running average,
        # where m is the number of elements averaged over per statistic.
        m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
        running_var = node.inputs[6]
        running_var = running_var * (1.0 - running_average_factor) + \
            (m / (m - 1)) * var * running_average_factor
        results.append(running_var)
    # Keep the broadcastable patterns of the node's original outputs.
    results = [T.patternbroadcast(r, r_orig.broadcastable)
               for (r, r_orig) in zip(results, node.outputs)]
    # Fix: use a distinct loop variable; the original reused the name `var`,
    # shadowing the variance graph computed above.
    for new_var in theano.gof.graph.variables(node.inputs, results):
        if new_var not in node.inputs:
            copy_stack_trace(node.outputs[0], new_var)
    return results
@local_optimizer([AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad(node):
    """Replace AbstractBatchNormTrainGrad by an equivalent tensor-Op graph.

    Declines (returns None) when the node is not applicable or any input
    is not a plain TensorType.
    """
    if not isinstance(node.op, AbstractBatchNormTrainGrad):
        return None
    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if any(not isinstance(inp.type, TensorType) for inp in node.inputs):
        return None
    # Symbolic version of the backward pass in AbstractBatchNormTrainGrad.perform.
    centered = x - x_mean
    mean_dy_centered = T.mean(dy * centered, axis=axes, keepdims=True)
    coeff = (dy * x_invstd) - centered * (mean_dy_centered * (x_invstd ** 3))
    g_x = scale * (coeff - T.mean(coeff, axis=axes, keepdims=True))
    g_scale = T.sum(dy * x_invstd * centered, axis=axes, keepdims=True)
    g_bias = T.sum(dy, axis=axes, keepdims=True)
    # Keep the broadcastable patterns of the original outputs.
    replacements = [T.patternbroadcast(new, old.broadcastable)
                    for (new, old) in zip([g_x, g_scale, g_bias], node.outputs)]
    # Propagate the stack trace onto every newly-created variable.
    for fresh in theano.gof.graph.variables(node.inputs, replacements):
        if fresh not in node.inputs:
            copy_stack_trace(node.outputs[0], fresh)
    return replacements
@local_optimizer([AbstractBatchNormInference])
def local_abstract_batch_norm_inference(node):
    """Replace AbstractBatchNormInference by the plain inference formula.

    Declines (returns None) when the node is not applicable or any input
    is not a plain TensorType.
    """
    if not isinstance(node.op, AbstractBatchNormInference):
        return None
    if any(not isinstance(inp.type, TensorType) for inp in node.inputs):
        return None
    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs
    # out = (x - mean) * scale / sqrt(var + eps) + bias, preserving the
    # broadcastable pattern of the original output.
    normed = (x - estimated_mean) * (scale / T.sqrt(estimated_variance + epsilon)) + bias
    normed = T.patternbroadcast(normed, node.outputs[0].broadcastable)
    # Propagate the stack trace onto every newly-created variable.
    for fresh in theano.gof.graph.variables(node.inputs, [normed]):
        if fresh not in node.inputs:
            copy_stack_trace(node.outputs[0], fresh)
    return [normed]
# Register CPU optimizations.
# The three local optimizers above are grouped into a LocalGroupDB and
# registered (at position 30, under the 'fast_compile' and 'fast_run' tags)
# so that the abstract batch-norm Ops are replaced by plain tensor graphs
# when compiling for the CPU.
bn_groupopt = theano.gof.optdb.LocalGroupDB()
bn_groupopt.__name__ = 'batchnorm_opts'
register_specialize_device(bn_groupopt, 'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_train',
                     local_abstract_batch_norm_train, 30,
                     'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_train_grad',
                     local_abstract_batch_norm_train_grad, 30,
                     'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_inference',
                     local_abstract_batch_norm_inference, 30,
                     'fast_compile', 'fast_run')
from __future__ import absolute_import, print_function, division
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
import numpy
from theano.tensor.nnet.bn import batch_normalization
from theano.tensor.nnet import bn
def test_BNComposite():
......@@ -39,7 +40,7 @@ def test_BNComposite():
f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
res_ref = f_ref(X, G, B, M, V)
for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x, g, b, m, v, mode=mode)
bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
f = theano.function([x, b, g, m, v], [bn_op])
res = f(X, G, B, M, V)
utt.assert_allclose(res_ref, res)
......@@ -47,7 +48,7 @@ def test_BNComposite():
theano.config.compute_test_value = orig
def test_bn():
def test_batch_normalization():
def bn_ref(x, G, B, M, V):
n = (x - M) / V
......@@ -70,28 +71,28 @@ def test_bn():
f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
res_ref = f_ref(X, G, B, M, V)
for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x, g, b, m, v, mode=mode)
bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
f = theano.function([x, b, g, m, v], [bn_op])
res = f(X, G, B, M, V)
utt.assert_allclose(res_ref, res)
def bn(inputs, gamma, beta, mean, std):
return batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
utt.verify_grad(bn, [X, G, B, M, V])
def bn_f(inputs, gamma, beta, mean, std):
return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
utt.verify_grad(bn_f, [X, G, B, M, V])
bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True))
f_ref = theano.function([x, b, g], [bn_ref_op])
res_ref = f_ref(X, G, B)
for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode)
bn_op = bn.batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode)
f = theano.function([x, b, g], [bn_op])
res = f(X, G, B)
utt.assert_allclose(res_ref, res)
def bn(inputs, gamma, beta, mean, std):
return batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
utt.verify_grad(batch_normalization, [X, G, B,
X.mean(axis=0)[numpy.newaxis], X.std(axis=0)[numpy.newaxis]])
def bn_f(inputs, gamma, beta, mean, std):
return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
utt.verify_grad(bn_f, [X, G, B,
X.mean(axis=0)[numpy.newaxis], X.std(axis=0)[numpy.newaxis]])
def test_bn_feature_maps():
......@@ -122,21 +123,296 @@ def test_bn_feature_maps():
res_ref = f_ref(X, G, B, M, V)
for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x,
g.dimshuffle('x', 0, 'x', 'x'),
b.dimshuffle('x', 0, 'x', 'x'),
m.dimshuffle('x', 0, 'x', 'x'),
v.dimshuffle('x', 0, 'x', 'x'),
mode=mode)
bn_op = bn.batch_normalization(x,
g.dimshuffle('x', 0, 'x', 'x'),
b.dimshuffle('x', 0, 'x', 'x'),
m.dimshuffle('x', 0, 'x', 'x'),
v.dimshuffle('x', 0, 'x', 'x'),
mode=mode)
f = theano.function([x, b, g, m, v], [bn_op])
res = f(X, G, B, M, V)
utt.assert_allclose(res_ref, res)
def conv_bn(inputs, gamma, beta, mean, std):
return batch_normalization(inputs,
gamma.dimshuffle('x', 0, 'x', 'x'),
beta.dimshuffle('x', 0, 'x', 'x'),
mean.dimshuffle('x', 0, 'x', 'x'),
std.dimshuffle('x', 0, 'x', 'x'),
mode=mode)
return bn.batch_normalization(inputs,
gamma.dimshuffle('x', 0, 'x', 'x'),
beta.dimshuffle('x', 0, 'x', 'x'),
mean.dimshuffle('x', 0, 'x', 'x'),
std.dimshuffle('x', 0, 'x', 'x'),
mode=mode)
utt.verify_grad(conv_bn, [X, G, B, M, V])
def test_batch_normalization_train():
    """Check batch_normalization_train against a hand-built reference graph.

    Covers several axis specifications and input ranks (5D down to 1D),
    comparing the outputs, the updated running averages, and the gradients
    w.r.t. x, scale and bias.
    """
    utt.seed_rng()
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, running_mean, running_var = (vartype(n)
                                                         for n in ('x', 'scale', 'bias',
                                                                   'running_mean',
                                                                   'running_var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3
            # remove non-existing axes
            # NOTE(review): this rebinds the outer loop variable `axes`; it
            # only works because vartype iterates in decreasing rank order,
            # so the filtered tuple shrinks monotonically.
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue
            # forward pass
            out, x_mean, x_invstd, out_running_mean, out_running_var = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # reference forward pass
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # m: number of elements averaged over per statistic (used by the
            # unbiased-variance correction of the running variance)
            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_running_mean2 = running_mean * (1 - running_average_factor) + \
                x_mean2 * running_average_factor
            out_running_var2 = running_var * (1 - running_average_factor) + \
                (m / (m - 1)) * x_var2 * running_average_factor
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, running_mean, running_var, dy],
                                [out, x_mean, x_invstd, out_running_mean, out_running_var,
                                 out2, x_mean2, x_invstd2, out_running_mean2, out_running_var2] +
                                grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                # nan_to_num: the m / (m - 1) correction yields non-finite
                # values when m == 1, in both implementations
                utt.assert_allclose(numpy.nan_to_num(outputs[4]),
                                    numpy.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients
                utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4)  # dx
                utt.assert_allclose(outputs[11], outputs[11 + 3], rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
def test_batch_normalization_train_without_running_averages():
    """Smoke test: batch_normalization_train with no running averages.

    When running_mean/running_var are omitted, only three outputs are
    returned; this checks the graph compiles, the abstract Ops are
    optimized away, and the function runs (no value comparison).
    """
    # compile and run batch_normalization_train without running averages
    utt.seed_rng()
    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # forward pass
    out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
    # compile
    f = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads)
    # check if the abstract Ops have been replaced
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f.maker.fgraph.toposort()])
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias, Dy)
def test_batch_normalization_train_broadcast():
    """Broadcasted and non-broadcasted parameters must give identical graphs.

    Builds each batch-norm expression twice — once with compact parameter
    tensors and once with parameters pre-broadcasted via dimshuffle — and
    relies on the optimizer to collapse the sum of absolute differences to
    a constant zero.
    """
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x = vartype('x')
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3
            # remove non-existing axes
            # NOTE(review): rebinds the outer loop variable `axes`; works only
            # because vartype iterates in decreasing rank order.
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue
            # convert axes to explicit list
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            # compute axes for parameter tensors
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ['x'] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i
            # construct non-broadcasted parameter variables
            param_type = T.TensorType(x.dtype, (False,) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (param_type(n)
                                                      for n in ('scale', 'bias',
                                                                'running_mean',
                                                                'running_var'))
            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)
            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # batch_normalization_train with broadcasted variables
            train_bc = \
                bn.batch_normalization_train(
                    x, scale_bc, bias_bc, axes, eps,
                    running_average_factor, running_mean_bc, running_var_bc)
            # collapse the non-output results back to the compact layout so
            # the two variants are directly comparable
            train_bc = tuple([train_bc[0]] +  # out
                             [r.dimshuffle(non_bc_axes) for r in train_bc[1:]])
            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = \
                bn.batch_normalization_test(
                    x, scale, bias, running_mean, running_var, axes, eps)
            # batch_normalization_test with broadcasted variables
            test_bc = \
                bn.batch_normalization_test(
                    x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes, eps)
            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc,)
            results_bc = train_bc + (test_bc,)
            results = [abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)]
            # compile to compute all differences
            f = theano.function([x, scale, bias, running_mean, running_var],
                                T.sum(sum(results)))
            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if theano.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, theano.compile.DeepCopyOp)
            inputs = [numpy.asarray(numpy.random.rand(*((4,) * n)), x.dtype)
                      for n in [x.ndim, scale.ndim, bias.ndim,
                                running_mean.ndim, running_var.ndim]]
            assert 0.0 == f(*inputs)
def test_batch_normalization_test():
    """Check batch_normalization_test against a hand-built reference graph.

    Covers several axis specifications and input ranks, comparing the
    output and the gradients w.r.t. x, scale, bias, mean and var.
    """
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, mean, var = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            # remove non-existing axes
            # NOTE(review): rebinds the outer loop variable `axes`; works only
            # because vartype iterates in decreasing rank order.
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue
            # forward pass
            out = bn.batch_normalization_test(x, scale, bias, mean,
                                              var, axes, eps)
            # reference forward pass
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes2)
                                          for t in (scale, bias, mean, var))
            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, out2] + grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run
            for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                # var must be positive for sqrt(var + eps) to be meaningful
                Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Mean, Var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients
                utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5)  # dx
                utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5)  # dscale
                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
                utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5)  # dvar
def test_batch_normalization_broadcastable():
    """The optimizations must preserve a fully-broadcastable pattern.

    Builds train/inference batch-norm graphs on all-broadcastable 5D
    variables and checks that compilation succeeds with every abstract
    Op replaced.
    """
    names = ('x', 'dy', 'scale', 'bias', 'mean', 'var')
    x, dy, scale, bias, mean, var = [T.scalar(name).dimshuffle(['x'] * 5)
                                     for name in names]
    # forward passes (training and inference)
    out_train, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'spatial')
    out_test = bn.batch_normalization_test(x, scale, bias, mean, var, 'spatial')
    # backward passes
    grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
    grads_test = T.grad(None, wrt=[x, scale, bias], known_grads={out_test: dy})
    # compile everything together; no abstract Op may survive optimization
    outputs = [out_train, x_mean, x_invstd, out_test] + grads_train + grads_test
    f = theano.function([x, scale, bias, mean, var, dy], outputs)
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    assert not any(isinstance(apply_node.op, abstract_ops)
                   for apply_node in f.maker.fgraph.toposort())
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论