Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
43411345
提交
43411345
authored
12月 03, 2016
作者:
Gijs van Tulder
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Batch normalization optimizations for old gpu backend.
上级
ef21cb58
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
846 行增加
和
86 行删除
+846
-86
dnn.py
theano/sandbox/cuda/dnn.py
+507
-30
opt.py
theano/sandbox/cuda/opt.py
+25
-0
test_dnn.py
theano/sandbox/cuda/tests/test_dnn.py
+314
-56
没有找到文件。
theano/sandbox/cuda/dnn.py
浏览文件 @
43411345
...
...
@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
assert_conv_shape
)
from
theano.tensor.signal.pool
import
(
Pool
,
MaxPoolGrad
,
AveragePoolGrad
)
from
theano.tensor.nnet
import
bn
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda
import
GpuOp
,
dnn_available
...
...
@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase):
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
"""
__props__
=
(
'mode'
,
'epsilon'
)
...
...
@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s;
result
=
"""
cudnnStatus_t err
%(name)
s;
cudnnBatchNormMode_t mode
%(name)
s =
%(mode)
s;
double exponentialAverageFactor
%(name)
s =
%(exp_avg_factor)
f;
double epsilon
%(name)
s =
%(epsilon)
e;
"""
%
dict
(
name
=
name
,
mode
=
mode
,
exp_avg_factor
=
0
,
# deliberately unused
epsilon
=
self
.
epsilon
)
return
result
def
c_code_cache_version
(
self
):
return
(
3
,
version
())
return
(
4
,
version
())
class
GpuDnnBatchNormInference
(
GpuDnnBatchNormBase
):
...
...
@@ -2422,8 +2438,21 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
Note: scale, bias, mean and variance must follow the same tensor layout!
"""
__props__
=
(
'mode'
,
'epsilon'
,
'inplace'
)
tensor_descs
=
[
'bn_input'
,
'bn_output'
,
'bn_params'
]
def
__init__
(
self
,
mode
=
'per-activation'
,
epsilon
=
1e-4
,
inplace
=
False
):
super
(
GpuDnnBatchNormInference
,
self
)
.
__init__
(
mode
=
mode
,
epsilon
=
epsilon
)
self
.
inplace
=
inplace
if
self
.
inplace
:
self
.
destroy_map
=
{
0
:
[
0
]}
def
get_op_params
(
self
):
params
=
[]
if
self
.
inplace
:
params
.
append
((
'INPLACE_OUTPUT'
,
'1'
))
return
params
def
infer_shape
(
self
,
node
,
shape
):
# output shape equals shape of x
return
[
shape
[
0
]]
...
...
@@ -2460,10 +2489,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0)
}
// build and prepare the output variable
#ifdef INPLACE_OUTPUT
Py_XDECREF(
%(outp)
s);
%(outp)
s =
%(inp)
s;
Py_INCREF(
%(outp)
s);
#else
if (CudaNdarray_prep_output(&
%(outp)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(inp)
s)) != 0)
{
%(fail)
s
}
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(
%(outp)
s, bn_output_
%(name)
s) != 0)
...
...
@@ -2494,6 +2529,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference(
"""
%
dict
(
name
=
name
,
inp
=
inp
,
scale
=
scale
,
bias
=
bias
,
est_mean
=
est_mean
,
est_var
=
est_var
,
outp
=
outp
,
fail
=
sub
[
'fail'
])
# add params
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
,
check_input
=
False
)
result
=
"""
%(define_macros)
s
{
%(code)
s
}
%(undef_macros)
s
"""
%
dict
(
code
=
result
,
define_macros
=
define_macros
,
undef_macros
=
undef_macros
)
return
result
def
grad
(
self
,
inputs
,
grads
):
...
...
@@ -2537,28 +2582,84 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase):
Note: scale and bias must follow the same tensor layout!
"""
__props__
=
(
'mode'
,
'epsilon'
,
'running_averages'
,
'inplace_running_mean'
,
'inplace_running_var'
,
'inplace_output'
)
tensor_descs
=
[
'bn_input'
,
'bn_output'
,
'bn_params'
]
def
__init__
(
self
,
mode
=
'per-activation'
,
epsilon
=
1e-4
,
running_average_factor
=
0
,
running_averages
=
False
,
inplace_running_mean
=
False
,
inplace_running_var
=
False
,
inplace_output
=
False
):
super
(
GpuDnnBatchNorm
,
self
)
.
__init__
(
mode
=
mode
,
epsilon
=
epsilon
)
self
.
running_average_factor
=
running_average_factor
self
.
running_averages
=
running_averages
self
.
inplace_output
=
inplace_output
self
.
inplace_running_mean
=
inplace_running_mean
self
.
inplace_running_var
=
inplace_running_var
self
.
destroy_map
=
{}
if
self
.
inplace_output
:
self
.
destroy_map
[
0
]
=
[
0
]
if
self
.
running_averages
and
self
.
inplace_running_mean
:
self
.
destroy_map
[
3
]
=
[
3
]
if
self
.
running_averages
and
self
.
inplace_running_var
:
self
.
destroy_map
[
4
]
=
[
4
]
def
get_op_params
(
self
):
params
=
[]
if
self
.
inplace_output
:
params
.
append
((
'INPLACE_OUTPUT'
,
'1'
))
if
self
.
running_averages
:
params
.
append
((
'RUNNING_AVERAGES'
,
'1'
))
if
self
.
inplace_running_mean
:
params
.
append
((
'INPLACE_RUNNING_MEAN'
,
'1'
))
if
self
.
inplace_running_var
:
params
.
append
((
'INPLACE_RUNNING_VAR'
,
'1'
))
return
params
def
infer_shape
(
self
,
node
,
shape
):
# first output equals shape of x
#
second and third output
equal shape of scale
return
[
shape
[
0
]
,
shape
[
1
],
shape
[
1
]]
#
other outputs
equal shape of scale
return
[
shape
[
0
]
]
+
[
shape
[
1
]]
*
(
len
(
node
.
outputs
)
-
1
)
def
make_node
(
self
,
x
,
scale
,
bias
):
def
make_node
(
self
,
x
,
scale
,
bias
,
running_mean
=
None
,
running_var
=
None
):
assert
x
.
ndim
==
scale
.
ndim
==
bias
.
ndim
assert
x
.
ndim
in
(
4
,
5
)
assert
self
.
running_averages
==
(
running_mean
is
not
None
)
==
(
running_var
is
not
None
)
assert
(
running_mean
is
None
or
running_mean
.
ndim
==
x
.
ndim
)
assert
(
running_var
is
None
or
running_var
.
ndim
==
x
.
ndim
)
x
=
as_cuda_ndarray_variable
(
x
)
scale
=
as_cuda_ndarray_variable
(
scale
)
bias
=
as_cuda_ndarray_variable
(
bias
)
assert
x
.
ndim
==
scale
.
ndim
==
bias
.
ndim
assert
x
.
ndim
in
(
4
,
5
)
return
Apply
(
self
,
[
x
,
scale
,
bias
],
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()])
inputs
=
[
x
,
scale
,
bias
]
output_types
=
[
x
.
type
(),
scale
.
type
(),
scale
.
type
()]
if
running_mean
is
not
None
and
running_var
is
not
None
:
inputs
.
append
(
as_cuda_ndarray_variable
(
running_mean
))
inputs
.
append
(
as_cuda_ndarray_variable
(
running_var
))
output_types
.
append
(
scale
.
type
())
output_types
.
append
(
scale
.
type
())
return
Apply
(
self
,
inputs
,
output_types
)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
# super call to prepare common configuration
result
=
super
(
GpuDnnBatchNorm
,
self
)
.
c_code
(
node
,
name
,
inputs
,
outputs
,
sub
)
# give sensible names to inputs and outputs
inp
,
scale
,
bias
=
inputs
outp
,
x_mean
,
x_invstd
=
outputs
inp
,
scale
,
bias
=
inputs
[:
3
]
outp
,
x_mean
,
x_invstd
=
outputs
[:
3
]
if
self
.
running_averages
:
running_average_factor
=
self
.
running_average_factor
in_running_mean
=
inputs
[
3
]
in_running_var
=
inputs
[
4
]
out_running_mean
=
outputs
[
3
]
out_running_var
=
outputs
[
4
]
else
:
running_average_factor
=
0.
in_running_mean
=
'NULL'
in_running_var
=
'NULL'
out_running_mean
=
'NULL'
out_running_var
=
'NULL'
# set input tensor descriptors from input tensors
result
+=
"""
...
...
@@ -2579,6 +2680,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp
{
%(fail)
s
}
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(
%(out_running_mean)
s);
CudaNdarray *running_mean
%(name)
s =
%(in_running_mean)
s;
Py_INCREF(running_mean
%(name)
s);
#else
if ((CudaNdarray_prep_output(&
%(out_running_mean)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(scale)
s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(
%(out_running_mean)
s,
%(in_running_mean)
s) != 0))
{
%(fail)
s
}
CudaNdarray *running_mean
%(name)
s =
%(out_running_mean)
s;
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(
%(out_running_var)
s);
CudaNdarray *running_var
%(name)
s =
%(in_running_var)
s;
Py_INCREF(running_var
%(name)
s);
#else
if ((CudaNdarray_prep_output(&
%(out_running_var)
s,
%(inp)
s->nd, CudaNdarray_HOST_DIMS(
%(scale)
s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(
%(out_running_var)
s,
%(in_running_var)
s) != 0))
{
%(fail)
s
}
CudaNdarray *running_var
%(name)
s =
%(out_running_var)
s;
#endif
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(
%(outp)
s, bn_output_
%(name)
s) != 0)
...
...
@@ -2601,25 +2728,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining(
bn_params_
%(name)
s,
CudaNdarray_DEV_DATA(
%(scale)
s),
CudaNdarray_DEV_DATA(
%(bias)
s),
exponentialAverageFactor
%(name)
s,
NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused
#ifdef RUNNING_AVERAGES
%(running_average_factor)
f,
CudaNdarray_DEV_DATA(running_mean
%(name)
s),
CudaNdarray_DEV_DATA(running_var
%(name)
s),
#else
0,
NULL,
NULL,
#endif
epsilon
%(name)
s,
CudaNdarray_DEV_DATA(
%(x_mean)
s),
CudaNdarray_DEV_DATA(
%(x_invstd)
s)
);
}
#ifdef RUNNING_AVERAGES
%(out_running_mean)
s = running_mean
%(name)
s;
%(out_running_var)
s = running_var
%(name)
s;
#endif
"""
%
dict
(
name
=
name
,
inp
=
inp
,
scale
=
scale
,
bias
=
bias
,
outp
=
outp
,
x_mean
=
x_mean
,
x_invstd
=
x_invstd
,
fail
=
sub
[
'fail'
])
x_mean
=
x_mean
,
x_invstd
=
x_invstd
,
running_average_factor
=
running_average_factor
,
in_running_mean
=
in_running_mean
,
in_running_var
=
in_running_var
,
out_running_mean
=
out_running_mean
,
out_running_var
=
out_running_var
,
fail
=
sub
[
'fail'
])
# add params
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
,
check_input
=
False
)
result
=
"""
%(define_macros)
s
{
%(code)
s
}
%(undef_macros)
s
"""
%
dict
(
code
=
result
,
define_macros
=
define_macros
,
undef_macros
=
undef_macros
)
return
result
def
grad
(
self
,
inputs
,
grads
):
x
,
scale
,
bias
=
inputs
x
,
scale
,
bias
=
inputs
[:
3
]
dy
=
grads
[
0
]
_
,
x_mean
,
x_invstd
=
self
(
x
,
scale
,
bias
)
return
GpuDnnBatchNormGrad
(
self
.
mode
,
self
.
epsilon
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
)
_
,
x_mean
,
x_invstd
=
self
(
*
inputs
)[:
3
]
disconnected_outputs
=
[]
# Optional running_mean and running_var.
for
i
in
range
(
3
,
len
(
inputs
)):
disconnected_outputs
.
append
(
DisconnectedType
()())
return
GpuDnnBatchNormGrad
(
self
.
mode
,
self
.
epsilon
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
)
+
disconnected_outputs
def
connection_pattern
(
self
,
node
):
patterns
=
[[
True
,
True
,
True
],
# x
[
True
,
True
,
True
],
# scale
[
True
,
True
,
True
]]
# bias
# Optional running_mean and running_var are only
# connected to their new values.
for
i
in
range
(
3
,
len
(
node
.
inputs
)):
patterns
[
0
]
.
append
(
True
)
for
pattern
in
patterns
[
1
:]:
pattern
.
append
(
False
)
patterns
.
append
([
False
]
*
(
i
)
+
[
True
])
return
patterns
class
GpuDnnBatchNormGrad
(
GpuDnnBatchNormBase
):
...
...
@@ -2722,7 +2890,8 @@ err%(name)s = cudnnBatchNormalizationBackward(
def
dnn_batch_normalization_train
(
inputs
,
gamma
,
beta
,
mode
=
'per-activation'
,
epsilon
=
1e-4
):
epsilon
=
1e-4
,
running_average_factor
=
0.1
,
running_mean
=
None
,
running_var
=
None
):
"""
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
...
...
@@ -2742,6 +2911,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns
-------
...
...
@@ -2751,6 +2937,12 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Means of `inputs` across the normalization axes.
invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes
-----
...
...
@@ -2762,31 +2954,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
invstd = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) +
\\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) +
\\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4).
"""
ndim
=
inputs
.
ndim
if
ndim
>
5
:
raise
ValueError
(
"dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got
%
d"
%
ndim
)
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
"as inputs; got
%
d and
%
d instead of
%
d"
%
(
gamma
.
ndim
,
beta
.
ndim
,
ndim
))
if
(
running_mean
is
None
)
!=
(
running_var
is
None
):
raise
ValueError
(
"running_mean and running_var must either both be "
"given or both be None"
)
if
running_mean
is
not
None
and
running_mean
.
ndim
!=
ndim
:
raise
ValueError
(
"running_mean must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_mean
.
ndim
,
ndim
))
if
running_var
is
not
None
and
running_var
.
ndim
!=
ndim
:
raise
ValueError
(
"running_var must be of the same dimensionality "
"as inputs; got
%
d instead of
%
d"
%
(
running_var
.
ndim
,
ndim
))
if
epsilon
<
1e-5
:
raise
ValueError
(
"epsilon must be at least 1e-5, got
%
f"
%
epsilon
)
running_averages
=
(
running_var
is
not
None
and
running_var
is
not
None
)
if
ndim
<
4
:
inputs
=
theano
.
tensor
.
shape_padright
(
inputs
,
4
-
ndim
)
gamma
=
theano
.
tensor
.
shape_padright
(
gamma
,
4
-
ndim
)
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
batchnorm_op
=
GpuDnnBatchNorm
(
mode
=
mode
,
epsilon
=
epsilon
)
result
=
tuple
(
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
)))
if
running_averages
:
running_mean
=
theano
.
tensor
.
shape_padright
(
running_mean
,
4
-
ndim
)
running_var
=
theano
.
tensor
.
shape_padright
(
running_var
,
4
-
ndim
)
elif
ndim
>
5
:
inputs_shape
=
inputs
.
shape
params_shape
=
gamma
.
shape
inputs
=
theano
.
tensor
.
flatten
(
inputs
,
5
)
gamma
=
theano
.
tensor
.
flatten
(
gamma
,
5
)
beta
=
theano
.
tensor
.
flatten
(
beta
,
5
)
if
running_averages
:
running_mean
=
theano
.
tensor
.
flatten
(
running_mean
,
5
)
running_var
=
theano
.
tensor
.
flatten
(
running_var
,
5
)
batchnorm_op
=
GpuDnnBatchNorm
(
mode
=
mode
,
epsilon
=
epsilon
,
running_average_factor
=
running_average_factor
,
running_averages
=
running_averages
)
if
running_averages
:
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
running_mean
=
gpu_contiguous
(
running_mean
),
running_var
=
gpu_contiguous
(
running_var
))
if
new_running_mean
.
broadcastable
!=
running_mean
.
broadcastable
:
new_running_mean
=
tensor
.
patternbroadcast
(
new_running_mean
,
running_mean
.
broadcastable
)
if
new_running_var
.
broadcastable
!=
running_var
.
broadcastable
:
new_running_var
=
tensor
.
patternbroadcast
(
new_running_var
,
running_var
.
broadcastable
)
result
=
(
out
,
mean
,
invstd
,
new_running_mean
,
new_running_var
)
else
:
result
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
))
if
ndim
<
4
:
result
=
tuple
(
theano
.
tensor
.
flatten
(
r
,
ndim
)
for
r
in
result
)
elif
ndim
>
5
:
result
=
(
theano
.
tensor
.
reshape
(
result
[
0
],
inputs_shape
),)
+
tuple
(
theano
.
tensor
.
reshape
(
r
,
params_shape
)
for
r
in
result
[
1
:])
return
result
...
...
@@ -2839,9 +3078,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
ndim
=
inputs
.
ndim
if
ndim
>
5
:
raise
ValueError
(
"dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got
%
d"
%
ndim
)
if
gamma
.
ndim
!=
ndim
or
beta
.
ndim
!=
ndim
:
raise
ValueError
(
"gamma and beta must be of the same dimensionality "
"as inputs; got
%
d and
%
d instead of
%
d"
%
...
...
@@ -2859,12 +3095,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta
=
theano
.
tensor
.
shape_padright
(
beta
,
4
-
ndim
)
mean
=
theano
.
tensor
.
shape_padright
(
mean
,
4
-
ndim
)
var
=
theano
.
tensor
.
shape_padright
(
var
,
4
-
ndim
)
elif
ndim
>
5
:
inputs_shape
=
inputs
.
shape
inputs
=
theano
.
tensor
.
flatten
(
inputs
,
5
)
gamma
=
theano
.
tensor
.
flatten
(
gamma
,
5
)
beta
=
theano
.
tensor
.
flatten
(
beta
,
5
)
mean
=
theano
.
tensor
.
flatten
(
mean
,
5
)
var
=
theano
.
tensor
.
flatten
(
var
,
5
)
batchnorm_op
=
GpuDnnBatchNormInference
(
mode
=
mode
,
epsilon
=
epsilon
)
result
=
batchnorm_op
(
gpu_contiguous
(
inputs
),
gpu_contiguous
(
gamma
),
gpu_contiguous
(
beta
),
gpu_contiguous
(
mean
),
gpu_contiguous
(
var
))
if
ndim
<
4
:
result
=
theano
.
tensor
.
flatten
(
result
,
ndim
)
elif
ndim
>
5
:
result
=
theano
.
tensor
.
reshape
(
result
,
inputs_shape
)
return
result
...
...
@@ -3334,3 +3579,235 @@ def local_abstractconv3d_cudnn(node):
subsample
=
node
.
op
.
subsample
,
conv_mode
=
conv_mode
)
return
[
rval
]
@local_optimizer
([
bn
.
AbstractBatchNormTrain
])
def
local_abstract_batch_norm_train_cudnn
(
node
):
if
not
isinstance
(
node
.
op
,
bn
.
AbstractBatchNormTrain
):
return
None
x
,
scale
,
bias
,
epsilon
,
running_average_factor
=
node
.
inputs
[:
5
]
running_mean
=
node
.
inputs
[
5
]
if
len
(
node
.
inputs
)
>
5
else
None
running_var
=
node
.
inputs
[
6
]
if
len
(
node
.
inputs
)
>
6
else
None
# input on gpu? TODO what about the output?
x_on_gpu
=
(
isinstance
(
x
.
type
,
CudaNdarrayType
)
or
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)))
if
not
x_on_gpu
:
return
None
# convert axes to cuDNN mode
axes
=
tuple
(
node
.
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
try
:
eps
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
try
:
running_average_factor
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
running_average_factor
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
not
dnn_available
():
return
None
x
=
as_cuda_ndarray_variable
(
x
)
scale
=
as_cuda_ndarray_variable
(
scale
)
bias
=
as_cuda_ndarray_variable
(
bias
)
inputs
=
[
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
]
if
running_mean
is
not
None
and
running_var
is
not
None
:
inputs
.
append
(
running_mean
)
inputs
.
append
(
running_var
)
results
=
list
(
dnn_batch_normalization_train
(
*
inputs
))
# If the original output was on CPU, we have to transfer it
for
i
in
range
(
len
(
node
.
outputs
)):
if
isinstance
(
node
.
outputs
[
i
]
.
type
,
tensor
.
TensorType
):
results
[
i
]
=
tensor
.
as_tensor_variable
(
results
[
i
])
# TODO copy_stack_trace?
return
results
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_gpu_batch_norm_inplace_output
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
not
node
.
op
.
inplace_output
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
running_average_factor
=
node
.
op
.
running_average_factor
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
node
.
op
.
inplace_running_mean
,
inplace_running_var
=
node
.
op
.
inplace_running_var
,
inplace_output
=
True
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_gpu_batch_norm_inplace_running_mean
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
node
.
op
.
running_averages
and
not
node
.
op
.
inplace_running_mean
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
running_average_factor
=
node
.
op
.
running_average_factor
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
True
,
inplace_running_var
=
node
.
op
.
inplace_running_var
,
inplace_output
=
node
.
op
.
inplace_output
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNorm
],
inplace
=
True
)
def
local_gpu_batch_norm_inplace_running_var
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNorm
)
and
node
.
op
.
running_averages
and
not
node
.
op
.
inplace_running_var
:
return
GpuDnnBatchNorm
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
running_average_factor
=
node
.
op
.
running_average_factor
,
running_averages
=
node
.
op
.
running_averages
,
inplace_running_mean
=
node
.
op
.
inplace_running_mean
,
inplace_running_var
=
True
,
inplace_output
=
node
.
op
.
inplace_output
)(
*
node
.
inputs
)
@register_inplace
()
@local_optimizer
([
GpuDnnBatchNormInference
],
inplace
=
True
)
def
local_gpu_batch_norm_inference_inplace
(
node
):
if
isinstance
(
node
.
op
,
GpuDnnBatchNormInference
)
and
not
node
.
op
.
inplace
:
return
[
GpuDnnBatchNormInference
(
mode
=
node
.
op
.
mode
,
epsilon
=
node
.
op
.
epsilon
,
inplace
=
True
)(
*
node
.
inputs
)]
@local_optimizer
([
bn
.
AbstractBatchNormTrainGrad
])
def
local_abstract_batch_norm_train_grad_cudnn
(
node
):
if
not
isinstance
(
node
.
op
,
bn
.
AbstractBatchNormTrainGrad
):
return
None
x
,
dy
,
scale
,
x_mean
,
x_invstd
,
epsilon
=
node
.
inputs
# input on gpu? TODO what about the output?
x_on_gpu
=
(
isinstance
(
x
.
type
,
CudaNdarrayType
)
or
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)))
dy_on_gpu
=
(
isinstance
(
dy
.
type
,
CudaNdarrayType
)
or
(
dy
.
owner
and
isinstance
(
dy
.
owner
.
op
,
HostFromGpu
)))
if
not
(
x_on_gpu
or
dy_on_gpu
):
return
None
# convert axes to cuDNN mode
axes
=
tuple
(
node
.
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
ndim
=
x
.
ndim
if
ndim
<
4
:
x
=
theano
.
tensor
.
shape_padright
(
x
,
4
-
ndim
)
dy
=
theano
.
tensor
.
shape_padright
(
dy
,
4
-
ndim
)
scale
=
theano
.
tensor
.
shape_padright
(
scale
,
4
-
ndim
)
x_mean
=
theano
.
tensor
.
shape_padright
(
x_mean
,
4
-
ndim
)
x_invstd
=
theano
.
tensor
.
shape_padright
(
x_invstd
,
4
-
ndim
)
elif
ndim
>
5
:
x_shape
=
x
.
shape
params_shape
=
scale
.
shape
x
=
theano
.
tensor
.
flatten
(
x
,
5
)
dy
=
theano
.
tensor
.
flatten
(
dy
,
5
)
scale
=
theano
.
tensor
.
flatten
(
scale
,
5
)
x_mean
=
theano
.
tensor
.
flatten
(
x_mean
,
5
)
x_invstd
=
theano
.
tensor
.
flatten
(
x_invstd
,
5
)
try
:
eps
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
if
not
dnn_available
():
return
None
x
=
as_cuda_ndarray_variable
(
x
)
dy
=
as_cuda_ndarray_variable
(
dy
)
scale
=
as_cuda_ndarray_variable
(
scale
)
x_mean
=
as_cuda_ndarray_variable
(
x_mean
)
x_invstd
=
as_cuda_ndarray_variable
(
x_invstd
)
g_wrt_inputs
,
g_wrt_scale
,
g_wrt_bias
=
\
GpuDnnBatchNormGrad
(
mode
,
epsilon
=
eps
)(
x
,
dy
,
scale
,
x_mean
,
x_invstd
)
if
ndim
<
4
:
g_wrt_inputs
=
theano
.
tensor
.
flatten
(
g_wrt_inputs
,
ndim
)
g_wrt_scale
=
theano
.
tensor
.
flatten
(
g_wrt_scale
,
ndim
)
g_wrt_bias
=
theano
.
tensor
.
flatten
(
g_wrt_bias
,
ndim
)
elif
ndim
>
5
:
g_wrt_inputs
=
theano
.
tensor
.
reshape
(
g_wrt_inputs
,
x_shape
)
g_wrt_scale
=
theano
.
tensor
.
reshape
(
g_wrt_scale
,
params_shape
)
g_wrt_bias
=
theano
.
tensor
.
reshape
(
g_wrt_bias
,
params_shape
)
# If the original output was on CPU, we have to transfer it
if
isinstance
(
node
.
outputs
[
0
]
.
type
,
tensor
.
TensorType
):
g_wrt_inputs
=
tensor
.
as_tensor_variable
(
g_wrt_inputs
)
if
isinstance
(
node
.
outputs
[
1
]
.
type
,
tensor
.
TensorType
):
g_wrt_scale
=
tensor
.
as_tensor_variable
(
g_wrt_scale
)
if
isinstance
(
node
.
outputs
[
2
]
.
type
,
tensor
.
TensorType
):
g_wrt_bias
=
tensor
.
as_tensor_variable
(
g_wrt_bias
)
# TODO copy_stack_trace?
return
[
g_wrt_inputs
,
g_wrt_scale
,
g_wrt_bias
]
@local_optimizer
([
bn
.
AbstractBatchNormInference
])
def
local_abstract_batch_norm_inference_cudnn
(
node
):
if
not
isinstance
(
node
.
op
,
bn
.
AbstractBatchNormInference
):
return
None
x
,
scale
,
bias
,
estimated_mean
,
estimated_variance
,
epsilon
=
node
.
inputs
axes
=
tuple
(
node
.
op
.
axes
)
if
axes
==
(
0
,):
mode
=
'per-activation'
elif
axes
==
(
0
,)
+
tuple
(
range
(
2
,
x
.
ndim
)):
mode
=
'spatial'
else
:
return
None
# input on gpu? TODO what about the output?
x_on_gpu
=
(
isinstance
(
x
.
type
,
CudaNdarrayType
)
or
(
x
.
owner
and
isinstance
(
x
.
owner
.
op
,
HostFromGpu
)))
if
not
x_on_gpu
:
return
None
try
:
eps
=
float
(
theano
.
tensor
.
get_scalar_constant_value
(
epsilon
))
except
theano
.
tensor
.
NotScalarConstantError
:
return
None
if
eps
<
1e-5
:
return
None
if
not
dnn_available
():
return
None
x
=
as_cuda_ndarray_variable
(
x
)
scale
=
as_cuda_ndarray_variable
(
scale
)
bias
=
as_cuda_ndarray_variable
(
bias
)
estimated_mean
=
as_cuda_ndarray_variable
(
estimated_mean
)
estimated_variance
=
as_cuda_ndarray_variable
(
estimated_variance
)
out
=
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
estimated_mean
,
estimated_variance
,
mode
,
eps
)
# If the original output was on CPU, we have to transfer it
# TODO copy_stack_trace?
if
isinstance
(
node
.
outputs
[
0
]
.
type
,
tensor
.
TensorType
):
return
[
tensor
.
as_tensor_variable
(
out
)]
else
:
return
[
out
]
theano/sandbox/cuda/opt.py
浏览文件 @
43411345
...
...
@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm
,
30
,
'conv_gemm'
,
'gpu'
,
'fast_compile'
,
'fast_run'
)
# Register cuDNN batch normalization implementation
abstract_batch_norm_groupopt
=
theano
.
gof
.
optdb
.
LocalGroupDB
()
abstract_batch_norm_groupopt
.
__name__
=
"gpu_batchnorm_opts"
register_opt
(
'fast_compile'
)(
abstract_batch_norm_groupopt
)
# cuDNN optimizations are only registered if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from
.dnn
import
(
local_abstract_batch_norm_train_cudnn
,
local_abstract_batch_norm_train_grad_cudnn
,
local_abstract_batch_norm_inference_cudnn
)
# noqa: 402
abstract_batch_norm_groupopt
.
register
(
'local_abstract_batch_norm_train_dnn'
,
local_abstract_batch_norm_train_cudnn
,
20
,
'batchnorm_dnn'
,
'gpu'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
abstract_batch_norm_groupopt
.
register
(
'local_abstract_batch_norm_train_grad_dnn'
,
local_abstract_batch_norm_train_grad_cudnn
,
20
,
'batchnorm_dnn'
,
'gpu'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
abstract_batch_norm_groupopt
.
register
(
'local_abstract_batch_norm_inference_dnn'
,
local_abstract_batch_norm_inference_cudnn
,
20
,
'batchnorm_dnn'
,
'gpu'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
theano/sandbox/cuda/tests/test_dnn.py
浏览文件 @
43411345
from
__future__
import
absolute_import
,
print_function
,
division
from
collections
import
OrderedDict
import
logging
import
os
import
sys
...
...
@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt
from
theano.tensor.signal.pool
import
pool_2d
,
pool_3d
from
theano.tensor.signal.pool
import
Pool
,
MaxPoolGrad
,
AveragePoolGrad
from
theano.tensor.nnet.abstract_conv
import
get_conv_output_shape
from
theano.tensor.nnet
import
bn
import
theano.sandbox.cuda.dnn
as
dnn
from
theano.sandbox.cuda.basic_ops
import
GpuAllocEmpty
,
gpu_alloc_empty
from
theano.sandbox.cuda
import
float32_shared_constructor
as
shared
...
...
@@ -730,52 +732,201 @@ def test_batchnorm_train():
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
tensor6
=
T
.
TensorType
(
theano
.
config
.
floatX
,
(
False
,)
*
6
)
for
mode
in
(
'per-activation'
,
'spatial'
):
for
vartype
in
(
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
x
,
scale
,
bias
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
))
for
vartype
in
(
tensor6
,
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
x
,
scale
,
bias
,
running_mean
,
running_var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'running_mean'
,
'running_var'
))
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
# forward pass
out
,
x_mean
,
x_invstd
=
cuda
.
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
)
running_average_factor
=
0.3
# forward pass, direct interface
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
,
\
out_running_mean_gpu
,
out_running_var_gpu
=
\
dnn
.
dnn_batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# forward pass, abstract interface
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
\
out_running_mean_abstract
,
out_running_var_abstract
=
\
bn
.
batch_normalization_train
(
x
,
scale
,
bias
,
mode
,
eps
,
running_average_factor
,
running_mean
,
running_var
)
# reference forward pass
if
mode
==
'per-activation'
:
axes
=
(
0
,)
elif
mode
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
x_mean2
=
x
.
mean
(
axis
=
axes
,
keepdims
=
True
)
x_invstd2
=
T
.
inv
(
T
.
sqrt
(
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
+
eps
))
scale2
=
T
.
addbroadcast
(
scale
,
*
axes
)
bias2
=
T
.
addbroadcast
(
bias
,
*
axes
)
out2
=
(
x
-
x_mean2
)
*
(
scale2
*
x_invstd2
)
+
bias2
x_mean_ref
=
x
.
mean
(
axis
=
axes
,
keepdims
=
True
)
x_var_ref
=
x
.
var
(
axis
=
axes
,
keepdims
=
True
)
x_invstd_ref
=
T
.
inv
(
T
.
sqrt
(
x_var_ref
+
eps
))
scale_ref
=
T
.
addbroadcast
(
scale
,
*
axes
)
bias_ref
=
T
.
addbroadcast
(
bias
,
*
axes
)
m
=
T
.
cast
(
T
.
prod
(
x
.
shape
)
/
T
.
prod
(
scale
.
shape
),
theano
.
config
.
floatX
)
out_ref
=
(
x
-
x_mean_ref
)
*
(
scale_ref
*
x_invstd_ref
)
+
bias_ref
out_running_mean_ref
=
running_mean
*
(
1
-
running_average_factor
)
+
\
x_mean_ref
*
running_average_factor
out_running_var_ref
=
running_var
*
(
1
-
running_average_factor
)
+
\
(
m
/
(
m
-
1
))
*
x_var_ref
*
running_average_factor
# backward pass
dy
=
vartype
(
'dy'
)
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out
:
dy
})
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_abstract
:
dy
})
# reference backward pass
grads
2
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out2
:
dy
})
grads
_ref
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
],
known_grads
=
{
out_ref
:
dy
})
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
dy
],
[
out
,
x_mean
,
x_invstd
,
out2
,
x_mean2
,
x_invstd2
]
+
grads
+
grads2
,
mode
=
mode_with_gpu
)
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_gpu
,
x_mean_gpu
,
x_invstd_gpu
,
out_running_mean_gpu
,
out_running_var_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_abstract
,
x_mean_abstract
,
x_invstd_abstract
,
out_running_mean_abstract
,
out_running_var_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
running_mean
,
running_var
,
dy
],
[
out_ref
,
x_mean_ref
,
x_invstd_ref
,
out_running_mean_ref
,
out_running_var_ref
]
+
grads_ref
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNorm
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormGrad
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
),
(
4
,
3
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
)):
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
,
5
),
(
4
,
3
,
1
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
,
5
)):
data_shape
=
data_shape
[:
ndim
]
param_shape
=
tuple
(
1
if
d
in
axes
else
s
for
d
,
s
in
enumerate
(
data_shape
))
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Dy
)
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Running_mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Running_var
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs_gpu
=
f_gpu
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Running_mean
,
Running_var
,
Dy
)
# compare outputs
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
0
+
3
])
# out
utt
.
assert_allclose
(
outputs
[
1
],
outputs
[
1
+
3
])
# mean
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
3
])
# invstd
utt
.
assert_allclose
(
outputs_gpu
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_gpu
[
1
],
outputs_ref
[
1
])
# mean
utt
.
assert_allclose
(
outputs_gpu
[
2
],
outputs_ref
[
2
])
# invstd
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
3
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs_gpu
[
4
]),
numpy
.
nan_to_num
(
outputs_ref
[
4
]))
# running_var
utt
.
assert_allclose
(
outputs_abstract
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_abstract
[
1
],
outputs_ref
[
1
])
# mean
utt
.
assert_allclose
(
outputs_abstract
[
2
],
outputs_ref
[
2
])
# invstd
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
])
# running_mean
utt
.
assert_allclose
(
numpy
.
nan_to_num
(
outputs_abstract
[
4
]),
numpy
.
nan_to_num
(
outputs_ref
[
4
]))
# running_var
# compare gradients
utt
.
assert_allclose
(
outputs
[
6
],
outputs
[
6
+
3
],
atol
=
1e-4
)
# dx
utt
.
assert_allclose
(
outputs
[
7
],
outputs
[
7
+
3
],
rtol
=
2e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs
[
8
],
outputs
[
8
+
3
])
# dbias
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
6
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
7
],
outputs_ref
[
7
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
],
atol
=
2e-4
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
6
],
outputs_ref
[
6
],
rtol
=
4e-4
,
atol
=
1e-4
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
7
],
outputs_ref
[
7
])
# dbias
def test_dnn_batchnorm_train_without_running_averages():
    """Compile and run batch_normalization_train without running averages.

    Builds the forward and backward graphs twice -- once through the direct
    ``dnn`` interface and once through the abstract ``bn`` interface -- and
    checks that the abstract Ops are replaced by the cuDNN Ops.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), \
        T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass
    out_gpu, x_mean_gpu, x_invstd_gpu = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
    # BUGFIX: the abstract gradients must be taken through the abstract
    # output (out_abstract), not out_gpu; otherwise f_abstract's gradient
    # graph would silently come from the direct-interface graph instead.
    grads_abstract = T.grad(None, wrt=[x, scale, bias],
                            known_grads={out_abstract: dy})
    # compile
    f_gpu = theano.function([x, scale, bias, dy],
                            [out_gpu, x_mean_gpu, x_invstd_gpu] + grads_gpu,
                            mode=mode_with_gpu)
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract,
                                  x_invstd_abstract] + grads_abstract,
                                 mode=mode_with_gpu)
    # check if the abstract Ops have been replaced
    assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                for n in f_abstract.maker.fgraph.toposort()])
    assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                for n in f_abstract.maker.fgraph.toposort()])
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f_abstract.maker.fgraph.toposort()])
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_gpu(X, Scale, Bias, Dy)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
    """Test that inplace_running_mean, inplace_running_var and inplace_output
    are enabled on GpuDnnBatchNorm when the running averages are shared
    variables updated through the function's ``updates``."""
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # shared running averages; broadcastable over the batch axis only,
    # matching the 'per-activation' parameter shape
    running_mean = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=(True, False, False, False))
    running_var = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=(True, False, False, False))

    # forward pass (epsilon=5e-3 and factor=0.3 are arbitrary non-defaults)
    out, x_mean, x_invstd, new_running_mean, new_running_var = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation',
                                          epsilon=5e-3,
                                          running_average_factor=0.3,
                                          running_mean=running_mean,
                                          running_var=running_var)
    # update running averages
    updates = OrderedDict()
    updates[running_mean] = new_running_mean
    updates[running_var] = new_running_var
    # compile
    f = theano.function([x, scale, bias], [out, x_mean, x_invstd],
                        updates=updates, mode=mode_with_gpu)
    # check for the inplace settings
    nodes = [n for n in f.maker.fgraph.toposort()
             if isinstance(n.op, dnn.GpuDnnBatchNorm)]
    assert len(nodes) == 1
    assert nodes[0].op.inplace_running_mean
    assert nodes[0].op.inplace_running_var
    assert nodes[0].op.inplace_output
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias)
def
test_batchnorm_inference
():
...
...
@@ -785,53 +936,160 @@ def test_batchnorm_inference():
raise
SkipTest
(
"batch normalization requires cudnn v5+"
)
utt
.
seed_rng
()
tensor6
=
T
.
TensorType
(
theano
.
config
.
floatX
,
(
False
,)
*
6
)
for
mode
in
(
'per-activation'
,
'spatial'
):
for
vartype
in
(
T
.
ftensor5
,
T
.
ftensor4
,
T
.
ftensor3
,
T
.
fmatrix
,
T
.
fvector
):
x
,
scale
,
bias
,
mean
,
var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'mean'
,
'var'
))
for
vartype
in
(
tensor6
,
T
.
tensor5
,
T
.
tensor4
,
T
.
tensor3
,
T
.
matrix
,
T
.
vector
):
x
,
scale
,
bias
,
mean
,
var
=
(
vartype
(
n
)
for
n
in
(
'x'
,
'scale'
,
'bias'
,
'mean'
,
'var'
))
ndim
=
x
.
ndim
eps
=
5e-3
# some non-standard value to test if it's used
# forward pass
out
=
cuda
.
dnn
.
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# forward pass, direct interface
out_gpu
=
dnn
.
dnn_batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# forward pass, abstract interface
out_abstract
=
bn
.
batch_normalization_test
(
x
,
scale
,
bias
,
mean
,
var
,
mode
,
eps
)
# reference forward pass
if
mode
==
'per-activation'
:
axes
=
(
0
,)
elif
mode
==
'spatial'
:
axes
=
(
0
,)
+
tuple
(
range
(
2
,
ndim
))
scale
2
,
bias2
,
mean2
,
var2
=
(
T
.
addbroadcast
(
t
,
*
axes
)
for
t
in
(
scale
,
bias
,
mean
,
var
))
out
2
=
(
x
-
mean2
)
*
(
scale2
/
T
.
sqrt
(
var2
+
eps
))
+
bias2
scale
_ref
,
bias_ref
,
mean_ref
,
var_ref
=
(
T
.
addbroadcast
(
t
,
*
axes
)
for
t
in
(
scale
,
bias
,
mean
,
var
))
out
_ref
=
(
x
-
mean_ref
)
*
(
scale_ref
/
T
.
sqrt
(
var_ref
+
eps
))
+
bias_ref
# backward pass
dy
=
vartype
(
'dy'
)
grads
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out
:
dy
})
grads_gpu
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_gpu
:
dy
})
grads_abstract
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_abstract
:
dy
})
# reference backward pass
grads
2
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out2
:
dy
})
grads
_ref
=
T
.
grad
(
None
,
wrt
=
[
x
,
scale
,
bias
,
mean
,
var
],
known_grads
=
{
out_ref
:
dy
})
# compile
f
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out
,
out2
]
+
grads
+
grads2
,
mode
=
mode_with_gpu
)
f_gpu
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_gpu
]
+
grads_gpu
,
mode
=
mode_with_gpu
)
f_abstract
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_abstract
]
+
grads_abstract
,
mode
=
mode_with_gpu
)
f_ref
=
theano
.
function
([
x
,
scale
,
bias
,
mean
,
var
,
dy
],
[
out_ref
]
+
grads_ref
)
# check if the abstract Ops have been replaced
assert
any
([
isinstance
(
n
.
op
,
dnn
.
GpuDnnBatchNormInference
)
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
assert
not
any
([
isinstance
(
n
.
op
,
(
bn
.
AbstractBatchNormTrain
,
bn
.
AbstractBatchNormInference
,
bn
.
AbstractBatchNormTrainGrad
))
for
n
in
f_abstract
.
maker
.
fgraph
.
toposort
()])
# run
for
data_shape
in
((
5
,
10
,
30
,
40
,
10
),
(
4
,
3
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
)):
for
data_shape
in
((
10
,
20
,
30
,
40
,
10
,
5
),
(
4
,
3
,
1
,
1
,
1
,
1
),
(
1
,
1
,
5
,
5
,
5
,
5
)):
data_shape
=
data_shape
[:
ndim
]
param_shape
=
tuple
(
1
if
d
in
axes
else
s
for
d
,
s
in
enumerate
(
data_shape
))
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
'float32'
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
'float32'
)
Var
=
numpy
.
random
.
rand
(
*
param_shape
)
.
astype
(
'float32'
)
outputs
=
f
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
X
=
4
+
3
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Dy
=
-
1
+
2
*
numpy
.
random
.
randn
(
*
data_shape
)
.
astype
(
theano
.
config
.
floatX
)
Scale
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Bias
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Mean
=
numpy
.
random
.
randn
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
Var
=
numpy
.
random
.
rand
(
*
param_shape
)
.
astype
(
theano
.
config
.
floatX
)
outputs_gpu
=
f_gpu
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_abstract
=
f_abstract
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
outputs_ref
=
f_ref
(
X
,
Scale
,
Bias
,
Mean
,
Var
,
Dy
)
# compare outputs
utt
.
assert_allclose
(
outputs
[
0
],
outputs
[
1
])
# out
utt
.
assert_allclose
(
outputs_gpu
[
0
],
outputs_ref
[
0
])
# out
utt
.
assert_allclose
(
outputs_abstract
[
0
],
outputs_ref
[
0
])
# out
# compare gradients
utt
.
assert_allclose
(
outputs
[
2
],
outputs
[
2
+
5
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs
[
3
],
outputs
[
3
+
5
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs
[
4
],
outputs
[
4
+
5
])
# dbias
utt
.
assert_allclose
(
outputs
[
5
],
outputs
[
5
+
5
])
# dmean
utt
.
assert_allclose
(
outputs
[
6
],
outputs
[
6
+
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
utt
.
assert_allclose
(
outputs_gpu
[
1
],
outputs_ref
[
1
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs_gpu
[
2
],
outputs_ref
[
2
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs_gpu
[
3
],
outputs_ref
[
3
])
# dbias
utt
.
assert_allclose
(
outputs_gpu
[
4
],
outputs_ref
[
4
])
# dmean
utt
.
assert_allclose
(
outputs_gpu
[
5
],
outputs_ref
[
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
utt
.
assert_allclose
(
outputs_abstract
[
1
],
outputs_ref
[
1
],
atol
=
4e-5
)
# dx
utt
.
assert_allclose
(
outputs_abstract
[
2
],
outputs_ref
[
2
],
atol
=
4e-5
)
# dscale
utt
.
assert_allclose
(
outputs_abstract
[
3
],
outputs_ref
[
3
])
# dbias
utt
.
assert_allclose
(
outputs_abstract
[
4
],
outputs_ref
[
4
])
# dmean
utt
.
assert_allclose
(
outputs_abstract
[
5
],
outputs_ref
[
5
],
rtol
=
2e-3
,
atol
=
4e-5
)
# dvar
def test_batchnorm_inference_inplace():
    """Test that GpuDnnBatchNormInference runs with its inplace flag set."""
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias, mean, var = (T.tensor4(n)
                                 for n in ('x', 'scale', 'bias',
                                           'mean', 'var'))
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function([x, scale, bias, mean, var], [out],
                        mode=mode_with_gpu)

    # check for the inplace settings
    nodes = [n for n in f.maker.fgraph.toposort()
             if isinstance(n.op, dnn.GpuDnnBatchNormInference)]
    assert len(nodes) == 1
    assert nodes[0].op.inplace

    # run (Var uses rand(), not randn(), so variances are non-negative)
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias, Mean, Var)
def test_dnn_batchnorm_valid_and_invalid_axes():
    """Check which normalization axes are lowered to cuDNN Ops.

    Valid axes ('per-activation' and 'spatial' patterns) must be replaced by
    the GpuDnnBatchNorm* Ops; other axes lists must still be replaced (no
    abstract Ops remain), just not by the cuDNN Ops.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")

    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        x, scale, bias, mean, var, dy = (vartype(n)
                                         for n in ('x', 'scale', 'bias',
                                                   'mean', 'var', 'dy'))
        ndim = x.ndim

        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = \
                bn.batch_normalization_train(x, scale, bias, axes)
            out_test = bn.batch_normalization_test(x, scale, bias,
                                                   mean, var, axes)
            # backward pass
            # NOTE(review): dy was already created above; this rebinding
            # looks redundant but is kept to preserve the graph exactly.
            dy = vartype('dy')
            grads_train = T.grad(None, wrt=[x, scale, bias],
                                 known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var],
                                known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)
            if axes in valid_axes_lists:
                # check if the abstract Ops have been replaced by the cuDNN Ops
                assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                            for n in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                            for n in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference)
                            for n in f.maker.fgraph.toposort()])
                assert not any([isinstance(n.op,
                                           (bn.AbstractBatchNormTrain,
                                            bn.AbstractBatchNormInference,
                                            bn.AbstractBatchNormTrainGrad))
                                for n in f.maker.fgraph.toposort()])
            else:
                # check if the abstract Ops have been replaced, but not by
                # the cuDNN Ops
                assert not any([isinstance(n.op,
                                           (dnn.GpuDnnBatchNorm,
                                            dnn.GpuDnnBatchNormGrad,
                                            bn.AbstractBatchNormTrain,
                                            bn.AbstractBatchNormInference,
                                            bn.AbstractBatchNormTrainGrad))
                                for n in f.maker.fgraph.toposort()])
def
test_dnn_tag
():
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论