testgroup / pytensor · Commits

Commit e4e08782, authored July 21, 2016 by Frédéric Bastien, committed via GitHub on July 21, 2016

Merge pull request #4768 from abergeron/gpua_bn

GpuArray BatchNorm

Parents: e5c41e76, 18885126

Showing 6 changed files with 681 additions and 0 deletions:

- theano/gpuarray/dnn.py: +320 -0
- theano/gpuarray/dnn_batchnorm.c: +62 -0
- theano/gpuarray/dnn_batchnorm_base.c: +40 -0
- theano/gpuarray/dnn_batchnorm_grad.c: +93 -0
- theano/gpuarray/dnn_batchnorm_inf.c: +55 -0
- theano/gpuarray/tests/test_dnn.py: +111 -0
theano/gpuarray/dnn.py @ e4e08782

...
@@ -1398,6 +1398,326 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
        return Apply(self, [dy, sm], [sm.type()])
class GpuDnnBatchNorm(DnnBase):
    """
    Base Op for cuDNN Batch Normalization.

    Parameters
    ----------
    mode : {'per-activation', 'spatial'}
        Whether to normalize per activation (in this mode, bias and scale
        tensor dimensions are 1xCxHxW) or share normalization factors across
        spatial dimensions (in this mode, bias and scale tensor dimensions
        are 1xCx1x1).
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    """

    __props__ = ('mode', 'epsilon')

    def __init__(self, mode='per-activation', epsilon=1e-4):
        DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm.c'],
                         'dnn_batchnorm_op')

        if version() < 5000:
            raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")

        assert (mode in ('per-activation', 'spatial'))
        self.mode = mode

        assert (epsilon >= 1e-5)
        self.epsilon = epsilon

    def get_op_params(self):
        params = []
        params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL"
                                if self.mode == "spatial" else
                                "CUDNN_BATCHNORM_PER_ACTIVATION")))
        params.append(('EPSILON', str(self.epsilon)))
        return params

    def infer_shape(self, node, shape):
        return [shape[0], shape[1], shape[1]]

    def make_node(self, x, scale, bias):
        ctx_name = infer_context_name(x, scale, bias)
        x = as_gpuarray_variable(x, ctx_name)
        scale = as_gpuarray_variable(scale, ctx_name)
        bias = as_gpuarray_variable(bias, ctx_name)
        assert x.ndim == 4
        assert scale.ndim == 4
        assert bias.ndim == 4
        return Apply(self, [x, scale, bias],
                     [x.type(), scale.type(), scale.type()])

    def grad(self, inputs, grads):
        x, scale, bias = inputs
        dy = grads[0]
        _, x_mean, x_invstd = self.make_node(x, scale, bias).outputs
        return GpuDnnBatchNormGrad(self.mode, self.epsilon)(x, dy, scale,
                                                            x_mean, x_invstd)
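As the docstring notes, the two modes differ only in which axes share statistics: per-activation parameters are shaped 1xCxHxW, spatial parameters 1xCx1x1. A minimal NumPy sketch of the forward training computation the op performs on a 4d NCHW input (illustrative only, not part of the commit):

import numpy as np

def batchnorm_train_reference(x, scale, bias, mode='per-activation',
                              epsilon=1e-4):
    # Reduce over the batch axis alone ('per-activation') or over the
    # batch and both spatial axes ('spatial').
    axes = (0,) if mode == 'per-activation' else (0, 2, 3)
    mean = x.mean(axis=axes, keepdims=True)
    invstd = 1.0 / np.sqrt(x.var(axis=axes, keepdims=True) + epsilon)
    # Mirrors the op's three outputs: output, mean, inverse std.
    return (x - mean) * (scale * invstd) + bias, mean, invstd

x = np.random.randn(8, 3, 5, 5).astype('float32')
scale = np.ones((1, 3, 1, 1), 'float32')  # 1xCx1x1 for mode='spatial'
bias = np.zeros((1, 3, 1, 1), 'float32')
out, mean, invstd = batchnorm_train_reference(x, scale, bias, mode='spatial')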
class GpuDnnBatchNormInference(DnnBase):
    """
    Base Op for cuDNN Batch Normalization.

    Parameters
    ----------
    mode : {'per-activation', 'spatial'}
        Whether to normalize per activation (in this mode, bias and scale
        tensor dimensions are 1xCxHxW) or share normalization factors across
        spatial dimensions (in this mode, bias and scale tensor dimensions
        are 1xCx1x1).
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    """

    __props__ = ('mode', 'epsilon')

    def __init__(self, mode='per-activation', epsilon=1e-4):
        DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'],
                         'dnn_batchnorm_op')

        if version() < 5000:
            raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")

        assert (mode in ('per-activation', 'spatial'))
        self.mode = mode

        assert (epsilon >= 1e-5)
        self.epsilon = epsilon

    def get_op_params(self):
        params = []
        params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL"
                                if self.mode == "spatial" else
                                "CUDNN_BATCHNORM_PER_ACTIVATION")))
        params.append(('EPSILON', str(self.epsilon)))
        return params

    def infer_shape(self, node, shape):
        return [shape[0]]

    def make_node(self, x, scale, bias, estimated_mean, estimated_variance):
        ctx_name = infer_context_name(x, scale, bias, estimated_mean,
                                      estimated_variance)
        x = as_gpuarray_variable(x, ctx_name)
        scale = as_gpuarray_variable(scale, ctx_name)
        bias = as_gpuarray_variable(bias, ctx_name)
        estimated_mean = as_gpuarray_variable(estimated_mean, ctx_name)
        estimated_variance = as_gpuarray_variable(estimated_variance, ctx_name)
        assert x.ndim == 4
        assert scale.ndim == 4
        assert bias.ndim == 4
        assert estimated_mean.ndim == 4
        assert estimated_variance.ndim == 4
        return Apply(self, [x, scale, bias, estimated_mean,
                            estimated_variance], [x.type()])

    def grad(self, inputs, grads):
        x, scale, bias, est_mean, est_var = inputs
        dy = grads[0]

        if self.mode == "per-activation":
            axes = (0,)
        elif self.mode == "spatial":
            axes = (0, 2, 3)
        scale, bias, est_mean, est_var = (theano.tensor.addbroadcast(t, *axes)
                                          for t in (scale, bias,
                                                    est_mean, est_var))

        # define helper expressions
        est_var_eps = est_var + self.epsilon
        est_std = theano.tensor.sqrt(est_var_eps)
        two = theano.tensor.constant(2.)

        # define and return gradients
        dx = dy * (scale / est_std)
        dscale = (dy * (x - est_mean)).sum(axes, keepdims=True) / est_std
        dbias = dy.sum(axes, keepdims=True)
        dmean = -dy.sum(axes, keepdims=True) * (scale / est_std)
        dvar = -(dy * (x - est_mean)).sum(axes, keepdims=True) * \
            (scale / (two * est_var_eps * est_std))
        return [dx, dscale, dbias, dmean, dvar]
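The gradients above are just the derivatives of the inference formula out = (x - est_mean) * scale / sqrt(est_var + epsilon) + bias. A NumPy mirror of the same expressions (illustrative, not part of the commit) that can be handy when checking the symbolic version numerically:

import numpy as np

def batchnorm_inference_grads(x, dy, scale, est_mean, est_var,
                              axes=(0, 2, 3), epsilon=1e-4):
    # Same helper expressions as the symbolic grad above.
    est_var_eps = est_var + epsilon
    est_std = np.sqrt(est_var_eps)
    dx = dy * (scale / est_std)
    dscale = (dy * (x - est_mean)).sum(axis=axes, keepdims=True) / est_std
    dbias = dy.sum(axis=axes, keepdims=True)
    dmean = -dy.sum(axis=axes, keepdims=True) * (scale / est_std)
    dvar = -(dy * (x - est_mean)).sum(axis=axes, keepdims=True) * \
        (scale / (2. * est_var_eps * est_std))
    return dx, dscale, dbias, dmean, dvar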
class GpuDnnBatchNormGrad(DnnBase):
    __props__ = ('mode', 'epsilon')

    def __init__(self, mode='per-activation', epsilon=1e-4):
        DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_grad.c'],
                         'dnn_batchnorm_grad')

        if version() < 5000:
            raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")

        assert (mode in ('per-activation', 'spatial'))
        self.mode = mode

        assert (epsilon >= 1e-5)
        self.epsilon = epsilon

    def get_op_params(self):
        params = []
        params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL"
                                if self.mode == "spatial" else
                                "CUDNN_BATCHNORM_PER_ACTIVATION")))
        params.append(('EPSILON', str(self.epsilon)))
        return params

    def make_node(self, x, dy, scale, x_mean, x_invstd):
        ctx_name = infer_context_name(x, dy, scale, x_mean, x_invstd)
        x = as_gpuarray_variable(x, ctx_name)
        dy = as_gpuarray_variable(dy, ctx_name)
        scale = as_gpuarray_variable(scale, ctx_name)
        x_mean = as_gpuarray_variable(x_mean, ctx_name)
        x_invstd = as_gpuarray_variable(x_invstd, ctx_name)
        assert x.ndim == 4 and dy.ndim == 4 and scale.ndim == 4 and \
            x_mean.ndim == 4 and x_invstd.ndim == 4
        return Apply(self, [x, dy, scale, x_mean, x_invstd],
                     [x.type(), scale.type(), scale.type()])

    def infer_shape(self, node, shape):
        return [shape[0], shape[2], shape[2]]
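The heavy lifting happens in cudnnBatchNormalizationBackward (see dnn_batchnorm_grad.c below). For orientation, a NumPy sketch of the standard training-mode backward formulas that call computes, written from the textbook derivation rather than taken from the commit:

import numpy as np

def batchnorm_train_backward(x, dy, scale, x_mean, x_invstd, axes=(0, 2, 3)):
    # m = number of elements each statistic was averaged over
    m = np.prod([x.shape[a] for a in axes])
    x_hat = (x - x_mean) * x_invstd
    dbias = dy.sum(axis=axes, keepdims=True)
    dscale = (dy * x_hat).sum(axis=axes, keepdims=True)
    # dx also backpropagates through the batch mean and variance
    dx = (scale * x_invstd / m) * (m * dy - dbias - x_hat * dscale)
    return dx, dscale, dbias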
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
                                  epsilon=1e-4):
    """
    Performs batch normalization of the given inputs, using the mean and
    variance of the inputs.

    Parameters
    ----------
    mode : {'per-activation', 'spatial'}
        Whether to normalize per activation or share normalization factors
        across spatial dimensions (i.e., all dimensions past the second).
    gamma : tensor
        Learnable scale factors. Must match the dimensionality of `inputs`,
        but have sizes of `1` for all axes normalized over (i.e., in the first
        dimension for ``mode='per-activation'``, and additionally in all
        dimensions past the second for ``mode='spatial'``).
    beta : tensor
        Learnable biases. Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).

    Returns
    -------
    out : tensor
        Batch-normalized inputs.
    mean : tensor
        Means of `inputs` across the normalization axes.
    stdinv : tensor
        Inverse standard deviations of `inputs` across the normalization axes.

    Notes
    -----
    Requires cuDNN 5 and Theano 0.9dev2 or more recent.

    For 4d tensors, returned values are equivalent to:

    .. code-block:: python

        axes = 0 if mode == 'per-activation' else (0, 2, 3)
        mean = inputs.mean(axes, keepdims=True)
        stdinv = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
        out = (inputs - mean) * gamma * stdinv + beta
    """
    ndim = inputs.ndim
    if ndim > 4:
        raise ValueError("dnn_batch_normalization_train currently supports "
                         "up to 4-dimensional tensors only, got %d" % ndim)
    if gamma.ndim != ndim or beta.ndim != ndim:
        raise ValueError("gamma and beta must be of the same dimensionality "
                         "as inputs; got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, ndim))
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)

    if ndim < 4:
        inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
        gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
        beta = theano.tensor.shape_padright(beta, 4 - ndim)
    batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon)
    result = tuple(batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
                                gpu_contiguous(beta)))
    if ndim < 4:
        result = tuple(theano.tensor.flatten(r, ndim) for r in result)
    return result
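A minimal usage sketch for the training-time wrapper, assuming a GPU-enabled Theano configuration with cuDNN 5 (variable names here are illustrative, not from the commit):

import theano
import theano.tensor as T
from theano.gpuarray import dnn

x = T.ftensor4('x')
gamma = T.ftensor4('gamma')  # shape (1, C, 1, 1) under mode='spatial'
beta = T.ftensor4('beta')

out, mean, invstd = dnn.dnn_batch_normalization_train(
    x, gamma, beta, mode='spatial', epsilon=1e-4)
f_train = theano.function([x, gamma, beta], [out, mean, invstd])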
def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
                                 mode='per-activation', epsilon=1e-4):
    """
    Performs batch normalization of the given inputs, using the given mean and
    variance.

    Parameters
    ----------
    mode : {'per-activation', 'spatial'}
        Whether to normalize per activation or share normalization factors
        across spatial dimensions (i.e., all dimensions past the second).
    gamma : tensor
        Scale factors. Must match the dimensionality of `inputs`, but have
        sizes of `1` for all axes normalized over (i.e., in the first dimension
        for ``mode='per-activation'``, and additionally in all dimensions past
        the second for ``mode='spatial'``).
    beta : tensor
        Biases. Must match the tensor layout of `gamma`.
    mean : tensor
        Means. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    var : tensor
        Variances. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).

    Returns
    -------
    out : tensor
        Batch-normalized inputs.

    Notes
    -----
    Requires cuDNN 5 and Theano 0.9dev2 or more recent.

    For 4d tensors, the returned value is equivalent to:

    .. code-block:: python

        axes = (0,) if mode == 'per-activation' else (0, 2, 3)
        gamma, beta, mean, var = (T.addbroadcast(t, *axes)
                                  for t in (gamma, beta, mean, var))
        out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
    """
    ndim = inputs.ndim
    if ndim > 4:
        raise ValueError("dnn_batch_normalization_test currently supports "
                         "up to 4-dimensional tensors only, got %d" % ndim)
    if gamma.ndim != ndim or beta.ndim != ndim:
        raise ValueError("gamma and beta must be of the same dimensionality "
                         "as inputs; got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, ndim))
    if mean.ndim != ndim or var.ndim != ndim:
        raise ValueError("mean and var must be of the same dimensionality "
                         "as inputs; got %d and %d instead of %d" %
                         (mean.ndim, var.ndim, ndim))
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)

    if ndim < 4:
        inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
        gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
        beta = theano.tensor.shape_padright(beta, 4 - ndim)
        mean = theano.tensor.shape_padright(mean, 4 - ndim)
        var = theano.tensor.shape_padright(var, 4 - ndim)
    batchnorm_op = GpuDnnBatchNormInference(mode=mode, epsilon=epsilon)
    result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
                          gpu_contiguous(beta), gpu_contiguous(mean),
                          gpu_contiguous(var))
    if ndim < 4:
        result = theano.tensor.flatten(result, ndim)
    return result
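Continuing the sketch above, the matching inference-time call feeds in stored statistics (again illustrative; `running_mean` and `running_var` are assumed to have the same layout as `gamma`):

running_mean = T.ftensor4('running_mean')
running_var = T.ftensor4('running_var')

out_inf = dnn.dnn_batch_normalization_test(
    x, gamma, beta, running_mean, running_var,
    mode='spatial', epsilon=1e-4)
f_test = theano.function([x, gamma, beta, running_mean, running_var], out_inf)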
@register_opt2([AbstractConv2d,
                AbstractConv2d_gradWeights,
                AbstractConv2d_gradInputs],
               'fast_compile', 'conv_dnn', 'cudnn')
def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
...
theano/gpuarray/dnn_batchnorm.c (new file, mode 100644) @ e4e08782
#section support_code_struct

int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                     PyGpuArrayObject *bias, PyGpuArrayObject **outp,
                     PyGpuArrayObject **x_mean, PyGpuArrayObject **x_invstd,
                     PyGpuContextObject *c) {
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
    return 1;

  if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions,
                         inp->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (theano_prep_output(x_mean, scale->ga.nd, scale->ga.dimensions,
                         scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (theano_prep_output(x_invstd, scale->ga.nd, scale->ga.dimensions,
                         scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;

  if (c_set_tensorNd(*outp, bn_output) != 0)
    return 1;

  {
    const float falpha = 1.;
    const float fbeta = 0.;
    const double dalpha = 1.;
    const double dbeta = 0.;
    void *alpha;
    void *beta;
    if (inp->ga.typecode == GA_DOUBLE) {
      alpha = (void *)&dalpha;
      beta = (void *)&dbeta;
    } else {
      alpha = (void *)&falpha;
      beta = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
      APPLY_SPECIFIC(_handle),
      MODE,
      alpha,
      beta,
      bn_input,
      PyGpuArray_DEV_DATA(inp),
      bn_output,
      PyGpuArray_DEV_DATA(*outp),
      bn_params,
      PyGpuArray_DEV_DATA(scale),
      PyGpuArray_DEV_DATA(bias),
      0,
      NULL,  // running mean, deliberately unused
      NULL,  // running var, deliberately unused
      EPSILON,
      PyGpuArray_DEV_DATA(*x_mean),
      PyGpuArray_DEV_DATA(*x_invstd)
      );
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "Error during batchnorm: %s\n",
                   cudnnGetErrorString(err));
      return 1;
    }
  }
  return 0;
}
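Note the exponential-average-factor argument of 0 and the two NULL pointers in the call above: the op deliberately leaves running statistics to the caller. A sketch (in Python, as elsewhere on this page) of how a caller might maintain them from the op's returned batch mean and inverse std; the momentum value is an assumption, not from the commit:

import numpy as np

def update_running_stats(running_mean, running_var,
                         batch_mean, batch_invstd,
                         momentum=0.9, epsilon=1e-4):
    # Recover the batch variance from the inverse std the op returns.
    batch_var = 1.0 / np.square(batch_invstd) - epsilon
    new_mean = momentum * running_mean + (1 - momentum) * batch_mean
    new_var = momentum * running_var + (1 - momentum) * batch_var
    return new_mean, new_var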
theano/gpuarray/dnn_batchnorm_base.c (new file, mode 100644) @ e4e08782
#section init_code_struct

{
  cudnnStatus_t err;

  bn_input = NULL;
  bn_params = NULL;
  bn_output = NULL;

  if ((err = cudnnCreateTensorDescriptor(&bn_input)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_input): %s", cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&bn_params)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_params): %s", cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&bn_output)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_output): %s", cudnnGetErrorString(err));
    FAIL;
  }
}

#section cleanup_code_struct

if (bn_input != NULL)
  cudnnDestroyTensorDescriptor(bn_input);
if (bn_params != NULL)
  cudnnDestroyTensorDescriptor(bn_params);
if (bn_output != NULL)
  cudnnDestroyTensorDescriptor(bn_output);

#section support_code_struct

cudnnTensorDescriptor_t bn_input;
cudnnTensorDescriptor_t bn_params;
cudnnTensorDescriptor_t bn_output;
theano/gpuarray/dnn_batchnorm_grad.c (new file, mode 100644) @ e4e08782
#section init_code_struct

{
  cudnnStatus_t err;

  bn_doutput = NULL;

  if ((err = cudnnCreateTensorDescriptor(&bn_doutput)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
                 "(bn_doutput): %s", cudnnGetErrorString(err));
    FAIL;
  }
}

#section cleanup_code_struct

if (bn_doutput != NULL)
  cudnnDestroyTensorDescriptor(bn_doutput);

#section support_code_struct

cudnnTensorDescriptor_t bn_doutput;

int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
                       PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
                       PyGpuArrayObject *x_invstd, PyGpuArrayObject **dinp,
                       PyGpuArrayObject **dscale, PyGpuArrayObject **dbias,
                       PyGpuContextObject *c) {
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(doutp, bn_doutput) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
    return 1;

  if (theano_prep_output(dinp, inp->ga.nd, inp->ga.dimensions,
                         inp->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (theano_prep_output(dscale, scale->ga.nd, scale->ga.dimensions,
                         scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (theano_prep_output(dbias, scale->ga.nd, scale->ga.dimensions,
                         scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;

  if (c_set_tensorNd(*dinp, bn_output) != 0)
    return 1;

  {
    const float falpha = 1.;
    const float fbeta = 0.;
    const double dalpha = 1.;
    const double dbeta = 0.;
    void *alphaData;
    void *betaData;
    void *alphaParam;
    void *betaParam;
    if (inp->ga.typecode == GA_DOUBLE) {
      alphaData = (void *)&dalpha;
      betaData = (void *)&dbeta;
      alphaParam = (void *)&dalpha;
      betaParam = (void *)&dbeta;
    } else {
      alphaData = (void *)&falpha;
      betaData = (void *)&fbeta;
      alphaParam = (void *)&falpha;
      betaParam = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationBackward(
      APPLY_SPECIFIC(_handle),
      MODE,
      alphaData,
      betaData,
      alphaParam,
      betaParam,
      bn_input,
      PyGpuArray_DEV_DATA(inp),
      bn_doutput,
      PyGpuArray_DEV_DATA(doutp),
      bn_output,
      PyGpuArray_DEV_DATA(*dinp),
      bn_params,
      PyGpuArray_DEV_DATA(scale),
      PyGpuArray_DEV_DATA(*dscale),
      PyGpuArray_DEV_DATA(*dbias),
      EPSILON,
      PyGpuArray_DEV_DATA(x_mean),
      PyGpuArray_DEV_DATA(x_invstd)
      );
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "Error during batchnorm: %s\n",
                   cudnnGetErrorString(err));
      return 1;
    }
  }
  return 0;
}
theano/gpuarray/dnn_batchnorm_inf.c (new file, mode 100644) @ e4e08782
#section support_code_struct

int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                     PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
                     PyGpuArrayObject *est_var, PyGpuArrayObject **outp,
                     PyGpuContextObject *c) {
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
    return 1;

  if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions,
                         inp->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;

  if (c_set_tensorNd(*outp, bn_output) != 0)
    return 1;

  {
    const float falpha = 1.;
    const float fbeta = 0.;
    const double dalpha = 1.;
    const double dbeta = 0.;
    void *alpha;
    void *beta;
    if (inp->ga.typecode == GA_DOUBLE) {
      alpha = (void *)&dalpha;
      beta = (void *)&dbeta;
    } else {
      alpha = (void *)&falpha;
      beta = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
      APPLY_SPECIFIC(_handle),
      MODE,
      alpha,
      beta,
      bn_input,
      PyGpuArray_DEV_DATA(inp),
      bn_output,
      PyGpuArray_DEV_DATA(*outp),
      bn_params,
      PyGpuArray_DEV_DATA(scale),
      PyGpuArray_DEV_DATA(bias),
      PyGpuArray_DEV_DATA(est_mean),
      PyGpuArray_DEV_DATA(est_var),
      EPSILON
      );
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "Error during batchnorm: %s\n",
                   cudnnGetErrorString(err));
      return 1;
    }
  }
  return 0;
}
theano/gpuarray/tests/test_dnn.py @ e4e08782
...
@@ -973,3 +973,114 @@ class test_SoftMax(test_nnet.test_SoftMax):

        # Compare the output of the function with the reference function
        inp = numpy.random.normal(0, 1, (5, 6)).astype("float32")
        utt.assert_allclose(f(inp), f_ref(inp))
def test_dnn_batchnorm_train():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    for mode in ('per-activation', 'spatial'):
        for vartype in (T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
            x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # forward pass
            out, x_mean, x_invstd = dnn.dnn_batch_normalization_train(
                x, scale, bias, mode, eps)
            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            x_mean2 = x.mean(axis=axes, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
            scale2 = T.addbroadcast(scale, *axes)
            bias2 = T.addbroadcast(bias, *axes)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, dy],
                                [out, x_mean, x_invstd, out2, x_mean2,
                                 x_invstd2] + grads + grads2,
                                mode=mode_with_gpu)
            # run
            for data_shape in ((10, 20, 30, 40), (4, 3, 1, 1), (1, 1, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
                Scale = numpy.random.randn(*param_shape).astype('float32')
                Bias = numpy.random.randn(*param_shape).astype('float32')
                outputs = f(X, Scale, Bias, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 3])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 3])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 3])  # invstd
                # compare gradients
                utt.assert_allclose(outputs[6], outputs[6 + 3])  # dx
                utt.assert_allclose(outputs[7], outputs[7 + 3],
                                    rtol=3e-3)  # dscale
                utt.assert_allclose(outputs[8], outputs[8 + 3])  # dbias
def test_batchnorm_inference():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    for mode in ('per-activation', 'spatial'):
        for vartype in (T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
            x, scale, bias, mean, var = (vartype(n)
                                         for n in ('x', 'scale', 'bias',
                                                   'mean', 'var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # forward pass
            out = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
                                                   var, mode, eps)
            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes)
                                          for t in (scale, bias, mean, var))
            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias, mean, var],
                           known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias, mean, var],
                            known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, out2] + grads + grads2,
                                mode=mode_with_gpu)
            # run
            for data_shape in ((10, 20, 30, 40), (4, 3, 1, 1), (1, 1, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
                Scale = numpy.random.randn(*param_shape).astype('float32')
                Bias = numpy.random.randn(*param_shape).astype('float32')
                Mean = numpy.random.randn(*param_shape).astype('float32')
                Var = numpy.random.rand(*param_shape).astype('float32')
                outputs = f(X, Scale, Bias, Mean, Var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # dx
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # dscale
                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
                utt.assert_allclose(outputs[6], outputs[6 + 5],
                                    atol=2e-5)  # dvar