Commit c7ea631b authored by Pascal Lamblin, committed by GitHub

Merge pull request #4582 from f0k/dnnbatchnorm

cuDNN v5 Batch Normalization
@@ -40,6 +40,23 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradInputs)
def c_define_tensor_desc(desc):
return """
cudnnTensorDescriptor_t %(desc)s;
""" % dict(desc=desc)
def c_init_tensor_desc(desc, err, fail):
return """
%(desc)s = NULL;
if ((%(err)s = cudnnCreateTensorDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
": %%s", cudnnGetErrorString(%(err)s));
%(fail)s
}
""" % dict(desc=desc, err=err, fail=fail)
def c_set_tensor4d(var, desc, err, fail):
return """
{
@@ -73,6 +90,13 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
""" % dict(var=var, err=err, desc=desc, fail=fail)
def c_clean_tensor_desc(desc):
return """
if (%(desc)s != NULL)
cudnnDestroyTensorDescriptor(%(desc)s);
""" % dict(desc=desc)
class DnnBase(GpuOp, COp):
"""
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
@@ -2025,31 +2049,10 @@ class GpuDnnSoftmaxBase(DnnBase):
else:
return [shape[1]]
-def _define_tensor4d_desc(self, name, id):
-return """
-cudnnTensorDescriptor_t %(id)s_%(name)s;
-""" % dict(name=name, id=id)
-def _init_tensor4d_desc(self, name, id, fail):
-return """
-%(id)s_%(name)s = NULL;
-if ((err%(name)s = cudnnCreateTensorDescriptor(&%(id)s_%(name)s)) != CUDNN_STATUS_SUCCESS) {
-PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
-": %%s", cudnnGetErrorString(err%(name)s));
-%(fail)s
-}
-""" % dict(name=name, id=id, fail=fail)
-def _clean_tensor4d_desc(self, name, id):
-return """
-if(%(id)s_%(name)s!= NULL)
-cudnnDestroyTensorDescriptor(%(id)s_%(name)s);
-""" % dict(name=name, id=id)
def c_support_code_struct(self, node, name):
result = ''
for id in self.tensor_4d_descs:
-result += self._define_tensor4d_desc(name, id)
+result += c_define_tensor_desc('%s_%s' % (id, name))
return result
def c_init_code_struct(self, node, name, sub):
@@ -2058,13 +2061,13 @@ cudnnStatus_t err%(name)s;
""" % dict(name=name)
for id in self.tensor_4d_descs:
-result += self._init_tensor4d_desc(name, id, sub['fail'])
+result += c_init_tensor_desc('%s_%s' % (id, name), 'err' + name, sub['fail'])
return result
def c_cleanup_code_struct(self, node, name):
result = ''
for id in self.tensor_4d_descs:
-result += self._clean_tensor4d_desc(name, id)
+result += c_clean_tensor_desc('%s_%s' % (id, name))
return result
def c_code(self, node, name, inputs, outputs, sub):
@@ -2267,6 +2270,507 @@ err%(name)s = cudnnSoftmaxBackward(
"""
class GpuDnnBatchNormBase(DnnBase):
"""
Base Op for cuDNN Batch Normalization.
Parameters
----------
mode : {'per-activation', 'spatial'}
Whether to normalize per activation (in this mode, bias and scale
tensor dimensions are 1xCxHxW) or share normalization factors across
spatial dimensions (in this mode, bias and scale tensor dimensions
are 1xCx1x1).
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
"""
__props__ = ('mode', 'epsilon')
tensor_4d_descs = []
def __init__(self, mode='per-activation', epsilon=1e-4):
DnnBase.__init__(self)
if version() < (5000, 5000):
raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5")
assert (mode in ('per-activation', 'spatial'))
self.mode = mode
assert (epsilon >= 1e-5)
self.epsilon = epsilon
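# Shape sketch for the two modes (hedged; N, C, H, W denote the usual
# minibatch layout): for an input of shape (N, C, H, W),
# mode='per-activation' expects scale and bias of shape (1, C, H, W),
# while mode='spatial' expects (1, C, 1, 1), one factor per channel.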
def c_support_code_struct(self, node, name):
result = ''
for id in self.tensor_4d_descs:
result += c_define_tensor_desc('%s_%s' % (id, name))
return result
def c_init_code_struct(self, node, name, sub):
result = """
cudnnStatus_t err%(name)s;
""" % dict(name=name)
for id in self.tensor_4d_descs:
result += c_init_tensor_desc('%s_%s' % (id, name), 'err' + name, sub['fail'])
return result
def c_cleanup_code_struct(self, node, name):
result = ''
for id in self.tensor_4d_descs:
result += c_clean_tensor_desc('%s_%s' % (id, name))
return result
def c_code(self, node, name, inputs, outputs, sub):
if self.mode == "spatial":
mode = "CUDNN_BATCHNORM_SPATIAL"
else:
mode = "CUDNN_BATCHNORM_PER_ACTIVATION"
# Setup configuration variables.
result = """
cudnnStatus_t err%(name)s;
cudnnBatchNormMode_t mode%(name)s = %(mode)s;
double exponentialAverageFactor%(name)s = %(exp_avg_factor)f;
double epsilon%(name)s = %(epsilon)e;
""" % dict(name=name,
mode=mode,
exp_avg_factor=0, # deliberately unused
epsilon=self.epsilon)
return result
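# For illustration, with a (hypothetical) instance name 'node0' and
# mode='spatial', the preamble above expands to roughly:
#
#     cudnnStatus_t errnode0;
#     cudnnBatchNormMode_t modenode0 = CUDNN_BATCHNORM_SPATIAL;
#     double exponentialAverageFactornode0 = 0.000000;
#     double epsilonnode0 = 1.000000e-04;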
def c_code_cache_version(self):
return (2, version())
class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
"""
Op for the cuDNN BatchNormalizationForwardInference function.
See GpuDnnBatchNormBase for parameters.
On application, takes input, scale, bias, mean and variance and produces:
output = (input - mean) / sqrt(variance + epsilon) * scale + bias
where mean and variance are usually some running averages over multiple
batches computed during training.
Note: scale, bias, mean and variance must follow the same tensor layout!
"""
tensor_4d_descs = ['bn_input', 'bn_output', 'bn_params']
def infer_shape(self, node, shape):
# output shape equals shape of x
return [shape[0]]
def make_node(self, x, scale, bias, estimated_mean, estimated_variance):
x = as_cuda_ndarray_variable(x)
scale = as_cuda_ndarray_variable(scale)
bias = as_cuda_ndarray_variable(bias)
estimated_mean = as_cuda_ndarray_variable(estimated_mean)
estimated_variance = as_cuda_ndarray_variable(estimated_variance)
assert x.ndim == 4
assert scale.ndim == 4
assert bias.ndim == 4
assert estimated_mean.ndim == 4
assert estimated_variance.ndim == 4
return Apply(self, [x, scale, bias, estimated_mean, estimated_variance],
[x.type()])
def c_code(self, node, name, inputs, outputs, sub):
# super call to prepare common configuration
result = super(GpuDnnBatchNormInference, self).c_code(node, name, inputs, outputs, sub)
# give sensible names to inputs and outputs
inp, scale, bias, est_mean, est_var = inputs
outp, = outputs
# set input tensor descriptors from input tensors
result += c_set_tensor4d(inp, 'bn_input_' + name, 'err' + name, sub['fail'])
result += c_set_tensor4d(scale, 'bn_params_' + name, 'err' + name, sub['fail'])
# build and prepare the output variable
result += """
if (CudaNdarray_prep_output(&%(outp)s, 4, CudaNdarray_HOST_DIMS(%(inp)s)) != 0)
{
%(fail)s
}
""" % dict(outp=outp, inp=inp, fail=sub['fail'])
# set output tensor descriptor from output tensor
result += c_set_tensor4d(outp, 'bn_output_' + name, 'err' + name, sub['fail'])
# call cuDNN function
result += """
{
const float alpha = 1.;
const float beta = 0.;
err%(name)s = cudnnBatchNormalizationForwardInference(
_handle,
mode%(name)s,
(void*) &alpha,
(void*) &beta,
bn_input_%(name)s,
CudaNdarray_DEV_DATA(%(inp)s),
bn_output_%(name)s,
CudaNdarray_DEV_DATA(%(outp)s),
bn_params_%(name)s,
CudaNdarray_DEV_DATA(%(scale)s),
CudaNdarray_DEV_DATA(%(bias)s),
CudaNdarray_DEV_DATA(%(est_mean)s),
CudaNdarray_DEV_DATA(%(est_var)s),
epsilon%(name)s
);
}
""" % dict(name=name, inp=inp, scale=scale, bias=bias, est_mean=est_mean,
est_var=est_var, outp=outp)
return result
def grad(self, inputs, grads):
x, scale, bias, est_mean, est_var = inputs
dy = grads[0]
# add necessary broadcasts
if self.mode == 'per-activation':
axes = (0,)
elif self.mode == 'spatial':
axes = (0, 2, 3)
scale, bias, est_mean, est_var = (theano.tensor.addbroadcast(t, *axes)
for t in (scale, bias, est_mean, est_var))
# define helper expressions
est_var_eps = est_var + self.epsilon
est_std = theano.tensor.sqrt(est_var_eps)
two = theano.tensor.constant(2.)
# define and return gradients
dx = dy * (scale / est_std)
dscale = (dy * (x - est_mean)).sum(axes, keepdims=True) / est_std
dbias = dy.sum(axes, keepdims=True)
dmean = -dy.sum(axes, keepdims=True) * (scale / est_std)
dvar = -(dy * (x - est_mean)).sum(axes, keepdims=True) * (scale / (two * est_var_eps * est_std))
return [dx, dscale, dbias, dmean, dvar]
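# The dvar term follows from d/dvar (var + eps)**-0.5 = -0.5 * (var + eps)**-1.5,
# i.e. -0.5 / (est_var_eps * est_std); multiplied by scale and by the summed
# dy * (x - est_mean), this yields the scale / (two * est_var_eps * est_std)
# factor used above.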
class GpuDnnBatchNorm(GpuDnnBatchNormBase):
"""
Op for the cuDNN BatchNormalizationForwardTraining function.
See GpuDnnBatchNormBase for parameters.
On application, takes input, scale, bias and produces:
output = (input - mean) / sqrt(variance + epsilon) * scale + bias
mean = input.mean(axis=axes, keepdims=True),
invstd = 1. / sqrt(input.var(axis=axes, keepdims=True) + epsilon)
where axes=0 if mode='per-activation', and axes=(0,2,3) if mode='spatial'
Note: scale and bias must follow the same tensor layout!
"""
tensor_4d_descs = ['bn_input', 'bn_output', 'bn_params']
def infer_shape(self, node, shape):
# first output equals shape of x
# second and third output equal shape of scale
return [shape[0], shape[1], shape[1]]
def make_node(self, x, scale, bias):
x = as_cuda_ndarray_variable(x)
scale = as_cuda_ndarray_variable(scale)
bias = as_cuda_ndarray_variable(bias)
assert x.ndim == 4
assert scale.ndim == 4
assert bias.ndim == 4
return Apply(self, [x, scale, bias], [x.type(), scale.type(), scale.type()])
def c_code(self, node, name, inputs, outputs, sub):
# super call to prepare common configuration
result = super(GpuDnnBatchNorm, self).c_code(node, name, inputs, outputs, sub)
# give sensible names to inputs and outputs
inp, scale, bias = inputs
outp, x_mean, x_invstd = outputs
# set input tensor descriptors from input tensors
result += c_set_tensor4d(inp, 'bn_input_' + name, 'err' + name, sub['fail'])
result += c_set_tensor4d(scale, 'bn_params_' + name, 'err' + name, sub['fail'])
# build and prepare the output variables
result += """
if ((CudaNdarray_prep_output(&%(outp)s, 4, CudaNdarray_HOST_DIMS(%(inp)s)) != 0) ||
(CudaNdarray_prep_output(&%(x_mean)s, 4, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_prep_output(&%(x_invstd)s, 4, CudaNdarray_HOST_DIMS(%(scale)s)) != 0))
{
%(fail)s
}
""" % dict(outp=outp, inp=inp, x_mean=x_mean, x_invstd=x_invstd, scale=scale,
fail=sub['fail'])
# set output tensor descriptor from output tensor
result += c_set_tensor4d(outp, 'bn_output_' + name, 'err' + name, sub['fail'])
# call cuDNN function
result += """
{
const float alpha = 1.;
const float beta = 0.;
err%(name)s = cudnnBatchNormalizationForwardTraining(
_handle,
mode%(name)s,
(void*) &alpha,
(void*) &beta,
bn_input_%(name)s,
CudaNdarray_DEV_DATA(%(inp)s),
bn_output_%(name)s,
CudaNdarray_DEV_DATA(%(outp)s),
bn_params_%(name)s,
CudaNdarray_DEV_DATA(%(scale)s),
CudaNdarray_DEV_DATA(%(bias)s),
exponentialAverageFactor%(name)s,
NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused
epsilon%(name)s,
CudaNdarray_DEV_DATA(%(x_mean)s),
CudaNdarray_DEV_DATA(%(x_invstd)s)
);
}
""" % dict(name=name, inp=inp, scale=scale, bias=bias, outp=outp,
x_mean=x_mean, x_invstd=x_invstd)
return result
def grad(self, inputs, grads):
x, scale, bias = inputs
dy = grads[0]
_, x_mean, x_invstd = self.make_node(x, scale, bias).outputs
return GpuDnnBatchNormGrad(self.mode, self.epsilon)(x, dy, scale,
x_mean, x_invstd)
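# Note: the saved statistics are recovered by re-applying the forward op
# via make_node; since this apply is identical to the original one,
# Theano's merge optimization can fold the two into a single forward
# computation.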
class GpuDnnBatchNormGrad(GpuDnnBatchNormBase):
"""
Op for the cuDNN BatchNormalizationBackward function.
See GpuDnnBatchNormBase for parameters.
On application, takes input, dy, scale, mean, invstd and produces
dinput, dscale and dbias. Note that it does not need the bias.
Note: scale, mean and invstd must follow the same tensor layout!
"""
tensor_4d_descs = ['bn_input', 'bn_doutput', 'bn_dinput', 'bn_params']
def infer_shape(self, node, shape):
# first output equals shape of x
# second and third output equal shape of scale
return [shape[0], shape[2], shape[2]]
def make_node(self, x, dy, scale, x_mean, x_invstd):
x = as_cuda_ndarray_variable(x)
dy = as_cuda_ndarray_variable(dy)
scale = as_cuda_ndarray_variable(scale)
x_mean = as_cuda_ndarray_variable(x_mean)
x_invstd = as_cuda_ndarray_variable(x_invstd)
assert x.ndim == 4 and dy.ndim == 4 and scale.ndim == 4 and x_mean.ndim == 4 and x_invstd.ndim == 4
return Apply(self, [x, dy, scale, x_mean, x_invstd], [x.type(), scale.type(), scale.type()])
def c_code(self, node, name, inputs, outputs, sub):
# super call to prepare common configuration
result = super(GpuDnnBatchNormGrad, self).c_code(node, name, inputs, outputs, sub)
# give sensible names to inputs and outputs
inp, doutp, scale, x_mean, x_invstd = inputs
dinp, dscale, dbias = outputs
# set input tensor descriptors from input tensors
result += c_set_tensor4d(inp, 'bn_input_' + name, 'err' + name, sub['fail'])
result += c_set_tensor4d(doutp, 'bn_doutput_' + name, 'err' + name, sub['fail'])
result += c_set_tensor4d(scale, 'bn_params_' + name, 'err' + name, sub['fail'])
# build and prepare the output variables
result += """
if ((CudaNdarray_prep_output(&%(dinp)s, 4, CudaNdarray_HOST_DIMS(%(inp)s)) != 0) ||
(CudaNdarray_prep_output(&%(dscale)s, 4, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_prep_output(&%(dbias)s, 4, CudaNdarray_HOST_DIMS(%(scale)s)) != 0))
{
%(fail)s
}
""" % dict(dinp=dinp, inp=inp, dscale=dscale, scale=scale, dbias=dbias,
fail=sub['fail'])
# set output tensor descriptor from output tensor
result += c_set_tensor4d(dinp, 'bn_dinput_' + name, 'err' + name, sub['fail'])
# call cuDNN function
result += """
{
const float alphaData = 1.;
const float betaData = 0.;
const float alphaParam = 1.;
const float betaParam = 0.;
err%(name)s = cudnnBatchNormalizationBackward(
_handle,
mode%(name)s,
(void*) &alphaData,
(void*) &betaData,
(void*) &alphaParam,
(void*) &betaParam,
bn_input_%(name)s,
CudaNdarray_DEV_DATA(%(inp)s),
bn_doutput_%(name)s,
CudaNdarray_DEV_DATA(%(doutp)s),
bn_dinput_%(name)s,
CudaNdarray_DEV_DATA(%(dinp)s),
bn_params_%(name)s,
CudaNdarray_DEV_DATA(%(scale)s),
CudaNdarray_DEV_DATA(%(dscale)s),
CudaNdarray_DEV_DATA(%(dbias)s),
epsilon%(name)s,
CudaNdarray_DEV_DATA(%(x_mean)s),
CudaNdarray_DEV_DATA(%(x_invstd)s)
);
}
""" % dict(name=name, inp=inp, doutp=doutp, scale=scale, x_mean=x_mean,
x_invstd=x_invstd, dinp=dinp, dscale=dscale, dbias=dbias)
return result
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4):
"""
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
Parameters
----------
mode : {'per-activation', 'spatial'}
Whether to normalize per activation or share normalization factors
across spatial dimensions (i.e., all dimensions past the second).
gamma : tensor
Learnable scale factors. Must match the dimensionality of `inputs`,
but have sizes of `1` for all axes normalized over (i.e., in the first
dimension for ``mode='per-activation'``, and additionally in all
dimensions past the second for ``mode='spatial'``).
beta : tensor
Learnable biases. Must match the tensor layout of `gamma`.
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
Returns
-------
out : tensor
Batch-normalized inputs.
mean : tensor
Means of `inputs` across the normalization axes.
stdinv : tensor
Inverse standard deviations of `inputs` across the normalization axes.
Notes
-----
For 4d tensors, returned values are equivalent to:
>>> axes = 0 if mode == 'per-activation' else (0, 2, 3)
>>> mean = inputs.mean(axes, keepdims=True)
>>> stdinv = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
>>> out = (inputs - mean) * gamma * stdinv + beta
"""
ndim = inputs.ndim
if ndim > 4:
raise ValueError("dnn_batch_normalization_train currently supports "
"up to 4-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
(gamma.ndim, beta.ndim, ndim))
if epsilon < 1e-5:
raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
if ndim < 4:
inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
beta = theano.tensor.shape_padright(beta, 4 - ndim)
batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon)
result = tuple(batchnorm_op(inputs, gamma, beta))
if ndim < 4:
result = tuple(theano.tensor.flatten(r, ndim) for r in result)
return result
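# A minimal usage sketch (assumes a working CUDA device with cuDNN v5;
# with mode='spatial', gamma and beta hold one entry per channel):
#
#     import theano
#     import theano.tensor as T
#     from theano.sandbox.cuda.dnn import dnn_batch_normalization_train
#
#     x, gamma, beta = T.tensor4('x'), T.tensor4('gamma'), T.tensor4('beta')
#     out, mean, invstd = dnn_batch_normalization_train(
#         x, gamma, beta, mode='spatial')
#     f = theano.function([x, gamma, beta], [out, mean, invstd])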
def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
mode='per-activation', epsilon=1e-4):
"""
Performs batch normalization of the given inputs, using the given mean and
variance.
Parameters
----------
mode : {'per-activation', 'spatial'}
Whether to normalize per activation or share normalization factors
across spatial dimensions (i.e., all dimensions past the second).
gamma : tensor
Scale factors. Must match the dimensionality of `inputs`, but have
sizes of `1` for all axes normalized over (i.e., in the first dimension
for ``mode='per-activation'``, and additionally in all dimensions past
the second for ``mode='spatial'``).
beta : tensor
Biases. Must match the tensor layout of `gamma`.
mean : tensor
Means. Usually these are running averages computed during training.
Must match the tensor layout of `gamma`.
var : tensor
Variances. Usually these are running averages computed during training.
Must match the tensor layout of `gamma`.
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
Returns
-------
out : tensor
Batch-normalized inputs.
Notes
-----
For 4d tensors, the returned value is equivalent to:
>>> axes = (0,) if mode == 'per-activation' else (0, 2, 3)
>>> gamma, beta, mean, var = (T.addbroadcast(t, *axes)
... for t in (gamma, beta, mean, var))
>>> out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
"""
ndim = inputs.ndim
if ndim > 4:
raise ValueError("dnn_batch_normalization_test currently supports "
"up to 4-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
(gamma.ndim, beta.ndim, ndim))
if mean.ndim != ndim or var.ndim != ndim:
raise ValueError("mean and var must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
(mean.ndim, var.ndim, ndim))
if epsilon < 1e-5:
raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
if ndim < 4:
inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
beta = theano.tensor.shape_padright(beta, 4 - ndim)
mean = theano.tensor.shape_padright(mean, 4 - ndim)
var = theano.tensor.shape_padright(var, 4 - ndim)
batchnorm_op = GpuDnnBatchNormInference(mode=mode, epsilon=epsilon)
result = batchnorm_op(inputs, gamma, beta, mean, var)
if ndim < 4:
result = theano.tensor.flatten(result, ndim)
return result
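# Usage sketch mirroring the training helper (hedged; mean and var would
# typically be running averages collected during training):
#
#     mean, var = T.tensor4('mean'), T.tensor4('var')
#     out = dnn_batch_normalization_test(x, gamma, beta, mean, var,
#                                        mode='spatial')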
# Indentation kept for history
if True:
# @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@@ -715,6 +715,119 @@ class test_DnnSoftMax(test_nnet.test_SoftMax):
utt.assert_allclose(f(inp), f_ref(inp))
def test_batchnorm_train():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
if cuda.dnn.version() < (5000, 5000):
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
for mode in ('per-activation', 'spatial'):
for vartype in (T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out, x_mean, x_invstd = cuda.dnn.dnn_batch_normalization_train(
x, scale, bias, mode, eps)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
x_mean2 = x.mean(axis=axes, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
scale2 = T.addbroadcast(scale, *axes)
bias2 = T.addbroadcast(bias, *axes)
out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
# compile
f = theano.function([x, scale, bias, dy],
[out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
grads + grads2, mode=mode_with_gpu)
# run
floatX = theano.config.floatX
for data_shape in ((10, 20, 30, 40), (4, 3, 1, 1), (1, 1, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype(floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(floatX)
Scale = numpy.random.randn(*param_shape).astype(floatX)
Bias = numpy.random.randn(*param_shape).astype(floatX)
outputs = f(X, Scale, Bias, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[0 + 3]) # out
utt.assert_allclose(outputs[1], outputs[1 + 3]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 3]) # invstd
# compare gradients
utt.assert_allclose(outputs[6], outputs[6 + 3]) # dx
utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=3e-3) # dscale
utt.assert_allclose(outputs[8], outputs[8 + 3]) # dbias
def test_batchnorm_inference():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
if cuda.dnn.version() < (5000, 5000):
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
for mode in ('per-activation', 'spatial'):
for vartype in (T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias, mean, var = (vartype(n) for n in ('x', 'scale',
'bias', 'mean',
'var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out = cuda.dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var))
out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy})
# compile
f = theano.function([x, scale, bias, mean, var, dy],
[out, out2] + grads + grads2, mode=mode_with_gpu)
# run
floatX = theano.config.floatX
for data_shape in ((10, 20, 30, 40), (4, 3, 1, 1), (1, 1, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype(floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(floatX)
Scale = numpy.random.randn(*param_shape).astype(floatX)
Bias = numpy.random.randn(*param_shape).astype(floatX)
Mean = numpy.random.randn(*param_shape).astype(floatX)
Var = numpy.random.rand(*param_shape).astype(floatX)
outputs = f(X, Scale, Bias, Mean, Var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[1]) # out
# compare gradients
utt.assert_allclose(outputs[2], outputs[2 + 5]) # dx
utt.assert_allclose(outputs[3], outputs[3 + 5]) # dscale
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5]) # dvar
def test_dnn_tag():
"""
Test that if cudnn isn't avail we crash and that if it is avail, we use it.
@@ -301,8 +301,7 @@ def str_diagnostic(expected, value, rtol, atol):
print(" Mean Abs Diff: ", numpy.mean(absdiff), file=ssio)
print(" Median Abs Diff: ", numpy.median(absdiff), file=ssio)
print(" Std Abs Diff: ", numpy.std(absdiff), file=ssio)
-reldiff = numpy.absolute(nv - ov) / (numpy.absolute(nv) +
-numpy.absolute(ov))
+reldiff = numpy.absolute(nv - ov) / numpy.absolute(ov)
print(" Max Rel Diff: ", numpy.max(reldiff), file=ssio)
print(" Mean Rel Diff: ", numpy.mean(reldiff), file=ssio)
print(" Median Rel Diff: ", numpy.median(reldiff), file=ssio)