提交 8b9f7336 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5190 from gvtulder/f-batchnorm-abstract

Abstract Ops for batch normalization
......@@ -10,6 +10,9 @@
.. moduleauthor:: LISA
.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test`. They must be added manually as they do not have the same user interface.
.. autofunction:: theano.tensor.nnet.bn.batch_normalization_train
.. autofunction:: theano.tensor.nnet.bn.batch_normalization_test
.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test`.
.. autofunction:: theano.tensor.nnet.bn.batch_normalization
......@@ -28,19 +28,20 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
assert_conv_shape)
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet import bn
from . import pygpu
from .type import (get_context, gpu_context_type, list_contexts,
GpuArraySharedVariable)
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, gpu_alloc_empty,
empty_like, GpuArrayType)
empty_like, GpuArrayType, HostFromGpu)
from .elemwise import GpuElemwise
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import (gpu_seqopt, register_opt, pool_db, pool_db2,
op_lifter, register_opt2)
op_lifter, register_opt2, register_inplace)
from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
......@@ -1389,13 +1390,13 @@ class GpuDnnPool(DnnBase):
res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
return [res]
def grad(self, inp, grads):
def L_op(self, inp, outputs, grads):
img, ws, stride, pad = inp
grad, = grads
grad = gpu_contiguous(grad)
out = self(img, ws, stride, pad)
out, = outputs
g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
......@@ -1591,10 +1592,10 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
assert x.ndim == 4
return Apply(self, [x], [x.type()])
def grad(self, inp, grads):
def L_op(self, inp, outputs, grads):
x, = inp
g_sm, = grads
sm = self(x)
sm, = outputs
return [GpuDnnSoftmaxGrad(
self.algo,
self.mode
......@@ -1646,48 +1647,131 @@ class GpuDnnBatchNorm(DnnBase):
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
"""
__props__ = ('mode',)
__props__ = ('mode', 'running_averages', 'inplace_running_mean',
'inplace_running_var', 'inplace_output')
def __init__(self, mode='per-activation', running_averages=False,
             inplace_running_mean=False, inplace_running_var=False,
             inplace_output=False):
    """Build the cuDNN batch-normalization training Op.

    Parameters
    ----------
    mode : {'per-activation', 'spatial'}
        Normalization mode forwarded to cuDNN.
    running_averages : bool
        If True, the Op also takes previous running mean/variance as
        inputs and returns their updated values as extra outputs.
    inplace_running_mean, inplace_running_var, inplace_output : bool
        Allow the corresponding output to reuse its input buffer.
    """
    # NOTE(review): the stale pre-merge header line
    # ``def __init__(self, mode='per-activation'):`` above this def was a
    # merge artifact and has been removed.
    DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm.c'],
                     'dnn_batchnorm_op')
    assert (mode in ('per-activation', 'spatial'))
    self.mode = mode
    self.running_averages = running_averages
    self.inplace_output = inplace_output
    self.inplace_running_mean = inplace_running_mean
    self.inplace_running_var = inplace_running_var
    # destroy_map maps output index -> list of input indices it
    # overwrites; running mean/var are inputs 5 and 6 (see make_node)
    # and their updated values are outputs 3 and 4.
    self.destroy_map = {}
    if self.inplace_output:
        self.destroy_map[0] = [0]
    if self.running_averages and self.inplace_running_mean:
        self.destroy_map[3] = [5]
    if self.running_averages and self.inplace_running_var:
        self.destroy_map[4] = [6]
def __setstate__(self, d):
    """Restore pickled state, defaulting attributes added in newer
    versions of this Op so old pickles keep working."""
    self.__dict__.update(d)
    if not hasattr(self, 'running_average_factor'):
        # Compatibility with pickles from a transitional version that
        # stored the factor on the Op instead of as a graph input.
        self.running_average_factor = 0
    if not hasattr(self, 'running_averages'):
        self.running_averages = False
    if not (hasattr(self, 'inplace_running_mean') and
            hasattr(self, 'inplace_running_var') and
            hasattr(self, 'inplace_output')):
        # Pre-inplace pickles: disable all inplace behavior and clear
        # the destroy map (nothing is overwritten).
        self.inplace_running_mean = False
        self.inplace_running_var = False
        self.inplace_output = False
        self.destroy_map = {}
def get_op_params(self):
    """Return the C compile-time ``#define`` pairs for this Op instance.

    Each enabled boolean flag contributes a ``('NAME', '1')`` pair, and
    the cuDNN mode constant is always appended last.
    """
    flag_table = (('INPLACE_OUTPUT', self.inplace_output),
                  ('RUNNING_AVERAGES', self.running_averages),
                  ('INPLACE_RUNNING_MEAN', self.inplace_running_mean),
                  ('INPLACE_RUNNING_VAR', self.inplace_running_var))
    params = [(name, '1') for name, enabled in flag_table if enabled]
    if self.mode == "spatial":
        mode_const = "CUDNN_BATCHNORM_SPATIAL"
    else:
        mode_const = "CUDNN_BATCHNORM_PER_ACTIVATION"
    params.append(('MODE', mode_const))
    return params
def infer_shape(self, node, shape):
    """Output shapes: the normalized output matches x (input 0); every
    other output (mean, invstd, and optional new running mean/var)
    matches the scale parameter (input 1).

    The diff left both the old 3-output return and the generalized one
    in place; only the generalized form is kept, which handles both the
    3-output and 5-output (running averages) variants.
    """
    return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
def make_node(self, x, scale, bias, epsilon=1e-4,
              running_average_factor=0.1,
              running_mean=None, running_var=None):
    """Build the Apply node.

    Inputs are ``[x, scale, bias, epsilon, running_average_factor]``
    plus, when running averages are enabled, ``running_mean`` and
    ``running_var``.  Outputs are ``[out, mean, invstd]`` plus the two
    updated running averages in that case.

    Note: the stale pre-merge ``make_node``/``grad`` fragments that the
    diff interleaved here (duplicate asserts, the old 3-output return
    and the removed ``grad`` header) have been dropped.
    """
    assert x.ndim == scale.ndim == bias.ndim
    assert x.ndim in (4, 5)
    # running averages must be all-or-nothing and consistent with the
    # Op's configuration, since the output count depends on it.
    assert self.running_averages == (running_mean is not None) == (running_var is not None)
    assert (running_mean is None or running_mean.ndim == x.ndim)
    assert (running_var is None or running_var.ndim == x.ndim)
    ctx_name = infer_context_name(x, scale, bias)
    x = as_gpuarray_variable(x, ctx_name)
    scale = as_gpuarray_variable(scale, ctx_name)
    bias = as_gpuarray_variable(bias, ctx_name)
    # cuDNN takes both scalars as double-precision host scalars.
    epsilon = as_scalar(epsilon).astype('float64')
    running_average_factor = as_scalar(running_average_factor).astype('float64')
    inputs = [x, scale, bias, epsilon, running_average_factor]
    output_types = [x.type(), scale.type(), scale.type()]
    if running_mean is not None and running_var is not None:
        inputs.append(as_gpuarray_variable(running_mean, ctx_name))
        inputs.append(as_gpuarray_variable(running_var, ctx_name))
        output_types.append(scale.type())
        output_types.append(scale.type())
    return Apply(self, inputs, output_types)
def L_op(self, inputs, outputs, grads):
    """Gradient using the node's own outputs for mean/invstd.

    The diff left the old ``grad``-style lines (recomputing the Op and
    returning early) above the new body, which made the new code dead;
    only the ``L_op`` version is kept.
    """
    x, scale, bias, epsilon, running_average_factor = inputs[:5]
    dy = grads[0]
    # Reuse the forward node's mean/invstd outputs instead of
    # re-applying the Op (that is the point of L_op over grad).
    _, x_mean, x_invstd = outputs[:3]
    disconnected_outputs = [
        DisconnectedType()(),  # epsilon
        DisconnectedType()()]  # running_average_factor
    # Optional running_mean and running_var inputs carry no gradient.
    for i in range(5, len(inputs)):
        disconnected_outputs.append(DisconnectedType()())
    return GpuDnnBatchNormGrad(self.mode)(
        x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs
def connection_pattern(self, node):
    """Specify which inputs are connected (differentiable) to which
    outputs: epsilon and running_average_factor are disconnected, and
    each optional running input feeds only its own updated output.

    (The stale pre-merge 4x3 pattern the diff interleaved here has been
    removed; the typo "Specificy" is also fixed.)
    """
    patterns = [[True, True, True],     # x
                [True, True, True],     # scale
                [True, True, True],     # bias
                [False, False, False],  # epsilon
                [False, False, False]]  # running_average_factor
    # Optional running_mean and running_var (inputs 5, 6) are only
    # connected to their own new values (outputs 3, 4); x also feeds
    # the new running statistics through the batch mean/variance.
    for i in range(5, len(node.inputs)):
        patterns[0].append(True)
        for pattern in patterns[1:]:
            pattern.append(False)
        patterns.append([False] * (3 + i - 5) + [True])
    return patterns
class GpuDnnBatchNormInference(DnnBase):
......@@ -1706,17 +1790,27 @@ class GpuDnnBatchNormInference(DnnBase):
value is 1e-5 (imposed by cuDNN).
"""
__props__ = ('mode',)
__props__ = ('mode', 'inplace')
def __init__(self, mode='per-activation', inplace=False):
    """Build the cuDNN batch-normalization inference Op.

    Parameters
    ----------
    mode : {'per-activation', 'spatial'}
        Normalization mode forwarded to cuDNN.
    inplace : bool
        If True, the output reuses the input buffer.
    """
    # NOTE(review): the stale pre-merge header line
    # ``def __init__(self, mode='per-activation'):`` above this def was
    # a merge artifact and has been removed.
    DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'],
                     'dnn_batchnorm_op')
    assert (mode in ('per-activation', 'spatial'))
    self.mode = mode
    self.inplace = inplace
    if self.inplace:
        # Output 0 overwrites input 0 when running in-place.
        self.destroy_map = {0: [0]}
def __setstate__(self, d):
    """Restore pickled state; older pickles predate the ``inplace``
    attribute, so default it to False for backward compatibility."""
    self.__dict__.update(d)
    if not hasattr(self, 'inplace'):
        self.inplace = False
def get_op_params(self):
params = []
if self.inplace:
params.append(('INPLACE_OUTPUT', '1'))
params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL"
if self.mode == "spatial"
else "CUDNN_BATCHNORM_PER_ACTIVATION")))
......@@ -2404,7 +2498,8 @@ class RNNBlock(object):
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4):
epsilon=1e-4, running_average_factor=0.1,
running_mean=None, running_var=None):
"""
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
......@@ -2424,6 +2519,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns
-------
......@@ -2431,8 +2543,14 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Batch-normalized inputs.
mean : tensor
Means of `inputs` across the normalization axes.
stdinv : tensor
invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes
-----
......@@ -2444,31 +2562,77 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
stdinv = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
out = (inputs - mean) * gamma * stdinv + beta
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4).
"""
ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
(gamma.ndim, beta.ndim, ndim))
if (running_mean is None) != (running_var is None):
raise ValueError("running_mean and running_var must either both be "
"given or both be None")
if running_mean is not None and running_mean.ndim != ndim:
raise ValueError("running_mean must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_mean.ndim, ndim))
if running_var is not None and running_var.ndim != ndim:
raise ValueError("running_var must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_var.ndim, ndim))
if epsilon < 1e-5:
raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
running_averages = (running_mean is not None and running_var is not None)
if ndim < 4:
inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
beta = theano.tensor.shape_padright(beta, 4 - ndim)
batchnorm_op = GpuDnnBatchNorm(mode=mode)
result = tuple(batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), epsilon=epsilon))
if running_averages:
running_mean = theano.tensor.shape_padright(running_mean, 4 - ndim)
running_var = theano.tensor.shape_padright(running_var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
params_shape = gamma.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
if running_averages:
running_mean = theano.tensor.flatten(running_mean, 5)
running_var = theano.tensor.flatten(running_var, 5)
batchnorm_op = GpuDnnBatchNorm(mode=mode, running_averages=running_averages)
if running_averages:
out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), epsilon=epsilon,
running_average_factor=running_average_factor,
running_mean=gpu_contiguous(running_mean),
running_var=gpu_contiguous(running_var))
if new_running_mean.broadcastable != running_mean.broadcastable:
new_running_mean = tensor.patternbroadcast(new_running_mean, running_mean.broadcastable)
if new_running_var.broadcastable != running_var.broadcastable:
new_running_var = tensor.patternbroadcast(new_running_var, running_var.broadcastable)
result = (out, mean, invstd, new_running_mean, new_running_var)
else:
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), epsilon=epsilon)
if ndim < 4:
result = tuple(theano.tensor.flatten(r, ndim) for r in result)
elif ndim > 5:
result = (theano.tensor.reshape(result[0], inputs_shape),) + tuple(
theano.tensor.reshape(r, params_shape) for r in result[1:])
return result
......@@ -2521,9 +2685,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
......@@ -2541,12 +2702,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta = theano.tensor.shape_padright(beta, 4 - ndim)
mean = theano.tensor.shape_padright(mean, 4 - ndim)
var = theano.tensor.shape_padright(var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
mean = theano.tensor.flatten(mean, 5)
var = theano.tensor.flatten(var, 5)
batchnorm_op = GpuDnnBatchNormInference(mode=mode)
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), gpu_contiguous(mean),
gpu_contiguous(var), epsilon=epsilon)
if ndim < 4:
result = theano.tensor.flatten(result, ndim)
elif ndim > 5:
result = theano.tensor.reshape(result, inputs_shape)
return result
......@@ -2928,3 +3098,197 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
out = GpuDnnSoftmaxGrad('accurate', 'instance')(
gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
return [out.dimshuffle(0, 2)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrain])
@register_opt2([bn.AbstractBatchNormTrain], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
    """Lift AbstractBatchNormTrain to the cuDNN training implementation,
    or return None when cuDNN cannot handle this configuration."""
    x, scale, bias, epsilon, running_average_factor = inputs[:5]
    running_mean = inputs[5] if len(inputs) > 5 else None
    running_var = inputs[6] if len(inputs) > 6 else None

    # Map the normalized axes onto a cuDNN mode, if possible.
    axes = tuple(op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None

    # cuDNN needs compile-time constants for both scalars; bail out if
    # either is symbolic, or if epsilon is below cuDNN's minimum.
    try:
        eps = theano.tensor.get_scalar_constant_value(epsilon)
        running_average_factor = theano.tensor.get_scalar_constant_value(
            running_average_factor)
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None

    ctx = infer_context_name(*inputs)
    if not dnn_available(ctx):
        # TODO should this raise_no_cudnn?
        return None

    call_args = [as_gpuarray_variable(x, context_name=ctx),
                 as_gpuarray_variable(scale, context_name=ctx),
                 as_gpuarray_variable(bias, context_name=ctx),
                 mode, eps, running_average_factor]
    if running_mean is not None and running_var is not None:
        call_args.append(running_mean)
        call_args.append(running_var)
    return list(dnn_batch_normalization_train(*call_args))
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_output(node):
    """Replace a GpuDnnBatchNorm with an equivalent Op whose main output
    overwrites its input buffer."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm) or op.inplace_output:
        return
    new_op = GpuDnnBatchNorm(mode=op.mode,
                             running_averages=op.running_averages,
                             inplace_running_mean=op.inplace_running_mean,
                             inplace_running_var=op.inplace_running_var,
                             inplace_output=True)
    return new_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_mean(node):
    """Make the running-mean update of a GpuDnnBatchNorm in-place."""
    op = node.op
    if (not isinstance(op, GpuDnnBatchNorm) or not op.running_averages or
            op.inplace_running_mean):
        return
    new_op = GpuDnnBatchNorm(mode=op.mode,
                             running_averages=op.running_averages,
                             inplace_running_mean=True,
                             inplace_running_var=op.inplace_running_var,
                             inplace_output=op.inplace_output)
    return new_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_var(node):
    """Make the running-variance update of a GpuDnnBatchNorm in-place."""
    op = node.op
    if (not isinstance(op, GpuDnnBatchNorm) or not op.running_averages or
            op.inplace_running_var):
        return
    new_op = GpuDnnBatchNorm(mode=op.mode,
                             running_averages=op.running_averages,
                             inplace_running_mean=op.inplace_running_mean,
                             inplace_running_var=True,
                             inplace_output=op.inplace_output)
    return new_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_batch_norm_inference_inplace(node):
    """Make the output of GpuDnnBatchNormInference overwrite its input."""
    op = node.op
    if isinstance(op, GpuDnnBatchNormInference) and not op.inplace:
        replacement = GpuDnnBatchNormInference(mode=op.mode, inplace=True)
        return [replacement(*node.inputs)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrainGrad])
@register_opt2([bn.AbstractBatchNormTrainGrad], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
    # Lift AbstractBatchNormTrainGrad to GpuDnnBatchNormGrad when the
    # data already lives on (or comes from) the GPU and cuDNN supports
    # the normalized axes.  Returns None to decline the rewrite.
    x, dy, scale, x_mean, x_invstd, epsilon = inputs
    # input on gpu? TODO what about the output?
    x_on_gpu = (isinstance(x.type, GpuArrayType) or
                (x.owner and isinstance(x.owner.op, HostFromGpu)))
    dy_on_gpu = (isinstance(dy.type, GpuArrayType) or
                 (dy.owner and isinstance(dy.owner.op, HostFromGpu)))
    if not (x_on_gpu or dy_on_gpu):
        return None
    # convert axes to cuDNN mode
    axes = tuple(op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None
    ndim = x.ndim
    if ndim < 4:
        # cuDNN requires 4d or 5d tensors: pad trailing broadcast dims.
        x = theano.tensor.shape_padright(x, 4 - ndim)
        dy = theano.tensor.shape_padright(dy, 4 - ndim)
        scale = theano.tensor.shape_padright(scale, 4 - ndim)
        x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim)
        x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim)
    elif ndim > 5:
        # Collapse trailing dims into the 5th; remember the original
        # shapes so the gradients can be reshaped back below.
        x_shape = x.shape
        params_shape = scale.shape
        x = theano.tensor.flatten(x, 5)
        dy = theano.tensor.flatten(dy, 5)
        scale = theano.tensor.flatten(scale, 5)
        x_mean = theano.tensor.flatten(x_mean, 5)
        x_invstd = theano.tensor.flatten(x_invstd, 5)
    # epsilon must be a compile-time constant >= cuDNN's minimum.
    try:
        eps = theano.tensor.get_scalar_constant_value(epsilon)
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None
    ctx = infer_context_name(*inputs)
    if not dnn_available(ctx):
        # TODO should this raise_no_cudnn?
        return None
    x = as_gpuarray_variable(x, context_name=ctx)
    dy = as_gpuarray_variable(dy, context_name=ctx)
    scale = as_gpuarray_variable(scale, context_name=ctx)
    x_mean = as_gpuarray_variable(x_mean, context_name=ctx)
    x_invstd = as_gpuarray_variable(x_invstd, context_name=ctx)
    g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
        GpuDnnBatchNormGrad(mode)(x, dy, scale, x_mean, x_invstd, eps)
    # Undo the dimension padding/flattening applied above.
    if ndim < 4:
        g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim)
        g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim)
        g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim)
    elif ndim > 5:
        g_wrt_inputs = theano.tensor.reshape(g_wrt_inputs, x_shape)
        g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape)
        g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape)
    return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormInference])
@register_opt2([bn.AbstractBatchNormInference], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_inference_cudnn(op, ctx_name, inputs, outputs):
    """Lift AbstractBatchNormInference to the cuDNN inference path, or
    return None when the configuration is unsupported."""
    x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs

    # Map the normalized axes onto a cuDNN mode, if possible.
    axes = tuple(op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None

    # epsilon must be a compile-time constant >= cuDNN's minimum.
    try:
        eps = theano.tensor.get_scalar_constant_value(epsilon)
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None

    ctx = infer_context_name(*inputs)
    if not dnn_available(ctx):
        # TODO should this raise_no_cudnn?
        return None

    tensors = [as_gpuarray_variable(v, context_name=ctx)
               for v in (x, scale, bias, estimated_mean, estimated_variance)]
    out = dnn_batch_normalization_test(tensors[0], tensors[1], tensors[2],
                                       tensors[3], tensors[4], mode, eps)
    return [out]
......@@ -2,8 +2,19 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) {
npy_float64 running_average_factor,
#ifdef RUNNING_AVERAGES
PyGpuArrayObject *in_running_mean,
PyGpuArrayObject *in_running_var,
#endif
PyGpuArrayObject **outp,
PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd,
#ifdef RUNNING_AVERAGES
PyGpuArrayObject **out_running_mean,
PyGpuArrayObject **out_running_var,
#endif
cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
......@@ -16,8 +27,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
#endif
if (theano_prep_output(x_mean, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
if (theano_prep_output(x_invstd, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
......@@ -26,6 +43,31 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
if (c_set_tensorNd(*outp, bn_output) != 0)
return 1;
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(out_running_mean);
PyGpuArrayObject *running_mean = in_running_mean;
Py_INCREF(running_mean);
#else
PyGpuArrayObject *running_mean = *out_running_mean;
running_mean = theano_try_copy(running_mean, in_running_mean);
if (running_mean == NULL) {
return 1;
}
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(out_running_var);
PyGpuArrayObject *running_var = in_running_var;
Py_INCREF(running_var);
#else
PyGpuArrayObject *running_var = *out_running_var;
running_var = theano_try_copy(running_var, in_running_var);
if (running_var == NULL) {
return 1;
}
#endif
#endif
{
const float falpha = 1.;
const float fbeta = 0.;
......@@ -52,9 +94,15 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
bn_params,
PyGpuArray_DEV_DATA(scale),
PyGpuArray_DEV_DATA(bias),
#ifdef RUNNING_AVERAGES
running_average_factor,
PyGpuArray_DEV_DATA(running_mean),
PyGpuArray_DEV_DATA(running_var),
#else
0,
NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused
#endif
epsilon,
PyGpuArray_DEV_DATA(*x_mean),
PyGpuArray_DEV_DATA(*x_invstd)
......@@ -64,6 +112,10 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
cudnnGetErrorString(err));
return 1;
}
#ifdef RUNNING_AVERAGES
*out_running_mean = running_mean;
*out_running_var = running_var;
#endif
}
return 0;
}
......@@ -16,8 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1;
}
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1;
#endif
if (c_set_tensorNd(*outp, bn_output) != 0)
return 1;
......
from __future__ import absolute_import, print_function, division
import logging
from collections import OrderedDict
from nose.plugins.skip import SkipTest
from nose_parameterized import parameterized
......@@ -13,6 +14,7 @@ import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.nnet import bn
from .. import dnn
from ..basic_ops import GpuAllocEmpty
......@@ -1379,36 +1381,77 @@ def test_dnn_batchnorm_train():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out, x_mean, x_invstd = dnn.dnn_batch_normalization_train(
x, scale, bias, mode, eps)
running_average_factor = 0.3
# forward pass, direct interface
out_gpu, x_mean_gpu, x_invstd_gpu, \
out_running_mean_gpu, out_running_var_gpu = \
dnn.dnn_batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# forward pass, abstract interface
out_abstract, x_mean_abstract, x_invstd_abstract, \
out_running_mean_abstract, out_running_var_abstract = \
bn.batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
x_mean2 = x.mean(axis=axes, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
scale2 = T.addbroadcast(scale, *axes)
bias2 = T.addbroadcast(bias, *axes)
out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
x_mean_ref = x.mean(axis=axes, keepdims=True)
x_var_ref = x.var(axis=axes, keepdims=True)
x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
scale_ref = T.addbroadcast(scale, *axes)
bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
out_running_var_ref = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var_ref * running_average_factor
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
grads_ref = T.grad(None, wrt=[x, scale, bias], known_grads={out_ref: dy})
# compile
f = theano.function([x, scale, bias, dy],
[out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
grads + grads2, mode=mode_with_gpu)
f_gpu = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_gpu, x_mean_gpu, x_invstd_gpu,
out_running_mean_gpu, out_running_var_gpu] + grads_gpu,
mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract,
out_running_mean_abstract, out_running_var_abstract] +
grads_abstract,
mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_ref, x_mean_ref, x_invstd_ref,
out_running_mean_ref, out_running_var_ref] + grads_ref,
mode=mode_without_gpu)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
in f_abstract.maker.fgraph.toposort()])
assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
......@@ -1416,15 +1459,124 @@ def test_dnn_batchnorm_train():
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Dy)
Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[0 + 3]) # out
utt.assert_allclose(outputs[1], outputs[1 + 3]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 3]) # invstd
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_gpu[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_gpu[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
# compare gradients
utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4) # dx
utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs[8], outputs[8 + 3]) # dbias
utt.assert_allclose(outputs_gpu[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_gpu[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_gpu[7], outputs_ref[7]) # dbias
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias
def test_dnn_batchnorm_train_without_running_averages():
    """Compile and run batch_normalization_train without running averages.

    Builds the forward and backward graphs through both the direct cuDNN
    interface (``dnn.dnn_batch_normalization_train``) and the abstract
    interface (``bn.batch_normalization_train``), checks that the abstract
    Ops get replaced by their cuDNN counterparts, and runs both compiled
    functions once on random data.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # forward pass
    out_gpu, x_mean_gpu, x_invstd_gpu = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
    # BUG FIX: the abstract gradients must be taken through the abstract
    # output. The previous code used `known_grads={out_gpu: dy}` here, which
    # made this graph a duplicate of grads_gpu and left the abstract
    # backward pass untested.
    grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
    # compile
    f_gpu = theano.function([x, scale, bias, dy],
                            [out_gpu, x_mean_gpu, x_invstd_gpu] +
                            grads_gpu,
                            mode=mode_with_gpu)
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract, x_invstd_abstract] +
                                 grads_abstract,
                                 mode=mode_with_gpu)
    # check if the abstract Ops have been replaced
    assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                for n in f_abstract.maker.fgraph.toposort()])
    assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                for n in f_abstract.maker.fgraph.toposort()])
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f_abstract.maker.fgraph.toposort()])
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_gpu(X, Scale, Bias, Dy)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
    """Check that inplace_running_mean, inplace_running_var and
    inplace_output all get enabled by the inplace optimizations when the
    running averages are shared variables updated through `updates`."""
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias')
    shape_of_data = (5, 10, 30, 25)
    shape_of_params = (1, 10, 30, 25)
    bcast = (True, False, False, False)

    def make_running_average():
        # shared GPU tensor holding one value per activation
        return gpuarray_shared_constructor(
            numpy.random.randn(*shape_of_params).astype(theano.config.floatX),
            broadcastable=bcast)

    running_mean = make_running_average()
    running_var = make_running_average()
    # forward pass with running-average tracking
    (out, x_mean, x_invstd,
     new_running_mean, new_running_var) = dnn.dnn_batch_normalization_train(
        x, scale, bias, 'per-activation',
        epsilon=5e-3, running_average_factor=0.3,
        running_mean=running_mean, running_var=running_var)
    # write the new running averages back into the shared variables
    updates = OrderedDict([(running_mean, new_running_mean),
                           (running_var, new_running_var)])
    f = theano.function([x, scale, bias],
                        [out, x_mean, x_invstd],
                        updates=updates,
                        mode=mode_with_gpu)
    # exactly one batch-norm node, and all its inplace flags must be set
    bn_nodes = [node for node in f.maker.fgraph.toposort()
                if isinstance(node.op, dnn.GpuDnnBatchNorm)]
    assert len(bn_nodes) == 1
    assert bn_nodes[0].op.inplace_running_mean
    assert bn_nodes[0].op.inplace_running_var
    assert bn_nodes[0].op.inplace_output
    # run once on random data
    data_val = 4 + 3 * numpy.random.randn(*shape_of_data).astype(theano.config.floatX)
    scale_val = numpy.random.randn(*shape_of_params).astype(theano.config.floatX)
    bias_val = numpy.random.randn(*shape_of_params).astype(theano.config.floatX)
    f(data_val, scale_val, bias_val)
def test_batchnorm_inference():
......@@ -1432,34 +1584,51 @@ def test_batchnorm_inference():
raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias, mean, var = (vartype(n)
for n in ('x', 'scale', 'bias', 'mean', 'var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# forward pass, direct interface
out_gpu = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# forward pass, abstract interface
out_abstract = bn.batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var))
out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
scale_ref, bias_ref, mean_ref, var_ref = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var))
out_ref = (x - mean_ref) * (scale_ref / T.sqrt(var_ref + eps)) + bias_ref
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy})
grads_gpu = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_abstract: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy})
grads_ref = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_ref: dy})
# compile
f = theano.function([x, scale, bias, mean, var, dy],
[out, out2] + grads + grads2, mode=mode_with_gpu)
f_gpu = theano.function([x, scale, bias, mean, var, dy],
[out_gpu] + grads_gpu, mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, mean, var, dy],
[out_abstract] + grads_abstract, mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, mean, var, dy],
[out_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
......@@ -1469,15 +1638,106 @@ def test_batchnorm_inference():
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Mean, Var, Dy)
outputs_gpu = f_gpu(X, Scale, Bias, Mean, Var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Mean, Var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Mean, Var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[1]) # out
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
# compare gradients
utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5) # dx
utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5) # dscale
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_gpu[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_gpu[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_gpu[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_gpu[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_abstract[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_abstract[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_abstract[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
def test_batchnorm_inference_inplace():
    """Check that GpuDnnBatchNormInference is turned inplace by the
    inplace optimization and that the compiled function runs."""
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias, mean, var = (T.tensor4(n) for n in ('x', 'scale', 'bias', 'mean', 'var'))
    shp_data = (5, 10, 30, 25)
    shp_param = (1, 10, 30, 25)
    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function([x, scale, bias, mean, var], [out], mode=mode_with_gpu)
    # exactly one inference node, and it must have inplace enabled
    matches = [node for node in f.maker.fgraph.toposort()
               if isinstance(node.op, dnn.GpuDnnBatchNormInference)]
    assert len(matches) == 1
    assert matches[0].op.inplace
    # run once on random data (variance drawn non-negative via rand)
    data_val = 4 + 3 * numpy.random.randn(*shp_data).astype(theano.config.floatX)
    scale_val = numpy.random.randn(*shp_param).astype(theano.config.floatX)
    bias_val = numpy.random.randn(*shp_param).astype(theano.config.floatX)
    mean_val = numpy.random.randn(*shp_param).astype(theano.config.floatX)
    var_val = numpy.random.rand(*shp_param).astype(theano.config.floatX)
    f(data_val, scale_val, bias_val, mean_val, var_val)
def test_dnn_batchnorm_valid_and_invalid_axes():
    """Check cuDNN lifting of the abstract batch-norm Ops per axes list.

    For axes lists cuDNN supports ('per-activation' and 'spatial'), the
    abstract Ops must be replaced by the GpuDnnBatchNorm* Ops; for an
    unsupported axes list they must still be replaced (by a non-cuDNN
    implementation), just not by the cuDNN Ops.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        x, scale, bias, mean, var, dy = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var', 'dy'))
        ndim = x.ndim
        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = bn.batch_normalization_train(
                x, scale, bias, axes)
            out_test = bn.batch_normalization_test(
                x, scale, bias, mean, var, axes)
            # backward pass
            # (fixed: `dy` was redundantly re-created here although it is
            # already built together with the other variables above)
            grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)
            if axes in valid_axes_lists:
                # check if the abstract Ops have been replaced by the cuDNN Ops
                assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
                            in f.maker.fgraph.toposort()])
                assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
            else:
                # check if the abstract Ops have been replaced, but not by the cuDNN Ops
                assert not any([isinstance(n.op, (dnn.GpuDnnBatchNorm,
                                                  dnn.GpuDnnBatchNormGrad,
                                                  bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
def test_dnn_rnn_gru():
......
......@@ -12,7 +12,7 @@ import warnings
import theano
from theano.compat import get_unbound_function
from theano.compile import optdb
from theano.gof import EquilibriumDB, SequenceDB
from theano.gof import EquilibriumDB, SequenceDB, TopoOptimizer
from theano.gof.cmodule import get_lib_extension
from theano.gof.compilelock import get_lock, release_lock
from theano import config
......@@ -40,6 +40,17 @@ def register_opt(*tags, **kwargs):
return f
def register_inplace(*tags, **kwargs):
    """Decorator factory registering a local optimizer as an inplace
    optimization in the global optimizer database.

    The optimizer is wrapped in a TopoOptimizer (with the standard
    inplace-failure warning callback) and registered at position 60 with
    the 'fast_run', 'inplace' and 'gpu' tags plus any extra `tags`.

    Parameters
    ----------
    *tags
        Additional tags passed to ``optdb.register``.
    name : str, optional
        Registration name; defaults to the decorated optimizer's
        ``__name__``.
    """
    def f(local_opt):
        # BUG FIX: the previous expression `(kwargs and kwargs.pop('name'))`
        # raised KeyError whenever kwargs was non-empty but did not contain
        # 'name'. Popping with a default handles all cases.
        name = kwargs.pop('name', None) or local_opt.__name__
        optdb.register(
            name, TopoOptimizer(
                local_opt, failure_callback=TopoOptimizer.warn_inplace),
            60, 'fast_run', 'inplace', 'gpu', *tags)
        return local_opt
    return f
_logger_name = 'theano.sandbox.cuda'
_logger = logging.getLogger(_logger_name)
......
......@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
assert_conv_shape)
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet import bn
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp, dnn_available
......@@ -33,7 +34,7 @@ from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
from theano.sandbox.cuda.nnet import GpuSoftmax
from theano.sandbox.cuda.opt_util import (alpha_merge, output_merge,
pad_dims, unpad_dims)
from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda import gpu_seqopt, register_opt, register_inplace
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
......@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase):
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
"""
__props__ = ('mode', 'epsilon')
......@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s;
result = """
cudnnStatus_t err%(name)s;
cudnnBatchNormMode_t mode%(name)s = %(mode)s;
double exponentialAverageFactor%(name)s = %(exp_avg_factor)f;
double epsilon%(name)s = %(epsilon)e;
""" % dict(name=name,
mode=mode,
exp_avg_factor=0, # deliberately unused
epsilon=self.epsilon)
return result
def c_code_cache_version(self):
return (3, version())
return (4, version())
class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
......@@ -2422,8 +2438,26 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
Note: scale, bias, mean and variance must follow the same tensor layout!
"""
__props__ = ('mode', 'epsilon', 'inplace')
tensor_descs = ['bn_input', 'bn_output', 'bn_params']
def __init__(self, mode='per-activation', epsilon=1e-4, inplace=False):
super(GpuDnnBatchNormInference, self).__init__(mode=mode, epsilon=epsilon)
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'inplace'):
self.inplace = False
def get_op_params(self):
params = []
if self.inplace:
params.append(('INPLACE_OUTPUT', '1'))
return params
def infer_shape(self, node, shape):
# output shape equals shape of x
return [shape[0]]
......@@ -2460,10 +2494,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0)
}
// build and prepare the output variable
#ifdef INPLACE_OUTPUT
Py_XDECREF(%(outp)s);
%(outp)s = %(inp)s;
Py_INCREF(%(outp)s);
#else
if (CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp)s)) != 0)
{
%(fail)s
}
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0)
......@@ -2494,6 +2534,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference(
""" % dict(name=name, inp=inp, scale=scale, bias=bias, est_mean=est_mean,
est_var=est_var, outp=outp, fail=sub['fail'])
# add params
define_macros, undef_macros = self.get_c_macros(node, name, check_input=False)
result = """
%(define_macros)s
{
%(code)s
}
%(undef_macros)s
""" % dict(code=result, define_macros=define_macros, undef_macros=undef_macros)
return result
def grad(self, inputs, grads):
......@@ -2537,28 +2587,98 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase):
Note: scale and bias must follow the same tensor layout!
"""
__props__ = ('mode', 'epsilon', 'running_average_factor',
'running_averages', 'inplace_running_mean',
'inplace_running_var', 'inplace_output')
tensor_descs = ['bn_input', 'bn_output', 'bn_params']
def __init__(self, mode='per-activation', epsilon=1e-4,
running_average_factor=0,
running_averages=False, inplace_running_mean=False,
inplace_running_var=False, inplace_output=False):
super(GpuDnnBatchNorm, self).__init__(mode=mode, epsilon=epsilon)
self.running_average_factor = running_average_factor
self.running_averages = running_averages
self.inplace_output = inplace_output
self.inplace_running_mean = inplace_running_mean
self.inplace_running_var = inplace_running_var
self.destroy_map = {}
if self.inplace_output:
self.destroy_map[0] = [0]
if self.running_averages and self.inplace_running_mean:
self.destroy_map[3] = [3]
if self.running_averages and self.inplace_running_var:
self.destroy_map[4] = [4]
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'running_average_factor'):
self.running_average_factor = 0
if not hasattr(self, 'running_averages'):
self.running_averages = False
if not (hasattr(self, 'inplace_running_mean') and
hasattr(self, 'inplace_running_var') and
hasattr(self, 'inplace_output')):
self.inplace_running_mean = False
self.inplace_running_var = False
self.inplace_output = False
self.destroy_map = {}
def get_op_params(self):
params = []
if self.inplace_output:
params.append(('INPLACE_OUTPUT', '1'))
if self.running_averages:
params.append(('RUNNING_AVERAGES', '1'))
if self.inplace_running_mean:
params.append(('INPLACE_RUNNING_MEAN', '1'))
if self.inplace_running_var:
params.append(('INPLACE_RUNNING_VAR', '1'))
return params
def infer_shape(self, node, shape):
# first output equals shape of x
# second and third output equal shape of scale
return [shape[0], shape[1], shape[1]]
# other outputs equal shape of scale
return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
def make_node(self, x, scale, bias):
def make_node(self, x, scale, bias,
running_mean=None, running_var=None):
assert x.ndim == scale.ndim == bias.ndim
assert x.ndim in (4, 5)
assert self.running_averages == (running_mean is not None) == (running_var is not None)
assert (running_mean is None or running_mean.ndim == x.ndim)
assert (running_var is None or running_var.ndim == x.ndim)
x = as_cuda_ndarray_variable(x)
scale = as_cuda_ndarray_variable(scale)
bias = as_cuda_ndarray_variable(bias)
assert x.ndim == scale.ndim == bias.ndim
assert x.ndim in (4, 5)
return Apply(self, [x, scale, bias], [x.type(), scale.type(), scale.type()])
inputs = [x, scale, bias]
output_types = [x.type(), scale.type(), scale.type()]
if running_mean is not None and running_var is not None:
inputs.append(as_cuda_ndarray_variable(running_mean))
inputs.append(as_cuda_ndarray_variable(running_var))
output_types.append(scale.type())
output_types.append(scale.type())
return Apply(self, inputs, output_types)
def c_code(self, node, name, inputs, outputs, sub):
# super call to prepare common configuration
result = super(GpuDnnBatchNorm, self).c_code(node, name, inputs, outputs, sub)
# give sensible names to inputs and outputs
inp, scale, bias = inputs
outp, x_mean, x_invstd = outputs
inp, scale, bias = inputs[:3]
outp, x_mean, x_invstd = outputs[:3]
if self.running_averages:
running_average_factor = self.running_average_factor
in_running_mean = inputs[3]
in_running_var = inputs[4]
out_running_mean = outputs[3]
out_running_var = outputs[4]
else:
running_average_factor = 0.
in_running_mean = 'NULL'
in_running_var = 'NULL'
out_running_mean = 'NULL'
out_running_var = 'NULL'
# set input tensor descriptors from input tensors
result += """
......@@ -2579,6 +2699,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp
{
%(fail)s
}
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(%(out_running_mean)s);
CudaNdarray *running_mean%(name)s = %(in_running_mean)s;
Py_INCREF(running_mean%(name)s);
#else
if ((CudaNdarray_prep_output(&%(out_running_mean)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(%(out_running_mean)s, %(in_running_mean)s) != 0))
{
%(fail)s
}
CudaNdarray *running_mean%(name)s = %(out_running_mean)s;
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(%(out_running_var)s);
CudaNdarray *running_var%(name)s = %(in_running_var)s;
Py_INCREF(running_var%(name)s);
#else
if ((CudaNdarray_prep_output(&%(out_running_var)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(%(out_running_var)s, %(in_running_var)s) != 0))
{
%(fail)s
}
CudaNdarray *running_var%(name)s = %(out_running_var)s;
#endif
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0)
......@@ -2601,25 +2747,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining(
bn_params_%(name)s,
CudaNdarray_DEV_DATA(%(scale)s),
CudaNdarray_DEV_DATA(%(bias)s),
exponentialAverageFactor%(name)s,
NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused
#ifdef RUNNING_AVERAGES
%(running_average_factor)f,
CudaNdarray_DEV_DATA(running_mean%(name)s),
CudaNdarray_DEV_DATA(running_var%(name)s),
#else
0,
NULL,
NULL,
#endif
epsilon%(name)s,
CudaNdarray_DEV_DATA(%(x_mean)s),
CudaNdarray_DEV_DATA(%(x_invstd)s)
);
}
#ifdef RUNNING_AVERAGES
%(out_running_mean)s = running_mean%(name)s;
%(out_running_var)s = running_var%(name)s;
#endif
""" % dict(name=name, inp=inp, scale=scale, bias=bias, outp=outp,
x_mean=x_mean, x_invstd=x_invstd, fail=sub['fail'])
x_mean=x_mean, x_invstd=x_invstd,
running_average_factor=running_average_factor,
in_running_mean=in_running_mean, in_running_var=in_running_var,
out_running_mean=out_running_mean, out_running_var=out_running_var,
fail=sub['fail'])
# add params
define_macros, undef_macros = self.get_c_macros(node, name, check_input=False)
result = """
%(define_macros)s
{
%(code)s
}
%(undef_macros)s
""" % dict(code=result, define_macros=define_macros, undef_macros=undef_macros)
return result
def grad(self, inputs, grads):
x, scale, bias = inputs
x, scale, bias = inputs[:3]
dy = grads[0]
_, x_mean, x_invstd = self(x, scale, bias)
return GpuDnnBatchNormGrad(self.mode, self.epsilon)(x, dy, scale,
x_mean, x_invstd)
_, x_mean, x_invstd = self(*inputs)[:3]
disconnected_outputs = []
# Optional running_mean and running_var.
for i in range(3, len(inputs)):
disconnected_outputs.append(DisconnectedType()())
return GpuDnnBatchNormGrad(self.mode, self.epsilon)(
x, dy, scale, x_mean, x_invstd) + disconnected_outputs
def connection_pattern(self, node):
patterns = [[True, True, True], # x
[True, True, True], # scale
[True, True, True]] # bias
# Optional running_mean and running_var are only
# connected to their new values.
for i in range(3, len(node.inputs)):
patterns[0].append(True)
for pattern in patterns[1:]:
pattern.append(False)
patterns.append([False] * (i) + [True])
return patterns
class GpuDnnBatchNormGrad(GpuDnnBatchNormBase):
......@@ -2722,7 +2909,8 @@ err%(name)s = cudnnBatchNormalizationBackward(
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4):
epsilon=1e-4, running_average_factor=0.1,
running_mean=None, running_var=None):
"""
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
......@@ -2742,6 +2930,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns
-------
......@@ -2749,8 +2954,14 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Batch-normalized inputs.
mean : tensor
Means of `inputs` across the normalization axes.
stdinv : tensor
invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes
-----
......@@ -2762,31 +2973,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
stdinv = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
out = (inputs - mean) * gamma * stdinv + beta
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4).
"""
ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
(gamma.ndim, beta.ndim, ndim))
if (running_mean is None) != (running_var is None):
raise ValueError("running_mean and running_var must either both be "
"given or both be None")
if running_mean is not None and running_mean.ndim != ndim:
raise ValueError("running_mean must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_mean.ndim, ndim))
if running_var is not None and running_var.ndim != ndim:
raise ValueError("running_var must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_var.ndim, ndim))
if epsilon < 1e-5:
raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
running_averages = (running_var is not None and running_var is not None)
if ndim < 4:
inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
beta = theano.tensor.shape_padright(beta, 4 - ndim)
batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon)
result = tuple(batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta)))
if running_averages:
running_mean = theano.tensor.shape_padright(running_mean, 4 - ndim)
running_var = theano.tensor.shape_padright(running_var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
params_shape = gamma.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
if running_averages:
running_mean = theano.tensor.flatten(running_mean, 5)
running_var = theano.tensor.flatten(running_var, 5)
batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon,
running_average_factor=running_average_factor,
running_averages=running_averages)
if running_averages:
out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta),
running_mean=gpu_contiguous(running_mean),
running_var=gpu_contiguous(running_var))
if new_running_mean.broadcastable != running_mean.broadcastable:
new_running_mean = tensor.patternbroadcast(new_running_mean, running_mean.broadcastable)
if new_running_var.broadcastable != running_var.broadcastable:
new_running_var = tensor.patternbroadcast(new_running_var, running_var.broadcastable)
result = (out, mean, invstd, new_running_mean, new_running_var)
else:
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta))
if ndim < 4:
result = tuple(theano.tensor.flatten(r, ndim) for r in result)
elif ndim > 5:
result = (theano.tensor.reshape(result[0], inputs_shape),) + tuple(
theano.tensor.reshape(r, params_shape) for r in result[1:])
return result
......@@ -2839,9 +3097,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
......@@ -2859,12 +3114,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta = theano.tensor.shape_padright(beta, 4 - ndim)
mean = theano.tensor.shape_padright(mean, 4 - ndim)
var = theano.tensor.shape_padright(var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
mean = theano.tensor.flatten(mean, 5)
var = theano.tensor.flatten(var, 5)
batchnorm_op = GpuDnnBatchNormInference(mode=mode, epsilon=epsilon)
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), gpu_contiguous(mean),
gpu_contiguous(var))
if ndim < 4:
result = theano.tensor.flatten(result, ndim)
elif ndim > 5:
result = theano.tensor.reshape(result, inputs_shape)
return result
......@@ -3334,3 +3598,235 @@ def local_abstractconv3d_cudnn(node):
subsample=node.op.subsample,
conv_mode=conv_mode)
return [rval]
@local_optimizer([bn.AbstractBatchNormTrain])
def local_abstract_batch_norm_train_cudnn(node):
    """Replace an AbstractBatchNormTrain node by the cuDNN training Op
    when the input lives on the GPU and the parameters are supported."""
    if not isinstance(node.op, bn.AbstractBatchNormTrain):
        return None
    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    running_mean = node.inputs[5] if len(node.inputs) > 5 else None
    running_var = node.inputs[6] if len(node.inputs) > 6 else None
    # only lift when the input already lives on the GPU
    # TODO what about the output?
    if not (isinstance(x.type, CudaNdarrayType) or
            (x.owner and isinstance(x.owner.op, HostFromGpu))):
        return None
    # map the normalization axes onto a cuDNN mode, if possible
    norm_axes = tuple(node.op.axes)
    if norm_axes == (0,):
        mode = 'per-activation'
    elif norm_axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None
    # epsilon must be a known constant and satisfy cuDNN's minimum
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None
    # the running-average factor must also be a known constant
    try:
        factor = float(theano.tensor.get_scalar_constant_value(running_average_factor))
    except theano.tensor.NotScalarConstantError:
        return None
    if not dnn_available():
        return None
    gpu_x = as_cuda_ndarray_variable(x)
    gpu_scale = as_cuda_ndarray_variable(scale)
    gpu_bias = as_cuda_ndarray_variable(bias)
    call_args = [gpu_x, gpu_scale, gpu_bias, mode, eps, factor]
    if running_mean is not None and running_var is not None:
        call_args.append(running_mean)
        call_args.append(running_var)
    results = list(dnn_batch_normalization_train(*call_args))
    # transfer back any output that the original graph expected on the CPU
    for idx in range(len(node.outputs)):
        if isinstance(node.outputs[idx].type, tensor.TensorType):
            results[idx] = tensor.as_tensor_variable(results[idx])
    # TODO copy_stack_trace?
    return results
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_output(node):
    """Rebuild a GpuDnnBatchNorm so that it writes its main output
    inplace, keeping all other flags unchanged."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm) or op.inplace_output:
        return
    inplace_op = GpuDnnBatchNorm(mode=op.mode,
                                 epsilon=op.epsilon,
                                 running_average_factor=op.running_average_factor,
                                 running_averages=op.running_averages,
                                 inplace_running_mean=op.inplace_running_mean,
                                 inplace_running_var=op.inplace_running_var,
                                 inplace_output=True)
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_mean(node):
    """Let GpuDnnBatchNorm overwrite its running-mean input in place."""
    op = node.op
    # Only applies when running averages are computed and the mean
    # update is not already in-place.
    if (not isinstance(op, GpuDnnBatchNorm) or
            not op.running_averages or op.inplace_running_mean):
        return
    replacement = GpuDnnBatchNorm(mode=op.mode,
                                  epsilon=op.epsilon,
                                  running_average_factor=op.running_average_factor,
                                  running_averages=op.running_averages,
                                  inplace_running_mean=True,
                                  inplace_running_var=op.inplace_running_var,
                                  inplace_output=op.inplace_output)
    return replacement(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_var(node):
    """Let GpuDnnBatchNorm overwrite its running-variance input in place."""
    op = node.op
    # Only applies when running averages are computed and the variance
    # update is not already in-place.
    if (not isinstance(op, GpuDnnBatchNorm) or
            not op.running_averages or op.inplace_running_var):
        return
    replacement = GpuDnnBatchNorm(mode=op.mode,
                                  epsilon=op.epsilon,
                                  running_average_factor=op.running_average_factor,
                                  running_averages=op.running_averages,
                                  inplace_running_mean=op.inplace_running_mean,
                                  inplace_running_var=True,
                                  inplace_output=op.inplace_output)
    return replacement(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_gpu_batch_norm_inference_inplace(node):
    """Make GpuDnnBatchNormInference compute its output in place."""
    op = node.op
    if not isinstance(op, GpuDnnBatchNormInference) or op.inplace:
        return
    inplace_op = GpuDnnBatchNormInference(mode=op.mode,
                                          epsilon=op.epsilon,
                                          inplace=True)
    return [inplace_op(*node.inputs)]
@local_optimizer([bn.AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad_cudnn(node):
    """Lift AbstractBatchNormTrainGrad to cuDNN's batch-norm gradient Op.

    Returns [g_wrt_inputs, g_wrt_scale, g_wrt_bias] on success, or None
    to leave the node unchanged.
    """
    if not isinstance(node.op, bn.AbstractBatchNormTrainGrad):
        return None
    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    # input on gpu? TODO what about the output?
    x_on_gpu = (isinstance(x.type, CudaNdarrayType) or
                (x.owner and isinstance(x.owner.op, HostFromGpu)))
    dy_on_gpu = (isinstance(dy.type, CudaNdarrayType) or
                 (dy.owner and isinstance(dy.owner.op, HostFromGpu)))
    if not (x_on_gpu or dy_on_gpu):
        return None
    # convert axes to cuDNN mode
    axes = tuple(node.op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        # cuDNN implements only the two modes above.
        return None
    # cuDNN handles 4D/5D tensors only: pad lower-dimensional inputs with
    # broadcastable trailing axes, or flatten the trailing dimensions of
    # higher-dimensional inputs into the fifth axis.
    ndim = x.ndim
    if ndim < 4:
        x = theano.tensor.shape_padright(x, 4 - ndim)
        dy = theano.tensor.shape_padright(dy, 4 - ndim)
        scale = theano.tensor.shape_padright(scale, 4 - ndim)
        x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim)
        x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim)
    elif ndim > 5:
        # remember the original shapes so the results can be reshaped back
        x_shape = x.shape
        params_shape = scale.shape
        x = theano.tensor.flatten(x, 5)
        dy = theano.tensor.flatten(dy, 5)
        scale = theano.tensor.flatten(scale, 5)
        x_mean = theano.tensor.flatten(x_mean, 5)
        x_invstd = theano.tensor.flatten(x_invstd, 5)
    # epsilon must be a compile-time constant for the cuDNN interface.
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        # cuDNN imposes a minimum epsilon of 1e-5.
        return None
    if not dnn_available():
        return None
    x = as_cuda_ndarray_variable(x)
    dy = as_cuda_ndarray_variable(dy)
    scale = as_cuda_ndarray_variable(scale)
    x_mean = as_cuda_ndarray_variable(x_mean)
    x_invstd = as_cuda_ndarray_variable(x_invstd)
    g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
        GpuDnnBatchNormGrad(mode, epsilon=eps)(x, dy, scale, x_mean, x_invstd)
    # undo the dimension padding / flattening applied above
    if ndim < 4:
        g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim)
        g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim)
        g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim)
    elif ndim > 5:
        g_wrt_inputs = theano.tensor.reshape(g_wrt_inputs, x_shape)
        g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape)
        g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape)
    # If the original output was on CPU, we have to transfer it
    if isinstance(node.outputs[0].type, tensor.TensorType):
        g_wrt_inputs = tensor.as_tensor_variable(g_wrt_inputs)
    if isinstance(node.outputs[1].type, tensor.TensorType):
        g_wrt_scale = tensor.as_tensor_variable(g_wrt_scale)
    if isinstance(node.outputs[2].type, tensor.TensorType):
        g_wrt_bias = tensor.as_tensor_variable(g_wrt_bias)
    # TODO copy_stack_trace?
    return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@local_optimizer([bn.AbstractBatchNormInference])
def local_abstract_batch_norm_inference_cudnn(node):
    """Replace AbstractBatchNormInference with the cuDNN test-mode Op.

    Returns a one-element list with the replacement output, or None to
    leave the node unchanged.
    """
    if not isinstance(node.op, bn.AbstractBatchNormInference):
        return None
    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs
    # cuDNN supports only per-activation and spatial normalization.
    axes = tuple(node.op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None
    # input on gpu? TODO what about the output?
    on_gpu = (isinstance(x.type, CudaNdarrayType) or
              (x.owner is not None and isinstance(x.owner.op, HostFromGpu)))
    if not on_gpu:
        return None
    # epsilon must be a compile-time constant, at least cuDNN's minimum.
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5 or not dnn_available():
        return None
    gpu_args = [as_cuda_ndarray_variable(v)
                for v in (x, scale, bias, estimated_mean, estimated_variance)]
    out = dnn_batch_normalization_test(gpu_args[0], gpu_args[1], gpu_args[2],
                                       gpu_args[3], gpu_args[4], mode, eps)
    # If the original output was on CPU, we have to transfer it
    # TODO copy_stack_trace?
    if isinstance(node.outputs[0].type, tensor.TensorType):
        out = tensor.as_tensor_variable(out)
    return [out]
......@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
# Register cuDNN batch normalization implementation
# A dedicated local group DB collects the three abstract-batch-norm
# lifters so they run together during GPU graph optimization.
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpu_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)
# cuDNN optimizations are only registered if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from .dnn import (local_abstract_batch_norm_train_cudnn,
                  local_abstract_batch_norm_train_grad_cudnn,
                  local_abstract_batch_norm_inference_cudnn)  # noqa: 402
# All three lifters share priority 20 and the same tags, so they can be
# enabled/disabled as a unit via the 'batchnorm_dnn' or 'cudnn' tags.
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_dnn',
                                      local_abstract_batch_norm_train_cudnn, 20,
                                      'batchnorm_dnn',
                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_grad_dnn',
                                      local_abstract_batch_norm_train_grad_cudnn, 20,
                                      'batchnorm_dnn',
                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_inference_dnn',
                                      local_abstract_batch_norm_inference_cudnn, 20,
                                      'batchnorm_dnn',
                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
from __future__ import absolute_import, print_function, division
from collections import OrderedDict
import logging
import os
import sys
......@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.nnet import bn
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
......@@ -730,52 +732,201 @@ def test_batchnorm_train():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
for vartype in (tensor6, T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out, x_mean, x_invstd = cuda.dnn.dnn_batch_normalization_train(
x, scale, bias, mode, eps)
running_average_factor = 0.3
# forward pass, direct interface
out_gpu, x_mean_gpu, x_invstd_gpu, \
out_running_mean_gpu, out_running_var_gpu = \
dnn.dnn_batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# forward pass, abstract interface
out_abstract, x_mean_abstract, x_invstd_abstract, \
out_running_mean_abstract, out_running_var_abstract = \
bn.batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
x_mean2 = x.mean(axis=axes, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
scale2 = T.addbroadcast(scale, *axes)
bias2 = T.addbroadcast(bias, *axes)
out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
x_mean_ref = x.mean(axis=axes, keepdims=True)
x_var_ref = x.var(axis=axes, keepdims=True)
x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
scale_ref = T.addbroadcast(scale, *axes)
bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
out_running_var_ref = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var_ref * running_average_factor
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
grads_ref = T.grad(None, wrt=[x, scale, bias], known_grads={out_ref: dy})
# compile
f = theano.function([x, scale, bias, dy],
[out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
grads + grads2, mode=mode_with_gpu)
f_gpu = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_gpu, x_mean_gpu, x_invstd_gpu,
out_running_mean_gpu, out_running_var_gpu] + grads_gpu,
mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract,
out_running_mean_abstract, out_running_var_abstract] +
grads_abstract,
mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_ref, x_mean_ref, x_invstd_ref,
out_running_mean_ref, out_running_var_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
in f_abstract.maker.fgraph.toposort()])
assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
outputs = f(X, Scale, Bias, Dy)
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[0 + 3]) # out
utt.assert_allclose(outputs[1], outputs[1 + 3]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 3]) # invstd
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_gpu[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_gpu[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
# compare gradients
utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4) # dx
utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs[8], outputs[8 + 3]) # dbias
utt.assert_allclose(outputs_gpu[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_gpu[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_gpu[7], outputs_ref[7]) # dbias
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias
def test_dnn_batchnorm_train_without_running_averages():
    """Compile and run batch_normalization_train without running averages.

    Checks that both the direct cuDNN interface and the abstract interface
    build working graphs (forward and backward) when no running averages
    are requested, and that the abstract Ops get replaced by cuDNN Ops.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # forward pass
    out_gpu, x_mean_gpu, x_invstd_gpu = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
    # BUGFIX: differentiate through the abstract graph (out_abstract), not
    # the direct-interface graph, so f_abstract actually exercises the
    # gradient of the abstract Ops.
    grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
    # compile
    f_gpu = theano.function([x, scale, bias, dy],
                            [out_gpu, x_mean_gpu, x_invstd_gpu] +
                            grads_gpu,
                            mode=mode_with_gpu)
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract, x_invstd_abstract] +
                                 grads_abstract,
                                 mode=mode_with_gpu)
    # check if the abstract Ops have been replaced
    assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                for n in f_abstract.maker.fgraph.toposort()])
    assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                for n in f_abstract.maker.fgraph.toposort()])
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f_abstract.maker.fgraph.toposort()])
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_gpu(X, Scale, Bias, Dy)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
    """Check the inplace optimizations on GpuDnnBatchNorm.

    When the running averages live in shared variables that are updated
    by the function, the inplace optimizers should flag the batch-norm
    node to overwrite its output and both running-average inputs.
    """
    # test inplace_running_mean and inplace_running_var
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # shared variables allow in-place destruction of the running averages
    running_mean = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=(True, False, False, False))
    running_var = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=(True, False, False, False))
    # forward pass
    out, x_mean, x_invstd, new_running_mean, new_running_var = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation',
                                          epsilon=5e-3, running_average_factor=0.3,
                                          running_mean=running_mean, running_var=running_var)
    # update running averages
    updates = OrderedDict()
    updates[running_mean] = new_running_mean
    updates[running_var] = new_running_var
    # compile
    f = theano.function([x, scale, bias],
                        [out, x_mean, x_invstd],
                        updates=updates,
                        mode=mode_with_gpu)
    # check for the inplace settings
    nodes = [n for n in f.maker.fgraph.toposort()
             if isinstance(n.op, dnn.GpuDnnBatchNorm)]
    assert len(nodes) == 1
    assert nodes[0].op.inplace_running_mean
    assert nodes[0].op.inplace_running_var
    assert nodes[0].op.inplace_output
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias)
def test_batchnorm_inference():
......@@ -785,53 +936,160 @@ def test_batchnorm_inference():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias, mean, var = (vartype(n) for n in ('x', 'scale',
'bias', 'mean',
'var'))
for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias, mean, var = (vartype(n)
for n in ('x', 'scale', 'bias', 'mean', 'var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out = cuda.dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# forward pass, direct interface
out_gpu = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# forward pass, abstract interface
out_abstract = bn.batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var))
out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
scale_ref, bias_ref, mean_ref, var_ref = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var))
out_ref = (x - mean_ref) * (scale_ref / T.sqrt(var_ref + eps)) + bias_ref
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy})
grads_gpu = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_abstract: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy})
grads_ref = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_ref: dy})
# compile
f = theano.function([x, scale, bias, mean, var, dy],
[out, out2] + grads + grads2, mode=mode_with_gpu)
f_gpu = theano.function([x, scale, bias, mean, var, dy],
[out_gpu] + grads_gpu, mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, mean, var, dy],
[out_abstract] + grads_abstract, mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, mean, var, dy],
[out_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
Mean = numpy.random.randn(*param_shape).astype('float32')
Var = numpy.random.rand(*param_shape).astype('float32')
outputs = f(X, Scale, Bias, Mean, Var, Dy)
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Mean, Var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Mean, Var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Mean, Var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[1]) # out
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
# compare gradients
utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5) # dx
utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5) # dscale
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_gpu[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_gpu[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_gpu[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_gpu[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_abstract[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_abstract[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_abstract[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
def test_batchnorm_inference_inplace():
    """Check that GpuDnnBatchNormInference is flagged in-place by the optimizer."""
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias, mean, var = (T.tensor4(name)
                                 for name in ('x', 'scale', 'bias', 'mean', 'var'))
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function([x, scale, bias, mean, var], [out], mode=mode_with_gpu)
    # the optimized graph should hold exactly one inference node, in-place
    bn_nodes = [node for node in f.maker.fgraph.toposort()
                if isinstance(node.op, dnn.GpuDnnBatchNormInference)]
    assert len(bn_nodes) == 1
    assert bn_nodes[0].op.inplace
    # run once to make sure the compiled function executes
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias, Mean, Var)
def test_dnn_batchnorm_valid_and_invalid_axes():
    """Check which normalization axes are lifted to cuDNN Ops.

    Valid axes (per-activation and spatial) must be replaced by cuDNN
    Ops; other axes must still be replaced by some non-abstract,
    non-cuDNN implementation.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        x, scale, bias, mean, var, dy = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var', 'dy'))
        ndim = x.ndim
        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = bn.batch_normalization_train(
                x, scale, bias, axes)
            out_test = bn.batch_normalization_test(
                x, scale, bias, mean, var, axes)
            # backward pass
            # NOTE(review): dy is re-created here although it was already
            # built above; harmless but redundant.
            dy = vartype('dy')
            grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)
            if axes in valid_axes_lists:
                # check if the abstract Ops have been replaced by the cuDNN Ops
                assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
                            in f.maker.fgraph.toposort()])
                assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
            else:
                # check if the abstract Ops have been replaced, but not by the cuDNN Ops
                assert not any([isinstance(n.op, (dnn.GpuDnnBatchNorm,
                                                  dnn.GpuDnnBatchNormGrad,
                                                  bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
def test_dnn_tag():
......
from __future__ import absolute_import, print_function, division
import numpy
import theano
from theano import Apply, Op
from theano.gof import local_optimizer
from theano.gof.opt import copy_stack_trace
from theano.tensor import as_tensor_variable, TensorType
from theano.tensor import basic as T
from theano.tensor.opt import register_specialize_device
from theano.scalar import Composite
from theano.scalar import add, sub, true_div, mul
......@@ -37,7 +44,7 @@ def batch_normalization(inputs, gamma, beta, mean, std,
"""
This function will build the symbolic graph for applying batch normalization
to a set of activations.
Also works on GPUs
Also works on GPUs, but is not optimized using cuDNN.
.. versionadded:: 0.7.1
......@@ -75,3 +82,631 @@ def batch_normalization(inputs, gamma, beta, mean, std,
raise ValueError(
'mode must be either "low_mem", "high_mem"')
return rval
def _prepare_batch_normalization_axes(axes, ndim):
if axes == 'per-activation':
axes = (0,)
elif axes == 'spatial':
axes = (0,) + tuple(range(2, ndim))
elif isinstance(axes, (tuple, list, numpy.ndarray)):
axes = tuple(int(a) for a in axes)
else:
raise ValueError('invalid axes: %s', str(axes))
axes = tuple(sorted(axes))
if len(axes) == 0:
raise ValueError('there should be at least one normalization axis')
if min(axes) < 0 or max(axes) >= ndim:
raise ValueError('axes should be less than ndim (<%d), but %s given' % (ndim, str(axes)))
non_bc_axes = tuple(i for i in range(ndim) if i not in axes)
return axes, non_bc_axes
def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
                              epsilon=1e-4, running_average_factor=0.1,
                              running_mean=None, running_var=None):
    """
    Performs batch normalization of the given inputs, using the mean and
    variance of the inputs.

    Parameters
    ----------
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Learnable scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Learnable biases. Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values or `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly,
        if the factor is close to zero it will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.

    Returns
    -------
    out : tensor
        Batch-normalized inputs.
    mean : tensor
        Means of `inputs` across the normalization axes.
    invstd : tensor
        Inverse standard deviations of `inputs` across the normalization axes.
    new_running_mean : tensor
        New value of the running mean (only if both `running_mean` and
        `running_var` were given).
    new_running_var : tensor
        New value of the running variance (only if both `running_var` and
        `running_mean` were given).

    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)

    The returned values are equivalent to:

    .. code-block:: python

        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        mean = inputs.mean(axes, keepdims=True)
        var = inputs.var(axes, keepdims=True)
        invstd = T.inv(T.sqrt(var + epsilon))
        out = (inputs - mean) * gamma * invstd + beta

        m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
        running_mean = running_mean * (1 - running_average_factor) + \\
                       mean * running_average_factor
        running_var = running_var * (1 - running_average_factor) + \\
                      (m / (m - 1)) * var * running_average_factor
    """
    ndim = inputs.ndim
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)
    # have the parameter tensors been broadcasted yet?
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        # parameters are given without the normalized axes: build a
        # dimshuffle pattern that re-inserts them as broadcastable dims
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ['x'] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i
    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError("gamma and beta dimensionality must match the "
                         "number of non-normalized axes, or have the "
                         "same number of dimensions as the inputs; "
                         "got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, params_ndim))
    if (running_mean is None) != (running_var is None):
        raise ValueError("running_mean and running_var must either both be "
                         "given or both be None")
    if running_mean is not None and running_mean.ndim != params_ndim:
        raise ValueError("running_mean must be of the same dimensionality "
                         "as gamma and beta; got %d instead of %d" %
                         (running_mean.ndim, params_ndim))
    if running_var is not None and running_var.ndim != params_ndim:
        raise ValueError("running_var must be of the same dimensionality "
                         "as gamma and beta; got %d instead of %d" %
                         (running_var.ndim, params_ndim))
    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = numpy.cast[theano.config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))
    inputs = as_tensor_variable(inputs)
    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)
    # broadcast the parameters to the full input dimensionality
    if params_ndim != ndim:
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
    else:
        gamma = T.addbroadcast(gamma, *axes)
        beta = T.addbroadcast(beta, *axes)
    batchnorm_op = AbstractBatchNormTrain(axes=axes)
    if running_mean is not None and running_var is not None:
        running_mean = as_tensor_variable(running_mean)
        running_var = as_tensor_variable(running_var)
        if params_ndim != ndim:
            running_mean = running_mean.dimshuffle(params_dimshuffle_pattern)
            running_var = running_var.dimshuffle(params_dimshuffle_pattern)
        else:
            running_mean = T.addbroadcast(running_mean, *axes)
            running_var = T.addbroadcast(running_var, *axes)
        out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
            inputs, gamma, beta, epsilon=epsilon,
            running_average_factor=running_average_factor,
            running_mean=running_mean, running_var=running_var)
        # the Op may lose the broadcast pattern; restore it to match the inputs
        if new_running_mean.broadcastable != running_mean.broadcastable:
            new_running_mean = T.patternbroadcast(new_running_mean, running_mean.broadcastable)
        if new_running_var.broadcastable != running_var.broadcastable:
            new_running_var = T.patternbroadcast(new_running_var, running_var.broadcastable)
        results = (out, mean, invstd, new_running_mean, new_running_var)
    else:
        results = batchnorm_op(inputs, gamma, beta, epsilon=epsilon)
    if params_ndim != ndim:
        # remove the broadcasted dimensions (except from the output)
        results = ([results[0]] +
                   [r.dimshuffle(non_bc_axes) for r in results[1:]])
    return tuple(results)
def batch_normalization_test(inputs, gamma, beta, mean, var,
                             axes='per-activation', epsilon=1e-4):
    """
    Performs batch normalization of the given inputs, using the given mean and
    variance.

    Parameters
    ----------
    inputs : tensor
        The tensor to be normalized along `axes`.
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Biases. Must match the tensor layout of `gamma`.
    mean : tensor
        Means. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    var : tensor
        Variances. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).

    Returns
    -------
    out : tensor
        Batch-normalized inputs.

    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)

    The returned value is equivalent to:

    .. code-block:: python

        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        gamma, beta, mean, var = (T.addbroadcast(t, *axes)
                                  for t in (gamma, beta, mean, var))
        out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
    """
    ndim = inputs.ndim
    # Resolve 'per-activation' / 'spatial' into a concrete tuple of axes;
    # non_bc_axes are the complementary axes (the parameters' real dims).
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)
    # have the parameter tensors been broadcasted yet?
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        # Parameters are given in compact form; build the dimshuffle pattern
        # that re-inserts broadcastable dims at the normalized axes.
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ['x'] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i
    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError("gamma and beta dimensionality must match the "
                         "number of non-normalized axes, or have the "
                         "same number of dimensions as the inputs; "
                         "got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, params_ndim))
    if mean.ndim != params_ndim or var.ndim != params_ndim:
        raise ValueError("mean and var must be of the same dimensionality "
                         "as gamma and beta; got %d and %d instead of %d" %
                         (mean.ndim, var.ndim, params_ndim))
    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = numpy.cast[theano.config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))
    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)
    mean = as_tensor_variable(mean)
    var = as_tensor_variable(var)
    # Broadcast all parameters to the full dimensionality of `inputs`,
    # as required by AbstractBatchNormInference.make_node.
    if params_ndim != ndim:
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
        mean = mean.dimshuffle(params_dimshuffle_pattern)
        var = var.dimshuffle(params_dimshuffle_pattern)
    else:
        gamma = T.addbroadcast(gamma, *axes)
        beta = T.addbroadcast(beta, *axes)
        mean = T.addbroadcast(mean, *axes)
        var = T.addbroadcast(var, *axes)
    batchnorm_op = AbstractBatchNormInference(axes=axes)
    return batchnorm_op(inputs, gamma, beta, mean, var, epsilon=epsilon)
class AbstractBatchNormTrain(Op):
    """
    Abstract Op for Batch Normalization.

    This Op is intended to be replaced during graph optimization (see
    `local_abstract_batch_norm_train` below); `perform` provides a plain
    NumPy reference implementation.

    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input should be normalized.
    x : tensor
        The input to be normalized along `axes`.
    scale : tensor
        `scale` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    bias : tensor
        `bias` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values or `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly,
        if the factor is close to zero it will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - running_average_factor) + batch mean * running_average_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - running_average_factor) + (m / (m - 1)) * batch var * running_average_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None.

    Outputs: the normalized `x`, the batch mean, the batch inverse standard
    deviation, and — only when `running_mean`/`running_var` are given — the
    updated running mean and running variance.
    """
    __props__ = ('axes',)
    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        # Canonicalize to a tuple of ints so __props__-based equality/hashing
        # treats equivalent axis specs as the same Op.
        axes = tuple(int(a) for a in axes)
        self.axes = axes
    def infer_shape(self, node, shape):
        # Output 0 (the normalized x) has x's shape; all remaining outputs
        # (mean, invstd, optional running averages) have scale's shape.
        return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
    def make_node(self, x, scale, bias, epsilon=1e-4,
                  running_average_factor=0.1,
                  running_mean=None, running_var=None):
        # All tensor inputs must already be broadcast to x's dimensionality;
        # the batch_normalization_train wrapper takes care of that.
        assert x.ndim == scale.ndim == bias.ndim
        assert ((running_mean is None and running_var is None) or
                (running_mean is not None and running_var is not None))
        assert (running_mean is None or running_mean.ndim == x.ndim)
        assert (running_var is None or running_var.ndim == x.ndim)
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        if not isinstance(running_average_factor, theano.Variable):
            running_average_factor = as_tensor_variable(running_average_factor)
        inputs = [x, scale, bias, epsilon, running_average_factor]
        output_types = [x.type(), scale.type(), scale.type()]
        if running_mean is not None and running_var is not None:
            inputs.append(running_mean)
            inputs.append(running_var)
            output_types.append(scale.type())
            output_types.append(scale.type())
        return Apply(self, inputs, output_types)
    def L_op(self, inputs, outputs, grads):
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        dy = grads[0]
        # Reuse the mean and inverse standard deviation already computed in
        # the forward pass instead of recomputing them.
        _, x_mean, x_invstd = outputs[:3]
        disconnected_outputs = [
            theano.gradient.DisconnectedType()(), # epsilon
            theano.gradient.DisconnectedType()()] # running_average_factor
        # Optional running_mean and running_var.
        for i in range(5, len(inputs)):
            disconnected_outputs.append(theano.gradient.DisconnectedType()())
        return AbstractBatchNormTrainGrad(self.axes)(
            x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs
    def connection_pattern(self, node):
        # Specify that epsilon and running_average_factor are not connected to outputs.
        patterns = [[True, True, True],     # x
                    [True, True, True],     # scale
                    [True, True, True],     # bias
                    [False, False, False],  # epsilon
                    [False, False, False]]  # running_average_factor
        # Optional running_mean and running_var are only
        # connected to their new values.
        for i in range(5, len(node.inputs)):
            patterns[0].append(True)
            for pattern in patterns[1:]:
                pattern.append(False)
            # Row for running_mean (i=5) / running_var (i=6): connected
            # solely to its own updated-value output.
            patterns.append([False] * (3 + i - 5) + [True])
        return patterns
    def perform(self, node, inputs, output_storage):
        # NumPy reference implementation of the forward pass.
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))
        mean = x.mean(axes, keepdims=True)
        var = x.var(axes, keepdims=True)
        invstd = 1.0 / numpy.sqrt(var + epsilon)
        out = (x - mean) * (scale * invstd) + bias
        output_storage[0][0] = out
        output_storage[1][0] = mean
        output_storage[2][0] = invstd
        if len(inputs) > 5:
            running_mean = inputs[5]
            running_mean = running_mean * (1.0 - running_average_factor) + \
                mean * running_average_factor
            output_storage[3][0] = running_mean
        if len(inputs) > 6:
            # m / (m - 1) converts the biased batch variance into the
            # unbiased estimate used for the running average (see docstring).
            m = float(numpy.prod(x.shape) / numpy.prod(scale.shape))
            running_var = inputs[6]
            running_var = running_var * (1.0 - running_average_factor) + \
                (m / (m - 1)) * var * running_average_factor
            output_storage[4][0] = running_var
class AbstractBatchNormInference(Op):
    """
    Abstract Op for Batch Normalization.

    Normalizes its input with a fixed (estimated) mean and variance, as used
    at inference time. Intended to be replaced during graph optimization
    (see `local_abstract_batch_norm_inference` below); `perform` provides a
    NumPy reference implementation.

    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input is normalized.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    """
    __props__ = ('axes',)
    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        # Canonicalize to a tuple of ints for __props__ equality/hashing.
        axes = tuple(int(a) for a in axes)
        self.axes = axes
    def infer_shape(self, node, shape):
        # The single output has the shape of x.
        return [shape[0]]
    def make_node(self, x, scale, bias, estimated_mean, estimated_variance, epsilon=1e-4):
        # All tensor inputs must already be broadcast to x's dimensionality.
        assert x.ndim == scale.ndim == bias.ndim == estimated_mean.ndim == estimated_variance.ndim
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        return Apply(self, [x, scale, bias, estimated_mean, estimated_variance, epsilon], [x.type()])
    def grad(self, inputs, grads):
        # Symbolic gradients of
        #   out = (x - est_mean) * scale / sqrt(est_var + epsilon) + bias
        # w.r.t. every input except epsilon (disconnected).
        x, scale, bias, est_mean, est_var, epsilon = inputs
        dy = grads[0]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))
        scale, bias, est_mean, est_var = (theano.tensor.addbroadcast(t, *axes)
                                          for t in (scale, bias, est_mean, est_var))
        # define helper expressions
        est_var_eps = est_var + epsilon
        est_std = theano.tensor.sqrt(est_var_eps)
        two = theano.tensor.constant(2.)
        # define and return gradients
        dx = dy * (scale / est_std)
        dscale = (dy * (x - est_mean)).sum(axes, keepdims=True) / est_std
        dbias = dy.sum(axes, keepdims=True)
        dmean = -dy.sum(axes, keepdims=True) * (scale / est_std)
        dvar = -(dy * (x - est_mean)).sum(axes, keepdims=True) * (scale / (two * est_var_eps * est_std))
        return [dx, dscale, dbias, dmean, dvar, theano.gradient.DisconnectedType()()]
    def connection_pattern(self, node):
        # Specify that epsilon is not connected to outputs.
        return [[True], [True], [True], [True], [True], [False]]
    def perform(self, node, inputs, output_storage):
        # NumPy reference implementation of the inference formula.
        x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
        out = (x - estimated_mean) * (scale / numpy.sqrt(estimated_variance + epsilon)) + bias
        output_storage[0][0] = out
class AbstractBatchNormTrainGrad(Op):
    """
    Gradient Op for AbstractBatchNormTrain.

    Given the input `x`, the output gradient `dy`, the `scale`, and the batch
    mean / inverse standard deviation saved from the forward pass, computes
    the gradients w.r.t. `x`, `scale` and `bias`. Instantiated by
    `AbstractBatchNormTrain.L_op`.
    """
    __props__ = ('axes',)
    def __init__(self, axes=(0,)):
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        # Canonicalize to a tuple of ints for __props__ equality/hashing.
        axes = tuple(int(a) for a in axes)
        self.axes = axes
    def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):
        # All tensor inputs must already be broadcast to x's dimensionality.
        assert x.ndim == dy.ndim == scale.ndim == x_mean.ndim == x_invstd.ndim
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        return Apply(self, [x, dy, scale, x_mean, x_invstd, epsilon],
                     [x.type(), scale.type(), scale.type()])
    def infer_shape(self, node, shape):
        # g_wrt_inputs has x's shape; g_wrt_scale and g_wrt_bias have
        # scale's shape (input index 2).
        return [shape[0], shape[2], shape[2]]
    def perform(self, node, inputs, output_storage):
        # NumPy reference implementation of the batch-norm backward pass.
        x, dy, scale, x_mean, x_invstd, epsilon = inputs
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))
        x_diff = x - x_mean
        mean_dy_x_diff = numpy.mean(dy * x_diff, axis=axes, keepdims=True)
        c = (dy * x_invstd) - (x_diff * mean_dy_x_diff * (x_invstd ** 3))
        g_wrt_inputs = scale * (c - numpy.mean(c, axis=axes, keepdims=True))
        g_wrt_scale = numpy.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
        g_wrt_bias = numpy.sum(dy, axis=axes, keepdims=True)
        output_storage[0][0] = g_wrt_inputs
        output_storage[1][0] = g_wrt_scale
        output_storage[2][0] = g_wrt_bias
@local_optimizer([AbstractBatchNormTrain])
def local_abstract_batch_norm_train(node):
    """Replace AbstractBatchNormTrain by an equivalent graph of tensor Ops.

    Returns the replacement outputs (with the original broadcastable
    patterns preserved), or None to decline when the node is not
    applicable or any input is not a plain TensorType.
    """
    if not isinstance(node.op, AbstractBatchNormTrain):
        return None
    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    # Decline if any input (including the optional running_mean/running_var
    # at indices 5 and 6) is not a plain CPU tensor.
    if not all(isinstance(v.type, TensorType) for v in node.inputs):
        return None
    mean = x.mean(axes, keepdims=True)
    var = x.var(axes, keepdims=True)
    invstd = T.inv(T.sqrt(var + epsilon))
    out = (x - mean) * (scale * invstd) + bias
    results = [out, mean, invstd]
    if len(node.inputs) > 5:
        running_mean = node.inputs[5]
        running_mean = running_mean * (1.0 - running_average_factor) + \
            mean * running_average_factor
        results.append(running_mean)
    if len(node.inputs) > 6:
        # m / (m - 1): unbiased-variance correction for the running average,
        # where m is the number of elements averaged over per statistic.
        m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
        running_var = node.inputs[6]
        running_var = running_var * (1.0 - running_average_factor) + \
            (m / (m - 1)) * var * running_average_factor
        results.append(running_var)
    # Keep the broadcastable patterns of the node's original outputs.
    results = [T.patternbroadcast(r, r_orig.broadcastable)
               for (r, r_orig) in zip(results, node.outputs)]
    # Fix: use a distinct loop variable; the original reused the name `var`,
    # shadowing the variance graph computed above.
    for new_var in theano.gof.graph.variables(node.inputs, results):
        if new_var not in node.inputs:
            copy_stack_trace(node.outputs[0], new_var)
    return results
@local_optimizer([AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad(node):
    """Replace AbstractBatchNormTrainGrad by an equivalent tensor-Op graph.

    Declines (returns None) when the node is not applicable or any input
    is not a plain TensorType.
    """
    if not isinstance(node.op, AbstractBatchNormTrainGrad):
        return None
    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if any(not isinstance(inp.type, TensorType) for inp in node.inputs):
        return None
    # Symbolic version of the backward pass in AbstractBatchNormTrainGrad.perform.
    centered = x - x_mean
    mean_dy_centered = T.mean(dy * centered, axis=axes, keepdims=True)
    coeff = (dy * x_invstd) - centered * (mean_dy_centered * (x_invstd ** 3))
    g_x = scale * (coeff - T.mean(coeff, axis=axes, keepdims=True))
    g_scale = T.sum(dy * x_invstd * centered, axis=axes, keepdims=True)
    g_bias = T.sum(dy, axis=axes, keepdims=True)
    # Keep the broadcastable patterns of the original outputs.
    replacements = [T.patternbroadcast(new, old.broadcastable)
                    for (new, old) in zip([g_x, g_scale, g_bias], node.outputs)]
    # Propagate the stack trace onto every newly-created variable.
    for fresh in theano.gof.graph.variables(node.inputs, replacements):
        if fresh not in node.inputs:
            copy_stack_trace(node.outputs[0], fresh)
    return replacements
@local_optimizer([AbstractBatchNormInference])
def local_abstract_batch_norm_inference(node):
    """Replace AbstractBatchNormInference by the plain inference formula.

    Declines (returns None) when the node is not applicable or any input
    is not a plain TensorType.
    """
    if not isinstance(node.op, AbstractBatchNormInference):
        return None
    if any(not isinstance(inp.type, TensorType) for inp in node.inputs):
        return None
    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs
    # out = (x - mean) * scale / sqrt(var + eps) + bias, preserving the
    # broadcastable pattern of the original output.
    normed = (x - estimated_mean) * (scale / T.sqrt(estimated_variance + epsilon)) + bias
    normed = T.patternbroadcast(normed, node.outputs[0].broadcastable)
    # Propagate the stack trace onto every newly-created variable.
    for fresh in theano.gof.graph.variables(node.inputs, [normed]):
        if fresh not in node.inputs:
            copy_stack_trace(node.outputs[0], fresh)
    return [normed]
# Register CPU optimizations.
# The three local optimizers above are grouped into a LocalGroupDB and
# registered (at position 30, under the 'fast_compile' and 'fast_run' tags)
# so that the abstract batch-norm Ops are replaced by plain tensor graphs
# when compiling for the CPU.
bn_groupopt = theano.gof.optdb.LocalGroupDB()
bn_groupopt.__name__ = 'batchnorm_opts'
register_specialize_device(bn_groupopt, 'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_train',
                     local_abstract_batch_norm_train, 30,
                     'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_train_grad',
                     local_abstract_batch_norm_train_grad, 30,
                     'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_inference',
                     local_abstract_batch_norm_inference, 30,
                     'fast_compile', 'fast_run')
from __future__ import absolute_import, print_function, division
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
import numpy
from theano.tensor.nnet.bn import batch_normalization
from theano.tensor.nnet import bn
def test_BNComposite():
......@@ -39,7 +40,7 @@ def test_BNComposite():
f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
res_ref = f_ref(X, G, B, M, V)
for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x, g, b, m, v, mode=mode)
bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
f = theano.function([x, b, g, m, v], [bn_op])
res = f(X, G, B, M, V)
utt.assert_allclose(res_ref, res)
......@@ -47,7 +48,7 @@ def test_BNComposite():
theano.config.compute_test_value = orig
def test_bn():
def test_batch_normalization():
def bn_ref(x, G, B, M, V):
n = (x - M) / V
......@@ -70,28 +71,28 @@ def test_bn():
f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
res_ref = f_ref(X, G, B, M, V)
for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x, g, b, m, v, mode=mode)
bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
f = theano.function([x, b, g, m, v], [bn_op])
res = f(X, G, B, M, V)
utt.assert_allclose(res_ref, res)
def bn(inputs, gamma, beta, mean, std):
return batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
utt.verify_grad(bn, [X, G, B, M, V])
def bn_f(inputs, gamma, beta, mean, std):
return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
utt.verify_grad(bn_f, [X, G, B, M, V])
bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True))
f_ref = theano.function([x, b, g], [bn_ref_op])
res_ref = f_ref(X, G, B)
for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode)
bn_op = bn.batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode)
f = theano.function([x, b, g], [bn_op])
res = f(X, G, B)
utt.assert_allclose(res_ref, res)
def bn(inputs, gamma, beta, mean, std):
return batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
utt.verify_grad(batch_normalization, [X, G, B,
X.mean(axis=0)[numpy.newaxis], X.std(axis=0)[numpy.newaxis]])
def bn_f(inputs, gamma, beta, mean, std):
return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
utt.verify_grad(bn_f, [X, G, B,
X.mean(axis=0)[numpy.newaxis], X.std(axis=0)[numpy.newaxis]])
def test_bn_feature_maps():
......@@ -122,21 +123,296 @@ def test_bn_feature_maps():
res_ref = f_ref(X, G, B, M, V)
for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x,
g.dimshuffle('x', 0, 'x', 'x'),
b.dimshuffle('x', 0, 'x', 'x'),
m.dimshuffle('x', 0, 'x', 'x'),
v.dimshuffle('x', 0, 'x', 'x'),
mode=mode)
bn_op = bn.batch_normalization(x,
g.dimshuffle('x', 0, 'x', 'x'),
b.dimshuffle('x', 0, 'x', 'x'),
m.dimshuffle('x', 0, 'x', 'x'),
v.dimshuffle('x', 0, 'x', 'x'),
mode=mode)
f = theano.function([x, b, g, m, v], [bn_op])
res = f(X, G, B, M, V)
utt.assert_allclose(res_ref, res)
def conv_bn(inputs, gamma, beta, mean, std):
return batch_normalization(inputs,
gamma.dimshuffle('x', 0, 'x', 'x'),
beta.dimshuffle('x', 0, 'x', 'x'),
mean.dimshuffle('x', 0, 'x', 'x'),
std.dimshuffle('x', 0, 'x', 'x'),
mode=mode)
return bn.batch_normalization(inputs,
gamma.dimshuffle('x', 0, 'x', 'x'),
beta.dimshuffle('x', 0, 'x', 'x'),
mean.dimshuffle('x', 0, 'x', 'x'),
std.dimshuffle('x', 0, 'x', 'x'),
mode=mode)
utt.verify_grad(conv_bn, [X, G, B, M, V])
def test_batch_normalization_train():
    """Check batch_normalization_train against a hand-built reference graph.

    Covers several axis specifications and input ranks (5D down to 1D),
    comparing the outputs, the updated running averages, and the gradients
    w.r.t. x, scale and bias.
    """
    utt.seed_rng()
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, running_mean, running_var = (vartype(n)
                                                         for n in ('x', 'scale', 'bias',
                                                                   'running_mean',
                                                                   'running_var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3
            # remove non-existing axes
            # NOTE(review): this rebinds the outer loop variable `axes`; it
            # only works because vartype iterates in decreasing rank order,
            # so the filtered tuple shrinks monotonically.
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue
            # forward pass
            out, x_mean, x_invstd, out_running_mean, out_running_var = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # reference forward pass
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # m: number of elements averaged over per statistic (used by the
            # unbiased-variance correction of the running variance)
            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_running_mean2 = running_mean * (1 - running_average_factor) + \
                x_mean2 * running_average_factor
            out_running_var2 = running_var * (1 - running_average_factor) + \
                (m / (m - 1)) * x_var2 * running_average_factor
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, running_mean, running_var, dy],
                                [out, x_mean, x_invstd, out_running_mean, out_running_var,
                                 out2, x_mean2, x_invstd2, out_running_mean2, out_running_var2] +
                                grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                # nan_to_num: the m / (m - 1) correction yields non-finite
                # values when m == 1, in both implementations
                utt.assert_allclose(numpy.nan_to_num(outputs[4]),
                                    numpy.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients
                utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4)  # dx
                utt.assert_allclose(outputs[11], outputs[11 + 3], rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
def test_batch_normalization_train_without_running_averages():
    """Smoke test: batch_normalization_train with no running averages.

    When running_mean/running_var are omitted, only three outputs are
    returned; this checks the graph compiles, the abstract Ops are
    optimized away, and the function runs (no value comparison).
    """
    # compile and run batch_normalization_train without running averages
    utt.seed_rng()
    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # forward pass
    out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
    # compile
    f = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads)
    # check if the abstract Ops have been replaced
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f.maker.fgraph.toposort()])
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias, Dy)
def test_batch_normalization_train_broadcast():
    """Broadcasted and non-broadcasted parameters must give identical graphs.

    Builds each batch-norm expression twice — once with compact parameter
    tensors and once with parameters pre-broadcasted via dimshuffle — and
    relies on the optimizer to collapse the sum of absolute differences to
    a constant zero.
    """
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x = vartype('x')
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3
            # remove non-existing axes
            # NOTE(review): rebinds the outer loop variable `axes`; works only
            # because vartype iterates in decreasing rank order.
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue
            # convert axes to explicit list
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            # compute axes for parameter tensors
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ['x'] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i
            # construct non-broadcasted parameter variables
            param_type = T.TensorType(x.dtype, (False,) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (param_type(n)
                                                      for n in ('scale', 'bias',
                                                                'running_mean',
                                                                'running_var'))
            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)
            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # batch_normalization_train with broadcasted variables
            train_bc = \
                bn.batch_normalization_train(
                    x, scale_bc, bias_bc, axes, eps,
                    running_average_factor, running_mean_bc, running_var_bc)
            # collapse the non-output results back to the compact layout so
            # the two variants are directly comparable
            train_bc = tuple([train_bc[0]] +  # out
                             [r.dimshuffle(non_bc_axes) for r in train_bc[1:]])
            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = \
                bn.batch_normalization_test(
                    x, scale, bias, running_mean, running_var, axes, eps)
            # batch_normalization_test with broadcasted variables
            test_bc = \
                bn.batch_normalization_test(
                    x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes, eps)
            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc,)
            results_bc = train_bc + (test_bc,)
            results = [abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)]
            # compile to compute all differences
            f = theano.function([x, scale, bias, running_mean, running_var],
                                T.sum(sum(results)))
            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if theano.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, theano.compile.DeepCopyOp)
            inputs = [numpy.asarray(numpy.random.rand(*((4,) * n)), x.dtype)
                      for n in [x.ndim, scale.ndim, bias.ndim,
                                running_mean.ndim, running_var.ndim]]
            assert 0.0 == f(*inputs)
def test_batch_normalization_test():
    """Check batch_normalization_test against a hand-built reference graph.

    Covers several axis specifications and input ranks, comparing the
    output and the gradients w.r.t. x, scale, bias, mean and var.
    """
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, mean, var = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            # remove non-existing axes
            # NOTE(review): rebinds the outer loop variable `axes`; works only
            # because vartype iterates in decreasing rank order.
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue
            # forward pass
            out = bn.batch_normalization_test(x, scale, bias, mean,
                                              var, axes, eps)
            # reference forward pass
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes2)
                                          for t in (scale, bias, mean, var))
            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, out2] + grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run
            for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                # var must be positive for sqrt(var + eps) to be meaningful
                Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Mean, Var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients
                utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5)  # dx
                utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5)  # dscale
                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
                utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5)  # dvar
def test_batch_normalization_broadcastable():
    """The optimizations must preserve a fully-broadcastable pattern.

    Builds train/inference batch-norm graphs on all-broadcastable 5D
    variables and checks that compilation succeeds with every abstract
    Op replaced.
    """
    names = ('x', 'dy', 'scale', 'bias', 'mean', 'var')
    x, dy, scale, bias, mean, var = [T.scalar(name).dimshuffle(['x'] * 5)
                                     for name in names]
    # forward passes (training and inference)
    out_train, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'spatial')
    out_test = bn.batch_normalization_test(x, scale, bias, mean, var, 'spatial')
    # backward passes
    grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
    grads_test = T.grad(None, wrt=[x, scale, bias], known_grads={out_test: dy})
    # compile everything together; no abstract Op may survive optimization
    outputs = [out_train, x_mean, x_invstd, out_test] + grads_train + grads_test
    f = theano.function([x, scale, bias, mean, var, dy], outputs)
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    assert not any(isinstance(apply_node.op, abstract_ops)
                   for apply_node in f.maker.fgraph.toposort())
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论