提交 8b9f7336 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5190 from gvtulder/f-batchnorm-abstract

Abstract Ops for batch normalization
...@@ -10,6 +10,9 @@ ...@@ -10,6 +10,9 @@
.. moduleauthor:: LISA .. moduleauthor:: LISA
.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test>`. They must be added manually as they do not have the same user interface. .. autofunction:: theano.tensor.nnet.bn.batch_normalization_train
.. autofunction:: theano.tensor.nnet.bn.batch_normalization_test
.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test>`.
.. autofunction:: theano.tensor.nnet.bn.batch_normalization .. autofunction:: theano.tensor.nnet.bn.batch_normalization
...@@ -28,19 +28,20 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d, ...@@ -28,19 +28,20 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
assert_conv_shape) assert_conv_shape)
from theano.tensor.signal.pool import ( from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet import bn
from . import pygpu from . import pygpu
from .type import (get_context, gpu_context_type, list_contexts, from .type import (get_context, gpu_context_type, list_contexts,
GpuArraySharedVariable) GpuArraySharedVariable)
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, gpu_alloc_empty, gpu_contiguous, gpu_alloc_empty,
empty_like, GpuArrayType) empty_like, GpuArrayType, HostFromGpu)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax from .nnet import GpuSoftmax
from .opt import (gpu_seqopt, register_opt, pool_db, pool_db2, from .opt import (gpu_seqopt, register_opt, pool_db, pool_db2,
op_lifter, register_opt2) op_lifter, register_opt2, register_inplace)
from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
...@@ -1389,13 +1390,13 @@ class GpuDnnPool(DnnBase): ...@@ -1389,13 +1390,13 @@ class GpuDnnPool(DnnBase):
res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1) res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
return [res] return [res]
def grad(self, inp, grads): def L_op(self, inp, outputs, grads):
img, ws, stride, pad = inp img, ws, stride, pad = inp
grad, = grads grad, = grads
grad = gpu_contiguous(grad) grad = gpu_contiguous(grad)
out = self(img, ws, stride, pad) out, = outputs
g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad) g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
...@@ -1591,10 +1592,10 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase): ...@@ -1591,10 +1592,10 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
assert x.ndim == 4 assert x.ndim == 4
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
def grad(self, inp, grads): def L_op(self, inp, outputs, grads):
x, = inp x, = inp
g_sm, = grads g_sm, = grads
sm = self(x) sm, = outputs
return [GpuDnnSoftmaxGrad( return [GpuDnnSoftmaxGrad(
self.algo, self.algo,
self.mode self.mode
...@@ -1646,48 +1647,131 @@ class GpuDnnBatchNorm(DnnBase): ...@@ -1646,48 +1647,131 @@ class GpuDnnBatchNorm(DnnBase):
epsilon epsilon
Epsilon value used in the batch normalization formula. Minimum allowed Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN). value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
""" """
__props__ = ('mode',) __props__ = ('mode', 'running_averages', 'inplace_running_mean',
'inplace_running_var', 'inplace_output')
def __init__(self, mode='per-activation'): def __init__(self, mode='per-activation', running_averages=False,
inplace_running_mean=False, inplace_running_var=False,
inplace_output=False):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm.c'], DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm.c'],
'dnn_batchnorm_op') 'dnn_batchnorm_op')
assert (mode in ('per-activation', 'spatial')) assert (mode in ('per-activation', 'spatial'))
self.mode = mode self.mode = mode
self.running_averages = running_averages
self.inplace_output = inplace_output
self.inplace_running_mean = inplace_running_mean
self.inplace_running_var = inplace_running_var
self.destroy_map = {}
if self.inplace_output:
self.destroy_map[0] = [0]
if self.running_averages and self.inplace_running_mean:
self.destroy_map[3] = [5]
if self.running_averages and self.inplace_running_var:
self.destroy_map[4] = [6]
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'running_average_factor'):
self.running_average_factor = 0
if not hasattr(self, 'running_averages'):
self.running_averages = False
if not (hasattr(self, 'inplace_running_mean') and
hasattr(self, 'inplace_running_var') and
hasattr(self, 'inplace_output')):
self.inplace_running_mean = False
self.inplace_running_var = False
self.inplace_output = False
self.destroy_map = {}
def get_op_params(self): def get_op_params(self):
params = [] params = []
if self.inplace_output:
params.append(('INPLACE_OUTPUT', '1'))
if self.running_averages:
params.append(('RUNNING_AVERAGES', '1'))
if self.inplace_running_mean:
params.append(('INPLACE_RUNNING_MEAN', '1'))
if self.inplace_running_var:
params.append(('INPLACE_RUNNING_VAR', '1'))
params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL" params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL"
if self.mode == "spatial" if self.mode == "spatial"
else "CUDNN_BATCHNORM_PER_ACTIVATION"))) else "CUDNN_BATCHNORM_PER_ACTIVATION")))
return params return params
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0], shape[1], shape[1]] return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
def make_node(self, x, scale, bias, epsilon=1e-4): def make_node(self, x, scale, bias, epsilon=1e-4,
running_average_factor=0.1,
running_mean=None, running_var=None):
assert x.ndim == scale.ndim == bias.ndim
assert x.ndim in (4, 5)
assert self.running_averages == (running_mean is not None) == (running_var is not None)
assert (running_mean is None or running_mean.ndim == x.ndim)
assert (running_var is None or running_var.ndim == x.ndim)
ctx_name = infer_context_name(x, scale, bias) ctx_name = infer_context_name(x, scale, bias)
x = as_gpuarray_variable(x, ctx_name) x = as_gpuarray_variable(x, ctx_name)
scale = as_gpuarray_variable(scale, ctx_name) scale = as_gpuarray_variable(scale, ctx_name)
bias = as_gpuarray_variable(bias, ctx_name) bias = as_gpuarray_variable(bias, ctx_name)
epsilon = as_scalar(epsilon).astype('float64') epsilon = as_scalar(epsilon).astype('float64')
assert x.ndim == scale.ndim == bias.ndim running_average_factor = as_scalar(running_average_factor).astype('float64')
assert x.ndim in (4, 5) inputs = [x, scale, bias, epsilon, running_average_factor]
return Apply(self, [x, scale, bias, epsilon], [x.type(), scale.type(), scale.type()]) output_types = [x.type(), scale.type(), scale.type()]
if running_mean is not None and running_var is not None:
def grad(self, inputs, grads): inputs.append(as_gpuarray_variable(running_mean, ctx_name))
x, scale, bias, epsilon = inputs inputs.append(as_gpuarray_variable(running_var, ctx_name))
output_types.append(scale.type())
output_types.append(scale.type())
return Apply(self, inputs, output_types)
def L_op(self, inputs, outputs, grads):
x, scale, bias, epsilon, running_average_factor = inputs[:5]
dy = grads[0] dy = grads[0]
_, x_mean, x_invstd = self(x, scale, bias, epsilon) _, x_mean, x_invstd = outputs[:3]
return GpuDnnBatchNormGrad(self.mode)(x, dy, scale, x_mean, disconnected_outputs = [
x_invstd, epsilon) + [DisconnectedType()()] DisconnectedType()(), # epsilon
DisconnectedType()()] # running_average_factor
# Optional running_mean and running_var.
for i in range(5, len(inputs)):
disconnected_outputs.append(DisconnectedType()())
return GpuDnnBatchNormGrad(self.mode)(
x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs
def connection_pattern(self, node): def connection_pattern(self, node):
# Specificy that epsilon is not connected to outputs. # Specificy that epsilon and running_average_factor are not connected to outputs.
return [[True, True, True], [True, True, True], [True, True, True], patterns = [[True, True, True], # x
[False, False, False]] [True, True, True], # scale
[True, True, True], # bias
[False, False, False], # epsilon
[False, False, False]] # running_average_factor
# Optional running_mean and running_var are only
# connected to their new values.
for i in range(5, len(node.inputs)):
patterns[0].append(True)
for pattern in patterns[1:]:
pattern.append(False)
patterns.append([False] * (3 + i - 5) + [True])
return patterns
class GpuDnnBatchNormInference(DnnBase): class GpuDnnBatchNormInference(DnnBase):
...@@ -1706,17 +1790,27 @@ class GpuDnnBatchNormInference(DnnBase): ...@@ -1706,17 +1790,27 @@ class GpuDnnBatchNormInference(DnnBase):
value is 1e-5 (imposed by cuDNN). value is 1e-5 (imposed by cuDNN).
""" """
__props__ = ('mode',) __props__ = ('mode', 'inplace')
def __init__(self, mode='per-activation'): def __init__(self, mode='per-activation', inplace=False):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'], DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'],
'dnn_batchnorm_op') 'dnn_batchnorm_op')
assert (mode in ('per-activation', 'spatial')) assert (mode in ('per-activation', 'spatial'))
self.mode = mode self.mode = mode
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'inplace'):
self.inplace = False
def get_op_params(self): def get_op_params(self):
params = [] params = []
if self.inplace:
params.append(('INPLACE_OUTPUT', '1'))
params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL" params.append(('MODE', ("CUDNN_BATCHNORM_SPATIAL"
if self.mode == "spatial" if self.mode == "spatial"
else "CUDNN_BATCHNORM_PER_ACTIVATION"))) else "CUDNN_BATCHNORM_PER_ACTIVATION")))
...@@ -2404,7 +2498,8 @@ class RNNBlock(object): ...@@ -2404,7 +2498,8 @@ class RNNBlock(object):
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4): epsilon=1e-4, running_average_factor=0.1,
running_mean=None, running_var=None):
""" """
Performs batch normalization of the given inputs, using the mean and Performs batch normalization of the given inputs, using the mean and
variance of the inputs. variance of the inputs.
...@@ -2424,6 +2519,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', ...@@ -2424,6 +2519,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN). value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns Returns
------- -------
...@@ -2431,8 +2543,14 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', ...@@ -2431,8 +2543,14 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Batch-normalized inputs. Batch-normalized inputs.
mean : tensor mean : tensor
Means of `inputs` across the normalization axes. Means of `inputs` across the normalization axes.
stdinv : tensor invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes. Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes Notes
----- -----
...@@ -2444,31 +2562,77 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', ...@@ -2444,31 +2562,77 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3) axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True) mean = inputs.mean(axes, keepdims=True)
stdinv = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon)) var = inputs.var(axes, keepdims=True)
out = (inputs - mean) * gamma * stdinv + beta invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4). For 5d tensors, the axes are (0, 2, 3, 4).
""" """
ndim = inputs.ndim ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim: if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality " raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" % "as inputs; got %d and %d instead of %d" %
(gamma.ndim, beta.ndim, ndim)) (gamma.ndim, beta.ndim, ndim))
if (running_mean is None) != (running_var is None):
raise ValueError("running_mean and running_var must either both be "
"given or both be None")
if running_mean is not None and running_mean.ndim != ndim:
raise ValueError("running_mean must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_mean.ndim, ndim))
if running_var is not None and running_var.ndim != ndim:
raise ValueError("running_var must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_var.ndim, ndim))
if epsilon < 1e-5: if epsilon < 1e-5:
raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon) raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
running_averages = (running_mean is not None and running_var is not None)
if ndim < 4: if ndim < 4:
inputs = theano.tensor.shape_padright(inputs, 4 - ndim) inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
gamma = theano.tensor.shape_padright(gamma, 4 - ndim) gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
beta = theano.tensor.shape_padright(beta, 4 - ndim) beta = theano.tensor.shape_padright(beta, 4 - ndim)
batchnorm_op = GpuDnnBatchNorm(mode=mode) if running_averages:
result = tuple(batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma), running_mean = theano.tensor.shape_padright(running_mean, 4 - ndim)
gpu_contiguous(beta), epsilon=epsilon)) running_var = theano.tensor.shape_padright(running_var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
params_shape = gamma.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
if running_averages:
running_mean = theano.tensor.flatten(running_mean, 5)
running_var = theano.tensor.flatten(running_var, 5)
batchnorm_op = GpuDnnBatchNorm(mode=mode, running_averages=running_averages)
if running_averages:
out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), epsilon=epsilon,
running_average_factor=running_average_factor,
running_mean=gpu_contiguous(running_mean),
running_var=gpu_contiguous(running_var))
if new_running_mean.broadcastable != running_mean.broadcastable:
new_running_mean = tensor.patternbroadcast(new_running_mean, running_mean.broadcastable)
if new_running_var.broadcastable != running_var.broadcastable:
new_running_var = tensor.patternbroadcast(new_running_var, running_var.broadcastable)
result = (out, mean, invstd, new_running_mean, new_running_var)
else:
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), epsilon=epsilon)
if ndim < 4: if ndim < 4:
result = tuple(theano.tensor.flatten(r, ndim) for r in result) result = tuple(theano.tensor.flatten(r, ndim) for r in result)
elif ndim > 5:
result = (theano.tensor.reshape(result[0], inputs_shape),) + tuple(
theano.tensor.reshape(r, params_shape) for r in result[1:])
return result return result
...@@ -2521,9 +2685,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var, ...@@ -2521,9 +2685,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4). For 5d tensors, the axes would be (0, 2, 3, 4).
""" """
ndim = inputs.ndim ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim: if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality " raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" % "as inputs; got %d and %d instead of %d" %
...@@ -2541,12 +2702,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var, ...@@ -2541,12 +2702,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta = theano.tensor.shape_padright(beta, 4 - ndim) beta = theano.tensor.shape_padright(beta, 4 - ndim)
mean = theano.tensor.shape_padright(mean, 4 - ndim) mean = theano.tensor.shape_padright(mean, 4 - ndim)
var = theano.tensor.shape_padright(var, 4 - ndim) var = theano.tensor.shape_padright(var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
mean = theano.tensor.flatten(mean, 5)
var = theano.tensor.flatten(var, 5)
batchnorm_op = GpuDnnBatchNormInference(mode=mode) batchnorm_op = GpuDnnBatchNormInference(mode=mode)
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma), result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), gpu_contiguous(mean), gpu_contiguous(beta), gpu_contiguous(mean),
gpu_contiguous(var), epsilon=epsilon) gpu_contiguous(var), epsilon=epsilon)
if ndim < 4: if ndim < 4:
result = theano.tensor.flatten(result, ndim) result = theano.tensor.flatten(result, ndim)
elif ndim > 5:
result = theano.tensor.reshape(result, inputs_shape)
return result return result
...@@ -2928,3 +3098,197 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs): ...@@ -2928,3 +3098,197 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
out = GpuDnnSoftmaxGrad('accurate', 'instance')( out = GpuDnnSoftmaxGrad('accurate', 'instance')(
gpu_contiguous(ins[0]), gpu_contiguous(ins[1])) gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
return [out.dimshuffle(0, 2)] return [out.dimshuffle(0, 2)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrain])
@register_opt2([bn.AbstractBatchNormTrain], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, epsilon, running_average_factor = inputs[:5]
running_mean = inputs[5] if len(inputs) > 5 else None
running_var = inputs[6] if len(inputs) > 6 else None
# convert axes to cuDNN mode
axes = tuple(op.axes)
if axes == (0,):
mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)):
mode = 'spatial'
else:
return None
try:
eps = theano.tensor.get_scalar_constant_value(epsilon)
except theano.tensor.NotScalarConstantError:
return None
if eps < 1e-5:
return None
try:
running_average_factor = theano.tensor.get_scalar_constant_value(running_average_factor)
except theano.tensor.NotScalarConstantError:
return None
ctx = infer_context_name(*inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
x = as_gpuarray_variable(x, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
bias = as_gpuarray_variable(bias, context_name=ctx)
inputs = [x, scale, bias, mode, eps, running_average_factor]
if running_mean is not None and running_var is not None:
inputs.append(running_mean)
inputs.append(running_var)
results = list(dnn_batch_normalization_train(*inputs))
return results
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_output(node):
if isinstance(node.op, GpuDnnBatchNorm) and not node.op.inplace_output:
return GpuDnnBatchNorm(mode=node.op.mode,
running_averages=node.op.running_averages,
inplace_running_mean=node.op.inplace_running_mean,
inplace_running_var=node.op.inplace_running_var,
inplace_output=True)(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_mean(node):
if isinstance(node.op, GpuDnnBatchNorm) and node.op.running_averages and not node.op.inplace_running_mean:
return GpuDnnBatchNorm(mode=node.op.mode,
running_averages=node.op.running_averages,
inplace_running_mean=True,
inplace_running_var=node.op.inplace_running_var,
inplace_output=node.op.inplace_output)(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_batch_norm_inplace_running_var(node):
if isinstance(node.op, GpuDnnBatchNorm) and node.op.running_averages and not node.op.inplace_running_var:
return GpuDnnBatchNorm(mode=node.op.mode,
running_averages=node.op.running_averages,
inplace_running_mean=node.op.inplace_running_mean,
inplace_running_var=True,
inplace_output=node.op.inplace_output)(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_batch_norm_inference_inplace(node):
if isinstance(node.op, GpuDnnBatchNormInference) and not node.op.inplace:
return [GpuDnnBatchNormInference(mode=node.op.mode, inplace=True)(*node.inputs)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrainGrad])
@register_opt2([bn.AbstractBatchNormTrainGrad], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
x, dy, scale, x_mean, x_invstd, epsilon = inputs
# input on gpu? TODO what about the output?
x_on_gpu = (isinstance(x.type, GpuArrayType) or
(x.owner and isinstance(x.owner.op, HostFromGpu)))
dy_on_gpu = (isinstance(dy.type, GpuArrayType) or
(dy.owner and isinstance(dy.owner.op, HostFromGpu)))
if not (x_on_gpu or dy_on_gpu):
return None
# convert axes to cuDNN mode
axes = tuple(op.axes)
if axes == (0,):
mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)):
mode = 'spatial'
else:
return None
ndim = x.ndim
if ndim < 4:
x = theano.tensor.shape_padright(x, 4 - ndim)
dy = theano.tensor.shape_padright(dy, 4 - ndim)
scale = theano.tensor.shape_padright(scale, 4 - ndim)
x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim)
x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim)
elif ndim > 5:
x_shape = x.shape
params_shape = scale.shape
x = theano.tensor.flatten(x, 5)
dy = theano.tensor.flatten(dy, 5)
scale = theano.tensor.flatten(scale, 5)
x_mean = theano.tensor.flatten(x_mean, 5)
x_invstd = theano.tensor.flatten(x_invstd, 5)
try:
eps = theano.tensor.get_scalar_constant_value(epsilon)
except theano.tensor.NotScalarConstantError:
return None
if eps < 1e-5:
return None
ctx = infer_context_name(*inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
x = as_gpuarray_variable(x, context_name=ctx)
dy = as_gpuarray_variable(dy, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
x_mean = as_gpuarray_variable(x_mean, context_name=ctx)
x_invstd = as_gpuarray_variable(x_invstd, context_name=ctx)
g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
GpuDnnBatchNormGrad(mode)(x, dy, scale, x_mean, x_invstd, eps)
if ndim < 4:
g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim)
g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim)
g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim)
elif ndim > 5:
g_wrt_inputs = theano.tensor.reshape(g_wrt_inputs, x_shape)
g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape)
g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape)
return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormInference])
@register_opt2([bn.AbstractBatchNormInference], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_inference_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
axes = tuple(op.axes)
if axes == (0,):
mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)):
mode = 'spatial'
else:
return None
try:
eps = theano.tensor.get_scalar_constant_value(epsilon)
except theano.tensor.NotScalarConstantError:
return None
if eps < 1e-5:
return None
ctx = infer_context_name(*inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
x = as_gpuarray_variable(x, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
bias = as_gpuarray_variable(bias, context_name=ctx)
estimated_mean = as_gpuarray_variable(estimated_mean, context_name=ctx)
estimated_variance = as_gpuarray_variable(estimated_variance, context_name=ctx)
out = dnn_batch_normalization_test(x, scale, bias, estimated_mean, estimated_variance,
mode, eps)
return [out]
...@@ -2,8 +2,19 @@ ...@@ -2,8 +2,19 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, npy_float64 epsilon, PyGpuArrayObject *bias, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuArrayObject **x_mean, npy_float64 running_average_factor,
PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) { #ifdef RUNNING_AVERAGES
PyGpuArrayObject *in_running_mean,
PyGpuArrayObject *in_running_var,
#endif
PyGpuArrayObject **outp,
PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd,
#ifdef RUNNING_AVERAGES
PyGpuArrayObject **out_running_mean,
PyGpuArrayObject **out_running_var,
#endif
cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context; PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0) if (c_set_tensorNd(inp, bn_input) != 0)
...@@ -16,8 +27,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -16,8 +27,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1; return 1;
} }
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0) if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1; return 1;
#endif
if (theano_prep_output(x_mean, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0) if (theano_prep_output(x_mean, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
return 1; return 1;
if (theano_prep_output(x_invstd, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0) if (theano_prep_output(x_invstd, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
...@@ -26,6 +43,31 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -26,6 +43,31 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
if (c_set_tensorNd(*outp, bn_output) != 0) if (c_set_tensorNd(*outp, bn_output) != 0)
return 1; return 1;
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(out_running_mean);
PyGpuArrayObject *running_mean = in_running_mean;
Py_INCREF(running_mean);
#else
PyGpuArrayObject *running_mean = *out_running_mean;
running_mean = theano_try_copy(running_mean, in_running_mean);
if (running_mean == NULL) {
return 1;
}
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(out_running_var);
PyGpuArrayObject *running_var = in_running_var;
Py_INCREF(running_var);
#else
PyGpuArrayObject *running_var = *out_running_var;
running_var = theano_try_copy(running_var, in_running_var);
if (running_var == NULL) {
return 1;
}
#endif
#endif
{ {
const float falpha = 1.; const float falpha = 1.;
const float fbeta = 0.; const float fbeta = 0.;
...@@ -52,9 +94,15 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -52,9 +94,15 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
bn_params, bn_params,
PyGpuArray_DEV_DATA(scale), PyGpuArray_DEV_DATA(scale),
PyGpuArray_DEV_DATA(bias), PyGpuArray_DEV_DATA(bias),
#ifdef RUNNING_AVERAGES
running_average_factor,
PyGpuArray_DEV_DATA(running_mean),
PyGpuArray_DEV_DATA(running_var),
#else
0, 0,
NULL, // running mean, deliberately unused NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused NULL, // running var, deliberately unused
#endif
epsilon, epsilon,
PyGpuArray_DEV_DATA(*x_mean), PyGpuArray_DEV_DATA(*x_mean),
PyGpuArray_DEV_DATA(*x_invstd) PyGpuArray_DEV_DATA(*x_invstd)
...@@ -64,6 +112,10 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -64,6 +112,10 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
#ifdef RUNNING_AVERAGES
*out_running_mean = running_mean;
*out_running_var = running_var;
#endif
} }
return 0; return 0;
} }
...@@ -16,8 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -16,8 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
return 1; return 1;
} }
#ifdef INPLACE_OUTPUT
Py_XDECREF(*outp);
*outp = inp;
Py_INCREF(*outp);
#else
if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0) if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
return 1; return 1;
#endif
if (c_set_tensorNd(*outp, bn_output) != 0) if (c_set_tensorNd(*outp, bn_output) != 0)
return 1; return 1;
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import logging import logging
from collections import OrderedDict
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
from nose_parameterized import parameterized from nose_parameterized import parameterized
...@@ -13,6 +14,7 @@ import theano.tests.unittest_tools as utt ...@@ -13,6 +14,7 @@ import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.nnet import bn
from .. import dnn from .. import dnn
from ..basic_ops import GpuAllocEmpty from ..basic_ops import GpuAllocEmpty
...@@ -1379,36 +1381,77 @@ def test_dnn_batchnorm_train(): ...@@ -1379,36 +1381,77 @@ def test_dnn_batchnorm_train():
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng() utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'): for mode in ('per-activation', 'spatial'):
for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector): for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias')) x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used eps = 5e-3 # some non-standard value to test if it's used
running_average_factor = 0.3
# forward pass
out, x_mean, x_invstd = dnn.dnn_batch_normalization_train( # forward pass, direct interface
x, scale, bias, mode, eps) out_gpu, x_mean_gpu, x_invstd_gpu, \
out_running_mean_gpu, out_running_var_gpu = \
dnn.dnn_batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# forward pass, abstract interface
out_abstract, x_mean_abstract, x_invstd_abstract, \
out_running_mean_abstract, out_running_var_abstract = \
bn.batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# reference forward pass # reference forward pass
if mode == 'per-activation': if mode == 'per-activation':
axes = (0,) axes = (0,)
elif mode == 'spatial': elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim)) axes = (0,) + tuple(range(2, ndim))
x_mean2 = x.mean(axis=axes, keepdims=True) x_mean_ref = x.mean(axis=axes, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps)) x_var_ref = x.var(axis=axes, keepdims=True)
scale2 = T.addbroadcast(scale, *axes) x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
bias2 = T.addbroadcast(bias, *axes) scale_ref = T.addbroadcast(scale, *axes)
out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2 bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
out_running_var_ref = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var_ref * running_average_factor
# backward pass # backward pass
dy = vartype('dy') dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy}) grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
# reference backward pass # reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy}) grads_ref = T.grad(None, wrt=[x, scale, bias], known_grads={out_ref: dy})
# compile # compile
f = theano.function([x, scale, bias, dy], f_gpu = theano.function([x, scale, bias, running_mean, running_var, dy],
[out, x_mean, x_invstd, out2, x_mean2, x_invstd2] + [out_gpu, x_mean_gpu, x_invstd_gpu,
grads + grads2, mode=mode_with_gpu) out_running_mean_gpu, out_running_var_gpu] + grads_gpu,
mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract,
out_running_mean_abstract, out_running_var_abstract] +
grads_abstract,
mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_ref, x_mean_ref, x_invstd_ref,
out_running_mean_ref, out_running_var_ref] + grads_ref,
mode=mode_without_gpu)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
in f_abstract.maker.fgraph.toposort()])
assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run # run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)): for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim] data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape)) for d, s in enumerate(data_shape))
...@@ -1416,15 +1459,124 @@ def test_dnn_batchnorm_train(): ...@@ -1416,15 +1459,124 @@ def test_dnn_batchnorm_train():
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX) Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX) Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX) Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Dy) Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs # compare outputs
utt.assert_allclose(outputs[0], outputs[0 + 3]) # out utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs[1], outputs[1 + 3]) # mean utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 3]) # invstd utt.assert_allclose(outputs_gpu[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_gpu[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
# compare gradients # compare gradients
utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4) # dx utt.assert_allclose(outputs_gpu[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4) # dscale utt.assert_allclose(outputs_gpu[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs[8], outputs[8 + 3]) # dbias utt.assert_allclose(outputs_gpu[7], outputs_ref[7]) # dbias
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias
def test_dnn_batchnorm_train_without_running_averages():
    """Compile and run batch_normalization_train without running averages.

    Exercises both the direct cuDNN interface and the abstract interface
    in their three-output form (no running_mean/running_var inputs), and
    checks that the abstract Ops are replaced by the cuDNN Ops.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass
    out_gpu, x_mean_gpu, x_invstd_gpu = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
    # fix: gradients for the abstract graph must be taken through
    # out_abstract (this previously used out_gpu by copy-paste error, so
    # f_abstract never exercised the abstract backward path)
    grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
    # compile
    f_gpu = theano.function([x, scale, bias, dy],
                            [out_gpu, x_mean_gpu, x_invstd_gpu] +
                            grads_gpu,
                            mode=mode_with_gpu)
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract, x_invstd_abstract] +
                                 grads_abstract,
                                 mode=mode_with_gpu)
    # check if the abstract Ops have been replaced
    assert any(isinstance(n.op, dnn.GpuDnnBatchNorm)
               for n in f_abstract.maker.fgraph.toposort())
    assert any(isinstance(n.op, dnn.GpuDnnBatchNormGrad)
               for n in f_abstract.maker.fgraph.toposort())
    assert not any(isinstance(n.op, (bn.AbstractBatchNormTrain,
                                     bn.AbstractBatchNormInference,
                                     bn.AbstractBatchNormTrainGrad))
                   for n in f_abstract.maker.fgraph.toposort())
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_gpu(X, Scale, Bias, Dy)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
    """Test inplace_running_mean, inplace_running_var and inplace_output.

    Stores the running averages in GPU shared variables and feeds the
    updated values back through the function's ``updates``, which allows
    the inplace-substitution optimizations to apply; then asserts that
    the single compiled GpuDnnBatchNorm node has all three inplace flags
    set.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    # running averages live in GPU shared variables so the Op can update
    # them in place; the first dimension is broadcastable (one value per
    # activation, 'per-activation' mode)
    running_mean = gpuarray_shared_constructor(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=(True, False, False, False))
    running_var = gpuarray_shared_constructor(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=(True, False, False, False))

    # forward pass
    out, x_mean, x_invstd, new_running_mean, new_running_var = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation',
                                          epsilon=5e-3, running_average_factor=0.3,
                                          running_mean=running_mean, running_var=running_var)
    # update running averages
    updates = OrderedDict()
    updates[running_mean] = new_running_mean
    updates[running_var] = new_running_var
    # compile
    f = theano.function([x, scale, bias],
                        [out, x_mean, x_invstd],
                        updates=updates,
                        mode=mode_with_gpu)
    # check for the inplace settings
    nodes = [n for n in f.maker.fgraph.toposort()
             if isinstance(n.op, dnn.GpuDnnBatchNorm)]
    assert len(nodes) == 1
    assert nodes[0].op.inplace_running_mean
    assert nodes[0].op.inplace_running_var
    assert nodes[0].op.inplace_output
    # run once to make sure the inplace version executes
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias)
def test_batchnorm_inference(): def test_batchnorm_inference():
...@@ -1432,34 +1584,51 @@ def test_batchnorm_inference(): ...@@ -1432,34 +1584,51 @@ def test_batchnorm_inference():
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
utt.seed_rng() utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'): for mode in ('per-activation', 'spatial'):
for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector): for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias, mean, var = (vartype(n) x, scale, bias, mean, var = (vartype(n)
for n in ('x', 'scale', 'bias', 'mean', 'var')) for n in ('x', 'scale', 'bias', 'mean', 'var'))
ndim = x.ndim ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used eps = 5e-3 # some non-standard value to test if it's used
# forward pass # forward pass, direct interface
out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, out_gpu = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps) var, mode, eps)
# forward pass, abstract interface
out_abstract = bn.batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# reference forward pass # reference forward pass
if mode == 'per-activation': if mode == 'per-activation':
axes = (0,) axes = (0,)
elif mode == 'spatial': elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim)) axes = (0,) + tuple(range(2, ndim))
scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes) scale_ref, bias_ref, mean_ref, var_ref = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var)) for t in (scale, bias, mean, var))
out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2 out_ref = (x - mean_ref) * (scale_ref / T.sqrt(var_ref + eps)) + bias_ref
# backward pass # backward pass
dy = vartype('dy') dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy}) grads_gpu = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_abstract: dy})
# reference backward pass # reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy}) grads_ref = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_ref: dy})
# compile # compile
f = theano.function([x, scale, bias, mean, var, dy], f_gpu = theano.function([x, scale, bias, mean, var, dy],
[out, out2] + grads + grads2, mode=mode_with_gpu) [out_gpu] + grads_gpu, mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, mean, var, dy],
[out_abstract] + grads_abstract, mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, mean, var, dy],
[out_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run # run
for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)): for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim] data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape)) for d, s in enumerate(data_shape))
...@@ -1469,15 +1638,106 @@ def test_batchnorm_inference(): ...@@ -1469,15 +1638,106 @@ def test_batchnorm_inference():
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX) Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX) Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype(theano.config.floatX) Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Mean, Var, Dy) outputs_gpu = f_gpu(X, Scale, Bias, Mean, Var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Mean, Var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Mean, Var, Dy)
# compare outputs # compare outputs
utt.assert_allclose(outputs[0], outputs[1]) # out utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
# compare gradients # compare gradients
utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5) # dx utt.assert_allclose(outputs_gpu[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5) # dscale utt.assert_allclose(outputs_gpu[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean utt.assert_allclose(outputs_gpu[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar utt.assert_allclose(outputs_gpu[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_abstract[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_abstract[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_abstract[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
def test_batchnorm_inference_inplace():
    """Check the inplace optimization of dnn_batch_normalization_test."""
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    symbolic_inputs = [T.tensor4(n) for n in ('x', 'scale', 'bias', 'mean', 'var')]
    x, scale, bias, mean, var = symbolic_inputs
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function(symbolic_inputs, [out], mode=mode_with_gpu)

    # the compiled graph must contain exactly one inference Op, and the
    # optimizer must have enabled its inplace flag
    bn_nodes = [node for node in f.maker.fgraph.toposort()
                if isinstance(node.op, dnn.GpuDnnBatchNormInference)]
    assert len(bn_nodes) == 1
    assert bn_nodes[0].op.inplace

    # run once to make sure the inplace version executes
    floatX = theano.config.floatX
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(floatX)
    Scale = numpy.random.randn(*param_shape).astype(floatX)
    Bias = numpy.random.randn(*param_shape).astype(floatX)
    Mean = numpy.random.randn(*param_shape).astype(floatX)
    Var = numpy.random.rand(*param_shape).astype(floatX)
    f(X, Scale, Bias, Mean, Var)
def test_dnn_batchnorm_valid_and_invalid_axes():
    """Check optimization of the abstract batch-norm Ops per axes list.

    For the axes combinations cuDNN supports (per-activation and
    spatial), the abstract Ops must be replaced by the cuDNN Ops; for
    unsupported axes they must still be replaced, but not by cuDNN Ops.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")

    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        x, scale, bias, mean, var, dy = (vartype(n)
                                         for n in ('x', 'scale', 'bias',
                                                   'mean', 'var', 'dy'))
        ndim = x.ndim

        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = bn.batch_normalization_train(
                x, scale, bias, axes)
            out_test = bn.batch_normalization_test(
                x, scale, bias, mean, var, axes)
            # backward pass (dy was already created in the unpack above;
            # the redundant re-creation inside the loop was removed)
            grads_train = T.grad(None, wrt=[x, scale, bias],
                                 known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var],
                                known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)
            if axes in valid_axes_lists:
                # check if the abstract Ops have been replaced by the cuDNN Ops
                assert any(isinstance(n.op, dnn.GpuDnnBatchNorm)
                           for n in f.maker.fgraph.toposort())
                assert any(isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                           for n in f.maker.fgraph.toposort())
                assert any(isinstance(n.op, dnn.GpuDnnBatchNormInference)
                           for n in f.maker.fgraph.toposort())
                assert not any(isinstance(n.op, (bn.AbstractBatchNormTrain,
                                                 bn.AbstractBatchNormInference,
                                                 bn.AbstractBatchNormTrainGrad))
                               for n in f.maker.fgraph.toposort())
            else:
                # abstract Ops must have been replaced, but not by cuDNN Ops
                assert not any(isinstance(n.op, (dnn.GpuDnnBatchNorm,
                                                 dnn.GpuDnnBatchNormGrad,
                                                 bn.AbstractBatchNormTrain,
                                                 bn.AbstractBatchNormInference,
                                                 bn.AbstractBatchNormTrainGrad))
                               for n in f.maker.fgraph.toposort())
def test_dnn_rnn_gru(): def test_dnn_rnn_gru():
......
...@@ -12,7 +12,7 @@ import warnings ...@@ -12,7 +12,7 @@ import warnings
import theano import theano
from theano.compat import get_unbound_function from theano.compat import get_unbound_function
from theano.compile import optdb from theano.compile import optdb
from theano.gof import EquilibriumDB, SequenceDB from theano.gof import EquilibriumDB, SequenceDB, TopoOptimizer
from theano.gof.cmodule import get_lib_extension from theano.gof.cmodule import get_lib_extension
from theano.gof.compilelock import get_lock, release_lock from theano.gof.compilelock import get_lock, release_lock
from theano import config from theano import config
...@@ -40,6 +40,17 @@ def register_opt(*tags, **kwargs): ...@@ -40,6 +40,17 @@ def register_opt(*tags, **kwargs):
return f return f
def register_inplace(*tags, **kwargs):
    """Decorator: register a local optimizer as an inplace optimization.

    Wraps the decorated local optimizer in a TopoOptimizer (with the
    warn_inplace failure callback) and registers it in the global
    optimizer database under the 'fast_run', 'inplace' and 'gpu' tags,
    plus any extra `tags`. The registration name can be overridden with
    a ``name`` keyword argument; otherwise the optimizer's ``__name__``
    is used.
    """
    def f(local_opt):
        # pop with a default so that passing other keyword arguments (or
        # none at all) does not raise KeyError
        name = kwargs.pop('name', None) or local_opt.__name__
        optdb.register(
            name, TopoOptimizer(
                local_opt, failure_callback=TopoOptimizer.warn_inplace),
            60, 'fast_run', 'inplace', 'gpu', *tags)
        return local_opt
    return f
_logger_name = 'theano.sandbox.cuda' _logger_name = 'theano.sandbox.cuda'
_logger = logging.getLogger(_logger_name) _logger = logging.getLogger(_logger_name)
......
...@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape, ...@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
assert_conv_shape) assert_conv_shape)
from theano.tensor.signal.pool import ( from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet import bn
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp, dnn_available from theano.sandbox.cuda import GpuOp, dnn_available
...@@ -33,7 +34,7 @@ from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax, ...@@ -33,7 +34,7 @@ from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
from theano.sandbox.cuda.nnet import GpuSoftmax from theano.sandbox.cuda.nnet import GpuSoftmax
from theano.sandbox.cuda.opt_util import (alpha_merge, output_merge, from theano.sandbox.cuda.opt_util import (alpha_merge, output_merge,
pad_dims, unpad_dims) pad_dims, unpad_dims)
from theano.sandbox.cuda import gpu_seqopt, register_opt from theano.sandbox.cuda import gpu_seqopt, register_opt, register_inplace
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
...@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase): ...@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase):
epsilon epsilon
Epsilon value used in the batch normalization formula. Minimum allowed Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN). value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
""" """
__props__ = ('mode', 'epsilon') __props__ = ('mode', 'epsilon')
...@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s; ...@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s;
result = """ result = """
cudnnStatus_t err%(name)s; cudnnStatus_t err%(name)s;
cudnnBatchNormMode_t mode%(name)s = %(mode)s; cudnnBatchNormMode_t mode%(name)s = %(mode)s;
double exponentialAverageFactor%(name)s = %(exp_avg_factor)f;
double epsilon%(name)s = %(epsilon)e; double epsilon%(name)s = %(epsilon)e;
""" % dict(name=name, """ % dict(name=name,
mode=mode, mode=mode,
exp_avg_factor=0, # deliberately unused
epsilon=self.epsilon) epsilon=self.epsilon)
return result return result
def c_code_cache_version(self): def c_code_cache_version(self):
return (3, version()) return (4, version())
class GpuDnnBatchNormInference(GpuDnnBatchNormBase): class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
...@@ -2422,8 +2438,26 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase): ...@@ -2422,8 +2438,26 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
Note: scale, bias, mean and variance must follow the same tensor layout! Note: scale, bias, mean and variance must follow the same tensor layout!
""" """
__props__ = ('mode', 'epsilon', 'inplace')
tensor_descs = ['bn_input', 'bn_output', 'bn_params'] tensor_descs = ['bn_input', 'bn_output', 'bn_params']
def __init__(self, mode='per-activation', epsilon=1e-4, inplace=False):
    """Batch-normalization inference Op.

    Parameters
    ----------
    mode : {'per-activation', 'spatial'}
        Batch normalization mode (handled by the base class).
    epsilon : float
        Epsilon used in the normalization formula (handled by the base
        class; minimum allowed value is 1e-5, imposed by cuDNN).
    inplace : bool
        If True, the output reuses the input buffer; this is declared
        to Theano through destroy_map.
    """
    super(GpuDnnBatchNormInference, self).__init__(mode=mode, epsilon=epsilon)
    self.inplace = inplace
    if self.inplace:
        # output 0 overwrites input 0 (x)
        self.destroy_map = {0: [0]}
def __setstate__(self, d):
    # Unpickling support: Ops pickled before the `inplace` attribute
    # existed get the safe default (no in-place operation).
    self.__dict__.update(d)
    if not hasattr(self, 'inplace'):
        self.inplace = False
def get_op_params(self):
    """Return the C preprocessor macro definitions for this Op.

    Emits ('INPLACE_OUTPUT', '1') when the inplace variant is enabled,
    so the generated C code takes the in-place branch.
    """
    return [('INPLACE_OUTPUT', '1')] if self.inplace else []
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
# output shape equals shape of x # output shape equals shape of x
return [shape[0]] return [shape[0]]
...@@ -2460,10 +2494,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0) ...@@ -2460,10 +2494,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0)
} }
// build and prepare the output variable // build and prepare the output variable
#ifdef INPLACE_OUTPUT
Py_XDECREF(%(outp)s);
%(outp)s = %(inp)s;
Py_INCREF(%(outp)s);
#else
if (CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp)s)) != 0) if (CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp)s)) != 0)
{ {
%(fail)s %(fail)s
} }
#endif
// set output tensor descriptor from output tensor // set output tensor descriptor from output tensor
if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0) if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0)
...@@ -2494,6 +2534,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference( ...@@ -2494,6 +2534,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference(
""" % dict(name=name, inp=inp, scale=scale, bias=bias, est_mean=est_mean, """ % dict(name=name, inp=inp, scale=scale, bias=bias, est_mean=est_mean,
est_var=est_var, outp=outp, fail=sub['fail']) est_var=est_var, outp=outp, fail=sub['fail'])
# add params
define_macros, undef_macros = self.get_c_macros(node, name, check_input=False)
result = """
%(define_macros)s
{
%(code)s
}
%(undef_macros)s
""" % dict(code=result, define_macros=define_macros, undef_macros=undef_macros)
return result return result
def grad(self, inputs, grads): def grad(self, inputs, grads):
...@@ -2537,28 +2587,98 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase): ...@@ -2537,28 +2587,98 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase):
Note: scale and bias must follow the same tensor layout! Note: scale and bias must follow the same tensor layout!
""" """
__props__ = ('mode', 'epsilon', 'running_average_factor',
'running_averages', 'inplace_running_mean',
'inplace_running_var', 'inplace_output')
tensor_descs = ['bn_input', 'bn_output', 'bn_params'] tensor_descs = ['bn_input', 'bn_output', 'bn_params']
def __init__(self, mode='per-activation', epsilon=1e-4,
             running_average_factor=0,
             running_averages=False, inplace_running_mean=False,
             inplace_running_var=False, inplace_output=False):
    """Batch-normalization training Op (cuDNN).

    Parameters
    ----------
    mode : {'per-activation', 'spatial'}
        Batch normalization mode (handled by the base class).
    epsilon : float
        Epsilon used in the normalization formula (handled by the base
        class; minimum allowed value is 1e-5, imposed by cuDNN).
    running_average_factor : float
        Factor used to update the running mean/variance inputs.
    running_averages : bool
        Whether the node takes previous running averages as extra
        inputs and produces updated ones as extra outputs.
    inplace_running_mean : bool
        Update the running mean in place (effective only together with
        `running_averages`).
    inplace_running_var : bool
        Update the running variance in place (effective only together
        with `running_averages`).
    inplace_output : bool
        Write the normalized output into the input buffer.
    """
    super(GpuDnnBatchNorm, self).__init__(mode=mode, epsilon=epsilon)
    self.running_average_factor = running_average_factor
    self.running_averages = running_averages
    self.inplace_output = inplace_output
    self.inplace_running_mean = inplace_running_mean
    self.inplace_running_var = inplace_running_var
    # destroy_map tells Theano which output overwrites which input
    self.destroy_map = {}
    if self.inplace_output:
        # output 0 (out) destroys input 0 (x)
        self.destroy_map[0] = [0]
    if self.running_averages and self.inplace_running_mean:
        # output 3 (new running mean) destroys input 3 (running mean)
        self.destroy_map[3] = [3]
    if self.running_averages and self.inplace_running_var:
        # output 4 (new running var) destroys input 4 (running var)
        self.destroy_map[4] = [4]
def __setstate__(self, d):
    # Unpickling support: fill in attributes that were added after older
    # versions of this Op were pickled, using safe (non-inplace) defaults.
    self.__dict__.update(d)
    if not hasattr(self, 'running_average_factor'):
        self.running_average_factor = 0
    if not hasattr(self, 'running_averages'):
        self.running_averages = False
    if not (hasattr(self, 'inplace_running_mean') and
            hasattr(self, 'inplace_running_var') and
            hasattr(self, 'inplace_output')):
        # the three inplace flags were introduced together; if any one is
        # missing, reset all of them and clear the destroy map
        self.inplace_running_mean = False
        self.inplace_running_var = False
        self.inplace_output = False
        self.destroy_map = {}
def get_op_params(self):
    """Return the C preprocessor macro definitions for this Op.

    One ``(MACRO, '1')`` pair is emitted per enabled feature flag, in a
    fixed order, so the generated C code compiles the matching branches.
    """
    flag_macros = (
        ('INPLACE_OUTPUT', self.inplace_output),
        ('RUNNING_AVERAGES', self.running_averages),
        ('INPLACE_RUNNING_MEAN', self.inplace_running_mean),
        ('INPLACE_RUNNING_VAR', self.inplace_running_var),
    )
    return [(macro, '1') for macro, enabled in flag_macros if enabled]
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
# first output equals shape of x # first output equals shape of x
# second and third output equal shape of scale # other outputs equal shape of scale
return [shape[0], shape[1], shape[1]] return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
def make_node(self, x, scale, bias): def make_node(self, x, scale, bias,
running_mean=None, running_var=None):
assert x.ndim == scale.ndim == bias.ndim
assert x.ndim in (4, 5)
assert self.running_averages == (running_mean is not None) == (running_var is not None)
assert (running_mean is None or running_mean.ndim == x.ndim)
assert (running_var is None or running_var.ndim == x.ndim)
x = as_cuda_ndarray_variable(x) x = as_cuda_ndarray_variable(x)
scale = as_cuda_ndarray_variable(scale) scale = as_cuda_ndarray_variable(scale)
bias = as_cuda_ndarray_variable(bias) bias = as_cuda_ndarray_variable(bias)
assert x.ndim == scale.ndim == bias.ndim inputs = [x, scale, bias]
assert x.ndim in (4, 5) output_types = [x.type(), scale.type(), scale.type()]
return Apply(self, [x, scale, bias], [x.type(), scale.type(), scale.type()]) if running_mean is not None and running_var is not None:
inputs.append(as_cuda_ndarray_variable(running_mean))
inputs.append(as_cuda_ndarray_variable(running_var))
output_types.append(scale.type())
output_types.append(scale.type())
return Apply(self, inputs, output_types)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
# super call to prepare common configuration # super call to prepare common configuration
result = super(GpuDnnBatchNorm, self).c_code(node, name, inputs, outputs, sub) result = super(GpuDnnBatchNorm, self).c_code(node, name, inputs, outputs, sub)
# give sensible names to inputs and outputs # give sensible names to inputs and outputs
inp, scale, bias = inputs inp, scale, bias = inputs[:3]
outp, x_mean, x_invstd = outputs outp, x_mean, x_invstd = outputs[:3]
if self.running_averages:
running_average_factor = self.running_average_factor
in_running_mean = inputs[3]
in_running_var = inputs[4]
out_running_mean = outputs[3]
out_running_var = outputs[4]
else:
running_average_factor = 0.
in_running_mean = 'NULL'
in_running_var = 'NULL'
out_running_mean = 'NULL'
out_running_var = 'NULL'
# set input tensor descriptors from input tensors # set input tensor descriptors from input tensors
result += """ result += """
...@@ -2579,6 +2699,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp ...@@ -2579,6 +2699,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp
{ {
%(fail)s %(fail)s
} }
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(%(out_running_mean)s);
CudaNdarray *running_mean%(name)s = %(in_running_mean)s;
Py_INCREF(running_mean%(name)s);
#else
if ((CudaNdarray_prep_output(&%(out_running_mean)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(%(out_running_mean)s, %(in_running_mean)s) != 0))
{
%(fail)s
}
CudaNdarray *running_mean%(name)s = %(out_running_mean)s;
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(%(out_running_var)s);
CudaNdarray *running_var%(name)s = %(in_running_var)s;
Py_INCREF(running_var%(name)s);
#else
if ((CudaNdarray_prep_output(&%(out_running_var)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(%(out_running_var)s, %(in_running_var)s) != 0))
{
%(fail)s
}
CudaNdarray *running_var%(name)s = %(out_running_var)s;
#endif
#endif
// set output tensor descriptor from output tensor // set output tensor descriptor from output tensor
if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0) if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0)
...@@ -2601,25 +2747,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining( ...@@ -2601,25 +2747,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining(
bn_params_%(name)s, bn_params_%(name)s,
CudaNdarray_DEV_DATA(%(scale)s), CudaNdarray_DEV_DATA(%(scale)s),
CudaNdarray_DEV_DATA(%(bias)s), CudaNdarray_DEV_DATA(%(bias)s),
exponentialAverageFactor%(name)s, #ifdef RUNNING_AVERAGES
NULL, // running mean, deliberately unused %(running_average_factor)f,
NULL, // running var, deliberately unused CudaNdarray_DEV_DATA(running_mean%(name)s),
CudaNdarray_DEV_DATA(running_var%(name)s),
#else
0,
NULL,
NULL,
#endif
epsilon%(name)s, epsilon%(name)s,
CudaNdarray_DEV_DATA(%(x_mean)s), CudaNdarray_DEV_DATA(%(x_mean)s),
CudaNdarray_DEV_DATA(%(x_invstd)s) CudaNdarray_DEV_DATA(%(x_invstd)s)
); );
} }
#ifdef RUNNING_AVERAGES
%(out_running_mean)s = running_mean%(name)s;
%(out_running_var)s = running_var%(name)s;
#endif
""" % dict(name=name, inp=inp, scale=scale, bias=bias, outp=outp, """ % dict(name=name, inp=inp, scale=scale, bias=bias, outp=outp,
x_mean=x_mean, x_invstd=x_invstd, fail=sub['fail']) x_mean=x_mean, x_invstd=x_invstd,
running_average_factor=running_average_factor,
in_running_mean=in_running_mean, in_running_var=in_running_var,
out_running_mean=out_running_mean, out_running_var=out_running_var,
fail=sub['fail'])
# add params
define_macros, undef_macros = self.get_c_macros(node, name, check_input=False)
result = """
%(define_macros)s
{
%(code)s
}
%(undef_macros)s
""" % dict(code=result, define_macros=define_macros, undef_macros=undef_macros)
return result return result
def grad(self, inputs, grads): def grad(self, inputs, grads):
x, scale, bias = inputs x, scale, bias = inputs[:3]
dy = grads[0] dy = grads[0]
_, x_mean, x_invstd = self(x, scale, bias) _, x_mean, x_invstd = self(*inputs)[:3]
return GpuDnnBatchNormGrad(self.mode, self.epsilon)(x, dy, scale, disconnected_outputs = []
x_mean, x_invstd) # Optional running_mean and running_var.
for i in range(3, len(inputs)):
disconnected_outputs.append(DisconnectedType()())
return GpuDnnBatchNormGrad(self.mode, self.epsilon)(
x, dy, scale, x_mean, x_invstd) + disconnected_outputs
def connection_pattern(self, node):
patterns = [[True, True, True], # x
[True, True, True], # scale
[True, True, True]] # bias
# Optional running_mean and running_var are only
# connected to their new values.
for i in range(3, len(node.inputs)):
patterns[0].append(True)
for pattern in patterns[1:]:
pattern.append(False)
patterns.append([False] * (i) + [True])
return patterns
class GpuDnnBatchNormGrad(GpuDnnBatchNormBase): class GpuDnnBatchNormGrad(GpuDnnBatchNormBase):
...@@ -2722,7 +2909,8 @@ err%(name)s = cudnnBatchNormalizationBackward( ...@@ -2722,7 +2909,8 @@ err%(name)s = cudnnBatchNormalizationBackward(
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4): epsilon=1e-4, running_average_factor=0.1,
running_mean=None, running_var=None):
""" """
Performs batch normalization of the given inputs, using the mean and Performs batch normalization of the given inputs, using the mean and
variance of the inputs. variance of the inputs.
...@@ -2742,6 +2930,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', ...@@ -2742,6 +2930,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN). value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values or `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns Returns
------- -------
...@@ -2749,8 +2954,14 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', ...@@ -2749,8 +2954,14 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Batch-normalized inputs. Batch-normalized inputs.
mean : tensor mean : tensor
Means of `inputs` across the normalization axes. Means of `inputs` across the normalization axes.
stdinv : tensor invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes. Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes Notes
----- -----
...@@ -2762,31 +2973,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', ...@@ -2762,31 +2973,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3) axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True) mean = inputs.mean(axes, keepdims=True)
stdinv = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon)) var = inputs.var(axes, keepdims=True)
out = (inputs - mean) * gamma * stdinv + beta invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4). For 5d tensors, the axes are (0, 2, 3, 4).
""" """
ndim = inputs.ndim ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim: if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality " raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" % "as inputs; got %d and %d instead of %d" %
(gamma.ndim, beta.ndim, ndim)) (gamma.ndim, beta.ndim, ndim))
if (running_mean is None) != (running_var is None):
raise ValueError("running_mean and running_var must either both be "
"given or both be None")
if running_mean is not None and running_mean.ndim != ndim:
raise ValueError("running_mean must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_mean.ndim, ndim))
if running_var is not None and running_var.ndim != ndim:
raise ValueError("running_var must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_var.ndim, ndim))
if epsilon < 1e-5: if epsilon < 1e-5:
raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon) raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
running_averages = (running_var is not None and running_var is not None)
if ndim < 4: if ndim < 4:
inputs = theano.tensor.shape_padright(inputs, 4 - ndim) inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
gamma = theano.tensor.shape_padright(gamma, 4 - ndim) gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
beta = theano.tensor.shape_padright(beta, 4 - ndim) beta = theano.tensor.shape_padright(beta, 4 - ndim)
batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon) if running_averages:
result = tuple(batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma), running_mean = theano.tensor.shape_padright(running_mean, 4 - ndim)
gpu_contiguous(beta))) running_var = theano.tensor.shape_padright(running_var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
params_shape = gamma.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
if running_averages:
running_mean = theano.tensor.flatten(running_mean, 5)
running_var = theano.tensor.flatten(running_var, 5)
batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon,
running_average_factor=running_average_factor,
running_averages=running_averages)
if running_averages:
out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta),
running_mean=gpu_contiguous(running_mean),
running_var=gpu_contiguous(running_var))
if new_running_mean.broadcastable != running_mean.broadcastable:
new_running_mean = tensor.patternbroadcast(new_running_mean, running_mean.broadcastable)
if new_running_var.broadcastable != running_var.broadcastable:
new_running_var = tensor.patternbroadcast(new_running_var, running_var.broadcastable)
result = (out, mean, invstd, new_running_mean, new_running_var)
else:
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta))
if ndim < 4: if ndim < 4:
result = tuple(theano.tensor.flatten(r, ndim) for r in result) result = tuple(theano.tensor.flatten(r, ndim) for r in result)
elif ndim > 5:
result = (theano.tensor.reshape(result[0], inputs_shape),) + tuple(
theano.tensor.reshape(r, params_shape) for r in result[1:])
return result return result
...@@ -2839,9 +3097,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var, ...@@ -2839,9 +3097,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4). For 5d tensors, the axes would be (0, 2, 3, 4).
""" """
ndim = inputs.ndim ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim: if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality " raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" % "as inputs; got %d and %d instead of %d" %
...@@ -2859,12 +3114,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var, ...@@ -2859,12 +3114,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta = theano.tensor.shape_padright(beta, 4 - ndim) beta = theano.tensor.shape_padright(beta, 4 - ndim)
mean = theano.tensor.shape_padright(mean, 4 - ndim) mean = theano.tensor.shape_padright(mean, 4 - ndim)
var = theano.tensor.shape_padright(var, 4 - ndim) var = theano.tensor.shape_padright(var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
mean = theano.tensor.flatten(mean, 5)
var = theano.tensor.flatten(var, 5)
batchnorm_op = GpuDnnBatchNormInference(mode=mode, epsilon=epsilon) batchnorm_op = GpuDnnBatchNormInference(mode=mode, epsilon=epsilon)
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma), result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), gpu_contiguous(mean), gpu_contiguous(beta), gpu_contiguous(mean),
gpu_contiguous(var)) gpu_contiguous(var))
if ndim < 4: if ndim < 4:
result = theano.tensor.flatten(result, ndim) result = theano.tensor.flatten(result, ndim)
elif ndim > 5:
result = theano.tensor.reshape(result, inputs_shape)
return result return result
...@@ -3334,3 +3598,235 @@ def local_abstractconv3d_cudnn(node): ...@@ -3334,3 +3598,235 @@ def local_abstractconv3d_cudnn(node):
subsample=node.op.subsample, subsample=node.op.subsample,
conv_mode=conv_mode) conv_mode=conv_mode)
return [rval] return [rval]
@local_optimizer([bn.AbstractBatchNormTrain])
def local_abstract_batch_norm_train_cudnn(node):
    """Replace an AbstractBatchNormTrain node by the cuDNN training path.

    Returns the replacement outputs, or None when the rewrite does not
    apply (wrong op, input not on the GPU, axes with no cuDNN mode,
    non-constant or too-small epsilon, or cuDNN unavailable).
    """
    op = node.op
    if not isinstance(op, bn.AbstractBatchNormTrain):
        return None

    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    # optional running averages: both present or both absent
    running_mean = node.inputs[5] if len(node.inputs) > 5 else None
    running_var = node.inputs[6] if len(node.inputs) > 6 else None

    # only rewrite when the input already lives on the GPU
    # TODO what about the output?
    if not (isinstance(x.type, CudaNdarrayType) or
            (x.owner and isinstance(x.owner.op, HostFromGpu))):
        return None

    # map the normalization axes onto a cuDNN mode
    axes = tuple(op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None

    # epsilon and the running-average factor are Op parameters on the
    # cuDNN side, so both must be compile-time constants
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
        running_average_factor = float(
            theano.tensor.get_scalar_constant_value(running_average_factor))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        # cuDNN imposes a minimum epsilon of 1e-5
        return None
    if not dnn_available():
        return None

    args = [as_cuda_ndarray_variable(x),
            as_cuda_ndarray_variable(scale),
            as_cuda_ndarray_variable(bias),
            mode, eps, running_average_factor]
    if running_mean is not None and running_var is not None:
        args += [running_mean, running_var]
    results = list(dnn_batch_normalization_train(*args))

    # If the original output was on CPU, we have to transfer it
    for i, expected in enumerate(node.outputs):
        if isinstance(expected.type, tensor.TensorType):
            results[i] = tensor.as_tensor_variable(results[i])
    # TODO copy_stack_trace?
    return results
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_output(node):
    """Rebuild a GpuDnnBatchNorm node with in-place output enabled."""
    op = node.op
    if not (isinstance(op, GpuDnnBatchNorm) and not op.inplace_output):
        return
    inplace_op = GpuDnnBatchNorm(mode=op.mode,
                                 epsilon=op.epsilon,
                                 running_average_factor=op.running_average_factor,
                                 running_averages=op.running_averages,
                                 inplace_running_mean=op.inplace_running_mean,
                                 inplace_running_var=op.inplace_running_var,
                                 inplace_output=True)
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_mean(node):
    """Rebuild a GpuDnnBatchNorm node updating the running mean in-place."""
    op = node.op
    if not (isinstance(op, GpuDnnBatchNorm) and op.running_averages and
            not op.inplace_running_mean):
        return
    inplace_op = GpuDnnBatchNorm(mode=op.mode,
                                 epsilon=op.epsilon,
                                 running_average_factor=op.running_average_factor,
                                 running_averages=op.running_averages,
                                 inplace_running_mean=True,
                                 inplace_running_var=op.inplace_running_var,
                                 inplace_output=op.inplace_output)
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_var(node):
    """Rebuild a GpuDnnBatchNorm node updating the running variance in-place."""
    op = node.op
    if not (isinstance(op, GpuDnnBatchNorm) and op.running_averages and
            not op.inplace_running_var):
        return
    inplace_op = GpuDnnBatchNorm(mode=op.mode,
                                 epsilon=op.epsilon,
                                 running_average_factor=op.running_average_factor,
                                 running_averages=op.running_averages,
                                 inplace_running_mean=op.inplace_running_mean,
                                 inplace_running_var=True,
                                 inplace_output=op.inplace_output)
    return inplace_op(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_gpu_batch_norm_inference_inplace(node):
    """Enable in-place output for the cuDNN batch-norm inference Op."""
    op = node.op
    if isinstance(op, GpuDnnBatchNormInference) and not op.inplace:
        inplace_op = GpuDnnBatchNormInference(mode=op.mode,
                                              epsilon=op.epsilon,
                                              inplace=True)
        return [inplace_op(*node.inputs)]
@local_optimizer([bn.AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad_cudnn(node):
    """Lower ``bn.AbstractBatchNormTrainGrad`` to ``GpuDnnBatchNormGrad``.

    Returns the three replacement gradients (w.r.t. inputs, scale, bias),
    or None when the rewrite does not apply: wrong op, neither x nor dy on
    the GPU, axes with no cuDNN mode, non-constant or too-small epsilon,
    or cuDNN unavailable.
    """
    if not isinstance(node.op, bn.AbstractBatchNormTrainGrad):
        return None
    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    # input on gpu? TODO what about the output?
    x_on_gpu = (isinstance(x.type, CudaNdarrayType) or
                (x.owner and isinstance(x.owner.op, HostFromGpu)))
    dy_on_gpu = (isinstance(dy.type, CudaNdarrayType) or
                 (dy.owner and isinstance(dy.owner.op, HostFromGpu)))
    if not (x_on_gpu or dy_on_gpu):
        return None
    # convert axes to cuDNN mode
    axes = tuple(node.op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None
    # The cuDNN Op works on 4d/5d tensors: pad smaller inputs with extra
    # trailing dimensions, flatten larger ones down to 5d, and remember
    # the original shapes so the gradients can be restored below.
    ndim = x.ndim
    if ndim < 4:
        x = theano.tensor.shape_padright(x, 4 - ndim)
        dy = theano.tensor.shape_padright(dy, 4 - ndim)
        scale = theano.tensor.shape_padright(scale, 4 - ndim)
        x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim)
        x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim)
    elif ndim > 5:
        x_shape = x.shape
        params_shape = scale.shape
        x = theano.tensor.flatten(x, 5)
        dy = theano.tensor.flatten(dy, 5)
        scale = theano.tensor.flatten(scale, 5)
        x_mean = theano.tensor.flatten(x_mean, 5)
        x_invstd = theano.tensor.flatten(x_invstd, 5)
    # epsilon is an Op parameter, so it must be a compile-time constant;
    # cuDNN imposes a minimum of 1e-5
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None
    if not dnn_available():
        return None
    x = as_cuda_ndarray_variable(x)
    dy = as_cuda_ndarray_variable(dy)
    scale = as_cuda_ndarray_variable(scale)
    x_mean = as_cuda_ndarray_variable(x_mean)
    x_invstd = as_cuda_ndarray_variable(x_invstd)
    g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
        GpuDnnBatchNormGrad(mode, epsilon=eps)(x, dy, scale, x_mean, x_invstd)
    # undo the dimension padding / flattening applied above
    if ndim < 4:
        g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim)
        g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim)
        g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim)
    elif ndim > 5:
        g_wrt_inputs = theano.tensor.reshape(g_wrt_inputs, x_shape)
        g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape)
        g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape)
    # If the original output was on CPU, we have to transfer it
    if isinstance(node.outputs[0].type, tensor.TensorType):
        g_wrt_inputs = tensor.as_tensor_variable(g_wrt_inputs)
    if isinstance(node.outputs[1].type, tensor.TensorType):
        g_wrt_scale = tensor.as_tensor_variable(g_wrt_scale)
    if isinstance(node.outputs[2].type, tensor.TensorType):
        g_wrt_bias = tensor.as_tensor_variable(g_wrt_bias)
    # TODO copy_stack_trace?
    return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@local_optimizer([bn.AbstractBatchNormInference])
def local_abstract_batch_norm_inference_cudnn(node):
    """Replace an AbstractBatchNormInference node by the cuDNN test path."""
    if not isinstance(node.op, bn.AbstractBatchNormInference):
        return None

    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs

    # normalization axes must correspond to a cuDNN mode
    axes = tuple(node.op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None

    # input on gpu? TODO what about the output?
    on_gpu = (isinstance(x.type, CudaNdarrayType) or
              (x.owner and isinstance(x.owner.op, HostFromGpu)))
    if not on_gpu:
        return None

    # epsilon must be a compile-time constant of at least 1e-5
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None
    if not dnn_available():
        return None

    gpu_args = [as_cuda_ndarray_variable(v)
                for v in (x, scale, bias, estimated_mean, estimated_variance)]
    out = dnn_batch_normalization_test(gpu_args[0], gpu_args[1], gpu_args[2],
                                       gpu_args[3], gpu_args[4], mode, eps)

    # If the original output was on CPU, we have to transfer it
    # TODO copy_stack_trace?
    if isinstance(node.outputs[0].type, tensor.TensorType):
        return [tensor.as_tensor_variable(out)]
    return [out]
...@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm', ...@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm, 30, local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm', 'conv_gemm',
'gpu', 'fast_compile', 'fast_run') 'gpu', 'fast_compile', 'fast_run')
# Register cuDNN batch normalization implementation

# Local group DB holding the abstract batch-norm rewrites; the group
# itself is registered under the 'fast_compile' tag.
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpu_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)

# cuDNN optimizations are only registered if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from .dnn import (local_abstract_batch_norm_train_cudnn,
                  local_abstract_batch_norm_train_grad_cudnn,
                  local_abstract_batch_norm_inference_cudnn)  # noqa: 402
# All three rewrites share the same position (20) and tag set.
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_dnn',
                                      local_abstract_batch_norm_train_cudnn, 20,
                                      'batchnorm_dnn',
                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_grad_dnn',
                                      local_abstract_batch_norm_train_grad_cudnn, 20,
                                      'batchnorm_dnn',
                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_inference_dnn',
                                      local_abstract_batch_norm_inference_cudnn, 20,
                                      'batchnorm_dnn',
                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from collections import OrderedDict
import logging import logging
import os import os
import sys import sys
...@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt ...@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.nnet import bn
import theano.sandbox.cuda.dnn as dnn import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared from theano.sandbox.cuda import float32_shared_constructor as shared
...@@ -730,52 +732,201 @@ def test_batchnorm_train(): ...@@ -730,52 +732,201 @@ def test_batchnorm_train():
raise SkipTest("batch normalization requires cudnn v5+") raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng() utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'): for mode in ('per-activation', 'spatial'):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector): for vartype in (tensor6, T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias')) x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used eps = 5e-3 # some non-standard value to test if it's used
running_average_factor = 0.3
# forward pass
out, x_mean, x_invstd = cuda.dnn.dnn_batch_normalization_train( # forward pass, direct interface
x, scale, bias, mode, eps) out_gpu, x_mean_gpu, x_invstd_gpu, \
out_running_mean_gpu, out_running_var_gpu = \
dnn.dnn_batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# forward pass, abstract interface
out_abstract, x_mean_abstract, x_invstd_abstract, \
out_running_mean_abstract, out_running_var_abstract = \
bn.batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# reference forward pass # reference forward pass
if mode == 'per-activation': if mode == 'per-activation':
axes = (0,) axes = (0,)
elif mode == 'spatial': elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim)) axes = (0,) + tuple(range(2, ndim))
x_mean2 = x.mean(axis=axes, keepdims=True) x_mean_ref = x.mean(axis=axes, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps)) x_var_ref = x.var(axis=axes, keepdims=True)
scale2 = T.addbroadcast(scale, *axes) x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
bias2 = T.addbroadcast(bias, *axes) scale_ref = T.addbroadcast(scale, *axes)
out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2 bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
out_running_var_ref = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var_ref * running_average_factor
# backward pass # backward pass
dy = vartype('dy') dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy}) grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
# reference backward pass # reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy}) grads_ref = T.grad(None, wrt=[x, scale, bias], known_grads={out_ref: dy})
# compile # compile
f = theano.function([x, scale, bias, dy], f_gpu = theano.function([x, scale, bias, running_mean, running_var, dy],
[out, x_mean, x_invstd, out2, x_mean2, x_invstd2] + [out_gpu, x_mean_gpu, x_invstd_gpu,
grads + grads2, mode=mode_with_gpu) out_running_mean_gpu, out_running_var_gpu] + grads_gpu,
mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract,
out_running_mean_abstract, out_running_var_abstract] +
grads_abstract,
mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_ref, x_mean_ref, x_invstd_ref,
out_running_mean_ref, out_running_var_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
in f_abstract.maker.fgraph.toposort()])
assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run # run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)): for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim] data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape)) for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32') X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32') Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype('float32') Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype('float32') Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Dy) Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs # compare outputs
utt.assert_allclose(outputs[0], outputs[0 + 3]) # out utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs[1], outputs[1 + 3]) # mean utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 3]) # invstd utt.assert_allclose(outputs_gpu[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_gpu[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
# compare gradients # compare gradients
utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4) # dx utt.assert_allclose(outputs_gpu[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4) # dscale utt.assert_allclose(outputs_gpu[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs[8], outputs[8 + 3]) # dbias utt.assert_allclose(outputs_gpu[7], outputs_ref[7]) # dbias
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias
def test_dnn_batchnorm_train_without_running_averages():
    """Compile and run batch_normalization_train without running averages.

    Checks both the direct cuDNN interface and the abstract interface,
    and verifies the abstract Ops were replaced by cuDNN Ops.
    """
    # compile and run batch_normalization_train without running averages
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass, direct and abstract interfaces
    out_gpu, x_mean_gpu, x_invstd_gpu = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')

    # backward pass
    grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
    # BUGFIX: the abstract gradients must be taken through out_abstract;
    # the previous code used out_gpu here, so the abstract gradient path
    # was never actually compiled or exercised by this test.
    grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})

    # compile
    f_gpu = theano.function([x, scale, bias, dy],
                            [out_gpu, x_mean_gpu, x_invstd_gpu] +
                            grads_gpu,
                            mode=mode_with_gpu)
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract, x_invstd_abstract] +
                                 grads_abstract,
                                 mode=mode_with_gpu)

    # check if the abstract Ops have been replaced
    assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                for n in f_abstract.maker.fgraph.toposort()])
    assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                for n in f_abstract.maker.fgraph.toposort()])
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f_abstract.maker.fgraph.toposort()])

    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_gpu(X, Scale, Bias, Dy)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
    """Check that the running averages and the output are updated in place."""
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    bcast = (True, False, False, False)
    running_mean = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=bcast)
    running_var = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=bcast)
    # build the training graph, requesting running-average updates
    out, x_mean, x_invstd, new_running_mean, new_running_var = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation',
                                          epsilon=5e-3, running_average_factor=0.3,
                                          running_mean=running_mean, running_var=running_var)
    # update the shared running averages on every call
    updates = OrderedDict()
    updates[running_mean] = new_running_mean
    updates[running_var] = new_running_var
    # compile
    f = theano.function([x, scale, bias],
                        [out, x_mean, x_invstd],
                        updates=updates,
                        mode=mode_with_gpu)
    # exactly one batchnorm node should remain, with every inplace flag set
    bn_nodes = [node for node in f.maker.fgraph.toposort()
                if isinstance(node.op, dnn.GpuDnnBatchNorm)]
    assert len(bn_nodes) == 1
    bn_op = bn_nodes[0].op
    assert bn_op.inplace_running_mean
    assert bn_op.inplace_running_var
    assert bn_op.inplace_output
    # run once on random data
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias)
def test_batchnorm_inference(): def test_batchnorm_inference():
...@@ -785,53 +936,160 @@ def test_batchnorm_inference(): ...@@ -785,53 +936,160 @@ def test_batchnorm_inference():
raise SkipTest("batch normalization requires cudnn v5+") raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng() utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'): for mode in ('per-activation', 'spatial'):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector): for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias, mean, var = (vartype(n) for n in ('x', 'scale', x, scale, bias, mean, var = (vartype(n)
'bias', 'mean', for n in ('x', 'scale', 'bias', 'mean', 'var'))
'var'))
ndim = x.ndim ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used eps = 5e-3 # some non-standard value to test if it's used
# forward pass # forward pass, direct interface
out = cuda.dnn.dnn_batch_normalization_test(x, scale, bias, mean, out_gpu = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps) var, mode, eps)
# forward pass, abstract interface
out_abstract = bn.batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# reference forward pass # reference forward pass
if mode == 'per-activation': if mode == 'per-activation':
axes = (0,) axes = (0,)
elif mode == 'spatial': elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim)) axes = (0,) + tuple(range(2, ndim))
scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes) scale_ref, bias_ref, mean_ref, var_ref = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var)) for t in (scale, bias, mean, var))
out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2 out_ref = (x - mean_ref) * (scale_ref / T.sqrt(var_ref + eps)) + bias_ref
# backward pass # backward pass
dy = vartype('dy') dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy}) grads_gpu = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_abstract: dy})
# reference backward pass # reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy}) grads_ref = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_ref: dy})
# compile # compile
f = theano.function([x, scale, bias, mean, var, dy], f_gpu = theano.function([x, scale, bias, mean, var, dy],
[out, out2] + grads + grads2, mode=mode_with_gpu) [out_gpu] + grads_gpu, mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, mean, var, dy],
[out_abstract] + grads_abstract, mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, mean, var, dy],
[out_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run # run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)): for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim] data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape)) for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32') X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32') Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype('float32') Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype('float32') Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype('float32') Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype('float32') Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Mean, Var, Dy) outputs_gpu = f_gpu(X, Scale, Bias, Mean, Var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Mean, Var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Mean, Var, Dy)
# compare outputs # compare outputs
utt.assert_allclose(outputs[0], outputs[1]) # out utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
# compare gradients # compare gradients
utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5) # dx utt.assert_allclose(outputs_gpu[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5) # dscale utt.assert_allclose(outputs_gpu[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean utt.assert_allclose(outputs_gpu[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar utt.assert_allclose(outputs_gpu[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_abstract[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_abstract[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_abstract[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
def test_batchnorm_inference_inplace():
    """Check that cuDNN inference batch normalization runs in place."""
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    x, scale, bias, mean, var = (T.tensor4(name)
                                 for name in ('x', 'scale', 'bias', 'mean', 'var'))
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function([x, scale, bias, mean, var], [out], mode=mode_with_gpu)
    # the remaining inference node must have its inplace flag enabled
    bn_nodes = [node for node in f.maker.fgraph.toposort()
                if isinstance(node.op, dnn.GpuDnnBatchNormInference)]
    assert len(bn_nodes) == 1
    assert bn_nodes[0].op.inplace
    # run once on random data
    floatX = theano.config.floatX
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(floatX)
    Scale = numpy.random.randn(*param_shape).astype(floatX)
    Bias = numpy.random.randn(*param_shape).astype(floatX)
    Mean = numpy.random.randn(*param_shape).astype(floatX)
    Var = numpy.random.rand(*param_shape).astype(floatX)
    f(X, Scale, Bias, Mean, Var)
def test_dnn_batchnorm_valid_and_invalid_axes():
    """Check axes handling of the abstract batch normalization interface.

    For axes supported by cuDNN ('per-activation' and 'spatial' patterns),
    the abstract Ops must be replaced by the cuDNN Ops; for unsupported axes
    they must still be replaced, but by a non-cuDNN implementation.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        x, scale, bias, mean, var, dy = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var', 'dy'))
        ndim = x.ndim
        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = bn.batch_normalization_train(
                x, scale, bias, axes)
            out_test = bn.batch_normalization_test(
                x, scale, bias, mean, var, axes)
            # backward pass (uses the `dy` variable created above; the
            # original redundantly rebound a fresh `dy` here)
            grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)
            if axes in valid_axes_lists:
                # check if the abstract Ops have been replaced by the cuDNN Ops
                assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
                            in f.maker.fgraph.toposort()])
                assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
            else:
                # check if the abstract Ops have been replaced, but not by the cuDNN Ops
                assert not any([isinstance(n.op, (dnn.GpuDnnBatchNorm,
                                                  dnn.GpuDnnBatchNormGrad,
                                                  bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
def test_dnn_tag(): def test_dnn_tag():
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import numpy
import theano import theano
from theano import Apply, Op
from theano.gof import local_optimizer
from theano.gof.opt import copy_stack_trace
from theano.tensor import as_tensor_variable, TensorType
from theano.tensor import basic as T
from theano.tensor.opt import register_specialize_device
from theano.scalar import Composite from theano.scalar import Composite
from theano.scalar import add, sub, true_div, mul from theano.scalar import add, sub, true_div, mul
...@@ -37,7 +44,7 @@ def batch_normalization(inputs, gamma, beta, mean, std, ...@@ -37,7 +44,7 @@ def batch_normalization(inputs, gamma, beta, mean, std,
""" """
This function will build the symbolic graph for applying batch normalization This function will build the symbolic graph for applying batch normalization
to a set of activations. to a set of activations.
Also works on GPUs Also works on GPUs, but is not optimized using cuDNN.
.. versionadded:: 0.7.1 .. versionadded:: 0.7.1
...@@ -75,3 +82,631 @@ def batch_normalization(inputs, gamma, beta, mean, std, ...@@ -75,3 +82,631 @@ def batch_normalization(inputs, gamma, beta, mean, std,
raise ValueError( raise ValueError(
'mode must be either "low_mem", "high_mem"') 'mode must be either "low_mem", "high_mem"')
return rval return rval
def _prepare_batch_normalization_axes(axes, ndim):
if axes == 'per-activation':
axes = (0,)
elif axes == 'spatial':
axes = (0,) + tuple(range(2, ndim))
elif isinstance(axes, (tuple, list, numpy.ndarray)):
axes = tuple(int(a) for a in axes)
else:
raise ValueError('invalid axes: %s', str(axes))
axes = tuple(sorted(axes))
if len(axes) == 0:
raise ValueError('there should be at least one normalization axis')
if min(axes) < 0 or max(axes) >= ndim:
raise ValueError('axes should be less than ndim (<%d), but %s given' % (ndim, str(axes)))
non_bc_axes = tuple(i for i in range(ndim) if i not in axes)
return axes, non_bc_axes
def batch_normalization_train(inputs, gamma, beta, axes='per-activation',
                              epsilon=1e-4, running_average_factor=0.1,
                              running_mean=None, running_var=None):
    """
    Performs batch normalization of the given inputs, using the mean and
    variance of the inputs.

    Parameters
    ----------
    inputs : tensor
        The input to be normalized along `axes`.
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Learnable scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Learnable biases. Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values of `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly,
        if the factor is close to zero it will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None. The shape should match that of `gamma` and `beta`.

    Returns
    -------
    out : tensor
        Batch-normalized inputs.
    mean : tensor
        Means of `inputs` across the normalization axes.
    invstd : tensor
        Inverse standard deviations of `inputs` across the normalization axes.
    new_running_mean : tensor
        New value of the running mean (only if both `running_mean` and
        `running_var` were given).
    new_running_var : tensor
        New value of the running variance (only if both `running_var` and
        `running_mean` were given).

    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)

    The returned values are equivalent to:

    .. code-block:: python

        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        mean = inputs.mean(axes, keepdims=True)
        var = inputs.var(axes, keepdims=True)
        invstd = T.inv(T.sqrt(var + epsilon))
        out = (inputs - mean) * gamma * invstd + beta

        m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
        running_mean = running_mean * (1 - running_average_factor) + \\
                       mean * running_average_factor
        running_var = running_var * (1 - running_average_factor) + \\
                      (m / (m - 1)) * var * running_average_factor
    """
    ndim = inputs.ndim
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)
    # have the parameter tensors been broadcasted yet?
    # If gamma already has the full dimensionality, assume the caller
    # broadcasted the parameters; otherwise build a dimshuffle pattern that
    # inserts a length-1 ('x') dimension for every normalized axis.
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ['x'] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i
    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError("gamma and beta dimensionality must match the "
                         "number of non-normalized axes, or have the "
                         "same number of dimensions as the inputs; "
                         "got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, params_ndim))
    if (running_mean is None) != (running_var is None):
        raise ValueError("running_mean and running_var must either both be "
                         "given or both be None")
    if running_mean is not None and running_mean.ndim != params_ndim:
        raise ValueError("running_mean must be of the same dimensionality "
                         "as gamma and beta; got %d instead of %d" %
                         (running_mean.ndim, params_ndim))
    if running_var is not None and running_var.ndim != params_ndim:
        raise ValueError("running_var must be of the same dimensionality "
                         "as gamma and beta; got %d instead of %d" %
                         (running_var.ndim, params_ndim))
    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = numpy.cast[theano.config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))
    inputs = as_tensor_variable(inputs)
    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)
    # broadcast the parameters to the full input dimensionality
    if params_ndim != ndim:
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
    else:
        gamma = T.addbroadcast(gamma, *axes)
        beta = T.addbroadcast(beta, *axes)
    batchnorm_op = AbstractBatchNormTrain(axes=axes)
    if running_mean is not None and running_var is not None:
        running_mean = as_tensor_variable(running_mean)
        running_var = as_tensor_variable(running_var)
        if params_ndim != ndim:
            running_mean = running_mean.dimshuffle(params_dimshuffle_pattern)
            running_var = running_var.dimshuffle(params_dimshuffle_pattern)
        else:
            running_mean = T.addbroadcast(running_mean, *axes)
            running_var = T.addbroadcast(running_var, *axes)
        out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
            inputs, gamma, beta, epsilon=epsilon,
            running_average_factor=running_average_factor,
            running_mean=running_mean, running_var=running_var)
        # restore the broadcast pattern of the running-average inputs on the
        # corresponding outputs, so update expressions type-check
        if new_running_mean.broadcastable != running_mean.broadcastable:
            new_running_mean = T.patternbroadcast(new_running_mean, running_mean.broadcastable)
        if new_running_var.broadcastable != running_var.broadcastable:
            new_running_var = T.patternbroadcast(new_running_var, running_var.broadcastable)
        results = (out, mean, invstd, new_running_mean, new_running_var)
    else:
        results = batchnorm_op(inputs, gamma, beta, epsilon=epsilon)
    if params_ndim != ndim:
        # remove the broadcasted dimensions (except from the output)
        results = ([results[0]] +
                   [r.dimshuffle(non_bc_axes) for r in results[1:]])
    return tuple(results)
def batch_normalization_test(inputs, gamma, beta, mean, var,
                             axes='per-activation', epsilon=1e-4):
    """
    Performs batch normalization of the given inputs, using the given mean and
    variance.

    Parameters
    ----------
    inputs : tensor
        The input to be normalized along `axes`.
    axes : 'per-activation', 'spatial' or a tuple of ints
        The axes along which the input should be normalized. ``'per-activation'``
        normalizes per activation and is equal to ``axes=(0,)``.
        ``'spatial'`` shares normalization factors across spatial dimensions
        (i.e., all dimensions past the second), which for 4D inputs would be
        equal to ``axes=(0, 2, 3)``.
    gamma : tensor
        Scale factors. The shape must match the shape of `inputs`,
        except for the axes in `axes`. These axes should be set to 1 or be
        skipped altogether (such that `gamma.ndim == inputs.ndim - len(axes)`).
    beta : tensor
        Biases. Must match the tensor layout of `gamma`.
    mean : tensor
        Means. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    var : tensor
        Variances. Usually these are running averages computed during training.
        Must match the tensor layout of `gamma`.
    epsilon : float
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).

    Returns
    -------
    out : tensor
        Batch-normalized inputs.

    Notes
    -----
    If per-activation or spatial normalization is selected, this operation
    will use the cuDNN implementation. (This requires cuDNN 5 or newer.)

    The returned value is equivalent to:

    .. code-block:: python

        # for per-activation normalization
        axes = (0,)
        # for spatial normalization
        axes = (0,) + tuple(range(2, inputs.ndim))
        gamma, beta, mean, var = (T.addbroadcast(t, *axes)
                                  for t in (gamma, beta, mean, var))
        out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
    """
    ndim = inputs.ndim
    axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)
    # have the parameter tensors been broadcasted yet?
    # If gamma already has the full dimensionality, assume the caller
    # broadcasted the parameters; otherwise build a dimshuffle pattern that
    # inserts a length-1 ('x') dimension for every normalized axis.
    if gamma.ndim == ndim:
        params_ndim = ndim
    else:
        params_ndim = len(non_bc_axes)
        params_dimshuffle_pattern = ['x'] * ndim
        for i, axis in enumerate(non_bc_axes):
            params_dimshuffle_pattern[axis] = i
    if gamma.ndim != params_ndim or beta.ndim != params_ndim:
        raise ValueError("gamma and beta dimensionality must match the "
                         "number of non-normalized axes, or have the "
                         "same number of dimensions as the inputs; "
                         "got %d and %d instead of %d" %
                         (gamma.ndim, beta.ndim, params_ndim))
    if mean.ndim != params_ndim or var.ndim != params_ndim:
        raise ValueError("mean and var must be of the same dimensionality "
                         "as gamma and beta; got %d and %d instead of %d" %
                         (mean.ndim, var.ndim, params_ndim))
    # epsilon will be converted to floatX later. we need to check
    # for rounding errors now, since numpy.float32(1e-5) < 1e-5.
    epsilon = numpy.cast[theano.config.floatX](epsilon)
    if epsilon < 1e-5:
        raise ValueError("epsilon must be at least 1e-5, got %s" % str(epsilon))
    gamma = as_tensor_variable(gamma)
    beta = as_tensor_variable(beta)
    mean = as_tensor_variable(mean)
    var = as_tensor_variable(var)
    # broadcast the parameters to the full input dimensionality
    if params_ndim != ndim:
        gamma = gamma.dimshuffle(params_dimshuffle_pattern)
        beta = beta.dimshuffle(params_dimshuffle_pattern)
        mean = mean.dimshuffle(params_dimshuffle_pattern)
        var = var.dimshuffle(params_dimshuffle_pattern)
    else:
        gamma = T.addbroadcast(gamma, *axes)
        beta = T.addbroadcast(beta, *axes)
        mean = T.addbroadcast(mean, *axes)
        var = T.addbroadcast(var, *axes)
    batchnorm_op = AbstractBatchNormInference(axes=axes)
    return batchnorm_op(inputs, gamma, beta, mean, var, epsilon=epsilon)
class AbstractBatchNormTrain(Op):
    """
    Abstract Op for Batch Normalization.

    This Op computes the batch-normalized output along with the batch mean
    and inverse standard deviation, and optionally updated running averages.
    Device-specific optimizers replace it with a concrete implementation
    (e.g. cuDNN) or with an explicit graph of tensor operations.

    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input should be normalized.
    x : tensor
        The input to be normalized along `axes`.
    scale : tensor
        `scale` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    bias : tensor
        `bias` should have the same number of dimensions as `x`.
        All dimensions listed in `axes` should have length 1.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    running_average_factor : float
        Factor for updating the values of `running_mean` and `running_var`.
        If the factor is close to one, the running averages will update quickly,
        if the factor is close to zero it will update slowly.
    running_mean : tensor or None
        Previous value of the running mean. If this is given, the new value
        ``running_mean * (1 - running_average_factor) + batch mean * running_average_factor``
        will be returned as one of the outputs of this function.
        `running_mean` and `running_var` should either both be given or
        both be None.
    running_var : tensor or None
        Previous value of the running variance. If this is given, the new value
        ``running_var * (1 - running_average_factor) + (m / (m - 1)) * batch var * running_average_factor``
        will be returned as one of the outputs of this function,
        where `m` is the product of lengths of the averaged-over dimensions.
        `running_mean` and `running_var` should either both be given or
        both be None.
    """

    __props__ = ('axes',)

    def __init__(self, axes=(0,)):
        # Normalize `axes` to a tuple of ints so __props__ comparison works.
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes

    def infer_shape(self, node, shape):
        # Output 0 (out) has the shape of x; every other output (mean,
        # invstd, optional running averages) has the shape of scale.
        return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)

    def make_node(self, x, scale, bias, epsilon=1e-4,
                  running_average_factor=0.1,
                  running_mean=None, running_var=None):
        """Build the Apply node; running_mean/running_var must be given together."""
        assert x.ndim == scale.ndim == bias.ndim
        assert ((running_mean is None and running_var is None) or
                (running_mean is not None and running_var is not None))
        assert (running_mean is None or running_mean.ndim == x.ndim)
        assert (running_var is None or running_var.ndim == x.ndim)
        # scalar hyperparameters become symbolic inputs
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        if not isinstance(running_average_factor, theano.Variable):
            running_average_factor = as_tensor_variable(running_average_factor)
        inputs = [x, scale, bias, epsilon, running_average_factor]
        output_types = [x.type(), scale.type(), scale.type()]
        if running_mean is not None and running_var is not None:
            inputs.append(running_mean)
            inputs.append(running_var)
            output_types.append(scale.type())
            output_types.append(scale.type())
        return Apply(self, inputs, output_types)

    def L_op(self, inputs, outputs, grads):
        # Gradients wrt x, scale and bias come from the dedicated grad Op;
        # the scalar hyperparameters and running averages are disconnected.
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        dy = grads[0]
        _, x_mean, x_invstd = outputs[:3]
        disconnected_outputs = [
            theano.gradient.DisconnectedType()(),  # epsilon
            theano.gradient.DisconnectedType()()]  # running_average_factor
        # Optional running_mean and running_var.
        for i in range(5, len(inputs)):
            disconnected_outputs.append(theano.gradient.DisconnectedType()())
        return AbstractBatchNormTrainGrad(self.axes)(
            x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs

    def connection_pattern(self, node):
        # Specify that epsilon and running_average_factor are not connected to outputs.
        patterns = [[True, True, True],     # x
                    [True, True, True],     # scale
                    [True, True, True],     # bias
                    [False, False, False],  # epsilon
                    [False, False, False]]  # running_average_factor
        # Optional running_mean and running_var are only
        # connected to their new values.
        for i in range(5, len(node.inputs)):
            # x also influences each new running-average output
            patterns[0].append(True)
            for pattern in patterns[1:]:
                pattern.append(False)
            patterns.append([False] * (3 + i - 5) + [True])
        return patterns

    def perform(self, node, inputs, output_storage):
        """Reference NumPy implementation of the forward training pass."""
        x, scale, bias, epsilon, running_average_factor = inputs[:5]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))

        mean = x.mean(axes, keepdims=True)
        var = x.var(axes, keepdims=True)
        invstd = 1.0 / numpy.sqrt(var + epsilon)
        out = (x - mean) * (scale * invstd) + bias

        output_storage[0][0] = out
        output_storage[1][0] = mean
        output_storage[2][0] = invstd

        # Optionally update the running averages (unbiased variance for
        # running_var, hence the m / (m - 1) correction).
        if len(inputs) > 5:
            running_mean = inputs[5]
            running_mean = running_mean * (1.0 - running_average_factor) + \
                mean * running_average_factor
            output_storage[3][0] = running_mean
            if len(inputs) > 6:
                m = float(numpy.prod(x.shape) / numpy.prod(scale.shape))
                running_var = inputs[6]
                running_var = running_var * (1.0 - running_average_factor) + \
                    (m / (m - 1)) * var * running_average_factor
                output_storage[4][0] = running_var
class AbstractBatchNormInference(Op):
    """
    Abstract Op for Batch Normalization.

    This Op normalizes the input using pre-computed (usually running-average)
    mean and variance estimates. Device-specific optimizers replace it with
    a concrete implementation (e.g. cuDNN) or with an explicit graph of
    tensor operations.

    Parameters
    ----------
    axes : a tuple of ints
        The axes along which the input is normalized.
    epsilon
        Epsilon value used in the batch normalization formula. Minimum allowed
        value is 1e-5 (imposed by cuDNN).
    """

    __props__ = ('axes',)

    def __init__(self, axes=(0,)):
        # Normalize `axes` to a tuple of ints so __props__ comparison works.
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes

    def infer_shape(self, node, shape):
        # The single output has the shape of x.
        return [shape[0]]

    def make_node(self, x, scale, bias, estimated_mean, estimated_variance, epsilon=1e-4):
        """Build the Apply node; all tensor inputs must share x's ndim."""
        assert x.ndim == scale.ndim == bias.ndim == estimated_mean.ndim == estimated_variance.ndim
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        return Apply(self, [x, scale, bias, estimated_mean, estimated_variance, epsilon], [x.type()])

    def grad(self, inputs, grads):
        """Symbolic gradients wrt x, scale, bias, mean and variance."""
        x, scale, bias, est_mean, est_var, epsilon = inputs
        dy = grads[0]
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))

        scale, bias, est_mean, est_var = (theano.tensor.addbroadcast(t, *axes)
                                          for t in (scale, bias, est_mean, est_var))

        # define helper expressions
        est_var_eps = est_var + epsilon
        est_std = theano.tensor.sqrt(est_var_eps)
        two = theano.tensor.constant(2.)

        # define and return gradients
        dx = dy * (scale / est_std)
        dscale = (dy * (x - est_mean)).sum(axes, keepdims=True) / est_std
        dbias = dy.sum(axes, keepdims=True)
        dmean = -dy.sum(axes, keepdims=True) * (scale / est_std)
        dvar = -(dy * (x - est_mean)).sum(axes, keepdims=True) * (scale / (two * est_var_eps * est_std))
        return [dx, dscale, dbias, dmean, dvar, theano.gradient.DisconnectedType()()]

    def connection_pattern(self, node):
        # Specify that epsilon is not connected to outputs.
        return [[True], [True], [True], [True], [True], [False]]

    def perform(self, node, inputs, output_storage):
        """Reference NumPy implementation of the inference forward pass."""
        x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
        out = (x - estimated_mean) * (scale / numpy.sqrt(estimated_variance + epsilon)) + bias
        output_storage[0][0] = out
class AbstractBatchNormTrainGrad(Op):
    """
    Abstract Op for the gradient of batch normalization training.

    Given the input `x`, the output gradient `dy`, the scale, and the batch
    mean and inverse standard deviation computed in the forward pass, it
    returns the gradients with respect to the inputs, the scale and the bias.
    Device-specific optimizers replace it with a concrete implementation.
    """

    __props__ = ('axes',)

    def __init__(self, axes=(0,)):
        # Normalize `axes` to a tuple of ints so __props__ comparison works.
        assert isinstance(axes, (tuple, list))
        assert len(axes) > 0
        axes = tuple(int(a) for a in axes)
        self.axes = axes

    def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):
        """Build the Apply node; all tensor inputs must share x's ndim."""
        assert x.ndim == dy.ndim == scale.ndim == x_mean.ndim == x_invstd.ndim
        if not isinstance(epsilon, theano.Variable):
            epsilon = as_tensor_variable(epsilon)
        return Apply(self, [x, dy, scale, x_mean, x_invstd, epsilon],
                     [x.type(), scale.type(), scale.type()])

    def infer_shape(self, node, shape):
        # g_wrt_inputs has the shape of x; g_wrt_scale and g_wrt_bias have
        # the shape of scale.
        return [shape[0], shape[2], shape[2]]

    def perform(self, node, inputs, output_storage):
        """Reference NumPy implementation of the backward pass."""
        x, dy, scale, x_mean, x_invstd, epsilon = inputs
        axes = self.axes
        if min(axes) < 0 or max(axes) >= x.ndim:
            raise ValueError('axes should be less than ndim (<%d), but %s given' % (x.ndim, str(axes)))

        x_diff = x - x_mean
        mean_dy_x_diff = numpy.mean(dy * x_diff, axis=axes, keepdims=True)
        c = (dy * x_invstd) - (x_diff * mean_dy_x_diff * (x_invstd ** 3))

        g_wrt_inputs = scale * (c - numpy.mean(c, axis=axes, keepdims=True))
        g_wrt_scale = numpy.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
        g_wrt_bias = numpy.sum(dy, axis=axes, keepdims=True)

        output_storage[0][0] = g_wrt_inputs
        output_storage[1][0] = g_wrt_scale
        output_storage[2][0] = g_wrt_bias
@local_optimizer([AbstractBatchNormTrain])
def local_abstract_batch_norm_train(node):
    """Replace AbstractBatchNormTrain with an explicit graph of tensor ops.

    Used as a CPU/default fallback when no device-specific (e.g. cuDNN)
    implementation applies. Returns None when the node cannot be handled,
    which leaves the graph unchanged.
    """
    if not isinstance(node.op, AbstractBatchNormTrain):
        return None

    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    axes = node.op.axes
    # bugfix: a valid axis must be strictly smaller than x.ndim; the previous
    # check used `> x.ndim`, which let `axis == ndim` through and made
    # x.mean(axes) raise inside the optimizer instead of skipping the node
    # (matching the `>= x.ndim` validation in AbstractBatchNormTrain.perform)
    if min(axes) < 0 or max(axes) >= x.ndim:
        return None
    if not isinstance(x.type, TensorType) or \
       not isinstance(scale.type, TensorType) or \
       not isinstance(bias.type, TensorType) or \
       not isinstance(epsilon.type, TensorType) or \
       not isinstance(running_average_factor.type, TensorType):
        return None
    # optional running_mean and running_var
    if len(node.inputs) > 5 and not isinstance(node.inputs[5].type, TensorType):
        return None
    if len(node.inputs) > 6 and not isinstance(node.inputs[6].type, TensorType):
        return None

    # explicit forward pass
    mean = x.mean(axes, keepdims=True)
    var = x.var(axes, keepdims=True)
    invstd = T.inv(T.sqrt(var + epsilon))
    out = (x - mean) * (scale * invstd) + bias
    results = [out, mean, invstd]

    # optional running-average updates (unbiased variance, hence m / (m - 1))
    if len(node.inputs) > 5:
        running_mean = node.inputs[5]
        running_mean = running_mean * (1.0 - running_average_factor) + \
            mean * running_average_factor
        results.append(running_mean)
    if len(node.inputs) > 6:
        m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
        running_var = node.inputs[6]
        running_var = running_var * (1.0 - running_average_factor) + \
            (m / (m - 1)) * var * running_average_factor
        results.append(running_var)

    # keep the broadcast patterns of the replaced outputs
    results = [T.patternbroadcast(r, r_orig.broadcastable)
               for (r, r_orig) in zip(results, node.outputs)]

    for var in theano.gof.graph.variables(node.inputs, results):
        if var not in node.inputs:
            copy_stack_trace(node.outputs[0], var)
    return results
@local_optimizer([AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad(node):
    """
    Lower an AbstractBatchNormTrainGrad node to a plain Theano expression
    graph computing the gradients w.r.t. the inputs, the scale and the bias.

    Returns None (leaving the abstract Op in place for another optimizer)
    when the node cannot be handled here: wrong op, out-of-range axes, or
    inputs that are not plain TensorTypes.
    """
    if not isinstance(node.op, AbstractBatchNormTrainGrad):
        return None

    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    axes = node.op.axes
    # Valid axis indices are 0 .. x.ndim - 1. The previous check used
    # `max(axes) > x.ndim`, which wrongly accepted axis == x.ndim and would
    # then fail inside T.mean() instead of bailing out cleanly.
    if min(axes) < 0 or max(axes) >= x.ndim:
        return None
    if not isinstance(x.type, TensorType) or \
       not isinstance(dy.type, TensorType) or \
       not isinstance(scale.type, TensorType) or \
       not isinstance(x_mean.type, TensorType) or \
       not isinstance(x_invstd.type, TensorType) or \
       not isinstance(epsilon.type, TensorType):
        return None

    # Symbolic version of the same formulas used by the Op's perform():
    x_diff = x - x_mean
    mean_dy_x_diff = T.mean(dy * x_diff, axis=axes, keepdims=True)
    c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd ** 3))
    g_wrt_inputs = scale * (c - T.mean(c, axis=axes, keepdims=True))
    g_wrt_scale = T.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
    g_wrt_bias = T.sum(dy, axis=axes, keepdims=True)

    # keep the broadcastable patterns promised by the abstract Op's outputs
    results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
    results = [T.patternbroadcast(r, r_orig.broadcastable)
               for (r, r_orig) in zip(results, node.outputs)]

    # propagate the stack trace to every newly introduced variable
    for new_var in theano.gof.graph.variables(node.inputs, results):
        if new_var not in node.inputs:
            copy_stack_trace(node.outputs[0], new_var)
    return results
@local_optimizer([AbstractBatchNormInference])
def local_abstract_batch_norm_inference(node):
    """
    Lower an AbstractBatchNormInference node to the plain expression
    ``(x - mean) * (scale / sqrt(var + eps)) + bias``.

    Returns None (leaving the abstract Op in place for another optimizer)
    when the node is not an AbstractBatchNormInference or any input is not
    a plain TensorType.
    """
    if not isinstance(node.op, AbstractBatchNormInference):
        return None

    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs
    if any(not isinstance(inp.type, TensorType)
           for inp in (x, scale, bias, estimated_mean,
                       estimated_variance, epsilon)):
        return None

    normalized = (x - estimated_mean) * (scale / T.sqrt(estimated_variance + epsilon)) + bias
    # keep the broadcastable pattern promised by the abstract Op's output
    normalized = T.patternbroadcast(normalized, node.outputs[0].broadcastable)

    # propagate the stack trace to every newly introduced variable
    for graph_var in theano.gof.graph.variables(node.inputs, [normalized]):
        if graph_var not in node.inputs:
            copy_stack_trace(node.outputs[0], graph_var)
    return [normalized]
# Register the CPU optimizations.
# The three lowering optimizers above are grouped under a single name so
# they can be enabled/disabled together; each is registered at position 30
# with both the 'fast_compile' and 'fast_run' tags.
# NOTE(review): register_specialize_device presumably lets device-specific
# (e.g. GPU) replacements take precedence over these graph-based fallbacks
# — confirm against the optimizer database ordering.
bn_groupopt = theano.gof.optdb.LocalGroupDB()
bn_groupopt.__name__ = 'batchnorm_opts'
register_specialize_device(bn_groupopt, 'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_train',
                     local_abstract_batch_norm_train, 30,
                     'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_train_grad',
                     local_abstract_batch_norm_train_grad, 30,
                     'fast_compile', 'fast_run')
bn_groupopt.register('local_abstract_batch_norm_inference',
                     local_abstract_batch_norm_inference, 30,
                     'fast_compile', 'fast_run')
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import theano import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import numpy import numpy
from theano.tensor.nnet.bn import batch_normalization from theano.tensor.nnet import bn
def test_BNComposite(): def test_BNComposite():
...@@ -39,7 +40,7 @@ def test_BNComposite(): ...@@ -39,7 +40,7 @@ def test_BNComposite():
f_ref = theano.function([x, b, g, m, v], [bn_ref_op]) f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
res_ref = f_ref(X, G, B, M, V) res_ref = f_ref(X, G, B, M, V)
for mode in ['low_mem', 'high_mem']: for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x, g, b, m, v, mode=mode) bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
f = theano.function([x, b, g, m, v], [bn_op]) f = theano.function([x, b, g, m, v], [bn_op])
res = f(X, G, B, M, V) res = f(X, G, B, M, V)
utt.assert_allclose(res_ref, res) utt.assert_allclose(res_ref, res)
...@@ -47,7 +48,7 @@ def test_BNComposite(): ...@@ -47,7 +48,7 @@ def test_BNComposite():
theano.config.compute_test_value = orig theano.config.compute_test_value = orig
def test_bn(): def test_batch_normalization():
def bn_ref(x, G, B, M, V): def bn_ref(x, G, B, M, V):
n = (x - M) / V n = (x - M) / V
...@@ -70,28 +71,28 @@ def test_bn(): ...@@ -70,28 +71,28 @@ def test_bn():
f_ref = theano.function([x, b, g, m, v], [bn_ref_op]) f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
res_ref = f_ref(X, G, B, M, V) res_ref = f_ref(X, G, B, M, V)
for mode in ['low_mem', 'high_mem']: for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x, g, b, m, v, mode=mode) bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
f = theano.function([x, b, g, m, v], [bn_op]) f = theano.function([x, b, g, m, v], [bn_op])
res = f(X, G, B, M, V) res = f(X, G, B, M, V)
utt.assert_allclose(res_ref, res) utt.assert_allclose(res_ref, res)
def bn(inputs, gamma, beta, mean, std): def bn_f(inputs, gamma, beta, mean, std):
return batch_normalization(inputs, gamma, beta, mean, std, mode=mode) return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
utt.verify_grad(bn, [X, G, B, M, V]) utt.verify_grad(bn_f, [X, G, B, M, V])
bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True)) bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True))
f_ref = theano.function([x, b, g], [bn_ref_op]) f_ref = theano.function([x, b, g], [bn_ref_op])
res_ref = f_ref(X, G, B) res_ref = f_ref(X, G, B)
for mode in ['low_mem', 'high_mem']: for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode) bn_op = bn.batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode)
f = theano.function([x, b, g], [bn_op]) f = theano.function([x, b, g], [bn_op])
res = f(X, G, B) res = f(X, G, B)
utt.assert_allclose(res_ref, res) utt.assert_allclose(res_ref, res)
def bn(inputs, gamma, beta, mean, std): def bn_f(inputs, gamma, beta, mean, std):
return batch_normalization(inputs, gamma, beta, mean, std, mode=mode) return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
utt.verify_grad(batch_normalization, [X, G, B, utt.verify_grad(bn_f, [X, G, B,
X.mean(axis=0)[numpy.newaxis], X.std(axis=0)[numpy.newaxis]]) X.mean(axis=0)[numpy.newaxis], X.std(axis=0)[numpy.newaxis]])
def test_bn_feature_maps(): def test_bn_feature_maps():
...@@ -122,21 +123,296 @@ def test_bn_feature_maps(): ...@@ -122,21 +123,296 @@ def test_bn_feature_maps():
res_ref = f_ref(X, G, B, M, V) res_ref = f_ref(X, G, B, M, V)
for mode in ['low_mem', 'high_mem']: for mode in ['low_mem', 'high_mem']:
bn_op = batch_normalization(x, bn_op = bn.batch_normalization(x,
g.dimshuffle('x', 0, 'x', 'x'), g.dimshuffle('x', 0, 'x', 'x'),
b.dimshuffle('x', 0, 'x', 'x'), b.dimshuffle('x', 0, 'x', 'x'),
m.dimshuffle('x', 0, 'x', 'x'), m.dimshuffle('x', 0, 'x', 'x'),
v.dimshuffle('x', 0, 'x', 'x'), v.dimshuffle('x', 0, 'x', 'x'),
mode=mode) mode=mode)
f = theano.function([x, b, g, m, v], [bn_op]) f = theano.function([x, b, g, m, v], [bn_op])
res = f(X, G, B, M, V) res = f(X, G, B, M, V)
utt.assert_allclose(res_ref, res) utt.assert_allclose(res_ref, res)
def conv_bn(inputs, gamma, beta, mean, std): def conv_bn(inputs, gamma, beta, mean, std):
return batch_normalization(inputs, return bn.batch_normalization(inputs,
gamma.dimshuffle('x', 0, 'x', 'x'), gamma.dimshuffle('x', 0, 'x', 'x'),
beta.dimshuffle('x', 0, 'x', 'x'), beta.dimshuffle('x', 0, 'x', 'x'),
mean.dimshuffle('x', 0, 'x', 'x'), mean.dimshuffle('x', 0, 'x', 'x'),
std.dimshuffle('x', 0, 'x', 'x'), std.dimshuffle('x', 0, 'x', 'x'),
mode=mode) mode=mode)
utt.verify_grad(conv_bn, [X, G, B, M, V]) utt.verify_grad(conv_bn, [X, G, B, M, V])
def test_batch_normalization_train():
    """Compare batch_normalization_train and its gradients against a
    hand-built reference graph, for every supported axes specification
    and input rank (1-d through 5-d)."""
    utt.seed_rng()
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, running_mean, running_var = (vartype(n)
                                                         for n in ('x', 'scale', 'bias',
                                                                   'running_mean',
                                                                   'running_var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3
            # remove non-existing axes (ndim shrinks monotonically across the
            # vartype loop, so filtering the shared `axes` tuple is safe)
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue
            # forward pass through the Op under test
            out, x_mean, x_invstd, out_running_mean, out_running_var = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # reference forward pass: resolve the symbolic axes spec ...
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            # ... and normalize explicitly
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # m / (m - 1) gives the unbiased variance for the running average
            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_running_mean2 = running_mean * (1 - running_average_factor) + \
                x_mean2 * running_average_factor
            out_running_var2 = running_var * (1 - running_average_factor) + \
                (m / (m - 1)) * x_var2 * running_average_factor
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile one function producing both versions of everything
            f = theano.function([x, scale, bias, running_mean, running_var, dy],
                                [out, x_mean, x_invstd, out_running_mean, out_running_var,
                                 out2, x_mean2, x_invstd2, out_running_mean2, out_running_var2] +
                                grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run on several data shapes, including degenerate (size-1) dims
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)
                # compare outputs (indices 0-4 vs reference at 5-9)
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                # nan_to_num: running_var can contain NaN/inf when m == 1
                utt.assert_allclose(numpy.nan_to_num(outputs[4]),
                                    numpy.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients (indices 10-12 vs reference at 13-15)
                utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4)  # dx
                utt.assert_allclose(outputs[11], outputs[11 + 3], rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
def test_batch_normalization_train_without_running_averages():
    """batch_normalization_train must compile and run when the optional
    running_mean / running_var inputs are omitted."""
    utt.seed_rng()
    x = T.tensor4('x')
    scale = T.tensor4('scale')
    bias = T.tensor4('bias')
    dy = T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass: only three outputs when no running averages are given
    out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
    # compile
    f = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads)

    # every abstract Op must have been replaced by the optimizer
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    assert not any([isinstance(n.op, abstract_ops)
                    for n in f.maker.fgraph.toposort()])

    # run on random data; only checks that execution succeeds
    randn = numpy.random.randn
    X = 4 + 3 * randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * randn(*data_shape).astype(theano.config.floatX)
    Scale = randn(*param_shape).astype(theano.config.floatX)
    Bias = randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias, Dy)
def test_batch_normalization_train_broadcast():
    """batch_normalization_train/test must behave identically whether the
    parameters are passed already broadcasted (with explicit broadcastable
    dimensions) or non-broadcasted; the sum of all pairwise differences
    must optimize away to a constant zero."""
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x = vartype('x')
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3
            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue
            # convert axes to explicit list
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            # compute axes for parameter tensors: the non-normalized dims
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            # dimshuffle pattern that re-inserts broadcastable dims ('x')
            # at each normalized axis position
            params_dimshuffle = ['x'] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i
            # construct non-broadcasted parameter variables
            param_type = T.TensorType(x.dtype, (False,) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (param_type(n)
                                                      for n in ('scale', 'bias',
                                                                'running_mean',
                                                                'running_var'))
            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)
            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # batch_normalization_train with broadcasted variables
            train_bc = \
                bn.batch_normalization_train(
                    x, scale_bc, bias_bc, axes, eps,
                    running_average_factor, running_mean_bc, running_var_bc)
            # collapse the broadcasted outputs back to parameter shape so
            # both variants can be compared element-wise
            train_bc = tuple([train_bc[0]] +  # out
                             [r.dimshuffle(non_bc_axes) for r in train_bc[1:]])
            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = \
                bn.batch_normalization_test(
                    x, scale, bias, running_mean, running_var, axes, eps)
            # batch_normalization_test with broadcasted variables
            test_bc = \
                bn.batch_normalization_test(
                    x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes, eps)
            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc,)
            results_bc = train_bc + (test_bc,)
            results = [abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)]
            # compile to compute all differences
            f = theano.function([x, scale, bias, running_mean, running_var],
                                T.sum(sum(results)))
            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if theano.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, theano.compile.DeepCopyOp)
            inputs = [numpy.asarray(numpy.random.rand(*((4,) * n)), x.dtype)
                      for n in [x.ndim, scale.ndim, bias.ndim,
                                running_mean.ndim, running_var.ndim]]
            assert 0.0 == f(*inputs)
def test_batch_normalization_test():
    """Compare batch_normalization_test and its gradients (w.r.t. all five
    tensor inputs) against a hand-built reference inference graph, for
    every supported axes specification and input rank."""
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, mean, var = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue
            # forward pass through the Op under test
            out = bn.batch_normalization_test(x, scale, bias, mean,
                                              var, axes, eps)
            # reference forward pass: resolve the symbolic axes spec ...
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            # ... and normalize explicitly with the estimated statistics
            scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes2)
                                          for t in (scale, bias, mean, var))
            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy})
            # compile one function producing both versions of everything
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, out2] + grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run on several data shapes, including degenerate (size-1) dims
            for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                # variance must be non-negative, hence rand() not randn()
                Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Mean, Var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients (indices 2-6 vs reference at 7-11)
                utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5)  # dx
                utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5)  # dscale
                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
                utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5)  # dvar
def test_batch_normalization_broadcastable():
    # check if the broadcastable pattern is preserved by the optimizations
    # (5-d variables in which every dimension is broadcastable)
    x, dy, scale, bias, mean, var = [T.scalar(name).dimshuffle(['x'] * 5)
                                     for name in ('x', 'dy', 'scale', 'bias',
                                                  'mean', 'var')]

    # forward pass in training and inference mode
    out_train, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'spatial')
    out_test = bn.batch_normalization_test(x, scale, bias, mean, var, 'spatial')
    # backward pass for both modes
    grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
    grads_test = T.grad(None, wrt=[x, scale, bias], known_grads={out_test: dy})
    # compiling must succeed and replace every abstract Op
    f = theano.function([x, scale, bias, mean, var, dy],
                        [out_train, x_mean, x_invstd, out_test] +
                        grads_train + grads_test)
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    assert not any([isinstance(n.op, abstract_ops)
                    for n in f.maker.fgraph.toposort()])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论