提交 43411345 authored 作者: Gijs van Tulder's avatar Gijs van Tulder

Batch normalization optimizations for old gpu backend.

上级 ef21cb58
......@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
assert_conv_shape)
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet import bn
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp, dnn_available
......@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase):
epsilon
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
"""
__props__ = ('mode', 'epsilon')
......@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s;
result = """
cudnnStatus_t err%(name)s;
cudnnBatchNormMode_t mode%(name)s = %(mode)s;
double exponentialAverageFactor%(name)s = %(exp_avg_factor)f;
double epsilon%(name)s = %(epsilon)e;
""" % dict(name=name,
mode=mode,
exp_avg_factor=0, # deliberately unused
epsilon=self.epsilon)
return result
def c_code_cache_version(self):
    """Return the cache key for the C code generated by this Op.

    The leading integer is bumped whenever the emitted C code changes;
    the result of ``version()`` (defined outside this view) is appended.
    """
    # The previous ``return (3, version())`` was left in front of this
    # line, which made the bumped version unreachable.
    return (4, version())
class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
......@@ -2422,8 +2438,21 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
Note: scale, bias, mean and variance must follow the same tensor layout!
"""
__props__ = ('mode', 'epsilon', 'inplace')
tensor_descs = ['bn_input', 'bn_output', 'bn_params']
def __init__(self, mode='per-activation', epsilon=1e-4, inplace=False):
    """Configure the inference Op.

    Parameters
    ----------
    mode, epsilon
        Forwarded unchanged to the base-class constructor.
    inplace : bool
        If true, output 0 is declared to destroy input 0 (the generated
        C code then reuses the input buffer as the output).
    """
    super(GpuDnnBatchNormInference, self).__init__(mode=mode, epsilon=epsilon)
    self.inplace = inplace
    if self.inplace:
        # output 0 is written into the storage of input 0
        self.destroy_map = {0: [0]}
def get_op_params(self):
    """Return the ``(macro, value)`` pairs defined around the C code.

    ``INPLACE_OUTPUT`` is emitted only when the Op works in place; the
    C template tests it with ``#ifdef``.
    """
    if self.inplace:
        return [('INPLACE_OUTPUT', '1')]
    return []
def infer_shape(self, node, shape):
    """Return the output shapes given the list of input shapes.

    The Op has a single output whose shape equals that of the first
    input (``x``).
    """
    # output shape equals shape of x
    return [shape[0]]
......@@ -2460,10 +2489,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0)
}
// build and prepare the output variable
#ifdef INPLACE_OUTPUT
Py_XDECREF(%(outp)s);
%(outp)s = %(inp)s;
Py_INCREF(%(outp)s);
#else
if (CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp)s)) != 0)
{
%(fail)s
}
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0)
......@@ -2494,6 +2529,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference(
""" % dict(name=name, inp=inp, scale=scale, bias=bias, est_mean=est_mean,
est_var=est_var, outp=outp, fail=sub['fail'])
# add params
define_macros, undef_macros = self.get_c_macros(node, name, check_input=False)
result = """
%(define_macros)s
{
%(code)s
}
%(undef_macros)s
""" % dict(code=result, define_macros=define_macros, undef_macros=undef_macros)
return result
def grad(self, inputs, grads):
......@@ -2537,28 +2582,84 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase):
Note: scale and bias must follow the same tensor layout!
"""
# ``running_average_factor`` is formatted directly into the generated C
# code, so it has to be part of the Op's identity: two Ops that differ
# only in that factor must not compare equal.
__props__ = ('mode', 'epsilon', 'running_average_factor',
             'running_averages',
             'inplace_running_mean', 'inplace_running_var',
             'inplace_output')
tensor_descs = ['bn_input', 'bn_output', 'bn_params']
def __init__(self, mode='per-activation', epsilon=1e-4,
             running_average_factor=0,
             running_averages=False, inplace_running_mean=False,
             inplace_running_var=False, inplace_output=False):
    """Configure the training Op.

    Parameters
    ----------
    mode, epsilon
        Forwarded unchanged to the base-class constructor.
    running_average_factor : float
        Factor passed to cuDNN when running averages are enabled.
    running_averages : bool
        Whether the Op takes ``running_mean``/``running_var`` as inputs 3
        and 4 and returns their updated values as outputs 3 and 4.
    inplace_running_mean, inplace_running_var, inplace_output : bool
        Whether the respective output may overwrite the matching input.
    """
    super(GpuDnnBatchNorm, self).__init__(mode=mode, epsilon=epsilon)
    self.running_average_factor = running_average_factor
    self.running_averages = running_averages
    self.inplace_output = inplace_output
    self.inplace_running_mean = inplace_running_mean
    self.inplace_running_var = inplace_running_var

    # Map each in-place output index to the input index it destroys.
    self.destroy_map = {}
    if self.inplace_output:
        self.destroy_map[0] = [0]
    # The running-average slots only exist when running_averages is set.
    if self.running_averages and self.inplace_running_mean:
        self.destroy_map[3] = [3]
    if self.running_averages and self.inplace_running_var:
        self.destroy_map[4] = [4]
def get_op_params(self):
    """Return the ``(macro, value)`` pairs defined around the C code.

    The in-place running-average macros are only emitted together with
    ``RUNNING_AVERAGES``; this mirrors the ``destroy_map`` built in
    ``__init__`` and the ``#ifdef RUNNING_AVERAGES`` nesting in the C
    template.
    NOTE(review): the nesting was reconstructed from flattened source --
    confirm against the original file.
    """
    params = []
    if self.inplace_output:
        params.append(('INPLACE_OUTPUT', '1'))
    if self.running_averages:
        params.append(('RUNNING_AVERAGES', '1'))
        if self.inplace_running_mean:
            params.append(('INPLACE_RUNNING_MEAN', '1'))
        if self.inplace_running_var:
            params.append(('INPLACE_RUNNING_VAR', '1'))
    return params
def infer_shape(self, node, shape):
    """Return one shape per output of ``node``.

    The first output has the shape of ``x`` (input 0); every other
    output (mean, invstd and, when present, the new running mean and
    variance) has the shape of ``scale`` (input 1).
    """
    # A stale ``return [shape[0], shape[1], shape[1]]`` used to sit here
    # and always reported three outputs, even for the five-output case.
    return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
def make_node(self, x, scale, bias,
              running_mean=None, running_var=None):
    """Build the Apply node for this Op.

    Parameters
    ----------
    x, scale, bias
        Variables of equal dimensionality (4 or 5).
    running_mean, running_var
        Either both given (exactly when ``self.running_averages`` is
        true) or both None; when given they must have the same
        dimensionality as ``x``.

    Returns
    -------
    Apply
        Node with outputs typed like ``x``, ``scale``, ``scale`` and, if
        running averages are used, two more outputs typed like ``scale``.

    Raises
    ------
    AssertionError
        If any of the constraints above is violated.
    """
    assert x.ndim == scale.ndim == bias.ndim
    assert x.ndim in (4, 5)
    assert self.running_averages == (running_mean is not None) == (running_var is not None)
    assert (running_mean is None or running_mean.ndim == x.ndim)
    assert (running_var is None or running_var.ndim == x.ndim)

    x = as_cuda_ndarray_variable(x)
    scale = as_cuda_ndarray_variable(scale)
    bias = as_cuda_ndarray_variable(bias)

    inputs = [x, scale, bias]
    output_types = [x.type(), scale.type(), scale.type()]
    if running_mean is not None and running_var is not None:
        inputs.append(as_cuda_ndarray_variable(running_mean))
        inputs.append(as_cuda_ndarray_variable(running_var))
        # new running mean and new running variance
        output_types.append(scale.type())
        output_types.append(scale.type())
    return Apply(self, inputs, output_types)
def c_code(self, node, name, inputs, outputs, sub):
# super call to prepare common configuration
result = super(GpuDnnBatchNorm, self).c_code(node, name, inputs, outputs, sub)
# give sensible names to inputs and outputs
inp, scale, bias = inputs
outp, x_mean, x_invstd = outputs
inp, scale, bias = inputs[:3]
outp, x_mean, x_invstd = outputs[:3]
if self.running_averages:
running_average_factor = self.running_average_factor
in_running_mean = inputs[3]
in_running_var = inputs[4]
out_running_mean = outputs[3]
out_running_var = outputs[4]
else:
running_average_factor = 0.
in_running_mean = 'NULL'
in_running_var = 'NULL'
out_running_mean = 'NULL'
out_running_var = 'NULL'
# set input tensor descriptors from input tensors
result += """
......@@ -2579,6 +2680,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp
{
%(fail)s
}
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(%(out_running_mean)s);
CudaNdarray *running_mean%(name)s = %(in_running_mean)s;
Py_INCREF(running_mean%(name)s);
#else
if ((CudaNdarray_prep_output(&%(out_running_mean)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(%(out_running_mean)s, %(in_running_mean)s) != 0))
{
%(fail)s
}
CudaNdarray *running_mean%(name)s = %(out_running_mean)s;
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(%(out_running_var)s);
CudaNdarray *running_var%(name)s = %(in_running_var)s;
Py_INCREF(running_var%(name)s);
#else
if ((CudaNdarray_prep_output(&%(out_running_var)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(%(out_running_var)s, %(in_running_var)s) != 0))
{
%(fail)s
}
CudaNdarray *running_var%(name)s = %(out_running_var)s;
#endif
#endif
// set output tensor descriptor from output tensor
if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0)
......@@ -2601,25 +2728,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining(
bn_params_%(name)s,
CudaNdarray_DEV_DATA(%(scale)s),
CudaNdarray_DEV_DATA(%(bias)s),
exponentialAverageFactor%(name)s,
NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused
#ifdef RUNNING_AVERAGES
%(running_average_factor)f,
CudaNdarray_DEV_DATA(running_mean%(name)s),
CudaNdarray_DEV_DATA(running_var%(name)s),
#else
0,
NULL,
NULL,
#endif
epsilon%(name)s,
CudaNdarray_DEV_DATA(%(x_mean)s),
CudaNdarray_DEV_DATA(%(x_invstd)s)
);
}
#ifdef RUNNING_AVERAGES
%(out_running_mean)s = running_mean%(name)s;
%(out_running_var)s = running_var%(name)s;
#endif
""" % dict(name=name, inp=inp, scale=scale, bias=bias, outp=outp,
x_mean=x_mean, x_invstd=x_invstd, fail=sub['fail'])
x_mean=x_mean, x_invstd=x_invstd,
running_average_factor=running_average_factor,
in_running_mean=in_running_mean, in_running_var=in_running_var,
out_running_mean=out_running_mean, out_running_var=out_running_var,
fail=sub['fail'])
# add params
define_macros, undef_macros = self.get_c_macros(node, name, check_input=False)
result = """
%(define_macros)s
{
%(code)s
}
%(undef_macros)s
""" % dict(code=result, define_macros=define_macros, undef_macros=undef_macros)
return result
def grad(self, inputs, grads):
    """Return the gradients with respect to every input.

    Only the gradient flowing into the first output (``grads[0]``) is
    used. The gradients for ``x``, ``scale`` and ``bias`` come from
    ``GpuDnnBatchNormGrad``; the optional ``running_mean`` and
    ``running_var`` inputs receive a ``DisconnectedType`` gradient.
    """
    x, scale, bias = inputs[:3]
    dy = grads[0]
    # Re-apply the Op to obtain the batch statistics needed by the
    # gradient Op.
    _, x_mean, x_invstd = self(*inputs)[:3]
    # Optional running_mean and running_var.
    disconnected_outputs = [DisconnectedType()() for _ in inputs[3:]]
    return GpuDnnBatchNormGrad(self.mode, self.epsilon)(
        x, dy, scale, x_mean, x_invstd) + disconnected_outputs
def connection_pattern(self, node):
    """Return, per input, which outputs depend on it.

    The result has one row per input and one boolean per output.
    ``x``, ``scale`` and ``bias`` are connected to the first three
    outputs. Each optional running-average input adds one output that
    is connected to ``x`` and to that input only.
    """
    patterns = [[True, True, True],  # x
                [True, True, True],  # scale
                [True, True, True]]  # bias
    # Optional running_mean and running_var are only
    # connected to their new values.
    for i in range(3, len(node.inputs)):
        # new column (output i): connected to x ...
        patterns[0].append(True)
        # ... but not to any other input seen so far
        for pattern in patterns[1:]:
            pattern.append(False)
        # new row (input i): connected to output i only
        patterns.append([False] * i + [True])
    return patterns
class GpuDnnBatchNormGrad(GpuDnnBatchNormBase):
......@@ -2722,7 +2890,8 @@ err%(name)s = cudnnBatchNormalizationBackward(
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4):
epsilon=1e-4, running_average_factor=0.1,
running_mean=None, running_var=None):
"""
Performs batch normalization of the given inputs, using the mean and
variance of the inputs.
......@@ -2742,6 +2911,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns
-------
......@@ -2751,6 +2937,12 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Means of `inputs` across the normalization axes.
invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes
-----
......@@ -2762,31 +2954,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
invstd = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon))
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4).
"""
ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
(gamma.ndim, beta.ndim, ndim))
if (running_mean is None) != (running_var is None):
raise ValueError("running_mean and running_var must either both be "
"given or both be None")
if running_mean is not None and running_mean.ndim != ndim:
raise ValueError("running_mean must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_mean.ndim, ndim))
if running_var is not None and running_var.ndim != ndim:
raise ValueError("running_var must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_var.ndim, ndim))
if epsilon < 1e-5:
raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
running_averages = (running_var is not None and running_var is not None)
if ndim < 4:
inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
beta = theano.tensor.shape_padright(beta, 4 - ndim)
batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon)
result = tuple(batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta)))
if running_averages:
running_mean = theano.tensor.shape_padright(running_mean, 4 - ndim)
running_var = theano.tensor.shape_padright(running_var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
params_shape = gamma.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
if running_averages:
running_mean = theano.tensor.flatten(running_mean, 5)
running_var = theano.tensor.flatten(running_var, 5)
batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon,
running_average_factor=running_average_factor,
running_averages=running_averages)
if running_averages:
out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta),
running_mean=gpu_contiguous(running_mean),
running_var=gpu_contiguous(running_var))
if new_running_mean.broadcastable != running_mean.broadcastable:
new_running_mean = tensor.patternbroadcast(new_running_mean, running_mean.broadcastable)
if new_running_var.broadcastable != running_var.broadcastable:
new_running_var = tensor.patternbroadcast(new_running_var, running_var.broadcastable)
result = (out, mean, invstd, new_running_mean, new_running_var)
else:
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta))
if ndim < 4:
result = tuple(theano.tensor.flatten(r, ndim) for r in result)
elif ndim > 5:
result = (theano.tensor.reshape(result[0], inputs_shape),) + tuple(
theano.tensor.reshape(r, params_shape) for r in result[1:])
return result
......@@ -2839,9 +3078,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" %
......@@ -2859,12 +3095,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta = theano.tensor.shape_padright(beta, 4 - ndim)
mean = theano.tensor.shape_padright(mean, 4 - ndim)
var = theano.tensor.shape_padright(var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
mean = theano.tensor.flatten(mean, 5)
var = theano.tensor.flatten(var, 5)
batchnorm_op = GpuDnnBatchNormInference(mode=mode, epsilon=epsilon)
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), gpu_contiguous(mean),
gpu_contiguous(var))
if ndim < 4:
result = theano.tensor.flatten(result, ndim)
elif ndim > 5:
result = theano.tensor.reshape(result, inputs_shape)
return result
......@@ -3334,3 +3579,235 @@ def local_abstractconv3d_cudnn(node):
subsample=node.op.subsample,
conv_mode=conv_mode)
return [rval]
@local_optimizer([bn.AbstractBatchNormTrain])
def local_abstract_batch_norm_train_cudnn(node):
    """Replace an ``AbstractBatchNormTrain`` node by the cuDNN version.

    Returns None (no replacement) unless all of the following hold: the
    input is on the GPU, the normalization axes correspond to one of the
    cuDNN modes, ``epsilon`` and ``running_average_factor`` are scalar
    constants, ``epsilon`` is at least 1e-5 and cuDNN is available.
    Otherwise returns the outputs of ``dnn_batch_normalization_train``,
    converted with ``tensor.as_tensor_variable`` wherever the original
    output was a ``TensorType``.
    """
    if not isinstance(node.op, bn.AbstractBatchNormTrain):
        return None

    x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
    running_mean = node.inputs[5] if len(node.inputs) > 5 else None
    running_var = node.inputs[6] if len(node.inputs) > 6 else None

    # input on gpu?  TODO what about the output?
    x_on_gpu = (isinstance(x.type, CudaNdarrayType) or
                (x.owner and isinstance(x.owner.op, HostFromGpu)))
    if not x_on_gpu:
        return None

    # convert axes to cuDNN mode
    axes = tuple(node.op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None

    # Both values are baked into the Op, so they must be constants.
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None
    try:
        running_average_factor = float(
            theano.tensor.get_scalar_constant_value(running_average_factor))
    except theano.tensor.NotScalarConstantError:
        return None

    if not dnn_available():
        return None

    x = as_cuda_ndarray_variable(x)
    scale = as_cuda_ndarray_variable(scale)
    bias = as_cuda_ndarray_variable(bias)

    inputs = [x, scale, bias, mode, eps, running_average_factor]
    if running_mean is not None and running_var is not None:
        inputs.append(running_mean)
        inputs.append(running_var)

    results = list(dnn_batch_normalization_train(*inputs))

    # If the original output was on CPU, we have to transfer it
    for i, original_output in enumerate(node.outputs):
        if isinstance(original_output.type, tensor.TensorType):
            results[i] = tensor.as_tensor_variable(results[i])
    # TODO copy_stack_trace?
    return results
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_output(node):
    """Switch a ``GpuDnnBatchNorm`` node to ``inplace_output=True``.

    Returns the outputs of an otherwise identically configured Op applied
    to the same inputs, or None when the node is not a ``GpuDnnBatchNorm``
    or already writes its output in place.
    """
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm) or op.inplace_output:
        return None
    return GpuDnnBatchNorm(mode=op.mode,
                           epsilon=op.epsilon,
                           running_average_factor=op.running_average_factor,
                           running_averages=op.running_averages,
                           inplace_running_mean=op.inplace_running_mean,
                           inplace_running_var=op.inplace_running_var,
                           inplace_output=True)(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_mean(node):
    """Switch a ``GpuDnnBatchNorm`` node to ``inplace_running_mean=True``.

    Only applies to nodes that use running averages and do not already
    update the running mean in place; returns None otherwise.
    """
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return None
    if not op.running_averages or op.inplace_running_mean:
        return None
    return GpuDnnBatchNorm(mode=op.mode,
                           epsilon=op.epsilon,
                           running_average_factor=op.running_average_factor,
                           running_averages=op.running_averages,
                           inplace_running_mean=True,
                           inplace_running_var=op.inplace_running_var,
                           inplace_output=op.inplace_output)(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_var(node):
    """Switch a ``GpuDnnBatchNorm`` node to ``inplace_running_var=True``.

    Only applies to nodes that use running averages and do not already
    update the running variance in place; returns None otherwise.
    """
    op = node.op
    if not isinstance(op, GpuDnnBatchNorm):
        return None
    if not op.running_averages or op.inplace_running_var:
        return None
    return GpuDnnBatchNorm(mode=op.mode,
                           epsilon=op.epsilon,
                           running_average_factor=op.running_average_factor,
                           running_averages=op.running_averages,
                           inplace_running_mean=op.inplace_running_mean,
                           inplace_running_var=True,
                           inplace_output=op.inplace_output)(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_gpu_batch_norm_inference_inplace(node):
    """Switch a ``GpuDnnBatchNormInference`` node to its in-place variant.

    Returns a one-element list holding the output of the in-place Op
    applied to the same inputs, or None when the node is of another type
    or is already in place.
    """
    op = node.op
    if not isinstance(op, GpuDnnBatchNormInference) or op.inplace:
        return None
    inplace_op = GpuDnnBatchNormInference(mode=op.mode,
                                          epsilon=op.epsilon,
                                          inplace=True)
    return [inplace_op(*node.inputs)]
@local_optimizer([bn.AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad_cudnn(node):
    """Replace an ``AbstractBatchNormTrainGrad`` node by ``GpuDnnBatchNormGrad``.

    Returns None (no replacement) unless ``x`` or ``dy`` is on the GPU,
    the normalization axes correspond to one of the cuDNN modes,
    ``epsilon`` is a scalar constant of at least 1e-5 and cuDNN is
    available. Inputs with fewer than 4 dimensions are padded on the
    right, inputs with more than 5 are flattened to 5, and the three
    gradients are brought back to the original dimensionality afterwards.
    """
    if not isinstance(node.op, bn.AbstractBatchNormTrainGrad):
        return None

    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs

    # input on gpu?  TODO what about the output?
    x_on_gpu = (isinstance(x.type, CudaNdarrayType) or
                (x.owner and isinstance(x.owner.op, HostFromGpu)))
    dy_on_gpu = (isinstance(dy.type, CudaNdarrayType) or
                 (dy.owner and isinstance(dy.owner.op, HostFromGpu)))
    if not (x_on_gpu or dy_on_gpu):
        return None

    # convert axes to cuDNN mode
    axes = tuple(node.op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None

    # Run the remaining bail-out checks before building any reshape
    # nodes, so that a rejected node leaves no unused variables behind.
    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None

    if not dnn_available():
        return None

    ndim = x.ndim
    if ndim < 4:
        x = theano.tensor.shape_padright(x, 4 - ndim)
        dy = theano.tensor.shape_padright(dy, 4 - ndim)
        scale = theano.tensor.shape_padright(scale, 4 - ndim)
        x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim)
        x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim)
    elif ndim > 5:
        # remember the symbolic shapes so the results can be restored
        x_shape = x.shape
        params_shape = scale.shape
        x = theano.tensor.flatten(x, 5)
        dy = theano.tensor.flatten(dy, 5)
        scale = theano.tensor.flatten(scale, 5)
        x_mean = theano.tensor.flatten(x_mean, 5)
        x_invstd = theano.tensor.flatten(x_invstd, 5)

    x = as_cuda_ndarray_variable(x)
    dy = as_cuda_ndarray_variable(dy)
    scale = as_cuda_ndarray_variable(scale)
    x_mean = as_cuda_ndarray_variable(x_mean)
    x_invstd = as_cuda_ndarray_variable(x_invstd)

    g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
        GpuDnnBatchNormGrad(mode, epsilon=eps)(x, dy, scale, x_mean, x_invstd)

    if ndim < 4:
        g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim)
        g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim)
        g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim)
    elif ndim > 5:
        g_wrt_inputs = theano.tensor.reshape(g_wrt_inputs, x_shape)
        g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape)
        g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape)

    # If the original output was on CPU, we have to transfer it
    if isinstance(node.outputs[0].type, tensor.TensorType):
        g_wrt_inputs = tensor.as_tensor_variable(g_wrt_inputs)
    if isinstance(node.outputs[1].type, tensor.TensorType):
        g_wrt_scale = tensor.as_tensor_variable(g_wrt_scale)
    if isinstance(node.outputs[2].type, tensor.TensorType):
        g_wrt_bias = tensor.as_tensor_variable(g_wrt_bias)
    # TODO copy_stack_trace?
    return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@local_optimizer([bn.AbstractBatchNormInference])
def local_abstract_batch_norm_inference_cudnn(node):
    """Replace an ``AbstractBatchNormInference`` node by the cuDNN version.

    Returns None (no replacement) unless the normalization axes
    correspond to one of the cuDNN modes, the input is on the GPU,
    ``epsilon`` is a scalar constant of at least 1e-5 and cuDNN is
    available. Otherwise returns a one-element list with the output of
    ``dnn_batch_normalization_test``, converted with
    ``tensor.as_tensor_variable`` if the original output was a
    ``TensorType``.
    """
    if not isinstance(node.op, bn.AbstractBatchNormInference):
        return None

    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs

    # convert axes to cuDNN mode
    axes = tuple(node.op.axes)
    if axes == (0,):
        mode = 'per-activation'
    elif axes == (0,) + tuple(range(2, x.ndim)):
        mode = 'spatial'
    else:
        return None

    # input on gpu?  TODO what about the output?
    x_on_gpu = (isinstance(x.type, CudaNdarrayType) or
                (x.owner and isinstance(x.owner.op, HostFromGpu)))
    if not x_on_gpu:
        return None

    try:
        eps = float(theano.tensor.get_scalar_constant_value(epsilon))
    except theano.tensor.NotScalarConstantError:
        return None
    if eps < 1e-5:
        return None

    if not dnn_available():
        return None

    x = as_cuda_ndarray_variable(x)
    scale = as_cuda_ndarray_variable(scale)
    bias = as_cuda_ndarray_variable(bias)
    estimated_mean = as_cuda_ndarray_variable(estimated_mean)
    estimated_variance = as_cuda_ndarray_variable(estimated_variance)

    out = dnn_batch_normalization_test(x, scale, bias, estimated_mean, estimated_variance,
                                       mode, eps)

    # If the original output was on CPU, we have to transfer it
    # TODO copy_stack_trace?
    if isinstance(node.outputs[0].type, tensor.TensorType):
        return [tensor.as_tensor_variable(out)]
    return [out]
......@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm',
'gpu', 'fast_compile', 'fast_run')
# Register cuDNN batch normalization implementation
# A dedicated LocalGroupDB collects the three batch-norm optimizers and
# is itself registered through register_opt with the 'fast_compile' tag.
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpu_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)
# The cuDNN optimizers below are registered unconditionally; each of
# them returns None when dnn_available() is false, so they only take
# effect if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from .dnn import (local_abstract_batch_norm_train_cudnn,
local_abstract_batch_norm_train_grad_cudnn,
local_abstract_batch_norm_inference_cudnn) # noqa: 402
# All three registrations share the same numeric argument (20) and tags.
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_dnn',
local_abstract_batch_norm_train_cudnn, 20,
'batchnorm_dnn',
'gpu', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_grad_dnn',
local_abstract_batch_norm_train_grad_cudnn, 20,
'batchnorm_dnn',
'gpu', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_inference_dnn',
local_abstract_batch_norm_inference_cudnn, 20,
'batchnorm_dnn',
'gpu', 'fast_compile', 'fast_run', 'cudnn')
from __future__ import absolute_import, print_function, division
from collections import OrderedDict
import logging
import os
import sys
......@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.nnet import bn
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
......@@ -730,52 +732,201 @@ def test_batchnorm_train():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
for vartype in (tensor6, T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out, x_mean, x_invstd = cuda.dnn.dnn_batch_normalization_train(
x, scale, bias, mode, eps)
running_average_factor = 0.3
# forward pass, direct interface
out_gpu, x_mean_gpu, x_invstd_gpu, \
out_running_mean_gpu, out_running_var_gpu = \
dnn.dnn_batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# forward pass, abstract interface
out_abstract, x_mean_abstract, x_invstd_abstract, \
out_running_mean_abstract, out_running_var_abstract = \
bn.batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
x_mean2 = x.mean(axis=axes, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
scale2 = T.addbroadcast(scale, *axes)
bias2 = T.addbroadcast(bias, *axes)
out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
x_mean_ref = x.mean(axis=axes, keepdims=True)
x_var_ref = x.var(axis=axes, keepdims=True)
x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
scale_ref = T.addbroadcast(scale, *axes)
bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
out_running_var_ref = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var_ref * running_average_factor
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
grads_ref = T.grad(None, wrt=[x, scale, bias], known_grads={out_ref: dy})
# compile
f = theano.function([x, scale, bias, dy],
[out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
grads + grads2, mode=mode_with_gpu)
f_gpu = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_gpu, x_mean_gpu, x_invstd_gpu,
out_running_mean_gpu, out_running_var_gpu] + grads_gpu,
mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract,
out_running_mean_abstract, out_running_var_abstract] +
grads_abstract,
mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_ref, x_mean_ref, x_invstd_ref,
out_running_mean_ref, out_running_var_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
in f_abstract.maker.fgraph.toposort()])
assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
outputs = f(X, Scale, Bias, Dy)
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[0 + 3]) # out
utt.assert_allclose(outputs[1], outputs[1 + 3]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 3]) # invstd
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_gpu[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_gpu[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
# compare gradients
utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4) # dx
utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs[8], outputs[8 + 3]) # dbias
utt.assert_allclose(outputs_gpu[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_gpu[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_gpu[7], outputs_ref[7]) # dbias
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias
def test_dnn_batchnorm_train_without_running_averages():
    """Compile and run batch-normalization training without running averages.

    The same 'per-activation' graph is built twice: once through the direct
    cuDNN interface (``dnn.dnn_batch_normalization_train``) and once through
    the abstract interface (``bn.batch_normalization_train``).  Both are
    compiled with ``mode_with_gpu``; for the abstract version the test checks
    that the compiled graph contains ``GpuDnnBatchNorm`` and
    ``GpuDnnBatchNormGrad`` nodes and no abstract batch-norm Ops.  Both
    functions are then executed once; their outputs are not compared.

    Raises
    ------
    SkipTest
        If cuDNN is unavailable or older than v5.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass
    out_gpu, x_mean_gpu, x_invstd_gpu = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
    # The abstract gradients must be taken through ``out_abstract``; using
    # ``out_gpu`` here would differentiate the direct cuDNN graph instead and
    # leave the abstract gradient Op (and its replacement) untested.
    grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
    # compile
    f_gpu = theano.function([x, scale, bias, dy],
                            [out_gpu, x_mean_gpu, x_invstd_gpu] +
                            grads_gpu,
                            mode=mode_with_gpu)
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract, x_invstd_abstract] +
                                 grads_abstract,
                                 mode=mode_with_gpu)
    # check if the abstract Ops have been replaced
    abstract_nodes = f_abstract.maker.fgraph.toposort()
    assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                for n in abstract_nodes])
    assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                for n in abstract_nodes])
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in abstract_nodes])
    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_gpu(X, Scale, Bias, Dy)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
    """Check the in-place flags of GpuDnnBatchNorm with running averages.

    The running mean and variance are shared variables that are updated with
    the values returned by ``dnn.dnn_batch_normalization_train``.  After
    compiling with ``mode_with_gpu`` the graph must contain exactly one
    ``GpuDnnBatchNorm`` node whose ``inplace_running_mean``,
    ``inplace_running_var`` and ``inplace_output`` attributes are all true.
    The compiled function is then executed once on random data.

    Raises
    ------
    SkipTest
        If cuDNN is unavailable or older than v5.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    floatX = theano.config.floatX
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    param_broadcast = (True, False, False, False)

    def random_params():
        # one standard-normal draw of parameter shape, cast to floatX
        return numpy.random.randn(*param_shape).astype(floatX)

    x = T.tensor4('x')
    scale = T.tensor4('scale')
    bias = T.tensor4('bias')
    running_mean = shared(random_params(), broadcastable=param_broadcast)
    running_var = shared(random_params(), broadcastable=param_broadcast)

    # forward pass
    results = dnn.dnn_batch_normalization_train(
        x, scale, bias, 'per-activation',
        epsilon=5e-3, running_average_factor=0.3,
        running_mean=running_mean, running_var=running_var)
    out, x_mean, x_invstd, new_running_mean, new_running_var = results

    # the running averages are written back through shared-variable updates
    updates = OrderedDict([(running_mean, new_running_mean),
                           (running_var, new_running_var)])

    # compile
    f = theano.function([x, scale, bias],
                        [out, x_mean, x_invstd],
                        updates=updates,
                        mode=mode_with_gpu)

    # check for the inplace settings
    bn_nodes = [node for node in f.maker.fgraph.toposort()
                if isinstance(node.op, dnn.GpuDnnBatchNorm)]
    assert len(bn_nodes) == 1
    bn_op = bn_nodes[0].op
    assert bn_op.inplace_running_mean
    assert bn_op.inplace_running_var
    assert bn_op.inplace_output

    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(floatX)
    Scale = random_params()
    Bias = random_params()
    f(X, Scale, Bias)
def test_batchnorm_inference():
......@@ -785,53 +936,160 @@ def test_batchnorm_inference():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias, mean, var = (vartype(n) for n in ('x', 'scale',
'bias', 'mean',
'var'))
for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias, mean, var = (vartype(n)
for n in ('x', 'scale', 'bias', 'mean', 'var'))
ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used
# forward pass
out = cuda.dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# forward pass, direct interface
out_gpu = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# forward pass, abstract interface
out_abstract = bn.batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# reference forward pass
if mode == 'per-activation':
axes = (0,)
elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim))
scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var))
out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
scale_ref, bias_ref, mean_ref, var_ref = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var))
out_ref = (x - mean_ref) * (scale_ref / T.sqrt(var_ref + eps)) + bias_ref
# backward pass
dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy})
grads_gpu = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_abstract: dy})
# reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy})
grads_ref = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_ref: dy})
# compile
f = theano.function([x, scale, bias, mean, var, dy],
[out, out2] + grads + grads2, mode=mode_with_gpu)
f_gpu = theano.function([x, scale, bias, mean, var, dy],
[out_gpu] + grads_gpu, mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, mean, var, dy],
[out_abstract] + grads_abstract, mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, mean, var, dy],
[out_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
Mean = numpy.random.randn(*param_shape).astype('float32')
Var = numpy.random.rand(*param_shape).astype('float32')
outputs = f(X, Scale, Bias, Mean, Var, Dy)
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Mean, Var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Mean, Var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Mean, Var, Dy)
# compare outputs
utt.assert_allclose(outputs[0], outputs[1]) # out
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
# compare gradients
utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5) # dx
utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5) # dscale
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_gpu[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_gpu[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_gpu[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_gpu[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_abstract[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_abstract[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_abstract[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
def test_batchnorm_inference_inplace():
    """Check that GpuDnnBatchNormInference is compiled with ``inplace`` set.

    Builds ``dnn.dnn_batch_normalization_test`` on 4D inputs with default
    mode and epsilon, compiles it with ``mode_with_gpu``, asserts that the
    graph holds exactly one ``GpuDnnBatchNormInference`` node whose
    ``inplace`` attribute is true, and runs the function once on random data.

    Raises
    ------
    SkipTest
        If cuDNN is unavailable or older than v5.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    floatX = theano.config.floatX
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    symbols = [T.tensor4(name)
               for name in ('x', 'scale', 'bias', 'mean', 'var')]
    x, scale, bias, mean, var = symbols

    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function(symbols, [out], mode=mode_with_gpu)

    # check for the inplace settings
    inference_nodes = [node for node in f.maker.fgraph.toposort()
                       if isinstance(node.op, dnn.GpuDnnBatchNormInference)]
    assert len(inference_nodes) == 1
    assert inference_nodes[0].op.inplace

    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(floatX)
    Scale = numpy.random.randn(*param_shape).astype(floatX)
    Bias = numpy.random.randn(*param_shape).astype(floatX)
    Mean = numpy.random.randn(*param_shape).astype(floatX)
    # variance is drawn with rand (uniform on [0, 1)), not randn
    Var = numpy.random.rand(*param_shape).astype(floatX)
    f(X, Scale, Bias, Mean, Var)
def test_dnn_batchnorm_valid_and_invalid_axes():
    """Check which axes lists make the abstract batch-norm Ops become cuDNN Ops.

    For 5D, 4D, 3D and 2D inputs, the abstract training and inference
    batch normalization (plus their gradients) are compiled with
    ``mode_with_gpu`` for each axes list:

    * valid: ``(0,)`` and ``(0,) + tuple(range(2, ndim))``; the compiled
      graph must contain ``GpuDnnBatchNorm``, ``GpuDnnBatchNormGrad`` and
      ``GpuDnnBatchNormInference`` and no abstract batch-norm Ops;
    * invalid: ``tuple(range(1, ndim))``; the compiled graph must contain
      neither the abstract Ops nor ``GpuDnnBatchNorm`` /
      ``GpuDnnBatchNormGrad``.

    The functions are only compiled, never executed.

    Raises
    ------
    SkipTest
        If cuDNN is unavailable or older than v5.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")

    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        # ``dy`` is created once here and reused for every axes list
        x, scale, bias, mean, var, dy = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var', 'dy'))
        ndim = x.ndim

        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = bn.batch_normalization_train(
                x, scale, bias, axes)
            out_test = bn.batch_normalization_test(
                x, scale, bias, mean, var, axes)
            # backward pass
            grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)
            nodes = f.maker.fgraph.toposort()

            if axes in valid_axes_lists:
                # check if the abstract Ops have been replaced by the cuDNN Ops
                assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
                            in nodes])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
                            in nodes])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
                            in nodes])
                assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in nodes])
            else:
                # check if the abstract Ops have been replaced, but not by the cuDNN Ops
                # NOTE(review): dnn.GpuDnnBatchNormInference is not part of
                # this negative check although it is a cuDNN Op too -- confirm
                # against the inference optimizer whether it should be added.
                assert not any([isinstance(n.op, (dnn.GpuDnnBatchNorm,
                                                  dnn.GpuDnnBatchNormGrad,
                                                  bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in nodes])
def test_dnn_tag():
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论