提交 43411345 authored 作者: Gijs van Tulder's avatar Gijs van Tulder

Batch normalization optimizations for old gpu backend.

上级 ef21cb58
...@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape, ...@@ -18,6 +18,7 @@ from theano.tensor.nnet.abstract_conv import (get_conv_output_shape,
assert_conv_shape) assert_conv_shape)
from theano.tensor.signal.pool import ( from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet import bn
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp, dnn_available from theano.sandbox.cuda import GpuOp, dnn_available
...@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase): ...@@ -2347,6 +2348,23 @@ class GpuDnnBatchNormBase(DnnBase):
epsilon epsilon
Epsilon value used in the batch normalization formula. Minimum allowed Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN). value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
""" """
__props__ = ('mode', 'epsilon') __props__ = ('mode', 'epsilon')
...@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s; ...@@ -2395,17 +2413,15 @@ cudnnStatus_t err%(name)s;
result = """ result = """
cudnnStatus_t err%(name)s; cudnnStatus_t err%(name)s;
cudnnBatchNormMode_t mode%(name)s = %(mode)s; cudnnBatchNormMode_t mode%(name)s = %(mode)s;
double exponentialAverageFactor%(name)s = %(exp_avg_factor)f;
double epsilon%(name)s = %(epsilon)e; double epsilon%(name)s = %(epsilon)e;
""" % dict(name=name, """ % dict(name=name,
mode=mode, mode=mode,
exp_avg_factor=0, # deliberately unused
epsilon=self.epsilon) epsilon=self.epsilon)
return result return result
def c_code_cache_version(self): def c_code_cache_version(self):
return (3, version()) return (4, version())
class GpuDnnBatchNormInference(GpuDnnBatchNormBase): class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
...@@ -2422,8 +2438,21 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase): ...@@ -2422,8 +2438,21 @@ class GpuDnnBatchNormInference(GpuDnnBatchNormBase):
Note: scale, bias, mean and variance must follow the same tensor layout! Note: scale, bias, mean and variance must follow the same tensor layout!
""" """
__props__ = ('mode', 'epsilon', 'inplace')
tensor_descs = ['bn_input', 'bn_output', 'bn_params'] tensor_descs = ['bn_input', 'bn_output', 'bn_params']
def __init__(self, mode='per-activation', epsilon=1e-4, inplace=False):
super(GpuDnnBatchNormInference, self).__init__(mode=mode, epsilon=epsilon)
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def get_op_params(self):
params = []
if self.inplace:
params.append(('INPLACE_OUTPUT', '1'))
return params
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
# output shape equals shape of x # output shape equals shape of x
return [shape[0]] return [shape[0]]
...@@ -2460,10 +2489,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0) ...@@ -2460,10 +2489,16 @@ if (c_set_tensorNd(%(scale)s, bn_params_%(name)s) != 0)
} }
// build and prepare the output variable // build and prepare the output variable
#ifdef INPLACE_OUTPUT
Py_XDECREF(%(outp)s);
%(outp)s = %(inp)s;
Py_INCREF(%(outp)s);
#else
if (CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp)s)) != 0) if (CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp)s)) != 0)
{ {
%(fail)s %(fail)s
} }
#endif
// set output tensor descriptor from output tensor // set output tensor descriptor from output tensor
if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0) if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0)
...@@ -2494,6 +2529,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference( ...@@ -2494,6 +2529,16 @@ err%(name)s = cudnnBatchNormalizationForwardInference(
""" % dict(name=name, inp=inp, scale=scale, bias=bias, est_mean=est_mean, """ % dict(name=name, inp=inp, scale=scale, bias=bias, est_mean=est_mean,
est_var=est_var, outp=outp, fail=sub['fail']) est_var=est_var, outp=outp, fail=sub['fail'])
# add params
define_macros, undef_macros = self.get_c_macros(node, name, check_input=False)
result = """
%(define_macros)s
{
%(code)s
}
%(undef_macros)s
""" % dict(code=result, define_macros=define_macros, undef_macros=undef_macros)
return result return result
def grad(self, inputs, grads): def grad(self, inputs, grads):
...@@ -2537,28 +2582,84 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase): ...@@ -2537,28 +2582,84 @@ class GpuDnnBatchNorm(GpuDnnBatchNormBase):
Note: scale and bias must follow the same tensor layout! Note: scale and bias must follow the same tensor layout!
""" """
__props__ = ('mode', 'epsilon', 'running_averages',
'inplace_running_mean', 'inplace_running_var',
'inplace_output')
tensor_descs = ['bn_input', 'bn_output', 'bn_params'] tensor_descs = ['bn_input', 'bn_output', 'bn_params']
def __init__(self, mode='per-activation', epsilon=1e-4,
running_average_factor=0,
running_averages=False, inplace_running_mean=False,
inplace_running_var=False, inplace_output=False):
super(GpuDnnBatchNorm, self).__init__(mode=mode, epsilon=epsilon)
self.running_average_factor = running_average_factor
self.running_averages = running_averages
self.inplace_output = inplace_output
self.inplace_running_mean = inplace_running_mean
self.inplace_running_var = inplace_running_var
self.destroy_map = {}
if self.inplace_output:
self.destroy_map[0] = [0]
if self.running_averages and self.inplace_running_mean:
self.destroy_map[3] = [3]
if self.running_averages and self.inplace_running_var:
self.destroy_map[4] = [4]
def get_op_params(self):
params = []
if self.inplace_output:
params.append(('INPLACE_OUTPUT', '1'))
if self.running_averages:
params.append(('RUNNING_AVERAGES', '1'))
if self.inplace_running_mean:
params.append(('INPLACE_RUNNING_MEAN', '1'))
if self.inplace_running_var:
params.append(('INPLACE_RUNNING_VAR', '1'))
return params
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
# first output equals shape of x # first output equals shape of x
# second and third output equal shape of scale # other outputs equal shape of scale
return [shape[0], shape[1], shape[1]] return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
def make_node(self, x, scale, bias): def make_node(self, x, scale, bias,
running_mean=None, running_var=None):
assert x.ndim == scale.ndim == bias.ndim
assert x.ndim in (4, 5)
assert self.running_averages == (running_mean is not None) == (running_var is not None)
assert (running_mean is None or running_mean.ndim == x.ndim)
assert (running_var is None or running_var.ndim == x.ndim)
x = as_cuda_ndarray_variable(x) x = as_cuda_ndarray_variable(x)
scale = as_cuda_ndarray_variable(scale) scale = as_cuda_ndarray_variable(scale)
bias = as_cuda_ndarray_variable(bias) bias = as_cuda_ndarray_variable(bias)
assert x.ndim == scale.ndim == bias.ndim inputs = [x, scale, bias]
assert x.ndim in (4, 5) output_types = [x.type(), scale.type(), scale.type()]
return Apply(self, [x, scale, bias], [x.type(), scale.type(), scale.type()]) if running_mean is not None and running_var is not None:
inputs.append(as_cuda_ndarray_variable(running_mean))
inputs.append(as_cuda_ndarray_variable(running_var))
output_types.append(scale.type())
output_types.append(scale.type())
return Apply(self, inputs, output_types)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
# super call to prepare common configuration # super call to prepare common configuration
result = super(GpuDnnBatchNorm, self).c_code(node, name, inputs, outputs, sub) result = super(GpuDnnBatchNorm, self).c_code(node, name, inputs, outputs, sub)
# give sensible names to inputs and outputs # give sensible names to inputs and outputs
inp, scale, bias = inputs inp, scale, bias = inputs[:3]
outp, x_mean, x_invstd = outputs outp, x_mean, x_invstd = outputs[:3]
if self.running_averages:
running_average_factor = self.running_average_factor
in_running_mean = inputs[3]
in_running_var = inputs[4]
out_running_mean = outputs[3]
out_running_var = outputs[4]
else:
running_average_factor = 0.
in_running_mean = 'NULL'
in_running_var = 'NULL'
out_running_mean = 'NULL'
out_running_var = 'NULL'
# set input tensor descriptors from input tensors # set input tensor descriptors from input tensors
result += """ result += """
...@@ -2579,6 +2680,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp ...@@ -2579,6 +2680,32 @@ if ((CudaNdarray_prep_output(&%(outp)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(inp
{ {
%(fail)s %(fail)s
} }
#ifdef RUNNING_AVERAGES
#ifdef INPLACE_RUNNING_MEAN
Py_XDECREF(%(out_running_mean)s);
CudaNdarray *running_mean%(name)s = %(in_running_mean)s;
Py_INCREF(running_mean%(name)s);
#else
if ((CudaNdarray_prep_output(&%(out_running_mean)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(%(out_running_mean)s, %(in_running_mean)s) != 0))
{
%(fail)s
}
CudaNdarray *running_mean%(name)s = %(out_running_mean)s;
#endif
#ifdef INPLACE_RUNNING_VAR
Py_XDECREF(%(out_running_var)s);
CudaNdarray *running_var%(name)s = %(in_running_var)s;
Py_INCREF(running_var%(name)s);
#else
if ((CudaNdarray_prep_output(&%(out_running_var)s, %(inp)s->nd, CudaNdarray_HOST_DIMS(%(scale)s)) != 0) ||
(CudaNdarray_CopyFromCudaNdarray(%(out_running_var)s, %(in_running_var)s) != 0))
{
%(fail)s
}
CudaNdarray *running_var%(name)s = %(out_running_var)s;
#endif
#endif
// set output tensor descriptor from output tensor // set output tensor descriptor from output tensor
if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0) if (c_set_tensorNd(%(outp)s, bn_output_%(name)s) != 0)
...@@ -2601,25 +2728,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining( ...@@ -2601,25 +2728,66 @@ err%(name)s = cudnnBatchNormalizationForwardTraining(
bn_params_%(name)s, bn_params_%(name)s,
CudaNdarray_DEV_DATA(%(scale)s), CudaNdarray_DEV_DATA(%(scale)s),
CudaNdarray_DEV_DATA(%(bias)s), CudaNdarray_DEV_DATA(%(bias)s),
exponentialAverageFactor%(name)s, #ifdef RUNNING_AVERAGES
NULL, // running mean, deliberately unused %(running_average_factor)f,
NULL, // running var, deliberately unused CudaNdarray_DEV_DATA(running_mean%(name)s),
CudaNdarray_DEV_DATA(running_var%(name)s),
#else
0,
NULL,
NULL,
#endif
epsilon%(name)s, epsilon%(name)s,
CudaNdarray_DEV_DATA(%(x_mean)s), CudaNdarray_DEV_DATA(%(x_mean)s),
CudaNdarray_DEV_DATA(%(x_invstd)s) CudaNdarray_DEV_DATA(%(x_invstd)s)
); );
} }
#ifdef RUNNING_AVERAGES
%(out_running_mean)s = running_mean%(name)s;
%(out_running_var)s = running_var%(name)s;
#endif
""" % dict(name=name, inp=inp, scale=scale, bias=bias, outp=outp, """ % dict(name=name, inp=inp, scale=scale, bias=bias, outp=outp,
x_mean=x_mean, x_invstd=x_invstd, fail=sub['fail']) x_mean=x_mean, x_invstd=x_invstd,
running_average_factor=running_average_factor,
in_running_mean=in_running_mean, in_running_var=in_running_var,
out_running_mean=out_running_mean, out_running_var=out_running_var,
fail=sub['fail'])
# add params
define_macros, undef_macros = self.get_c_macros(node, name, check_input=False)
result = """
%(define_macros)s
{
%(code)s
}
%(undef_macros)s
""" % dict(code=result, define_macros=define_macros, undef_macros=undef_macros)
return result return result
def grad(self, inputs, grads): def grad(self, inputs, grads):
x, scale, bias = inputs x, scale, bias = inputs[:3]
dy = grads[0] dy = grads[0]
_, x_mean, x_invstd = self(x, scale, bias) _, x_mean, x_invstd = self(*inputs)[:3]
return GpuDnnBatchNormGrad(self.mode, self.epsilon)(x, dy, scale, disconnected_outputs = []
x_mean, x_invstd) # Optional running_mean and running_var.
for i in range(3, len(inputs)):
disconnected_outputs.append(DisconnectedType()())
return GpuDnnBatchNormGrad(self.mode, self.epsilon)(
x, dy, scale, x_mean, x_invstd) + disconnected_outputs
def connection_pattern(self, node):
patterns = [[True, True, True], # x
[True, True, True], # scale
[True, True, True]] # bias
# Optional running_mean and running_var are only
# connected to their new values.
for i in range(3, len(node.inputs)):
patterns[0].append(True)
for pattern in patterns[1:]:
pattern.append(False)
patterns.append([False] * (i) + [True])
return patterns
class GpuDnnBatchNormGrad(GpuDnnBatchNormBase): class GpuDnnBatchNormGrad(GpuDnnBatchNormBase):
...@@ -2722,7 +2890,8 @@ err%(name)s = cudnnBatchNormalizationBackward( ...@@ -2722,7 +2890,8 @@ err%(name)s = cudnnBatchNormalizationBackward(
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4): epsilon=1e-4, running_average_factor=0.1,
running_mean=None, running_var=None):
""" """
Performs batch normalization of the given inputs, using the mean and Performs batch normalization of the given inputs, using the mean and
variance of the inputs. variance of the inputs.
...@@ -2742,6 +2911,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', ...@@ -2742,6 +2911,23 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon : float epsilon : float
Epsilon value used in the batch normalization formula. Minimum allowed Epsilon value used in the batch normalization formula. Minimum allowed
value is 1e-5 (imposed by cuDNN). value is 1e-5 (imposed by cuDNN).
running_average_factor : float
Factor for updating the values of `running_mean` and `running_var`.
If the factor is close to one, the running averages will update quickly,
if the factor is close to zero it will update slowly.
running_mean : tensor or None
Previous value of the running mean. If this is given, the new value
``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
will be returned as one of the outputs of this function.
`running_mean` and `running_var` should either both be given or
both be None.
running_var : tensor or None
Previous value of the running variance. If this is given, the new value
``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
will be returned as one of the outputs of this function,
where `m` is the product of lengths of the averaged-over dimensions.
`running_mean` and `running_var` should either both be given or
both be None.
Returns Returns
------- -------
...@@ -2751,6 +2937,12 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', ...@@ -2751,6 +2937,12 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
Means of `inputs` across the normalization axes. Means of `inputs` across the normalization axes.
invstd : tensor invstd : tensor
Inverse standard deviations of `inputs` across the normalization axes. Inverse standard deviations of `inputs` across the normalization axes.
new_running_mean : tensor
New value of the running mean (only if both `running_mean` and
`running_var` were given).
new_running_var : tensor
New value of the running variance (only if both `running_var` and
`running_mean` were given).
Notes Notes
----- -----
...@@ -2762,31 +2954,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', ...@@ -2762,31 +2954,78 @@ def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
axes = 0 if mode == 'per-activation' else (0, 2, 3) axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True) mean = inputs.mean(axes, keepdims=True)
invstd = T.inv(T.sqrt(inputs.var(axes, keepdims=True) + epsilon)) var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
(m / (m - 1)) * var * running_average_factor
For 5d tensors, the axes are (0, 2, 3, 4). For 5d tensors, the axes are (0, 2, 3, 4).
""" """
ndim = inputs.ndim ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_train currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim: if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality " raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" % "as inputs; got %d and %d instead of %d" %
(gamma.ndim, beta.ndim, ndim)) (gamma.ndim, beta.ndim, ndim))
if (running_mean is None) != (running_var is None):
raise ValueError("running_mean and running_var must either both be "
"given or both be None")
if running_mean is not None and running_mean.ndim != ndim:
raise ValueError("running_mean must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_mean.ndim, ndim))
if running_var is not None and running_var.ndim != ndim:
raise ValueError("running_var must be of the same dimensionality "
"as inputs; got %d instead of %d" %
(running_var.ndim, ndim))
if epsilon < 1e-5: if epsilon < 1e-5:
raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon) raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
running_averages = (running_mean is not None and running_var is not None)
if ndim < 4: if ndim < 4:
inputs = theano.tensor.shape_padright(inputs, 4 - ndim) inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
gamma = theano.tensor.shape_padright(gamma, 4 - ndim) gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
beta = theano.tensor.shape_padright(beta, 4 - ndim) beta = theano.tensor.shape_padright(beta, 4 - ndim)
batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon) if running_averages:
result = tuple(batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma), running_mean = theano.tensor.shape_padright(running_mean, 4 - ndim)
gpu_contiguous(beta))) running_var = theano.tensor.shape_padright(running_var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
params_shape = gamma.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
if running_averages:
running_mean = theano.tensor.flatten(running_mean, 5)
running_var = theano.tensor.flatten(running_var, 5)
batchnorm_op = GpuDnnBatchNorm(mode=mode, epsilon=epsilon,
running_average_factor=running_average_factor,
running_averages=running_averages)
if running_averages:
out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta),
running_mean=gpu_contiguous(running_mean),
running_var=gpu_contiguous(running_var))
if new_running_mean.broadcastable != running_mean.broadcastable:
new_running_mean = tensor.patternbroadcast(new_running_mean, running_mean.broadcastable)
if new_running_var.broadcastable != running_var.broadcastable:
new_running_var = tensor.patternbroadcast(new_running_var, running_var.broadcastable)
result = (out, mean, invstd, new_running_mean, new_running_var)
else:
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta))
if ndim < 4: if ndim < 4:
result = tuple(theano.tensor.flatten(r, ndim) for r in result) result = tuple(theano.tensor.flatten(r, ndim) for r in result)
elif ndim > 5:
result = (theano.tensor.reshape(result[0], inputs_shape),) + tuple(
theano.tensor.reshape(r, params_shape) for r in result[1:])
return result return result
...@@ -2839,9 +3078,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var, ...@@ -2839,9 +3078,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
For 5d tensors, the axes would be (0, 2, 3, 4). For 5d tensors, the axes would be (0, 2, 3, 4).
""" """
ndim = inputs.ndim ndim = inputs.ndim
if ndim > 5:
raise ValueError("dnn_batch_normalization_test currently supports "
"up to 5-dimensional tensors only, got %d" % ndim)
if gamma.ndim != ndim or beta.ndim != ndim: if gamma.ndim != ndim or beta.ndim != ndim:
raise ValueError("gamma and beta must be of the same dimensionality " raise ValueError("gamma and beta must be of the same dimensionality "
"as inputs; got %d and %d instead of %d" % "as inputs; got %d and %d instead of %d" %
...@@ -2859,12 +3095,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var, ...@@ -2859,12 +3095,21 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
beta = theano.tensor.shape_padright(beta, 4 - ndim) beta = theano.tensor.shape_padright(beta, 4 - ndim)
mean = theano.tensor.shape_padright(mean, 4 - ndim) mean = theano.tensor.shape_padright(mean, 4 - ndim)
var = theano.tensor.shape_padright(var, 4 - ndim) var = theano.tensor.shape_padright(var, 4 - ndim)
elif ndim > 5:
inputs_shape = inputs.shape
inputs = theano.tensor.flatten(inputs, 5)
gamma = theano.tensor.flatten(gamma, 5)
beta = theano.tensor.flatten(beta, 5)
mean = theano.tensor.flatten(mean, 5)
var = theano.tensor.flatten(var, 5)
batchnorm_op = GpuDnnBatchNormInference(mode=mode, epsilon=epsilon) batchnorm_op = GpuDnnBatchNormInference(mode=mode, epsilon=epsilon)
result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma), result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
gpu_contiguous(beta), gpu_contiguous(mean), gpu_contiguous(beta), gpu_contiguous(mean),
gpu_contiguous(var)) gpu_contiguous(var))
if ndim < 4: if ndim < 4:
result = theano.tensor.flatten(result, ndim) result = theano.tensor.flatten(result, ndim)
elif ndim > 5:
result = theano.tensor.reshape(result, inputs_shape)
return result return result
...@@ -3334,3 +3579,235 @@ def local_abstractconv3d_cudnn(node): ...@@ -3334,3 +3579,235 @@ def local_abstractconv3d_cudnn(node):
subsample=node.op.subsample, subsample=node.op.subsample,
conv_mode=conv_mode) conv_mode=conv_mode)
return [rval] return [rval]
@local_optimizer([bn.AbstractBatchNormTrain])
def local_abstract_batch_norm_train_cudnn(node):
if not isinstance(node.op, bn.AbstractBatchNormTrain):
return None
x, scale, bias, epsilon, running_average_factor = node.inputs[:5]
running_mean = node.inputs[5] if len(node.inputs) > 5 else None
running_var = node.inputs[6] if len(node.inputs) > 6 else None
# input on gpu? TODO what about the output?
x_on_gpu = (isinstance(x.type, CudaNdarrayType) or
(x.owner and isinstance(x.owner.op, HostFromGpu)))
if not x_on_gpu:
return None
# convert axes to cuDNN mode
axes = tuple(node.op.axes)
if axes == (0,):
mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)):
mode = 'spatial'
else:
return None
try:
eps = float(theano.tensor.get_scalar_constant_value(epsilon))
except theano.tensor.NotScalarConstantError:
return None
if eps < 1e-5:
return None
try:
running_average_factor = float(theano.tensor.get_scalar_constant_value(running_average_factor))
except theano.tensor.NotScalarConstantError:
return None
if not dnn_available():
return None
x = as_cuda_ndarray_variable(x)
scale = as_cuda_ndarray_variable(scale)
bias = as_cuda_ndarray_variable(bias)
inputs = [x, scale, bias, mode, eps, running_average_factor]
if running_mean is not None and running_var is not None:
inputs.append(running_mean)
inputs.append(running_var)
results = list(dnn_batch_normalization_train(*inputs))
# If the original output was on CPU, we have to transfer it
for i in range(len(node.outputs)):
if isinstance(node.outputs[i].type, tensor.TensorType):
results[i] = tensor.as_tensor_variable(results[i])
# TODO copy_stack_trace?
return results
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_output(node):
if isinstance(node.op, GpuDnnBatchNorm) and not node.op.inplace_output:
return GpuDnnBatchNorm(mode=node.op.mode,
epsilon=node.op.epsilon,
running_average_factor=node.op.running_average_factor,
running_averages=node.op.running_averages,
inplace_running_mean=node.op.inplace_running_mean,
inplace_running_var=node.op.inplace_running_var,
inplace_output=True)(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_mean(node):
if isinstance(node.op, GpuDnnBatchNorm) and node.op.running_averages and not node.op.inplace_running_mean:
return GpuDnnBatchNorm(mode=node.op.mode,
epsilon=node.op.epsilon,
running_average_factor=node.op.running_average_factor,
running_averages=node.op.running_averages,
inplace_running_mean=True,
inplace_running_var=node.op.inplace_running_var,
inplace_output=node.op.inplace_output)(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNorm], inplace=True)
def local_gpu_batch_norm_inplace_running_var(node):
if isinstance(node.op, GpuDnnBatchNorm) and node.op.running_averages and not node.op.inplace_running_var:
return GpuDnnBatchNorm(mode=node.op.mode,
epsilon=node.op.epsilon,
running_average_factor=node.op.running_average_factor,
running_averages=node.op.running_averages,
inplace_running_mean=node.op.inplace_running_mean,
inplace_running_var=True,
inplace_output=node.op.inplace_output)(*node.inputs)
@register_inplace()
@local_optimizer([GpuDnnBatchNormInference], inplace=True)
def local_gpu_batch_norm_inference_inplace(node):
if isinstance(node.op, GpuDnnBatchNormInference) and not node.op.inplace:
return [GpuDnnBatchNormInference(mode=node.op.mode,
epsilon=node.op.epsilon,
inplace=True)(*node.inputs)]
@local_optimizer([bn.AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad_cudnn(node):
if not isinstance(node.op, bn.AbstractBatchNormTrainGrad):
return None
x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
# input on gpu? TODO what about the output?
x_on_gpu = (isinstance(x.type, CudaNdarrayType) or
(x.owner and isinstance(x.owner.op, HostFromGpu)))
dy_on_gpu = (isinstance(dy.type, CudaNdarrayType) or
(dy.owner and isinstance(dy.owner.op, HostFromGpu)))
if not (x_on_gpu or dy_on_gpu):
return None
# convert axes to cuDNN mode
axes = tuple(node.op.axes)
if axes == (0,):
mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)):
mode = 'spatial'
else:
return None
ndim = x.ndim
if ndim < 4:
x = theano.tensor.shape_padright(x, 4 - ndim)
dy = theano.tensor.shape_padright(dy, 4 - ndim)
scale = theano.tensor.shape_padright(scale, 4 - ndim)
x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim)
x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim)
elif ndim > 5:
x_shape = x.shape
params_shape = scale.shape
x = theano.tensor.flatten(x, 5)
dy = theano.tensor.flatten(dy, 5)
scale = theano.tensor.flatten(scale, 5)
x_mean = theano.tensor.flatten(x_mean, 5)
x_invstd = theano.tensor.flatten(x_invstd, 5)
try:
eps = float(theano.tensor.get_scalar_constant_value(epsilon))
except theano.tensor.NotScalarConstantError:
return None
if eps < 1e-5:
return None
if not dnn_available():
return None
x = as_cuda_ndarray_variable(x)
dy = as_cuda_ndarray_variable(dy)
scale = as_cuda_ndarray_variable(scale)
x_mean = as_cuda_ndarray_variable(x_mean)
x_invstd = as_cuda_ndarray_variable(x_invstd)
g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
GpuDnnBatchNormGrad(mode, epsilon=eps)(x, dy, scale, x_mean, x_invstd)
if ndim < 4:
g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim)
g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim)
g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim)
elif ndim > 5:
g_wrt_inputs = theano.tensor.reshape(g_wrt_inputs, x_shape)
g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape)
g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape)
# If the original output was on CPU, we have to transfer it
if isinstance(node.outputs[0].type, tensor.TensorType):
g_wrt_inputs = tensor.as_tensor_variable(g_wrt_inputs)
if isinstance(node.outputs[1].type, tensor.TensorType):
g_wrt_scale = tensor.as_tensor_variable(g_wrt_scale)
if isinstance(node.outputs[2].type, tensor.TensorType):
g_wrt_bias = tensor.as_tensor_variable(g_wrt_bias)
# TODO copy_stack_trace?
return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@local_optimizer([bn.AbstractBatchNormInference])
def local_abstract_batch_norm_inference_cudnn(node):
if not isinstance(node.op, bn.AbstractBatchNormInference):
return None
x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs
axes = tuple(node.op.axes)
if axes == (0,):
mode = 'per-activation'
elif axes == (0,) + tuple(range(2, x.ndim)):
mode = 'spatial'
else:
return None
# input on gpu? TODO what about the output?
x_on_gpu = (isinstance(x.type, CudaNdarrayType) or
(x.owner and isinstance(x.owner.op, HostFromGpu)))
if not x_on_gpu:
return None
try:
eps = float(theano.tensor.get_scalar_constant_value(epsilon))
except theano.tensor.NotScalarConstantError:
return None
if eps < 1e-5:
return None
if not dnn_available():
return None
x = as_cuda_ndarray_variable(x)
scale = as_cuda_ndarray_variable(scale)
bias = as_cuda_ndarray_variable(bias)
estimated_mean = as_cuda_ndarray_variable(estimated_mean)
estimated_variance = as_cuda_ndarray_variable(estimated_variance)
out = dnn_batch_normalization_test(x, scale, bias, estimated_mean, estimated_variance,
mode, eps)
# If the original output was on CPU, we have to transfer it
# TODO copy_stack_trace?
if isinstance(node.outputs[0].type, tensor.TensorType):
return [tensor.as_tensor_variable(out)]
else:
return [out]
...@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm', ...@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm, 30, local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm', 'conv_gemm',
'gpu', 'fast_compile', 'fast_run') 'gpu', 'fast_compile', 'fast_run')
# Register cuDNN batch normalization implementation
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpu_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)
# cuDNN optimizations are only registered if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
from .dnn import (local_abstract_batch_norm_train_cudnn,
local_abstract_batch_norm_train_grad_cudnn,
local_abstract_batch_norm_inference_cudnn) # noqa: 402
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_dnn',
local_abstract_batch_norm_train_cudnn, 20,
'batchnorm_dnn',
'gpu', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_grad_dnn',
local_abstract_batch_norm_train_grad_cudnn, 20,
'batchnorm_dnn',
'gpu', 'fast_compile', 'fast_run', 'cudnn')
abstract_batch_norm_groupopt.register('local_abstract_batch_norm_inference_dnn',
local_abstract_batch_norm_inference_cudnn, 20,
'batchnorm_dnn',
'gpu', 'fast_compile', 'fast_run', 'cudnn')
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from collections import OrderedDict
import logging import logging
import os import os
import sys import sys
...@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt ...@@ -18,6 +19,7 @@ import theano.tests.unittest_tools as utt
from theano.tensor.signal.pool import pool_2d, pool_3d from theano.tensor.signal.pool import pool_2d, pool_3d
from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.nnet import bn
import theano.sandbox.cuda.dnn as dnn import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared from theano.sandbox.cuda import float32_shared_constructor as shared
...@@ -730,52 +732,201 @@ def test_batchnorm_train(): ...@@ -730,52 +732,201 @@ def test_batchnorm_train():
raise SkipTest("batch normalization requires cudnn v5+") raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng() utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'): for mode in ('per-activation', 'spatial'):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector): for vartype in (tensor6, T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias')) x, scale, bias, running_mean, running_var = (vartype(n)
for n in ('x', 'scale', 'bias',
'running_mean',
'running_var'))
ndim = x.ndim ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used eps = 5e-3 # some non-standard value to test if it's used
running_average_factor = 0.3
# forward pass
out, x_mean, x_invstd = cuda.dnn.dnn_batch_normalization_train( # forward pass, direct interface
x, scale, bias, mode, eps) out_gpu, x_mean_gpu, x_invstd_gpu, \
out_running_mean_gpu, out_running_var_gpu = \
dnn.dnn_batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# forward pass, abstract interface
out_abstract, x_mean_abstract, x_invstd_abstract, \
out_running_mean_abstract, out_running_var_abstract = \
bn.batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# reference forward pass # reference forward pass
if mode == 'per-activation': if mode == 'per-activation':
axes = (0,) axes = (0,)
elif mode == 'spatial': elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim)) axes = (0,) + tuple(range(2, ndim))
x_mean2 = x.mean(axis=axes, keepdims=True) x_mean_ref = x.mean(axis=axes, keepdims=True)
x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps)) x_var_ref = x.var(axis=axes, keepdims=True)
scale2 = T.addbroadcast(scale, *axes) x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
bias2 = T.addbroadcast(bias, *axes) scale_ref = T.addbroadcast(scale, *axes)
out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2 bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
out_running_var_ref = running_var * (1 - running_average_factor) + \
(m / (m - 1)) * x_var_ref * running_average_factor
# backward pass # backward pass
dy = vartype('dy') dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy}) grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
# reference backward pass # reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy}) grads_ref = T.grad(None, wrt=[x, scale, bias], known_grads={out_ref: dy})
# compile # compile
f = theano.function([x, scale, bias, dy], f_gpu = theano.function([x, scale, bias, running_mean, running_var, dy],
[out, x_mean, x_invstd, out2, x_mean2, x_invstd2] + [out_gpu, x_mean_gpu, x_invstd_gpu,
grads + grads2, mode=mode_with_gpu) out_running_mean_gpu, out_running_var_gpu] + grads_gpu,
mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract,
out_running_mean_abstract, out_running_var_abstract] +
grads_abstract,
mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_ref, x_mean_ref, x_invstd_ref,
out_running_mean_ref, out_running_var_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
in f_abstract.maker.fgraph.toposort()])
assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run # run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)): for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim] data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape)) for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32') X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32') Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype('float32') Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype('float32') Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Dy) Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs # compare outputs
utt.assert_allclose(outputs[0], outputs[0 + 3]) # out utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs[1], outputs[1 + 3]) # mean utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs[2], outputs[2 + 3]) # invstd utt.assert_allclose(outputs_gpu[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_gpu[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
# compare gradients # compare gradients
utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4) # dx utt.assert_allclose(outputs_gpu[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4) # dscale utt.assert_allclose(outputs_gpu[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs[8], outputs[8 + 3]) # dbias utt.assert_allclose(outputs_gpu[7], outputs_ref[7]) # dbias
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias
def test_dnn_batchnorm_train_without_running_averages():
    """Compile and run batch_normalization_train without running averages.

    Exercises both the direct cuDNN interface and the abstract interface
    in the 3-output form (no running_mean/running_var inputs), and checks
    that the abstract Ops are lowered to the GpuDnn* Ops.

    Raises SkipTest when cuDNN (v5+) is not available.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass: without running averages, only 3 outputs are returned
    out_gpu, x_mean_gpu, x_invstd_gpu = \
        dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')
    # backward pass
    grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
    # BUGFIX: the abstract gradients must be taken w.r.t. the abstract
    # output (out_abstract), not the direct-interface output (out_gpu) --
    # otherwise the abstract backward path is never exercised.
    grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
    # compile
    f_gpu = theano.function([x, scale, bias, dy],
                            [out_gpu, x_mean_gpu, x_invstd_gpu] +
                            grads_gpu,
                            mode=mode_with_gpu)
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract, x_invstd_abstract] +
                                 grads_abstract,
                                 mode=mode_with_gpu)
    # check if the abstract Ops have been replaced
    assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
                for n in f_abstract.maker.fgraph.toposort()])
    assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad)
                for n in f_abstract.maker.fgraph.toposort()])
    assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                      bn.AbstractBatchNormInference,
                                      bn.AbstractBatchNormTrainGrad))
                    for n in f_abstract.maker.fgraph.toposort()])
    # run (smoke test: just check that both compiled functions execute)
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_gpu(X, Scale, Bias, Dy)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
    """Check that updating shared running averages through ``updates``
    triggers the inplace_running_mean / inplace_running_var / inplace_output
    optimizations on the GpuDnnBatchNorm node, and that the resulting
    function runs.

    Raises SkipTest when cuDNN (v5+) is not available.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    x = T.tensor4('x')
    scale = T.tensor4('scale')
    bias = T.tensor4('bias')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    param_bcast = (True, False, False, False)
    running_mean = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=param_bcast)
    running_var = shared(
        numpy.random.randn(*param_shape).astype(theano.config.floatX),
        broadcastable=param_bcast)

    # forward pass: with running averages given, two extra outputs are
    # returned holding the updated averages
    out, x_mean, x_invstd, new_running_mean, new_running_var = \
        dnn.dnn_batch_normalization_train(
            x, scale, bias, 'per-activation',
            epsilon=5e-3, running_average_factor=0.3,
            running_mean=running_mean, running_var=running_var)

    # feed the updated averages back into the shared variables
    updates = OrderedDict([(running_mean, new_running_mean),
                           (running_var, new_running_var)])
    f = theano.function([x, scale, bias],
                        [out, x_mean, x_invstd],
                        updates=updates,
                        mode=mode_with_gpu)

    # the optimizer should have enabled all three in-place flags
    bn_nodes = [node for node in f.maker.fgraph.toposort()
                if isinstance(node.op, dnn.GpuDnnBatchNorm)]
    assert len(bn_nodes) == 1
    bn_op = bn_nodes[0].op
    assert bn_op.inplace_running_mean
    assert bn_op.inplace_running_var
    assert bn_op.inplace_output

    # smoke-run the compiled function on random data
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f(X, Scale, Bias)
def test_batchnorm_inference(): def test_batchnorm_inference():
...@@ -785,53 +936,160 @@ def test_batchnorm_inference(): ...@@ -785,53 +936,160 @@ def test_batchnorm_inference():
raise SkipTest("batch normalization requires cudnn v5+") raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng() utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
for mode in ('per-activation', 'spatial'): for mode in ('per-activation', 'spatial'):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector): for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
x, scale, bias, mean, var = (vartype(n) for n in ('x', 'scale', x, scale, bias, mean, var = (vartype(n)
'bias', 'mean', for n in ('x', 'scale', 'bias', 'mean', 'var'))
'var'))
ndim = x.ndim ndim = x.ndim
eps = 5e-3 # some non-standard value to test if it's used eps = 5e-3 # some non-standard value to test if it's used
# forward pass # forward pass, direct interface
out = cuda.dnn.dnn_batch_normalization_test(x, scale, bias, mean, out_gpu = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
var, mode, eps) var, mode, eps)
# forward pass, abstract interface
out_abstract = bn.batch_normalization_test(x, scale, bias, mean,
var, mode, eps)
# reference forward pass # reference forward pass
if mode == 'per-activation': if mode == 'per-activation':
axes = (0,) axes = (0,)
elif mode == 'spatial': elif mode == 'spatial':
axes = (0,) + tuple(range(2, ndim)) axes = (0,) + tuple(range(2, ndim))
scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes) scale_ref, bias_ref, mean_ref, var_ref = (T.addbroadcast(t, *axes)
for t in (scale, bias, mean, var)) for t in (scale, bias, mean, var))
out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2 out_ref = (x - mean_ref) * (scale_ref / T.sqrt(var_ref + eps)) + bias_ref
# backward pass # backward pass
dy = vartype('dy') dy = vartype('dy')
grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy}) grads_gpu = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_abstract: dy})
# reference backward pass # reference backward pass
grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy}) grads_ref = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_ref: dy})
# compile # compile
f = theano.function([x, scale, bias, mean, var, dy], f_gpu = theano.function([x, scale, bias, mean, var, dy],
[out, out2] + grads + grads2, mode=mode_with_gpu) [out_gpu] + grads_gpu, mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, mean, var, dy],
[out_abstract] + grads_abstract, mode=mode_with_gpu)
f_ref = theano.function([x, scale, bias, mean, var, dy],
[out_ref] + grads_ref)
# check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
in f_abstract.maker.fgraph.toposort()])
assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
bn.AbstractBatchNormInference,
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run # run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)): for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim] data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape)) for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32') X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32') Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype('float32') Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype('float32') Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype('float32') Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype('float32') Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
outputs = f(X, Scale, Bias, Mean, Var, Dy) outputs_gpu = f_gpu(X, Scale, Bias, Mean, Var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Mean, Var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Mean, Var, Dy)
# compare outputs # compare outputs
utt.assert_allclose(outputs[0], outputs[1]) # out utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
# compare gradients # compare gradients
utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5) # dx utt.assert_allclose(outputs_gpu[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5) # dscale utt.assert_allclose(outputs_gpu[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean utt.assert_allclose(outputs_gpu[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar utt.assert_allclose(outputs_gpu[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
utt.assert_allclose(outputs_abstract[1], outputs_ref[1], atol=4e-5) # dx
utt.assert_allclose(outputs_abstract[2], outputs_ref[2], atol=4e-5) # dscale
utt.assert_allclose(outputs_abstract[3], outputs_ref[3]) # dbias
utt.assert_allclose(outputs_abstract[4], outputs_ref[4]) # dmean
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], rtol=2e-3, atol=4e-5) # dvar
def test_batchnorm_inference_inplace():
    """Check that the inference Op is replaced by its in-place variant
    and that the compiled function runs.

    Raises SkipTest when cuDNN (v5+) is not available.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    names = ('x', 'scale', 'bias', 'mean', 'var')
    x, scale, bias, mean, var = [T.tensor4(name) for name in names]
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    out = dnn.dnn_batch_normalization_test(x, scale, bias, mean, var)
    f = theano.function([x, scale, bias, mean, var], [out], mode=mode_with_gpu)

    # the optimizer should have switched the node to its in-place form
    bn_nodes = [node for node in f.maker.fgraph.toposort()
                if isinstance(node.op, dnn.GpuDnnBatchNormInference)]
    assert len(bn_nodes) == 1
    assert bn_nodes[0].op.inplace

    # smoke-run the compiled function on random data
    floatX = theano.config.floatX
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(floatX)
    Scale = numpy.random.randn(*param_shape).astype(floatX)
    Bias = numpy.random.randn(*param_shape).astype(floatX)
    Mean = numpy.random.randn(*param_shape).astype(floatX)
    Var = numpy.random.rand(*param_shape).astype(floatX)
    f(X, Scale, Bias, Mean, Var)
def test_dnn_batchnorm_valid_and_invalid_axes():
    """Check that the cuDNN optimizers replace the abstract batch
    normalization Ops only for axis combinations cuDNN supports
    (per-activation and spatial), while unsupported axes are still
    lowered, just not to the cuDNN Ops.

    Raises SkipTest when cuDNN (v5+) is not available.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")

    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
        x, scale, bias, mean, var, dy = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var', 'dy'))
        ndim = x.ndim

        # supported: per-activation and spatial
        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
        # not supported: an axes list without 0 and including 1
        invalid_axes_lists = (tuple(range(1, ndim)),)
        for axes in valid_axes_lists + invalid_axes_lists:
            # forward pass, abstract interface
            out_train, x_mean, x_invstd = bn.batch_normalization_train(
                x, scale, bias, axes)
            out_test = bn.batch_normalization_test(
                x, scale, bias, mean, var, axes)
            # backward pass (dy was already created in the unpacking above;
            # the redundant second `dy = vartype('dy')` has been removed)
            grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_test: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out_train, x_mean, x_invstd, out_test] +
                                grads_train + grads_test,
                                mode=mode_with_gpu)
            if axes in valid_axes_lists:
                # check if the abstract Ops have been replaced by the cuDNN Ops
                assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
                            in f.maker.fgraph.toposort()])
                assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
                            in f.maker.fgraph.toposort()])
                assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
            else:
                # check if the abstract Ops have been replaced, but not by the cuDNN Ops
                assert not any([isinstance(n.op, (dnn.GpuDnnBatchNorm,
                                                  dnn.GpuDnnBatchNormGrad,
                                                  bn.AbstractBatchNormTrain,
                                                  bn.AbstractBatchNormInference,
                                                  bn.AbstractBatchNormTrainGrad)) for n
                                in f.maker.fgraph.toposort()])
def test_dnn_tag(): def test_dnn_tag():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论