提交 ea927aef authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5474 from nouiz/abstract_bn

Follow up from abstract bn PR
......@@ -28,7 +28,6 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
assert_conv_shape)
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet import bn
from . import pygpu
from .type import (get_context, gpu_context_type, list_contexts,
GpuArraySharedVariable)
......@@ -87,10 +86,6 @@ def _make_handle(ctx):
return handle
def raise_no_cudnn(msg="cuDNN is required for convolution and pooling"):
    """Signal that cuDNN is unavailable by raising ``RuntimeError``.

    Parameters
    ----------
    msg : str
        The message attached to the raised exception.

    Raises
    ------
    RuntimeError
        Always raised, carrying `msg`.
    """
    error = RuntimeError(msg)
    raise error
def _dnn_check_compile():
preambule = """
#include <stdio.h>
......@@ -2733,7 +2728,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
inp2 = inputs[1]
if not dnn_available(inp1.type.context_name):
raise_no_cudnn()
return
if op.filter_flip:
conv_mode = 'conv'
......@@ -2776,7 +2771,7 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
inp2 = inputs[1]
if not dnn_available(inp1.type.context_name):
raise_no_cudnn()
return
if op.filter_flip:
conv_mode = 'conv'
......@@ -2902,7 +2897,7 @@ def local_dnn_convi_output_merge(node, *inputs):
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
return
if not op.ignore_border:
return
img, ws, stride, pad = inputs
......@@ -2931,7 +2926,7 @@ pool_db2.register("local_gpua_pool_dnn_alternative",
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
return
if not op.ignore_border:
return
inp, out, out_grad, ws, stride, pad = inputs
......@@ -2975,7 +2970,7 @@ pool_db2.register("local_gpua_pool_dnn_grad_stride",
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
return
if not op.ignore_border:
return
inp, out_grad, ws, stride, pad = inputs
......@@ -3018,7 +3013,7 @@ pool_db2.register("local_gpua_avg_pool_dnn_grad_stride",
def local_softmax_dnn(node):
if isinstance(node.op, GpuSoftmax):
if not dnn_available(node.outputs[0].type.context_name):
raise_no_cudnn()
return
ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
ins = gpu_contiguous(ins)
out = GpuDnnSoftmax('accurate', 'channel')(ins)
......@@ -3035,9 +3030,6 @@ def local_log_softmax_dnn(node):
node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and
len(node.inputs[0].clients) == 1):
if version(raises=False) < 3000:
# No log-softmax before cudnn v3
raise_no_cudnn("Need cuDNN v3 for LogSoftmax")
softmax_node = node.inputs[0].owner
new_softmax = GpuDnnSoftmax('log', softmax_node.op.mode)
return [new_softmax(softmax_node.inputs[0])]
......@@ -3051,9 +3043,8 @@ def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
inp = inputs[0]
if inp.ndim != 2:
return
if not dnn_available(ctx_name) or version(raises=False) < 3000:
# No log-softmax before cudnn v3
raise_no_cudnn("Need cuDNN v3 for LogSoftmax")
if not dnn_available(ctx_name):
return
inp = inp.dimshuffle(0, 1, 'x', 'x')
inp.tag.context_name = ctx_name
......@@ -3087,7 +3078,7 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn("cuDNN needed for SoftmaxGrad")
return
ins = []
for n in inputs:
n = as_gpuarray_variable(n, ctx_name)
......@@ -3100,9 +3091,6 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
return [out.dimshuffle(0, 2)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrain])
@register_opt2([bn.AbstractBatchNormTrain], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, epsilon, running_average_factor = inputs[:5]
running_mean = inputs[5] if len(inputs) > 5 else None
......@@ -3130,8 +3118,7 @@ def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
ctx = infer_context_name(*inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
return
x = as_gpuarray_variable(x, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
bias = as_gpuarray_variable(bias, context_name=ctx)
......@@ -3186,9 +3173,6 @@ def local_batch_norm_inference_inplace(node):
return [GpuDnnBatchNormInference(mode=node.op.mode, inplace=True)(*node.inputs)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrainGrad])
@register_opt2([bn.AbstractBatchNormTrainGrad], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
x, dy, scale, x_mean, x_invstd, epsilon = inputs
......@@ -3234,8 +3218,7 @@ def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
ctx = infer_context_name(*inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
return
x = as_gpuarray_variable(x, context_name=ctx)
dy = as_gpuarray_variable(dy, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
......@@ -3257,9 +3240,6 @@ def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormInference])
@register_opt2([bn.AbstractBatchNormInference], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_inference_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
......@@ -3280,8 +3260,7 @@ def local_abstract_batch_norm_inference_cudnn(op, ctx_name, inputs, outputs):
ctx = infer_context_name(*inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
return
x = as_gpuarray_variable(x, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
bias = as_gpuarray_variable(bias, context_name=ctx)
......
......@@ -22,6 +22,7 @@ from theano.scalar.basic import Scalar, Pow, Cast
from theano.scalar.basic_scipy import Erfinv, Erfcinv
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet import bn
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
......@@ -1964,9 +1965,8 @@ abstractconv_groupopt = theano.gof.optdb.LocalGroupDB()
abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
register_opt('fast_compile')(abstractconv_groupopt)
# cuDNN is first, but only registered if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
# We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn
from .dnn import (local_abstractconv_cudnn, local_abstractconv_gw_cudnn,
local_abstractconv_gi_cudnn) # noqa: 402
abstractconv_groupopt.register('local_abstractconv_dnn',
......@@ -2005,3 +2005,56 @@ abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm',
'gpuarray', 'fast_compile', 'fast_run')
# Register cuDNN batch normalization implementation
# We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn
from .dnn import (local_abstract_batch_norm_train_cudnn,
local_abstract_batch_norm_train_grad_cudnn,
local_abstract_batch_norm_inference_cudnn) # noqa: 402
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)
abstract_batch_norm_db = LocalGroupDB()
abstract_batch_norm_db2 = LocalGroupDB(
local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
abstract_batch_norm_db2.__name__ = "abstract_batch_norm_db2"
register_opt('fast_compile', name='abstract_batch_norm_db')(
abstract_batch_norm_db)
register_opt2([bn.AbstractBatchNormTrain,
bn.AbstractBatchNormTrainGrad,
bn.AbstractBatchNormInference],
'fast_compile', name='abstract_batch_norm_db2')(
abstract_batch_norm_db2)
for op, fct, cpu in [(bn.AbstractBatchNormTrain,
local_abstract_batch_norm_train_cudnn,
bn.local_abstract_batch_norm_train),
(bn.AbstractBatchNormTrainGrad,
local_abstract_batch_norm_train_grad_cudnn,
bn.local_abstract_batch_norm_train_grad),
(bn.AbstractBatchNormInference,
local_abstract_batch_norm_inference_cudnn,
bn.local_abstract_batch_norm_inference)]:
lifter = op_lifter([op])(fct)
abstract_batch_norm_db.register(fct.__name__,
lifter,
'gpuarray', 'fast_compile', 'fast_run',
'cudnn', 'batchnorm_dnn',
position=1)
abstract_batch_norm_db2.register(fct.__name__,
local_optimizer([op])(fct),
'gpuarray', 'fast_compile', 'fast_run',
'cudnn', 'batchnorm_dnn',
position=1)
# cpu is a normal optimization. We can't register it in
# GraphToGPU. So for now, only add it to the slower EQ phase. If
# there is no cuDNN, we still want to move it to the GPU now with
# a Theano graph so to have this graph on the GPU.
abstract_batch_norm_db.register(cpu.__name__, cpu,
'gpuarray', 'fast_compile', 'fast_run',
position='last')
......@@ -22,6 +22,7 @@ if theano.config.mode == 'FAST_COMPILE':
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
mode_without_gpu.check_py_code = False
# If using float16, cast reference input to float32
......
......@@ -26,6 +26,10 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
mode_with_gpu = mode_with_gpu.including()
# Globally disabled for mode_without_gpu
mode_with_gpu.check_py_code = False
# If using float16, set CUDNN precision to float32
def set_precision(floatX):
......@@ -1451,7 +1455,7 @@ def test_dnn_batchnorm_train():
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
for data_shape in ((5, 10, 30, 4, 10, 5), (4, 3, 1, 1, 1, 1), (2, 3, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
......@@ -1505,7 +1509,7 @@ def test_dnn_batchnorm_train_without_running_averages():
bn.batch_normalization_train(x, scale, bias, 'per-activation')
# backward pass
grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
# compile
f_gpu = theano.function([x, scale, bias, dy],
[out_gpu, x_mean_gpu, x_invstd_gpu] +
......@@ -1533,6 +1537,44 @@ def test_dnn_batchnorm_train_without_running_averages():
f_abstract(X, Scale, Bias, Dy)
def test_without_dnn_batchnorm_train_without_running_averages():
    # Compile and run batch_normalization_train without running averages,
    # with cuDNN explicitly disabled, and make sure the graph still ends
    # up on the GPU (via plain GPU elemwise ops instead of cuDNN ops).
    utt.seed_rng()
    x, scale, bias, dy = (T.tensor4('x'), T.tensor4('scale'),
                          T.tensor4('bias'), T.tensor4('dy'))
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')

    # backward pass
    grads_abstract = T.grad(None, wrt=[x, scale, bias],
                            known_grads={out_abstract: dy})

    # compile, excluding the cudnn optimizations
    f_abstract = theano.function(
        [x, scale, bias, dy],
        [out_abstract, x_mean_abstract, x_invstd_abstract] + grads_abstract,
        mode=mode_with_gpu.excluding('cudnn'))

    # check that no cuDNN nor abstract batch-norm Ops remain, and that the
    # computation was still moved to the GPU.
    topo = f_abstract.maker.fgraph.toposort()
    assert not any(isinstance(n.op, dnn.GpuDnnBatchNorm) for n in topo)
    assert not any(isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n in topo)
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    assert not any(isinstance(n.op, abstract_ops) for n in topo)
    assert any(isinstance(n.op, dnn.GpuElemwise) for n in topo)

    # run
    X = 4 + 3 * np.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * np.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = np.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = np.random.randn(*param_shape).astype(theano.config.floatX)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
# test inplace_running_mean and inplace_running_var
if not dnn.dnn_available(test_ctx_name):
......@@ -1628,7 +1670,7 @@ def test_batchnorm_inference():
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
for data_shape in ((10, 2, 30, 4, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
......
......@@ -3705,6 +3705,19 @@ def local_gpu_batch_norm_inference_inplace(node):
inplace=True)(*node.inputs)]
def values_eq_approx_high_tol(a, b):
    """
    Approximate-equality check with a raised absolute tolerance.

    This is needed so that DebugMode does not raise spurious errors
    caused by rounding error. This happens because we reduce over the
    two last dimensions, so the absolute error can grow when the number
    of elements reduced over is significant.
    """
    return tensor.TensorType.values_eq_approx(a, b, atol=0.015)
@local_optimizer([bn.AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad_cudnn(node):
if not isinstance(node.op, bn.AbstractBatchNormTrainGrad):
......@@ -3781,6 +3794,9 @@ def local_abstract_batch_norm_train_grad_cudnn(node):
if isinstance(node.outputs[2].type, tensor.TensorType):
g_wrt_bias = tensor.as_tensor_variable(g_wrt_bias)
# TODO copy_stack_trace?
g_wrt_inputs.tag.values_eq_approx = values_eq_approx_high_tol
g_wrt_scale.tag.values_eq_approx = values_eq_approx_high_tol
return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
......
......@@ -38,6 +38,8 @@ if theano.config.mode == 'FAST_COMPILE':
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
mode_with_gpu.check_py_code = False
mode_without_gpu.check_py_code = False
def test_dnn_conv_desc_merge():
......@@ -732,7 +734,7 @@ def test_batchnorm_train():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
tensor6 = T.TensorType('float32', (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (tensor6, T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
......@@ -766,7 +768,7 @@ def test_batchnorm_train():
x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
scale_ref = T.addbroadcast(scale, *axes)
bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), 'float32')
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
......@@ -801,16 +803,16 @@ def test_batchnorm_train():
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
for data_shape in ((5, 2, 30, 4, 10, 5), (4, 3, 1, 1, 1, 1), (2, 3, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
Running_mean = numpy.random.randn(*param_shape).astype('float32')
Running_var = numpy.random.randn(*param_shape).astype('float32')
outputs_gpu = f_gpu(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
......@@ -844,7 +846,7 @@ def test_dnn_batchnorm_train_without_running_averages():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
x, scale, bias, dy = T.ftensor4('x'), T.ftensor4('scale'), T.ftensor4('bias'), T.ftensor4('dy')
data_shape = (5, 10, 30, 25)
param_shape = (1, 10, 30, 25)
......@@ -875,10 +877,10 @@ def test_dnn_batchnorm_train_without_running_averages():
bn.AbstractBatchNormTrainGrad))
for n in f_abstract.maker.fgraph.toposort()])
# run
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
f_gpu(X, Scale, Bias, Dy)
f_abstract(X, Scale, Bias, Dy)
......@@ -891,14 +893,14 @@ def test_dnn_batchnorm_train_inplace():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
x, scale, bias = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias')
x, scale, bias = T.ftensor4('x'), T.ftensor4('scale'), T.ftensor4('bias')
data_shape = (5, 10, 30, 25)
param_shape = (1, 10, 30, 25)
running_mean = shared(
numpy.random.randn(*param_shape).astype(theano.config.floatX),
numpy.random.randn(*param_shape).astype('float32'),
broadcastable=(True, False, False, False))
running_var = shared(
numpy.random.randn(*param_shape).astype(theano.config.floatX),
numpy.random.randn(*param_shape).astype('float32'),
broadcastable=(True, False, False, False))
# forward pass
......@@ -923,9 +925,9 @@ def test_dnn_batchnorm_train_inplace():
assert nodes[0].op.inplace_running_var
assert nodes[0].op.inplace_output
# run
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
f(X, Scale, Bias)
......@@ -936,10 +938,10 @@ def test_batchnorm_inference():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
tensor6 = T.TensorType('float32', (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
for vartype in (tensor6, T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias, mean, var = (vartype(n)
for n in ('x', 'scale', 'bias', 'mean', 'var'))
ndim = x.ndim
......@@ -980,16 +982,16 @@ def test_batchnorm_inference():
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
for data_shape in ((10, 2, 15, 4, 6, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
Mean = numpy.random.randn(*param_shape).astype('float32')
Var = numpy.random.rand(*param_shape).astype('float32')
outputs_gpu = f_gpu(X, Scale, Bias, Mean, Var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Mean, Var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Mean, Var, Dy)
......@@ -1017,7 +1019,7 @@ def test_batchnorm_inference_inplace():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
x, scale, bias, mean, var = (T.tensor4(n) for n in ('x', 'scale', 'bias', 'mean', 'var'))
x, scale, bias, mean, var = (T.ftensor4(n) for n in ('x', 'scale', 'bias', 'mean', 'var'))
data_shape = (5, 10, 30, 25)
param_shape = (1, 10, 30, 25)
......@@ -1031,11 +1033,11 @@ def test_batchnorm_inference_inplace():
assert nodes[0].op.inplace
# run
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
Mean = numpy.random.randn(*param_shape).astype('float32')
Var = numpy.random.rand(*param_shape).astype('float32')
f(X, Scale, Bias, Mean, Var)
......@@ -1045,7 +1047,7 @@ def test_dnn_batchnorm_valid_and_invalid_axes():
if cuda.dnn.version() < (5000, 5000):
raise SkipTest("batch normalization requires cudnn v5+")
for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix):
x, scale, bias, mean, var, dy = (vartype(n)
for n in ('x', 'scale', 'bias', 'mean', 'var', 'dy'))
ndim = x.ndim
......
......@@ -88,6 +88,14 @@ def upcast(dtype, *dtypes):
return rval
def as_common_dtype(*variables):
    """
    Cast all given variables to their common (upcast) dtype.

    Works for both theano.scalar.Scalar and TensorVariable inputs
    (anything exposing a ``dtype`` attribute and an ``astype`` method).

    Parameters
    ----------
    *variables
        Variables whose dtypes are combined with :func:`upcast`.

    Returns
    -------
    generator
        The input variables, each cast to the common dtype, in the
        original order.
    """
    # Renamed from `vars` to avoid shadowing the builtin of that name;
    # callers are unaffected since *args cannot be passed by keyword.
    dtype = upcast(*[v.dtype for v in variables])
    return (v.astype(dtype) for v in variables)
def get_scalar_type(dtype):
"""
Return a Scalar(dtype) object.
......
......@@ -7,7 +7,7 @@ from theano.gof.opt import copy_stack_trace
from theano.tensor import as_tensor_variable, TensorType
from theano.tensor import basic as T
from theano.tensor.opt import register_specialize_device
from theano.scalar import Composite
from theano.scalar import Composite, as_common_dtype
from theano.scalar import add, sub, true_div, mul
......@@ -413,15 +413,27 @@ class AbstractBatchNormTrain(Op):
def make_node(self, x, scale, bias, epsilon=1e-4,
running_average_factor=0.1,
running_mean=None, running_var=None):
x = as_tensor_variable(x)
scale = as_tensor_variable(scale)
bias = as_tensor_variable(bias)
epsilon = as_tensor_variable(epsilon)
running_average_factor = as_tensor_variable(running_average_factor)
if running_mean is not None:
running_mean = as_tensor_variable(running_mean)
if running_var is not None:
running_var = as_tensor_variable(running_var)
assert x.ndim == scale.ndim == bias.ndim
assert ((running_mean is None and running_var is None) or
(running_mean is not None and running_var is not None))
assert (running_mean is None or running_mean.ndim == x.ndim)
assert (running_var is None or running_var.ndim == x.ndim)
if not isinstance(epsilon, theano.Variable):
epsilon = as_tensor_variable(epsilon)
if not isinstance(running_average_factor, theano.Variable):
running_average_factor = as_tensor_variable(running_average_factor)
# Upcast to common dtype on the non-scalar
# Keep as is dtype of scalar (epsilon and running_average_factor)
if running_mean:
x, scale, bias, running_mean, running_var = as_common_dtype(
x, scale, bias, running_mean, running_var)
else:
x, scale, bias = as_common_dtype(x, scale, bias)
inputs = [x, scale, bias, epsilon, running_average_factor]
output_types = [x.type(), scale.type(), scale.type()]
if running_mean is not None and running_var is not None:
......@@ -513,9 +525,18 @@ class AbstractBatchNormInference(Op):
return [shape[0]]
def make_node(self, x, scale, bias, estimated_mean, estimated_variance, epsilon=1e-4):
x = as_tensor_variable(x)
scale = as_tensor_variable(scale)
bias = as_tensor_variable(bias)
estimated_mean = as_tensor_variable(estimated_mean)
estimated_variance = as_tensor_variable(estimated_variance)
epsilon = as_tensor_variable(epsilon)
# Upcast to common dtype on the non-scalar
# Keep as is dtype of scalar (epsilon)
x, scale, bias, estimated_mean, estimated_variance = as_common_dtype(
x, scale, bias, estimated_mean, estimated_variance)
assert x.ndim == scale.ndim == bias.ndim == estimated_mean.ndim == estimated_variance.ndim
if not isinstance(epsilon, theano.Variable):
epsilon = as_tensor_variable(epsilon)
return Apply(self, [x, scale, bias, estimated_mean, estimated_variance, epsilon], [x.type()])
def grad(self, inputs, grads):
......@@ -561,9 +582,18 @@ class AbstractBatchNormTrainGrad(Op):
self.axes = axes
def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):
x = as_tensor_variable(x)
dy = as_tensor_variable(dy)
scale = as_tensor_variable(scale)
x_mean = as_tensor_variable(x_mean)
x_invstd = as_tensor_variable(x_invstd)
epsilon = as_tensor_variable(epsilon)
# Upcast to common dtype on the non-scalar
# Keep as is dtype of scalar (epsilon)
x, dy, scale, x_mean, x_invstd = as_common_dtype(
x, dy, scale, x_mean, x_invstd)
assert x.ndim == dy.ndim == scale.ndim == x_mean.ndim == x_invstd.ndim
if not isinstance(epsilon, theano.Variable):
epsilon = as_tensor_variable(epsilon)
return Apply(self, [x, dy, scale, x_mean, x_invstd, epsilon],
[x.type(), scale.type(), scale.type()])
......@@ -612,6 +642,9 @@ def local_abstract_batch_norm_train(node):
mean = x.mean(axes, keepdims=True)
var = x.var(axes, keepdims=True)
# The epsilon should not upcast the dtype.
if var.dtype == 'float32' and epsilon.dtype == 'float64':
epsilon = epsilon.astype('float32')
invstd = T.inv(T.sqrt(var + epsilon))
out = (x - mean) * (scale * invstd) + bias
results = [out, mean, invstd]
......@@ -687,6 +720,10 @@ def local_abstract_batch_norm_inference(node):
not isinstance(epsilon.type, TensorType):
return None
# The epsilon should not upcast the dtype.
if estimated_variance.dtype == 'float32' and epsilon.dtype == 'float64':
epsilon = epsilon.astype('float32')
result = (x - estimated_mean) * (scale / T.sqrt(estimated_variance + epsilon)) + bias
result = T.patternbroadcast(result, node.outputs[0].broadcastable)
......
......@@ -201,7 +201,7 @@ def test_batch_normalization_train():
bn.AbstractBatchNormTrainGrad))
for n in f.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (2, 3, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes2 else s
for d, s in enumerate(data_shape))
......
......@@ -203,9 +203,10 @@ class TensorType(Type):
"""
Convert a symbolic Variable into a TensorType, if compatible.
For the moment, only a TensorType or CudaNdarrayType will be
converted, provided they have the same number of dimensions,
broadcastable pattern, and dtype.
For the moment, only a TensorType, GpuArrayType and
CudaNdarrayType will be
converted, provided they have the same number of dimensions and
dtype and have "compatible" broadcastable pattern.
"""
if hasattr(other, '_as_TensorVariable'):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论