Commit ae5e5a03 authored by Gijs van Tulder

Batch norm optimizations for gpuarray.

Parent 9ad04124
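These changes teach the gpuarray backend to rewrite the abstract batch normalization Ops from theano.tensor.nnet.bn into cuDNN Ops whenever the input lives on a GPU context and cuDNN v5+ is available. For orientation, here is a minimal usage sketch of the interface being optimized (the shapes, variable names and epsilon value are illustrative, not part of this commit); compiled with the gpuarray backend active, a graph like this should now run through cuDNN:

    import theano
    import theano.tensor as T
    from theano.tensor.nnet import bn

    x = T.tensor4('x')            # (batch, channels, rows, cols)
    scale = T.tensor4('scale')    # broadcastable parameter tensors
    bias = T.tensor4('bias')

    # 'spatial' normalizes over axes (0, 2, 3); 'per-activation' over axis 0 only.
    # The cuDNN path requires epsilon >= 1e-5 (see the checks in the optimizers below).
    out, mean, invstd = bn.batch_normalization_train(x, scale, bias, 'spatial', 1e-4)

    f = theano.function([x, scale, bias], [out, mean, invstd])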
@@ -28,12 +28,13 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
                                               assert_conv_shape)
 from theano.tensor.signal.pool import (
     Pool, MaxPoolGrad, AveragePoolGrad)
+from theano.tensor.nnet import bn
 from . import pygpu
 from .type import (get_context, gpu_context_type, list_contexts,
                    GpuArraySharedVariable)
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
                         gpu_contiguous, gpu_alloc_empty,
-                        empty_like, GpuArrayType)
+                        empty_like, GpuArrayType, HostFromGpu)
 from .elemwise import GpuElemwise
 
 # These don't exist in gpuarray
@@ -2928,3 +2929,182 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
     out = GpuDnnSoftmaxGrad('accurate', 'instance')(
         gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
     return [out.dimshuffle(0, 2)]
+
+
+@local_optimizer([bn.AbstractBatchNormTrain])
+def local_abstract_batch_norm_train_cudnn(node):
+    if not isinstance(node.op, bn.AbstractBatchNormTrain):
+        return None
+
+    x, scale, bias, epsilon = node.inputs
+    if x.ndim > 5:
+        # TODO do something better than this (reshape?)
+        return None
+
+    # input on gpu? TODO what about the output?
+    x_on_gpu = (isinstance(x.type, GpuArrayType) or
+                (x.owner and isinstance(x.owner.op, HostFromGpu)))
+    if not x_on_gpu:
+        return None
+
+    # convert axes to cuDNN mode
+    axes = tuple(node.op.axes)
+    if axes == (0,):
+        mode = 'per-activation'
+    elif axes == (0,) + tuple(range(2, x.ndim)):
+        mode = 'spatial'
+    else:
+        return None
+
+    try:
+        eps = theano.tensor.get_scalar_constant_value(epsilon)
+    except theano.tensor.NotScalarConstantError:
+        return None
+    if eps < 1e-5:
+        return None
+
+    ctx = infer_context_name(*node.inputs)
+    if not dnn_available(ctx):
+        # TODO should this raise_no_cudnn?
+        return None
+
+    x = as_gpuarray_variable(x, context_name=ctx)
+    scale = as_gpuarray_variable(scale, context_name=ctx)
+    bias = as_gpuarray_variable(bias, context_name=ctx)
+    out, mean, invstd = dnn_batch_normalization_train(x, scale, bias, mode, eps)
+
+    # If the original output was on CPU, we have to transfer it
+    if isinstance(node.outputs[0].type, tensor.TensorType):
+        out = tensor.as_tensor_variable(out)
+    if isinstance(node.outputs[1].type, tensor.TensorType):
+        mean = tensor.as_tensor_variable(mean)
+    if isinstance(node.outputs[2].type, tensor.TensorType):
+        invstd = tensor.as_tensor_variable(invstd)
+    # TODO copy_stack_trace?
+    return [out, mean, invstd]
+
+
+@local_optimizer([bn.AbstractBatchNormTrainGrad])
+def local_abstract_batch_norm_train_grad_cudnn(node):
+    if not isinstance(node.op, bn.AbstractBatchNormTrainGrad):
+        return None
+
+    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
+    if x.ndim > 5:
+        # TODO do something better than this (reshape?)
+        return None
+
+    # input on gpu? TODO what about the output?
+    x_on_gpu = (isinstance(x.type, GpuArrayType) or
+                (x.owner and isinstance(x.owner.op, HostFromGpu)))
+    dy_on_gpu = (isinstance(dy.type, GpuArrayType) or
+                 (dy.owner and isinstance(dy.owner.op, HostFromGpu)))
+    if not (x_on_gpu or dy_on_gpu):
+        return None
+
+    # convert axes to cuDNN mode
+    axes = tuple(node.op.axes)
+    if axes == (0,):
+        mode = 'per-activation'
+    elif axes == (0,) + tuple(range(2, x.ndim)):
+        mode = 'spatial'
+    else:
+        return None
+
+    ndim = x.ndim
+    if ndim < 4:
+        x = theano.tensor.shape_padright(x, 4 - ndim)
+        dy = theano.tensor.shape_padright(dy, 4 - ndim)
+        scale = theano.tensor.shape_padright(scale, 4 - ndim)
+        x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim)
+        x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim)
+
+    try:
+        eps = theano.tensor.get_scalar_constant_value(epsilon)
+    except theano.tensor.NotScalarConstantError:
+        return None
+    if eps < 1e-5:
+        return None
+
+    ctx = infer_context_name(*node.inputs)
+    if not dnn_available(ctx):
+        # TODO should this raise_no_cudnn?
+        return None
+
+    x = as_gpuarray_variable(x, context_name=ctx)
+    dy = as_gpuarray_variable(dy, context_name=ctx)
+    scale = as_gpuarray_variable(scale, context_name=ctx)
+    x_mean = as_gpuarray_variable(x_mean, context_name=ctx)
+    x_invstd = as_gpuarray_variable(x_invstd, context_name=ctx)
+    g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
+        GpuDnnBatchNormGrad(mode)(x, dy, scale, x_mean, x_invstd, eps)
+
+    if ndim < 4:
+        g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim)
+        g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim)
+        g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim)
+
+    # If the original output was on CPU, we have to transfer it
+    if isinstance(node.outputs[0].type, tensor.TensorType):
+        g_wrt_inputs = tensor.as_tensor_variable(g_wrt_inputs)
+    if isinstance(node.outputs[1].type, tensor.TensorType):
+        g_wrt_scale = tensor.as_tensor_variable(g_wrt_scale)
+    if isinstance(node.outputs[2].type, tensor.TensorType):
+        g_wrt_bias = tensor.as_tensor_variable(g_wrt_bias)
+    # TODO copy_stack_trace?
+    return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
+
+
+@local_optimizer([bn.AbstractBatchNormInference])
+def local_abstract_batch_norm_inference_cudnn(node):
+    if not isinstance(node.op, bn.AbstractBatchNormInference):
+        return None
+
+    x, scale, bias, estimated_mean, estimated_variance, epsilon = node.inputs
+    if x.ndim > 5:
+        # TODO do something better than this (reshape?)
+        return None
+
+    axes = tuple(node.op.axes)
+    if axes == (0,):
+        mode = 'per-activation'
+    elif axes == (0,) + tuple(range(2, x.ndim)):
+        mode = 'spatial'
+    else:
+        return None
+
+    # input on gpu? TODO what about the output?
+    x_on_gpu = (isinstance(x.type, GpuArrayType) or
+                (x.owner and isinstance(x.owner.op, HostFromGpu)))
+    if not x_on_gpu:
+        return None
+
+    try:
+        eps = theano.tensor.get_scalar_constant_value(epsilon)
+    except theano.tensor.NotScalarConstantError:
+        return None
+    if eps < 1e-5:
+        return None
+
+    ctx = infer_context_name(*node.inputs)
+    if not dnn_available(ctx):
+        # TODO should this raise_no_cudnn?
+        return None
+
+    x = as_gpuarray_variable(x, context_name=ctx)
+    scale = as_gpuarray_variable(scale, context_name=ctx)
+    bias = as_gpuarray_variable(bias, context_name=ctx)
+    estimated_mean = as_gpuarray_variable(estimated_mean, context_name=ctx)
+    estimated_variance = as_gpuarray_variable(estimated_variance, context_name=ctx)
+    out = dnn_batch_normalization_test(x, scale, bias, estimated_mean, estimated_variance,
+                                       mode, eps)
+
+    # If the original output was on CPU, we have to transfer it
+    # TODO copy_stack_trace?
+    if isinstance(node.outputs[0].type, tensor.TensorType):
+        return [tensor.as_tensor_variable(out)]
+    else:
+        return [out]
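All three optimizers above apply the same two gates: the abstract Op's axes must map onto one of cuDNN's two batch normalization modes, and the epsilon constant must not be below 1e-5, which mirrors cuDNN's CUDNN_BN_MIN_EPSILON lower bound. A small sketch of the axes-to-mode mapping (the helper name is illustrative only, not part of this commit):

    def axes_to_cudnn_mode(axes, ndim):
        # normalize over the batch axis only -> one parameter per activation
        if axes == (0,):
            return 'per-activation'
        # normalize over batch plus all spatial axes -> one parameter per channel
        if axes == (0,) + tuple(range(2, ndim)):
            return 'spatial'
        # anything else stays on the abstract/CPU implementation
        return None

    assert axes_to_cudnn_mode((0,), 4) == 'per-activation'
    assert axes_to_cudnn_mode((0, 2, 3), 4) == 'spatial'
    assert axes_to_cudnn_mode((1, 2, 3), 4) is None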
@@ -2005,3 +2005,28 @@ abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
                                local_abstractconv3d_gradinputs_gemm, 30,
                                'conv_gemm',
                                'gpuarray', 'fast_compile', 'fast_run')
+
+# Register cuDNN batch normalization implementation
+abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
+abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
+register_opt('fast_compile')(abstract_batch_norm_groupopt)
+
+# cuDNN optimizations are only registered if cuDNN is available.
+# (we import these opts here instead of at the top of this file
+# to avoid a circular dependency problem with dnn)
+from .dnn import (local_abstract_batch_norm_train_cudnn,
+                  local_abstract_batch_norm_train_grad_cudnn,
+                  local_abstract_batch_norm_inference_cudnn)  # noqa: 402
+abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_dnn',
+                                      local_abstract_batch_norm_train_cudnn, 20,
+                                      'batchnorm_dnn',
+                                      'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
+abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_grad_dnn',
+                                      local_abstract_batch_norm_train_grad_cudnn, 20,
+                                      'batchnorm_dnn',
+                                      'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
+abstract_batch_norm_groupopt.register('local_abstract_batch_norm_inference_dnn',
+                                      local_abstract_batch_norm_inference_cudnn, 20,
+                                      'batchnorm_dnn',
+                                      'gpuarray', 'fast_compile', 'fast_run', 'cudnn')
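Because each rewrite is registered under the 'batchnorm_dnn' tag (in addition to the broader 'gpuarray' and 'cudnn' tags), it can be switched off per compilation without disabling the other cuDNN optimizations. A sketch, assuming a standard Theano Mode object:

    import theano

    # keep the rest of the cuDNN optimizations, but skip the batch norm rewrites
    mode = theano.compile.get_default_mode().excluding('batchnorm_dnn')
    # f = theano.function(inputs, outputs, mode=mode)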
@@ -13,6 +13,7 @@ import theano.tests.unittest_tools as utt
 from theano.tensor.signal.pool import pool_2d, pool_3d
 from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
 from theano.tensor.nnet.abstract_conv import get_conv_output_shape
+from theano.tensor.nnet import bn
 from .. import dnn
 from ..basic_ops import GpuAllocEmpty
@@ -1385,28 +1386,47 @@ def test_dnn_batchnorm_train():
             ndim = x.ndim
             eps = 5e-3  # some non-standard value to test if it's used
 
-            # forward pass
-            out, x_mean, x_invstd = dnn.dnn_batch_normalization_train(
+            # forward pass, direct interface
+            out_gpu, x_mean_gpu, x_invstd_gpu = dnn.dnn_batch_normalization_train(
+                x, scale, bias, mode, eps)
+            # forward pass, abstract interface
+            out_abstract, x_mean_abstract, x_invstd_abstract = bn.batch_normalization_train(
                 x, scale, bias, mode, eps)
             # reference forward pass
             if mode == 'per-activation':
                 axes = (0,)
             elif mode == 'spatial':
                 axes = (0,) + tuple(range(2, ndim))
-            x_mean2 = x.mean(axis=axes, keepdims=True)
-            x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
-            scale2 = T.addbroadcast(scale, *axes)
-            bias2 = T.addbroadcast(bias, *axes)
-            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
+            x_mean_ref = x.mean(axis=axes, keepdims=True)
+            x_invstd_ref = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
+            scale_ref = T.addbroadcast(scale, *axes)
+            bias_ref = T.addbroadcast(bias, *axes)
+            out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
             # backward pass
             dy = vartype('dy')
-            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
+            grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
+            grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
             # reference backward pass
-            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
+            grads_ref = T.grad(None, wrt=[x, scale, bias], known_grads={out_ref: dy})
             # compile
-            f = theano.function([x, scale, bias, dy],
-                                [out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
-                                grads + grads2, mode=mode_with_gpu)
+            f_gpu = theano.function([x, scale, bias, dy],
+                                    [out_gpu, x_mean_gpu, x_invstd_gpu] + grads_gpu,
+                                    mode=mode_with_gpu)
+            f_abstract = theano.function([x, scale, bias, dy],
+                                         [out_abstract, x_mean_abstract, x_invstd_abstract] +
+                                         grads_abstract,
+                                         mode=mode_with_gpu)
+            f_ref = theano.function([x, scale, bias, dy],
+                                    [out_ref, x_mean_ref, x_invstd_ref] + grads_ref)
+            # check if the abstract Ops have been replaced
+            assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
+                        in f_abstract.maker.fgraph.toposort()])
+            assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
+                        in f_abstract.maker.fgraph.toposort()])
+            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
+                                              bn.AbstractBatchNormInference,
+                                              bn.AbstractBatchNormTrainGrad)) for n
+                            in f_abstract.maker.fgraph.toposort()])
             # run
             for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                 data_shape = data_shape[:ndim]
@@ -1416,15 +1436,23 @@ def test_dnn_batchnorm_train():
                 Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                 Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                 Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
-                outputs = f(X, Scale, Bias, Dy)
+                outputs_gpu = f_gpu(X, Scale, Bias, Dy)
+                outputs_abstract = f_abstract(X, Scale, Bias, Dy)
+                outputs_ref = f_ref(X, Scale, Bias, Dy)
                 # compare outputs
-                utt.assert_allclose(outputs[0], outputs[0 + 3])  # out
-                utt.assert_allclose(outputs[1], outputs[1 + 3])  # mean
-                utt.assert_allclose(outputs[2], outputs[2 + 3])  # invstd
+                utt.assert_allclose(outputs_gpu[0], outputs_ref[0])  # out
+                utt.assert_allclose(outputs_gpu[1], outputs_ref[1])  # mean
+                utt.assert_allclose(outputs_gpu[2], outputs_ref[2])  # invstd
+                utt.assert_allclose(outputs_abstract[0], outputs_ref[0])  # out
+                utt.assert_allclose(outputs_abstract[1], outputs_ref[1])  # mean
+                utt.assert_allclose(outputs_abstract[2], outputs_ref[2])  # invstd
                 # compare gradients
-                utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4)  # dx
-                utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4)  # dscale
-                utt.assert_allclose(outputs[8], outputs[8 + 3])  # dbias
+                utt.assert_allclose(outputs_gpu[3], outputs_ref[3], atol=1e-4)  # dx
+                utt.assert_allclose(outputs_gpu[4], outputs_ref[4], rtol=2e-4, atol=1e-4)  # dscale
+                utt.assert_allclose(outputs_gpu[5], outputs_ref[5])  # dbias
+                utt.assert_allclose(outputs_abstract[3], outputs_ref[3], atol=1e-4)  # dx
+                utt.assert_allclose(outputs_abstract[4], outputs_ref[4], rtol=2e-4, atol=1e-4)  # dscale
+                utt.assert_allclose(outputs_abstract[5], outputs_ref[5])  # dbias
 
 
 def test_batchnorm_inference():
@@ -1439,25 +1467,40 @@ def test_batchnorm_inference():
             ndim = x.ndim
             eps = 5e-3  # some non-standard value to test if it's used
 
-            # forward pass
-            out = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
-                                                   var, mode, eps)
+            # forward pass, direct interface
+            out_gpu = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
+                                                       var, mode, eps)
+            # forward pass, abstract interface
+            out_abstract = bn.batch_normalization_test(x, scale, bias, mean,
+                                                       var, mode, eps)
             # reference forward pass
             if mode == 'per-activation':
                 axes = (0,)
             elif mode == 'spatial':
                 axes = (0,) + tuple(range(2, ndim))
-            scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes)
-                                          for t in (scale, bias, mean, var))
-            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
+            scale_ref, bias_ref, mean_ref, var_ref = (T.addbroadcast(t, *axes)
+                                                      for t in (scale, bias, mean, var))
+            out_ref = (x - mean_ref) * (scale_ref / T.sqrt(var_ref + eps)) + bias_ref
             # backward pass
             dy = vartype('dy')
-            grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy})
+            grads_gpu = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_gpu: dy})
+            grads_abstract = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_abstract: dy})
             # reference backward pass
-            grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy})
+            grads_ref = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_ref: dy})
             # compile
-            f = theano.function([x, scale, bias, mean, var, dy],
-                                [out, out2] + grads + grads2, mode=mode_with_gpu)
+            f_gpu = theano.function([x, scale, bias, mean, var, dy],
+                                    [out_gpu] + grads_gpu, mode=mode_with_gpu)
+            f_abstract = theano.function([x, scale, bias, mean, var, dy],
+                                         [out_abstract] + grads_abstract, mode=mode_with_gpu)
+            f_ref = theano.function([x, scale, bias, mean, var, dy],
+                                    [out_ref] + grads_ref)
+            # check if the abstract Ops have been replaced
+            assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
+                        in f_abstract.maker.fgraph.toposort()])
+            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
+                                              bn.AbstractBatchNormInference,
+                                              bn.AbstractBatchNormTrainGrad)) for n
+                            in f_abstract.maker.fgraph.toposort()])
             # run
             for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                 data_shape = data_shape[:ndim]
@@ -1469,15 +1512,76 @@ def test_batchnorm_inference():
                 Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                 Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                 Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
-                outputs = f(X, Scale, Bias, Mean, Var, Dy)
+                outputs_gpu = f_gpu(X, Scale, Bias, Mean, Var, Dy)
+                outputs_abstract = f_abstract(X, Scale, Bias, Mean, Var, Dy)
+                outputs_ref = f_ref(X, Scale, Bias, Mean, Var, Dy)
                 # compare outputs
-                utt.assert_allclose(outputs[0], outputs[1])  # out
+                utt.assert_allclose(outputs_gpu[0], outputs_ref[0])  # out
+                utt.assert_allclose(outputs_abstract[0], outputs_ref[0])  # out
                 # compare gradients
-                utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5)  # dx
-                utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5)  # dscale
-                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
-                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
-                utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5)  # dvar
+                utt.assert_allclose(outputs_gpu[1], outputs_ref[1], atol=4e-5)  # dx
+                utt.assert_allclose(outputs_gpu[2], outputs_ref[2], atol=4e-5)  # dscale
+                utt.assert_allclose(outputs_gpu[3], outputs_ref[3])  # dbias
+                utt.assert_allclose(outputs_gpu[4], outputs_ref[4])  # dmean
+                utt.assert_allclose(outputs_gpu[5], outputs_ref[5], rtol=2e-3, atol=4e-5)  # dvar
+                utt.assert_allclose(outputs_abstract[1], outputs_ref[1], atol=4e-5)  # dx
+                utt.assert_allclose(outputs_abstract[2], outputs_ref[2], atol=4e-5)  # dscale
+                utt.assert_allclose(outputs_abstract[3], outputs_ref[3])  # dbias
+                utt.assert_allclose(outputs_abstract[4], outputs_ref[4])  # dmean
+                utt.assert_allclose(outputs_abstract[5], outputs_ref[5], rtol=2e-3, atol=4e-5)  # dvar
+
+
+def test_dnn_batchnorm_valid_and_invalid_axes():
+    if not dnn.dnn_available(test_ctx_name):
+        raise SkipTest(dnn.dnn_available.msg)
+    if dnn.version(raises=False) < 5000:
+        raise SkipTest("batch normalization requires cudnn v5+")
+
+    for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
+        x, scale, bias, mean, var, dy = (vartype(n)
+                                         for n in ('x', 'scale', 'bias', 'mean', 'var', 'dy'))
+        ndim = x.ndim
+
+        # supported: per-activation and spatial
+        valid_axes_lists = ((0,), (0,) + tuple(range(2, ndim)))
+        # not supported: an axes list without 0 and including 1
+        invalid_axes_lists = (tuple(range(1, ndim)),)
+        for axes in valid_axes_lists + invalid_axes_lists:
+            # forward pass, abstract interface
+            out_train, x_mean, x_invstd = bn.batch_normalization_train(
+                x, scale, bias, axes)
+            out_test = bn.batch_normalization_test(
+                x, scale, bias, mean, var, axes)
+            # backward pass
+            dy = vartype('dy')
+            grads_train = T.grad(None, wrt=[x, scale, bias], known_grads={out_train: dy})
+            grads_test = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out_test: dy})
+            # compile
+            f = theano.function([x, scale, bias, mean, var, dy],
+                                [out_train, x_mean, x_invstd, out_test] +
+                                grads_train + grads_test,
+                                mode=mode_with_gpu)
+            if axes in valid_axes_lists:
+                # check if the abstract Ops have been replaced by the cuDNN Ops
+                assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) for n
+                            in f.maker.fgraph.toposort()])
+                assert any([isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n
+                            in f.maker.fgraph.toposort()])
+                assert any([isinstance(n.op, dnn.GpuDnnBatchNormInference) for n
+                            in f.maker.fgraph.toposort()])
+                assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
+                                                  bn.AbstractBatchNormInference,
+                                                  bn.AbstractBatchNormTrainGrad)) for n
+                                in f.maker.fgraph.toposort()])
+            else:
+                # check if the abstract Ops have been replaced, but not by the cuDNN Ops
+                assert not any([isinstance(n.op, (dnn.GpuDnnBatchNorm,
+                                                  dnn.GpuDnnBatchNormGrad,
+                                                  bn.AbstractBatchNormTrain,
+                                                  bn.AbstractBatchNormInference,
+                                                  bn.AbstractBatchNormTrainGrad)) for n
+                                in f.maker.fgraph.toposort()])
 
 
 def test_dnn_rnn_gru():