Commit 83f99214 authored by Frederic Bastien

Make bn work on the GPU even without cudnn.

Parent 16d8b099
......@@ -3100,9 +3100,6 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
return [out.dimshuffle(0, 2)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrain])
@register_opt2([bn.AbstractBatchNormTrain], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, epsilon, running_average_factor = inputs[:5]
running_mean = inputs[5] if len(inputs) > 5 else None
......@@ -3186,9 +3183,6 @@ def local_batch_norm_inference_inplace(node):
return [GpuDnnBatchNormInference(mode=node.op.mode, inplace=True)(*node.inputs)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrainGrad])
@register_opt2([bn.AbstractBatchNormTrainGrad], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
x, dy, scale, x_mean, x_invstd, epsilon = inputs
......@@ -3257,9 +3251,6 @@ def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormInference])
@register_opt2([bn.AbstractBatchNormInference], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_inference_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
......
......@@ -22,6 +22,7 @@ from theano.scalar.basic import Scalar, Pow, Cast
from theano.scalar.basic_scipy import Erfinv, Erfcinv
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet import bn
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
......@@ -2005,3 +2006,56 @@ abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm',
'gpuarray', 'fast_compile', 'fast_run')
# Registration of the batch-normalization rewrites.
#
# NOTE: importing from .dnn at the top of this file would create a
# circular import (dnn imports names from this module), so the import
# is deferred until this point.
from .dnn import (local_abstract_batch_norm_train_cudnn,
                  local_abstract_batch_norm_train_grad_cudnn,
                  local_abstract_batch_norm_inference_cudnn)  # noqa: 402

abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)

# Two databases: one for the normal (equilibrium) lifting phase and one
# for the GraphToGPU phase.
abstract_batch_norm_db = LocalGroupDB()
abstract_batch_norm_db2 = LocalGroupDB(
    local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
abstract_batch_norm_db2.__name__ = "abstract_batch_norm_db2"
register_opt('fast_compile', name='abstract_batch_norm_db')(
    abstract_batch_norm_db)
register_opt2([bn.AbstractBatchNormTrain,
               bn.AbstractBatchNormTrainGrad,
               bn.AbstractBatchNormInference],
              'fast_compile', name='abstract_batch_norm_db2')(
    abstract_batch_norm_db2)

# For each abstract batch-norm Op: the cuDNN rewrite is registered at
# position=1 and the plain-Theano fallback at position='last', so cuDNN
# is preferred whenever it is available.
_bn_cases = [
    (bn.AbstractBatchNormTrain,
     local_abstract_batch_norm_train_cudnn,
     bn.local_abstract_batch_norm_train),
    (bn.AbstractBatchNormTrainGrad,
     local_abstract_batch_norm_train_grad_cudnn,
     bn.local_abstract_batch_norm_train_grad),
    (bn.AbstractBatchNormInference,
     local_abstract_batch_norm_inference_cudnn,
     bn.local_abstract_batch_norm_inference),
]
for _op, _dnn_opt, _cpu_opt in _bn_cases:
    abstract_batch_norm_db.register(_dnn_opt.__name__,
                                    op_lifter([_op])(_dnn_opt),
                                    'gpuarray', 'fast_compile', 'fast_run',
                                    'cudnn', 'batchnorm_dnn',
                                    position=1)
    abstract_batch_norm_db2.register(_dnn_opt.__name__,
                                     local_optimizer([_op])(_dnn_opt),
                                     'gpuarray', 'fast_compile', 'fast_run',
                                     'cudnn', 'batchnorm_dnn',
                                     position=1)
    # The CPU implementation is a normal optimization; it cannot be
    # registered in GraphToGPU, so for now it is only added to the
    # slower EQ phase.  Without cuDNN we still want to move batch norm
    # to the GPU as a plain Theano graph, so the resulting graph runs
    # on the GPU.
    abstract_batch_norm_db.register(_cpu_opt.__name__, _cpu_opt,
                                    'gpuarray', 'fast_compile', 'fast_run',
                                    position='last')
......@@ -1533,6 +1533,44 @@ def test_dnn_batchnorm_train_without_running_averages():
f_abstract(X, Scale, Bias, Dy)
def test_without_dnn_batchnorm_train_without_running_averages():
    # Compile and run batch_normalization_train without running averages,
    # with cuDNN explicitly disabled, and check that the abstract Ops are
    # nevertheless lifted to (non-cuDNN) GPU Ops.
    #
    # BUG FIX: the original compiled with plain ``mode_with_gpu``; on a
    # machine where cuDNN is available the 'cudnn'-tagged rewrites then
    # produce GpuDnnBatchNorm nodes and the asserts below fail.  The
    # cuDNN rewrites must be excluded by tag to exercise the fallback.
    utt.seed_rng()
    x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')

    # backward pass
    grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})

    # compile with the cuDNN rewrites disabled so the pure-Theano GPU
    # fallback path is taken
    f_abstract = theano.function([x, scale, bias, dy],
                                 [out_abstract, x_mean_abstract, x_invstd_abstract] +
                                 grads_abstract,
                                 mode=mode_with_gpu.excluding('cudnn'))

    # check that the abstract Ops have been replaced, and not by cuDNN Ops
    topo = f_abstract.maker.fgraph.toposort()
    assert not any(isinstance(n.op, (dnn.GpuDnnBatchNorm,
                                     dnn.GpuDnnBatchNormGrad))
                   for n in topo)
    assert not any(isinstance(n.op, (bn.AbstractBatchNormTrain,
                                     bn.AbstractBatchNormInference,
                                     bn.AbstractBatchNormTrainGrad))
                   for n in topo)
    # the fallback graph should contain GPU elemwise computation
    assert any(isinstance(n.op, dnn.GpuElemwise) for n in topo)

    # run
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
# test inplace_running_mean and inplace_running_var
if not dnn.dnn_available(test_ctx_name):
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment