提交 186056b8,作者:Gijs van Tulder

Compute running_mean and running_var using cuDNN.

上级 c4293e69
差异被折叠。
...@@ -2,8 +2,19 @@ ...@@ -2,8 +2,19 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, npy_float64 epsilon, PyGpuArrayObject *bias, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuArrayObject **x_mean, npy_float64 running_average_factor,
PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) { #ifdef RUNNING_AVERAGES
PyGpuArrayObject *in_running_mean,
PyGpuArrayObject *in_running_var,
#endif
PyGpuArrayObject **outp,
PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd,
#ifdef RUNNING_AVERAGES
PyGpuArrayObject **out_running_mean,
PyGpuArrayObject **out_running_var,
#endif
cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context; PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0) if (c_set_tensorNd(inp, bn_input) != 0)
...@@ -24,6 +35,19 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -24,6 +35,19 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
if (c_set_tensorNd(*outp, bn_output) != 0) if (c_set_tensorNd(*outp, bn_output) != 0)
return 1; return 1;
#ifdef RUNNING_AVERAGES
PyGpuArrayObject *running_mean = *out_running_mean;
PyGpuArrayObject *running_var = *out_running_var;
running_mean = theano_try_copy(running_mean, in_running_mean);
if (running_mean == NULL) {
return 1;
}
running_var = theano_try_copy(running_var, in_running_var);
if (running_var == NULL) {
return 1;
}
#endif
{ {
const float falpha = 1.; const float falpha = 1.;
const float fbeta = 0.; const float fbeta = 0.;
...@@ -50,9 +74,15 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -50,9 +74,15 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
bn_params, bn_params,
PyGpuArray_DEV_DATA(scale), PyGpuArray_DEV_DATA(scale),
PyGpuArray_DEV_DATA(bias), PyGpuArray_DEV_DATA(bias),
#ifdef RUNNING_AVERAGES
running_average_factor,
PyGpuArray_DEV_DATA(running_mean),
PyGpuArray_DEV_DATA(running_var),
#else
0, 0,
NULL, // running mean, deliberately unused NULL, // running mean, deliberately unused
NULL, // running var, deliberately unused NULL, // running var, deliberately unused
#endif
epsilon, epsilon,
PyGpuArray_DEV_DATA(*x_mean), PyGpuArray_DEV_DATA(*x_mean),
PyGpuArray_DEV_DATA(*x_invstd) PyGpuArray_DEV_DATA(*x_invstd)
...@@ -62,6 +92,10 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -62,6 +92,10 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
#ifdef RUNNING_AVERAGES
*out_running_mean = running_mean;
*out_running_var = running_var;
#endif
} }
return 0; return 0;
} }
...@@ -1393,8 +1393,11 @@ def test_dnn_batchnorm_train(): ...@@ -1393,8 +1393,11 @@ def test_dnn_batchnorm_train():
running_average_factor = 0.3 running_average_factor = 0.3
# forward pass, direct interface # forward pass, direct interface
out_gpu, x_mean_gpu, x_invstd_gpu = dnn.dnn_batch_normalization_train( out_gpu, x_mean_gpu, x_invstd_gpu, \
x, scale, bias, mode, eps) out_running_mean_gpu, out_running_var_gpu = \
dnn.dnn_batch_normalization_train(x, scale, bias, mode, eps,
running_average_factor,
running_mean, running_var)
# forward pass, abstract interface # forward pass, abstract interface
out_abstract, x_mean_abstract, x_invstd_abstract, \ out_abstract, x_mean_abstract, x_invstd_abstract, \
out_running_mean_abstract, out_running_var_abstract = \ out_running_mean_abstract, out_running_var_abstract = \
...@@ -1424,8 +1427,9 @@ def test_dnn_batchnorm_train(): ...@@ -1424,8 +1427,9 @@ def test_dnn_batchnorm_train():
# reference backward pass # reference backward pass
grads_ref = T.grad(None, wrt=[x, scale, bias], known_grads={out_ref: dy}) grads_ref = T.grad(None, wrt=[x, scale, bias], known_grads={out_ref: dy})
# compile # compile
f_gpu = theano.function([x, scale, bias, dy], f_gpu = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_gpu, x_mean_gpu, x_invstd_gpu] + grads_gpu, [out_gpu, x_mean_gpu, x_invstd_gpu,
out_running_mean_gpu, out_running_var_gpu] + grads_gpu,
mode=mode_with_gpu) mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy], f_abstract = theano.function([x, scale, bias, running_mean, running_var, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract, [out_abstract, x_mean_abstract, x_invstd_abstract,
...@@ -1455,13 +1459,16 @@ def test_dnn_batchnorm_train(): ...@@ -1455,13 +1459,16 @@ def test_dnn_batchnorm_train():
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX) Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX) Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX) Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
outputs_gpu = f_gpu(X, Scale, Bias, Dy) outputs_gpu = f_gpu(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy) outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy) outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
# compare outputs # compare outputs
utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out utt.assert_allclose(outputs_gpu[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean utt.assert_allclose(outputs_gpu[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_gpu[2], outputs_ref[2]) # invstd utt.assert_allclose(outputs_gpu[2], outputs_ref[2]) # invstd
utt.assert_allclose(outputs_gpu[3], outputs_ref[3]) # running_mean
utt.assert_allclose(numpy.nan_to_num(outputs_gpu[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var
utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out utt.assert_allclose(outputs_abstract[0], outputs_ref[0]) # out
utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean utt.assert_allclose(outputs_abstract[1], outputs_ref[1]) # mean
utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd utt.assert_allclose(outputs_abstract[2], outputs_ref[2]) # invstd
...@@ -1469,9 +1476,9 @@ def test_dnn_batchnorm_train(): ...@@ -1469,9 +1476,9 @@ def test_dnn_batchnorm_train():
utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]), utt.assert_allclose(numpy.nan_to_num(outputs_abstract[4]),
numpy.nan_to_num(outputs_ref[4])) # running_var numpy.nan_to_num(outputs_ref[4])) # running_var
# compare gradients # compare gradients
utt.assert_allclose(outputs_gpu[3], outputs_ref[5], atol=2e-4) # dx utt.assert_allclose(outputs_gpu[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_gpu[4], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale utt.assert_allclose(outputs_gpu[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_gpu[5], outputs_ref[7]) # dbias utt.assert_allclose(outputs_gpu[7], outputs_ref[7]) # dbias
utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx utt.assert_allclose(outputs_abstract[5], outputs_ref[5], atol=2e-4) # dx
utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale utt.assert_allclose(outputs_abstract[6], outputs_ref[6], rtol=4e-4, atol=1e-4) # dscale
utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias utt.assert_allclose(outputs_abstract[7], outputs_ref[7]) # dbias
...@@ -1490,11 +1497,22 @@ def test_dnn_batchnorm_train_without_running_averages(): ...@@ -1490,11 +1497,22 @@ def test_dnn_batchnorm_train_without_running_averages():
param_shape = (1, 10, 30, 25) param_shape = (1, 10, 30, 25)
# forward pass # forward pass
out, x_mean, x_invstd = bn.batch_normalization_train(x, scale, bias, 'per-activation') out_gpu, x_mean_gpu, x_invstd_gpu = \
dnn.dnn_batch_normalization_train(x, scale, bias, 'per-activation')
out_abstract, x_mean_abstract, x_invstd_abstract = \
bn.batch_normalization_train(x, scale, bias, 'per-activation')
# backward pass # backward pass
grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy}) grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
# compile # compile
f_abstract = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads, mode=mode_with_gpu) f_gpu = theano.function([x, scale, bias, dy],
[out_gpu, x_mean_gpu, x_invstd_gpu] +
grads_gpu,
mode=mode_with_gpu)
f_abstract = theano.function([x, scale, bias, dy],
[out_abstract, x_mean_abstract, x_invstd_abstract] +
grads_abstract,
mode=mode_with_gpu)
# check if the abstract Ops have been replaced # check if the abstract Ops have been replaced
assert any([isinstance(n.op, dnn.GpuDnnBatchNorm) assert any([isinstance(n.op, dnn.GpuDnnBatchNorm)
for n in f_abstract.maker.fgraph.toposort()]) for n in f_abstract.maker.fgraph.toposort()])
...@@ -1509,6 +1527,7 @@ def test_dnn_batchnorm_train_without_running_averages(): ...@@ -1509,6 +1527,7 @@ def test_dnn_batchnorm_train_without_running_averages():
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX) Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX) Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX) Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
f_gpu(X, Scale, Bias, Dy)
f_abstract(X, Scale, Bias, Dy) f_abstract(X, Scale, Bias, Dy)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论