提交 3dcba54d authored 作者: abergeron's avatar abergeron

Merge pull request #3363 from fvisin/logsoftmax

LogSoftmax
from .nnet import ( from .nnet import (
CrossentropyCategorical1Hot, CrossentropyCategorical1HotGrad, CrossentropyCategorical1Hot, CrossentropyCategorical1HotGrad,
CrossentropySoftmax1HotWithBiasDx, CrossentropySoftmaxArgmax1HotWithBias, CrossentropySoftmax1HotWithBiasDx, CrossentropySoftmaxArgmax1HotWithBias,
Prepend_scalar_constant_to_each_row, Prepend_scalar_to_each_row, Softmax, LogSoftmax, Prepend_scalar_constant_to_each_row,
Prepend_scalar_to_each_row, Softmax,
SoftmaxGrad, SoftmaxWithBias, binary_crossentropy, SoftmaxGrad, SoftmaxWithBias, binary_crossentropy,
categorical_crossentropy, crossentropy_categorical_1hot, categorical_crossentropy, crossentropy_categorical_1hot,
crossentropy_categorical_1hot_grad, crossentropy_softmax_1hot, crossentropy_categorical_1hot_grad, crossentropy_softmax_1hot,
...@@ -13,12 +14,7 @@ from .nnet import ( ...@@ -13,12 +14,7 @@ from .nnet import (
crossentropy_to_crossentropy_with_softmax, crossentropy_to_crossentropy_with_softmax,
crossentropy_to_crossentropy_with_softmax_with_bias, crossentropy_to_crossentropy_with_softmax_with_bias,
graph_merge_softmax_with_crossentropy_softmax, h_softmax, graph_merge_softmax_with_crossentropy_softmax, h_softmax,
local_advanced_indexing_crossentropy_onehot, logsoftmax, logsoftmax_op, prepend_0_to_each_row, prepend_1_to_each_row,
local_advanced_indexing_crossentropy_onehot_grad, local_argmax_pushdown,
local_log_softmax, local_softmax_grad_to_crossentropy_with_softmax_grad,
local_softmax_with_bias,
local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc,
make_out_pattern, prepend_0_to_each_row, prepend_1_to_each_row,
prepend_scalar_to_each_row, relu, softmax, softmax_grad, softmax_graph, prepend_scalar_to_each_row, relu, softmax, softmax_grad, softmax_graph,
softmax_op, softmax_simplifier, softmax_with_bias) softmax_op, softmax_simplifier, softmax_with_bias)
from . import opt from . import opt
......
差异被折叠。
...@@ -24,8 +24,8 @@ from theano.tensor.nnet import (categorical_crossentropy, ...@@ -24,8 +24,8 @@ from theano.tensor.nnet import (categorical_crossentropy,
CrossentropyCategorical1HotGrad, CrossentropyCategorical1HotGrad,
sigmoid, softplus, Softmax, softmax, sigmoid, softplus, Softmax, softmax,
softmax_op, softmax_graph, SoftmaxWithBias, softmax_op, softmax_graph, SoftmaxWithBias,
softmax_grad, softmax_with_bias, LogSoftmax, logsoftmax_op,
softmax_with_bias, SoftmaxGrad, softmax_grad, SoftmaxGrad,
Prepend_scalar_constant_to_each_row, Prepend_scalar_constant_to_each_row,
Prepend_scalar_to_each_row, Prepend_scalar_to_each_row,
relu, relu,
...@@ -122,9 +122,9 @@ class T_SoftmaxWithBias(utt.InferShapeTester): ...@@ -122,9 +122,9 @@ class T_SoftmaxWithBias(utt.InferShapeTester):
# test that we don't raise an error during optimization for no good # test that we don't raise an error during optimization for no good
# reason as softmax_with_bias don't support correctly some/all # reason as softmax_with_bias don't support correctly some/all
# broadcasted inputs pattern # broadcasted inputs pattern
initial_W = numpy.asarray([[0.1, 0.1, 0.1], \ initial_W = numpy.asarray([[0.1, 0.1, 0.1],
[0.1, 0.1, 0.1], \ [0.1, 0.1, 0.1],
[0.1, 0.1, 0.1]], \ [0.1, 0.1, 0.1]],
dtype=theano.config.floatX) dtype=theano.config.floatX)
W = theano.shared(value=initial_W, name='W') W = theano.shared(value=initial_W, name='W')
vbias = theano.shared(value=0.1, name='vbias') # 0.01 vbias = theano.shared(value=0.1, name='vbias') # 0.01
...@@ -148,6 +148,118 @@ class T_SoftmaxWithBias(utt.InferShapeTester): ...@@ -148,6 +148,118 @@ class T_SoftmaxWithBias(utt.InferShapeTester):
[admat_val, advec_val], SoftmaxWithBias) [admat_val, advec_val], SoftmaxWithBias)
class T_LogSoftmax(utt.InferShapeTester):
    """Tests for the LogSoftmax op.

    Covers: gradients of individual output columns and of the full output,
    forward-pass correctness on vectors, numerical equivalence with
    exp(softmax) / crossentropy, and the graph optimizations that substitute
    log(softmax(x)) with the numerically stable LogSoftmax op.
    """

    def test0(self):
        # Gradient through column 0 of the log-softmax output.
        def f(a):
            return logsoftmax_op(a)[:, 0]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test1(self):
        # Gradient through column 1 of the log-softmax output.
        def f(a):
            return logsoftmax_op(a)[:, 1]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test2(self):
        # Gradient through column 2 of the log-softmax output.
        def f(a):
            return logsoftmax_op(a)[:, 2]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test3(self):
        # Gradient through column 3 of the log-softmax output.
        def f(a):
            return logsoftmax_op(a)[:, 3]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test_matrix(self):
        # Gradient through the whole matrix output.
        def f(a):
            return logsoftmax_op(a)
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test_vector(self):
        # Forward pass on a 1-d input must equal log(exp(x) / sum(exp(x))).
        x = T.vector()
        f = theano.function([x], logsoftmax_op(x))

        xv = numpy.random.randn(6).astype(config.floatX)
        assert numpy.allclose(f(xv),
                              numpy.log(numpy.exp(xv) / numpy.exp(xv).sum()))

    def test_vector_grad(self):
        # Gradient on a 1-d input.
        def f(a):
            return logsoftmax_op(a)
        utt.verify_grad(f, [numpy.random.rand(4)])

    def test_allclose(self):
        x, y = tensor.matrices('xy')

        # regular softmax and crossentropy
        sm = tensor.nnet.softmax(x)
        cm = tensor.nnet.categorical_crossentropy(sm, y)

        # numerically stable log-softmax with crossentropy
        logsm = tensor.nnet.logsoftmax(x)
        sm2 = tensor.exp(logsm)  # just used to show equivalence with sm
        cm2 = -tensor.sum(y*logsm, axis=1)
        grad = tensor.grad(cm2.mean(), x)

        # create some inputs into a softmax that are large and labels
        a = numpy.exp(10*numpy.random.rand(5, 10).astype(theano.config.floatX))
        # create some one-hot coded labels
        b = numpy.eye(5, 10).astype(theano.config.floatX)

        # show equivalence of softmax and exponentiated numerically stable
        # log-softmax
        f1 = theano.function([x], [sm, sm2])
        sm_, sm2_ = f1(a)
        utt.assert_allclose(sm_, sm2_)

        # now show that the two versions result in the same crossentropy cost
        # this indicates that the forward function does provide some numerical
        # stability
        f2 = theano.function([x, y], [cm, cm2])
        cm_, cm2_ = f2(a, b)
        utt.assert_allclose(cm_, cm2_)

        # now, show that in the standard softmax case the gradients blow up
        # while in the log-softmax case they don't
        f3 = theano.function([x, y], [grad])
        grad_ = f3(a, b)
        # Idiomatic NaN check: no element of the gradient may be NaN.
        assert not numpy.any(numpy.isnan(grad_))

    def test_isclose(self):
        # BUG FIX: this test previously defined `f` without ever exercising
        # it, so it always passed as a no-op. Actually verify the gradient,
        # consistent with the other tests in this class.
        def f(a):
            return logsoftmax_op(a)
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test_local_softmax_optimization(self):
        """Test the Logsoftmax substitution

        Check that Log(Softmax(x)) is substituted with Logsoftmax(x). Note that
        only the forward pass is checked (i.e., doesn't check the gradient)
        """
        x, y = tensor.matrices('xy')
        sm = tensor.nnet.softmax(x)
        logsm = tensor.log(sm)
        f = theano.function([x], logsm)
        # After optimization the graph's output node must be the stable op.
        assert isinstance(f.maker.fgraph.outputs[0].owner.op,
                          theano.tensor.nnet.nnet.LogSoftmax)

    def test_local_softmax_grad_optimization_and_big_input(self):
        """Test the Logsoftmax's grad substitution.

        Check that Log(Softmax(x))'s grad is substituted with Logsoftmax(x)'s
        grad and that the new operation does not explode for big inputs.
        Note that only the grad is checked.
        """
        # some inputs that are large to make the gradient explode in the non
        # optimized case
        a = numpy.exp(10*numpy.random.rand(5, 10).astype(theano.config.floatX))

        def myfunc(x):
            sm = tensor.nnet.softmax(x)
            logsm = tensor.log(sm)
            return logsm

        # We set step to 0.1 because for big values we need a big epsilon
        utt.verify_grad(myfunc, [a], eps=0.1)
class T_SoftmaxGrad(utt.InferShapeTester): class T_SoftmaxGrad(utt.InferShapeTester):
def test_infer_shape(self): def test_infer_shape(self):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论