Commit f05a0c89 authored by fvisin, committed by Francesco Visin

Add LogSoftmax python code and tests

Parent 30cc6380
...@@ -431,6 +431,7 @@ class Softmax(gof.Op): ...@@ -431,6 +431,7 @@ class Softmax(gof.Op):
x.type) x.type)
if x.ndim == 1: if x.ndim == 1:
x = tensor.shape_padleft(x, n_ones=1) x = tensor.shape_padleft(x, n_ones=1)
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
def perform(self, node, input_storage, output_storage): def perform(self, node, input_storage, output_storage):
...@@ -599,6 +600,52 @@ class Softmax(gof.Op): ...@@ -599,6 +600,52 @@ class Softmax(gof.Op):
softmax_op = Softmax() softmax_op = Softmax()
class LogSoftmax(gof.Op):
    r"""
    LogSoftmax activation function.

    .. math::

        \varphi(\mathbf{x})_j =
        \mathbf{x}_j - \log \sum_{k=1}^K e^{\mathbf{x}_k}

    where :math:`K` is the total number of neurons in the layer. This
    activation function gets applied row-wise.
    """
    def make_node(self, x):
        # Accept 1-d or 2-d float tensors only; a 1-d input is promoted
        # to a single-row matrix so perform/grad can always reduce over
        # axis=1.
        x = tensor.as_tensor_variable(x)
        if x.type.ndim not in (1, 2) \
                or x.type.dtype not in tensor.float_dtypes:
            raise ValueError('x must be 1-d or 2-d tensor of floats. Got %s' %
                             x.type)
        if x.ndim == 1:
            x = tensor.shape_padleft(x, n_ones=1)
        return Apply(self, [x], [x.type()])

    def perform(self, node, input_storage, output_storage):
        x, = input_storage
        # Shift by the row-wise max so exp() cannot overflow; the shift
        # cancels out in the final log-softmax value.
        xdev = x - x.max(axis=1)[:, None]
        lsm = xdev - numpy.log(numpy.sum(numpy.exp(xdev), axis=1,
                                         keepdims=True))
        output_storage[0][0] = lsm

    def grad(self, inp, grads):
        # Row-wise vector-Jacobian product:
        #   g_out = g - sum(g, axis=1) * softmax(x)
        x, = inp
        sm = softmax_op(x)
        return [grads[0] - tensor.sum(grads[0], axis=1, keepdims=True) * sm]

    def R_op(self, inputs, eval_points):
        # The Jacobian of log-softmax is symmetric, so the R_op is the
        # same expression as the gradient applied to the eval points.
        if None in eval_points:
            return [None]
        return self.grad(inputs, eval_points)

    def infer_shape(self, node, shape):
        # Output shape equals input shape.
        return shape


logsoftmax_op = LogSoftmax()
def softmax_graph(c): def softmax_graph(c):
return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True) return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True)
...@@ -607,6 +654,10 @@ def softmax(c): ...@@ -607,6 +654,10 @@ def softmax(c):
return softmax_op(c) return softmax_op(c)
def logsoftmax(c):
    """Return the row-wise log-softmax of `c`.

    Numerically stable equivalent of ``log(softmax(c))``: the op
    computes it as ``x - max(x) - log(sum(exp(x - max(x))))`` per row.
    """
    return logsoftmax_op(c)
@opt.register_specialize('fast_compile_gpu') @opt.register_specialize('fast_compile_gpu')
@gof.local_optimizer([softmax_op]) @gof.local_optimizer([softmax_op])
def local_softmax_with_bias(node): def local_softmax_with_bias(node):
......
...@@ -24,8 +24,8 @@ from theano.tensor.nnet import (categorical_crossentropy, ...@@ -24,8 +24,8 @@ from theano.tensor.nnet import (categorical_crossentropy,
CrossentropyCategorical1HotGrad, CrossentropyCategorical1HotGrad,
sigmoid, softplus, Softmax, softmax, sigmoid, softplus, Softmax, softmax,
softmax_op, softmax_graph, SoftmaxWithBias, softmax_op, softmax_graph, SoftmaxWithBias,
softmax_grad, softmax_with_bias, LogSoftmax, logsoftmax_op,
softmax_with_bias, SoftmaxGrad, softmax_grad, SoftmaxGrad,
Prepend_scalar_constant_to_each_row, Prepend_scalar_constant_to_each_row,
Prepend_scalar_to_each_row, Prepend_scalar_to_each_row,
relu, relu,
...@@ -122,9 +122,9 @@ class T_SoftmaxWithBias(utt.InferShapeTester): ...@@ -122,9 +122,9 @@ class T_SoftmaxWithBias(utt.InferShapeTester):
# test that we don't raise an error during optimization for no good # test that we don't raise an error during optimization for no good
# reason as softmax_with_bias don't support correctly some/all # reason as softmax_with_bias don't support correctly some/all
# broadcasted inputs pattern # broadcasted inputs pattern
initial_W = numpy.asarray([[0.1, 0.1, 0.1], \ initial_W = numpy.asarray([[0.1, 0.1, 0.1],
[0.1, 0.1, 0.1], \ [0.1, 0.1, 0.1],
[0.1, 0.1, 0.1]], \ [0.1, 0.1, 0.1]],
dtype=theano.config.floatX) dtype=theano.config.floatX)
W = theano.shared(value=initial_W, name='W') W = theano.shared(value=initial_W, name='W')
vbias = theano.shared(value=0.1, name='vbias') # 0.01 vbias = theano.shared(value=0.1, name='vbias') # 0.01
...@@ -148,6 +148,82 @@ class T_SoftmaxWithBias(utt.InferShapeTester): ...@@ -148,6 +148,82 @@ class T_SoftmaxWithBias(utt.InferShapeTester):
[admat_val, advec_val], SoftmaxWithBias) [admat_val, advec_val], SoftmaxWithBias)
class T_LogSoftmax(utt.InferShapeTester):
    """Tests for the LogSoftmax op: per-column gradients, vector input
    handling, and numerical stability versus naive log(softmax(x))."""

    def test0(self):
        # Gradient through column 0 of the log-softmax output.
        def f(a):
            return logsoftmax_op(a)[:, 0]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test1(self):
        def f(a):
            return logsoftmax_op(a)[:, 1]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test2(self):
        def f(a):
            return logsoftmax_op(a)[:, 2]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test3(self):
        def f(a):
            return logsoftmax_op(a)[:, 3]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test_vector(self):
        # A 1-d input must be accepted and match log(softmax(x)).
        x = T.vector()
        f = theano.function([x], logsoftmax_op(x))
        xv = numpy.random.randn(6).astype(config.floatX)
        assert numpy.allclose(f(xv),
                              numpy.log(numpy.exp(xv) / numpy.exp(xv).sum()))

    def test_vector_grad(self):
        # BUG FIX: this previously exercised softmax_op, not
        # logsoftmax_op, so the vector-input gradient of LogSoftmax was
        # never actually tested.
        def f(a):
            return logsoftmax_op(a)
        utt.verify_grad(f, [numpy.random.rand(4)])

    def test_allclose(self):
        x, y = tensor.matrices('xy')
        # Regular softmax and crossentropy.
        sm = tensor.nnet.softmax(x)
        cm = tensor.nnet.categorical_crossentropy(sm, y)

        # Numerically stable log-softmax with crossentropy.
        logsm = tensor.nnet.logsoftmax(x)
        sm2 = tensor.exp(logsm)  # just used to show equivalence with sm
        cm2 = -tensor.sum(y * logsm, axis=1)
        grad = tensor.grad(cm2.mean(), x)

        # Create some large inputs into a softmax (large enough that the
        # naive formulation saturates) ...
        a = numpy.exp(10 * numpy.random.rand(5, 10).astype(theano.config.floatX))
        # ... and some one-hot coded labels.
        b = numpy.eye(5, 10).astype(theano.config.floatX)

        # Show equivalence of softmax and exponentiated numerically
        # stable log-softmax.
        f1 = theano.function([x], [sm, sm2])
        sm_, sm2_ = f1(a)
        utt.assert_allclose(sm_, sm2_)

        # Show that the two versions give the same crossentropy cost;
        # this indicates the forward function does provide some
        # numerical stability.
        f2 = theano.function([x, y], [cm, cm2])
        cm_, cm2_ = f2(a, b)
        utt.assert_allclose(cm_, cm2_)

        # Finally, show that the log-softmax gradient stays finite on
        # these extreme inputs (the naive softmax gradient would not).
        f3 = theano.function([x, y], [grad])
        grad_ = f3(a, b)
        assert not numpy.any(numpy.isnan(grad_))

    def test_isclose(self):
        # BUG FIX: previously this defined `f` but never executed
        # anything, so the test was a silent no-op.  Check the forward
        # value against a numpy reference.
        x = T.matrix()
        f = theano.function([x], logsoftmax_op(x))
        xv = numpy.random.rand(3, 4).astype(config.floatX)
        ref = numpy.log(numpy.exp(xv) /
                        numpy.exp(xv).sum(axis=1, keepdims=True))
        utt.assert_allclose(f(xv), ref)
class T_SoftmaxGrad(utt.InferShapeTester): class T_SoftmaxGrad(utt.InferShapeTester):
def test_infer_shape(self): def test_infer_shape(self):
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment