Commit 3dcba54d, authored Jan 18, 2016 by abergeron
Merge pull request #3363 from fvisin/logsoftmax
LogSoftmax
Parents: a5735a1b, 9ad1ea03
Showing 3 changed files with 326 additions and 47 deletions.
theano/tensor/nnet/__init__.py (+3, -7)
theano/tensor/nnet/nnet.py (+198, -27)
theano/tensor/nnet/tests/test_nnet.py (+125, -13)
theano/tensor/nnet/__init__.py
from .nnet import (CrossentropyCategorical1Hot, CrossentropyCategorical1HotGrad,
                   CrossentropySoftmax1HotWithBiasDx,
                   CrossentropySoftmaxArgmax1HotWithBias,
-                  Prepend_scalar_constant_to_each_row,
-                  Prepend_scalar_to_each_row, Softmax,
+                  LogSoftmax,
+                  Prepend_scalar_constant_to_each_row,
+                  Prepend_scalar_to_each_row, Softmax,
                   SoftmaxGrad, SoftmaxWithBias, binary_crossentropy,
                   categorical_crossentropy, crossentropy_categorical_1hot,
                   crossentropy_categorical_1hot_grad, crossentropy_softmax_1hot,
...
@@ -13,12 +14,7 @@ from .nnet import (
                   crossentropy_to_crossentropy_with_softmax,
                   crossentropy_to_crossentropy_with_softmax_with_bias,
                   graph_merge_softmax_with_crossentropy_softmax, h_softmax,
                   local_advanced_indexing_crossentropy_onehot,
                   local_advanced_indexing_crossentropy_onehot_grad,
                   local_argmax_pushdown, local_log_softmax,
                   local_softmax_grad_to_crossentropy_with_softmax_grad,
                   local_softmax_with_bias,
                   local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc,
                   make_out_pattern,
-                  prepend_0_to_each_row, prepend_1_to_each_row,
+                  logsoftmax, logsoftmax_op,
+                  prepend_0_to_each_row, prepend_1_to_each_row,
                   prepend_scalar_to_each_row, relu, softmax, softmax_grad,
                   softmax_graph, softmax_op, softmax_simplifier,
                   softmax_with_bias)
from . import opt
...
...
theano/tensor/nnet/nnet.py
...
@@ -30,7 +30,6 @@ from theano.tensor.nnet.sigm import sigmoid, softplus
from theano.gradient import DisconnectedType
from theano.gradient import grad_not_implemented
from theano.tensor.nnet.blocksparse import sparse_block_dot
from theano.tensor.type import values_eq_approx_remove_nan

############
...
@@ -190,7 +189,6 @@ class SoftmaxWithBias(gof.Op):
        {
            size_t j;
            double sum = 0.0;
            bool discount_max = false;

            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
            const dtype_%(b)s* __restrict__ b_i = (dtype_%(b)s*)(PyArray_BYTES(%(b)s));
...
@@ -431,6 +429,7 @@ class Softmax(gof.Op):
                             % x.type)
        if x.ndim == 1:
            x = tensor.shape_padleft(x, n_ones=1)

        return Apply(self, [x], [x.type()])

    def perform(self, node, input_storage, output_storage):
...
@@ -508,12 +507,10 @@ class Softmax(gof.Op):
        {
            size_t j;
            double sum = 0.0;
            bool discount_max = false;

            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
            dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);

            size_t row_max_j=0;
            dtype_%(sm)s row_max = x_i[0];
            //std::cout << "0 " << row_max << "\\n";
            // Get the maximum value of the row
...
@@ -521,7 +518,6 @@ class Softmax(gof.Op):
            {
                dtype_%(sm)s row_ij = x_i[j * Sx1] ;
                //std::cout << "1 " << row_ij << "\\n";
                row_max_j = (row_ij > row_max) ? j : row_max_j;
                row_max = (row_ij > row_max) ? row_ij : row_max;
            }
...
@@ -599,6 +595,198 @@ class Softmax(gof.Op):
softmax_op = Softmax()


class LogSoftmax(gof.Op):
    """
    LogSoftmax activation function

    :math:`\\varphi(\\mathbf{x})_j =
    \\mathbf{x}_j - \\log \\sum_{k=1}^K e^{\\mathbf{x}_k}`

    where :math:`K` is the total number of neurons in the layer. This
    activation function gets applied row-wise.
    """

    __props__ = ()
    def make_node(self, x):
        x = tensor.as_tensor_variable(x)
        if x.type.ndim not in (1, 2) \
                or x.type.dtype not in tensor.float_dtypes:
            raise ValueError('x must be 1-d or 2-d tensor of floats. Got %s' %
                             x.type)
        if x.ndim == 1:
            x = tensor.shape_padleft(x, n_ones=1)
        return Apply(self, [x], [x.type()])

    def perform(self, node, input_storage, output_storage):
        x, = input_storage
        xdev = x - x.max(axis=1)[:, None]
        lsm = xdev - numpy.log(numpy.sum(numpy.exp(xdev), axis=1,
                                         keepdims=True))
        output_storage[0][0] = lsm

    def grad(self, inp, grads):
        x, = inp
        sm = softmax_op(x)
        return [grads[0] - tensor.sum(grads[0], axis=1, keepdims=True) * sm]

    def R_op(self, inputs, eval_points):
        # I think the Jacobian is symmetric so the R_op
        # is the same as the grad
        if None in eval_points:
            return [None]
        return self.grad(inputs, eval_points)

    def infer_shape(self, node, shape):
        return shape

    def c_headers(self):
        return ['<cmath>']

    @staticmethod
    def c_code_template(dtype):
        init_decl = """
        npy_intp* Nx = PyArray_DIMS(%(x)s);
        npy_intp Sx1 = 0;
        npy_intp Ssm1 = 0;

        if (PyArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "not a 2d tensor");
            %(fail)s;
        }
        if ((PyArray_TYPE(%(x)s) != NPY_DOUBLE) &&
            (PyArray_TYPE(%(x)s) != NPY_FLOAT))
        {
            PyErr_SetString(PyExc_TypeError, "not a float");
            %(fail)s;
        }

        if ((NULL == %(sm)s)
            || (PyArray_DIMS(%(sm)s)[0] != PyArray_DIMS(%(x)s)[0])
            || (PyArray_DIMS(%(sm)s)[1] != PyArray_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(sm)s);
            %(sm)s = (PyArrayObject*)PyArray_SimpleNew(
                2, PyArray_DIMS(%(x)s),
                PyArray_TYPE(%(x)s));
            if(!%(sm)s) {
                PyErr_SetString(PyExc_MemoryError,
                                "failed to alloc sm output");
                %(fail)s
            }
        }

        Sx1 = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
        Ssm1 = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
        """

        begin_row_loop = """
        // minibatch loop
        for (size_t i = 0; i < Nx[0]; ++i)
        {
            size_t j;
            double sum = 0.0;

            const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(
                PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
            dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(
                PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);

            dtype_%(sm)s row_max = x_i[0];
            // Get the maximum value of the row
            for (j = 1; j < Nx[1]; ++j)
            {
                dtype_%(sm)s x_ij = x_i[j * Sx1] ;
                row_max = (x_ij > row_max) ? x_ij : row_max;
            }
        """

        inside_row_loop = """
            // Compute xdev and sum(exp(xdev), axis=1)
            double xdev_exp_row_sum = 0.0;
            for (j = 0; j < Nx[1]; j++)
            {
                // use sm_i to temporary store xdev
                sm_i[j * Ssm1] = (dtype_%(sm)s) (x_i[j * Sx1] - row_max);
                xdev_exp_row_sum += exp(sm_i[j * Ssm1]);
            }

            // Write sm = xdev - log(sum(exp(xdev), axis=1))
            xdev_exp_row_sum = log(xdev_exp_row_sum);
            for (j = 0; j < Nx[1]; ++j)
            {
                sm_i[j * Ssm1] -= (dtype_%(sm)s) xdev_exp_row_sum;
            }
        """

        end_row_loop = """
        }
        """

        return (init_decl, begin_row_loop, inside_row_loop, end_row_loop)

    def c_code(self, node, name, inp, out, sub):
        x, = inp
        sm, = out
        code_template = ''.join(self.c_code_template(
            node.inputs[0].type.dtype_specs()[1]))
        return code_template % dict(locals(), **sub)

    @staticmethod
    def c_code_cache_version():
        return (0,)


logsoftmax_op = LogSoftmax()
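The perform method above is the reference implementation of the numerically stable log-softmax: each row is shifted by its maximum before exponentiating, so exp never overflows. A minimal NumPy-only sketch of the same idea (the helper name log_softmax and the sample values are illustrative and not part of this commit):

import numpy as np

def log_softmax(x):
    # Row-wise shift by the max: exp() then only sees values <= 0.
    xdev = x - x.max(axis=1, keepdims=True)
    return xdev - np.log(np.exp(xdev).sum(axis=1, keepdims=True))

x = np.array([[1000.0, 0.0, -1000.0]])
# Naive form overflows: exp(1000) -> inf, so the result contains nan/inf.
naive = np.log(np.exp(x) / np.exp(x).sum(axis=1, keepdims=True))
# Stable form stays finite: approximately [[0., -1000., -2000.]].
stable = log_softmax(x)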
@opt.register_specialize('stabilize', 'fast_compile')
@gof.local_optimizer([tensor.Elemwise])
def local_logsoftmax(node):
    """
    Detect Log(Softmax(x)) and replace it with LogSoftmax(x)

    Note: only forward pass is affected
    """
    if (isinstance(node.op, tensor.Elemwise) and
            isinstance(node.op.scalar_op, scalar.basic.Log) and
            len(node.inputs) == 1 and
            node.inputs[0].owner is not None and
            isinstance(node.inputs[0].owner.op, Softmax)):
        inVars = node.inputs[0].owner.inputs[0]
        new_op = LogSoftmax()
        return [new_op(inVars)]
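As a rough usage sketch (assuming the Theano API of this period; it mirrors the test_local_softmax_optimization test added further down in this commit), the rewrite should replace a log(softmax(x)) pair in a compiled graph with the new op:

import theano
import theano.tensor as tensor

x = tensor.matrix('x')
f = theano.function([x], tensor.log(tensor.nnet.softmax(x)))
# After specialization, the output node should be a LogSoftmax apply,
# not Elemwise{log} applied to Softmax.
print(type(f.maker.fgraph.outputs[0].owner.op))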
@opt.register_specialize('stabilize', 'fast_compile')
@gof.local_optimizer([SoftmaxGrad])
def local_logsoftmax_grad(node):
    """
    Detect Log(Softmax(x))'s grad and replace it with LogSoftmax(x)'s grad

    Note: only grad is affected
    """
    if (isinstance(node.op, SoftmaxGrad) and
        len(node.inputs) == 2 and
        node.inputs[0].owner is not None and
        isinstance(node.inputs[0].owner.op, tensor.Elemwise) and
        len(node.inputs[0].owner.inputs) >= 2 and
        node.inputs[0].owner.inputs[1].owner is not None and
        node.inputs[0].owner.inputs[1].owner.op == softmax_op and
        node.inputs[1] == node.inputs[0].owner.inputs[1] and
        not (
            # skip if it will be optimized by
            # local_advanced_indexing_crossentropy_onehot_grad
            node.inputs[0].owner.op == tensor.true_div and
            node.inputs[0].owner.inputs[0].owner is not None and
            isinstance(node.inputs[0].owner.inputs[0].owner.op,
                       subtensor.AdvancedIncSubtensor))):
        # get parameters from unoptimized op
        sm = node.inputs[0].owner.inputs[1]
        # sm_input = node.inputs[1].owner.inputs[0]
        grads = node.inputs[0].owner.inputs[0]
        if grads.broadcastable[1] and not sm.broadcastable[1]:
            grads = tensor.alloc(grads, grads.shape[0], sm.shape[1])
        return [grads - tensor.sum(grads, axis=1, keepdims=True) * sm]
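For reference, the replacement expression above follows from the Jacobian of log-softmax. A short derivation, with s = softmax(x) per row and g the incoming gradient (notation mine, not from the commit):

y_j = x_j - \log\sum_k e^{x_k},
\qquad
\frac{\partial y_j}{\partial x_i} = \delta_{ij} - s_i,
\qquad
(\nabla_x)_i = \sum_j g_j (\delta_{ij} - s_i) = g_i - s_i \sum_j g_j

which is exactly grads - tensor.sum(grads, axis=1, keepdims=True) * sm applied row-wise, the same expression LogSoftmax.grad returns.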
def softmax_graph(c):
    return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True)
...
@@ -607,6 +795,10 @@ def softmax(c):
    return softmax_op(c)


def logsoftmax(c):
    return logsoftmax_op(c)


@opt.register_specialize('fast_compile_gpu')
@gof.local_optimizer([softmax_op])
def local_softmax_with_bias(node):
...
@@ -1472,7 +1664,7 @@ def local_advanced_indexing_crossentropy_onehot(node):
        sm = log.owner.inputs[0]

    # Second case: log(softmax(x)[rows, labels])
-   if node.op == tensor.log:
+   elif node.op == tensor.log:
        pre_log = node.inputs[0].owner
        if pre_log and isinstance(pre_log.op, subtensor.AdvancedSubtensor):
            try:
...
@@ -1982,27 +2174,6 @@ prepend_0_to_each_row = Prepend_scalar_constant_to_each_row(0.)
prepend_1_to_each_row = Prepend_scalar_constant_to_each_row(1.)


# numerically stabilize log softmax (X) as
# X - X.max(axis=1).dimshuffle(0, 'x')
#   - log(exp(X - X.max(axis=1).dimshuffle(0, 'x')).sum(axis=1)).dimshuffle(0, 'x')
def make_out_pattern(X):
    stabilized_X = X - X.max(axis=1).dimshuffle(0, 'x')
    out_var = stabilized_X - tensor.log(
        tensor.exp(stabilized_X).sum(axis=1)).dimshuffle(0, 'x')
    # tell DEBUG_MODE that it's OK if the original graph produced NaN and the
    # optimized graph does not
    out_var.values_eq_approx = values_eq_approx_remove_nan
    return out_var


local_log_softmax = gof.PatternSub(in_pattern=(tensor.log, (softmax_op, 'x')),
                                   out_pattern=(make_out_pattern, 'x'),
                                   allow_multiple_clients=True)

# don't do register_stabilize, this is to make local_log_softmax run
# only after another more specific optimization that stabilizes cross entropy
# opt.register_stabilize(local_log_softmax, name='local_log_softmax')
opt.register_specialize(local_log_softmax, 'fast_compile_gpu',
                        name='local_log_softmax')
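The out-pattern above is the standard log-sum-exp shift. Writing m for the per-row maximum, the identity it relies on is:

\log \operatorname{softmax}(x)_j
    = x_j - \log\sum_k e^{x_k}
    = (x_j - m) - \log\sum_k e^{x_k - m},
\qquad m = \max_k x_k

Shifting by m leaves the value unchanged but keeps every exponent at or below zero, which is why both this pattern and the new LogSoftmax op stay finite for large inputs.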
def relu(x, alpha=0):
    """
    Compute the element-wise rectified linear activation function.
...
theano/tensor/nnet/tests/test_nnet.py
...
@@ -24,8 +24,8 @@ from theano.tensor.nnet import (categorical_crossentropy,
                                CrossentropyCategorical1HotGrad,
                                sigmoid, softplus, Softmax, softmax,
                                softmax_op, softmax_graph, SoftmaxWithBias,
-                               softmax_grad, softmax_with_bias,
-                               SoftmaxGrad,
+                               softmax_with_bias, LogSoftmax, logsoftmax_op,
+                               softmax_grad, SoftmaxGrad,
                                Prepend_scalar_constant_to_each_row,
                                Prepend_scalar_to_each_row, relu,
...
@@ -98,34 +98,34 @@ class T_SoftmaxWithBias(utt.InferShapeTester):
        def f(a, b):
            return softmax_with_bias(a, b)[:, 0]
        utt.verify_grad(f, [numpy.random.rand(3, 4),
                            numpy.random.rand(4)])

    def test1(self):
        def f(a, b):
            return softmax_with_bias(a, b)[:, 1]
        utt.verify_grad(f, [numpy.random.rand(3, 4),
                            numpy.random.rand(4)])

    def test2(self):
        def f(a, b):
            return softmax_with_bias(a, b)[:, 2]
        utt.verify_grad(f, [numpy.random.rand(3, 4),
                            numpy.random.rand(4)])

    def test3(self):
        def f(a, b):
            return softmax_with_bias(a, b)[:, 3]
        utt.verify_grad(f, [numpy.random.rand(3, 4),
                            numpy.random.rand(4)])

    def test_broadcast(self):
        # test that we don't raise an error during optimization for no good
        # reason as softmax_with_bias don't support correctly some/all
        # broadcasted inputs pattern
-        initial_W = numpy.asarray([[0.1, 0.1, 0.1], \
-                                   [0.1, 0.1, 0.1], \
-                                   [0.1, 0.1, 0.1]], \
-                                  dtype=theano.config.floatX)
+        initial_W = numpy.asarray([[0.1, 0.1, 0.1],
+                                   [0.1, 0.1, 0.1],
+                                   [0.1, 0.1, 0.1]],
+                                  dtype=theano.config.floatX)
        W = theano.shared(value=initial_W, name='W')
        vbias = theano.shared(value=0.1, name='vbias')  # 0.01
        hid = T.vector('hid')
...
@@ -144,8 +144,120 @@ class T_SoftmaxWithBias(utt.InferShapeTester):
        admat_val = numpy.random.rand(3, 4).astype(config.floatX)
        advec_val = numpy.random.rand(4).astype(config.floatX)
        self._compile_and_check([admat, advec],
                                [SoftmaxWithBias()(admat, advec)],
                                [admat_val, advec_val], SoftmaxWithBias)

class T_LogSoftmax(utt.InferShapeTester):
    def test0(self):
        def f(a):
            return logsoftmax_op(a)[:, 0]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test1(self):
        def f(a):
            return logsoftmax_op(a)[:, 1]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test2(self):
        def f(a):
            return logsoftmax_op(a)[:, 2]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test3(self):
        def f(a):
            return logsoftmax_op(a)[:, 3]
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test_matrix(self):
        def f(a):
            return logsoftmax_op(a)
        utt.verify_grad(f, [numpy.random.rand(3, 4)])

    def test_vector(self):
        x = T.vector()
        f = theano.function([x], logsoftmax_op(x))
        xv = numpy.random.randn(6).astype(config.floatX)
        assert numpy.allclose(f(xv),
                              numpy.log(numpy.exp(xv) / numpy.exp(xv).sum()))

    def test_vector_grad(self):
        def f(a):
            return logsoftmax_op(a)
        utt.verify_grad(f, [numpy.random.rand(4)])
    def test_allclose(self):
        x, y = tensor.matrices('xy')
        # regular softmax and crossentropy
        sm = tensor.nnet.softmax(x)
        cm = tensor.nnet.categorical_crossentropy(sm, y)

        # numerically stable log-softmax with crossentropy
        logsm = tensor.nnet.logsoftmax(x)
        sm2 = tensor.exp(logsm)  # just used to show equivalence with sm
        cm2 = -tensor.sum(y * logsm, axis=1)
        grad = tensor.grad(cm2.mean(), x)

        # create some inputs into a softmax that are large and labels
        a = numpy.exp(
            10 * numpy.random.rand(5, 10).astype(theano.config.floatX))
        # create some one-hot coded labels
        b = numpy.eye(5, 10).astype(theano.config.floatX)

        # show equivalence of softmax and exponentiated numerically stable
        # log-softmax
        f1 = theano.function([x], [sm, sm2])
        sm_, sm2_ = f1(a)
        utt.assert_allclose(sm_, sm2_)

        # now show that the two versions result in the same crossentropy cost
        # this indicates that the forward function does provide some numerical
        # stability
        f2 = theano.function([x, y], [cm, cm2])
        cm_, cm2_ = f2(a, b)
        utt.assert_allclose(cm_, cm2_)

        # now, show that in the standard softmax case the gradients blow up
        # while in the log-softmax case they don't
        f3 = theano.function([x, y], [grad])
        grad_ = f3(a, b)
        assert numpy.all(numpy.isnan(grad_) == False)
    def test_isclose(self):
        def f(a):
            return logsoftmax_op(a)

    def test_local_softmax_optimization(self):
        """Test the Logsoftmax substitution

        Check that Log(Softmax(x)) is substituted with Logsoftmax(x). Note that
        only the forward pass is checked (i.e., doesn't check the gradient)
        """
        x, y = tensor.matrices('xy')
        sm = tensor.nnet.softmax(x)
        logsm = tensor.log(sm)
        f = theano.function([x], logsm)
        assert isinstance(f.maker.fgraph.outputs[0].owner.op,
                          theano.tensor.nnet.nnet.LogSoftmax)

    def test_local_softmax_grad_optimization_and_big_input(self):
        """Test the Logsoftmax's grad substitution.

        Check that Log(Softmax(x))'s grad is substituted with Logsoftmax(x)'s
        grad and that the new operation does not explode for big inputs.
        Note that only the grad is checked.
        """
        # some inputs that are large to make the gradient explode in the non
        # optimized case
        a = numpy.exp(
            10 * numpy.random.rand(5, 10).astype(theano.config.floatX))

        def myfunc(x):
            sm = tensor.nnet.softmax(x)
            logsm = tensor.log(sm)
            return logsm
        # We set step to 0.1 because for big values we need a big epsilon
        utt.verify_grad(myfunc, [a], eps=0.1)


class T_SoftmaxGrad(utt.InferShapeTester):
...
@@ -156,7 +268,7 @@ class T_SoftmaxGrad(utt.InferShapeTester):
        admat_val = numpy.random.rand(3, 4).astype(config.floatX)
        bdmat_val = numpy.random.rand(3, 4).astype(config.floatX)
        self._compile_and_check([admat, bdmat],
                                [SoftmaxGrad()(admat, bdmat)],
                                [admat_val, bdmat_val], SoftmaxGrad)


class T_CrossentropySoftmax1Hot(unittest.TestCase):
...