testgroup / pytensor · Commits · 83ea3e8f

Commit 83ea3e8f, authored May 28, 2015 by Harm de Vries
Gradient of Softmaxgrad
Parent: b22ae136

Showing 2 changed files with 62 additions and 39 deletions (+62 −39):
    theano/tensor/nnet/nnet.py             +23 −11
    theano/tensor/nnet/tests/test_nnet.py  +39 −28
theano/tensor/nnet/nnet.py
@@ -77,11 +77,16 @@ class SoftmaxWithBias(gof.Op):
         if b.shape[0] != x.shape[1]:
             raise ValueError('b must have same number of columns as x')

-        sm = numpy.zeros_like(x)
-        for i in xrange(sm.shape[0]):
-            row = x[i] + b
-            sm[i] = numpy.exp(row - numpy.max(row))
-            sm[i] *= 1.0 / numpy.sum(sm[i])
+        # sm = numpy.zeros_like(x)
+        # for i in xrange(sm.shape[0]):
+        #     row = x[i] + b
+        #     sm[i] = numpy.exp(row - numpy.max(row))
+        #     sm[i] *= 1.0 / numpy.sum(sm[i])
+        # output_storage[0][0] = sm
+
+        x_plus_b = x + b[None, :]
+        e_x = numpy.exp(x_plus_b - x_plus_b.max(axis=1)[:, None])
+        sm = e_x / e_x.sum(axis=1)[:, None]
         output_storage[0][0] = sm

     def grad(self, inp, grads):
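The new perform body replaces the per-row Python loop with a single vectorized NumPy expression. A standalone NumPy sketch (not part of the commit; shapes chosen arbitrarily) that checks the two formulations agree:

import numpy

rng = numpy.random.RandomState(0)
x = rng.normal(size=(5, 3))
b = rng.normal(size=(3,))

# old, row-by-row computation
sm_loop = numpy.zeros_like(x)
for i in range(sm_loop.shape[0]):
    row = x[i] + b
    sm_loop[i] = numpy.exp(row - numpy.max(row))
    sm_loop[i] *= 1.0 / numpy.sum(sm_loop[i])

# new, vectorized computation
x_plus_b = x + b[None, :]
e_x = numpy.exp(x_plus_b - x_plus_b.max(axis=1)[:, None])
sm_vec = e_x / e_x.sum(axis=1)[:, None]

assert numpy.allclose(sm_loop, sm_vec)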
@@ -303,8 +308,17 @@ class SoftmaxGrad(gof.Op):
             dx[i] = dy_times_sm_i - sum(dy_times_sm_i) * sm[i]
         output_storage[0][0] = dx

-    def grad(self, *args):
-        raise NotImplementedError()
+    def grad(self, inp, grads):
+        dy, sm = inp
+        g, = grads
+
+        tmp = g + tensor.neg(tensor.sum(g * sm, axis=1).dimshuffle((0, 'x')))
+        g_dy = tmp * sm
+
+        tmp2 = tensor.sum(dy * sm, axis=1).dimshuffle((0, 'x'))
+        g_sm = tmp * dy - g * tmp2
+        return g_dy, g_sm

     def infer_shape(self, node, shape):
         return [shape[1]]
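SoftmaxGrad computes, row by row, dx = sm * (dy - sum(dy * sm)). Differentiating that expression with respect to its two inputs and contracting with an output gradient g gives g_dy = sm * (g - sum(g * sm)) and g_sm = dy * (g - sum(g * sm)) - g * sum(dy * sm), which is exactly the tmp / tmp2 structure of the new grad method. A standalone NumPy sketch (not part of the commit) that checks these formulas against central finite differences:

import numpy

def softmax_grad(dy, sm):
    # NumPy version of SoftmaxGrad.perform: dx = sm * (dy - sum(dy * sm))
    return sm * (dy - (dy * sm).sum(axis=1, keepdims=True))

rng = numpy.random.RandomState(42)
dy = rng.normal(size=(4, 3))
sm = rng.uniform(0.1, 1.0, size=(4, 3))
sm /= sm.sum(axis=1, keepdims=True)      # rows of sm sum to one
g = rng.normal(size=(4, 3))              # gradient flowing into the output dx

# analytic gradients, mirroring the new SoftmaxGrad.grad
tmp = g - (g * sm).sum(axis=1, keepdims=True)
g_dy = tmp * sm
tmp2 = (dy * sm).sum(axis=1, keepdims=True)
g_sm = tmp * dy - g * tmp2

# numerical gradients of the scalar L = sum(g * softmax_grad(dy, sm))
eps = 1e-6
num_g_dy = numpy.zeros_like(dy)
num_g_sm = numpy.zeros_like(sm)
for idx in numpy.ndindex(*dy.shape):
    d = numpy.zeros_like(dy)
    d[idx] = eps
    num_g_dy[idx] = ((g * softmax_grad(dy + d, sm)).sum() -
                     (g * softmax_grad(dy - d, sm)).sum()) / (2 * eps)
    num_g_sm[idx] = ((g * softmax_grad(dy, sm + d)).sum() -
                     (g * softmax_grad(dy, sm - d)).sum()) / (2 * eps)

assert numpy.allclose(g_dy, num_g_dy, atol=1e-5)
assert numpy.allclose(g_sm, num_g_sm, atol=1e-5)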
@@ -573,9 +587,7 @@ def softmax_graph(c):
     return tensor.exp(c) / tensor.exp(c).sum(axis=-1, keepdims=True)


 def softmax(c):
-    if c.ndim == 1:
-        c = tensor.shape_padleft(c, n_ones=1)
-    return softmax_graph(c)
+    return softmax_op(c)


 @opt.register_specialize('fast_compile_gpu')
 @gof.local_optimizer([softmax_op])
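softmax(c) now returns the dedicated Softmax Op (softmax_op) instead of building the exp/sum expression, so taking a gradient of a gradient through softmax goes through a SoftmaxGrad node and relies on the grad method added above; with the previous `def grad(self, *args): raise NotImplementedError()` that second derivative failed at graph-construction time. A minimal sketch of what this enables, assuming the 2015-era Theano API (not part of the commit):

import theano
import theano.tensor as T
from theano.tensor.nnet import softmax

x = T.matrix('x')
cost = softmax(x)[0, 0]
gx = T.grad(cost, x)         # first derivative: introduces a SoftmaxGrad node
ggx = T.grad(gx.sum(), x)    # second derivative: needs SoftmaxGrad.grad
f = theano.function([x], [gx, ggx])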
@@ -733,7 +745,7 @@ if 0:
                     rest.append(add_in)
             # print 'maybe_ds =', maybe_ds
             # if maybe_ds:
-
+            #I will make a plot with the average over many realizations.
             # print 'maybe_ds.ndim =', maybe_ds.ndim, ', maybe_sm.ndim =', maybe_sm.ndim
                 continue
             if maybe_sm is mul_inputs[0]:
theano/tensor/nnet/tests/test_nnet.py
@@ -1011,7 +1011,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
         try:
             g = theano.function([x, b, y], T.grad(expr, x), mode=mode)
         finally:
-            config.warn.sum_div_dimshuffle_bug = backup
+            config.warn.sum_div_dimshuffle_qbug = backup
         if verbose:
             printing.debugprint(g)
@@ -1026,7 +1026,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
                 theano.printing.debugprint(g)
                 raise

-    def test_scale_cost(self):
+    def test_scrossentropy_softmax_1hot_with_bias_dxcale_cost(self):
         # TODO: add the optimization in FAST_COMPILE?
         # In the mean time, run it as 'FAST_RUN' instead
         mode = theano.compile.mode.get_default_mode()
@@ -1071,25 +1071,25 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
         # Cases to test
         expressions = [
-            a * T.sum(-T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
-            -a * T.sum(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
-            a * (-T.sum(T.log(softmax_graph(x)[T.arange(y.shape[0]), y]))),
-            a * T.sum(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
-            a * T.sum(-T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
-            -a * T.sum(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
-            a * (-T.sum(T.log(softmax_graph(x))[T.arange(y.shape[0]), y])),
-            a * T.sum(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
-            a * T.mean(-T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
-            -a * T.mean(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
-            a * (-T.mean(T.log(softmax_graph(x)[T.arange(y.shape[0]), y]))),
-            a * T.mean(T.log(softmax_graph(x)[T.arange(y.shape[0]), y])),
-            a * T.mean(-T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
-            -a * T.mean(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
-            a * (-T.mean(T.log(softmax_graph(x))[T.arange(y.shape[0]), y])),
-            a * T.mean(T.log(softmax_graph(x))[T.arange(y.shape[0]), y]),
+            a * T.sum(-T.log(softmax(x)[T.arange(y.shape[0]), y])),
+            -a * T.sum(T.log(softmax(x)[T.arange(y.shape[0]), y])),
+            a * (-T.sum(T.log(softmax(x)[T.arange(y.shape[0]), y]))),
+            a * T.sum(T.log(softmax(x)[T.arange(y.shape[0]), y])),
+            a * T.sum(-T.log(softmax(x))[T.arange(y.shape[0]), y]),
+            -a * T.sum(T.log(softmax(x))[T.arange(y.shape[0]), y]),
+            a * (-T.sum(T.log(softmax(x))[T.arange(y.shape[0]), y])),
+            a * T.sum(T.log(softmax(x))[T.arange(y.shape[0]), y]),
+            a * T.mean(-T.log(softmax(x)[T.arange(y.shape[0]), y])),
+            -a * T.mean(T.log(softmax(x)[T.arange(y.shape[0]), y])),
+            a * (-T.mean(T.log(softmax(x)[T.arange(y.shape[0]), y]))),
+            a * T.mean(T.log(softmax(x)[T.arange(y.shape[0]), y])),
+            a * T.mean(-T.log(softmax(x))[T.arange(y.shape[0]), y]),
+            -a * T.mean(T.log(softmax(x))[T.arange(y.shape[0]), y]),
+            a * (-T.mean(T.log(softmax(x))[T.arange(y.shape[0]), y])),
+            a * T.mean(T.log(softmax(x))[T.arange(y.shape[0]), y]),
             ]

         for expr in expressions:
@@ -1130,7 +1130,7 @@ def test_argmax_pushdown():
     # test that the max_and_argmax is pushed down if the max is not used
     out = tensor.max_and_argmax(
-        softmax_graph(tensor.exp(tensor.tanh(sigmoid(x)))),
+        softmax(tensor.exp(tensor.tanh(sigmoid(x)))),
         axis=-1)[1]

     fgraph = gof.FunctionGraph(
         [x],
@@ -1147,7 +1147,7 @@ def test_argmax_pushdown():
     x = tensor.matrix()
     # test that the max_and_argmax is not pushed down if the max is used
     out = tensor.max_and_argmax(
-        softmax_graph(tensor.exp(tensor.tanh(sigmoid(x)))),
+        softmax(tensor.exp(tensor.tanh(sigmoid(x)))),
         axis=-1)[0]

     fgraph = gof.FunctionGraph(
         [x],
@@ -1236,7 +1236,7 @@ def test_asymptotic_32():
     x2 = tensor.dvector()
     y = tensor.lvector()
-    c = categorical_crossentropy(softmax_graph(x + x2), y)
+    c = categorical_crossentropy(softmax(x + x2), y)
     f = theano.function([x, y, x2], [c.sum(),
                                      tensor.grad(c.sum(), x)], mode='FAST_RUN')
     if 0:
@@ -1374,15 +1374,26 @@ class Test_softmax_opt:
     # REPEAT 3 CASES in presence of log(softmax) with the advanced indexing
     # etc.


 def test_softmax_graph():
     rng = numpy.random.RandomState(utt.fetch_seed())
     x = theano.shared(rng.normal(size=(3, 4)))

     def f(inputs):
         y = softmax_graph(x)
-        z = (y ** 2).mean()
-        return theano.grad(z, x, known_grads={y: inputs})
-    utt.verify_grad(f, [numpy.random.rand(3, 4)])
+        return theano.grad(None, x, known_grads={y: inputs})
+    utt.verify_grad(f, [rng.rand(3, 4)])
+
+
+def test_grad_softmax_grad():
+    rng = numpy.random.RandomState(utt.fetch_seed())
+    x = theano.shared(rng.normal(size=(3, 4)))
+
+    def f(inputs):
+        y = softmax_op(x)
+        return theano.grad(None, x, known_grads={y: inputs})
+    utt.verify_grad(f, [rng.rand(3, 4)])


 def test_stabilize_log_softmax():
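The new test_grad_softmax_grad exercises exactly the code path added in nnet.py: x is a shared variable, so f maps only the upstream gradient `inputs` through a SoftmaxGrad(inputs, softmax_op(x)) node, and utt.verify_grad then differentiates f numerically with respect to `inputs` and compares against the symbolic result, i.e. the g_dy branch of the new SoftmaxGrad.grad. In plain NumPy terms (a sketch, not part of the commit), the function being verified and its expected gradient are:

import numpy

def f_numpy(inputs, sm):
    # NumPy equivalent of SoftmaxGrad.perform, with sm = softmax(x) held fixed
    return sm * (inputs - (inputs * sm).sum(axis=1, keepdims=True))

def f_grad_numpy(g, sm):
    # expected gradient of sum(g * f_numpy(inputs, sm)) w.r.t. inputs:
    # the g_dy branch of the new grad method (f_numpy is linear in inputs)
    return sm * (g - (g * sm).sum(axis=1, keepdims=True))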
@@ -1390,7 +1401,7 @@ def test_stabilize_log_softmax():
     mode = mode.including('local_log_softmax', 'specialize')

     x = matrix()
-    y = theano.tensor.nnet.softmax_graph(x)
+    y = softmax(x)
     z = theano.tensor.log(y)

     f = theano.function([x], z, mode=mode)