Commit e05036f0, authored by Frédéric Bastien

Merge pull request #2847 from harmdevries89/softmax_iss2050

[MRG] softmax function that builds expression instead of using softmax op
......@@ -739,7 +739,7 @@ class T_Scan(unittest.TestCase):
def forward_scanner(x_t):
a2_t = tensor.dot(x_t, W)
y_t = tensor.nnet.softmax(a2_t)
y_t = tensor.nnet.softmax_graph(a2_t)
return y_t
y, _ = theano.scan(fn=forward_scanner, sequences=x,
......
......@@ -78,12 +78,17 @@ class SoftmaxWithBias(gof.Op):
if b.shape[0] != x.shape[1]:
raise ValueError('b must have same number of columns as x')
sm = numpy.zeros_like(x)
for i in xrange(sm.shape[0]):
row = x[i] + b
sm[i] = numpy.exp(row - numpy.max(row))
sm[i] *= 1.0 / numpy.sum(sm[i])
output_storage[0][0] = sm
# sm = numpy.zeros_like(x)
# for i in xrange(sm.shape[0]):
# row = x[i] + b
# sm[i] = numpy.exp(row - numpy.max(row))
# sm[i] *= 1.0 / numpy.sum(sm[i])
# output_storage[0][0] = sm
x_plus_b = x + b[None, :]
e_x = numpy.exp(x_plus_b - x_plus_b.max(axis=1)[:, None])
e_x *= 1.0 / e_x.sum(axis=1)[:, None]
output_storage[0][0] = e_x
def grad(self, inp, grads):
x, b = inp
......@@ -304,8 +309,17 @@ class SoftmaxGrad(gof.Op):
dx[i] = dy_times_sm_i - sum(dy_times_sm_i) * sm[i]
output_storage[0][0] = dx
def grad(self, *args):
raise NotImplementedError()
def grad(self, inp, grads):
    """Second-order gradient of SoftmaxGrad w.r.t. its inputs (dy, sm).

    `g` is the upstream gradient on dx = sm * (dy - sum(dy * sm, axis=1)).
    """
    dy, sm = inp
    g, = grads

    # g with its sm-weighted row sums removed (broadcast back over columns).
    g_row_sum = tensor.sum(g * sm, axis=1).dimshuffle((0, 'x'))
    adjusted_g = g + tensor.neg(g_row_sum)

    # Gradient flowing back to dy.
    g_dy = adjusted_g * sm

    # Gradient flowing back to sm.
    dy_sm_row_sum = tensor.sum(dy * sm, axis=1).dimshuffle((0, 'x'))
    g_sm = adjusted_g * dy - g * dy_sm_row_sum

    return g_dy, g_sm
def infer_shape(self, node, shape):
    """The output dx has the same shape as the second input, sm."""
    dy_shape, sm_shape = shape
    return [sm_shape]
......@@ -414,7 +428,7 @@ class Softmax(gof.Op):
def grad(self, inp, grads):
    """Delegate the gradient computation to the SoftmaxGrad op,
    re-deriving sm = softmax(x) symbolically from the input."""
    x, = inp
    g_sm, = grads
    return [softmax_grad(g_sm, softmax_op(x))]
def R_op(self, inputs, eval_points):
......@@ -568,15 +582,20 @@ class Softmax(gof.Op):
def c_code_cache_version():
    # Version tag for the generated C code; bump it whenever the C
    # implementation changes so cached compiled modules are rebuilt.
    # NOTE(review): no `self` parameter here — presumably a @staticmethod
    # decorator sits just above, outside this diff view; confirm.
    return (3,)
softmax = Softmax()
softmax_op = Softmax()
def softmax_graph(c):
    """Softmax built from elementary ops (exp and a row-wise sum) instead
    of the dedicated Softmax op, so graph rewrites can see its structure."""
    e = tensor.exp(c)
    return e / e.sum(axis=-1, keepdims=True)
def softmax(c):
    """Return the softmax of `c` by applying the dedicated Softmax op."""
    return softmax_op(c)
@opt.register_specialize('fast_compile_gpu')
@gof.local_optimizer([softmax])
@gof.local_optimizer([softmax_op])
def local_softmax_with_bias(node):
"""Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias)
"""
if node.op == softmax:
if node.op == softmax_op:
x, = node.inputs
if x.owner and x.owner.op == tensor.add:
vectors = []
......@@ -638,7 +657,7 @@ def softmax_simplifier(numerators, denominators):
if not numerator.type.dtype.startswith('float'):
continue
if not numerator.type.broadcastable == (False, False):
if numerator.ndim != 2:
continue
if numerator.owner and numerator.owner.op == tensor.exp:
x = numerator.owner.inputs[0]
......@@ -664,7 +683,8 @@ def softmax_simplifier(numerators, denominators):
if matching_denom:
numerators.remove(numerator)
denominators.remove(matching_denom)
numerators.append(softmax(x))
numerators.append(softmax_op(x))
return numerators, denominators
opt.local_mul_canonizer.add_simplifier(softmax_simplifier,
'softmax_simplifier')
......@@ -1404,7 +1424,7 @@ def crossentropy_to_crossentropy_with_softmax(fgraph):
if node.op == crossentropy_categorical_1hot:
nll, = node.outputs
sm, one_of_n = node.inputs
if sm.owner and sm.owner.op == softmax:
if sm.owner and sm.owner.op == softmax_op:
x, = sm.owner.inputs
new_nll, new_sm, new_am = crossentropy_softmax_argmax_1hot_with_bias(x,
tensor.zeros_like(x[0]), one_of_n)
......@@ -1450,7 +1470,7 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
def local_argmax_pushdown(node):
if node.op == tensor._max_and_argmax and node.inputs[0].owner and \
len(node.outputs[0].clients) > 0 and node.inputs[0].owner.op in \
(softmax, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid,
(softmax_op, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid,
softmax_with_bias):
if theano.config.warn.argmax_pushdown_bug:
logging.getLogger('theano.tensor.nnet.nnet').warn("WARNING: there "
......@@ -1466,7 +1486,7 @@ def local_argmax_pushdown(node):
x_max, x_argmax = node.outputs
x, axis = node.inputs
# TODO: Make a list/set of monotonic ops...
if x.owner and x.owner.op in (softmax, softplus, tensor.exp,
if x.owner and x.owner.op in (softmax_op, softplus, tensor.exp,
tensor.log, tensor.tanh, sigmoid):
pre_x, = x.owner.inputs
return tensor._max_and_argmax(pre_x, axis)
......@@ -1554,7 +1574,7 @@ def local_advanced_indexing_crossentropy_onehot(node):
except Exception:
pass
if sm is not None and sm.owner and sm.owner.op in (softmax,
if sm is not None and sm.owner and sm.owner.op in (softmax_op,
softmax_with_bias):
sm_w_bias = local_softmax_with_bias.transform(sm.owner)
if sm_w_bias:
......@@ -1584,7 +1604,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
except Exception:
return
if (sm is not None) and sm.owner and (sm.owner.op in (softmax,
if (sm is not None) and sm.owner and (sm.owner.op in (softmax_op,
softmax_with_bias)):
sm_w_bias = local_softmax_with_bias.transform(sm.owner)
if sm_w_bias:
......@@ -2054,7 +2074,7 @@ def make_out_pattern(X):
return out_var
local_log_softmax = gof.PatternSub(in_pattern=(tensor.log, (softmax, 'x')),
local_log_softmax = gof.PatternSub(in_pattern=(tensor.log, (softmax_op, 'x')),
out_pattern=(make_out_pattern, 'x'),
allow_multiple_clients=True)
......
......@@ -22,8 +22,8 @@ from theano.tensor.nnet import (categorical_crossentropy,
CrossentropySoftmaxArgmax1HotWithBias,
CrossentropyCategorical1Hot,
CrossentropyCategorical1HotGrad,
sigmoid, softplus,
Softmax, softmax, SoftmaxWithBias,
sigmoid, softplus, Softmax, softmax,
softmax_op, softmax_graph, SoftmaxWithBias,
softmax_grad,
softmax_with_bias, SoftmaxGrad,
Prepend_scalar_constant_to_each_row,
......@@ -54,40 +54,40 @@ class T_Softmax(utt.InferShapeTester):
def test0(self):
def f(a):
return softmax(a)[:, 0]
return softmax_op(a)[:, 0]
utt.verify_grad(f, [numpy.random.rand(3, 4)])
def test1(self):
def f(a):
return softmax(a)[:, 1]
return softmax_op(a)[:, 1]
utt.verify_grad(f, [numpy.random.rand(3, 4)])
def test2(self):
def f(a):
return softmax(a)[:, 2]
return softmax_op(a)[:, 2]
utt.verify_grad(f, [numpy.random.rand(3, 4)])
def test3(self):
def f(a):
return softmax(a)[:, 3]
return softmax_op(a)[:, 3]
utt.verify_grad(f, [numpy.random.rand(3, 4)])
def test_infer_shape(self):
admat = matrix()
admat_val = numpy.random.rand(3, 4).astype(config.floatX)
self._compile_and_check([admat], [Softmax()(admat)],
[admat_val], Softmax)
[admat_val], Softmax)
def test_vector(self):
x = T.vector()
f = theano.function([x], softmax(x))
f = theano.function([x], softmax_op(x))
xv = numpy.random.randn(6).astype(config.floatX)
assert numpy.allclose(f(xv), numpy.exp(xv) / numpy.exp(xv).sum())
def test_vector_grad(self):
def f(a):
return softmax(a)
return softmax_op(a)
utt.verify_grad(f, [numpy.random.rand(4)])
......@@ -129,10 +129,10 @@ class T_SoftmaxWithBias(utt.InferShapeTester):
vbias = theano.shared(value=0.1, name='vbias') # 0.01
hid = T.vector('hid')
f = theano.function([hid],
T.nnet.softmax(T.dot(hid, W.T) + vbias))
T.nnet.softmax_op(T.dot(hid, W.T) + vbias))
ops = [node.op for node in f.maker.fgraph.toposort()]
assert softmax_with_bias not in ops
assert softmax in ops
assert softmax_op in ops
f([0, 1, 0])
# print f.maker.fgraph.toposort()
......@@ -400,7 +400,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
fgraph = gof.FunctionGraph(
[x, one_of_n],
[op(softmax(x), one_of_n)])
[op(softmax_op(x), one_of_n)])
assert fgraph.outputs[0].owner.op == op
theano.compile.mode.optdb.query(
......@@ -416,7 +416,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
op = crossentropy_categorical_1hot
fgraph = gof.FunctionGraph(
[x, one_of_n],
[op(softmax(x), one_of_n)])
[op(softmax_op(x), one_of_n)])
assert fgraph.outputs[0].owner.op == op
theano.compile.mode.optdb.query(
......@@ -434,7 +434,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
fgraph = gof.FunctionGraph(
[x, b, one_of_n],
[op(softmax(x + b), one_of_n)])
[op(softmax_op(x + b), one_of_n)])
assert fgraph.outputs[0].owner.op == op
# print 'BEFORE'
......@@ -466,7 +466,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
fgraph = gof.FunctionGraph(
[x, b, c, one_of_n],
[op(softmax(T.add(x, b, c)), one_of_n)])
[op(softmax_op(T.add(x, b, c)), one_of_n)])
assert fgraph.outputs[0].owner.op == op
# print 'BEFORE'
......@@ -494,7 +494,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
op = crossentropy_categorical_1hot
fgraph = gof.FunctionGraph(
[x, b, one_of_n],
[op(softmax(x + b), one_of_n)])
[op(softmax_op(x + b), one_of_n)])
assert fgraph.outputs[0].owner.op == op
# print 'BEFORE'
# for node in fgraph.toposort():
......@@ -517,7 +517,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
x = tensor.matrix('x')
one_of_n = tensor.lvector('one_of_n')
op = crossentropy_categorical_1hot
xe = op(softmax(x), one_of_n)
xe = op(softmax_op(x), one_of_n)
sum_xe = tensor.sum(xe)
g_x = tensor.grad(sum_xe, x)
fgraph = gof.FunctionGraph(
......@@ -546,7 +546,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
has_cx1hot = True
if node.op == crossentropy_softmax_1hot_with_bias_dx:
has_cx1hotdx = True
if node.op == softmax:
if node.op == softmax_op:
has_softmax = True
if node.op == softmax_grad:
has_softmaxdx = True
......@@ -559,7 +559,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
x = tensor.vector('x')
one_of_n = tensor.lvector('one_of_n')
op = crossentropy_categorical_1hot
xe = op(softmax(x), one_of_n)
xe = op(softmax_op(x), one_of_n)
sum_xe = tensor.sum(xe)
g_x = tensor.grad(sum_xe, x)
fgraph = gof.FunctionGraph(
......@@ -588,7 +588,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
has_cx1hot = True
if node.op == crossentropy_softmax_1hot_with_bias_dx:
has_cx1hotdx = True
if node.op == softmax:
if node.op == softmax_op:
has_softmax = True
if node.op == softmax_grad:
has_softmaxdx = True
......@@ -643,7 +643,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
ops = [node.op for node in g.maker.fgraph.toposort()]
assert len(ops) == 2
assert crossentropy_softmax_1hot_with_bias_dx in ops
assert softmax in ops
assert softmax_op in ops
assert softmax_grad not in ops
g(x_val, y_val)
except Exception:
......@@ -714,7 +714,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
#there's an extra dimshuffle in there
# but I can't think of a good rule to get rid of it
assert crossentropy_softmax_1hot_with_bias_dx in ops
assert softmax in ops
assert softmax_op in ops
assert softmax_grad not in ops
g(x_val, y_val)
except Exception:
......@@ -796,7 +796,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
ops = [node.op for node in g.maker.fgraph.toposort()]
assert len(ops) == 3
assert crossentropy_softmax_1hot_with_bias_dx in ops
assert softmax in ops
assert softmax_op in ops
assert softmax_grad not in ops
g(x_val, y_val)
except Exception:
......@@ -841,7 +841,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
ops = [node.op for node in g.maker.fgraph.toposort()]
assert len(ops) == 4
assert crossentropy_softmax_1hot_with_bias_dx in ops
assert softmax in ops
assert softmax_op in ops
assert softmax_grad not in ops
g(x_val, y_val)
except Exception:
......@@ -1028,7 +1028,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
theano.printing.debugprint(g)
raise
def test_scale_cost(self):
def test_crossentropy_softmax_1hot_with_bias_dxcale_cost(self):
# TODO: add the optimization in FAST_COMPILE?
# In the mean time, run it as 'FAST_RUN' instead
mode = theano.compile.mode.get_default_mode()
......@@ -1048,7 +1048,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
for node in func.maker.fgraph.toposort():
if node.op == crossentropy_softmax_argmax_1hot_with_bias:
has_cx1hot = True
if node.op == softmax:
if node.op == softmax_op:
has_softmax = True
assert has_cx1hot
......@@ -1062,7 +1062,7 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
for node in func.maker.fgraph.toposort():
if node.op == crossentropy_softmax_1hot_with_bias_dx:
has_cx1hotdx = True
if node.op == softmax:
if node.op == softmax_op:
has_softmax = True
if node.op == softmax_grad:
has_softmaxdx = True
......@@ -1129,49 +1129,49 @@ class T_CrossentropyCategorical1Hot(utt.InferShapeTester):
def test_argmax_pushdown():
x = tensor.matrix()
for softmax in [softmax_graph, softmax_op]:
# test that the max_and_argmax is pushed down if the max is not used
out = tensor.max_and_argmax(
softmax(tensor.exp(tensor.tanh(sigmoid(x)))),
axis=-1)[1]
fgraph = gof.FunctionGraph(
[x],
[out])
theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(fgraph)
# test that the max_and_argmax is pushed down if the max is not used
out = tensor.max_and_argmax(
softmax(tensor.exp(tensor.tanh(sigmoid(x)))),
axis=-1)[1]
fgraph = gof.FunctionGraph(
[x],
[out])
theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(fgraph)
# print 'AFTER'
# for node in fgraph.toposort():
# print node.op
assert len(fgraph.toposort()) == 2 # an output_guard is second
assert fgraph.toposort()[0].op == tensor.basic._max_and_argmax
assert str(fgraph.toposort()[1].op) == 'OutputGuard'
x = tensor.matrix()
# test that the max_and_argmax is not pushed down if the max is used
out = tensor.max_and_argmax(
softmax(tensor.exp(tensor.tanh(sigmoid(x)))),
axis=-1)[0]
fgraph = gof.FunctionGraph(
[x],
[out])
# print 'AFTER'
# for node in fgraph.toposort():
# print node.op
assert len(fgraph.toposort()) == 2 # an output_guard is second
assert fgraph.toposort()[0].op == tensor.basic._max_and_argmax
assert str(fgraph.toposort()[1].op) == 'OutputGuard'
x = tensor.matrix()
# test that the max_and_argmax is not pushed down if the max is used
out = tensor.max_and_argmax(
softmax(tensor.exp(tensor.tanh(sigmoid(x)))),
axis=-1)[0]
fgraph = gof.FunctionGraph(
[x],
[out])
backup = config.warn.argmax_pushdown_bug
config.warn.argmax_pushdown_bug = False
try:
theano.compile.mode.optdb.query(
backup = config.warn.argmax_pushdown_bug
config.warn.argmax_pushdown_bug = False
try:
theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(fgraph)
finally:
config.warn.argmax_pushdown_bug = backup
finally:
config.warn.argmax_pushdown_bug = backup
# print 'AFTER'
# for node in fgraph.toposort():
# print node.op
assert len(fgraph.toposort()) == 4 # an output_guard is second
assert isinstance(fgraph.toposort()[0].op, tensor.Elemwise)
assert isinstance(fgraph.toposort()[1].op, Softmax)
assert isinstance(fgraph.toposort()[2].op, tensor.CAReduce)
assert isinstance(fgraph.toposort()[2].op.scalar_op, theano.scalar.Maximum)
assert str(fgraph.toposort()[3].op) == 'OutputGuard'
# print 'AFTER'
# for node in fgraph.toposort():
# print node.op
assert len(fgraph.toposort()) == 4 # an output_guard is second
assert isinstance(fgraph.toposort()[0].op, tensor.Elemwise)
assert isinstance(fgraph.toposort()[1].op, Softmax)
assert isinstance(fgraph.toposort()[2].op, tensor.CAReduce)
assert isinstance(fgraph.toposort()[2].op.scalar_op, theano.scalar.Maximum)
assert str(fgraph.toposort()[3].op) == 'OutputGuard'
def test_argmax_pushdown_bias():
......@@ -1295,7 +1295,7 @@ class Test_softmax_opt:
# printing.debugprint(f)
# print '==='
assert len(f_ops) == 1
assert softmax in f_ops
assert softmax_op in f_ops
f(self.rng.rand(3, 4).astype(config.floatX))
def test_basic_keepdims(self):
......@@ -1309,7 +1309,7 @@ class Test_softmax_opt:
# printing.debugprint(f)
# print '==='
assert len(f_ops) == 1
assert softmax in f_ops
assert softmax_op in f_ops
f(self.rng.rand(3, 4).astype(config.floatX))
def test_grad(self):
......@@ -1331,7 +1331,7 @@ class Test_softmax_opt:
raise SkipTest('Optimization not enabled for the moment')
assert len(g_ops) == 2
assert softmax in g_ops
assert softmax_op in g_ops
assert softmax_grad in g_ops
g(self.rng.rand(3, 4), self.rng.uniform(.5, 1, (3, 4)))
......@@ -1377,12 +1377,33 @@ class Test_softmax_opt:
# etc.
def test_softmax_graph():
    """verify_grad of the expression-based softmax, injecting the upstream
    gradient through `known_grads`."""
    rng = numpy.random.RandomState(utt.fetch_seed())
    x = theano.shared(rng.normal(size=(3, 4)))

    def cost(upstream):
        y = softmax_graph(x)
        return theano.grad(None, x, known_grads={y: upstream})

    utt.verify_grad(cost, [rng.rand(3, 4)])
def test_grad_softmax_grad():
    """verify_grad through the Softmax op's gradient (SoftmaxGrad),
    injecting the upstream gradient through `known_grads`."""
    rng = numpy.random.RandomState(utt.fetch_seed())
    x = theano.shared(rng.normal(size=(3, 4)))

    def cost(upstream):
        y = softmax_op(x)
        return theano.grad(None, x, known_grads={y: upstream})

    utt.verify_grad(cost, [rng.rand(3, 4)])
def test_stabilize_log_softmax():
mode = theano.compile.mode.get_default_mode()
mode = mode.including('local_log_softmax', 'specialize')
x = matrix()
y = theano.tensor.nnet.softmax(x)
y = softmax(x)
z = theano.tensor.log(y)
f = theano.function([x], z, mode=mode)
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with care.
Please finish editing this comment first!
Register or sign in to post a comment