Commit fc9052e3, authored by Ricardo, committed by Thomas Wiecki

Move sigmoid tests to respective files

Add tests for inplace Sigmoid and Softplus
Parent ec51faa6
import numpy as np
import aesara
import aesara.tensor as aet
from aesara.configdefaults import config
from aesara.graph.opt import check_stack_trace
from aesara.graph.toolbox import is_same_graph
from aesara.scalar import Softplus
from aesara.tensor import sigmoid, softplus
from aesara.tensor.inplace import neg_inplace, sigmoid_inplace
from aesara.tensor.math import clip, exp, log, mul, neg
from aesara.tensor.math_opt import (
compute_mul,
is_1pexp,
parse_mul_tree,
perform_sigm_times_exp,
register_local_1msigmoid,
simplify_mul,
)
from aesara.tensor.math import clip, sigmoid
from aesara.tensor.nnet.sigm import hard_sigmoid, ultra_fast_sigmoid
from aesara.tensor.shape import Reshape
from aesara.tensor.type import fmatrix, matrix, scalar, vector, vectors
from tests import unittest_tools as utt
from aesara.tensor.type import matrix
from tests.tensor.utils import (
_good_broadcast_unary_normal_no_complex,
check_floatX,
......@@ -30,29 +15,6 @@ from tests.tensor.utils import (
)
class TestSigmoid:
    """Direct (non-broadcast) checks for the `sigmoid` op."""

    def setup_method(self):
        # Seed the RNG so the random gradient check is reproducible.
        utt.seed_rng()

    def test_elemwise(self):
        # Numerically verify d(sigmoid)/dx on a random 3x4 matrix.
        data = np.random.rand(3, 4)
        utt.verify_grad(sigmoid, [data])
# Broadcast-based test suite checking `sigmoid` against NumPy's
# logistic function 1 / (1 + exp(-x)) over the standard unary inputs.
TestSigmoidBroadcast = makeBroadcastTester(
op=sigmoid,
expected=upcast_int8_nfunc(
lambda inputs: check_floatX(inputs, 1 / (1 + np.exp(-inputs)))
),
good=copymod(
_good_broadcast_unary_normal_no_complex, without=["uint16"]
), # The reason that 'uint16' is excluded is that
# aesara works well but numpy overflows resulting
# in an assertion error.
# grad=_grad_broadcast_unary_normal,
name="SigmoidTester",
eps=1e-8,
)
TestUltraFastSigmoidBroadcast = makeBroadcastTester(
op=ultra_fast_sigmoid,
expected=upcast_int8_nfunc(
......@@ -82,42 +44,7 @@ TestHardSigmoidBroadcast = makeBroadcastTester(
)
# Broadcast-based test suite checking `softplus` against NumPy's
# log1p(exp(x)) over restricted integer ranges (to avoid overflow).
TestSoftplusBroadcast = makeBroadcastTester(
op=softplus,
expected=upcast_int8_nfunc(
lambda inputs: check_floatX(inputs, np.log1p(np.exp(inputs)))
),
good=dict(
copymod(
_good_broadcast_unary_normal_no_complex,
without=["uint8", "uint16", "big_scalar"],
), # numpy function overflows with uint16.
uint8=[
np.arange(0, 89, dtype="uint8")
], # a narrower range is used for the re-added uint8 case.
int8=[np.arange(-127, 89, dtype="int8")],
),
# grad=_grad_broadcast_unary_normal,
name="SoftplusTester",
)
class TestSoftplus:
# Seed the RNG so the random gradient check below is reproducible.
def setup_method(self):
utt.seed_rng()
# Numerically verify the gradient of `softplus`.
def test_elemwise(self):
utt.verify_grad(softplus, [np.random.rand(3, 4)])
def test_accuracy(self):
# Test all approximations are working (cutoff points are -37, 18, 33.3)
x_test = np.array([-40.0, -17.5, 17.5, 18.5, 40.0])
y_th = softplus(x_test).eval()
y_np = np.log1p(np.exp(x_test))
# rtol=10e-10 is 1e-9.
np.testing.assert_allclose(y_th, y_np, rtol=10e-10)
class TestSigmoidOpts:
class TestSpecialSigmoidOpts:
def get_mode(self, excluding=None):
"""
Return appropriate mode for the tests.
......@@ -140,271 +67,6 @@ class TestSigmoidOpts:
else:
return mode
# Check that `exp(x) / (1 + exp(x))` and related graphs are rewritten to a
# single `sigmoid` node, and that near-miss expressions (wrong constant,
# wrong sign, shifted argument) are left alone.
def test_exp_over_1_plus_exp(self):
m = self.get_mode(excluding=["local_elemwise_fusion"])
x = vector()
data = np.random.rand(54).astype(config.floatX)
# Silence the known 1+exp warning for the duration of the test.
backup = config.warn__identify_1pexp_bug
config.warn__identify_1pexp_bug = False
try:
# tests exp_over_1_plus_exp
f = aesara.function([x], exp(x) / (1 + exp(x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid]
f(data)
f = aesara.function([x], exp(x) / (2 + exp(x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
f = aesara.function([x], exp(x) / (1 - exp(x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
f = aesara.function([x], exp(x + 1) / (1 + exp(x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
# tests inv_1_plus_exp
f = aesara.function([x], aet.fill(x, 1.0) / (1 + exp(-x)), mode=m)
# todo: solve issue #4589 first
# assert check_stack_trace(f, ops_to_check=sigmoid)
assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid]
f(data)
f = aesara.function([x], aet.fill(x, 1.0) / (2 + exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
f = aesara.function([x], aet.fill(x, 1.0) / (1 - exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
f = aesara.function([x], aet.fill(x, 1.1) / (1 + exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
# tests inv_1_plus_exp with neg
f = aesara.function([x], aet.fill(x, -1.0) / (1 + exp(-x)), mode=m)
# todo: solve issue #4589 first
# assert check_stack_trace(
# f, ops_to_check=[sigmoid, neg_inplace])
assert [node.op for node in f.maker.fgraph.toposort()] == [
sigmoid,
neg_inplace,
]
f(data)
f = aesara.function([x], aet.fill(x, -1.0) / (1 - exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
neg_inplace,
]
f(data)
f = aesara.function([x], aet.fill(x, -1.0) / (2 + exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
neg_inplace,
]
f(data)
f = aesara.function([x], aet.fill(x, -1.1) / (1 + exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
neg_inplace,
]
f(data)
# tests double inv_1_plus_exp with neg
# (-1)(exp(x)) / (1+exp(x))(1+exp(-x))
# = (-1)/(1+exp(-x)) * exp(x)/(1+exp(x))
# = - (sigm(x) * sigm(x))
f = aesara.function(
[x],
(aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (1 + exp(-x))),
mode=m,
)
# todo: solve issue #4589 first
# assert check_stack_trace(f, ops_to_check=[sigmoid, mul])
assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid, mul]
f(data)
f = aesara.function(
[x],
(aet.fill(x, -1.1) * exp(x)) / ((1 + exp(x)) * (1 + exp(-x))),
mode=m,
)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
mul,
neg_inplace,
]
f(data)
f = aesara.function(
[x],
(aet.fill(x, -1.0) * exp(x)) / ((2 + exp(x)) * (1 + exp(-x))),
mode=m,
)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
mul,
neg_inplace,
]
f(data)
f = aesara.function(
[x],
(aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (2 + exp(-x))),
mode=m,
)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
mul,
neg_inplace,
]
f(data)
f = aesara.function(
[x],
(aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (1 + exp(x))),
mode=m,
)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
mul,
neg_inplace,
]
f(data)
# NOTE(review): this case is byte-identical to the
# `(1 + exp(x)) * (2 + exp(-x))` case above -- likely a copy-paste;
# confirm what expression was actually intended here.
f = aesara.function(
[x],
(aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (2 + exp(-x))),
mode=m,
)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
mul,
neg_inplace,
]
f(data)
finally:
# Restore config option.
config.warn__identify_1pexp_bug = backup
# Check that `1 - sigmoid(x)` style graphs are rewritten by the
# `local_1msigmoid` optimization (skipped when it is not registered).
def test_1msigmoid(self):
if not register_local_1msigmoid:
return
m = self.get_mode()
x = fmatrix()
# tests exp_over_1_plus_exp
f = aesara.function([x], 1 - exp(x) / (1 + exp(x)), mode=m)
assert check_stack_trace(f, ops_to_check=[neg, sigmoid_inplace])
assert [node.op for node in f.maker.fgraph.toposort()] == [
neg,
sigmoid_inplace,
]
# tests inv_1_plus_exp
f = aesara.function([x], 1 - aet.fill(x, 1.0) / (1 + exp(-x)), mode=m)
assert check_stack_trace(f, ops_to_check=[neg, sigmoid_inplace])
assert [node.op for node in f.maker.fgraph.toposort()] == [
neg,
sigmoid_inplace,
]
def test_local_sigm_times_exp(self):
# Test the `local_sigm_times_exp` optimization.
# exp(x) * sigm(-x) -> sigm(x)
# exp(-x) * sigm(x) -> sigm(-x)
# Helper: assert the compiled graph is exactly the given op sequence.
def match(func, ops):
# print [node.op.scalar_op for node in func.maker.fgraph.toposort()]
assert [node.op for node in func.maker.fgraph.toposort()] == ops
m = self.get_mode(excluding=["local_elemwise_fusion", "inplace"])
x, y = vectors("x", "y")
f = aesara.function([x], sigmoid(-x) * exp(x), mode=m)
match(f, [sigmoid])
assert check_stack_trace(f, ops_to_check=sigmoid)
f = aesara.function([x], sigmoid(x) * exp(-x), mode=m)
match(f, [neg, sigmoid])
assert check_stack_trace(f, ops_to_check=sigmoid)
f = aesara.function([x], -(-(-(sigmoid(x)))) * exp(-x), mode=m)
match(f, [neg, sigmoid, neg])
# assert check_stack_trace(f, ops_to_check=sigmoid)
f = aesara.function(
[x, y],
(sigmoid(x) * sigmoid(-y) * -exp(-x) * exp(x * y) * exp(y)),
mode=m,
)
topo = f.maker.fgraph.toposort()
# Count how many nodes of each op kind survive the rewrite.
for op, nb in [(sigmoid, 2), (mul, 2), (neg, 1), (exp, 1)]:
assert sum([n.op == op for n in topo]) == nb
# assert check_stack_trace(f, ops_to_check=[sigmoid, mul,
# exp])
def test_perform_sigm_times_exp(self):
# Test the core function doing the `sigm_times_exp` optimization.
#
# It is easier to test different graph scenarios this way than by
# compiling an Aesara function.
x, y, z, t = vectors("x", "y", "z", "t")
exp_op = exp
# Helper: apply the rewrite to `expr1` and assert the simplified result
# is the same graph as `expr2`; print a debug trace on failure.
def ok(expr1, expr2):
trees = [parse_mul_tree(e) for e in (expr1, expr2)]
perform_sigm_times_exp(trees[0])
trees[0] = simplify_mul(trees[0])
good = is_same_graph(compute_mul(trees[0]), compute_mul(trees[1]))
if not good:
print(trees[0])
print(trees[1])
print("***")
aesara.printing.debugprint(compute_mul(trees[0]))
print("***")
aesara.printing.debugprint(compute_mul(trees[1]))
assert good
ok(sigmoid(x) * exp_op(-x), sigmoid(-x))
ok(
-x * sigmoid(x) * (y * (-1 * z) * exp_op(-x)),
-x * sigmoid(-x) * (y * (-1 * z)),
)
ok(
-sigmoid(-x)
* (
exp_op(y)
* (-exp_op(-z) * 3 * -exp_op(x))
* (y * 2 * (-sigmoid(-y) * (z + t) * exp_op(z)) * sigmoid(z))
)
* -sigmoid(x),
sigmoid(x)
* (-sigmoid(y) * (-sigmoid(-z) * 3) * (y * 2 * ((z + t) * exp_op(z))))
* (-sigmoid(x)),
)
ok(
exp_op(-x) * -exp_op(-x) * (-sigmoid(x) * -sigmoid(x)),
-sigmoid(-x) * sigmoid(-x),
)
ok(-exp_op(x) * -sigmoid(-x) * -exp_op(-x), -sigmoid(-x))
# Regression test: the gradient of log(1 - sigmoid(x)) must not produce NaN.
def test_grad_log1msigm(self):
# At some point, this returned nan, because (1 - sigm(x)) was
# on both the numerator and the denominator of a fraction,
# but the two nodes in question had not been merged.
x = matrix("x")
lr = scalar("lr")
s = sigmoid(x)
l = log(1 - s)
c = l.mean()
ux = x - lr * aesara.grad(c, x)
# Before the optimization, inf and NaN will be produced in the graph,
# and DebugMode will complain. Everything is fine afterwards.
mode = self.get_mode()
if not isinstance(mode, aesara.compile.debugmode.DebugMode):
f = aesara.function([x, lr], ux, mode=mode)
ux_v = f([[50]], 0.1)
assert not np.isnan(ux_v)
def test_local_ultra_fast_sigmoid(self):
x = matrix("x")
s = sigmoid(x)
......@@ -444,136 +106,3 @@ class TestSigmoidOpts:
mode2 = mode.excluding("fusion").excluding("inplace")
f2 = aesara.function([x], s, mode=mode2)
assert check_stack_trace(f2, ops_to_check=clip)
# Tests for graph rewrites that introduce `softplus` nodes.
class TestSoftplusOpts:
def setup_method(self):
# Elemwise fusion is excluded so individual ops stay visible in the
# optimized graph; FAST_COMPILE is upgraded to FAST_RUN so the
# rewrites under test actually run.
if aesara.config.mode == "FAST_COMPILE":
m = aesara.compile.mode.get_mode("FAST_RUN").excluding(
"local_elemwise_fusion"
)
else:
m = aesara.compile.mode.get_default_mode().excluding(
"local_elemwise_fusion"
)
self.m = m
utt.seed_rng()
# log(sigmoid(x)) should compile to Neg -> Softplus -> Neg.
def test_logsigm_to_softplus(self):
x = vector()
out = log(sigmoid(x))
f = aesara.function([x], out, mode=self.m)
# Fix ticket #4581 first
# assert check_stack_trace(
# f, ops_to_check=(aesara.scalar.Neg,
# ScalarSoftplus))
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[0].op.scalar_op, aesara.scalar.Neg)
assert isinstance(topo[1].op.scalar_op, Softplus)
assert isinstance(topo[2].op.scalar_op, aesara.scalar.Neg)
f(np.random.rand(54).astype(config.floatX))
# log(1 - sigmoid(x)) should compile to Softplus -> Neg, including when a
# flatten or reshape sits between the sigmoid and the log.
def test_log1msigm_to_softplus(self):
x = matrix()
out = log(1 - sigmoid(x))
f = aesara.function([x], out, mode=self.m)
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
assert isinstance(topo[0].op.scalar_op, Softplus)
assert isinstance(topo[1].op.scalar_op, aesara.scalar.Neg)
# assert check_stack_trace(f, ops_to_check='all')
f(np.random.rand(54, 11).astype(config.floatX))
# Same test with a flatten
out = log(1 - aet.flatten(sigmoid(x)))
f = aesara.function([x], out, mode=self.m)
# assert check_stack_trace(f, ops_to_check='all')
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert aet.is_flat(topo[0].outputs[0])
assert isinstance(topo[1].op.scalar_op, Softplus)
assert isinstance(topo[2].op.scalar_op, aesara.scalar.Neg)
f(np.random.rand(54, 11).astype(config.floatX))
# Same test with a reshape
out = log(1 - sigmoid(x).reshape([x.size]))
f = aesara.function([x], out, mode=self.m)
topo = f.maker.fgraph.toposort()
# assert len(topo) == 3
assert any(isinstance(node.op, Reshape) for node in topo)
assert any(
isinstance(
getattr(node.op, "scalar_op", None),
Softplus,
)
for node in topo
)
f(np.random.rand(54, 11).astype(config.floatX))
# log(1 + exp(x)) should compile to a single Softplus node.
def test_log1pexp_to_softplus(self):
# NOTE(review): the local `m` below is computed but never used -- the
# function is compiled with `self.m`. Dead code; confirm and remove.
m = aesara.config.mode
if m == "FAST_COMPILE":
m = "FAST_RUN"
x = vector()
out = log(1 + exp(x))
f = aesara.function([x], out, mode=self.m)
# Fix ticket #4581 first
# assert check_stack_trace(f, ops_to_check='all')
topo = f.maker.fgraph.toposort()
assert len(topo) == 1
assert isinstance(topo[0].op.scalar_op, Softplus)
f(np.random.rand(54).astype(config.floatX))
class TestSigmoidUtils:
"""
Test utility functions found in 'sigm.py'.
"""
# `compute_mul` and `parse_mul_tree` must be inverses of each other.
def test_compute_mul(self):
x, y, z = vectors("x", "y", "z")
tree = (x * y) * -z
mul_tree = parse_mul_tree(tree)
assert parse_mul_tree(compute_mul(mul_tree)) == mul_tree
assert is_same_graph(compute_mul(parse_mul_tree(tree)), tree)
# Trees are nested `[negated, subtree-or-variable]` pairs.
def test_parse_mul_tree(self):
x, y, z = vectors("x", "y", "z")
assert parse_mul_tree(x * y) == [False, [[False, x], [False, y]]]
assert parse_mul_tree(-(x * y)) == [True, [[False, x], [False, y]]]
assert parse_mul_tree(-x * y) == [False, [[True, x], [False, y]]]
assert parse_mul_tree(-x) == [True, x]
assert parse_mul_tree((x * y) * -z) == [
False,
[[False, [[False, x], [False, y]]], [True, z]],
]
# `is_1pexp` recognizes exactly `1 + exp(v)` (in either operand order)
# and returns (negated, v); anything else returns None.
def test_is_1pexp(self):
backup = config.warn__identify_1pexp_bug
config.warn__identify_1pexp_bug = False
try:
x = vector("x")
exp_op = exp
assert is_1pexp(1 + exp_op(x), False) == (False, x)
assert is_1pexp(exp_op(x) + 1, False) == (False, x)
for neg_, exp_arg in map(
lambda x: is_1pexp(x, only_process_constants=False),
[(1 + exp_op(-x)), (exp_op(-x) + 1)],
):
assert not neg_ and is_same_graph(exp_arg, -x)
assert is_1pexp(1 - exp_op(x), False) is None
assert is_1pexp(2 + exp_op(x), False) is None
assert is_1pexp(exp_op(x) + 2, False) is None
assert is_1pexp(exp_op(x) - 1, False) is None
assert is_1pexp(-1 + exp_op(x), False) is None
assert is_1pexp(1 + 2 * exp_op(x), False) is None
finally:
config.warn__identify_1pexp_bug = backup
......@@ -17,13 +17,17 @@ from tests.tensor.utils import (
_good_broadcast_unary_normal_float,
_good_broadcast_unary_normal_float_no_complex,
_good_broadcast_unary_normal_float_no_complex_small_neg_range,
_good_broadcast_unary_normal_no_complex,
_grad_broadcast_unary_0_2_no_complex,
_grad_broadcast_unary_abs1_no_complex,
_grad_broadcast_unary_normal,
_grad_broadcast_unary_normal_small_neg_range,
check_floatX,
copymod,
makeBroadcastTester,
rand_ranged,
randint_ranged,
upcast_int8_nfunc,
)
......@@ -72,6 +76,7 @@ if imported_scipy_special:
expected_i1 = scipy.special.i1
expected_iv = scipy.special.iv
expected_erfcx = scipy.special.erfcx
expected_sigmoid = scipy.special.expit
skip_scipy = False
else:
expected_erf = []
......@@ -94,6 +99,11 @@ else:
expected_i0 = []
expected_i1 = []
expected_iv = []
# Fallback reference for `sigmoid` when SciPy (scipy.special.expit) is
# unavailable; the suite is skipped in that case via `skip_scipy` anyway.
# Fixed two defects: (a) a stray trailing comma wrapped the callable in a
# 1-tuple, and (b) the formula was softplus `log1p(exp(x))` instead of the
# logistic sigmoid `1 / (1 + exp(-x))` (which is what `expit` computes).
expected_sigmoid = upcast_int8_nfunc(
    lambda inputs: check_floatX(inputs, 1 / (1 + np.exp(-inputs)))
)
skip_scipy = "scipy is not present"
TestErfBroadcast = makeBroadcastTester(
......@@ -563,3 +573,75 @@ def test_verify_iv_grad():
return aet.iv(v_val, x)
utt.verify_grad(fixed_first_input_iv, [x_val])
# Broadcast test suite for the out-of-place `sigmoid` op.
TestSigmoidBroadcast = makeBroadcastTester(
op=aet.sigmoid,
expected=expected_sigmoid,
good=_good_broadcast_unary_normal_no_complex,
eps=1e-8,
)
# Broadcast test suite for the in-place variant of `sigmoid`, including a
# gradient check.
TestSigmoidInplaceBroadcast = makeBroadcastTester(
op=inplace.sigmoid_inplace,
expected=expected_sigmoid,
good=_good_broadcast_unary_normal_no_complex,
grad=_grad_broadcast_unary_normal,
eps=1e-8,
inplace=True,
)
class TestSigmoid:
    """Direct (non-broadcast) checks for `aet.sigmoid`."""

    def setup_method(self):
        # Make the random gradient check below reproducible.
        utt.seed_rng()

    def test_elemwise(self):
        # Numerically verify d(sigmoid)/dx on a random 3x4 matrix.
        inp = np.random.rand(3, 4)
        utt.verify_grad(aet.sigmoid, [inp])
# Inputs for the softplus broadcast tests: the standard unary inputs with
# the integer ranges narrowed so NumPy's log1p(exp(x)) does not overflow.
_good_broadcast_unary_softplus = dict(
copymod(
_good_broadcast_unary_normal_no_complex,
without=["uint8", "uint16", "big_scalar"],
), # numpy function overflows with uint16.
uint8=[
np.arange(0, 89, dtype="uint8")
], # a narrower range is used for the re-added uint8 case.
int8=[np.arange(-127, 89, dtype="int8")],
)
# NumPy reference implementation of softplus: log1p(exp(x)).
# NOTE(review): the name is misspelled ("sofplus"); it is referenced by the
# testers below, so renaming requires a coordinated change.
expected_sofplus = upcast_int8_nfunc(
lambda inputs: check_floatX(inputs, np.log1p(np.exp(inputs)))
)
# Broadcast test suite for the out-of-place `softplus` op.
TestSoftplusBroadcast = makeBroadcastTester(
op=aet.softplus,
expected=expected_sofplus,
good=_good_broadcast_unary_softplus,
eps=1e-8,
)
# Broadcast test suite for the in-place variant of `softplus`, including a
# gradient check.
TestSoftplusInplaceBroadcast = makeBroadcastTester(
op=inplace.softplus_inplace,
expected=expected_sofplus,
good=_good_broadcast_unary_softplus,
grad=_grad_broadcast_unary_normal,
eps=1e-8,
inplace=True,
)
class TestSoftplus:
# Seed the RNG so the random gradient check below is reproducible.
def setup_method(self):
utt.seed_rng()
# Numerically verify the gradient of `softplus`.
def test_elemwise(self):
utt.verify_grad(aet.softplus, [np.random.rand(3, 4)])
def test_accuracy(self):
# Test all approximations are working (cutoff points are -37, 18, 33.3)
x_test = np.array([-40.0, -17.5, 17.5, 18.5, 40.0])
y_th = aet.softplus(x_test).eval()
y_np = np.log1p(np.exp(x_test))
# rtol=10e-10 is 1e-9.
np.testing.assert_allclose(y_th, y_np, rtol=10e-10)
......@@ -93,6 +93,7 @@ from aesara.tensor.math import (
round_half_away_from_zero,
round_half_to_even,
sgn,
sigmoid,
sin,
sinh,
smallest,
......@@ -102,7 +103,6 @@ from aesara.tensor.math import (
)
from aesara.tensor.math import sum as aet_sum
from aesara.tensor.math import tan, tanh, tensordot, true_div, trunc, var
from aesara.tensor.nnet import sigmoid
from aesara.tensor.type import (
TensorType,
complex_dtypes,
......
......@@ -20,6 +20,7 @@ from aesara.graph.basic import Constant
from aesara.graph.fg import FunctionGraph
from aesara.graph.opt import LocalOptGroup, TopoOptimizer, check_stack_trace, out2in
from aesara.graph.optdb import Query
from aesara.graph.toolbox import is_same_graph
from aesara.misc.safe_asarray import _asarray
from aesara.tensor import inplace
from aesara.tensor.basic import Alloc, join, switch
......@@ -68,16 +69,22 @@ from aesara.tensor.math import minimum, mul, neg, neq
from aesara.tensor.math import pow as aet_pow
from aesara.tensor.math import prod, rad2deg
from aesara.tensor.math import round as aet_round
from aesara.tensor.math import sgn, sin, sinh, sqr, sqrt, sub
from aesara.tensor.math import sgn, sigmoid, sin, sinh, sqr, sqrt, sub
from aesara.tensor.math import sum as aet_sum
from aesara.tensor.math import tan, tanh, true_div, xor
from aesara.tensor.math_opt import (
compute_mul,
is_1pexp,
local_add_specialize,
local_grad_log_erfc_neg,
local_greedy_distributor,
mul_canonizer,
parse_mul_tree,
perform_sigm_times_exp,
register_local_1msigmoid,
simplify_mul,
)
from aesara.tensor.shape import Shape_i
from aesara.tensor.shape import Reshape, Shape_i
from aesara.tensor.type import (
TensorType,
cmatrix,
......@@ -3991,3 +3998,426 @@ def test_local_log_sum_exp3():
optimised_ret = f(x_val)
assert np.allclose(optimised_ret, 100.0)
# Tests for the graph rewrites that introduce `sigmoid` nodes.
class TestSigmoidOpts:
def get_mode(self, excluding=None):
"""
Return appropriate mode for the tests.
:param excluding: List of optimizations to exclude.
:return: The current default mode unless the `config.mode` option is
set to 'FAST_COMPILE' (in which case it is replaced by the 'FAST_RUN'
mode), without the optimizations specified in `excluding`.
"""
if excluding is None:
excluding = []
m = config.mode
if m == "FAST_COMPILE":
mode = aesara.compile.mode.get_mode("FAST_RUN")
else:
mode = aesara.compile.mode.get_default_mode()
if excluding:
return mode.excluding(*excluding)
else:
return mode
# Check that `exp(x) / (1 + exp(x))` and related graphs are rewritten to a
# single `sigmoid` node, and that near-miss expressions (wrong constant,
# wrong sign, shifted argument) are left alone.
def test_exp_over_1_plus_exp(self):
m = self.get_mode(excluding=["local_elemwise_fusion"])
x = vector()
data = np.random.rand(54).astype(config.floatX)
# Silence the known 1+exp warning for the duration of the test.
backup = config.warn__identify_1pexp_bug
config.warn__identify_1pexp_bug = False
try:
# tests exp_over_1_plus_exp
f = aesara.function([x], exp(x) / (1 + exp(x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid]
f(data)
f = aesara.function([x], exp(x) / (2 + exp(x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
f = aesara.function([x], exp(x) / (1 - exp(x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
f = aesara.function([x], exp(x + 1) / (1 + exp(x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
# tests inv_1_plus_exp
f = aesara.function([x], aet.fill(x, 1.0) / (1 + exp(-x)), mode=m)
# todo: solve issue #4589 first
# assert check_stack_trace(f, ops_to_check=sigmoid)
assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid]
f(data)
f = aesara.function([x], aet.fill(x, 1.0) / (2 + exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
f = aesara.function([x], aet.fill(x, 1.0) / (1 - exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
f = aesara.function([x], aet.fill(x, 1.1) / (1 + exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
f(data)
# tests inv_1_plus_exp with neg
f = aesara.function([x], aet.fill(x, -1.0) / (1 + exp(-x)), mode=m)
# todo: solve issue #4589 first
# assert check_stack_trace(
# f, ops_to_check=[sigmoid, neg_inplace])
assert [node.op for node in f.maker.fgraph.toposort()] == [
sigmoid,
inplace.neg_inplace,
]
f(data)
f = aesara.function([x], aet.fill(x, -1.0) / (1 - exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
inplace.neg_inplace,
]
f(data)
f = aesara.function([x], aet.fill(x, -1.0) / (2 + exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
inplace.neg_inplace,
]
f(data)
f = aesara.function([x], aet.fill(x, -1.1) / (1 + exp(-x)), mode=m)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
inplace.neg_inplace,
]
f(data)
# tests double inv_1_plus_exp with neg
# (-1)(exp(x)) / (1+exp(x))(1+exp(-x))
# = (-1)/(1+exp(-x)) * exp(x)/(1+exp(x))
# = - (sigm(x) * sigm(x))
f = aesara.function(
[x],
(aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (1 + exp(-x))),
mode=m,
)
# todo: solve issue #4589 first
# assert check_stack_trace(f, ops_to_check=[sigmoid, mul])
assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid, mul]
f(data)
f = aesara.function(
[x],
(aet.fill(x, -1.1) * exp(x)) / ((1 + exp(x)) * (1 + exp(-x))),
mode=m,
)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
mul,
inplace.neg_inplace,
]
f(data)
f = aesara.function(
[x],
(aet.fill(x, -1.0) * exp(x)) / ((2 + exp(x)) * (1 + exp(-x))),
mode=m,
)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
mul,
inplace.neg_inplace,
]
f(data)
f = aesara.function(
[x],
(aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (2 + exp(-x))),
mode=m,
)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
mul,
inplace.neg_inplace,
]
f(data)
f = aesara.function(
[x],
(aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (1 + exp(x))),
mode=m,
)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
mul,
inplace.neg_inplace,
]
f(data)
# NOTE(review): this case is byte-identical to the
# `(1 + exp(x)) * (2 + exp(-x))` case above -- likely a copy-paste;
# confirm what expression was actually intended here.
f = aesara.function(
[x],
(aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (2 + exp(-x))),
mode=m,
)
assert [node.op for node in f.maker.fgraph.toposort()] != [
sigmoid,
mul,
inplace.neg_inplace,
]
f(data)
finally:
# Restore config option.
config.warn__identify_1pexp_bug = backup
# Check that `1 - sigmoid(x)` style graphs are rewritten by the
# `local_1msigmoid` optimization (skipped when it is not registered).
def test_1msigmoid(self):
if not register_local_1msigmoid:
return
m = self.get_mode()
x = fmatrix()
# tests exp_over_1_plus_exp
f = aesara.function([x], 1 - exp(x) / (1 + exp(x)), mode=m)
assert check_stack_trace(f, ops_to_check=[neg, inplace.sigmoid_inplace])
assert [node.op for node in f.maker.fgraph.toposort()] == [
neg,
inplace.sigmoid_inplace,
]
# tests inv_1_plus_exp
f = aesara.function([x], 1 - aet.fill(x, 1.0) / (1 + exp(-x)), mode=m)
assert check_stack_trace(f, ops_to_check=[neg, inplace.sigmoid_inplace])
assert [node.op for node in f.maker.fgraph.toposort()] == [
neg,
inplace.sigmoid_inplace,
]
def test_local_sigm_times_exp(self):
# Test the `local_sigm_times_exp` optimization.
# exp(x) * sigm(-x) -> sigm(x)
# exp(-x) * sigm(x) -> sigm(-x)
# Helper: assert the compiled graph is exactly the given op sequence.
def match(func, ops):
# print [node.op.scalar_op for node in func.maker.fgraph.toposort()]
assert [node.op for node in func.maker.fgraph.toposort()] == ops
m = self.get_mode(excluding=["local_elemwise_fusion", "inplace"])
x, y = vectors("x", "y")
f = aesara.function([x], sigmoid(-x) * exp(x), mode=m)
match(f, [sigmoid])
assert check_stack_trace(f, ops_to_check=sigmoid)
f = aesara.function([x], sigmoid(x) * exp(-x), mode=m)
match(f, [neg, sigmoid])
assert check_stack_trace(f, ops_to_check=sigmoid)
f = aesara.function([x], -(-(-(sigmoid(x)))) * exp(-x), mode=m)
match(f, [neg, sigmoid, neg])
# assert check_stack_trace(f, ops_to_check=sigmoid)
f = aesara.function(
[x, y],
(sigmoid(x) * sigmoid(-y) * -exp(-x) * exp(x * y) * exp(y)),
mode=m,
)
topo = f.maker.fgraph.toposort()
# Count how many nodes of each op kind survive the rewrite.
for op, nb in [(sigmoid, 2), (mul, 2), (neg, 1), (exp, 1)]:
assert sum([n.op == op for n in topo]) == nb
# assert check_stack_trace(f, ops_to_check=[sigmoid, mul,
# exp])
def test_perform_sigm_times_exp(self):
# Test the core function doing the `sigm_times_exp` optimization.
#
# It is easier to test different graph scenarios this way than by
# compiling an Aesara function.
x, y, z, t = vectors("x", "y", "z", "t")
exp_op = exp
# Helper: apply the rewrite to `expr1` and assert the simplified result
# is the same graph as `expr2`; print a debug trace on failure.
def ok(expr1, expr2):
trees = [parse_mul_tree(e) for e in (expr1, expr2)]
perform_sigm_times_exp(trees[0])
trees[0] = simplify_mul(trees[0])
good = is_same_graph(compute_mul(trees[0]), compute_mul(trees[1]))
if not good:
print(trees[0])
print(trees[1])
print("***")
aesara.printing.debugprint(compute_mul(trees[0]))
print("***")
aesara.printing.debugprint(compute_mul(trees[1]))
assert good
ok(sigmoid(x) * exp_op(-x), sigmoid(-x))
ok(
-x * sigmoid(x) * (y * (-1 * z) * exp_op(-x)),
-x * sigmoid(-x) * (y * (-1 * z)),
)
ok(
-sigmoid(-x)
* (
exp_op(y)
* (-exp_op(-z) * 3 * -exp_op(x))
* (y * 2 * (-sigmoid(-y) * (z + t) * exp_op(z)) * sigmoid(z))
)
* -sigmoid(x),
sigmoid(x)
* (-sigmoid(y) * (-sigmoid(-z) * 3) * (y * 2 * ((z + t) * exp_op(z))))
* (-sigmoid(x)),
)
ok(
exp_op(-x) * -exp_op(-x) * (-sigmoid(x) * -sigmoid(x)),
-sigmoid(-x) * sigmoid(-x),
)
ok(-exp_op(x) * -sigmoid(-x) * -exp_op(-x), -sigmoid(-x))
# Regression test: the gradient of log(1 - sigmoid(x)) must not produce NaN.
def test_grad_log1msigm(self):
# At some point, this returned nan, because (1 - sigm(x)) was
# on both the numerator and the denominator of a fraction,
# but the two nodes in question had not been merged.
x = matrix("x")
lr = scalar("lr")
s = sigmoid(x)
l = log(1 - s)
c = l.mean()
ux = x - lr * aesara.grad(c, x)
# Before the optimization, inf and NaN will be produced in the graph,
# and DebugMode will complain. Everything is fine afterwards.
mode = self.get_mode()
if not isinstance(mode, aesara.compile.debugmode.DebugMode):
f = aesara.function([x, lr], ux, mode=mode)
ux_v = f([[50]], 0.1)
assert not np.isnan(ux_v)
class TestSoftplusOpts:
    """Tests for graph rewrites that turn log/sigmoid graphs into `softplus`."""

    def setup_method(self):
        # Elemwise fusion is excluded so individual ops stay visible in the
        # optimized graph; FAST_COMPILE is upgraded to FAST_RUN so the
        # rewrites under test actually run.
        if aesara.config.mode == "FAST_COMPILE":
            m = aesara.compile.mode.get_mode("FAST_RUN").excluding(
                "local_elemwise_fusion"
            )
        else:
            m = aesara.compile.mode.get_default_mode().excluding(
                "local_elemwise_fusion"
            )
        self.m = m
        utt.seed_rng()

    def test_logsigm_to_softplus(self):
        # log(sigmoid(x)) should compile to Neg -> Softplus -> Neg.
        x = vector()
        out = log(sigmoid(x))
        f = aesara.function([x], out, mode=self.m)
        # Fix ticket #4581 first
        # assert check_stack_trace(
        #     f, ops_to_check=(aesara.scalar.Neg,
        #                      ScalarSoftplus))
        topo = f.maker.fgraph.toposort()
        assert len(topo) == 3
        assert isinstance(topo[0].op.scalar_op, aesara.scalar.Neg)
        assert isinstance(topo[1].op.scalar_op, aesara.scalar.Softplus)
        assert isinstance(topo[2].op.scalar_op, aesara.scalar.Neg)
        f(np.random.rand(54).astype(config.floatX))

    def test_log1msigm_to_softplus(self):
        # log(1 - sigmoid(x)) should compile to Softplus -> Neg, including
        # when a flatten or reshape sits between the sigmoid and the log.
        x = matrix()
        out = log(1 - sigmoid(x))
        f = aesara.function([x], out, mode=self.m)
        topo = f.maker.fgraph.toposort()
        assert len(topo) == 2
        assert isinstance(topo[0].op.scalar_op, aesara.scalar.Softplus)
        assert isinstance(topo[1].op.scalar_op, aesara.scalar.Neg)
        # assert check_stack_trace(f, ops_to_check='all')
        f(np.random.rand(54, 11).astype(config.floatX))
        # Same test with a flatten
        out = log(1 - aet.flatten(sigmoid(x)))
        f = aesara.function([x], out, mode=self.m)
        # assert check_stack_trace(f, ops_to_check='all')
        topo = f.maker.fgraph.toposort()
        assert len(topo) == 3
        assert aet.is_flat(topo[0].outputs[0])
        assert isinstance(topo[1].op.scalar_op, aesara.scalar.Softplus)
        assert isinstance(topo[2].op.scalar_op, aesara.scalar.Neg)
        f(np.random.rand(54, 11).astype(config.floatX))
        # Same test with a reshape
        out = log(1 - sigmoid(x).reshape([x.size]))
        f = aesara.function([x], out, mode=self.m)
        topo = f.maker.fgraph.toposort()
        # assert len(topo) == 3
        assert any(isinstance(node.op, Reshape) for node in topo)
        assert any(
            isinstance(
                getattr(node.op, "scalar_op", None),
                aesara.scalar.Softplus,
            )
            for node in topo
        )
        f(np.random.rand(54, 11).astype(config.floatX))

    def test_log1pexp_to_softplus(self):
        # log(1 + exp(x)) should compile to a single Softplus node.
        # (A previous local recomputation of the compilation mode here was
        # dead code: `self.m`, set in `setup_method`, is what is used.)
        x = vector()
        out = log(1 + exp(x))
        f = aesara.function([x], out, mode=self.m)
        # Fix ticket #4581 first
        # assert check_stack_trace(f, ops_to_check='all')
        topo = f.maker.fgraph.toposort()
        assert len(topo) == 1
        assert isinstance(topo[0].op.scalar_op, aesara.scalar.Softplus)
        f(np.random.rand(54).astype(config.floatX))
class TestSigmoidUtils:
"""
Test utility functions found in 'math_opt.py' used in the optimization of
sigmoid / softplus expressions.
"""
# `compute_mul` and `parse_mul_tree` must be inverses of each other.
def test_compute_mul(self):
x, y, z = vectors("x", "y", "z")
tree = (x * y) * -z
mul_tree = parse_mul_tree(tree)
assert parse_mul_tree(compute_mul(mul_tree)) == mul_tree
assert is_same_graph(compute_mul(parse_mul_tree(tree)), tree)
# Trees are nested `[negated, subtree-or-variable]` pairs.
def test_parse_mul_tree(self):
x, y, z = vectors("x", "y", "z")
assert parse_mul_tree(x * y) == [False, [[False, x], [False, y]]]
assert parse_mul_tree(-(x * y)) == [True, [[False, x], [False, y]]]
assert parse_mul_tree(-x * y) == [False, [[True, x], [False, y]]]
assert parse_mul_tree(-x) == [True, x]
assert parse_mul_tree((x * y) * -z) == [
False,
[[False, [[False, x], [False, y]]], [True, z]],
]
# `is_1pexp` recognizes exactly `1 + exp(v)` (in either operand order)
# and returns (negated, v); anything else returns None.
def test_is_1pexp(self):
backup = config.warn__identify_1pexp_bug
config.warn__identify_1pexp_bug = False
try:
x = vector("x")
exp_op = exp
assert is_1pexp(1 + exp_op(x), False) == (False, x)
assert is_1pexp(exp_op(x) + 1, False) == (False, x)
for neg_, exp_arg in map(
lambda x: is_1pexp(x, only_process_constants=False),
[(1 + exp_op(-x)), (exp_op(-x) + 1)],
):
assert not neg_ and is_same_graph(exp_arg, -x)
assert is_1pexp(1 - exp_op(x), False) is None
assert is_1pexp(2 + exp_op(x), False) is None
assert is_1pexp(exp_op(x) + 2, False) is None
assert is_1pexp(exp_op(x) - 1, False) is None
assert is_1pexp(-1 + exp_op(x), False) is None
assert is_1pexp(1 + 2 * exp_op(x), False) is None
finally:
config.warn__identify_1pexp_bug = backup
Markdown formatting is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment