提交 e40c1b29 authored 作者: Brandon T. Willard's avatar Brandon T. Willard 提交者: Brandon T. Willard

Documentation formatting and NumPy usage updates

上级 8b7446e0
......@@ -146,7 +146,7 @@ class OpFromGraph(Op, HasInnerGraph):
from aesara.compile.builders import OpFromGraph
x, y, z = at.scalars('xyz')
s = aesara.shared(np.random.rand(2, 2).astype(config.floatX))
s = aesara.shared(np.random.random((2, 2)).astype(config.floatX))
e = x + y * z + s
op = OpFromGraph([x, y, z], [e])
# op behaves like a normal aesara op
......
......@@ -5,6 +5,7 @@ import time
import warnings
from collections import OrderedDict
from functools import partial, reduce
from typing import TYPE_CHECKING, Callable, List, Optional, Union
import numpy as np
......@@ -18,6 +19,10 @@ from aesara.graph.op import get_test_values
from aesara.graph.type import Type
if TYPE_CHECKING:
from aesara.compile.mode import Mode
__docformat__ = "restructuredtext en"
_logger = logging.getLogger("aesara.gradient")
......@@ -684,8 +689,8 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
.. code-block:: python
x, t = aesara.tensor.fvector('x'), aesara.tensor.fvector('t')
w1 = aesara.shared(np.random.randn(3,4))
w2 = aesara.shared(np.random.randn(4,2))
w1 = aesara.shared(np.random.standard_normal((3,4)))
w2 = aesara.shared(np.random.standard_normal((4,2)))
a1 = aesara.tensor.tanh(aesara.tensor.dot(x,w1))
a2 = aesara.tensor.tanh(aesara.tensor.dot(a1,w2))
cost2 = aesara.tensor.sqr(a2 - t).sum()
......@@ -1690,17 +1695,17 @@ def mode_not_slow(mode):
def verify_grad(
fun,
pt,
n_tests=2,
rng=None,
eps=None,
out_type=None,
abs_tol=None,
rel_tol=None,
mode=None,
cast_to_output_type=False,
no_debug_ref=True,
fun: Callable,
pt: List[np.ndarray],
n_tests: int = 2,
rng: Optional[Union[np.random.Generator, np.random.RandomState]] = None,
eps: Optional[float] = None,
out_type: Optional[str] = None,
abs_tol: Optional[float] = None,
rel_tol: Optional[float] = None,
mode: Optional[Union["Mode", str]] = None,
cast_to_output_type: bool = False,
no_debug_ref: bool = True,
):
"""Test a gradient by Finite Difference Method. Raise error on failure.
......@@ -1713,47 +1718,47 @@ def verify_grad(
--------
>>> verify_grad(aesara.tensor.tanh,
... (np.asarray([[2, 3, 4], [-1, 3.3, 9.9]]),),
... rng=np.random)
... rng=np.random.default_rng(23098))
Parameters
----------
fun : a Python function
fun
`fun` takes Aesara variables as inputs, and returns an Aesara variable.
For instance, an Op instance with a single output.
pt : list of numpy.ndarrays
For instance, an `Op` instance with a single output.
pt
Input values, points where the gradient is estimated.
These arrays must be either float16, float32, or float64 arrays.
n_tests : int
Number of times to run the test
rng : numpy.random.RandomState
n_tests
Number of times to run the test.
rng
Random number generator used to sample the output random projection `u`,
we test gradient of sum(u * fun) at `pt`
eps : float, optional
we test gradient of ``sum(u * fun)`` at `pt`.
eps
Step size used in the Finite Difference Method (Default
None is type-dependent).
Raising the value of eps can raise or lower the absolute
``None`` is type-dependent).
Raising the value of `eps` can raise or lower the absolute
and relative errors of the verification depending on the
Op. Raising eps does not lower the verification quality for
`Op`. Raising `eps` does not lower the verification quality for
linear operations. It is better to raise `eps` than raising
`abs_tol` or `rel_tol`.
out_type : string
Dtype of output, if complex (i.e., 'complex32' or 'complex64')
abs_tol : float
out_type
Dtype of output, if complex (i.e., ``'complex32'`` or ``'complex64'``)
abs_tol
Absolute tolerance used as threshold for gradient comparison
rel_tol : float
rel_tol
Relative tolerance used as threshold for gradient comparison
cast_to_output_type : bool
If the output is float32 and cast_to_output_type is True, cast
the random projection to float32. Otherwise it is float64.
cast_to_output_type
If the output is float32 and `cast_to_output_type` is ``True``, cast
the random projection to float32; otherwise, it is float64.
float16 is not handled here.
no_debug_ref : bool
Don't use DebugMode for the numerical gradient function.
no_debug_ref
Don't use `DebugMode` for the numerical gradient function.
Notes
-----
This function does not support multiple outputs. In
tests/scan/test_basic.py there is an experimental `verify_grad` that covers
that case as well by using random projections.
This function does not support multiple outputs. In `tests.scan.test_basic`
there is an experimental `verify_grad` that covers that case as well by
using random projections.
"""
from aesara.compile.function import function
......
......@@ -404,13 +404,14 @@ You can try the new :class:`Op` as follows:
.. testcode:: example
import numpy as np
import aesara
x = aesara.tensor.matrix()
f = aesara.function([x], DoubleOp1()(x))
import numpy
inp = numpy.random.rand(5, 4)
inp = np.random.random_sample((5, 4))
out = f(inp)
assert numpy.allclose(inp * 2, out)
assert np.allclose(inp * 2, out)
print(inp)
print(out)
......@@ -435,13 +436,14 @@ You can try the new :class:`Op` as follows:
.. testcode:: example
import numpy as np
import aesara
x = aesara.tensor.matrix()
f = aesara.function([x], DoubleOp2()(x))
import numpy
inp = numpy.random.rand(5, 4)
inp = np.random.random_sample((5, 4))
out = f(inp)
assert numpy.allclose(inp * 2, out)
assert np.allclose(inp * 2, out)
print(inp)
print(out)
......@@ -530,10 +532,9 @@ We can test this by running the following segment:
f = aesara.function([x], mult4plus5op(x))
g = aesara.function([x], mult2plus3op(x))
import numpy
inp = numpy.random.rand(5, 4).astype(numpy.float32)
assert numpy.allclose(4 * inp + 5, f(inp))
assert numpy.allclose(2 * inp + 3, g(inp))
inp = np.random.random_sample((5, 4)).astype(np.float32)
assert np.allclose(4 * inp + 5, f(inp))
assert np.allclose(2 * inp + 3, g(inp))
How To Test it
......@@ -553,11 +554,11 @@ returns the right answer. If you detect an error, you must raise an
.. testcode:: tests
import numpy
import numpy as np
import aesara
from tests import unittest_tools as utt
from aesara.configdefaults import config
class TestDouble(utt.InferShapeTester):
def setup_method(self):
super().setup_method()
......@@ -565,9 +566,12 @@ returns the right answer. If you detect an error, you must raise an
self.op = DoubleOp()
def test_basic(self):
rng = np.random.default_rng(utt.fetch_seed())
x = aesara.tensor.matrix()
f = aesara.function([x], self.op(x))
inp = numpy.asarray(numpy.random.rand(5, 4), dtype=config.floatX)
inp = np.asarray(rng.random((5, 4)), dtype=aesara.config.floatX)
out = f(inp)
# Compare the result computed to the expected value.
utt.assert_allclose(inp * 2, out)
......@@ -612,20 +616,26 @@ your :class:`Op` works only with such matrices, you can disable the warning with
.. testcode:: tests
from tests import unittest_tools as utt
from aesara.configdefaults import config
from tests import unittest_tools as utt
class TestDouble(utt.InferShapeTester):
# [...] as previous tests.
def test_infer_shape(self):
rng = np.random.default_rng(utt.fetch_seed())
x = aesara.tensor.matrix()
self._compile_and_check([x], # aesara.function inputs
[self.op(x)], # aesara.function outputs
# Always use not square matrix!
# inputs data
[numpy.asarray(numpy.random.rand(5, 4),
dtype=config.floatX)],
# Op that should be removed from the graph.
self.op_class)
self._compile_and_check(
[x], # aesara.function inputs
[self.op(x)], # aesara.function outputs
# Always use not square matrix!
# inputs data
[np.asarray(rng.random((5, 4)), dtype=config.floatX)],
# Op that should be removed from the graph.
self.op_class,
)
Testing the gradient
^^^^^^^^^^^^^^^^^^^^
......@@ -642,8 +652,11 @@ the multiplication by 2).
.. testcode:: tests
def test_grad(self):
tests.unittest_tools.verify_grad(self.op,
[numpy.random.rand(5, 7, 2)])
rng = np.random.default_rng(utt.fetch_seed())
tests.unittest_tools.verify_grad(
self.op,
[rng.random((5, 7, 2))]
)
Testing the Rop
^^^^^^^^^^^^^^^
......@@ -778,40 +791,34 @@ signature:
.. testcode:: asop
import aesara
import numpy
import aesara.tensor as at
import numpy as np
from aesara import function
from aesara.compile.ops import as_op
def infer_shape_numpy_dot(fgraph, node, input_shapes):
ashp, bshp = input_shapes
return [ashp[:-1] + bshp[-1:]]
@as_op(itypes=[aesara.tensor.fmatrix, aesara.tensor.fmatrix],
otypes=[aesara.tensor.fmatrix], infer_shape=infer_shape_numpy_dot)
@as_op(itypes=[at.matrix, at.matrix],
otypes=[at.matrix], infer_shape=infer_shape_numpy_dot)
def numpy_dot(a, b):
return numpy.dot(a, b)
return np.dot(a, b)
You can try it as follows:
.. testcode:: asop
x = aesara.tensor.fmatrix()
y = aesara.tensor.fmatrix()
x = at.matrix()
y = at.matrix()
f = function([x, y], numpy_dot(x, y))
inp1 = numpy.random.rand(5, 4).astype('float32')
inp2 = numpy.random.rand(4, 7).astype('float32')
inp1 = np.random.random_sample((5, 4))
inp2 = np.random.random_sample((4, 7))
out = f(inp1, inp2)
Exercise
^^^^^^^^
Run the code of the ``numpy_dot`` example above.
Modify and execute to compute: ``numpy.add`` and ``numpy.subtract``.
Modify and execute the example to return two outputs: ``x + y`` and ``x - y``.
.. _Documentation:
Documentation and Coding Style
......@@ -822,7 +829,7 @@ will not be accepted.
:class:`NanGuardMode` and :class:`AllocEmpty`
---------------------------------------------
:class:`NanGuardMode` help users find where in the graph NaN appear. But
:class:`NanGuardMode` helps users find where ``NaN`` values appear in the graph. But
sometimes, we want some variables to not be checked. For example, in
the old GPU back-end, we used a float32 :class:`CudaNdarray` to store the MRG
random number generator state (they are integers). So if :class:`NanGuardMode`
......
......@@ -81,60 +81,60 @@ from aesara.tensor.type import dmatrix, matrix
class TestProdOp(utt.InferShapeTester):
rng = np.random.RandomState(43)
def setup_method(self):
super().setup_method()
self.op_class = ProdOp # case 1
def test_perform(self):
rng = np.random.default_rng(43)
x = matrix()
y = matrix()
f = aesara.function([x, y], self.op_class()(x, y))
x_val = np.random.rand(5, 4)
y_val = np.random.rand(5, 4)
x_val = rng.random((5, 4))
y_val = rng.random((5, 4))
out = f(x_val, y_val)
assert np.allclose(x_val * y_val, out)
def test_gradient(self):
rng = np.random.default_rng(43)
utt.verify_grad(
self.op_class(),
[np.random.rand(5, 4), np.random.rand(5, 4)],
[rng.random((5, 4)), rng.random((5, 4))],
n_tests=1,
rng=TestProdOp.rng,
)
def test_infer_shape(self):
rng = np.random.default_rng(43)
x = dmatrix()
y = dmatrix()
self._compile_and_check(
[x, y],
[self.op_class()(x, y)],
[np.random.rand(5, 6), np.random.rand(5, 6)],
[rng.random((5, 6)), rng.random((5, 6))],
self.op_class,
)
class TestSumDiffOp(utt.InferShapeTester):
rng = np.random.RandomState(43)
def setup_method(self):
super().setup_method()
self.op_class = SumDiffOp
def test_perform(self):
rng = np.random.RandomState(43)
x = matrix()
y = matrix()
f = aesara.function([x, y], self.op_class()(x, y))
x_val = np.random.rand(5, 4)
y_val = np.random.rand(5, 4)
x_val = rng.random((5, 4))
y_val = rng.random((5, 4))
out = f(x_val, y_val)
assert np.allclose([x_val + y_val, x_val - y_val], out)
def test_gradient(self):
rng = np.random.RandomState(43)
def output_0(x, y):
return self.op_class()(x, y)[0]
......@@ -143,18 +143,20 @@ class TestSumDiffOp(utt.InferShapeTester):
utt.verify_grad(
output_0,
[np.random.rand(5, 4), np.random.rand(5, 4)],
[rng.random((5, 4)), rng.random((5, 4))],
n_tests=1,
rng=TestSumDiffOp.rng,
)
utt.verify_grad(
output_1,
[np.random.rand(5, 4), np.random.rand(5, 4)],
[rng.random((5, 4)), rng.random((5, 4))],
n_tests=1,
rng=TestSumDiffOp.rng,
)
def test_infer_shape(self):
rng = np.random.RandomState(43)
x = dmatrix()
y = dmatrix()
......@@ -163,7 +165,7 @@ class TestSumDiffOp(utt.InferShapeTester):
self._compile_and_check(
[x, y],
self.op_class()(x, y),
[np.random.rand(5, 6), np.random.rand(5, 6)],
[rng.random((5, 6)), rng.random((5, 6))],
self.op_class,
)
......
......@@ -97,12 +97,23 @@ Example:
.. code-block:: python
import numpy as np
import aesara.tensor as at
def test_dot_validity():
a = at.dmatrix('a')
b = at.dmatrix('b')
c = at.dot(a, b)
f = aesara.function([a, b], [c])
assert np.array_equal(f(self.avals, self.bvals), numpy.dot(self.avals, self.bvals))
c_fn = aesara.function([a, b], [c])
avals = ...
bvals = ...
res = c_fn(avals, bvals)
exp_res = np.dot(avals, bvals)
assert np.array_equal(res, exp_res)
Creating an :class:`Op` Unit Test
......@@ -117,16 +128,16 @@ unit tests for Aesara :class:`Op`\s.
Validating the Gradient
-----------------------
The :func:`verify_grad` function can be used to validate that the :meth:`Op.grad`
The :func:`aesara.gradient.verify_grad` function can be used to validate that the :meth:`Op.grad`
method of your :class:`Op` is properly implemented. :func:`verify_grad` is based
on the Finite Difference Method where the derivative of function ``f``
at point ``x`` is approximated as:
on the Finite Difference Method where the derivative of function :math:`f`
at point :math:`x` is approximated as:
.. math::
\frac{\partial{f}}{\partial{x}} = lim_{\Delta \rightarrow 0} \frac {f(x+\Delta) - f(x-\Delta)} {2\Delta}
``verify_grad`` performs the following steps:
:func:`verify_grad` performs the following steps:
* approximates the gradient numerically using the Finite Difference Method
......@@ -142,7 +153,7 @@ Here is the prototype for the :func:`verify_grad` function.
def verify_grad(fun, pt, n_tests=2, rng=None, eps=1.0e-7, abs_tol=0.0001, rel_tol=0.0001):
:func:`verify_grad` raises an ``Exception`` if the difference between the analytic gradient and
:func:`verify_grad` raises an :class:`Exception` if the difference between the analytic gradient and
numerical gradient (computed through the Finite Difference Method) of a random
projection of the fun's output to a scalar exceeds both the given absolute and
relative tolerances.
......@@ -152,15 +163,15 @@ The parameters are as follows:
* ``fun``: a Python function that takes Aesara variables as inputs,
and returns an Aesara variable.
For instance, an :class:`Op` instance with a single output is such a function.
It can also be a Python function that calls an op with some of its
It can also be a Python function that calls an :class:`Op` with some of its
inputs being fixed to specific values, or that combine multiple :class:`Op`\s.
* ``pt``: the list of numpy.ndarrays to use as input values
* ``pt``: the list of `np.ndarrays` to use as input values
* ``n_tests``: number of times to run the test
* ``rng``: random number generator used to generate a random vector u,
we check the gradient of sum(u*fn) at pt
* ``rng``: random number generator used to generate a random vector `u`,
we check the gradient of ``sum(u*fn)`` at ``pt``
* ``eps``: stepsize used in the Finite Difference Method
......@@ -176,12 +187,12 @@ symbolic variable:
def test_verify_exprgrad():
def fun(x,y,z):
return (x + tensor.cos(y)) / (4 * z)**2
return (x + at.cos(y)) / (4 * z)**2
x_val = numpy.asarray([[1], [1.1], [1.2]])
y_val = numpy.asarray([0.1, 0.2])
z_val = numpy.asarray(2)
rng = numpy.random.RandomState(42)
x_val = np.asarray([[1], [1.1], [1.2]])
y_val = np.asarray([0.1, 0.2])
z_val = np.asarray(2)
rng = np.random.default_rng(42)
aesara.gradient.verify_grad(fun, [x_val, y_val, z_val], rng=rng)
......@@ -190,11 +201,13 @@ Here is an example showing how to use :func:`verify_grad` on an :class:`Op` inst
.. testcode::
def test_flatten_outdimNone():
# Testing gradient w.r.t. all inputs of an op (in this example the op
# being used is Flatten(), which takes a single input).
a_val = numpy.asarray([[0,1,2],[3,4,5]], dtype='float64')
rng = numpy.random.RandomState(42)
aesara.gradient.verify_grad(tensor.Flatten(), [a_val], rng=rng)
"""
Testing gradient w.r.t. all inputs of an `Op` (in this example the `Op`
being used is `Flatten`, which takes a single input).
"""
a_val = np.asarray([[0,1,2],[3,4,5]], dtype='float64')
rng = np.random.default_rng(42)
aesara.gradient.verify_grad(at.Flatten(), [a_val], rng=rng)
Here is another example, showing how to verify the gradient w.r.t. a subset of
an :class:`Op`'s inputs. This is useful in particular when the gradient w.r.t. some of
......@@ -204,29 +217,30 @@ which would cause :func:`verify_grad` to crash.
.. testcode::
def test_crossentropy_softmax_grad():
op = tensor.nnet.crossentropy_softmax_argmax_1hot_with_bias
op = at.nnet.crossentropy_softmax_argmax_1hot_with_bias
def op_with_fixed_y_idx(x, b):
# Input `y_idx` of this Op takes integer values, so we fix them
# Input `y_idx` of this `Op` takes integer values, so we fix them
# to some constant array.
# Although this op has multiple outputs, we can return only one.
# Although this `Op` has multiple outputs, we can return only one.
# Here, we return the first output only.
return op(x, b, y_idx=numpy.asarray([0, 2]))[0]
return op(x, b, y_idx=np.asarray([0, 2]))[0]
x_val = numpy.asarray([[-1, 0, 1], [3, 2, 1]], dtype='float64')
b_val = numpy.asarray([1, 2, 3], dtype='float64')
rng = numpy.random.RandomState(42)
x_val = np.asarray([[-1, 0, 1], [3, 2, 1]], dtype='float64')
b_val = np.asarray([1, 2, 3], dtype='float64')
rng = np.random.default_rng(42)
aesara.gradient.verify_grad(op_with_fixed_y_idx, [x_val, b_val], rng=rng)
.. note::
Although ``verify_grad`` is defined in ``aesara.tensor.basic``, unittests
should use the version of ``verify_grad`` defined in ``tests.unittest_tools``.
Although :func:`verify_grad` is defined in :mod:`aesara.gradient`, unittests
should use the version of :func:`verify_grad` defined in :mod:`tests.unittest_tools`.
This is simply a wrapper function which takes care of seeding the random
number generator appropriately before calling ``aesara.gradient.verify_grad``
number generator appropriately before calling :func:`aesara.gradient.verify_grad`
makeTester and makeBroadcastTester
==================================
:func:`makeTester` and :func:`makeBroadcastTester`
==================================================
Most :class:`Op` unittests perform the same function. All such tests must
verify that the :class:`Op` generates the proper output, that the gradient is
......@@ -244,21 +258,23 @@ product :class:`Op`:
from tests.tensor.utils import makeTester
rng = np.random.default_rng(23098)
TestDot = makeTester(
name="DotTester",
op=np.dot,
expected=lambda x, y: numpy.dot(x, y),
expected=lambda x, y: np.dot(x, y),
checks={},
good=dict(
correct1=(rng.rand(5, 7), rng.rand(7, 5)),
correct2=(rng.rand(5, 7), rng.rand(7, 9)),
correct3=(rng.rand(5, 7), rng.rand(7)),
correct1=(rng.random((5, 7)), rng.random((7, 5))),
correct2=(rng.random((5, 7)), rng.random((7, 9))),
correct3=(rng.random((5, 7)), rng.random((7,))),
),
bad_build=dict(),
bad_runtime=dict(
bad1=(rng.rand(5, 7), rng.rand(5, 7)), bad2=(rng.rand(5, 7), rng.rand(8, 3))
bad1=(rng.random((5, 7)), rng.random((5, 7))),
bad2=(rng.random((5, 7)), rng.random((8, 3)))
),
grad=dict(),
)
......
......@@ -14,37 +14,36 @@ Guide
=====
The NanGuardMode aims to prevent the model from outputting NaNs or Infs. It has
a number of self-checks, which can help to find out which apply node is
generating those incorrect outputs. It provides automatic detection of 3 types
The :class:`NanGuardMode` aims to prevent the model from outputting NaNs or Infs. It has
a number of self-checks, which can help to find out which :class:`Apply` node is
generating those incorrect outputs. It provides automatic detection of three types
of abnormal values: NaNs, Infs, and abnormally big values.
NanGuardMode can be used as follows:
`NanGuardMode` can be used as follows:
.. testcode::
import numpy
import numpy as np
import aesara
import aesara.tensor as at
from aesara.compile.nanguardmode import NanGuardMode
x = at.matrix()
w = aesara.shared(numpy.random.randn(5, 7).astype(aesara.config.floatX))
w = aesara.shared(np.random.standard_normal((5, 7)).astype(aesara.config.floatX))
y = at.dot(x, w)
fun = aesara.function(
[x], y,
mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
)
While using the aesara function ``fun``, it will monitor the values of each
While using the Aesara function ``fun``, it will monitor the values of each
input and output variable of each node. When abnormal values are
detected, it raises an error to indicate which node yields the NaNs. For
example, if we pass the following values to ``fun``:
.. testcode::
infa = numpy.tile(
(numpy.asarray(100.) ** 1000000).astype(aesara.config.floatX), (3, 5))
infa = np.tile((np.asarray(100.) ** 1000000).astype(aesara.config.floatX), (3, 5))
fun(infa)
.. testoutput::
......@@ -55,17 +54,17 @@ example, if we pass the following values to ``fun``:
...
AssertionError: ...
It will raise an AssertionError indicating that Inf value is detected while
It will raise an `AssertionError` indicating that an Inf value was detected while
executing the function.
You can also set the three parameters in ``NanGuardMode()`` to indicate which
You can also set the three parameters in `NanGuardMode` to indicate which
kind of abnormal values to monitor. ``nan_is_error`` and ``inf_is_error`` has
no default values, so they need to be set explicitly, but ``big_is_error`` is
set to be ``True`` by default.
.. note::
NanGuardMode significantly slows down computations; only
`NanGuardMode` significantly slows down computations; only
enable as needed.
Reference
......
......@@ -797,8 +797,7 @@ import ``aesara`` and print the config variable, as in:
Aesara will execute the graph using constants and/or shared variables
provided by the user. Purely symbolic variables (e.g. ``x =
aesara.tensor.dmatrix()``) can be augmented with test values, by writing to
their ``tag.test_value`` attribute (e.g. ``x.tag.test_value =
numpy.random.rand(5, 4)``).
their ``.tag.test_value`` attributes (e.g. ``x.tag.test_value = np.ones((5, 4))``).
When not ``'off'``, the value of this option dictates what happens when
an :class:`Op`'s inputs do not provide appropriate test values:
......
......@@ -65,8 +65,8 @@ BUT, YOU GOTTA RUN THIS CODE AND MAKE SURE IT STILL WORKS NICELY, HEY?
up_fn, app_fn = build_logistic_regression_model(n_in=10, n_out=3, l2_coef=30.0)
x_data = numpy.random.randn(100, 10)
y_data = numpy.random.randn(100, 3)
x_data = numpy.random.standard_normal((100, 10))
y_data = numpy.random.standard_normal((100, 3))
y_data = _asarray(y_data == numpy.max(y_data, axis=1), dtype='int64')
print "Model Training ..."
......
......@@ -11,31 +11,36 @@ Note that you want SciPy >= 0.7.2
.. warning::
In SciPy 0.6, ``scipy.csc_matrix.dot`` has a bug with singleton
In SciPy 0.6, `scipy.csc_matrix.dot` has a bug with singleton
dimensions. There may be more bugs. It also has inconsistent
implementation of sparse matrices.
We do not test against SciPy versions below 0.7.2.
We describe the details of the compressed sparse matrix types.
``scipy.sparse.csc_matrix``
should be used if there are more rows than column (shape[0] > shape[1]).
``scipy.sparse.csr_matrix``
should be used if there are more columns than rows (shape[0] < shape[1]).
``scipy.sparse.lil_matrix``
`scipy.sparse.csc_matrix`
should be used if there are more rows than columns (``shape[0] > shape[1]``).
`scipy.sparse.csr_matrix`
should be used if there are more columns than rows (``shape[0] < shape[1]``).
`scipy.sparse.lil_matrix`
is faster if we are modifying the array. After initial inserts,
we can then convert to the appropriate sparse matrix format.
The following types also exist:
``dok_matrix``
`dok_matrix`
Dictionary of Keys format. From their doc: This is an efficient structure for constructing sparse matrices incrementally.
``coo_matrix``
`coo_matrix`
Coordinate format. From their lil doc: consider using the COO format when constructing large matrices.
There seems to be a new format planned for scipy 0.7.x:
``bsr_matrix``
Block Compressed Row (BSR). From their doc: The Block Compressed Row (BSR) format is very similar to the Compressed Sparse Row (CSR) format. BSR is appropriate for sparse matrices with dense sub matrices like the last example below. Block matrices often arise in vector-valued finite element discretizations. In such cases, BSR is considerably more efficient than CSR and CSC for many sparse arithmetic operations.
``dia_matrix``
There seems to be a new format planned for SciPy 0.7.x:
`bsr_matrix`
Block Compressed Row (BSR). From their doc: The Block Compressed Row
(BSR) format is very similar to the Compressed Sparse Row (CSR)
format. BSR is appropriate for sparse matrices with dense sub matrices
like the last example below. Block matrices often arise in vector-valued
finite element discretizations. In such cases, BSR is considerably more
efficient than CSR and CSC for many sparse arithmetic operations.
`dia_matrix`
Sparse matrix with DIAgonal storage
There are four member variables that comprise a compressed matrix ``sp`` (for at least csc, csr and bsr):
......@@ -52,9 +57,9 @@ There are four member variables that comprise a compressed matrix ``sp`` (for at
row location.
``sp.indptr``
gives the other location of the non-zero entry. For CSC, there are
as many values of indptr as there are columns + 1 in the matrix.
as many values of indptr as there are ``columns + 1`` in the matrix.
``sp.indptr[k] = x`` and ``indptr[k+1] = y`` means that column
k contains sp.data[x:y], i.e. the xth through the y-1th non-zero values.
``k`` contains ``sp.data[x:y]``, i.e. the ``x``-th through the ``(y-1)``-th non-zero values.
See the example below for details.
......@@ -63,7 +68,7 @@ See the example below for details.
>>> import scipy.sparse
>>> sp = scipy.sparse.csc_matrix((5, 10))
>>> sp[4, 0] = 20
/u/lisa/local/byhost/test_maggie46.iro.umontreal.ca/lib64/python2.5/site-packages/scipy/sparse/compressed.py:494: SparseEfficiencyWarning: changing the sparsity structure of a csc_matrix is expensive. lil_matrix is more efficient.
SparseEfficiencyWarning: changing the sparsity structure of a csc_matrix is expensive. lil_matrix is more efficient.
SparseEfficiencyWarning)
>>> sp[0, 0] = 10
>>> sp[2, 3] = 30
......@@ -91,13 +96,13 @@ Several things should be learned from the above example:
* We actually use the wrong sparse matrix type. In fact, it is the
*rows* that are sparse, not the columns. So, it would have been
better to use ``sp = scipy.sparse.csr_matrix((5, 10))``.
* We should have actually created the matrix as a ``lil_matrix``,
* We should have actually created the matrix as a `lil_matrix`,
which is more efficient for inserts. Afterwards, we should convert
to the appropriate compressed format.
* `sp.indptr[0] = 0` and `sp.indptr[1] = 2`, which means that
column 0 contains sp.data[0:2], i.e. the first two non-zero values.
* `sp.indptr[3] = 2` and `sp.indptr[4] = 3`, which means that column
3 contains sp.data[2:3], i.e. the third non-zero value.
* ``sp.indptr[0] = 0`` and ``sp.indptr[1] = 2``, which means that
column 0 contains ``sp.data[0:2]``, i.e. the first two non-zero values.
* ``sp.indptr[3] = 2`` and ``sp.indptr[4] = 3``, which means that column
3 contains ``sp.data[2:3]``, i.e. the third non-zero value.
TODO: Rewrite this documentation to do things in a smarter way.
......@@ -112,7 +117,7 @@ For faster sparse code:
Misc
----
The sparse equivalent of dmatrix is csc_matrix and csr_matrix.
The sparse equivalent of `dmatrix` is `csc_matrix` and `csr_matrix`.
:class:`~aesara.sparse.basic.Dot` vs. :class:`~aesara.sparse.basic.StructuredDot`
---------------------------------------------------------------------------------
......@@ -121,22 +126,22 @@ Often when you use a sparse matrix it is because there is a meaning to the
structure of non-zeros. The gradient on terms outside that structure
has no meaning, so it is computationally efficient not to compute them.
StructuredDot is when you want the gradient to have zeroes corresponding to
`StructuredDot` is when you want the gradient to have zeroes corresponding to
the sparse entries in the matrix.
TrueDot and Structured dot have different gradients
`TrueDot` and `StructuredDot` have different gradients
but their perform functions should be the same.
The gradient of TrueDot can have non-zeros where the sparse matrix had zeros.
The gradient of StructuredDot can't.
The gradient of `TrueDot` can have non-zeros where the sparse matrix had zeros.
The gradient of `StructuredDot` can't.
Suppose you have ``dot(x,w)`` where ``x`` and ``w`` are square matrices.
If ``w`` is dense, like ``randn((5,5))`` and ``x`` is of full rank (though
potentially sparse, like a diagonal matrix of 1s) then the output will
be dense too. (But i guess the density of the output is a red herring.)
If ``w`` is dense, like ``standard_normal((5,5))`` and ``x`` is of full rank (though
potentially sparse, like a diagonal matrix of ones) then the output will
be dense too.
What's important is the density of the gradient on the output.
If the gradient on the output is dense, and ``w`` is dense (as we said it was)
then the True gradient on ``x`` will be dense.
If our dot is a TrueDot, then it will say that the gradient on ``x`` is dense.
If our dot is a StructuredDot, then it will say the gradient on ``x`` is only
then the true gradient on ``x`` will be dense.
If our dot is a `TrueDot`, then it will say that the gradient on ``x`` is dense.
If our dot is a `StructuredDot`, then it will say the gradient on ``x`` is only
defined on the diagonal and ignore the gradients on the off-diagonal.
差异被折叠。
差异被折叠。
......@@ -9,11 +9,11 @@ Configuration Settings and Compiling Modes
Configuration
=============
The ``config`` module contains several *attributes* that modify Aesara's behavior. Many of these
attributes are examined during the import of the ``aesara`` module and several are assumed to be
The :mod:`aesara.config` module contains several *attributes* that modify Aesara's behavior. Many of these
attributes are examined during the import of the :mod:`aesara` module and several are assumed to be
read-only.
*As a rule, the attributes in the* ``config`` *module should not be modified inside the user code.*
*As a rule, the attributes in the* :mod:`aesara.config` *module should not be modified inside the user code.*
Aesara's code comes with default values for these attributes, but you can
override them from your ``.aesararc`` file, and override those values in turn by
......@@ -21,12 +21,12 @@ the :envvar:`AESARA_FLAGS` environment variable.
The order of precedence is:
1. an assignment to aesara.config.<property>
1. an assignment to ``aesara.config.<property>``
2. an assignment in :envvar:`AESARA_FLAGS`
3. an assignment in the .aesararc file (or the file indicated in :envvar:`AESARARC`)
3. an assignment in the ``.aesararc`` file (or the file indicated in :envvar:`AESARARC`)
You can display the current/effective configuration at any time by printing
aesara.config. For example, to see a list of all active configuration
`aesara.config`. For example, to see a list of all active configuration
variables, type this from the command-line:
.. code-block:: bash
......@@ -45,22 +45,24 @@ Consider the logistic regression:
.. testcode::
import numpy
import numpy as np
import aesara
import aesara.tensor as at
rng = numpy.random
rng = np.random.default_rng(2498)
N = 400
feats = 784
D = (rng.randn(N, feats).astype(aesara.config.floatX),
rng.randint(size=N,low=0, high=2).astype(aesara.config.floatX))
D = (rng.standard_normal((N, feats)).astype(aesara.config.floatX),
rng.integers(size=N, low=0, high=2).astype(aesara.config.floatX))
training_steps = 10000
# Declare Aesara symbolic variables
x = at.matrix("x")
y = at.vector("y")
w = aesara.shared(rng.randn(feats).astype(aesara.config.floatX), name="w")
b = aesara.shared(numpy.asarray(0., dtype=aesara.config.floatX), name="b")
w = aesara.shared(rng.standard_normal(feats).astype(aesara.config.floatX), name="w")
b = aesara.shared(np.asarray(0., dtype=aesara.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
......@@ -73,15 +75,18 @@ Consider the logistic regression:
# Compile expressions to functions
train = aesara.function(
inputs=[x,y],
outputs=[prediction, xent],
updates=[(w, w-0.01*gw), (b, b-0.01*gb)],
name = "train")
predict = aesara.function(inputs=[x], outputs=prediction,
name = "predict")
if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for x in
train.maker.fgraph.toposort()]):
inputs=[x,y],
outputs=[prediction, xent],
updates=[(w, w-0.01*gw), (b, b-0.01*gb)],
name = "train"
)
predict = aesara.function(
inputs=[x], outputs=prediction,
name = "predict"
)
if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm']
for x in train.maker.fgraph.toposort()]):
print('Used the cpu')
else:
print('ERROR, not able to tell if aesara used the cpu or another device')
......@@ -106,7 +111,7 @@ Consider the logistic regression:
prediction on D
...
Modify and execute this example to run on CPU (the default) with floatX=float32 and
Modify and execute this example to run on CPU (the default) with ``floatX=float32`` and
time the execution using the command line ``time python file.py``. Save your code
as it will be useful later on.
......@@ -114,10 +119,10 @@ as it will be useful later on.
* Apply the Aesara flag ``floatX=float32`` (through ``aesara.config.floatX``) in your code.
* Cast inputs before storing them into a shared variable.
* Circumvent the automatic cast of *int32* with *float32* to *float64*:
* Circumvent the automatic cast of int32 with float32 to float64:
* Insert manual cast in your code or use *[u]int{8,16}*.
* Insert manual cast around the mean operator (this involves division by length, which is an *int64*).
* Insert manual cast in your code or use [u]int{8,16}.
* Insert manual cast around the mean operator (this involves division by length, which is an int64).
* Note that a new casting mechanism is being developed.
:download:`Solution<modes_solution_1.py>`
......@@ -156,7 +161,7 @@ short name Full constructor
.. Note::
For debugging purpose, there also exists a ``MonitorMode`` (which has no
For debugging purpose, there also exists a :class:`MonitorMode` (which has no
short name). It can be used to step through the execution of a function:
see :ref:`the debugging FAQ<faq_monitormode>` for details.
......@@ -165,8 +170,8 @@ Linkers
=======
A mode is composed of 2 things: an optimizer and a linker. Some modes,
like ``NanGuardMode`` and ``DebugMode``, add logic around the
optimizer and linker. ``DebugMode`` uses its own linker.
like `NanGuardMode` and `DebugMode`, add logic around the
optimizer and linker. `DebugMode` uses its own linker.
You can select which linker to use with the Aesara flag :attr:`config.linker`.
Here is a table to compare the different linkers.
......@@ -233,8 +238,8 @@ Using DebugMode
While normally you should use the ``FAST_RUN`` or ``FAST_COMPILE`` mode,
it is useful at first (especially when you are defining new kinds of
expressions or new optimizations) to run your code using the DebugMode
(available via ``mode='DebugMode``). The DebugMode is designed to
expressions or new optimizations) to run your code using the `DebugMode`
(available via ``mode='DebugMode'``). The `DebugMode` is designed to
run several self-checks and assertions that can help diagnose
possible programming errors leading to incorrect output. Note that
``DebugMode`` is much slower than ``FAST_RUN`` or ``FAST_COMPILE`` so
......@@ -245,7 +250,7 @@ cluster!).
.. If you modify this code, also change :
.. tests/test_tutorial.py:T_modes.test_modes_1
DebugMode is used as follows:
`DebugMode` is used as follows:
.. testcode::
......@@ -258,21 +263,21 @@ DebugMode is used as follows:
f([7])
If any problem is detected, DebugMode will raise an exception according to
what went wrong, either at call time (*f(5)*) or compile time (
If any problem is detected, `DebugMode` will raise an exception according to
what went wrong, either at call time (e.g. ``f(5)``) or compile time (
``f = aesara.function(x, 10 * x, mode='DebugMode')``). These exceptions
should *not* be ignored; talk to your local Aesara guru or email the
users list if you cannot make the exception go away.
Some kinds of errors can only be detected for certain input value combinations.
In the example above, there is no way to guarantee that a future call to, say
*f(-1)*, won't cause a problem. DebugMode is not a silver bullet.
``f(-1)``, won't cause a problem. `DebugMode` is not a silver bullet.
.. TODO: repair the following link
If you instantiate DebugMode using the constructor (see :class:`DebugMode`)
rather than the keyword ``DebugMode`` you can configure its behaviour via
constructor arguments. The keyword version of DebugMode (which you get by using ``mode='DebugMode'``)
If you instantiate `DebugMode` using the constructor (see :class:`DebugMode`)
rather than the keyword `DebugMode` you can configure its behaviour via
constructor arguments. The keyword version of `DebugMode` (which you get by using ``mode='DebugMode'``)
is quite strict.
For more detail, see :ref:`DebugMode<debugmode>` in the library.
......@@ -2,59 +2,62 @@
# Aesara tutorial
# Solution to Exercise in section 'Configuration Settings and Compiling Modes'
import numpy as np
import aesara
import aesara.tensor as at
aesara.config.floatX = 'float32'
rng = np.random
aesara.config.floatX = "float32"
rng = np.random.default_rng(428)
N = 400
feats = 784
D = (rng.randn(N, feats).astype(aesara.config.floatX),
rng.randint(size=N, low=0, high=2).astype(aesara.config.floatX))
D = (
rng.standard_normal((N, feats)).astype(aesara.config.floatX),
rng.integers(size=N, low=0, high=2).astype(aesara.config.floatX),
)
training_steps = 10000
# Declare Aesara symbolic variables
x = at.matrix("x")
y = at.vector("y")
w = aesara.shared(rng.randn(feats).astype(aesara.config.floatX), name="w")
b = aesara.shared(np.asarray(0., dtype=aesara.config.floatX), name="b")
w = aesara.shared(rng.standard_normal(feats).astype(aesara.config.floatX), name="w")
b = aesara.shared(np.asarray(0.0, dtype=aesara.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
#print "Initial model:"
#print w.get_value(), b.get_value()
# print "Initial model:"
# print w.get_value(), b.get_value()
# Construct Aesara expression graph
p_1 = 1 / (1 + at.exp(-at.dot(x, w) - b)) # Probability of having a one
prediction = p_1 > 0.5 # The prediction that is done: 0 or 1
xent = -y * at.log(p_1) - (1 - y) * at.log(1 - p_1) # Cross-entropy
cost = at.cast(xent.mean(), 'float32') + \
0.01 * (w ** 2).sum() # The cost to optimize
cost = at.cast(xent.mean(), "float32") + 0.01 * (w**2).sum() # The cost to optimize
gw, gb = at.grad(cost, [w, b])
# Compile expressions to functions
train = aesara.function(
inputs=[x, y],
outputs=[prediction, xent],
updates={w: w - 0.01 * gw, b: b - 0.01 * gb},
name="train")
predict = aesara.function(inputs=[x], outputs=prediction,
name="predict")
if any(x.op.__class__.__name__ in ('Gemv', 'CGemv', 'Gemm', 'CGemm') for x in
train.maker.fgraph.toposort()):
print('Used the cpu')
inputs=[x, y],
outputs=[prediction, xent],
updates={w: w - 0.01 * gw, b: b - 0.01 * gb},
name="train",
)
predict = aesara.function(inputs=[x], outputs=prediction, name="predict")
if any(
x.op.__class__.__name__ in ("Gemv", "CGemv", "Gemm", "CGemm")
for x in train.maker.fgraph.toposort()
):
print("Used the cpu")
else:
print('ERROR, not able to tell if aesara used the cpu or another device')
print("ERROR, not able to tell if aesara used the cpu or another device")
print(train.maker.fgraph.toposort())
for i in range(training_steps):
pred, err = train(D[0], D[1])
#print "Final model:"
#print w.get_value(), b.get_value()
# print "Final model:"
# print w.get_value(), b.get_value()
print("target values for D")
print(D[1])
......
......@@ -25,20 +25,20 @@ that creates an image of the function. You can read about them in
Consider again the logistic regression example:
>>> import numpy
>>> import numpy as np
>>> import aesara
>>> import aesara.tensor as at
>>> rng = numpy.random
>>> rng = np.random.default_rng(2382)
>>> # Training data
>>> N = 400
>>> feats = 784
>>> D = (rng.randn(N, feats).astype(aesara.config.floatX), rng.randint(size=N,low=0, high=2).astype(aesara.config.floatX))
>>> D = (rng.standard_normal((N, feats)).astype(aesara.config.floatX), rng.integers(size=N,low=0, high=2).astype(aesara.config.floatX))
>>> training_steps = 10000
>>> # Declare Aesara symbolic variables
>>> x = at.matrix("x")
>>> y = at.vector("y")
>>> w = aesara.shared(rng.randn(feats).astype(aesara.config.floatX), name="w")
>>> b = aesara.shared(numpy.asarray(0., dtype=aesara.config.floatX), name="b")
>>> w = aesara.shared(rng.standard_normal(feats).astype(aesara.config.floatX), name="w")
>>> b = aesara.shared(np.asarray(0., dtype=aesara.config.floatX), name="b")
>>> x.tag.test_value = D[0]
>>> y.tag.test_value = D[1]
>>> # Construct Aesara expression graph
......
import numpy as np
import aesara
x, y, z = aesara.tensor.vectors('xyz')
x, y, z = aesara.tensor.vectors("xyz")
f = aesara.function([x, y, z], [(x + y + z) * 2])
xv = np.random.rand(10).astype(aesara.config.floatX)
yv = np.random.rand(10).astype(aesara.config.floatX)
zv = np.random.rand(10).astype(aesara.config.floatX)
xv = np.random.random((10,)).astype(aesara.config.floatX)
yv = np.random.random((10,)).astype(aesara.config.floatX)
zv = np.random.random((10,)).astype(aesara.config.floatX)
f(xv, yv, zv)
......@@ -49,7 +49,7 @@ upgrade. Here is the current state of what can be done:
aesara.tensor.nnet.conv2d(..., image_shape=(7, 3, 5, 5), filter_shape=(2, 3, 4, 4))
- You can use the ``SpecifyShape`` op to add shape information anywhere in the
- You can use the :class:`SpecifyShape`\ :class:`Op` to add shape information anywhere in the
graph. This allows to perform some optimizations. In the following example,
this makes it possible to precompute the Aesara function to a constant.
......@@ -67,13 +67,13 @@ Problems with Shape inference
Sometimes this can lead to errors. Consider this example:
>>> import numpy
>>> import numpy as np
>>> import aesara
>>> x = aesara.tensor.matrix('x')
>>> y = aesara.tensor.matrix('y')
>>> z = aesara.tensor.join(0, x, y)
>>> xv = numpy.random.rand(5, 4)
>>> yv = numpy.random.rand(3, 3)
>>> xv = np.random.random((5, 4))
>>> yv = np.random.random((3, 3))
>>> f = aesara.function([x, y], z.shape)
>>> aesara.printing.debugprint(f) # doctest: +NORMALIZE_WHITESPACE
......@@ -109,7 +109,7 @@ This makes the computation of the shape faster, but it can also hide errors. In
this example, the computation of the shape of the output of ``join`` is done only
based on the first input Aesara variable, which leads to an error.
This might happen with other ops such as ``elemwise`` and ``dot``, for example.
This might happen with other `Op`\s such as :class:`Elemwise` and :class:`Dot`, for example.
Indeed, to perform some optimizations (for speed or stability, for instance),
Aesara assumes that the computation is correct and consistent
in the first place, as it does here.
......@@ -118,5 +118,5 @@ You can detect those problems by running the code without this
optimization, using the Aesara flag
``optimizer_excluding=local_shape_to_shape_i``. You can also obtain the
same effect by running in the modes ``FAST_COMPILE`` (it will not apply this
optimization, nor most other optimizations) or ``DebugMode`` (it will test
before and after all optimizations (much slower)).
optimization, nor most other optimizations) or :class:`DebugMode` (it will test
before and after all optimizations).
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论