提交 e40c1b29 authored 作者: Brandon T. Willard's avatar Brandon T. Willard 提交者: Brandon T. Willard

Documentation formatting and NumPy usage updates

上级 8b7446e0
......@@ -146,7 +146,7 @@ class OpFromGraph(Op, HasInnerGraph):
from aesara.compile.builders import OpFromGraph
x, y, z = at.scalars('xyz')
s = aesara.shared(np.random.rand(2, 2).astype(config.floatX))
s = aesara.shared(np.random.random((2, 2)).astype(config.floatX))
e = x + y * z + s
op = OpFromGraph([x, y, z], [e])
# op behaves like a normal aesara op
......
......@@ -5,6 +5,7 @@ import time
import warnings
from collections import OrderedDict
from functools import partial, reduce
from typing import TYPE_CHECKING, Callable, List, Optional, Union
import numpy as np
......@@ -18,6 +19,10 @@ from aesara.graph.op import get_test_values
from aesara.graph.type import Type
if TYPE_CHECKING:
from aesara.compile.mode import Mode
__docformat__ = "restructuredtext en"
_logger = logging.getLogger("aesara.gradient")
......@@ -684,8 +689,8 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
.. code-block:: python
x, t = aesara.tensor.fvector('x'), aesara.tensor.fvector('t')
w1 = aesara.shared(np.random.randn(3,4))
w2 = aesara.shared(np.random.randn(4,2))
w1 = aesara.shared(np.random.standard_normal((3,4)))
w2 = aesara.shared(np.random.standard_normal((4,2)))
a1 = aesara.tensor.tanh(aesara.tensor.dot(x,w1))
a2 = aesara.tensor.tanh(aesara.tensor.dot(a1,w2))
cost2 = aesara.tensor.sqr(a2 - t).sum()
......@@ -1690,17 +1695,17 @@ def mode_not_slow(mode):
def verify_grad(
fun,
pt,
n_tests=2,
rng=None,
eps=None,
out_type=None,
abs_tol=None,
rel_tol=None,
mode=None,
cast_to_output_type=False,
no_debug_ref=True,
fun: Callable,
pt: List[np.ndarray],
n_tests: int = 2,
rng: Optional[Union[np.random.Generator, np.random.RandomState]] = None,
eps: Optional[float] = None,
out_type: Optional[str] = None,
abs_tol: Optional[float] = None,
rel_tol: Optional[float] = None,
mode: Optional[Union["Mode", str]] = None,
cast_to_output_type: bool = False,
no_debug_ref: bool = True,
):
"""Test a gradient by Finite Difference Method. Raise error on failure.
......@@ -1713,47 +1718,47 @@ def verify_grad(
--------
>>> verify_grad(aesara.tensor.tanh,
... (np.asarray([[2, 3, 4], [-1, 3.3, 9.9]]),),
... rng=np.random)
... rng=np.random.default_rng(23098))
Parameters
----------
fun : a Python function
fun
`fun` takes Aesara variables as inputs, and returns an Aesara variable.
For instance, an Op instance with a single output.
pt : list of numpy.ndarrays
For instance, an `Op` instance with a single output.
pt
Input values, points where the gradient is estimated.
These arrays must be either float16, float32, or float64 arrays.
n_tests : int
Number of times to run the test
rng : numpy.random.RandomState
n_tests
Number of times to run the test.
rng
Random number generator used to sample the output random projection `u`,
we test gradient of sum(u * fun) at `pt`
eps : float, optional
we test gradient of ``sum(u * fun)`` at `pt`.
eps
Step size used in the Finite Difference Method (Default
None is type-dependent).
Raising the value of eps can raise or lower the absolute
``None`` is type-dependent).
Raising the value of `eps` can raise or lower the absolute
and relative errors of the verification depending on the
Op. Raising eps does not lower the verification quality for
`Op`. Raising `eps` does not lower the verification quality for
linear operations. It is better to raise `eps` than raising
`abs_tol` or `rel_tol`.
out_type : string
Dtype of output, if complex (i.e., 'complex32' or 'complex64')
abs_tol : float
out_type
Dtype of output, if complex (i.e., ``'complex32'`` or ``'complex64'``)
abs_tol
Absolute tolerance used as threshold for gradient comparison
rel_tol : float
rel_tol
Relative tolerance used as threshold for gradient comparison
cast_to_output_type : bool
If the output is float32 and cast_to_output_type is True, cast
the random projection to float32. Otherwise it is float64.
cast_to_output_type
If the output is float32 and `cast_to_output_type` is ``True``, cast
the random projection to float32; otherwise, it is float64.
float16 is not handled here.
no_debug_ref : bool
Don't use DebugMode for the numerical gradient function.
no_debug_ref
Don't use `DebugMode` for the numerical gradient function.
Notes
-----
This function does not support multiple outputs. In
tests/scan/test_basic.py there is an experimental `verify_grad` that covers
that case as well by using random projections.
This function does not support multiple outputs. In `tests.scan.test_basic`
there is an experimental `verify_grad` that covers that case as well by
using random projections.
"""
from aesara.compile.function import function
......
......@@ -404,13 +404,14 @@ You can try the new :class:`Op` as follows:
.. testcode:: example
import numpy as np
import aesara
x = aesara.tensor.matrix()
f = aesara.function([x], DoubleOp1()(x))
import numpy
inp = numpy.random.rand(5, 4)
inp = np.random.random_sample((5, 4))
out = f(inp)
assert numpy.allclose(inp * 2, out)
assert np.allclose(inp * 2, out)
print(inp)
print(out)
......@@ -435,13 +436,14 @@ You can try the new :class:`Op` as follows:
.. testcode:: example
import numpy as np
import aesara
x = aesara.tensor.matrix()
f = aesara.function([x], DoubleOp2()(x))
import numpy
inp = numpy.random.rand(5, 4)
inp = np.random.random_sample((5, 4))
out = f(inp)
assert numpy.allclose(inp * 2, out)
assert np.allclose(inp * 2, out)
print(inp)
print(out)
......@@ -530,10 +532,9 @@ We can test this by running the following segment:
f = aesara.function([x], mult4plus5op(x))
g = aesara.function([x], mult2plus3op(x))
import numpy
inp = numpy.random.rand(5, 4).astype(numpy.float32)
assert numpy.allclose(4 * inp + 5, f(inp))
assert numpy.allclose(2 * inp + 3, g(inp))
inp = np.random.random_sample((5, 4)).astype(np.float32)
assert np.allclose(4 * inp + 5, f(inp))
assert np.allclose(2 * inp + 3, g(inp))
How To Test it
......@@ -553,11 +554,11 @@ returns the right answer. If you detect an error, you must raise an
.. testcode:: tests
import numpy
import numpy as np
import aesara
from tests import unittest_tools as utt
from aesara.configdefaults import config
class TestDouble(utt.InferShapeTester):
def setup_method(self):
super().setup_method()
......@@ -565,9 +566,12 @@ returns the right answer. If you detect an error, you must raise an
self.op = DoubleOp()
def test_basic(self):
rng = np.random.default_rng(utt.fetch_seed())
x = aesara.tensor.matrix()
f = aesara.function([x], self.op(x))
inp = numpy.asarray(numpy.random.rand(5, 4), dtype=config.floatX)
inp = np.asarray(rng.random((5, 4)), dtype=aesara.config.floatX)
out = f(inp)
# Compare the result computed to the expected value.
utt.assert_allclose(inp * 2, out)
......@@ -612,20 +616,26 @@ your :class:`Op` works only with such matrices, you can disable the warning with
.. testcode:: tests
from tests import unittest_tools as utt
from aesara.configdefaults import config
from tests import unittest_tools as utt
class TestDouble(utt.InferShapeTester):
# [...] as previous tests.
def test_infer_shape(self):
rng = np.random.default_rng(utt.fetch_seed())
x = aesara.tensor.matrix()
self._compile_and_check([x], # aesara.function inputs
[self.op(x)], # aesara.function outputs
# Always use not square matrix!
# inputs data
[numpy.asarray(numpy.random.rand(5, 4),
dtype=config.floatX)],
# Op that should be removed from the graph.
self.op_class)
self._compile_and_check(
[x], # aesara.function inputs
[self.op(x)], # aesara.function outputs
# Always use not square matrix!
# inputs data
[np.asarray(rng.random((5, 4)), dtype=config.floatX)],
# Op that should be removed from the graph.
self.op_class,
)
Testing the gradient
^^^^^^^^^^^^^^^^^^^^
......@@ -642,8 +652,11 @@ the multiplication by 2).
.. testcode:: tests
def test_grad(self):
tests.unittest_tools.verify_grad(self.op,
[numpy.random.rand(5, 7, 2)])
rng = np.random.default_rng(utt.fetch_seed())
tests.unittest_tools.verify_grad(
self.op,
[rng.random((5, 7, 2))]
)
Testing the Rop
^^^^^^^^^^^^^^^
......@@ -778,40 +791,34 @@ signature:
.. testcode:: asop
import aesara
import numpy
import aesara.tensor as at
import numpy as np
from aesara import function
from aesara.compile.ops import as_op
def infer_shape_numpy_dot(fgraph, node, input_shapes):
ashp, bshp = input_shapes
return [ashp[:-1] + bshp[-1:]]
@as_op(itypes=[aesara.tensor.fmatrix, aesara.tensor.fmatrix],
otypes=[aesara.tensor.fmatrix], infer_shape=infer_shape_numpy_dot)
@as_op(itypes=[at.matrix, at.matrix],
otypes=[at.matrix], infer_shape=infer_shape_numpy_dot)
def numpy_dot(a, b):
return numpy.dot(a, b)
return np.dot(a, b)
You can try it as follows:
.. testcode:: asop
x = aesara.tensor.fmatrix()
y = aesara.tensor.fmatrix()
x = at.matrix()
y = at.matrix()
f = function([x, y], numpy_dot(x, y))
inp1 = numpy.random.rand(5, 4).astype('float32')
inp2 = numpy.random.rand(4, 7).astype('float32')
inp1 = np.random.random_sample((5, 4))
inp2 = np.random.random_sample((4, 7))
out = f(inp1, inp2)
Exercise
^^^^^^^^
Run the code of the ``numpy_dot`` example above.
Modify and execute to compute: ``numpy.add`` and ``numpy.subtract``.
Modify and execute the example to return two outputs: ``x + y`` and ``x - y``.
.. _Documentation:
Documentation and Coding Style
......@@ -822,7 +829,7 @@ will not be accepted.
:class:`NanGuardMode` and :class:`AllocEmpty`
---------------------------------------------
:class:`NanGuardMode` help users find where in the graph NaN appear. But
:class:`NanGuardMode` helps users find where in the graph ``NaN``\s appear. But
sometimes, we want some variables to not be checked. For example, in
the old GPU back-end, we used a float32 :class:`CudaNdarray` to store the MRG
random number generator state (they are integers). So if :class:`NanGuardMode`
......
......@@ -81,60 +81,60 @@ from aesara.tensor.type import dmatrix, matrix
class TestProdOp(utt.InferShapeTester):
rng = np.random.RandomState(43)
def setup_method(self):
super().setup_method()
self.op_class = ProdOp # case 1
def test_perform(self):
rng = np.random.default_rng(43)
x = matrix()
y = matrix()
f = aesara.function([x, y], self.op_class()(x, y))
x_val = np.random.rand(5, 4)
y_val = np.random.rand(5, 4)
x_val = rng.random((5, 4))
y_val = rng.random((5, 4))
out = f(x_val, y_val)
assert np.allclose(x_val * y_val, out)
def test_gradient(self):
rng = np.random.default_rng(43)
utt.verify_grad(
self.op_class(),
[np.random.rand(5, 4), np.random.rand(5, 4)],
[rng.random((5, 4)), rng.random((5, 4))],
n_tests=1,
rng=TestProdOp.rng,
)
def test_infer_shape(self):
rng = np.random.default_rng(43)
x = dmatrix()
y = dmatrix()
self._compile_and_check(
[x, y],
[self.op_class()(x, y)],
[np.random.rand(5, 6), np.random.rand(5, 6)],
[rng.random((5, 6)), rng.random((5, 6))],
self.op_class,
)
class TestSumDiffOp(utt.InferShapeTester):
rng = np.random.RandomState(43)
def setup_method(self):
super().setup_method()
self.op_class = SumDiffOp
def test_perform(self):
rng = np.random.RandomState(43)
x = matrix()
y = matrix()
f = aesara.function([x, y], self.op_class()(x, y))
x_val = np.random.rand(5, 4)
y_val = np.random.rand(5, 4)
x_val = rng.random((5, 4))
y_val = rng.random((5, 4))
out = f(x_val, y_val)
assert np.allclose([x_val + y_val, x_val - y_val], out)
def test_gradient(self):
rng = np.random.RandomState(43)
def output_0(x, y):
return self.op_class()(x, y)[0]
......@@ -143,18 +143,20 @@ class TestSumDiffOp(utt.InferShapeTester):
utt.verify_grad(
output_0,
[np.random.rand(5, 4), np.random.rand(5, 4)],
[rng.random((5, 4)), rng.random((5, 4))],
n_tests=1,
rng=TestSumDiffOp.rng,
)
utt.verify_grad(
output_1,
[np.random.rand(5, 4), np.random.rand(5, 4)],
[rng.random((5, 4)), rng.random((5, 4))],
n_tests=1,
rng=TestSumDiffOp.rng,
)
def test_infer_shape(self):
rng = np.random.RandomState(43)
x = dmatrix()
y = dmatrix()
......@@ -163,7 +165,7 @@ class TestSumDiffOp(utt.InferShapeTester):
self._compile_and_check(
[x, y],
self.op_class()(x, y),
[np.random.rand(5, 6), np.random.rand(5, 6)],
[rng.random((5, 6)), rng.random((5, 6))],
self.op_class,
)
......
......@@ -97,12 +97,23 @@ Example:
.. code-block:: python
import numpy as np
import aesara.tensor as at
def test_dot_validity():
a = at.dmatrix('a')
b = at.dmatrix('b')
c = at.dot(a, b)
f = aesara.function([a, b], [c])
assert np.array_equal(f(self.avals, self.bvals), numpy.dot(self.avals, self.bvals))
c_fn = aesara.function([a, b], [c])
avals = ...
bvals = ...
res = c_fn(avals, bvals)
exp_res = np.dot(self.avals, self.bvals)
assert np.array_equal(res, exp_res)
Creating an :class:`Op` Unit Test
......@@ -117,16 +128,16 @@ unit tests for Aesara :class:`Op`\s.
Validating the Gradient
-----------------------
The :func:`verify_grad` function can be used to validate that the :meth:`Op.grad`
The :func:`aesara.gradient.verify_grad` function can be used to validate that the :meth:`Op.grad`
method of your :class:`Op` is properly implemented. :func:`verify_grad` is based
on the Finite Difference Method where the derivative of function ``f``
at point ``x`` is approximated as:
on the Finite Difference Method where the derivative of function :math:`f`
at point :math:`x` is approximated as:
.. math::
\frac{\partial{f}}{\partial{x}} = lim_{\Delta \rightarrow 0} \frac {f(x+\Delta) - f(x-\Delta)} {2\Delta}
``verify_grad`` performs the following steps:
:func:`verify_grad` performs the following steps:
* approximates the gradient numerically using the Finite Difference Method
......@@ -142,7 +153,7 @@ Here is the prototype for the :func:`verify_grad` function.
def verify_grad(fun, pt, n_tests=2, rng=None, eps=1.0e-7, abs_tol=0.0001, rel_tol=0.0001):
:func:`verify_grad` raises an ``Exception`` if the difference between the analytic gradient and
:func:`verify_grad` raises an :class:`Exception` if the difference between the analytic gradient and
numerical gradient (computed through the Finite Difference Method) of a random
projection of the fun's output to a scalar exceeds both the given absolute and
relative tolerances.
......@@ -152,15 +163,15 @@ The parameters are as follows:
* ``fun``: a Python function that takes Aesara variables as inputs,
and returns an Aesara variable.
For instance, an :class:`Op` instance with a single output is such a function.
It can also be a Python function that calls an op with some of its
It can also be a Python function that calls an :class:`Op` with some of its
inputs being fixed to specific values, or that combine multiple :class:`Op`\s.
* ``pt``: the list of numpy.ndarrays to use as input values
* ``pt``: the list of `np.ndarray`\s to use as input values
* ``n_tests``: number of times to run the test
* ``rng``: random number generator used to generate a random vector u,
we check the gradient of sum(u*fn) at pt
* ``rng``: random number generator used to generate a random vector `u`,
we check the gradient of ``sum(u*fn)`` at ``pt``
* ``eps``: stepsize used in the Finite Difference Method
......@@ -176,12 +187,12 @@ symbolic variable:
def test_verify_exprgrad():
def fun(x,y,z):
return (x + tensor.cos(y)) / (4 * z)**2
return (x + at.cos(y)) / (4 * z)**2
x_val = numpy.asarray([[1], [1.1], [1.2]])
y_val = numpy.asarray([0.1, 0.2])
z_val = numpy.asarray(2)
rng = numpy.random.RandomState(42)
x_val = np.asarray([[1], [1.1], [1.2]])
y_val = np.asarray([0.1, 0.2])
z_val = np.asarray(2)
rng = np.random.default_rng(42)
aesara.gradient.verify_grad(fun, [x_val, y_val, z_val], rng=rng)
......@@ -190,11 +201,13 @@ Here is an example showing how to use :func:`verify_grad` on an :class:`Op` inst
.. testcode::
def test_flatten_outdimNone():
# Testing gradient w.r.t. all inputs of an op (in this example the op
# being used is Flatten(), which takes a single input).
a_val = numpy.asarray([[0,1,2],[3,4,5]], dtype='float64')
rng = numpy.random.RandomState(42)
aesara.gradient.verify_grad(tensor.Flatten(), [a_val], rng=rng)
"""
Testing gradient w.r.t. all inputs of an `Op` (in this example the `Op`
being used is `Flatten`, which takes a single input).
"""
a_val = np.asarray([[0,1,2],[3,4,5]], dtype='float64')
rng = np.random.default_rng(42)
aesara.gradient.verify_grad(at.Flatten(), [a_val], rng=rng)
Here is another example, showing how to verify the gradient w.r.t. a subset of
an :class:`Op`'s inputs. This is useful in particular when the gradient w.r.t. some of
......@@ -204,29 +217,30 @@ which would cause :func:`verify_grad` to crash.
.. testcode::
def test_crossentropy_softmax_grad():
op = tensor.nnet.crossentropy_softmax_argmax_1hot_with_bias
op = at.nnet.crossentropy_softmax_argmax_1hot_with_bias
def op_with_fixed_y_idx(x, b):
# Input `y_idx` of this Op takes integer values, so we fix them
# Input `y_idx` of this `Op` takes integer values, so we fix them
# to some constant array.
# Although this op has multiple outputs, we can return only one.
# Although this `Op` has multiple outputs, we can return only one.
# Here, we return the first output only.
return op(x, b, y_idx=numpy.asarray([0, 2]))[0]
return op(x, b, y_idx=np.asarray([0, 2]))[0]
x_val = numpy.asarray([[-1, 0, 1], [3, 2, 1]], dtype='float64')
b_val = numpy.asarray([1, 2, 3], dtype='float64')
rng = numpy.random.RandomState(42)
x_val = np.asarray([[-1, 0, 1], [3, 2, 1]], dtype='float64')
b_val = np.asarray([1, 2, 3], dtype='float64')
rng = np.random.default_rng(42)
aesara.gradient.verify_grad(op_with_fixed_y_idx, [x_val, b_val], rng=rng)
.. note::
Although ``verify_grad`` is defined in ``aesara.tensor.basic``, unittests
should use the version of ``verify_grad`` defined in ``tests.unittest_tools``.
Although :func:`verify_grad` is defined in :mod:`aesara.gradient`, unittests
should use the version of :func:`verify_grad` defined in :mod:`tests.unittest_tools`.
This is simply a wrapper function which takes care of seeding the random
number generator appropriately before calling ``aesara.gradient.verify_grad``
number generator appropriately before calling :func:`aesara.gradient.verify_grad`.
makeTester and makeBroadcastTester
==================================
:func:`makeTester` and :func:`makeBroadcastTester`
==================================================
Most :class:`Op` unittests perform the same function. All such tests must
verify that the :class:`Op` generates the proper output, that the gradient is
......@@ -244,21 +258,23 @@ product :class:`Op`:
from tests.tensor.utils import makeTester
rng = np.random.default_rng(23098)
TestDot = makeTester(
name="DotTester",
op=np.dot,
expected=lambda x, y: numpy.dot(x, y),
expected=lambda x, y: np.dot(x, y),
checks={},
good=dict(
correct1=(rng.rand(5, 7), rng.rand(7, 5)),
correct2=(rng.rand(5, 7), rng.rand(7, 9)),
correct3=(rng.rand(5, 7), rng.rand(7)),
correct1=(rng.random((5, 7)), rng.random((7, 5))),
correct2=(rng.random((5, 7)), rng.random((7, 9))),
correct3=(rng.random((5, 7)), rng.random((7,))),
),
bad_build=dict(),
bad_runtime=dict(
bad1=(rng.rand(5, 7), rng.rand(5, 7)), bad2=(rng.rand(5, 7), rng.rand(8, 3))
bad1=(rng.random((5, 7)), rng.random((5, 7))),
bad2=(rng.random((5, 7)), rng.random((8, 3)))
),
grad=dict(),
)
......
......@@ -14,37 +14,36 @@ Guide
=====
The NanGuardMode aims to prevent the model from outputting NaNs or Infs. It has
a number of self-checks, which can help to find out which apply node is
generating those incorrect outputs. It provides automatic detection of 3 types
The :class:`NanGuardMode` aims to prevent the model from outputting NaNs or Infs. It has
a number of self-checks, which can help to find out which :class:`Apply` node is
generating those incorrect outputs. It provides automatic detection of three types
of abnormal values: NaNs, Infs, and abnormally big values.
NanGuardMode can be used as follows:
`NanGuardMode` can be used as follows:
.. testcode::
import numpy
import numpy as np
import aesara
import aesara.tensor as at
from aesara.compile.nanguardmode import NanGuardMode
x = at.matrix()
w = aesara.shared(numpy.random.randn(5, 7).astype(aesara.config.floatX))
w = aesara.shared(np.random.standard_normal((5, 7)).astype(aesara.config.floatX))
y = at.dot(x, w)
fun = aesara.function(
[x], y,
mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
)
While using the aesara function ``fun``, it will monitor the values of each
While using the Aesara function ``fun``, it will monitor the values of each
input and output variable of each node. When abnormal values are
detected, it raises an error to indicate which node yields the NaNs. For
example, if we pass the following values to ``fun``:
.. testcode::
infa = numpy.tile(
(numpy.asarray(100.) ** 1000000).astype(aesara.config.floatX), (3, 5))
infa = np.tile((np.asarray(100.) ** 1000000).astype(aesara.config.floatX), (3, 5))
fun(infa)
.. testoutput::
......@@ -55,17 +54,17 @@ example, if we pass the following values to ``fun``:
...
AssertionError: ...
It will raise an AssertionError indicating that Inf value is detected while
It will raise an `AssertionError` indicating that Inf value is detected while
executing the function.
You can also set the three parameters in ``NanGuardMode()`` to indicate which
You can also set the three parameters in `NanGuardMode` to indicate which
kind of abnormal values to monitor. ``nan_is_error`` and ``inf_is_error`` have
no default values, so they need to be set explicitly, but ``big_is_error`` is
set to be ``True`` by default.
.. note::
NanGuardMode significantly slows down computations; only
`NanGuardMode` significantly slows down computations; only
enable as needed.
Reference
......
......@@ -797,8 +797,7 @@ import ``aesara`` and print the config variable, as in:
Aesara will execute the graph using constants and/or shared variables
provided by the user. Purely symbolic variables (e.g. ``x =
aesara.tensor.dmatrix()``) can be augmented with test values, by writing to
their ``tag.test_value`` attribute (e.g. ``x.tag.test_value =
numpy.random.rand(5, 4)``).
their ``.tag.test_value`` attributes (e.g. ``x.tag.test_value = np.ones((5, 4))``).
When not ``'off'``, the value of this option dictates what happens when
an :class:`Op`'s inputs do not provide appropriate test values:
......
......@@ -8,29 +8,32 @@ Basic Tensor Functionality
.. testsetup::
import numpy as np
import aesara
import aesara.tensor as at
from aesara.tensor.type import scalar, iscalar, TensorType, dmatrix, ivector, fmatrix
from aesara.tensor import set_subtensor, inc_subtensor, batched_dot
from aesara import shared
import numpy
import numpy as np
Aesara supports any kind of Python object, but its focus is support for
symbolic matrix expressions. When you type,
Aesara supports symbolic tensor expressions. When you type,
>>> import aesara.tensor as at
>>> x = at.fmatrix()
the ``x`` is a :class:`TensorVariable` instance.
The ``at.fmatrix`` object itself is an instance of :class:`TensorType`.
Aesara knows what type of variable ``x`` is because ``x.type``
points back to ``at.fmatrix``.
This chapter explains the various ways of creating tensor variables,
This section explains the various ways in which a tensor variable can be created,
the attributes and methods of :class:`TensorVariable` and :class:`TensorType`,
and various basic symbolic math and arithmetic that Aesara supports for
tensor variables.
In general, Aesara's API tries to mirror NumPy's, so, in most cases, it's safe
to assume that the basic NumPy array functions and methods will be available.
.. _libdoc_tensor_creation:
Creation
......@@ -39,63 +42,65 @@ Creation
Aesara provides a list of predefined tensor types that can be used
to create a tensor variables. Variables can be named to facilitate debugging,
and all of these constructors accept an optional ``name`` argument.
For example, the following each produce a TensorVariable instance that stands
for a 0-dimensional ndarray of integers with the name ``'myvar'``:
For example, the following each produce a `TensorVariable` instance that stands
for a 0-dimensional `ndarray` of integers with the name ``'myvar'``:
>>> x = scalar('myvar', dtype='int32')
>>> x = iscalar('myvar')
>>> x = at.scalar('myvar', dtype='int32')
>>> x = at.iscalar('myvar')
>>> x = at.tensor(dtype='int32', shape=(), name='myvar')
>>> from aesara.tensor.type import TensorType
>>> x = TensorType(dtype='int32', shape=())('myvar')
Constructors with optional dtype
----------------------------------------
--------------------------------
These are the simplest and often-preferred methods for creating symbolic
variables in your code. By default, they produce floating-point variables
(with dtype determined by config.floatX, see :attr:`floatX`) so if you use
(with dtype determined by `aesara.config.floatX`) so if you use
these constructors it is easy to switch your code between different levels of
floating-point precision.
.. function:: scalar(name=None, dtype=config.floatX)
Return a Variable for a 0-dimensional ndarray
Return a `Variable` for a 0-dimensional `ndarray`
.. function:: vector(name=None, dtype=config.floatX)
Return a Variable for a 1-dimensional ndarray
Return a `Variable` for a 1-dimensional `ndarray`
.. function:: row(name=None, dtype=config.floatX)
Return a Variable for a 2-dimensional ndarray
Return a `Variable` for a 2-dimensional `ndarray`
in which the number of rows is guaranteed to be 1.
.. function:: col(name=None, dtype=config.floatX)
Return a Variable for a 2-dimensional ndarray
Return a `Variable` for a 2-dimensional `ndarray`
in which the number of columns is guaranteed to be 1.
.. function:: matrix(name=None, dtype=config.floatX)
Return a Variable for a 2-dimensional ndarray
Return a `Variable` for a 2-dimensional `ndarray`
.. function:: tensor3(name=None, dtype=config.floatX)
Return a Variable for a 3-dimensional ndarray
Return a `Variable` for a 3-dimensional `ndarray`
.. function:: tensor4(name=None, dtype=config.floatX)
Return a Variable for a 4-dimensional ndarray
Return a `Variable` for a 4-dimensional `ndarray`
.. function:: tensor5(name=None, dtype=config.floatX)
Return a Variable for a 5-dimensional ndarray
Return a `Variable` for a 5-dimensional `ndarray`
.. function:: tensor6(name=None, dtype=config.floatX)
Return a Variable for a 6-dimensional ndarray
Return a `Variable` for a 6-dimensional `ndarray`
.. function:: tensor7(name=None, dtype=config.floatX)
Return a Variable for a 7-dimensional ndarray
Return a `Variable` for a 7-dimensional `ndarray`
.. #COMMENT
Each of the types described above can be constructed by two methods:
......@@ -109,16 +114,14 @@ floating-point precision.
All Fully-Typed Constructors
----------------------------
The following TensorType instances are provided in the aesara.tensor module.
The following `TensorType` instances are provided in the `aesara.tensor` module.
They are all callable, and accept an optional ``name`` argument. So for example:
.. testcode:: constructors
from aesara.tensor import *
x = dmatrix() # creates one Variable with no name
x = dmatrix('x') # creates one Variable with name 'x'
xyz = dmatrix('xyz') # creates one Variable with name 'xyz'
x = at.dmatrix() # creates one Variable with no name
x = at.dmatrix('x') # creates one Variable with name 'x'
xyz = at.dmatrix('xyz') # creates one Variable with name 'xyz'
.. #COMMENT
table generated by
......@@ -210,7 +213,7 @@ ztensor7 complex128 7 (?,?,?,?,?,?,?) (False,) * 7
============ =========== ==== ================ ===================================
Plural Constructors
--------------------------
-------------------
There are several constructors that can produce multiple variables at once.
These are not frequently used in practice, but often used in tutorial examples to save space!
......@@ -237,16 +240,16 @@ These are not frequently used in practice, but often used in tutorial examples t
Each of these plural constructors accepts
an integer or several strings. If an integer is provided, the method
will return that many Variables and if strings are provided, it will
create one Variable for each string, using the string as the Variable's
will return that many `Variables` and if strings are provided, it will
create one `Variable` for each string, using the string as the `Variable`'s
name. For example:
.. testcode:: constructors
from aesara.tensor import *
x, y, z = dmatrices(3) # creates three matrix Variables with no names
x, y, z = dmatrices('x', 'y', 'z') # creates three matrix Variables named 'x', 'y' and 'z'
# Creates three matrix `Variable`s with no names
x, y, z = at.dmatrices(3)
# Creates three matrix `Variables` named 'x', 'y' and 'z'
x, y, z = at.dmatrices('x', 'y', 'z')
Custom tensor types
......@@ -258,110 +261,121 @@ your own :class:`TensorType` instance. You create such an instance by passing
the dtype and broadcasting pattern to the constructor. For example, you
can create your own 8-dimensional tensor type
>>> dtensor8 = TensorType('float64', (False,)*8)
>>> dtensor8 = TensorType(dtype='float64', shape=(None,)*8)
>>> x = dtensor8()
>>> z = dtensor8('z')
You can also redefine some of the provided types and they will interact
correctly:
>>> my_dmatrix = TensorType('float64', (False,)*2)
>>> x = my_dmatrix() # allocate a matrix variable
>>> my_dmatrix = TensorType('float64', shape=(None,)*2)
>>> x = my_dmatrix() # allocate a matrix variable
>>> my_dmatrix == dmatrix
True
See :class:`TensorType` for more information about creating new types of
Tensor.
tensors.
Converting from Python Objects
-------------------------------
Another way of creating a TensorVariable (a TensorSharedVariable to be
precise) is by calling :func:`shared()`
Another way of creating a `TensorVariable` (a `TensorSharedVariable` to be
precise) is by calling :func:`aesara.shared`
.. testcode::
x = shared(numpy.random.randn(3,4))
x = aesara.shared(np.random.standard_normal((3, 4)))
This will return a :term:`shared variable <shared variable>` whose ``.value`` is
a numpy ndarray. The number of dimensions and dtype of the Variable are
inferred from the ndarray argument. The argument to `shared` *will not be
a NumPy `ndarray`. The number of dimensions and dtype of the `Variable` are
inferred from the `ndarray` argument. The argument to `shared` *will not be
copied*, and subsequent changes will be reflected in ``x.value``.
For additional information, see the :func:`shared() <shared.shared>` documentation.
.. _libdoc_tensor_autocasting:
Finally, when you use a numpy ndarray or a Python number together with
Finally, when you use a NumPy `ndarray` or a Python number together with
:class:`TensorVariable` instances in arithmetic expressions, the result is a
:class:`TensorVariable`. What happens to the ndarray or the number?
Aesara requires that the inputs to all expressions be Variable instances, so
:class:`TensorVariable`. What happens to the `ndarray` or the number?
Aesara requires that the inputs to all expressions be `Variable` instances, so
Aesara automatically wraps them in a :class:`TensorConstant`.
.. note::
Aesara makes a copy of any ndarray that you use in an expression, so
subsequent
changes to that ndarray will not have any effect on the Aesara expression.
Aesara makes a copy of any `ndarray` that is used in an expression, so
subsequent changes to that `ndarray` will not have any effect on the Aesara
expression in which it is contained.
For numpy ndarrays the dtype is given, but the broadcastable pattern must be
inferred. The TensorConstant is given a type with a matching dtype,
and a broadcastable pattern with a ``True`` for every shape dimension that is 1.
For NumPy `ndarrays` the dtype is given, but the static shape/broadcastable pattern must be
inferred. The `TensorConstant` is given a type with a matching dtype,
and a static shape/broadcastable pattern with a ``1``\/``True`` for every shape
dimension that is one and ``None``\/``False`` for every dimension with an unknown
shape.
For python numbers, the broadcastable pattern is ``()`` but the dtype must be
For Python numbers, the static shape/broadcastable pattern is ``()`` but the dtype must be
inferred. Python integers are stored in the smallest dtype that can hold
them, so small constants like ``1`` are stored in a ``bscalar``.
Likewise, Python floats are stored in an fscalar if fscalar suffices to hold
them perfectly, but a dscalar otherwise.
them, so small constants like ``1`` are stored in a `bscalar`.
Likewise, Python floats are stored in an `fscalar` if `fscalar` suffices to hold
them perfectly, but a `dscalar` otherwise.
.. note::
When config.floatX==float32 (see :mod:`config`), then Python floats
When ``config.floatX == float32`` (see :mod:`config`), then Python floats
are stored instead as single-precision floats.
For fine control of this rounding policy, see
aesara.tensor.basic.autocast_float.
`aesara.tensor.basic.autocast_float`.
.. function:: as_tensor_variable(x, name=None, ndim=None)
Turn an argument `x` into a TensorVariable or TensorConstant.
Turn an argument `x` into a `TensorVariable` or `TensorConstant`.
Many tensor Ops run their arguments through this function as
pre-processing. It passes through TensorVariable instances, and tries to
wrap other objects into TensorConstant.
Many tensor `Op`\s run their arguments through this function as
pre-processing. It passes through `TensorVariable` instances, and tries to
wrap other objects into `TensorConstant`.
When `x` is a Python number, the dtype is inferred as described above.
When `x` is a `list` or `tuple` it is passed through numpy.asarray
When `x` is a `list` or `tuple` it is passed through `np.asarray`
If the `ndim` argument is not None, it must be an integer and the output
If the `ndim` argument is not ``None``, it must be an integer and the output
will be broadcasted if necessary in order to have this many dimensions.
:rtype: :class:`TensorVariable` or :class:`TensorConstant`
TensorType and TensorVariable
=============================
`TensorType` and `TensorVariable`
=================================
.. class:: TensorType(Type)
The Type class used to mark Variables that stand for `numpy.ndarray`
values (`numpy.memmap`, which is a subclass of `numpy.ndarray`, is also allowed).
Recalling to the tutorial, the purple box in
The `Type` class used to mark Variables that stand for `numpy.ndarray`
values. `numpy.memmap`, which is a subclass of `numpy.ndarray`, is also
allowed. Recall from the tutorial that the purple box in
:ref:`the tutorial's graph-structure figure <tutorial-graphfigure>` is an instance of this class.
.. attribute:: shape
A tuple of ``None`` and integer values representing the static shape associated with this
`Type`. ``None`` values represent unknown/non-fixed shape values.
.. note::
Broadcastable tuples/values are an old Theano construct that are
being phased-out in Aesara.
.. attribute:: broadcastable
A tuple of True/False values, one for each dimension. True in
position 'i' indicates that at evaluation-time, the ndarray will have
size 1 in that 'i'-th dimension. Such a dimension is called a
A tuple of ``True``\/``False`` values, one for each dimension. ``True`` in
position ``i`` indicates that at evaluation-time, the `ndarray` will have
size one in that ``i``-th dimension. Such a dimension is called a
*broadcastable dimension* (see :ref:`tutbroadcasting`).
The broadcastable pattern indicates both the number of dimensions and
whether a particular dimension must have length 1.
whether a particular dimension must have length one.
Here is a table mapping some `broadcastable` patterns to what they
Here is a table mapping some broadcastable patterns to what they
mean:
===================== =================================
......@@ -380,19 +394,18 @@ TensorType and TensorVariable
[False, False, False] A MxNxP tensor (pattern of a + b)
===================== =================================
For dimensions in which broadcasting is False, the length of this
dimension can be 1 or more. For dimensions in which broadcasting is True,
the length of this dimension must be 1.
For dimensions in which broadcasting is ``False``, the length of this
dimension can be one or more. For dimensions in which broadcasting is ``True``,
the length of this dimension must be one.
When two arguments to an element-wise operation (like addition or
subtraction) have a different
number of dimensions, the broadcastable
subtraction) have a different number of dimensions, the broadcastable
pattern is *expanded to the left*, by padding with ``True``. For example,
a vector's pattern, ``[False]``, could be expanded to ``[True, False]``, and
would behave like a row (1xN matrix). In the same way, a matrix (``[False,
False]``) would behave like a 1xNxP tensor (``[True, False, False]``).
If we wanted to create a type representing a matrix that would
If we wanted to create a `TensorType` representing a matrix that would
broadcast over the middle dimension of a 3-dimensional tensor when
adding them together, we would define it like this:
......@@ -400,19 +413,18 @@ TensorType and TensorVariable
.. attribute:: ndim
The number of dimensions that a Variable's value will have at
The number of dimensions that a `Variable`'s value will have at
evaluation-time. This must be known when we are building the
expression graph.
.. attribute:: dtype
A string indicating
the numerical type of the ndarray for which a Variable of this Type
is standing.
A string indicating the numerical type of the `ndarray` that a
`Variable` of this `Type` represents.
.. _dtype_list:
The dtype attribute of a TensorType instance can be any of the
The :attr:`dtype` attribute of a `TensorType` instance can be any of the
following strings.
================= =================== =================
......@@ -434,31 +446,31 @@ TensorType and TensorVariable
.. method:: __init__(self, dtype, broadcastable)
If you wish to use a type of tensor which is not already available
(for example, a 5D tensor) you can build an appropriate type by instantiating
If you wish to use a `Type` that is not already available (for example,
a 5D tensor), you can build an appropriate `Type` by instantiating
:class:`TensorType`.
TensorVariable
`TensorVariable`
----------------
.. class:: TensorVariable(Variable, _tensor_py_operators)
The result of symbolic operations typically have this type.
A `Variable` type that represents symbolic tensors.
See :class:`_tensor_py_operators` for most of the attributes and methods
you'll want to call.
.. class:: TensorConstant(Variable, _tensor_py_operators)
Python and numpy numbers are wrapped in this type.
Python and NumPy numbers are wrapped in this type.
See :class:`_tensor_py_operators` for most of the attributes and methods
you'll want to call.
.. class:: TensorSharedVariable(Variable, _tensor_py_operators)
This type is returned by :func:`shared` when the value to share is a numpy
This type is returned by :func:`shared` when the value to share is a NumPy
ndarray.
See :class:`_tensor_py_operators` for most of the attributes and methods
......@@ -469,7 +481,7 @@ TensorVariable
:members:
This mix-in class adds convenient attributes, methods, and support
to TensorVariable, TensorConstant and TensorSharedVariable for
to `TensorVariable`, `TensorConstant` and `TensorSharedVariable` for
Python operators (see :ref:`tensor_operator_support`).
.. attribute:: type
......@@ -493,7 +505,7 @@ TensorVariable
:noindex:
Returns a view of this tensor that has been reshaped as in
numpy.reshape. If the shape is a Variable argument, then you might
`numpy.reshape`. If the shape is a `Variable` argument, then you might
need to use the optional `ndim` parameter to declare how many elements
the shape has, and therefore how many dimensions the reshaped Variable
will have.
......@@ -504,32 +516,32 @@ TensorVariable
:noindex:
Returns a view of this tensor with permuted dimensions. Typically the
pattern will include the integers 0, 1, ... ndim-1, and any number of
'x' characters in dimensions where this tensor should be broadcasted.
pattern will include the integers ``0, 1, ... ndim-1``, and any number of
``'x'`` characters in dimensions where this tensor should be broadcasted.
A few examples of patterns and their effect:
* ('x') -> make a 0d (scalar) into a 1d vector
* (0, 1) -> identity for 2d vectors
* (1, 0) -> inverts the first and second dimensions
* ('x', 0) -> make a row out of a 1d vector (N to 1xN)
* (0, 'x') -> make a column out of a 1d vector (N to Nx1)
* (2, 0, 1) -> AxBxC to CxAxB
* (0, 'x', 1) -> AxB to Ax1xB
* (1, 'x', 0) -> AxB to Bx1xA
* (1,) -> This remove dimensions 0. It must be a broadcastable dimension (1xA to A)
* ``('x',)``: make a 0d (scalar) into a 1d vector
* ``(0, 1)``: identity for 2d vectors
* ``(1, 0)``: inverts the first and second dimensions
* ``('x', 0)``: make a row out of a 1d vector (N to 1xN)
* ``(0, 'x')``: make a column out of a 1d vector (N to Nx1)
* ``(2, 0, 1)``: AxBxC to CxAxB
* ``(0, 'x', 1)``: AxB to Ax1xB
* ``(1, 'x', 0)``: AxB to Bx1xA
* ``(1,)``: This removes the dimension at index 0. It must be a broadcastable dimension.
.. method:: flatten(ndim=1)
Returns a view of this tensor with `ndim` dimensions, whose shape for the first
`ndim-1` dimensions will be the same as `self`, and shape in the
remaining dimension will be expanded to fit in all the data from self.
``ndim-1`` dimensions will be the same as ``self``, and shape in the
remaining dimension will be expanded to fit in all the data from ``self``.
See :func:`flatten`.
.. method:: ravel()
return self.flatten(). For NumPy compatibility.
Returns ``self.flatten()``. Provided for NumPy compatibility.
.. attribute:: T
......@@ -538,19 +550,16 @@ TensorVariable
>>> x = at.zmatrix()
>>> y = 3+.2j * x.T
.. note::
In NumPy and in Aesara, the transpose of a vector is exactly the
same vector! Use `reshape` or `dimshuffle` to turn your vector
into a row or column matrix.
.. method:: {any,all}(axis=None, keepdims=False)
.. method:: {sum,prod,mean}(axis=None, dtype=None, keepdims=False, acc_dtype=None)
.. method:: {var,std,min,max,argmin,argmax}(axis=None, keepdims=False),
.. method:: diagonal(offset=0, axis1=0, axis2=1)
.. method:: astype(dtype)
.. method:: take(indices, axis=None, mode='raise')
.. method:: copy() Return a new symbolic variable that is a copy of the variable. Does not copy the tag.
.. method:: copy()
Return a new symbolic variable that is a copy of the variable. Does not copy the tag.
.. method:: norm(L, axis=None)
.. method:: nonzero(self, return_matrix=False)
:noindex:
......@@ -584,27 +593,27 @@ dimensions, see :meth:`_tensor_py_operators.dimshuffle`.
.. function:: shape(x)
Returns an lvector representing the shape of `x`.
Returns an `lvector` representing the shape of `x`.
.. function:: reshape(x, newshape, ndim=None)
:noindex:
:type x: any TensorVariable (or compatible)
:type x: any `TensorVariable` (or compatible)
:param x: variable to be reshaped
:type newshape: lvector (or compatible)
:type newshape: `lvector` (or compatible)
:param newshape: the new shape for `x`
:param ndim: optional - the length that `newshape`'s value will have.
If this is ``None``, then `reshape()` will infer it from `newshape`.
If this is ``None``, then `reshape` will infer it from `newshape`.
:rtype: variable with x's dtype, but ndim dimensions
:rtype: variable with `x`'s dtype, but `ndim` dimensions
.. note::
This function can infer the length of a symbolic newshape in some
cases, but if it cannot and you do not provide the `ndim`, then this
function will raise an Exception.
This function can infer the length of a symbolic `newshape` value in
some cases, but if it cannot and you do not provide the `ndim`, then
this function will raise an Exception.
.. function:: shape_padleft(x, n_ones=1)
......@@ -614,7 +623,7 @@ dimensions, see :meth:`_tensor_py_operators.dimshuffle`.
see the :func:`unbroadcast`.
:param x: variable to be reshaped
:type x: any TensorVariable (or compatible)
:type x: any `TensorVariable` (or compatible)
:type n_ones: int
:param n_ones: number of dimensions to be added to `x`
......@@ -623,7 +632,7 @@ dimensions, see :meth:`_tensor_py_operators.dimshuffle`.
.. function:: shape_padright(x, n_ones=1)
Reshape `x` by right padding the shape with `n_ones` 1s. Note that all
Reshape `x` by right padding the shape with `n_ones` ones. Note that all
this new dimension will be broadcastable. To make them non-broadcastable
see the :func:`unbroadcast`.
......@@ -636,11 +645,11 @@ dimensions, see :meth:`_tensor_py_operators.dimshuffle`.
.. function:: shape_padaxis(t, axis)
Reshape `t` by inserting 1 at the dimension `axis`. Note that this new
Reshape `t` by inserting ``1`` at the dimension `axis`. Note that this new
dimension will be broadcastable. To make it non-broadcastable
see the :func:`unbroadcast`.
:type x: any TensorVariable (or compatible)
:type x: any `TensorVariable` (or compatible)
:param x: variable to be reshaped
:type axis: int
......@@ -669,7 +678,7 @@ dimensions, see :meth:`_tensor_py_operators.dimshuffle`.
Similar to :func:`reshape`, but the shape is inferred from the shape of `x`.
:param x: variable to be flattened
:type x: any TensorVariable (or compatible)
:type x: any `TensorVariable` (or compatible)
:type ndim: int
:param ndim: the number of dimensions in the returned variable
......@@ -679,10 +688,10 @@ dimensions, see :meth:`_tensor_py_operators.dimshuffle`.
dimensions, but with all remaining dimensions of `x` collapsed into
the last dimension.
For example, if we flatten a tensor of shape (2, 3, 4, 5) with flatten(x,
ndim=2), then we'll have the same (2-1=1) leading dimensions (2,), and the
remaining dimensions are collapsed. So the output in this example would
have shape (2, 60).
For example, if we flatten a tensor of shape ``(2, 3, 4, 5)`` with ``flatten(x,
ndim=2)``, then we'll have the same (i.e. ``2-1=1``) leading dimensions
``(2,)``, and the remaining dimensions are collapsed, so the output in this
example would have shape ``(2, 60)``.
.. function:: tile(x, reps, ndim=None)
......@@ -702,13 +711,13 @@ dimensions, see :meth:`_tensor_py_operators.dimshuffle`.
<aesara.tensor.extra_ops.repeat>`
:note: Currently, `reps` must be a constant, `x.ndim` and
`len(reps)` must be equal and, if specified, `ndim` must be
``len(reps)`` must be equal and, if specified, `ndim` must be
equal to both.
.. autofunction:: roll
Creating Tensor
===============
Creating Tensors
================
.. function:: zeros_like(x, dtype=None)
......@@ -717,7 +726,7 @@ Creating Tensor
:param dtype: data-type, optional
By default, it will be x.dtype.
Returns a tensor the shape of x filled with zeros of the type of dtype.
Returns a tensor the shape of `x` filled with zeros of the type of `dtype`.
.. function:: ones_like(x)
......@@ -725,31 +734,31 @@ Creating Tensor
:param x: tensor that has the same shape as output
:param dtype: data-type, optional
By default, it will be x.dtype.
By default, it will be `x.dtype`.
Returns a tensor the shape of x filled with ones of the type of dtype.
Returns a tensor the shape of `x` filled with ones of the type of `dtype`.
.. function:: zeros(shape, dtype=None)
:param shape: a tuple/list of scalars with the shape information.
:param dtype: the dtype of the new tensor. If None, will use floatX.
:param dtype: the dtype of the new tensor. If ``None``, will use ``"floatX"``.
Returns a tensor filled with 0s of the provided shape.
Returns a tensor filled with zeros of the provided shape.
.. function:: ones(shape, dtype=None)
:param shape: a tuple/list of scalars with the shape information.
:param dtype: the dtype of the new tensor. If None, will use floatX.
:param dtype: the dtype of the new tensor. If ``None``, will use ``"floatX"``.
Returns a tensor filled with 1s of the provided shape.
Returns a tensor filled with ones of the provided shape.
.. function:: fill(a,b)
:param a: tensor that has same shape as output
:param b: aesara scalar or value with which you want to fill the output
:param b: Aesara scalar or value with which you want to fill the output
Create a matrix by filling the shape of `a` with `b`
Create a matrix by filling the shape of `a` with `b`.
.. function:: alloc(value, *shape)
......@@ -759,9 +768,9 @@ Creating Tensor
.. function:: eye(n, m=None, k=0, dtype=aesara.config.floatX)
:param n: number of rows in output (value or aesara scalar)
:param m: number of columns in output (value or aesara scalar)
:param k: Index of the diagonal: 0 refers to the main diagonal,
:param n: number of rows in output (value or Aesara scalar)
:param m: number of columns in output (value or Aesara scalar)
:param k: Index of the diagonal: ``0`` refers to the main diagonal,
a positive value refers to an upper diagonal, and a
negative value to a lower diagonal. It can be an Aesara
scalar.
......@@ -771,21 +780,21 @@ Creating Tensor
.. function:: identity_like(x)
:param x: tensor
:returns: A tensor of same shape as `x` that is filled with 0s everywhere
:returns: A tensor of the same shape as `x` that is filled with zeros everywhere
except for the main diagonal, whose values are equal to one. The output
will have same dtype as `x`.
.. function:: stack(tensors, axis=0)
Stack tensors in sequence on given axis (default is 0).
Stack tensors in sequence on given axis (default is ``0``).
Take a sequence of tensors and stack them on given axis to make a single
tensor. The size in dimension `axis` of the result will be equal to the number
of tensors passed.
:param tensors: a list or a tuple of one or more tensors of the same rank.
:param axis: the axis along which the tensors will be stacked. Default value is 0.
:returns: A tensor such that rval[0] == tensors[0], rval[1] == tensors[1], etc.
:param axis: the axis along which the tensors will be stacked. Default value is ``0``.
:returns: A tensor such that ``rval[0] == tensors[0]``, ``rval[1] == tensors[1]``, etc.
Examples:
......@@ -805,7 +814,7 @@ Creating Tensor
>>> rval.shape # 3 tensors are stacked on axis 0
(3, 2, 2, 2, 2)
We can also specify different axis than default value 0
We can also specify a different axis than the default value ``0``:
>>> x = aesara.tensor.stack([a, b, c], axis=3)
>>> x.ndim
......@@ -834,7 +843,7 @@ Creating Tensor
tensor.
:param tensors: one or more tensors of the same rank
:returns: A tensor such that rval[0] == tensors[0], rval[1] == tensors[1], etc.
:returns: A tensor such that ``rval[0] == tensors[0]``, ``rval[1] == tensors[1]``, etc.
>>> x0 = at.scalar()
>>> x1 = at.scalar()
......@@ -906,7 +915,7 @@ Reductions
:Returns: maximum of *x* along *axis*
`axis` can be:
* *None* - in which case the maximum is computed along all axes (like numpy)
* *None* - in which case the maximum is computed along all axes (like NumPy)
* an *int* - computed along this axis
* a *list of ints* - computed along these axes
......@@ -919,7 +928,7 @@ Reductions
will broadcast correctly against the original tensor.
:Returns: the index of the maximum value along a given axis
if axis=None, argmax over the flattened tensor (like numpy)
if ``axis`` is ``None``, `argmax` is performed over the flattened tensor (like NumPy)
.. function:: max_and_argmax(x, axis=None, keepdims=False)
......@@ -930,7 +939,7 @@ Reductions
will broadcast correctly against the original tensor.
:Returns: the maximum value along a given axis and its index.
if axis=None, max_and_argmax over the flattened tensor (like numpy)
if ``axis`` is ``None``, `max_and_argmax` is performed over the flattened tensor (like NumPy)
.. function:: min(x, axis=None, keepdims=False)
......@@ -941,8 +950,8 @@ Reductions
will broadcast correctly against the original tensor.
:Returns: minimum of *x* along *axis*
axis can be:
* *None* - in which case the minimum is computed along all axes (like numpy)
`axis` can be:
* ``None`` - in which case the minimum is computed along all axes (like NumPy)
* an *int* - computed along this axis
* a *list of ints* - computed along these axes
......@@ -955,7 +964,7 @@ Reductions
will broadcast correctly against the original tensor.
:Returns: the index of the minimum value along a given axis
if axis=None, argmin over the flattened tensor (like numpy)
if ``axis`` is ``None``, `argmin` is performed over the flattened tensor (like NumPy)
.. function:: sum(x, axis=None, dtype=None, keepdims=False, acc_dtype=None)
......@@ -987,10 +996,10 @@ Reductions
:Returns: sum of *x* along *axis*
axis can be:
* *None* - in which case the sum is computed along all axes (like numpy)
* an *int* - computed along this axis
* a *list of ints* - computed along these axes
`axis` can be:
* ``None`` - in which case the sum is computed along all axes (like NumPy)
* an int - computed along this axis
* a list of ints - computed along these axes
.. function:: prod(x, axis=None, dtype=None, keepdims=False, acc_dtype=None, no_zeros_in_input=False)
......@@ -1037,10 +1046,10 @@ Reductions
:Returns: product of every term in *x* along *axis*
axis can be:
* *None* - in which case the sum is computed along all axes (like numpy)
* an *int* - computed along this axis
* a *list of ints* - computed along these axes
`axis` can be:
* ``None`` - in which case the sum is computed along all axes (like NumPy)
* an int - computed along this axis
* a list of ints - computed along these axes
.. function:: mean(x, axis=None, dtype=None, keepdims=False, acc_dtype=None)
......@@ -1060,10 +1069,10 @@ Reductions
rules as :func:`sum()`.
:Returns: mean value of *x* along *axis*
axis can be:
* *None* - in which case the mean is computed along all axes (like numpy)
* an *int* - computed along this axis
* a *list of ints* - computed along these axes
`axis` can be:
* ``None`` - in which case the mean is computed along all axes (like NumPy)
* an int - computed along this axis
* a list of ints - computed along these axes
.. function:: var(x, axis=None, keepdims=False)
......@@ -1074,10 +1083,10 @@ Reductions
will broadcast correctly against the original tensor.
:Returns: variance of *x* along *axis*
axis can be:
* *None* - in which case the variance is computed along all axes (like numpy)
* an *int* - computed along this axis
* a *list of ints* - computed along these axes
`axis` can be:
* ``None`` - in which case the variance is computed along all axes (like NumPy)
* an int - computed along this axis
* a list of ints - computed along these axes
.. function:: std(x, axis=None, keepdims=False)
......@@ -1088,10 +1097,10 @@ Reductions
will broadcast correctly against the original tensor.
:Returns: variance of *x* along *axis*
axis can be:
* *None* - in which case the standard deviation is computed along all axes (like numpy)
* an *int* - computed along this axis
* a *list of ints* - computed along these axes
`axis` can be:
* ``None`` - in which case the standard deviation is computed along all axes (like NumPy)
* an int - computed along this axis
* a list of ints - computed along these axes
.. function:: all(x, axis=None, keepdims=False)
......@@ -1102,10 +1111,10 @@ Reductions
will broadcast correctly against the original tensor.
:Returns: bitwise and of *x* along *axis*
axis can be:
* *None* - in which case the 'bitwise and' is computed along all axes (like numpy)
* an *int* - computed along this axis
* a *list of ints* - computed along these axes
`axis` can be:
* ``None`` - in which case the 'bitwise and' is computed along all axes (like NumPy)
* an int - computed along this axis
* a list of ints - computed along these axes
.. function:: any(x, axis=None, keepdims=False)
......@@ -1116,10 +1125,10 @@ Reductions
will broadcast correctly against the original tensor.
:Returns: bitwise or of *x* along *axis*
axis can be:
* *None* - in which case the 'bitwise or' is computed along all axes (like numpy)
* an *int* - computed along this axis
* a *list of ints* - computed along these axes
`axis` can be:
* ``None`` - in which case the 'bitwise or' is computed along all axes (like NumPy)
* an int - computed along this axis
* a list of ints - computed along these axes
.. function:: ptp(x, axis = None)
......@@ -1205,30 +1214,30 @@ Casting
.. function:: cast(x, dtype)
Cast any tensor `x` to a Tensor of the same shape, but with a different
Cast any tensor `x` to a tensor of the same shape, but with a different
numerical type `dtype`.
This is not a reinterpret cast, but a coercion cast, similar to
This is not a reinterpret cast, but a coercion `cast`, similar to
``numpy.asarray(x, dtype=dtype)``.
.. testcode:: cast
import Aesara.tensor as at
import aesara.tensor as at
x = at.matrix()
x_as_int = at.cast(x, 'int32')
Attempting to casting a complex value to a real value is ambiguous and
will raise an exception. Use `real()`, `imag()`, `abs()`, or `angle()`.
will raise an exception. Use `real`, `imag`, `abs`, or `angle`.
.. function:: real(x)
Return the real (not imaginary) components of Tensor x.
For non-complex `x` this function returns x.
Return the real (not imaginary) components of tensor `x`.
For non-complex `x` this function returns `x`.
.. function:: imag(x)
Return the imaginary components of Tensor x.
For non-complex `x` this function returns zeros_like(x).
Return the imaginary components of tensor `x`.
For non-complex `x` this function returns ``zeros_like(x)``.
Comparisons
......@@ -1249,7 +1258,7 @@ The six usual equality and inequality operators share the same interface.
.. testcode:: oper
import Aesara.tensor as at
import aesara.tensor as at
x,y = at.dmatrices('x','y')
z = at.le(x,y)
......@@ -1332,8 +1341,8 @@ Condition
.. function:: switch(cond, ift, iff)
Returns a variable representing a switch between ift (iftrue) and iff (iffalse)
based on the condition cond. This is the Aesara equivalent of numpy.where.
Returns a variable representing a switch between `ift` (i.e. "if true") and `iff` (i.e. "if false")
based on the condition `cond`. This is the Aesara equivalent of `numpy.where`.
:Parameter: *cond* - symbolic Tensor (or compatible)
:Parameter: *ift* - symbolic Tensor (or compatible)
......@@ -1342,32 +1351,32 @@ Condition
.. testcode:: switch
import Aesara.tensor as at
import aesara.tensor as at
a,b = at.dmatrices('a','b')
x,y = at.dmatrices('x','y')
z = at.switch(at.lt(a,b), x, y)
.. function:: where(cond, ift, iff)
Alias for `switch`. where is the numpy name.
Alias for `switch`. ``where`` is the NumPy name.
.. function:: clip(x, min, max)
Return a variable representing x, but with all elements greater than
Return a variable representing `x`, but with all elements greater than
`max` clipped to `max` and all elements less than `min` clipped to `min`.
Normal broadcasting rules apply to each of `x`, `min`, and `max`.
Note that there is no warning for inputs that are the wrong way round
(`min > max`), and that results in this case may differ from ``numpy.clip``.
(`min > max`), and that results in this case may differ from `numpy.clip`.
Bit-wise
--------
The bitwise operators possess this interface:
:Parameter: *a* - symbolic Tensor of integer type.
:Parameter: *b* - symbolic Tensor of integer type.
:Parameter: *a* - symbolic tensor of integer type.
:Parameter: *b* - symbolic tensor of integer type.
.. note::
......@@ -1375,7 +1384,7 @@ The bitwise operators possess this interface:
The bit-wise not (invert) takes only one parameter.
:Return type: symbolic Tensor with corresponding dtype.
:Return type: symbolic tensor with corresponding dtype.
.. function:: and_(a, b)
......@@ -1395,25 +1404,25 @@ The bitwise operators possess this interface:
.. function:: bitwise_and(a, b)
Alias for `and_`. bitwise_and is the numpy name.
Alias for `and_`. ``bitwise_and`` is the NumPy name.
.. function:: bitwise_or(a, b)
Alias for `or_`. bitwise_or is the numpy name.
Alias for `or_`. ``bitwise_or`` is the NumPy name.
.. function:: bitwise_xor(a, b)
Alias for `xor_`. bitwise_xor is the numpy name.
Alias for `xor_`. ``bitwise_xor`` is the NumPy name.
.. function:: bitwise_not(a, b)
Alias for invert. invert is the numpy name.
Alias for `invert`. ``invert`` is the NumPy name.
Here is an example using the bit-wise ``and_`` via the ``&`` operator:
.. testcode:: bitwise
import Aesara.tensor as at
import aesara.tensor as at
x,y = at.imatrices('x','y')
z = x & y
......@@ -1518,7 +1527,7 @@ Mathematical
Returns a variable representing the survival function (1-cdf —
sometimes more accurate).
C code is provided in the Aesara_lgpl repository.
C code is provided in the Theano_lgpl repository.
This makes it faster.
https://github.com/Theano/Theano_lgpl.git
......@@ -1542,7 +1551,7 @@ Linear Algebra
:param Y: right term
:type X: symbolic tensor
:type Y: symbolic tensor
:rtype: `symbolic matrix or vector`
:rtype: symbolic matrix or vector
:return: the inner product of `X` and `Y`.
.. function:: outer(X, Y)
......@@ -1560,7 +1569,7 @@ Linear Algebra
Given two tensors a and b, tensordot computes a generalized dot product over
the provided axes. Aesara's implementation reduces all expressions to
matrix or vector dot products and is based on code from Tijmen Tieleman's
gnumpy (http://www.cs.toronto.edu/~tijmen/gnumpy.html).
`gnumpy` (http://www.cs.toronto.edu/~tijmen/gnumpy.html).
:param a: the first tensor variable
:type a: symbolic tensor
......@@ -1575,7 +1584,7 @@ Linear Algebra
Note that the default value of 2 is not guaranteed to work
for all values of a and b, and an error will be raised if
that is the case. The reason for keeping the default is to
maintain the same signature as numpy's tensordot function
maintain the same signature as NumPy's tensordot function
(and np.tensordot raises analogous errors for non-compatible
inputs).
......@@ -1612,21 +1621,17 @@ Linear Algebra
a = np.random.random((2,3,4))
b = np.random.random((5,6,4,3))
#tensordot
c = np.tensordot(a, b, [[1,2],[3,2]])
#loop replicating tensordot
a0, a1, a2 = a.shape
b0, b1, _, _ = b.shape
cloop = np.zeros((a0,b0,b1))
#loop over non-summed indices -- these exist
#in the tensor product.
# Loop over non-summed indices--these exist in the tensor product
for i in range(a0):
for j in range(b0):
for k in range(b1):
#loop over summed indices -- these don't exist
#in the tensor product.
# Loop over summed indices--these don't exist in the tensor product
for l in range(a1):
for m in range(a2):
cloop[i,j,k] += a[i,l,m] * b[j,k,m,l]
......@@ -1668,9 +1673,7 @@ Linear Algebra
>>> second = at.tensor3('second')
>>> result = batched_dot(first, second)
:note: This is a subset of numpy.einsum, but we do not provide it for now.
But numpy einsum is slower than dot or tensordot:
http://mail.scipy.org/pipermail/numpy-discussion/2012-October/064259.html
:note: This is a subset of `numpy.einsum`, but we do not provide it for now.
:param X: left term
:param Y: right term
......
......@@ -65,8 +65,8 @@ BUT, YOU GOTTA RUN THIS CODE AND MAKE SURE IT STILL WORKS NICELY, HEY?
up_fn, app_fn = build_logistic_regression_model(n_in=10, n_out=3, l2_coef=30.0)
x_data = numpy.random.randn(100, 10)
y_data = numpy.random.randn(100, 3)
x_data = numpy.random.standard_normal((100, 10))
y_data = numpy.random.standard_normal((100, 3))
y_data = _asarray(y_data == numpy.max(y_data, axis=1), dtype='int64')
print("Model Training ...")
......
......@@ -11,31 +11,36 @@ Note that you want SciPy >= 0.7.2
.. warning::
In SciPy 0.6, ``scipy.csc_matrix.dot`` has a bug with singleton
In SciPy 0.6, `scipy.csc_matrix.dot` has a bug with singleton
dimensions. There may be more bugs. It also has inconsistent
implementation of sparse matrices.
We do not test against SciPy versions below 0.7.2.
We describe the details of the compressed sparse matrix types.
``scipy.sparse.csc_matrix``
should be used if there are more rows than column (shape[0] > shape[1]).
``scipy.sparse.csr_matrix``
should be used if there are more columns than rows (shape[0] < shape[1]).
``scipy.sparse.lil_matrix``
`scipy.sparse.csc_matrix`
should be used if there are more rows than columns (``shape[0] > shape[1]``).
`scipy.sparse.csr_matrix`
should be used if there are more columns than rows (``shape[0] < shape[1]``).
`scipy.sparse.lil_matrix`
is faster if we are modifying the array. After initial inserts,
we can then convert to the appropriate sparse matrix format.
The following types also exist:
``dok_matrix``
`dok_matrix`
Dictionary of Keys format. From their doc: This is an efficient structure for constructing sparse matrices incrementally.
``coo_matrix``
`coo_matrix`
Coordinate format. From their lil doc: consider using the COO format when constructing large matrices.
There seems to be a new format planned for scipy 0.7.x:
``bsr_matrix``
Block Compressed Row (BSR). From their doc: The Block Compressed Row (BSR) format is very similar to the Compressed Sparse Row (CSR) format. BSR is appropriate for sparse matrices with dense sub matrices like the last example below. Block matrices often arise in vector-valued finite element discretizations. In such cases, BSR is considerably more efficient than CSR and CSC for many sparse arithmetic operations.
``dia_matrix``
There seems to be a new format planned for SciPy 0.7.x:
`bsr_matrix`
Block Compressed Row (BSR). From their doc: The Block Compressed Row
(BSR) format is very similar to the Compressed Sparse Row (CSR)
format. BSR is appropriate for sparse matrices with dense sub matrices
like the last example below. Block matrices often arise in vector-valued
finite element discretizations. In such cases, BSR is considerably more
efficient than CSR and CSC for many sparse arithmetic operations.
`dia_matrix`
Sparse matrix with DIAgonal storage
There are four member variables that comprise a compressed matrix ``sp`` (for at least csc, csr and bsr):
......@@ -52,9 +57,9 @@ There are four member variables that comprise a compressed matrix ``sp`` (for at
row location.
``sp.indptr``
gives the other location of the non-zero entry. For CSC, there are
as many values of indptr as there are columns + 1 in the matrix.
as many values of indptr as there are ``columns + 1`` in the matrix.
``sp.indptr[k] = x`` and ``indptr[k+1] = y`` means that column
k contains sp.data[x:y], i.e. the xth through the y-1th non-zero values.
``k`` contains ``sp.data[x:y]``, i.e. the ``x``-th through the ``(y-1)``-th non-zero values.
See the example below for details.
......@@ -63,7 +68,7 @@ See the example below for details.
>>> import scipy.sparse
>>> sp = scipy.sparse.csc_matrix((5, 10))
>>> sp[4, 0] = 20
/u/lisa/local/byhost/test_maggie46.iro.umontreal.ca/lib64/python2.5/site-packages/scipy/sparse/compressed.py:494: SparseEfficiencyWarning: changing the sparsity structure of a csc_matrix is expensive. lil_matrix is more efficient.
SparseEfficiencyWarning: changing the sparsity structure of a csc_matrix is expensive. lil_matrix is more efficient.
SparseEfficiencyWarning)
>>> sp[0, 0] = 10
>>> sp[2, 3] = 30
......@@ -91,13 +96,13 @@ Several things should be learned from the above example:
* We actually use the wrong sparse matrix type. In fact, it is the
*rows* that are sparse, not the columns. So, it would have been
better to use ``sp = scipy.sparse.csr_matrix((5, 10))``.
* We should have actually created the matrix as a ``lil_matrix``,
* We should have actually created the matrix as a `lil_matrix`,
which is more efficient for inserts. Afterwards, we should convert
to the appropriate compressed format.
* `sp.indptr[0] = 0` and `sp.indptr[1] = 2`, which means that
column 0 contains sp.data[0:2], i.e. the first two non-zero values.
* `sp.indptr[3] = 2` and `sp.indptr[4] = 3`, which means that column
3 contains sp.data[2:3], i.e. the third non-zero value.
* ``sp.indptr[0] = 0`` and ``sp.indptr[1] = 2``, which means that
column 0 contains ``sp.data[0:2]``, i.e. the first two non-zero values.
* ``sp.indptr[3] = 2`` and ``sp.indptr[4] = 3``, which means that column
three contains ``sp.data[2:3]``, i.e. the third non-zero value.
TODO: Rewrite this documentation to do things in a smarter way.
......@@ -112,7 +117,7 @@ For faster sparse code:
Misc
----
The sparse equivalent of dmatrix is csc_matrix and csr_matrix.
The sparse equivalent of `dmatrix` is `csc_matrix` and `csr_matrix`.
:class:`~aesara.sparse.basic.Dot` vs. :class:`~aesara.sparse.basic.StructuredDot`
---------------------------------------------------------------------------------
......@@ -121,22 +126,22 @@ Often when you use a sparse matrix it is because there is a meaning to the
structure of non-zeros. The gradient on terms outside that structure
has no meaning, so it is computationally efficient not to compute them.
StructuredDot is when you want the gradient to have zeroes corresponding to
`StructuredDot` is when you want the gradient to have zeroes corresponding to
the sparse entries in the matrix.
TrueDot and Structured dot have different gradients
`TrueDot` and `StructuredDot` have different gradients
but their perform functions should be the same.
The gradient of TrueDot can have non-zeros where the sparse matrix had zeros.
The gradient of StructuredDot can't.
The gradient of `TrueDot` can have non-zeros where the sparse matrix had zeros.
The gradient of `StructuredDot` can't.
Suppose you have ``dot(x,w)`` where ``x`` and ``w`` are square matrices.
If ``w`` is dense, like ``randn((5,5))`` and ``x`` is of full rank (though
potentially sparse, like a diagonal matrix of 1s) then the output will
be dense too. (But i guess the density of the output is a red herring.)
If ``w`` is dense, like ``standard_normal((5,5))`` and ``x`` is of full rank (though
potentially sparse, like a diagonal matrix of ones) then the output will
be dense too.
What's important is the density of the gradient on the output.
If the gradient on the output is dense, and ``w`` is dense (as we said it was)
then the True gradient on ``x`` will be dense.
If our dot is a TrueDot, then it will say that the gradient on ``x`` is dense.
If our dot is a StructuredDot, then it will say the gradient on ``x`` is only
then the true gradient on ``x`` will be dense.
If our dot is a `TrueDot`, then it will say that the gradient on ``x`` is dense.
If our dot is a `StructuredDot`, then it will say the gradient on ``x`` is only
defined on the diagonal and ignore the gradients on the off-diagonal.
......@@ -55,16 +55,15 @@ Running the code above we see:
Arguably the most useful information is approximately half-way through
the error message, where the kind of error is displayed along with its
cause (`ValueError: Input dimension mismatch. (input[0].shape[0] = 3,
input[1].shape[0] = 2`).
Below it, some other information is given, such as the apply node that
cause (e.g. ``ValueError: Input dimension mismatch. (input[0].shape[0] = 3, input[1].shape[0] = 2``).
Below it, some other information is given, such as the `Apply` node that
caused the error, as well as the input types, shapes, strides and
scalar values.
The two hints can also be helpful when debugging. Using the aesara flag
The two hints can also be helpful when debugging. Using the Aesara flag
``optimizer=fast_compile`` or ``optimizer=None`` can often tell you
the faulty line, while ``exception_verbosity=high`` will display a
debugprint of the apply node. Using these hints, the end of the error
debug print of the apply node. Using these hints, the end of the error
message becomes:
.. code-block:: none
......@@ -90,10 +89,10 @@ Using Test Values
-----------------
As of v.0.4.0, Aesara has a new mechanism by which graphs are executed
on-the-fly, before a ``aesara.function`` is ever compiled. Since optimizations
on-the-fly, before a :func:`aesara.function` is ever compiled. Since optimizations
haven't been applied at this stage, it is easier for the user to locate the
source of some bug. This functionality is enabled through the config flag
``aesara.config.compute_test_value``. Its use is best shown through the
`aesara.config.compute_test_value`. Its use is best shown through the
following example. Here, we use ``exception_verbosity=high`` and
``optimizer=fast_compile``, which would not tell you the line at fault.
``optimizer=None`` would and it could therefore be used instead of test values.
......@@ -101,7 +100,7 @@ following example. Here, we use ``exception_verbosity=high`` and
.. testcode:: testvalue
import numpy
import numpy as np
import aesara
import aesara.tensor as at
......@@ -109,15 +108,15 @@ following example. Here, we use ``exception_verbosity=high`` and
aesara.config.compute_test_value = 'off' # Use 'warn' to activate this feature
# configure shared variables
W1val = numpy.random.rand(2, 10, 10).astype(aesara.config.floatX)
W1val = np.random.random((2, 10, 10)).astype(aesara.config.floatX)
W1 = aesara.shared(W1val, 'W1')
W2val = numpy.random.rand(15, 20).astype(aesara.config.floatX)
W2val = np.random.random((15, 20)).astype(aesara.config.floatX)
W2 = aesara.shared(W2val, 'W2')
# input which will be of shape (5,10)
x = at.matrix('x')
# provide Aesara with a default test-value
#x.tag.test_value = numpy.random.rand(5, 10)
#x.tag.test_value = np.random.random((5, 10))
# transform the shared variable in some way. Aesara does not
# know off hand that the matrix func_of_W1 has shape (20, 10)
......@@ -131,7 +130,7 @@ following example. Here, we use ``exception_verbosity=high`` and
# compile and call the actual function
f = aesara.function([x], h2)
f(numpy.random.rand(5, 10))
f(np.random.random((5, 10)))
Running the above code generates the following error message:
......@@ -139,7 +138,7 @@ Running the above code generates the following error message:
Traceback (most recent call last):
File "test1.py", line 31, in <module>
f(numpy.random.rand(5, 10))
f(np.random.random((5, 10)))
File "PATH_TO_AESARA/aesara/compile/function/types.py", line 605, in __call__
self.fn.thunks[self.fn.position_of_error])
File "PATH_TO_AESARA/aesara/compile/function/types.py", line 595, in __call__
......@@ -171,10 +170,10 @@ so slightly, we can get Aesara to reveal the exact source of the error.
...
# input which will be of shape (5, 10)
# Input which will have the shape (5, 10)
x = at.matrix('x')
# provide Aesara with a default test-value
x.tag.test_value = numpy.random.rand(5, 10)
# Provide Aesara with a default test-value
x.tag.test_value = np.random.random((5, 10))
In the above, we are tagging the symbolic matrix *x* with a special test
value. This allows Aesara to evaluate symbolic expressions on-the-fly (by
......@@ -195,7 +194,7 @@ following error message, which properly identifies *line 24* as the culprit.
File "PATH_TO_AESARA/aesara/graph/op.py", line 752, in rval
r = p(n, [x[0] for x in i], o)
File "PATH_TO_AESARA/aesara/tensor/basic.py", line 4554, in perform
z[0] = numpy.asarray(numpy.dot(x, y))
z[0] = np.asarray(np.dot(x, y))
ValueError: matrices are not aligned
The ``compute_test_value`` mechanism works as follows:
......@@ -254,11 +253,11 @@ Running the code above returns the following output:
"How do I Print an Intermediate Value in a Function?"
-----------------------------------------------------
Aesara provides a 'Print' op to do this.
Aesara provides a :class:`Print`\ :class:`Op` to do this.
.. testcode::
import numpy
import numpy as np
import aesara
x = aesara.tensor.dvector('x')
......@@ -268,11 +267,11 @@ Aesara provides a 'Print' op to do this.
f = aesara.function([x], x * 5)
f_with_print = aesara.function([x], x_printed * 5)
#this runs the graph without any printing
assert numpy.all( f([1, 2, 3]) == [5, 10, 15])
# This runs the graph without any printing
assert np.array_equal(f([1, 2, 3]), [5, 10, 15])
#this runs the graph with the message, and value printed
assert numpy.all( f_with_print([1, 2, 3]) == [5, 10, 15])
# This runs the graph with the message, and value printed
assert np.array_equal(f_with_print([1, 2, 3]), [5, 10, 15])
.. testoutput::
......@@ -361,17 +360,16 @@ shows how to print all inputs and outputs:
0 Elemwise{mul,no_inplace}(TensorConstant{5.0}, x) input(s) value(s): [array(5.0), array(3.0)] output(s) value(s): [array(15.0)]
When using these ``inspect_inputs`` and ``inspect_outputs`` functions
with ``MonitorMode``, you should see [potentially a lot of] printed output.
Every ``Apply`` node will be printed out,
along with its position in the graph, the arguments to the functions ``perform`` or
``c_code`` and the output it computed.
Admittedly, this may be a huge amount of
output to read through if you are using big tensors... but you can choose to
add logic that would, for instance, print
with ``MonitorMode``, you should see (potentially a lot of) printed output.
Every ``Apply`` node will be printed out, along with its position in the graph,
the arguments to the functions ``perform`` or ``c_code`` and the output it
computed.
Admittedly, this may be a huge amount of output to read through if you are using
large tensors, but you can choose to add logic that would, for instance, print
something out only if a certain kind of op were used, at a certain program
position, or only if a particular value showed up in one of the inputs or outputs.
A typical example is to detect when NaN values are added into computations, which
can be achieved as follows:
position, or only if a particular value showed up in one of the inputs or
outputs. A typical example is to detect when NaN values are added into
computations, which can be achieved as follows:
.. testcode:: compiled
......@@ -382,13 +380,13 @@ can be achieved as follows:
# This is the current suggested detect_nan implementation to
# show you how it work. That way, you can modify it for your
# need. If you want exactly this method, you can use
# ``aesara.compile.monitormode.detect_nan`` that will always
# `aesara.compile.monitormode.detect_nan` that will always
# contain the current suggested version.
def detect_nan(fgraph, i, node, fn):
for output in fn.outputs:
if (not isinstance(output[0], numpy.random.RandomState) and
numpy.isnan(output[0]).any()):
if (isinstance(output[0], np.ndarray) and
np.isnan(output[0]).any()):
print('*** NaN detected ***')
aesara.printing.debugprint(node)
print('Inputs : %s' % [input[0] for input in fn.inputs])
......@@ -396,9 +394,11 @@ can be achieved as follows:
break
x = aesara.tensor.dscalar('x')
f = aesara.function([x], [aesara.tensor.log(x) * x],
mode=aesara.compile.MonitorMode(
post_func=detect_nan))
f = aesara.function(
[x], [aesara.tensor.log(x) * x],
mode=aesara.compile.MonitorMode(
post_func=detect_nan)
)
f(0) # log(0) * 0 = -inf * 0 = NaN
.. testoutput:: compiled
......@@ -458,12 +458,12 @@ Intermediate results don't necessarily have a clear name and you can get
exceptions which are hard to decipher, due to the "compiled" nature of the
functions.
Consider this example script ("ex.py"):
Consider this example script (``ex.py``):
.. testcode::
import numpy as np
import aesara
import numpy
import aesara.tensor as at
a = at.dmatrix('a')
......@@ -471,9 +471,9 @@ Consider this example script ("ex.py"):
f = aesara.function([a, b], [a * b])
# matrices chosen so dimensions are unsuitable for multiplication
mat1 = numpy.arange(12).reshape((3, 4))
mat2 = numpy.arange(25).reshape((5, 5))
# Matrices chosen so dimensions are unsuitable for multiplication
mat1 = np.arange(12).reshape((3, 4))
mat2 = np.arange(25).reshape((5, 5))
f(mat1, mat2)
......@@ -514,18 +514,18 @@ illustrative purposes. As the matrices can't be multiplied element-wise
The call stack contains some useful information to trace back the source
of the error. There's the script where the compiled function was called --
but if you're using (improperly parameterized) prebuilt modules, the error
might originate from ops in these modules, not this script. The last line
tells us about the op that caused the exception. In this case it's a "mul"
might originate from `Op`\s in these modules, not this script. The last line
tells us about the `Op` that caused the exception. In this case it's a "mul"
involving variables with names "a" and "b". But suppose we instead had an
intermediate result to which we hadn't given a name.
After learning a few things about the graph structure in Aesara, we can use
the Python debugger to explore the graph, and then we can get runtime
information about the error. Matrix dimensions, especially, are useful to
pinpoint the source of the error. In the printout, there are also 2 of the 4
dimensions of the matrices involved, but for the sake of example say we'd
need the other dimensions to pinpoint the error. First, we re-launch with
the debugger module and run the program with "c":
pinpoint the source of the error. In the printout, there are also two of the
four dimensions of the matrices involved, but for the sake of example say we'd
need the other dimensions to pinpoint the error. First, we re-launch with the
debugger module and run the program with "c":
.. code-block:: text
......@@ -537,22 +537,22 @@ the debugger module and run the program with "c":
Then we get back the above error printout, but the interpreter breaks in
that state. Useful commands here are
* "up" and "down" (to move up and down the call stack),
* "l" (to print code around the line in the current stack position),
* "p variable_name" (to print the string representation of 'variable_name'),
* "p dir(object_name)", using the Python dir() function to print the list of an object's members
* ``up`` and ``down`` (to move up and down the call stack),
* ``l`` (to print code around the line in the current stack position),
* ``p variable_name`` (to print the string representation of ``variable_name``),
* ``p dir(object_name)``, using the Python :func:`dir` function to print the list of an object's members
Here, for example, I do "up", and a simple "l" shows me there's a local
variable "node". This is the "node" from the computation graph, so by
following the "node.inputs", "node.owner" and "node.outputs" links I can
Here, for example, I do ``up``, and a simple ``l`` shows me there's a local
variable ``node``. This is the ``node`` from the computation graph, so by
following the ``node.inputs``, ``node.owner`` and ``node.outputs`` links I can
explore around the graph.
That graph is purely symbolic (no data, just symbols to manipulate it
abstractly). To get information about the actual parameters, you explore the
"thunk" objects, which bind the storage for the inputs (and outputs) with
the function itself (a "thunk" is a concept related to closures). Here, to
get the current node's first input's shape, you'd therefore do "p
thunk.inputs[0][0].shape", which prints out "(3, 4)".
"thunk" objects, which bind the storage for the inputs (and outputs) with the
function itself (a "thunk" is a concept related to closures). Here, to get the
current node's first input's shape, you'd therefore do
``p thunk.inputs[0][0].shape``, which prints out ``(3, 4)``.
.. _faq_dump_fct:
......@@ -562,14 +562,13 @@ Dumping a Function to help debug
If you are reading this, there is a high chance that you emailed our
mailing list and we asked you to read this section. This section
explains how to dump all the parameters passed to
``aesara.function()``. This is useful to help us reproduce a problem
:func:`aesara.function`. This is useful to help us reproduce a problem
during compilation and it doesn't require you to make a self-contained
example.
For this to work, we need to be able to import the code for all Op in
the graph. So if you create your own Op, we will need this
code. Otherwise, we won't be able to unpickle it. We already have all
the Ops from Aesara and Pylearn2.
For this to work, we need to be able to import the code for all `Op`\s in
the graph. So if you create your own `Op`, we will need this
code; otherwise, we won't be able to unpickle it.
.. code-block:: python
......@@ -577,9 +576,9 @@ the Ops from Aesara and Pylearn2.
aesara.function(...)
# with
aesara.function_dump(filename, ...)
# Where filename is a string to a file that we will write to.
# Where `filename` is a string to a file that we will write to.
Then send us filename.
Then send us ``filename``.
Breakpoint during Aesara function execution
......
......@@ -27,7 +27,7 @@ the logistic curve, which is given by:
.. figure:: logistic.png
A plot of the logistic function, with x on the x-axis and s(x) on the
A plot of the logistic function, with :math:`x` on the x-axis and :math:`s(x)` on the
y-axis.
You want to compute the function :ref:`element-wise
......@@ -49,9 +49,9 @@ Well, what you do is this:
array([[ 0.5 , 0.73105858],
[ 0.26894142, 0.11920292]])
The reason logistic is performed elementwise is because all of its
operations---division, addition, exponentiation, and division---are
themselves elementwise operations.
The reason the logistic is applied element-wise is because all of its
operations--negation, exponentiation, addition, and division--are
themselves element-wise operations.
It is also the case that:
......@@ -76,7 +76,7 @@ Computing More than one Thing at the Same Time
Aesara supports functions with multiple outputs. For example, we can
compute the :ref:`element-wise <libdoc_tensor_elemwise>` difference, absolute difference, and
squared difference between two matrices *a* and *b* at the same time:
squared difference between two matrices ``a`` and ``b`` at the same time:
.. If you modify this code, also change :
.. tests/test_tutorial.py:T_examples.test_examples_3
......@@ -92,7 +92,7 @@ squared difference between two matrices *a* and *b* at the same time:
shortcut for allocating symbolic variables that we will often use in the
tutorials.
When we use the function f, it returns the three variables (the printing
When we use the function ``f``, it returns the three variables (the printing
was reformatted for readability):
>>> f([[1, 1], [1, 1]], [[0, 1], [2, 3]])
......@@ -124,12 +124,12 @@ array(35.0)
This makes use of the :ref:`In <function_inputs>` class which allows
you to specify properties of your function's parameters with greater detail. Here we
give a default value of 1 for *y* by creating a ``In`` instance with
its ``value`` field set to 1.
give a default value of ``1`` for ``y`` by creating an :class:`In` instance with
its ``value`` field set to ``1``.
Inputs with default values must follow inputs without default
values (like Python's functions). There can be multiple inputs with default values. These parameters can
be set positionally or by name, as in standard Python:
Inputs with default values must follow inputs without default values (like
Python's functions). There can be multiple inputs with default values. These
parameters can be set positionally or by name, as in standard Python:
.. If you modify this code, also change :
......@@ -150,9 +150,9 @@ array(34.0)
array(33.0)
.. note::
``In`` does not know the name of the local variables *y* and *w*
`In` does not know the name of the local variables ``y`` and ``w``
that are passed as arguments. The symbolic variable objects have name
attributes (set by ``dscalars`` in the example above) and *these* are the
attributes (set by `dscalars` in the example above) and *these* are the
names of the keyword parameters in the functions that we build. This is
the mechanism at work in ``In(y, value=1)``. In the case of ``In(w,
value=2, name='w_by_name')``, we override the symbolic variable's name
......@@ -169,11 +169,11 @@ Using Shared Variables
It is also possible to make a function with an internal state. For
example, let's say we want to make an accumulator: at the beginning,
the state is initialized to zero. Then, on each function call, the state
the state is initialized to zero, then, on each function call, the state
is incremented by the function's argument.
First let's define the *accumulator* function. It adds its argument to the
internal state, and returns the old state value.
internal state and returns the old state value.
.. If you modify this code, also change :
.. tests/test_tutorial.py:T_examples.test_examples_8
......@@ -187,17 +187,17 @@ This code introduces a few new concepts. The ``shared`` function constructs
so-called :ref:`shared variables<libdoc_compile_shared>`.
These are hybrid symbolic and non-symbolic variables whose value may be shared
between multiple functions. Shared variables can be used in symbolic expressions just like
the objects returned by ``dmatrices(...)`` but they also have an internal
the objects returned by `dmatrices` but they also have an internal
value that defines the value taken by this symbolic variable in *all* the
functions that use it. It is called a *shared* variable because its value is
shared between many functions. The value can be accessed and modified by the
``.get_value()`` and ``.set_value()`` methods. We will come back to this soon.
:meth:`get_value` and :meth:`set_value` methods. We will come back to this soon.
The other new thing in this code is the ``updates`` parameter of ``function``.
The other new thing in this code is the ``updates`` parameter of :func:`aesara.function`.
``updates`` must be supplied with a list of pairs of the form (shared-variable, new expression).
It can also be a dictionary whose keys are shared-variables and values are
the new expressions. Either way, it means "whenever this function runs, it
will replace the ``.value`` of each shared variable with the result of the
will replace the :attr:`value` of each shared variable with the result of the
corresponding expression". Above, our accumulator replaces the ``state``'s value with the sum
of the state and the increment amount.
......@@ -246,7 +246,7 @@ updates).
It may happen that you expressed some formula using a shared variable, but
you do *not* want to use its value. In this case, you can use the
``givens`` parameter of ``function`` which replaces a particular node in a graph
``givens`` parameter of :func:`aesara.function` which replaces a particular node in a graph
for the purpose of one particular function.
.. If you modify this code, also change :
......@@ -274,18 +274,26 @@ expression that evaluates to a tensor of same shape and dtype.
.. note::
Aesara shared variable broadcast pattern default to False for each
Aesara shared variable broadcast patterns default to ``False`` for each
dimension. Shared variable sizes can change over time, so we can't
use the shape to find the broadcastable pattern. If you want a
different pattern, just pass it as a parameter
``aesara.shared(..., shape=(True, False))``
``aesara.shared(..., broadcastable=(True, False))``
.. note::
Use the ``shape`` parameter to specify tuples of static shapes instead;
the old broadcastable values are being phased-out. Unknown shape values
for dimensions take the value ``None``; otherwise, integers are used for
known static shape values.
For example, ``aesara.shared(..., shape=(1, None))``.
Copying functions
=================
Aesara functions can be copied, which can be useful for creating similar
functions but with different shared variables or updates. This is done using
the :func:`copy()<aesara.compile.function.types.Function.copy>` method of ``function`` objects. The optimized graph of the original function is copied,
so compilation only needs to be performed once.
the :func:`aesara.compile.function.types.Function.copy` method of :class:`Function` objects.
The optimized graph of the original function is copied, so compilation only
needs to be performed once.
Let's start from the accumulator defined above:
......@@ -302,7 +310,7 @@ array(0)
>>> print(state.get_value())
10
We can use ``copy()`` to create a similar accumulator but with its own internal state
We can use :meth:`copy` to create a similar accumulator but with its own internal state
using the ``swap`` parameter, which is a dictionary of shared variables to exchange:
>>> new_state = aesara.shared(0)
......@@ -361,11 +369,13 @@ Here's a brief example. The setup code is:
from aesara.tensor.random.utils import RandomStream
from aesara import function
srng = RandomStream(seed=234)
rv_u = srng.uniform(0, 1, size=(2,2))
rv_n = srng.normal(0, 1, size=(2,2))
f = function([], rv_u)
g = function([], rv_n, no_default_updates=True) #Not updating rv_n.rng
g = function([], rv_n, no_default_updates=True)
nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)
Here, ``rv_u`` represents a random stream of 2x2 matrices of draws from a uniform
......@@ -383,16 +393,16 @@ so we get different random numbers every time.
>>> f_val1 = f() #different numbers from f_val0
When we add the extra argument ``no_default_updates=True`` to
``function`` (as in *g*), then the random number generator state is
``function`` (as in ``g``), then the random number generator state is
not affected by calling the returned function. So, for example, calling
*g* multiple times will return the same numbers.
``g`` multiple times will return the same numbers.
>>> g_val0 = g() # different numbers from f_val0 and f_val1
>>> g_val1 = g() # same numbers as g_val0!
An important remark is that a random variable is drawn at most once during any
single function execution. So the *nearly_zeros* function is guaranteed to
return approximately 0 (except for rounding error) even though the *rv_u*
single function execution. So the `nearly_zeros` function is guaranteed to
return approximately 0 (except for rounding error) even though the ``rv_u``
random variable appears three times in the output expression.
>>> nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)
......@@ -400,17 +410,8 @@ random variable appears three times in the output expression.
Seeding Streams
---------------
Random variables can be seeded individually or collectively.
You can seed just one random variable by seeding or assigning to the
``.rng`` attribute, using ``.rng.set_value()``.
>>> rng_val = rv_u.rng.get_value(borrow=True) # Get the rng for rv_u
>>> rng_val.seed(89234) # seeds the generator
>>> rv_u.rng.set_value(rng_val, borrow=True) # Assign back seeded rng
You can also seed *all* of the random variables allocated by a :class:`RandomStream`
object by that object's ``seed`` method. This seed will be used to seed a
You can seed all of the random variables allocated by a :class:`RandomStream`
object by that object's :meth:`RandomStream.seed` method. This seed will be used to seed a
temporary random number generator, that will in turn generate seeds for each
of the random variables.
......@@ -420,28 +421,15 @@ Sharing Streams Between Functions
---------------------------------
As usual for shared variables, the random number generators used for random
variables are common between functions. So our *nearly_zeros* function will
update the state of the generators used in function *f* above.
For example:
>>> state_after_v0 = rv_u.rng.get_value().get_state()
>>> nearly_zeros() # this affects rv_u's generator
array([[ 0., 0.],
[ 0., 0.]])
>>> v1 = f()
>>> rng = rv_u.rng.get_value(borrow=True)
>>> rng.set_state(state_after_v0)
>>> rv_u.rng.set_value(rng, borrow=True)
>>> v2 = f() # v2 != v1
>>> v3 = f() # v3 == v1
variables are common between functions. So our ``nearly_zeros`` function will
update the state of the generators used in function ``f`` above.
Copying Random State Between Aesara Graphs
------------------------------------------
In some use cases, a user might want to transfer the "state" of all random
number generators associated with a given aesara graph (e.g. g1, with compiled
function f1 below) to a second graph (e.g. g2, with function f2). This might
number generators associated with a given Aesara graph (e.g. ``g1``, with compiled
function ``f1`` below) to a second graph (e.g. ``g2``, with function ``f2``). This might
arise for example if you are trying to initialize the state of a model, from
the parameters of a pickled version of a previous model. For
:class:`aesara.tensor.random.utils.RandomStream` and
......@@ -449,50 +437,10 @@ the parameters of a pickled version of a previous model. For
this can be achieved by copying elements of the `state_updates` parameter.
Each time a random variable is drawn from a `RandomStream` object, a tuple is
added to the `state_updates` list. The first element is a shared variable,
added to its :attr:`state_updates` list. The first element is a shared variable,
which represents the state of the random number generator associated with this
*particular* variable, while the second represents the aesara graph
corresponding to the random number generation process (i.e. RandomFunction{uniform}.0).
An example of how "random states" can be transferred from one aesara function
to another is shown below.
>>> import aesara
>>> import numpy
>>> import aesara.tensor as at
>>> from aesara.sandbox.rng_mrg import MRG_RandomStream
>>> from aesara.tensor.random.utils import RandomStream
>>> class Graph():
... def __init__(self, seed=123):
... self.rng = RandomStream(seed)
... self.y = self.rng.uniform(size=(1,))
>>> g1 = Graph(seed=123)
>>> f1 = aesara.function([], g1.y)
>>> g2 = Graph(seed=987)
>>> f2 = aesara.function([], g2.y)
>>> # By default, the two functions are out of sync.
>>> f1()
array([ 0.72803009])
>>> f2()
array([ 0.55056769])
>>> def copy_random_state(g1, g2):
... if isinstance(g1.rng, MRG_RandomStream):
... g2.rng.rstate = g1.rng.rstate
... for (su1, su2) in zip(g1.rng.state_updates, g2.rng.state_updates):
... su2[0].set_value(su1[0].get_value())
>>> # We now copy the state of the aesara random number generators.
>>> copy_random_state(g1, g2)
>>> f1()
array([ 0.59044123])
>>> f2()
array([ 0.59044123])
*particular* variable, while the second represents the Aesara graph
corresponding to the random number generation process.
Other Random Distributions
--------------------------
......@@ -511,16 +459,18 @@ It will be used repeatedly.
.. testcode::
import numpy
import numpy as np
import aesara
import aesara.tensor as at
rng = numpy.random
rng = np.random.default_rng(2882)
N = 400 # training sample size
feats = 784 # number of input variables
# generate a dataset: D = (input_values, target_class)
D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2))
D = (rng.standard_normal((N, feats)), rng.integers(size=N, low=0, high=2))
training_steps = 10000
# Declare Aesara symbolic variables
......@@ -532,7 +482,7 @@ It will be used repeatedly.
# this and the following bias variable b
# are shared so they keep their values
# between training iterations (updates)
w = aesara.shared(rng.randn(feats), name="w")
w = aesara.shared(rng.standard_normal(feats), name="w")
# initialize the bias term
b = aesara.shared(0., name="b")
......@@ -542,7 +492,7 @@ It will be used repeatedly.
print(b.get_value())
# Construct Aesara expression graph
p_1 = 1 / (1 + at.exp(-at.dot(x, w) - b)) # Probability that target = 1
p_1 = 1 / (1 + at.exp(-at.dot(x, w) - b)) # Probability that target = 1
prediction = p_1 > 0.5 # The prediction thresholded
xent = -y * at.log(p_1) - (1-y) * at.log(1-p_1) # Cross-entropy loss function
cost = xent.mean() + 0.01 * (w ** 2).sum() # The cost to minimize
......
......@@ -9,11 +9,11 @@ Configuration Settings and Compiling Modes
Configuration
=============
The ``config`` module contains several *attributes* that modify Aesara's behavior. Many of these
attributes are examined during the import of the ``aesara`` module and several are assumed to be
The :mod:`aesara.config` module contains several *attributes* that modify Aesara's behavior. Many of these
attributes are examined during the import of the :mod:`aesara` module and several are assumed to be
read-only.
*As a rule, the attributes in the* ``config`` *module should not be modified inside the user code.*
*As a rule, the attributes in the* :mod:`aesara.config` *module should not be modified inside the user code.*
Aesara's code comes with default values for these attributes, but you can
override them from your ``.aesararc`` file, and override those values in turn by
......@@ -21,12 +21,12 @@ the :envvar:`AESARA_FLAGS` environment variable.
The order of precedence is:
1. an assignment to aesara.config.<property>
1. an assignment to ``aesara.config.<property>``
2. an assignment in :envvar:`AESARA_FLAGS`
3. an assignment in the .aesararc file (or the file indicated in :envvar:`AESARARC`)
3. an assignment in the ``.aesararc`` file (or the file indicated in :envvar:`AESARARC`)
You can display the current/effective configuration at any time by printing
aesara.config. For example, to see a list of all active configuration
`aesara.config`. For example, to see a list of all active configuration
variables, type this from the command-line:
.. code-block:: bash
......@@ -45,22 +45,24 @@ Consider the logistic regression:
.. testcode::
import numpy
import numpy as np
import aesara
import aesara.tensor as at
rng = numpy.random
rng = np.random.default_rng(2498)
N = 400
feats = 784
D = (rng.randn(N, feats).astype(aesara.config.floatX),
rng.randint(size=N,low=0, high=2).astype(aesara.config.floatX))
D = (rng.standard_normal((N, feats)).astype(aesara.config.floatX),
rng.integers(size=N,low=0, high=2).astype(aesara.config.floatX))
training_steps = 10000
# Declare Aesara symbolic variables
x = at.matrix("x")
y = at.vector("y")
w = aesara.shared(rng.randn(feats).astype(aesara.config.floatX), name="w")
b = aesara.shared(numpy.asarray(0., dtype=aesara.config.floatX), name="b")
w = aesara.shared(rng.standard_normal(feats).astype(aesara.config.floatX), name="w")
b = aesara.shared(np.asarray(0., dtype=aesara.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
......@@ -73,15 +75,18 @@ Consider the logistic regression:
# Compile expressions to functions
train = aesara.function(
inputs=[x,y],
outputs=[prediction, xent],
updates=[(w, w-0.01*gw), (b, b-0.01*gb)],
name = "train")
predict = aesara.function(inputs=[x], outputs=prediction,
name = "predict")
if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for x in
train.maker.fgraph.toposort()]):
inputs=[x,y],
outputs=[prediction, xent],
updates=[(w, w-0.01*gw), (b, b-0.01*gb)],
name = "train"
)
predict = aesara.function(
inputs=[x], outputs=prediction,
name = "predict"
)
if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm']
for x in train.maker.fgraph.toposort()]):
print('Used the cpu')
else:
print('ERROR, not able to tell if aesara used the cpu or another device')
......@@ -106,7 +111,7 @@ Consider the logistic regression:
prediction on D
...
Modify and execute this example to run on CPU (the default) with floatX=float32 and
Modify and execute this example to run on CPU (the default) with ``floatX=float32`` and
time the execution using the command line ``time python file.py``. Save your code
as it will be useful later on.
......@@ -114,10 +119,10 @@ as it will be useful later on.
* Apply the Aesara flag ``floatX=float32`` (through ``aesara.config.floatX``) in your code.
* Cast inputs before storing them into a shared variable.
* Circumvent the automatic cast of *int32* with *float32* to *float64*:
* Circumvent the automatic cast of int32 with float32 to float64:
* Insert manual cast in your code or use *[u]int{8,16}*.
* Insert manual cast around the mean operator (this involves division by length, which is an *int64*).
* Insert manual cast in your code or use [u]int{8,16}.
* Insert manual cast around the mean operator (this involves division by length, which is an int64).
* Note that a new casting mechanism is being developed.
:download:`Solution<modes_solution_1.py>`
......@@ -156,7 +161,7 @@ short name Full constructor
.. Note::
For debugging purpose, there also exists a ``MonitorMode`` (which has no
For debugging purpose, there also exists a :class:`MonitorMode` (which has no
short name). It can be used to step through the execution of a function:
see :ref:`the debugging FAQ<faq_monitormode>` for details.
......@@ -165,8 +170,8 @@ Linkers
=======
A mode is composed of 2 things: an optimizer and a linker. Some modes,
like ``NanGuardMode`` and ``DebugMode``, add logic around the
optimizer and linker. ``DebugMode`` uses its own linker.
like `NanGuardMode` and `DebugMode`, add logic around the
optimizer and linker. `DebugMode` uses its own linker.
You can select which linker to use with the Aesara flag :attr:`config.linker`.
Here is a table to compare the different linkers.
......@@ -233,8 +238,8 @@ Using DebugMode
While normally you should use the ``FAST_RUN`` or ``FAST_COMPILE`` mode,
it is useful at first (especially when you are defining new kinds of
expressions or new optimizations) to run your code using the DebugMode
(available via ``mode='DebugMode'``). The DebugMode is designed to
expressions or new optimizations) to run your code using the `DebugMode`
(available via ``mode='DebugMode'``). The `DebugMode` is designed to
run several self-checks and assertions that can help diagnose
possible programming errors leading to incorrect output. Note that
``DebugMode`` is much slower than ``FAST_RUN`` or ``FAST_COMPILE`` so
......@@ -245,7 +250,7 @@ cluster!).
.. If you modify this code, also change :
.. tests/test_tutorial.py:T_modes.test_modes_1
DebugMode is used as follows:
`DebugMode` is used as follows:
.. testcode::
......@@ -258,21 +263,21 @@ DebugMode is used as follows:
f([7])
If any problem is detected, DebugMode will raise an exception according to
what went wrong, either at call time (*f(5)*) or compile time (
If any problem is detected, `DebugMode` will raise an exception according to
what went wrong, either at call time (e.g. ``f(5)``) or compile time (
``f = aesara.function([x], 10 * x, mode='DebugMode')``). These exceptions
should *not* be ignored; talk to your local Aesara guru or email the
users list if you cannot make the exception go away.
Some kinds of errors can only be detected for certain input value combinations.
In the example above, there is no way to guarantee that a future call to, say
*f(-1)*, won't cause a problem. DebugMode is not a silver bullet.
``f(-1)``, won't cause a problem. `DebugMode` is not a silver bullet.
.. TODO: repair the following link
If you instantiate DebugMode using the constructor (see :class:`DebugMode`)
rather than the keyword ``DebugMode`` you can configure its behaviour via
constructor arguments. The keyword version of DebugMode (which you get by using ``mode='DebugMode'``)
If you instantiate `DebugMode` using the constructor (see :class:`DebugMode`)
rather than the keyword `DebugMode` you can configure its behaviour via
constructor arguments. The keyword version of `DebugMode` (which you get by using ``mode='DebugMode'``)
is quite strict.
For more detail, see :ref:`DebugMode<debugmode>` in the library.
......@@ -2,59 +2,62 @@
# Aesara tutorial
# Solution to Exercise in section 'Configuration Settings and Compiling Modes'
import numpy as np
import aesara
import aesara.tensor as at
aesara.config.floatX = 'float32'
rng = np.random
aesara.config.floatX = "float32"
rng = np.random.default_rng(428)
N = 400
feats = 784
D = (rng.randn(N, feats).astype(aesara.config.floatX),
rng.randint(size=N, low=0, high=2).astype(aesara.config.floatX))
D = (
rng.standard_normal((N, feats)).astype(aesara.config.floatX),
rng.integers(size=N, low=0, high=2).astype(aesara.config.floatX),
)
training_steps = 10000
# Declare Aesara symbolic variables
x = at.matrix("x")
y = at.vector("y")
w = aesara.shared(rng.randn(feats).astype(aesara.config.floatX), name="w")
b = aesara.shared(np.asarray(0., dtype=aesara.config.floatX), name="b")
w = aesara.shared(rng.standard_normal(feats).astype(aesara.config.floatX), name="w")
b = aesara.shared(np.asarray(0.0, dtype=aesara.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
#print "Initial model:"
#print w.get_value(), b.get_value()
# print "Initial model:"
# print w.get_value(), b.get_value()
# Construct Aesara expression graph
p_1 = 1 / (1 + at.exp(-at.dot(x, w) - b)) # Probability of having a one
prediction = p_1 > 0.5 # The prediction that is done: 0 or 1
xent = -y * at.log(p_1) - (1 - y) * at.log(1 - p_1) # Cross-entropy
cost = at.cast(xent.mean(), 'float32') + \
0.01 * (w ** 2).sum() # The cost to optimize
cost = at.cast(xent.mean(), "float32") + 0.01 * (w**2).sum() # The cost to optimize
gw, gb = at.grad(cost, [w, b])
# Compile expressions to functions
train = aesara.function(
inputs=[x, y],
outputs=[prediction, xent],
updates={w: w - 0.01 * gw, b: b - 0.01 * gb},
name="train")
predict = aesara.function(inputs=[x], outputs=prediction,
name="predict")
if any(x.op.__class__.__name__ in ('Gemv', 'CGemv', 'Gemm', 'CGemm') for x in
train.maker.fgraph.toposort()):
print('Used the cpu')
inputs=[x, y],
outputs=[prediction, xent],
updates={w: w - 0.01 * gw, b: b - 0.01 * gb},
name="train",
)
predict = aesara.function(inputs=[x], outputs=prediction, name="predict")
if any(
x.op.__class__.__name__ in ("Gemv", "CGemv", "Gemm", "CGemm")
for x in train.maker.fgraph.toposort()
):
print("Used the cpu")
else:
print('ERROR, not able to tell if aesara used the cpu or another device')
print("ERROR, not able to tell if aesara used the cpu or another device")
print(train.maker.fgraph.toposort())
for i in range(training_steps):
pred, err = train(D[0], D[1])
#print "Final model:"
#print w.get_value(), b.get_value()
# print "Final model:"
# print w.get_value(), b.get_value()
print("target values for D")
print(D[1])
......
......@@ -25,20 +25,20 @@ that creates an image of the function. You can read about them in
Consider again the logistic regression example:
>>> import numpy
>>> import numpy as np
>>> import aesara
>>> import aesara.tensor as at
>>> rng = numpy.random
>>> rng = np.random.default_rng(2382)
>>> # Training data
>>> N = 400
>>> feats = 784
>>> D = (rng.randn(N, feats).astype(aesara.config.floatX), rng.randint(size=N,low=0, high=2).astype(aesara.config.floatX))
>>> D = (rng.standard_normal((N, feats)).astype(aesara.config.floatX), rng.integers(size=N, low=0, high=2).astype(aesara.config.floatX))
>>> training_steps = 10000
>>> # Declare Aesara symbolic variables
>>> x = at.matrix("x")
>>> y = at.vector("y")
>>> w = aesara.shared(rng.randn(feats).astype(aesara.config.floatX), name="w")
>>> b = aesara.shared(numpy.asarray(0., dtype=aesara.config.floatX), name="b")
>>> w = aesara.shared(rng.standard_normal(feats).astype(aesara.config.floatX), name="w")
>>> b = aesara.shared(np.asarray(0., dtype=aesara.config.floatX), name="b")
>>> x.tag.test_value = D[0]
>>> y.tag.test_value = D[1]
>>> # Construct Aesara expression graph
......
import numpy as np
import aesara
x, y, z = aesara.tensor.vectors('xyz')
x, y, z = aesara.tensor.vectors("xyz")
f = aesara.function([x, y, z], [(x + y + z) * 2])
xv = np.random.rand(10).astype(aesara.config.floatX)
yv = np.random.rand(10).astype(aesara.config.floatX)
zv = np.random.rand(10).astype(aesara.config.floatX)
xv = np.random.random((10,)).astype(aesara.config.floatX)
yv = np.random.random((10,)).astype(aesara.config.floatX)
zv = np.random.random((10,)).astype(aesara.config.floatX)
f(xv, yv, zv)
......@@ -49,7 +49,7 @@ upgrade. Here is the current state of what can be done:
aesara.tensor.nnet.conv2d(..., image_shape=(7, 3, 5, 5), filter_shape=(2, 3, 4, 4))
- You can use the ``SpecifyShape`` op to add shape information anywhere in the
- You can use the :class:`SpecifyShape` :class:`Op` to add shape information anywhere in the
graph. This allows to perform some optimizations. In the following example,
this makes it possible to precompute the Aesara function to a constant.
......@@ -67,13 +67,13 @@ Problems with Shape inference
Sometimes this can lead to errors. Consider this example:
>>> import numpy
>>> import numpy as np
>>> import aesara
>>> x = aesara.tensor.matrix('x')
>>> y = aesara.tensor.matrix('y')
>>> z = aesara.tensor.join(0, x, y)
>>> xv = numpy.random.rand(5, 4)
>>> yv = numpy.random.rand(3, 3)
>>> xv = np.random.random((5, 4))
>>> yv = np.random.random((3, 3))
>>> f = aesara.function([x, y], z.shape)
>>> aesara.printing.debugprint(f) # doctest: +NORMALIZE_WHITESPACE
......@@ -109,7 +109,7 @@ This makes the computation of the shape faster, but it can also hide errors. In
this example, the computation of the shape of the output of ``join`` is done only
based on the first input Aesara variable, which leads to an error.
This might happen with other ops such as ``elemwise`` and ``dot``, for example.
This might happen with other `Op`\s such as :class:`Elemwise` and :class:`Dot`, for example.
Indeed, to perform some optimizations (for speed or stability, for instance),
Aesara assumes that the computation is correct and consistent
in the first place, as it does here.
......@@ -118,5 +118,5 @@ You can detect those problems by running the code without this
optimization, using the Aesara flag
``optimizer_excluding=local_shape_to_shape_i``. You can also obtain the
same effect by running in the modes ``FAST_COMPILE`` (it will not apply this
optimization, nor most other optimizations) or ``DebugMode`` (it will test
before and after all optimizations (much slower)).
optimization, nor most other optimizations) or :class:`DebugMode` (it will test
before and after all optimizations).
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论