Add missing documentation formatting and docstrings

1ddf666e · Brandon T. Willard · Brandon T. Willard · f22d3165 · 1ddf666e · 1ddf666e
--- a/aesara/gpuarray/dnn.py
+++ b/aesara/gpuarray/dnn.py
@@ -470,11 +470,7 @@ def get_precision(precision, inputs, for_grad=False):


 class DnnBase(_NoPythonExternalCOp):
-
-    """
-    Creates a handle for cudnn and pulls in the cudnn libraries and headers.
-
-    """
+    """An `Op` that creates a handle for cudnn and pulls in the cudnn libraries and headers."""

    # dnn does not know about broadcasting, so we do not need to assert
    # the input broadcasting pattern.

--- a/aesara/graph/basic.py
+++ b/aesara/graph/basic.py
@@ -287,7 +287,7 @@ class Variable(Node):
    A :term:`Variable` is a node in an expression graph that represents a
    variable.

-    The inputs and outputs of every `Apply` (aesara.graph.basic.Apply) are `Variable`
+    The inputs and outputs of every `Apply` are `Variable`
    instances. The input and output arguments to create a `function` are also
    `Variable` instances. A `Variable` is like a strongly-typed variable in
    some other languages; each `Variable` contains a reference to a `Type`
@@ -318,23 +318,23 @@ class Variable(Node):
    - `Constant`: a subclass which adds a default and un-replaceable
      :literal:`value`, and requires that owner is None.

-    - `TensorVariable` subclass of `Variable` that represents a `numpy.ndarray`
+    - `TensorVariable` subclass of `Variable` that represents a ``numpy.ndarray``
       object.

    - `TensorSharedVariable`: a shared version of `TensorVariable`.

    - `SparseVariable`: a subclass of `Variable` that represents
-      a `scipy.sparse.{csc,csr}_matrix` object.
+      a ``scipy.sparse.{csc,csr}_matrix`` object.

    - `GpuArrayVariable`: a subclass of `Variable` that represents our object on
-      the GPU that is a subset of `numpy.ndarray`.
+      the GPU that is a subset of ``numpy.ndarray``.

    - `RandomVariable`.

    A `Variable` which is the output of a symbolic computation will have an owner
    not equal to None.

-    Using the `Variables`' owner field and the `Apply` nodes' inputs fields,
+    Using a `Variable`'s owner field and an `Apply` node's inputs field,
    one can navigate a graph from an output all the way to the inputs. The
    opposite direction is possible with a ``FunctionGraph`` and its
    ``FunctionGraph.clients`` ``dict``, which maps `Variable`\s to a list of their
@@ -346,9 +346,9 @@ class Variable(Node):
        The type governs the kind of data that can be associated with this
        variable.
    owner : None or Apply instance
-        The Apply instance which computes the value for this variable.
+        The `Apply` instance which computes the value for this variable.
    index : None or int
-        The position of this Variable in owner.outputs.
+        The position of this `Variable` in owner.outputs.
    name : None or str
        A string for pretty-printing and debugging.

@@ -374,8 +374,8 @@ class Variable(Node):
        aesara.function([a,b], [c])     # compilation error because a is constant, it can't be an input


-    The python variables :literal:`a,b,c` all refer to instances of type
-    `Variable`. The `Variable` referred to by `a` is also an instance of
+    The python variables ``a, b, c`` all refer to instances of type
+    `Variable`. The `Variable` referred to by ``a`` is also an instance of
    `Constant`.

    """
@@ -421,7 +421,7 @@ class Variable(Node):
        return self.tag.test_value

    def __str__(self):
-        """Return a str representation of the Variable."""
+        """Return a ``str`` representation of the `Variable`."""
        if self.name is not None:
            return self.name
        if self.owner is not None:
@@ -434,7 +434,7 @@ class Variable(Node):
            return f"<{self.type}>"

    def __repr_test_value__(self):
-        """Return a repr of the test value.
+        """Return a ``repr`` of the test value.

        Return a printable representation of the test value. It can be
        overridden by classes with non printable test_value to provide a
@@ -443,11 +443,11 @@ class Variable(Node):
        return repr(self.get_test_value())

    def __repr__(self, firstPass=True):
-        """Return a repr of the Variable.
+        """Return a ``repr`` of the `Variable`.

        Return a printable name or description of the Variable. If
-        config.print_test_value is True it will also print the test_value if
-        any.
+        ``config.print_test_value`` is ``True`` it will also print the test
+        value, if any.
        """
        to_print = [str(self)]
        if config.print_test_value and firstPass:
@@ -458,13 +458,12 @@ class Variable(Node):
        return "\n".join(to_print)

    def clone(self):
-        """
-        Return a new Variable like self.
+        """Return a new `Variable` like `self`.

        Returns
        -------
        Variable instance
-            A new Variable instance (or subclass instance) with no owner or
+            A new `Variable` instance (or subclass instance) with no owner or
            index.

        Notes
@@ -505,13 +504,12 @@ class Variable(Node):
        return []

    def eval(self, inputs_to_values=None):
-        """
-        Evaluates this variable.
+        r"""Evaluate the `Variable`.

        Parameters
        ----------
-        inputs_to_values
-            A dictionary mapping aesara Variables to values.
+        inputs_to_values :
+            A dictionary mapping Aesara `Variable`\s to values.

        Examples
        --------
@@ -524,16 +522,16 @@ class Variable(Node):
        >>> np.allclose(z.eval({x : 16.3, y : 12.1}), 28.4)
        True

-        We passed :func:`eval` a dictionary mapping symbolic aesara
-        variables to the values to substitute for them, and it returned
+        We passed :meth:`eval` a dictionary mapping symbolic Aesara
+        `Variable`\s to the values to substitute for them, and it returned
        the numerical value of the expression.

        Notes
        -----

-        `eval` will be slow the first time you call it on a variable --
+        :meth:`eval` will be slow the first time you call it on a variable --
        it needs to call :func:`function` to compile the expression behind
-        the scenes. Subsequent calls to :func:`eval` on that same variable
+        the scenes. Subsequent calls to :meth:`eval` on that same variable
        will be fast, because the variable caches the compiled function.

        This way of computing has more overhead than a normal Aesara
@@ -588,10 +586,10 @@ class Variable(Node):


 class Constant(Variable):
-    """A `Variable` with a fixed `value` field.
+    """A `Variable` with a fixed `data` field.

-    Constant nodes make numerous optimizations possible (e.g. constant inlining
-    in C code, constant folding, etc.)
+    `Constant` nodes make numerous optimizations possible (e.g. constant
+    in-lining in C code, constant folding, etc.)

    Notes
    -----
@@ -630,7 +628,8 @@ class Constant(Variable):
            return f"{type(self).__name__}{{{name}}}"

    def clone(self):
-        """
+        """Create a shallow clone.
+
        We clone this object, but we don't clone the data to lower memory
        requirement. We suppose that the data will never change.

@@ -640,13 +639,12 @@ class Constant(Variable):
        return cp

    def __set_owner(self, value):
-        """
-        WRITEME
+        """Prevent the :attr:`owner` property from being set.

        Raises
        ------
        ValueError
-            If `value` is not `None`.
+            If `value` is not ``None``.

        """
        if value is not None:
@@ -888,8 +886,8 @@ def clone(
    -----

    A constant, if in the `inputs` list is not an orphan. So it will be copied
-    depending of the `copy_inputs` parameter. Otherwise it will be copied
-    depending of the `copy_orphans` parameter.
+    conditional on the `copy_inputs` parameter; otherwise, it will be copied
+    conditional on the `copy_orphans` parameter.

    """
    if copy_orphans is None:
@@ -906,7 +904,7 @@ def clone_get_equiv(
    memo: Optional[Dict[Variable, Variable]] = None,
 ):
    """
-    Return a dictionary that maps from Variable and Apply nodes in the
+    Return a dictionary that maps from `Variable` and `Apply` nodes in the
    original graph to a new node (a clone) in a new graph.

    This function works by recursively cloning inputs... rebuilding a directed
@@ -921,8 +919,8 @@ def clone_get_equiv(
        nodes (the bottom of a feed-upward graph).
        False means to clone a graph that is rooted at the original input
        nodes.
-    copy_orphans:
-        When True, new constant nodes are created. When False, original
+    copy_orphans :
+        When ``True``, new constant nodes are created. When ``False``, original
        constant nodes are reused in the new graph.
    memo : None or dict
        Optionally start with a partly-filled dictionary for the return value.
@@ -984,8 +982,8 @@ def clone_replace(
    replace : dict
        Dictionary describing which subgraphs should be replaced by what.
    share_inputs : bool
-        If True, use the same inputs (and shared variables) as the original
-        graph. If False, clone them. Note that cloned shared variables still
+        If ``True``, use the same inputs (and shared variables) as the original
+        graph. If ``False``, clone them. Note that cloned shared variables still
        use the same underlying storage, so they will always have the same
        value.

@@ -1032,15 +1030,15 @@ def general_toposort(
    Parameters
    ----------
    deps : callable
-        A python function that takes a node as input and returns its dependence.
+        A Python function that takes a node as input and returns its dependencies.
    compute_deps_cache : optional
-        If provided deps_cache should also be provided. This is a function like
-        deps, but that also cache its results in a dict passed as deps_cache.
+        If provided, `deps_cache` should also be provided. This is a function like
+        `deps`, but that also caches its results in a ``dict`` passed as `deps_cache`.
    deps_cache : dict
-        A dict mapping nodes to their children.  This is populated by
+        A ``dict`` mapping nodes to their children.  This is populated by
        `compute_deps_cache`.
    clients : dict
-        If a dict is passed it will be filled with a mapping of
+        If a ``dict`` is passed, it will be filled with a mapping of
        nodes-to-clients for each node in the subgraph.

    Notes
@@ -1226,11 +1224,7 @@ def default_node_formatter(op, argstrings):


 def io_connection_pattern(inputs, outputs):
-    """
-    Returns the connection pattern of a subgraph defined by given
-    inputs and outputs.
-
-    """
+    """Return the connection pattern of a subgraph defined by given inputs and outputs."""
    inner_nodes = io_toposort(inputs, outputs)

    # Initialize 'connect_pattern_by_var' by establishing each input as
@@ -1298,10 +1292,7 @@ def io_connection_pattern(inputs, outputs):
 def op_as_string(
    i, op, leaf_formatter=default_leaf_formatter, node_formatter=default_node_formatter
 ):
-    """
-    Op to return a string representation of the subgraph
-    between i and o
-    """
+    """Return a string representation of the subgraph between `i` and ``op.inputs``."""
    strs = as_string(i, op.inputs, leaf_formatter, node_formatter)
    return node_formatter(op, strs)

@@ -1312,7 +1303,7 @@ def as_string(
    leaf_formatter=default_leaf_formatter,
    node_formatter=default_node_formatter,
 ) -> List[str]:
-    r"""Returns a string representation of the subgraph between inputs and outputs.
+    r"""Returns a string representation of the subgraph between `inputs` and `outputs`.

    Parameters
    ----------
@@ -1321,9 +1312,9 @@ def as_string(
    outputs : list
        Output `Variable`\s.
    leaf_formatter : callable
-        Takes a `Variable`  and returns a string to describe it.
+        Takes a `Variable` and returns a string to describe it.
    node_formatter : callable
-        Takes an `Op`  and the list of strings corresponding to its arguments
+        Takes an `Op` and the list of strings corresponding to its arguments
        and returns a string to describe it.

    Returns
@@ -1332,7 +1323,7 @@ def as_string(
        Returns a string representation of the subgraph between `inputs` and
        `outputs`. If the same node is used by several other nodes, the first
        occurrence will be marked as :literal:`*n -> description` and all
-        subsequent occurrences will be marked as :literal:`*n`, where n is an id
+        subsequent occurrences will be marked as :literal:`*n`, where ``n`` is an id
        number (ids are attributed in an unspecified order and only exist for
        viewing convenience).

@@ -1465,29 +1456,29 @@ def is_in_ancestors(l_apply: Apply, f_node: Apply) -> bool:

 @contextlib.contextmanager
 def nodes_constructed():
-    """
-    A contextmanager that is used in inherit_stack_trace and keeps track
+    r"""
+    A context manager that is used in ``inherit_stack_trace`` and keeps track
    of all the newly created variable nodes inside an optimization. A list
-    of new_nodes is instantiated but will be filled in a lazy manner (when
-    Variable.notify_construction_observers is called).
+    of ``new_nodes`` is instantiated but will be filled in a lazy manner (when
+    ``Variable.notify_construction_observers`` is called).


-    `observer` is the entity that updates the new_nodes list.
-    construction_observers is a list inside Variable class and contains
+    ``observer`` is the entity that updates the ``new_nodes`` list.
+    ``construction_observers`` is a list inside the `Variable` class and contains
    a list of observer functions. The observer functions inside
-    construction_observers are only called when a variable node is
-    instantiated (where Variable.notify_construction_observers is called).
-    When the observer function is called, a new variable node is added to
-    the new_nodes list.
+    ``construction_observers`` are only called when a `Variable` is
+    instantiated (where ``Variable.notify_construction_observers`` is called).
+    When the observer function is called, a new `Variable` is added to
+    the ``new_nodes`` list.


    Parameters
    ----------
    new_nodes
-        A list of all the variable nodes that are created inside the optimization.
+        A list of all the `Variable`\s that are created inside the optimization.

    yields
-        new_nodes list.
+        ``new_nodes`` list.
    """
    new_nodes = []

@@ -1503,8 +1494,8 @@ def equal_computations(xs, ys, in_xs=None, in_ys=None):
    """Checks if Aesara graphs represent the same computations.

    The two lists `xs`, `ys` should have the same number of entries. The
-    function checks if for any corresponding pair `(x,y)` from `zip(xs,ys)`
-    `x` and `y` represent the same computations on the same variables
+    function checks if for any corresponding pair ``(x, y)`` from ``zip(xs, ys)``
+    ``x`` and ``y`` represent the same computations on the same variables
    (unless equivalences are provided using `in_xs`, `in_ys`).

    If `in_xs` and `in_ys` are provided, then when comparing a node ``x`` with

--- a/aesara/graph/fg.py
+++ b/aesara/graph/fg.py
@@ -241,9 +241,10 @@ class FunctionGraph(MetaObject):

        Parameters
        ----------
-        var : Variable.
+        var : Variable
+            The `Variable` to be updated.
        new_client : (Apply, int)
-            A `(node, i)` pair such that `node.inputs[i]` is `var`.
+            A ``(node, i)`` pair such that ``node.inputs[i]`` is `var`.

        """
        self.clients[var].append(new_client)
@@ -251,7 +252,7 @@ class FunctionGraph(MetaObject):
    def remove_client(
        self, var: Variable, client_to_remove: Tuple[Apply, int], reason: str = None
    ) -> None:
-        """Recursively removes clients of a variable.
+        """Recursively remove clients of a variable.

        This is the main method to remove variables or `Apply` nodes from
        a `FunctionGraph`.
@@ -265,7 +266,7 @@ class FunctionGraph(MetaObject):
        var : Variable
            The clients of `var` that will be removed.
        client_to_remove : pair of (Apply, int)
-            A `(node, i)` pair such that `node.inputs[i]` will no longer be
+            A ``(node, i)`` pair such that ``node.inputs[i]`` will no longer be
            `var` in this `FunctionGraph`.

        """
@@ -359,11 +360,11 @@ class FunctionGraph(MetaObject):
        reason: str = None,
        import_missing: bool = False,
    ) -> None:
-        """Recursively import everything between an `Apply` node and the `FunctionGraph`'s outputs.
+        """Recursively import everything between an ``Apply`` node and the ``FunctionGraph``'s outputs.

        Parameters
        ----------
-        apply_node : aesara.graph.basic.Apply
+        apply_node : Apply
            The node to be imported.
        check : bool
            Check that the inputs for the imported nodes are also present in
@@ -419,7 +420,7 @@ class FunctionGraph(MetaObject):

    def change_input(
        self,
-        node: Apply,
+        node: Union[Apply, str],
        i: int,
        new_var: Variable,
        reason: str = None,
@@ -435,15 +436,15 @@ class FunctionGraph(MetaObject):

        Parameters
        ----------
-        node : aesara.graph.basic.Apply or str
+        node
            The node for which an input is to be changed.  If the value is
            the string ``"output"`` then the ``self.outputs`` will be used
            instead of ``node.inputs``.
-        i : int
+        i
            The index in `node.inputs` that we want to change.
-        new_var : aesara.graph.basic.Variable
+        new_var
            The new variable to take the place of ``node.inputs[i]``.
-        import_missing : bool
+        import_missing
            Add missing inputs instead of raising an exception.
        """
        # TODO: ERROR HANDLING FOR LISTENERS (should it complete the change or revert it?)
@@ -494,15 +495,15 @@ class FunctionGraph(MetaObject):

        Parameters
        ----------
-        var : aesara.graph.basic.Variable
+        var
            The variable to be replaced.
-        new_var : aesara.graph.basic.Variable
+        new_var
            The variable to replace `var`.
-        reason : str
+        reason
            The name of the optimization or operation in progress.
-        verbose : bool
+        verbose
            Print `reason`, `var`, and `new_var`.
-        import_missing : bool
+        import_missing
            Import missing variables.

        """
@@ -548,12 +549,12 @@ class FunctionGraph(MetaObject):
            )

    def replace_all(self, pairs: List[Tuple[Variable, Variable]], **kwargs) -> None:
-        """Replace variables in the ``FunctionGraph`` according to ``(var, new_var)`` pairs in a list."""
+        """Replace variables in the `FunctionGraph` according to ``(var, new_var)`` pairs in a list."""
        for var, new_var in pairs:
            self.replace(var, new_var, **kwargs)

    def attach_feature(self, feature: Feature) -> None:
-        """Add a ``graph.features.Feature`` to this function graph and trigger its on_attach callback."""
+        """Add a ``graph.features.Feature`` to this function graph and trigger its ``on_attach`` callback."""
        # Filter out literally identical `Feature`s
        if feature in self._features:
            return  # the feature is already present
@@ -579,10 +580,9 @@ class FunctionGraph(MetaObject):
        self._features.append(feature)

    def remove_feature(self, feature: Feature) -> None:
-        """
-        Removes the feature from the graph.
+        """Remove a feature from the graph.

-        Calls feature.on_detach(function_graph) if an on_detach method
+        Calls ``feature.on_detach(function_graph)`` if an ``on_detach`` method
        is defined.

        """
@@ -596,9 +596,9 @@ class FunctionGraph(MetaObject):
            detach(self)

    def execute_callbacks(self, name: str, *args, **kwargs) -> None:
-        """Execute callbacks
+        """Execute callbacks.

-        Calls `getattr(feature, name)(*args)` for each feature which has
+        Calls ``getattr(feature, name)(*args)`` for each feature which has
        a method called after name.

        """
@@ -619,8 +619,7 @@ class FunctionGraph(MetaObject):
    def collect_callbacks(self, name: str, *args) -> Dict[Feature, Any]:
        """Collects callbacks

-        Returns a dictionary d such that
-        `d[feature] == getattr(feature, name)(*args)`
+        Returns a dictionary d such that ``d[feature] == getattr(feature, name)(*args)``
        For each feature which has a method called after name.
        """
        d = {}
@@ -633,17 +632,17 @@ class FunctionGraph(MetaObject):
        return d

    def toposort(self) -> List[Apply]:
-        """Toposort
+        """Return a toposorted list of the nodes.

-        Return an ordering of the graph's Apply nodes such that
+        Return an ordering of the graph's ``Apply`` nodes such that:

-        * All the nodes of the inputs of a node are before that node.
-        * Satisfies the orderings provided by each feature that has
-          an 'orderings' method.
+        * all the nodes of the inputs of a node are before that node and
+        * they satisfy the orderings provided by each feature that has
+          an ``orderings`` method.

-        If a feature has an 'orderings' method, it will be called with
-        this FunctionGraph as sole argument. It should return a dictionary of
-        `{node: predecessors}` where predecessors is a list of nodes that
+        If a feature has an ``orderings`` method, it will be called with
+        this `FunctionGraph` as sole argument. It should return a dictionary of
+        ``{node: predecessors}`` where predecessors is a list of nodes that
        should be computed before the key node.
        """
        if len(self.apply_nodes) < 2:
@@ -661,15 +660,15 @@ class FunctionGraph(MetaObject):
        return order

    def orderings(self) -> Dict[Apply, List[Apply]]:
-        """Return `dict` `d` s.t. `d[node]` is a list of nodes that must be evaluated before `node` itself can be evaluated.
+        """Return ``dict`` ``d`` s.t. ``d[node]`` is a list of nodes that must be evaluated before ``node`` itself can be evaluated.

-        This is used primarily by the destroy_handler feature to ensure that
+        This is used primarily by the ``destroy_handler`` feature to ensure that
        the clients of any destroyed inputs have already computed their
        outputs.

        Notes
        -----
-        This only calls the `orderings()` function on all features. It does not
+        This only calls the ``orderings()`` function on all features. It does not
        take care of computing the dependencies by itself.

        """
@@ -707,10 +706,7 @@ class FunctionGraph(MetaObject):
            return ords

    def check_integrity(self) -> None:
-        """
-        Call this for a diagnosis if things go awry.
-
-        """
+        """Check the integrity of nodes in the graph."""
        nodes = set(applys_between(self.inputs, self.outputs))
        if self.apply_nodes != nodes:
            missing = nodes.difference(self.apply_nodes)
@@ -763,10 +759,7 @@ class FunctionGraph(MetaObject):
        return f"FunctionGraph({', '.join(graph_as_string(self.inputs, self.outputs))})"

    def clone(self, check_integrity=True) -> "FunctionGraph":
-        """
-        Clone the graph and get a memo( a dict )that map old node to new node
-
-        """
+        """Clone the graph."""
        return self.clone_get_equiv(check_integrity)[0]

    def clone_get_equiv(
@@ -806,11 +799,8 @@ class FunctionGraph(MetaObject):
        return e, equiv

    def __getstate__(self):
-        """
-        This is needed as some features introduce instance methods.
-        This is not picklable.
-
-        """
+        # This is needed because some features introduce instance methods,
+        # which are not picklable.
        d = self.__dict__.copy()
        for feature in self._features:
            for attr in getattr(feature, "pickle_rm_attr", []):

--- a/aesara/graph/op.py
+++ b/aesara/graph/op.py
@@ -43,8 +43,6 @@ from aesara.graph.utils import (
 from aesara.link.c.interface import CLinkerOp


-__docformat__ = "restructuredtext en"
-
 StorageMapType = List[Optional[List[Any]]]
 ComputeMapType = List[bool]
 OutputStorageType = List[Optional[List[Any]]]
@@ -150,14 +148,14 @@ class Op(MetaObject):
    page on :doc:`graph`.

    For more details regarding how these methods should behave: see the `Op
-    Contract` in the sphinx docs (advanced tutorial on `Op`-making).
+    Contract` in the sphinx docs (advanced tutorial on `Op` making).

    """

    default_output: Optional[int] = None
    """
-    An `int` that specifies which output `Op.__call__` should return.  If
-    `None`, then all outputs are returned.
+    An ``int`` that specifies which output :meth:`Op.__call__` should return.  If
+    ``None``, then all outputs are returned.

    A subclass should not change this class variable, but instead override it
    with a subclass variable or an instance variable.
@@ -228,9 +226,9 @@ class Op(MetaObject):
        return Apply(self, inputs, [o() for o in self.otypes])

    def __call__(self, *inputs: Any, **kwargs) -> Union[Variable, List[Variable]]:
-        r"""Construct an `Apply` node using `self.make_node` and return its outputs.
+        r"""Construct an `Apply` node using :meth:`Op.make_node` and return its outputs.

-        This method is just a wrapper around `Op.make_node`.
+        This method is just a wrapper around :meth:`Op.make_node`.

        It is called by code such as:

@@ -240,14 +238,13 @@ class Op(MetaObject):
           y = aesara.tensor.exp(x)


-        `tensor.exp` is an Op instance, so `tensor.exp(x)` calls
-        `tensor.exp.__call__` (i.e. this method) and returns its single output
-        `Variable`, `y`.  The `Apply` node constructed by `self.make_node`
-        behind the scenes is available via `y.owner`.
+        `aesara.tensor.exp` is an `Op` instance, so ``aesara.tensor.exp(x)`` calls
+        ``aesara.tensor.exp.__call__`` (i.e. this method) and returns its single output
+        `Variable`, ``y``.  The `Apply` node constructed by :meth:`Op.make_node`
+        behind the scenes is available via ``y.owner``.

        `Op` authors are able to determine which output is returned by this method
-        via the `Op.default_output` property., but subclasses are free to override this
-        function and ignore `default_output`.
+        via the :attr:`Op.default_output` property.

        Parameters
        ----------
@@ -304,7 +301,7 @@ class Op(MetaObject):
        Each returned `Variable` represents the gradient with respect to that
        input computed based on the symbolic gradients with respect to each
        output. If the output is not differentiable with respect to an input,
-        then this method should return an instance of type `NullType` for that
+        then this method should return an instance of type ``NullType`` for that
        input.

        Parameters
@@ -331,12 +328,12 @@ class Op(MetaObject):
        r"""Construct a graph for the L-operator.

        This method is primarily used by `Lop` and dispatches to
-        `Op.grad` by default.
+        :meth:`Op.grad` by default.

-        The *L-operator* computes a *row* vector times the Jacobian. The
+        The L-operator computes a *row* vector times the Jacobian. The
        mathematical relationship is
        :math:`v \frac{\partial f(x)}{\partial x}`.
-        The *L-operator* is also supported for generic tensors (not only for
+        The L-operator is also supported for generic tensors (not only for
        vectors).

        Parameters
@@ -389,26 +386,26 @@ class Op(MetaObject):
            The symbolic `Apply` node that represents this computation.
        inputs : Sequence
            Immutable sequence of non-symbolic/numeric inputs.  These
-            are the values of each `Variable` in `node.inputs`.
+            are the values of each `Variable` in :attr:`node.inputs`.
        output_storage : list of list
            List of mutable single-element lists (do not change the length of
            these lists).  Each sub-list corresponds to value of each
-            `Variable` in `node.outputs`.  The primary purpose of this method
+            `Variable` in :attr:`node.outputs`.  The primary purpose of this method
            is to set the values of these sub-lists.
        params : tuple
-            A tuple containing the values of each entry in `__props__`.
+            A tuple containing the values of each entry in :attr:`Op.__props__`.

        Notes
        -----
        The `output_storage` list might contain data. If an element of
-        output_storage is not `None`, it has to be of the right type, for
-        instance, for a `TensorVariable`, it has to be a NumPy `ndarray`
+        output_storage is not ``None``, it has to be of the right type, for
+        instance, for a `TensorVariable`, it has to be a NumPy ``ndarray``
        with the right number of dimensions and the correct dtype.
        Its shape and stride pattern can be arbitrary. It is not
        guaranteed that such pre-set values were produced by a previous call to
-        this `Op.perform`; they could've been allocated by another
+        this :meth:`Op.perform`; they could've been allocated by another
        `Op`'s `perform` method.
-        A `Op` is free to reuse `output_storage` as it sees fit, or to
+        An `Op` is free to reuse `output_storage` as it sees fit, or to
        discard it and allocate new memory.

        """
@@ -420,7 +417,7 @@ class Op(MetaObject):
        folded when all its inputs are constant. This allows it to choose where
        it puts its memory/speed trade-off. Also, it could make things faster
        as constants can't be used for in-place operations (see
-        `*IncSubtensor`).
+        ``*IncSubtensor``).

        Parameters
        ----------
@@ -435,7 +432,7 @@ class Op(MetaObject):
        return True

    def get_params(self, node: Apply) -> Params:
-        """Try to detect params from the op if `Op.params_type` is set to a `ParamsType`."""
+        """Try to get parameters for the `Op` when :attr:`Op.params_type` is set to a `ParamsType`."""
        if hasattr(self, "params_type") and isinstance(self.params_type, ParamsType):
            wrapper = self.params_type
            if not all(hasattr(self, field) for field in wrapper.fields):
@@ -457,13 +454,16 @@ class Op(MetaObject):
        compute_map: ComputeMapType,
        impl: Optional[Text],
    ) -> None:
-        """Make any special modifications that the Op needs before doing `Op.make_thunk`.
+        """Make any special modifications that the `Op` needs before doing :meth:`Op.make_thunk`.

        This can modify the node inplace and should return nothing.

-        It can be called multiple time with different impl. It is the
-        op responsibility to don't re-prepare the node when it isn't
-        good to do so.
+        It can be called multiple times with different `impl` values.
+
+        .. warning::
+
+            It is the `Op`'s responsibility to not re-prepare the node when it
+            isn't good to do so.

        """

@@ -477,7 +477,7 @@ class Op(MetaObject):
    ) -> ThunkType:
        """Make a Python thunk.

-        Like `Op.make_thunk` but only makes python thunks.
+        Like :meth:`Op.make_thunk` but only makes Python thunks.

        """
        node_input_storage = [storage_map[r] for r in node.inputs]
@@ -527,7 +527,7 @@ class Op(MetaObject):
        no_recycling: bool,
        impl: Optional[Text] = None,
    ) -> ThunkType:
-        """Create a thunk.
+        r"""Create a thunk.

        This function must return a thunk, that is a zero-arguments
        function that encapsulates the computation to be performed
@@ -536,32 +536,34 @@ class Op(MetaObject):
        Parameters
        ----------
        node
-            Something previously returned by self.make_node.
+            Something previously returned by :meth:`Op.make_node`.
        storage_map
-            dict variable -> one-element-list where a computed
-            value for this variable may be found.
+            A ``dict`` mapping `Variable`\s to single-element lists where a
+            computed value for each `Variable` may be found.
        compute_map
-            dict variable -> one-element-list where a boolean
-            value will be found. The boolean indicates whether the
-            variable's storage_map container contains a valid value (True)
-            or if it has not been computed yet (False).
+            A ``dict`` mapping `Variable`\s to single-element lists where a
+            boolean value can be found. The boolean indicates whether the
+            `Variable`'s `storage_map` container contains a valid value
+            (i.e. ``True``) or whether it has not been computed yet
+            (i.e. ``False``).
        no_recycling
-            List of variables for which it is forbidden to reuse memory
+            List of `Variable`\s for which it is forbidden to reuse memory
            allocated by a previous call.
-        impl: str
+        impl : str
            Description for the type of node created (e.g. ``"c"``, ``"py"``,
            etc.)

        Notes
        -----
-        If the thunk consults the storage_map on every call, it is safe
-        for it to ignore the no_recycling argument, because elements of the
-        no_recycling list will have a value of None in the storage map.  If
-        the thunk can potentially cache return values (like CLinker does),
-        then it must not do so for variables in the no_recycling list.
-
-        self.prepare_node(node, ...) is always called. If we try 'c' and it
-        fail and we try again 'py', prepare_node will be called twice.
+        If the thunk consults the `storage_map` on every call, it is safe
+        for it to ignore the `no_recycling` argument, because elements of the
+        `no_recycling` list will have a value of ``None`` in the `storage_map`.
+        If the thunk can potentially cache return values (like `CLinker` does),
+        then it must not do so for variables in the `no_recycling` list.
+
+        :meth:`Op.prepare_node` is always called. If the ``'c'`` implementation
+        is tried and fails, and ``'py'`` is tried next, :meth:`Op.prepare_node`
+        will be called twice.
        """
        self.prepare_node(
            node, storage_map=storage_map, compute_map=compute_map, impl="py"
@@ -584,7 +586,7 @@ class COp(Op, CLinkerOp):
    ) -> ThunkType:
        """Create a thunk for a C implementation.

-        Like `Op.make_thunk`, but will only try to make a C thunk.
+        Like :meth:`Op.make_thunk`, but will only try to make a C thunk.

        """
        # FIXME: Putting the following import on the module level causes an import cycle.
@@ -640,13 +642,13 @@ class COp(Op, CLinkerOp):
    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
        """Create a thunk.

-        See `Op.make_thunk`.
+        See :meth:`Op.make_thunk`.

        Parameters
        ----------
-        impl
-            Currently, None, 'c' or 'py'. If 'c' or 'py' we will only try
-            that version of the code.
+        impl :
+            Currently, ``None``, ``'c'`` or ``'py'``. If ``'c'`` or ``'py'`` we
+            will only try that version of the code.

        """
        if (impl is None and config.cxx) or impl == "c":
@@ -669,11 +671,11 @@ def get_test_value(v: Variable) -> Any:
    """Get the test value for `v`.

    If input `v` is not already a variable, it is turned into one by calling
-    `as_tensor_variable(v)`.
+    ``as_tensor_variable(v)``.

    Raises
    ------
-    AttributeError if no test value is set.
+    ``AttributeError`` if no test value is set.

    """
    if not isinstance(v, Variable):
@@ -771,33 +773,33 @@ Registry of `Op`\s that have an inner compiled Aesara function.

 The keys are `Op` classes (not instances), and values are the name of the
 attribute that contains the function. For instance, if the function is
-self.fn, the value will be 'fn'.
+``self.fn``, the value will be ``'fn'``.

 We need that to be able not to run debug checks a number of times that is
-exponential in the nesting level of those ops.
-For instance, Scan will be registered here.
+exponential in the nesting level of those `Op`\s.
+
+For instance, `Scan` will be registered here.

 """


 class OpenMPOp(COp):
-    """
-    All op using OpenMP code should inherit from this Op.
+    r"""Base class for `Op`\s using OpenMP.

-    This op will check that the compiler support correctly OpenMP code.
-    If not, it will print a warning and disable openmp for this Op.
-    Then it will generate the not OpenMP code.
+    This `Op` will check that the compiler correctly supports OpenMP code.
+    If not, it will print a warning and disable OpenMP for this `Op`, then it
+    will generate the non-OpenMP code.

-    This is needed as EPD on Windows g++ version spec information tell
-    it support OpenMP, but does not include the OpenMP files.
+    This is needed, as the ``g++`` shipped with EPD on Windows reports that it
+    supports OpenMP, but does not include the OpenMP files.

-    We also add the correct compiler flags in c_compile_args.
+    We also add the correct compiler flags in ``c_compile_args``.

    """

    gxx_support_openmp: Optional[bool] = None
    """
-    True/False after we tested this.
+    ``True``/``False`` after we tested this.

    """

@@ -813,18 +815,14 @@ class OpenMPOp(COp):
            self.openmp = False

    def c_compile_args(self, **kwargs):
-        """
-        Return the compilation arg "fopenmp" if openMP is supported
-        """
+        """Return the compilation argument ``"-fopenmp"`` if OpenMP is supported."""
        self.update_self_openmp()
        if self.openmp:
            return ["-fopenmp"]
        return []

    def c_headers(self, **kwargs):
-        """
-        Return the header file name "omp.h" if openMP is supported
-        """
+        """Return the header file name ``"omp.h"`` if OpenMP is supported."""
        self.update_self_openmp()
        if self.openmp:
            return ["omp.h"]
@@ -832,7 +830,7 @@ class OpenMPOp(COp):

    @staticmethod
    def test_gxx_support():
-        """Check if openMP is supported."""
+        """Check if OpenMP is supported."""
        from aesara.link.c.cmodule import GCC_compiler

        code = """
@@ -852,10 +850,7 @@ int main( int argc, const char* argv[] )
        return default_openmp

    def update_self_openmp(self) -> None:
-        """
-        Make sure self.openmp is not True if there is no support in gxx.
-
-        """
+        """Make sure ``self.openmp`` is not ``True`` if there is no OpenMP support in ``gxx``."""
        if self.openmp:
            if OpenMPOp.gxx_support_openmp is None:
                OpenMPOp.gxx_support_openmp = OpenMPOp.test_gxx_support()
@@ -1072,7 +1067,7 @@ class ExternalCOp(COp):
        it returns:
         - a default macro ``PARAMS_TYPE`` which defines the class name of the
           corresponding C struct.
-         - a macro ``DTYPE_PARAM_key`` for every ``key`` in the ParamsType for which associated
+         - a macro ``DTYPE_PARAM_key`` for every ``key`` in the :class:`ParamsType` for which associated
           type implements the method :func:`aesara.graph.type.CLinkerType.c_element_type`.
           ``DTYPE_PARAM_key`` defines the primitive C type name of an item in a variable
           associated to ``key``.
@@ -1223,10 +1218,7 @@ class ExternalCOp(COp):
        return "\n".join(define_macros), "\n".join(undef_macros)

    def c_init_code_struct(self, node, name, sub):
-        """
-        Stitches all the macros and "init_code" together
-
-        """
+        r"""Stitches all the macros and ``init_code_*``\s together."""
        if "init_code_struct" in self.code_sections:
            op_code = self.code_sections["init_code_struct"]

@@ -1291,9 +1283,7 @@ class ExternalCOp(COp):
                raise NotImplementedError()

    def c_code_cleanup(self, node, name, inputs, outputs, sub):
-        """
-        Stitches all the macros and "code_cleanup" together
-        """
+        r"""Stitches all the macros and ``code_cleanup``\s together."""
        if "code_cleanup" in self.code_sections:
            op_code = self.code_sections["code_cleanup"]

@@ -1339,7 +1329,7 @@ class _NoPythonCOp(COp):


 class _NoPythonExternalCOp(ExternalCOp):
-    """A class used to indicate that a `ExternalCOp` does not provide a Python implementation.
+    """A class used to indicate that an `ExternalCOp` does not provide a Python implementation.

    XXX: Do not use this class; it's only for tracking bad implementations internally.


--- a/aesara/graph/opt.py
+++ b/aesara/graph/opt.py
@@ -52,22 +52,20 @@ class LocalMetaOptimizerSkipAssertionError(AssertionError):


 class GlobalOptimizer(abc.ABC):
-    """
+    """An optimizer that can be applied to a `FunctionGraph` in order to transform it.

-    A L{GlobalOptimizer} can be applied to an L{FunctionGraph} to transform it.
-    It can represent an optimization or in general any kind
-    of transformation you could apply to an L{FunctionGraph}.
+    It can represent an optimization or, in general, any kind of transformation
+    one could apply to a `FunctionGraph`.

    """

    @abc.abstractmethod
    def apply(self, fgraph):
-        """
+        """Apply the optimization to a `FunctionGraph`.

-        Applies the optimization to the provided L{FunctionGraph}. It may
-        use all the methods defined by the L{FunctionGraph}. If the
-        L{GlobalOptimizer} needs to use a certain tool, such as an
-        L{InstanceFinder}, it can do so in its L{add_requirements} method.
+        It may use all the methods defined by the `FunctionGraph`. If the
+        `GlobalOptimizer` needs to use a certain tool, such as an
+        `InstanceFinder`, it can do so in its `add_requirements` method.

        """
        raise NotImplementedError()
@@ -86,9 +84,9 @@ class GlobalOptimizer(abc.ABC):
        return ret

    def __call__(self, fgraph):
-        """
+        """Optimize a `FunctionGraph`.

-        Same as self.optimize(fgraph).
+        This is the same as ``self.optimize(fgraph)``.

        """
        return self.optimize(fgraph)
@@ -151,20 +149,14 @@ class FromFunctionOptimizer(GlobalOptimizer):


 def optimizer(f):
-    """
-    Decorator for FromFunctionOptimizer.
-
-    """
+    """Decorator for `FromFunctionOptimizer`."""
    rval = FromFunctionOptimizer(f)
    rval.__name__ = f.__name__
    return rval


 def inplace_optimizer(f):
-    """
-    Decorator for FromFunctionOptimizer.
-
-    """
+    """Decorator for `FromFunctionOptimizer` that also adds the `DestroyHandler` feature."""
    dh_handler = dh.DestroyHandler
    requirements = (lambda fgraph: fgraph.attach_feature(dh_handler()),)
    rval = FromFunctionOptimizer(f, requirements)
@@ -177,10 +169,7 @@ class SeqOptimizer(GlobalOptimizer, UserList):

    @staticmethod
    def warn(exc, self, optimizer):
-        """
-        Default failure_callback for SeqOptimizer.
-
-        """
+        """Default ``failure_callback`` for `SeqOptimizer`."""
        _logger.error(f"SeqOptimizer apply {optimizer}")
        _logger.error("Traceback:")
        _logger.error(traceback.format_exc())
@@ -209,11 +198,7 @@ class SeqOptimizer(GlobalOptimizer, UserList):
        assert len(kw) == 0

    def apply(self, fgraph):
-        """
-
-        Applies each L{GlobalOptimizer} in self in turn.
-
-        """
+        """Applies each `GlobalOptimizer` in ``self.data`` to `fgraph`."""
        l = []
        if fgraph.profile:
            validate_before = fgraph.profile.validate_time
@@ -375,10 +360,7 @@ class SeqOptimizer(GlobalOptimizer, UserList):

    @staticmethod
    def merge_profile(prof1, prof2):
-        """
-        Merge 2 profiles returned by this cass apply() fct.
-
-        """
+        """Merge two profiles."""
        new_t = []  # the time for the optimization
        new_l = []  # the optimization
        new_sub_profile = []
@@ -536,10 +518,7 @@ class MergeFeature(Feature):
                self.seen_constants.discard(id(c))

    def process_constant(self, fgraph, c):
-        """
-        Check if a constant can be merged, and queue that replacement.
-
-        """
+        """Check if a constant `c` can be merged, and queue that replacement."""
        if id(c) in self.seen_constants:
            return
        sig = c.merge_signature()
@@ -557,10 +536,7 @@ class MergeFeature(Feature):
            self.seen_constants.add(id(c))

    def process_node(self, fgraph, node):
-        """
-        Check if a node can be merged, and queue that replacement.
-
-        """
+        """Check if a `node` can be merged, and queue that replacement."""
        if node in self.nodes_seen:
            return

@@ -739,16 +715,16 @@ class MergeFeature(Feature):


 class MergeOptimizer(GlobalOptimizer):
-    """
-    Merges parts of the graph that are identical and redundant.
+    r"""Merges parts of the graph that are identical and redundant.

-    The basic principle is that if two Applies have ops that compare equal, and
+    The basic principle is that if two `Apply`\s have `Op`\s that compare equal, and
    identical inputs, then they do not both need to be computed. The clients of
    one are transferred to the other and one of them is removed from the graph.
-    This procedure is carried out in input->output order through the graph.
+    This procedure is carried out in input-to-output order throughout the graph.

    The first step of merging is constant-merging, so that all clients of an
-    int(1) for example, are transferred to a particular instance of int(1).
+    ``int(1)`` for example, are transferred to just one particular instance of
+    ``int(1)``.

    """

@@ -965,17 +941,24 @@ class MergeOptimizer(GlobalOptimizer):


 def pre_constant_merge(fgraph, variables):
-    """Merge constants in the graphs for a list of `variables`.
+    """Merge constants in the graphs given by `variables`.

-    XXX: This changes the nodes in a graph in-place!
+    .. warning::

-    `variables` is a list of nodes, and we want to merge together nodes that
-    are constant inputs used to compute nodes in that list.
+        This changes the nodes in a graph in-place!

-    We also want to avoid terms in the graphs for `variables` that are
-    contained in the `FunctionGraph` given by `fgraph`.  The reason for that:
-    it will break consistency of `fgraph` and its features
-    (e.g. `ShapeFeature`).
+    Parameters
+    ----------
+    fgraph
+        A `FunctionGraph` instance in which some of these `variables` may
+        reside.
+
+        We want to avoid terms in `variables` that are contained in `fgraph`.
+        The reason for that: it will break consistency of `fgraph` and its
+        features (e.g. `ShapeFeature`).
+
+    variables
+        A list of nodes for which we want to merge constant inputs.

    Notes
    -----
@@ -1034,54 +1017,49 @@ class LocalOptimizer(abc.ABC):
        return self._optimizer_idx

    def tracks(self):
-        """
-        Return the list of op classes that this opt applies to.
+        """Return the list of `Op` classes to which this optimization applies.

-        Return None to apply to all nodes.
+        Returns ``None`` when the optimization applies to all nodes.

        """
        return None

    @abc.abstractmethod
    def transform(self, fgraph, node, *args, **kwargs):
-        """
-        Transform a subgraph whose output is `node`.
+        r"""Transform a subgraph whose output is `node`.

-        Subclasses should implement this function so that it returns one of two
-        kinds of things:
+        Subclasses should implement this function so that it returns one of the
+        following:
+
+        - ``False`` to indicate that no optimization can be applied to this `node`.
+        - A list of `Variable`\s to use in place of the `node`'s current outputs.
+        - A ``dict`` mapping old `Variable`\s to new `Variable`\s.

-        - False to indicate that no optimization can be applied to this `node`;
-          or
-        - <list of variables> to use in place of `node`'s outputs in the
-          greater graph.
-        - dict(old variables -> new variables). A dictionary that map
-          from old variables to new variables to replace.

        Parameters
        ----------
-        node : an Apply instance
+        fgraph :
+            A `FunctionGraph` containing `node`.
+        node :
+            An `Apply` node to be transformed.

        """

        raise NotImplementedError()

    def add_requirements(self, fgraph):
-        """
-        If this local optimization wants to add some requirements to the
-        fgraph, this is the place to do it.
-
-        """
+        r"""Add required `Feature`\s to `fgraph`."""

    def print_summary(self, stream=sys.stdout, level=0, depth=-1):
        print(f"{' ' * level}{self.__class__.__name__} id={id(self)}", file=stream)


 class LocalMetaOptimizer(LocalOptimizer):
-    """
-    Base class for meta-optimizers that try a set of LocalOptimizers
+    r"""
+    Base class for meta-optimizers that try a set of `LocalOptimizer`\s
    to replace a node and choose the one that executes the fastest.

-    If the error LocalMetaOptimizerSkipAssertionError is raised during
+    If the error ``LocalMetaOptimizerSkipAssertionError`` is raised during
    compilation, we will skip that function compilation and not print
    the error.

@@ -1175,17 +1153,17 @@ class LocalMetaOptimizer(LocalOptimizer):
        return

    def provide_inputs(self, node, inputs):
-        """
-        If implemented, returns a dictionary mapping all symbolic variables
-        in ``inputs`` to SharedVariable instances of suitable dummy values.
-        The ``node`` can be inspected to infer required input shapes.
+        """Return a dictionary mapping some `inputs` to `SharedVariable` instances with dummy values.
+
+        The `node` argument can be inspected to infer required input shapes.

        """
        raise NotImplementedError()

    def get_opts(self, node):
-        """
-        Can be overridden to change the way opts are selected
+        """Return the optimizations that apply to `node`.
+
+        This uses ``self.track_dict[type(node.op)]`` by default.
        """
        return self.track_dict[type(node.op)]

@@ -1196,7 +1174,7 @@ class LocalMetaOptimizer(LocalOptimizer):


 class FromFunctionLocalOptimizer(LocalOptimizer):
-    """An optimizer constructed from a given function."""
+    """A `LocalOptimizer` constructed from a function."""

    def __init__(self, fn, tracks=None, requirements=()):
        self.fn = fn
@@ -1222,10 +1200,6 @@ class FromFunctionLocalOptimizer(LocalOptimizer):

 def local_optimizer(tracks, inplace=False, requirements=()):
    def decorator(f):
-        """
-        WRITEME
-
-        """
        if tracks is not None:
            if len(tracks) == 0:
                raise ValueError(
@@ -1252,22 +1226,28 @@ def local_optimizer(tracks, inplace=False, requirements=()):


 class LocalOptGroup(LocalOptimizer):
-    """Takes a list of LocalOptimizer and applies them to the node.
+    r"""An optimizer that applies a list of `LocalOptimizer`\s to a node.

    Parameters
    ----------
    optimizers :
-        The List of optimizers to be applied to a node
-    reentrant : bool (Default True)
-        Keyword only argument. Reentrant information. Some global
-        optimizer like NavigatorOptimizer can use this value to
-        determine if it ignore new nodes during a pass on the
-        nodes. Sometimes, ignore_newtrees is not reentrant.
+        A list of optimizers to be applied to nodes.
    apply_all_opts : bool (Default False)
-        If False, it will return after the new node after the first optimizer
+        If ``False``, it will return after the new node after the first optimizer
        applied. Otherwise, it will start again with the new node until no new
        optimization apply.
+    profile :
+        Whether or not to profile the optimizations.

+    Attributes
+    ----------
+    reentrant : bool
+        Some global optimizers like `NavigatorOptimizer` can use this value to
+        determine if they ignore new nodes during a pass on the nodes. Sometimes,
+        ``ignore_newtrees`` is not reentrant.
+    retains_inputs : bool
+        States whether or not the inputs of a transformed node are transferred
+        to the outputs.
    """

    def __init__(self, *optimizers, **kwargs):
@@ -1429,13 +1409,13 @@ class LocalOptGroup(LocalOptimizer):


 class GraphToGPULocalOptGroup(LocalOptGroup):
-    """This is the equivalent of LocalOptGroup for GraphToGPU.
+    """This is the equivalent of `LocalOptGroup` for `GraphToGPU`.

    The main different is the function signature of the local
-    optimizer that use the GraphToGPU signature and not the normal
-    LocalOptimizer signature.
+    optimizer that uses the `GraphToGPU` signature and not the normal
+    `LocalOptimizer` signature.

-    apply_all_opts=True is not supported
+    ``apply_all_opts=True`` is not supported.

    """

@@ -1468,13 +1448,13 @@ class GraphToGPULocalOptGroup(LocalOptGroup):
 class OpSub(LocalOptimizer):
    """

-    Replaces the application of a certain op by the application of
-    another op that takes the same inputs as what they are replacing.
+    Replaces the application of a certain `Op` by the application of
+    another `Op` that takes the same inputs as what it is replacing.

    Parameters
    ----------
    op1, op2
-        op1.make_node and op2.make_node must take the same number of
+        ``op1.make_node`` and ``op2.make_node`` must take the same number of
        inputs and have the same number of outputs.

    Examples
@@ -1517,8 +1497,7 @@ class OpSub(LocalOptimizer):

 class OpRemove(LocalOptimizer):
    """
-
-    Removes all applications of an op by transferring each of its
+    Removes all applications of an `Op` by transferring each of its
    outputs to the corresponding input.

    """
@@ -1583,31 +1562,31 @@ class PatternSub(LocalOptimizer):
    match iff a constant variable with the same value and the same type
    is found in its place.

-    You can add a constraint to the match by using the dict(...)  form
-    described above with a 'constraint' key. The constraint must be a
+    You can add a constraint to the match by using the ``dict(...)`` form
+    described above with a ``'constraint'`` key. The constraint must be a
    function that takes the fgraph and the current Variable that we are
    trying to match and returns True or False according to an
    arbitrary criterion.

-    The constructor creates a PatternSub that replaces occurrences of
-    in_pattern by occurrences of out_pattern.
+    The constructor creates a `PatternSub` that replaces occurrences of
+    `in_pattern` by occurrences of `out_pattern`.

    Parameters
    ----------
-    in_pattern
+    in_pattern :
        The input pattern that we want to replace.
-    out_pattern
+    out_pattern :
        The replacement pattern.
    allow_multiple_clients : bool
        If False, the pattern matching will fail if one of the subpatterns has
        more than one client.
    skip_identities_fn : TODO
-    name
+    name :
        Allows to override this optimizer name.
    pdb : bool
        If True, we invoke pdb when the first node in the pattern matches.
    tracks : optional
-        The values that self.tracks() will return. Useful to speed up
+        The values that :meth:`self.tracks` will return. Useful to speed up
        optimization sometimes.
    get_nodes : optional
        If you provide `tracks`, you must provide this parameter. It must be a
@@ -1617,7 +1596,7 @@ class PatternSub(LocalOptimizer):
    Notes
    -----
    `tracks` and `get_nodes` can be used to make this optimizer track a less
-    frequent Op, so this will make this optimizer tried less frequently.
+    frequent `Op`, so this will make this optimizer tried less frequently.

    Examples
    --------
@@ -1653,7 +1632,7 @@ class PatternSub(LocalOptimizer):
            self.op = self.in_pattern["pattern"][0]
        else:
            raise TypeError(
-                "The pattern to search for must start with " "a specific Op instance."
+                "The pattern to search for must start with a specific Op instance."
            )
        self.__doc__ = (
            self.__class__.__doc__ + "\n\nThis instance does: " + str(self) + "\n"
@@ -1677,9 +1656,9 @@ class PatternSub(LocalOptimizer):
        return [self.op]

    def transform(self, fgraph, node, get_nodes=True):
-        """
-        Checks if the graph from node corresponds to in_pattern. If it does,
-        constructs out_pattern and performs the replacement.
+        """Check if the graph from node corresponds to ``in_pattern``.
+
+        If it does, it constructs ``out_pattern`` and performs the replacement.

        """
        from aesara.graph import unify
@@ -1857,19 +1836,23 @@ class Updater(Feature):


 class NavigatorOptimizer(GlobalOptimizer):
-    """
-    Abstract class.
+    r"""An optimizer that applies a `LocalOptimizer` with considerations for the new nodes it creates.
+
+
+    This optimizer also allows the `LocalOptimizer` to use a special ``"remove"`` value
+    in the ``dict``\s returned by :meth:`LocalOptimizer.transform`.  `Variable`\s mapped to this
+    value are removed from the `FunctionGraph`.

    Parameters
    ----------
-    local_opt
-        A LocalOptimizer to apply over a FunctionGraph (or None is Ok too).
-    ignore_newtrees
-        - True: new subgraphs returned by an optimization is not a
+    local_opt :
+        A `LocalOptimizer` to apply over a `FunctionGraph` (or ``None``).
+    ignore_newtrees :
+        - ``True``: new subgraphs returned by an optimization are not a
          candidate for optimization.
-        - False: new subgraphs returned by an optimization is a candidate
+        - ``False``: new subgraphs returned by an optimization are a candidate
          for optimization.
-        - 'auto': let the local_opt set this parameter via its 'reentrant'
+        - ``'auto'``: let the `local_opt` set this parameter via its :attr:`reentrant`
          attribute.
    failure_callback
        A function with the signature ``(exception, navigator, [(old, new),
@@ -1888,10 +1871,7 @@ class NavigatorOptimizer(GlobalOptimizer):

    @staticmethod
    def warn(exc, nav, repl_pairs, local_opt, node):
-        """
-        Failure_callback for NavigatorOptimizer: print traceback.
-
-        """
+        """A failure callback that prints a traceback."""
        if config.on_opt_error != "ignore":
            _logger.error(f"Optimization failure due to: {local_opt}")
            _logger.error(f"node: {node}")
@@ -1906,12 +1886,10 @@ class NavigatorOptimizer(GlobalOptimizer):

    @staticmethod
    def warn_inplace(exc, nav, repl_pairs, local_opt, node):
-        """
-        Failure_callback for NavigatorOptimizer.
+        r"""A failure callback that ignores ``InconsistencyError``\s and prints a traceback.

-        Ignore InconsistencyErrors, print traceback.
-
-        If error during replacement repl_pairs is set. Otherwise None.
+        If the error occurred during replacement, ``repl_pairs`` is set;
+        otherwise, its value is ``None``.

        """
        if isinstance(exc, InconsistencyError):
@@ -1920,10 +1898,7 @@ class NavigatorOptimizer(GlobalOptimizer):

    @staticmethod
    def warn_ignore(exc, nav, repl_pairs, local_opt, node):
-        """
-        Failure_callback for NavigatorOptimizer: ignore all errors.
-
-        """
+        """A failure callback that ignores all errors."""

    def __init__(self, local_opt, ignore_newtrees="auto", failure_callback=None):
        self.local_opt = local_opt
@@ -1934,28 +1909,25 @@ class NavigatorOptimizer(GlobalOptimizer):
        self.failure_callback = failure_callback

    def attach_updater(self, fgraph, importer, pruner, chin=None, name=None):
-        """
-        Install some FunctionGraph listeners to help the navigator deal with
-        the ignore_trees-related functionality.
+        r"""Install `FunctionGraph` listeners to help the navigator deal with the ``ignore_newtrees``-related functionality.

        Parameters
        ----------
-        importer
+        importer :
            Function that will be called whenever optimizations add stuff
            to the graph.
-        pruner
+        pruner :
            Function to be called when optimizations remove stuff
            from the graph.
-        chin
+        chin :
            "on change input" called whenever a node's inputs change.
-        name
-            name of the Updater to attach.
+        name :
+            name of the ``Updater`` to attach.

        Returns
        -------
-        object
-            The FunctionGraph plugin that handles the three tasks.
-            Keep this around so that you can detach later!
+        The `FunctionGraph` plugin that handles the three tasks.
+        Keep this around so that `Feature`\s can be detached later.

        """
        if self.ignore_newtrees:
@@ -1969,13 +1941,14 @@ class NavigatorOptimizer(GlobalOptimizer):
        return u

    def detach_updater(self, fgraph, u):
-        """
-        Undo the work of attach_updater.
+        """Undo the work of ``attach_updater``.

        Parameters
        ----------
+        fgraph
+            The `FunctionGraph`.
        u
-            A return-value of attach_updater.
+            A return-value of ``attach_updater``.

        Returns
        -------
@@ -1986,31 +1959,31 @@ class NavigatorOptimizer(GlobalOptimizer):
            fgraph.remove_feature(u)

    def process_node(self, fgraph, node, lopt=None):
-        """
-        This function will use `lopt` to `transform` the `node`. The
-        `transform` method will return either False or a list of Variables
-        that are intended to replace `node.outputs`.
+        r"""Apply `lopt` to `node`.
+
+        The :meth:`lopt.transform` method will return either ``False`` or a
+        list of `Variable`\s that are intended to replace :attr:`node.outputs`.

-        If the fgraph accepts the replacement, then the optimization is
-        successful, and this function returns True.
+        If the `fgraph` accepts the replacement, then the optimization is
+        successful, and this function returns ``True``.

-        If there are no replacement candidates or the fgraph rejects the
-        replacements, this function returns False.
+        If there are no replacement candidates or the `fgraph` rejects the
+        replacements, this function returns ``False``.

        Parameters
        ----------
-        fgraph
-            A FunctionGraph.
-        node
-            An Apply instance in `fgraph`
-        lopt
-            A LocalOptimizer instance that may have a better idea for
+        fgraph :
+            A `FunctionGraph`.
+        node :
+            An `Apply` instance in `fgraph`.
+        lopt :
+            A `LocalOptimizer` instance that may have a better idea for
            how to compute node's outputs.

        Returns
        -------
        bool
-            True iff the `node`'s outputs were replaced in the `fgraph`.
+            ``True`` iff the `node`'s outputs were replaced in the `fgraph`.

        """
        lopt = lopt or self.local_opt
@@ -2085,11 +2058,7 @@ class NavigatorOptimizer(GlobalOptimizer):


 class TopoOptimizer(NavigatorOptimizer):
-    """
-    TopoOptimizer has one local optimizer. It tries to apply to each node, in topological order (or reverse).
-    Each time the local optimizer applies, the node gets replaced, and the topooptimizer moves on to the next one.
-
-    """
+    """An optimizer that applies a single `LocalOptimizer` to each node in topological order (or reverse)."""

    def __init__(
        self, local_opt, order="in_to_out", ignore_newtrees=False, failure_callback=None
@@ -2243,16 +2212,19 @@ def in2out(*local_opts, **kwargs):


 class OpKeyOptimizer(NavigatorOptimizer):
-    """
-    WRITEME
+    r"""An optimizer that applies a `LocalOptimizer` to specific `Op`\s.
+
+    The `Op`\s are provided by a :meth:`LocalOptimizer.op_key` method (either
+    as a list of `Op`\s or a single `Op`), and discovered within a
+    `FunctionGraph` using the `NodeFinder` `Feature`.
+
+    This is similar to the ``tracks`` feature used by other optimizers.

    """

    def __init__(self, local_opt, ignore_newtrees=False, failure_callback=None):
        if not hasattr(local_opt, "op_key"):
-            raise TypeError(
-                "LocalOptimizer for OpKeyOptimizer must have " "an 'op_key' method."
-            )
+            raise TypeError(f"{local_opt} must have an `op_key` method.")
        super().__init__(local_opt, ignore_newtrees, failure_callback)

    def apply(self, fgraph):
@@ -2281,12 +2253,6 @@ class OpKeyOptimizer(NavigatorOptimizer):
            self.detach_updater(fgraph, u)

    def add_requirements(self, fgraph):
-        """
-        Requires the following features:
-          - NodeFinder
-          - ReplaceValidate(Added by default)
-
-        """
        super().add_requirements(fgraph)
        fgraph.attach_feature(NodeFinder())

@@ -2314,9 +2280,7 @@ class ChangeTracker(Feature):


 def merge_dict(d1, d2):
-    """
-    merge 2 dicts by adding the values.
-    """
+    r"""Merge two ``dict``\s by adding their values."""
    d = d1.copy()
    for k, v in d2.items():
        if k in d:
@@ -2327,8 +2291,7 @@ def merge_dict(d1, d2):


 class EquilibriumOptimizer(NavigatorOptimizer):
-    """
-    Apply optimizations until equilibrium point.
+    """An optimizer that applies an optimization until a fixed-point/equilibrium is reached.

    Parameters
    ----------
@@ -2337,13 +2300,13 @@ class EquilibriumOptimizer(NavigatorOptimizer):
        The global optimizer will be run at the start of each iteration before
        the local optimizer.
    max_use_ratio : int or float
-        Each optimizer can be applied at most (size of graph * this number)
+        Each optimizer can be applied at most ``(size of graph * this number)``
        times.
-    ignore_newtrees
-        See EquilibriumDB ignore_newtrees parameter definition.
-    final_optimizers
+    ignore_newtrees :
+        See :attr:`EquilibriumDB.ignore_newtrees`.
+    final_optimizers :
        Global optimizers that will be run after each iteration.
-    cleanup_optimizers
+    cleanup_optimizers :
        Global optimizers that apply a list of pre determined optimization.
        They must not traverse the graph as they are called very frequently.
        The MergeOptimizer is one example of optimization that respect this.
@@ -2931,9 +2894,11 @@ def pre_greedy_local_optimizer(fgraph, optimizations, out):

    This function traverses the computation graph in the graph before the
    variable `out` but that are not in the `fgraph`. It applies
-    `local_optimizations` to each variable on the traversed graph.
+    `optimizations` to each variable on the traversed graph.
+
+    .. warning::

-    XXX: This changes the nodes in a graph in-place!
+        This changes the nodes in a graph in-place.

    Its main use is to apply locally constant folding when generating
    the graph of the indices of a subtensor.
@@ -2943,10 +2908,10 @@ def pre_greedy_local_optimizer(fgraph, optimizations, out):

    Notes
    -----
-    This doesn't do an equilibrium optimization, so, if there is optimization
-    like `local_upcast_elemwise_constant_inputs` in the list that adds
-    additional nodes to the inputs of the node, it might be necessary to call
-    this function multiple times.
+    This doesn't do an equilibrium optimization, so, if there is an
+    optimization--like `local_upcast_elemwise_constant_inputs`--in the list
+    that adds additional nodes to the inputs of the node, it might be necessary
+    to call this function multiple times.

    Parameters
    ----------
@@ -3013,23 +2978,21 @@ def pre_greedy_local_optimizer(fgraph, optimizations, out):


 def copy_stack_trace(from_var, to_var):
-    """
-    Copies the stack trace from one or more tensor variables to
-    one or more tensor variables and returns the destination variables.
+    r"""Copy the stack traces from `from_var` to `to_var`.

    Parameters
    ----------
-    from_var
-        Tensor variable or list of tensor variables to copy stack traces from.
-    to_var
-        Tensor variable or list of tensor variables to copy stack traces to.
+    from_var :
+        `Variable` or list of `Variable`\s to copy stack traces from.
+    to_var :
+        `Variable` or list of `Variable`\s to copy stack traces to.

    Notes
    -----
    The stacktrace is assumed to be of the form of a list of lists
    of tuples. Each tuple contains the filename, line number, function name
    and so on. Each list of tuples contains the truples belonging to a
-    particular variable.
+    particular `Variable`.

    """

@@ -3065,14 +3028,14 @@ def copy_stack_trace(from_var, to_var):
 @contextlib.contextmanager
 def inherit_stack_trace(from_var):
    """
-    Contextmanager that copies the stack trace from one or more variable nodes to all
-    variable nodes constructed in the body. new_nodes is the list of all the newly created
-    variable nodes inside an optimization that is managed by graph.nodes_constructed().
+    A context manager that copies the stack trace from one or more variable nodes to all
+    variable nodes constructed in the body. ``new_nodes`` is the list of all the newly created
+    variable nodes inside an optimization that is managed by ``graph.nodes_constructed``.

    Parameters
    ----------
-    from_var
-        Variable node or a list of variable nodes to copy stack traces from.
+    from_var :
+        `Variable` node or a list of `Variable` nodes to copy stack traces from.

    """
    with nodes_constructed() as new_nodes:
@@ -3081,9 +3044,7 @@ def inherit_stack_trace(from_var):


 def check_stack_trace(f_or_fgraph, ops_to_check="last", bug_print="raise"):
-    r"""
-    This function checks if the outputs of specific ops of a compiled graph
-    have a stack.
+    r"""Checks if the outputs of specific `Op`\s have a stack trace.

    Parameters
    ----------
@@ -3115,7 +3076,8 @@ def check_stack_trace(f_or_fgraph, ops_to_check="last", bug_print="raise"):
    Returns
    -------
    boolean
-        True if the outputs of the specified ops have a stack, False otherwise.
+        ``True`` if the outputs of the specified ops have a stack, ``False``
+        otherwise.

    """
    if isinstance(f_or_fgraph, aesara.compile.function.types.Function):
@@ -3231,7 +3193,7 @@ class CheckStackTraceFeature(Feature):


 class CheckStackTraceOptimization(GlobalOptimizer):
-    """Optimizer that serves to add CheckStackTraceOptimization as an fgraph feature."""
+    """Optimizer that serves to add `CheckStackTraceFeature` as a feature."""

    def add_requirements(self, fgraph):
        if not hasattr(fgraph, "CheckStackTraceFeature"):

--- a/aesara/graph/type.py
+++ b/aesara/graph/type.py
@@ -33,11 +33,15 @@ class Type(MetaObject):

    """

-    # the type that will be created by a call to make_variable.
    Variable = Variable
+    """
+    The `Variable` class that will be instantiated by a call to `Type.make_variable`.
+    """

-    # the type that will be created by a call to make_constant
    Constant = Constant
+    """
+    The `Constant` class that will be instantiated by a call to `Type.make_constant`.
+    """

    @abstractmethod
    def filter(

--- a/aesara/link/c/interface.py
+++ b/aesara/link/c/interface.py
@@ -11,7 +11,7 @@ class CLinkerObject:
    def c_headers(self, **kwargs) -> List[Text]:
        """Return a list of header files required by code returned by this class.

-        These strings will be prefixed with ``#include``  and inserted at the
+        These strings will be prefixed with ``#include`` and inserted at the
        beginning of the C source code.

        Strings in this list that start neither with ``<`` nor ``"`` will be
@@ -35,8 +35,11 @@ class CLinkerObject:
        Provides search paths for headers, in addition to those in any relevant
        environment variables.

-        Note: for Unix compilers, these are the things that get `-I` prefixed
-        in the compiler command line arguments.
+        .. note::
+
+            For Unix compilers, these are the things that get ``-I`` prefixed
+            in the compiler command line arguments.
+

        Examples
        --------
@@ -53,10 +56,13 @@ class CLinkerObject:
        """Return a list of libraries required by code returned by this class.

        The compiler will search the directories specified by the environment
-        variable LD_LIBRARY_PATH in addition to any returned by `c_lib_dirs`.
+        variable ``LD_LIBRARY_PATH`` in addition to any returned by
+        :meth:`CLinkerObject.c_lib_dirs`.
+
+        .. note::

-        Note: for Unix compilers, these are the things that get ``-l`` prefixed
-        in the compiler command line arguments.
+            For Unix compilers, these are the things that get ``-l`` prefixed
+            in the compiler command line arguments.


        Examples
@@ -76,8 +82,11 @@ class CLinkerObject:
        Provides search paths for libraries, in addition to those in any
        relevant environment variables (e.g. ``LD_LIBRARY_PATH``).

-        Note: for Unix compilers, these are the things that get ``-L`` prefixed
-        in the compiler command line arguments.
+        .. note::
+
+            For Unix compilers, these are the things that get ``-L`` prefixed
+            in the compiler command line arguments.
+

        Examples
        --------
@@ -127,8 +136,8 @@ class CLinkerObject:
        """Return a list of incompatible ``gcc`` compiler arguments.

        We will remove those arguments from the command line of ``gcc``. So if
-        another Op adds a compile arg in the graph that is incompatible
-        with this Op, the incompatible arg will not be used.
+        another `Op` adds a compile arg in the graph that is incompatible
+        with this `Op`, the incompatible arg will not be used.

        This is used, for instance, to remove ``-ffast-math``.

@@ -142,7 +151,7 @@ class CLinkerObject:
    def c_code_cache_version(self) -> Union[Tuple[int], Tuple]:
        """Return a tuple of integers indicating the version of this `Op`.

-        An empty tuple indicates an 'unversioned' `Op` that will not be cached
+        An empty tuple indicates an "unversioned" `Op` that will not be cached
        between processes.

        The cache mechanism may erase cached modules that have been superseded
@@ -157,14 +166,7 @@ class CLinkerObject:


 class CLinkerOp(CLinkerObject):
-    """Interface definition for `Op` subclasses compiled by `CLinker`.
-
-    A subclass should implement WRITEME.
-
-    WRITEME: structure of automatically generated C code.
-    Put this in doc/code_structure.txt
-
-    """
+    """Interface definition for `Op` subclasses compiled by `CLinker`."""

    @abstractmethod
    def c_code(
@@ -175,9 +177,9 @@ class CLinkerOp(CLinkerObject):
        outputs: List[Text],
        sub: Dict[Text, Text],
    ) -> Text:
-        """Return the C implementation of an `Op`.
+        """Return the C implementation of an ``Op``.

-        Returns C code that does the computation associated to this `Op`,
+        Returns C code that does the computation associated to this ``Op``,
        given names for the inputs and outputs.

        Parameters
@@ -196,7 +198,7 @@ class CLinkerOp(CLinkerObject):
            can be accessed by prepending ``"py_"`` to the name in the
            list.
        outputs : list of strings
-            Each string is the name of a C variable where the Op should
+            Each string is the name of a C variable where the `Op` should
            store its output.  The type depends on the declared type of
            the output.  There is a corresponding Python variable that
            can be accessed by prepending ``"py_"`` to the name in the
@@ -204,8 +206,7 @@ class CLinkerOp(CLinkerObject):
            the value of the variable may be pre-filled.  The value for
            an unallocated output is type-dependent.
        sub : dict of strings
-            Extra symbols defined in `CLinker` sub symbols (such as 'fail').
-            WRITEME
+            Extra symbols defined in `CLinker` sub symbols (such as ``'fail'``).

        """
        raise NotImplementedError()
@@ -213,7 +214,7 @@ class CLinkerOp(CLinkerObject):
    def c_code_cache_version_apply(self, node: Apply) -> Tuple[int]:
        """Return a tuple of integers indicating the version of this `Op`.

-        An empty tuple indicates an 'unversioned' `Op` that will not be
+        An empty tuple indicates an "unversioned" `Op` that will not be
        cached between processes.

        The cache mechanism may erase cached modules that have been
@@ -221,7 +222,7 @@ class CLinkerOp(CLinkerObject):

        See Also
        --------
-        c_code_cache_version()
+        c_code_cache_version

        Notes
        -----
@@ -240,9 +241,9 @@ class CLinkerOp(CLinkerObject):
        outputs: List[Text],
        sub: Dict[Text, Text],
    ) -> Text:
-        """Return C code to run after `CLinkerOp.c_code`, whether it failed or not.
+        """Return C code to run after :meth:`CLinkerOp.c_code`, whether it failed or not.

-        This is a convenient place to clean up things allocated by `CLinkerOp.c_code`.
+        This is a convenient place to clean up things allocated by :meth:`CLinkerOp.c_code`.

        Parameters
        ----------
@@ -255,18 +256,17 @@ class CLinkerOp(CLinkerObject):
            There is a string for each input of the function, and the
            string is the name of a C variable pointing to that input.
            The type of the variable depends on the declared type of
-            the input. There is a corresponding python variable that
+            the input. There is a corresponding Python variable that
            can be accessed by prepending ``"py_"`` to the name in the
            list.
        outputs : list of str
            Each string is the name of a C variable corresponding to
-            one of the outputs of the Op. The type depends on the
+            one of the outputs of the `Op`. The type depends on the
            declared type of the output. There is a corresponding
-            python variable that can be accessed by prepending ``"py_"`` to
+            Python variable that can be accessed by prepending ``"py_"`` to
            the name in the list.
        sub : dict of str
-            extra symbols defined in `CLinker` sub symbols (such as 'fail').
-            WRITEME
+            Extra symbols defined in `CLinker` sub symbols (such as ``'fail'``).

        """
        return ""
@@ -276,24 +276,24 @@ class CLinkerOp(CLinkerObject):

        Parameters
        ----------
-        node: Apply
+        node : Apply
            The node in the graph being compiled.
-        name: str
+        name : str
            A string or number that serves to uniquely identify this node.
            Symbol names defined by this support code should include the name,
-            so that they can be called from the `CLinkerOp.c_code`, and so that
+            so that they can be called from the :meth:`CLinkerOp.c_code`, and so that
            they do not cause name collisions.

        Notes
        -----
-        This function is called in addition to `CLinkerObject.c_support_code`
+        This function is called in addition to :meth:`CLinkerObject.c_support_code`
        and will supplement whatever is returned from there.

        """
        return ""

    def c_init_code_apply(self, node: Apply, name: Text) -> Text:
-        """Return a code string specific to the apply to be inserted in the module initialization code.
+        """Return a code string specific to the `Apply` to be inserted in the module initialization code.

        Parameters
        ----------
@@ -302,13 +302,14 @@ class CLinkerOp(CLinkerObject):
        name : str
            A string or number that serves to uniquely identify this node.
            Symbol names defined by this support code should include the name,
-            so that they can be called from the c_code, and so that they do not
-            cause name collisions.
+            so that they can be called from :meth:`CLinkerOp.c_code`, and so
+            that they do not cause name collisions.

        Notes
        -----
-        This function is called in addition to c_init_code and will supplement
-        whatever is returned from there.
+        This function is called in addition to
+        :meth:`CLinkerObject.c_init_code` and will supplement whatever is
+        returned from there.

        """
        return ""
@@ -318,11 +319,11 @@ class CLinkerOp(CLinkerObject):

        Parameters
        ----------
-        node: Apply
+        node : Apply
            The node in the graph being compiled.
-        name: str
+        name : str
            A unique name to distinguish variables from those of other nodes.
-        sub: dict of str
+        sub : dict of str
            A dictionary of values to substitute in the code.
            Most notably it contains a ``'fail'`` entry that you should place
            in your code after setting a Python exception to indicate an error.
@@ -359,27 +360,24 @@ class CLinkerOp(CLinkerObject):


 class CLinkerType(CLinkerObject):
-    """
-    Interface specification for Types that can be arguments to a `CLinkerOp`.
+    r"""Interface specification for `Type`\s that can be arguments to a `CLinkerOp`.

-    A CLinkerType instance is mainly responsible  for providing the C code that
+    A `CLinkerType` instance is mainly responsible for providing the C code that
    interfaces python objects with a C `CLinkerOp` implementation.

-    See WRITEME for a general overview of code generation by `CLinker`.
-
    """

    @abstractmethod
    def c_declare(
        self, name: Text, sub: Dict[Text, Text], check_input: bool = True
    ) -> Text:
-        """Return C code to declare variables that will be instantiated by `CLinkerType.c_extract`.
+        """Return C code to declare variables that will be instantiated by :meth:`CLinkerType.c_extract`.

        Parameters
        ----------
-        name : str
-            The name of the ``PyObject *`` pointer that will
-            the value for this Type
+        name
+            The name of the ``PyObject *`` pointer that will store the value for
+            this `Type`.
        sub
            A dictionary of special codes.  Most importantly
            ``sub['fail']``. See `CLinker` for more info on ``sub`` and
@@ -391,9 +389,9 @@ class CLinkerType(CLinkerObject):
        are declared here, so that name collisions do not occur in the
        source file that is generated.

-        The variable called ``name`` is not necessarily defined yet
+        The variable called `name` is not necessarily defined yet
        where this code is inserted. This code might be inserted to
-        create class variables for example, whereas the variable ``name``
+        create class variables for example, whereas the variable `name`
        might only exist inside certain functions in that class.

        TODO: Why should variable declaration fail?  Is it even allowed to?
@@ -410,13 +408,13 @@ class CLinkerType(CLinkerObject):

    @abstractmethod
    def c_init(self, name: Text, sub: Dict[Text, Text]) -> Text:
-        """Return C code to initialize the variables that were declared by `CLinkerType.c_declare`.
+        """Return C code to initialize the variables that were declared by :meth:`CLinkerType.c_declare`.

        Notes
        -----
-        The variable called ``name`` is not necessarily defined yet
+        The variable called `name` is not necessarily defined yet
        where this code is inserted. This code might be inserted in a
-        class constructor for example, whereas the variable ``name``
+        class constructor for example, whereas the variable `name`
        might only exist inside certain functions in that class.

        TODO: Why should variable initialization fail?  Is it even allowed to?
@@ -450,10 +448,10 @@ class CLinkerType(CLinkerObject):

        Parameters
        ----------
-        name: str
+        name
            The name of the ``PyObject *`` pointer that will store the value
            for this type.
-        sub: dict string -> string
+        sub
            A dictionary of special codes. Most importantly
            ``sub['fail']``. See `CLinker` for more info on ``sub`` and
            ``fail``.
@@ -485,9 +483,9 @@ class CLinkerType(CLinkerObject):

        Parameters
        ----------
-        name : str
+        name
            WRITEME
-        sub : dict of str
+        sub
            WRITEME

        """
@@ -518,7 +516,7 @@ class CLinkerType(CLinkerObject):

        Parameters
        ----------
-        data : Constant
+        data
            The data to be converted into a C literal string.

        """
@@ -529,7 +527,7 @@ class CLinkerType(CLinkerObject):
    ) -> Text:
        """Return C code to extract a ``PyObject *`` instance.

-        Unlike `CLinkerType.c_extract`, `CLinkerType.c_extract_out` has to
+        Unlike :meth:`CLinkerType.c_extract`, :meth:`CLinkerType.c_extract_out` has to
        accept ``Py_None``, meaning that the variable should be left
        uninitialized.

@@ -550,10 +548,10 @@ class CLinkerType(CLinkerObject):
        )

    def c_cleanup(self, name: Text, sub: Dict[Text, Text]) -> Text:
-        """Return C code to clean up after `CLinkerType.c_extract`.
+        """Return C code to clean up after :meth:`CLinkerType.c_extract`.

        This returns C code that should deallocate whatever
-        `CLinkerType.c_extract` allocated or decrease the reference counts. Do
+        :meth:`CLinkerType.c_extract` allocated or decrease the reference counts. Do
        not decrease ``py_%(name)s``'s reference count.

        Parameters
@@ -569,7 +567,7 @@ class CLinkerType(CLinkerObject):
    def c_code_cache_version(self) -> Union[Tuple, Tuple[int]]:
        """Return a tuple of integers indicating the version of this type.

-        An empty tuple indicates an 'unversioned' type that will not
+        An empty tuple indicates an "unversioned" type that will not
        be cached between processes.

        The cache mechanism may erase cached modules that have been

--- a/aesara/sparse/opt.py
+++ b/aesara/sparse/opt.py
@@ -1845,10 +1845,10 @@ class SamplingDotCSR(_NoPythonCOp):
    multiplication.

    If we have the input of mixed dtype, we insert cast elemwise
-    in the graph to be able to call blas function as they don't
+    in the graph to be able to call BLAS function as they don't
    allow mixed dtype.

-    This op is used as an optimization for SamplingDot.
+    This `Op` is used as an optimization for `SamplingDot`.

    """


--- a/aesara/tensor/basic_opt.py
+++ b/aesara/tensor/basic_opt.py
@@ -216,8 +216,8 @@ def broadcast_like(value, template, fgraph, dtype=None):


 class InplaceElemwiseOptimizer(GlobalOptimizer):
-    """
-    We parametrise it to make it work for Elemwise and GpuElemwise op.
+    r"""
+    This is parameterized so that it works for `Elemwise` and `GpuElemwise` `Op`\s.
    """

    def __init__(self, OP):
@@ -1469,7 +1469,7 @@ class ShapeFeature(features.Feature):


 class ShapeOptimizer(GlobalOptimizer):
-    """Optimizer that serves to add ShapeFeature as an fgraph feature."""
+    """Optimizer that adds `ShapeFeature` as a feature."""

    def add_requirements(self, fgraph):
        fgraph.attach_feature(ShapeFeature())
@@ -1479,7 +1479,7 @@ class ShapeOptimizer(GlobalOptimizer):


 class UnShapeOptimizer(GlobalOptimizer):
-    """Optimizer remove ShapeFeature as an fgraph feature."""
+    """Optimizer that removes `ShapeFeature` as a feature."""

    def apply(self, fgraph):
        for feature in fgraph._features:

--- a/aesara/tensor/extra_ops.py
+++ b/aesara/tensor/extra_ops.py
@@ -39,8 +39,9 @@ from aesara.utils import LOCAL_BITWIDTH, PYTHON_INT_BITWIDTH

 class CpuContiguous(COp):
    """
-    Check to see if the input is c-contiguous,
-    if it is, do nothing, else return a contiguous array.
+    Check to see if the input is c-contiguous.
+
+    If it is, do nothing, else return a contiguous array.
    """

    __props__ = ()
@@ -99,13 +100,13 @@ cpu_contiguous = CpuContiguous()


 class SearchsortedOp(COp):
-    """Wrapper of numpy.searchsorted.
+    """Wrapper for ``numpy.searchsorted``.

    For full documentation, see :func:`searchsorted`.

    See Also
    --------
-    searchsorted : numpy-like function to use the SearchsortedOp
+    searchsorted : numpy-like function that uses `SearchsortedOp`

    """

@@ -222,24 +223,24 @@ class SearchsortedOp(COp):
 def searchsorted(x, v, side="left", sorter=None):
    """Find indices where elements should be inserted to maintain order.

-    Wrapping of numpy.searchsorted. Find the indices into a sorted array
+    This wraps ``numpy.searchsorted``. Find the indices into a sorted array
    `x` such that, if the corresponding elements in `v` were inserted
    before the indices, the order of `x` would be preserved.

    Parameters
    ----------
-    x: 1-D tensor (array-like)
-        Input array. If `sorter` is None, then it must be sorted in
+    x : 1-D tensor (array-like)
+        Input array. If `sorter` is ``None``, then it must be sorted in
        ascending order, otherwise `sorter` must be an array of indices
        which sorts it.
-    v: tensor (array-like)
+    v : tensor (array-like)
        Contains the values to be inserted into `x`.
-    side: {'left', 'right'}, optional.
-        If 'left' (default), the index of the first suitable
-        location found is given. If 'right', return the last such index. If
+    side : {'left', 'right'}, optional
+        If ``'left'`` (default), the index of the first suitable
+        location found is given. If ``'right'``, return the last such index. If
        there is no suitable index, return either 0 or N (where N is the length
        of `x`).
-    sorter: 1-D tensor of integers (array-like), optional
+    sorter : 1-D tensor of integers (array-like), optional
        Contains indices that sort array `x` into ascending order.
        They are typically the result of argsort.

@@ -410,9 +411,9 @@ class CumOp(COp):


 def cumsum(x, axis=None):
-    """Return the cumulative sum of the elements along a given axis.
+    """Return the cumulative sum of the elements along a given `axis`.

-    Wrapping of numpy.cumsum.
+    This wraps ``numpy.cumsum``.

    Parameters
    ----------
@@ -430,18 +431,17 @@ def cumsum(x, axis=None):


 def cumprod(x, axis=None):
-    """Return the cumulative product of the elements along a given axis.
+    """Return the cumulative product of the elements along a given `axis`.

-    Wrapping of numpy.cumprod.
+    This wraps ``numpy.cumprod``.

    Parameters
    ----------
    x
        Input tensor variable.
-
    axis
        The axis along which the cumulative product is computed.
-        The default (None) is to compute the cumprod over the flattened array.
+        The default (None) is to compute the `cumprod` over the flattened array.


    .. versionadded:: 0.7
@@ -520,20 +520,18 @@ class DiffOp(Op):


 def diff(x, n=1, axis=-1):
-    """Calculate the n-th order discrete difference along given axis.
+    """Calculate the `n`-th order discrete difference along the given `axis`.

-    The first order difference is given by out[i] = a[i + 1] - a[i]
-    along the given axis, higher order differences are calculated by
-    using diff recursively. Wrapping of numpy.diff.
+    The first order difference is given by ``out[i] = a[i + 1] - a[i]``
+    along the given `axis`, higher order differences are calculated by
+    using `diff` recursively. This wraps ``numpy.diff``.

    Parameters
    ----------
    x
        Input tensor variable.
-
    n
        The number of times values are differenced, default is 1.
-
    axis
        The axis along which the difference is taken, default is the last axis.

@@ -545,27 +543,28 @@ def diff(x, n=1, axis=-1):


 def bincount(x, weights=None, minlength=None, assert_nonneg=False):
-    """Count number of occurrences of each value in array of ints.
+    """Count number of occurrences of each value in an array of integers.

    The number of bins (of size 1) is one larger than the largest
-    value in x. If minlength is specified, there will be at least
+    value in `x`. If `minlength` is specified, there will be at least
    this number of bins in the output array (though it will be longer
-    if necessary, depending on the contents of x). Each bin gives the
-    number of occurrences of its index value in x. If weights is
-    specified the input array is weighted by it, i.e. if a value n
-    is found at position i, out[n] += weight[i] instead of out[n] += 1.
+    if necessary, depending on the contents of `x`). Each bin gives the
+    number of occurrences of its index value in `x`. If `weights` is
+    specified the input array is weighted by it, i.e. if a value ``n`` is found
+    at position ``i``, ``out[n] += weights[i]`` instead of ``out[n] += 1``.

    Parameters
    ----------
-    x : 1 dimension, nonnegative ints
-    weights : array of the same shape as x with corresponding weights.
-        Optional.
-    minlength : A minimum number of bins for the output array.
-        Optional.
-    assert_nonneg : A flag that inserts an assert_op to check if
-        every input x is nonnegative.
+    x
+        A one-dimensional array of non-negative integers.
+    weights
+        An array of the same shape as `x` with corresponding weights.
        Optional.
-
+    minlength
+        A minimum number of bins for the output array.  Optional.
+    assert_nonneg
+        A flag that inserts an ``assert_op`` to check if
+        every input `x` is non-negative.  Optional.

    .. versionadded:: 0.6

@@ -597,27 +596,23 @@ def squeeze(x, axis=None):
    """
    Remove broadcastable dimensions from the shape of an array.

-    It returns the input array, but with the
-    broadcastable dimensions removed. This is
-    always `x` itself or a view into `x`.
+    It returns the input array, but with the broadcastable dimensions
+    removed. This is always `x` itself or a view into `x`.

    .. versionadded:: 0.6

    Parameters
    ----------
-    x
+    x :
        Input data, tensor variable.
-
    axis : None or int or tuple of ints, optional
-
        Selects a subset of the single-dimensional entries in the
        shape. If an axis is selected with shape entry greater than
        one, an error is raised.

    Returns
    -------
-    object
-        `x` without its broadcastable dimensions.
+    `x` without its broadcastable dimensions.

    """
    if axis is None:
@@ -635,24 +630,25 @@ def compress(condition, x, axis=None):
    """
    Return selected slices of an array along given axis.

-    It returns the input tensor, but with selected slices along a given axis
-    retained. If no axis is provided, the tensor is flattened.
-    Corresponds to numpy.compress
+    It returns the input tensor, but with selected slices along a given `axis`
+    retained. If no `axis` is provided, the tensor is flattened.
+    Corresponds to ``numpy.compress``

    .. versionadded:: 0.7

    Parameters
    ----------
+    condition
+        One dimensional array of non-zero and zero values
+        corresponding to indices of slices along a selected axis.
    x
        Input data, tensor variable.
-    condition
-         1 dimensional array of non-zero and zero values
-         corresponding to indices of slices along a selected axis.
+    axis
+        The axis along which to slice.

    Returns
    -------
-    object
-        `x` with selected slices.
+    `x` with selected slices.

    """
    indices = aet.flatnonzero(condition)
@@ -774,13 +770,12 @@ class Repeat(Op):
 def repeat(x, repeats, axis=None):
    """Repeat elements of an array.

-    It returns an array which has the same shape as `x`, except
-    along the given axis. The axis is used to specify along which
-    axis to repeat values. By default, use the flattened input
-    array, and return a flat output array.
+    It returns an array which has the same shape as `x`, except along the given
+    `axis`. The `axis` parameter is used to specify the axis along which values
+    are repeated. By default, a flattened version of `x` is used.

-    The number of repetitions for each element is `repeats`.
-    `repeats` is broadcasted to fit the length of the given `axis`.
+    The number of repetitions for each element is `repeats`.  `repeats` is
+    broadcasted to fit the length of the given `axis`.

    Parameters
    ----------
@@ -973,8 +968,8 @@ fill_diagonal_ = FillDiagonal()
 # I create a function only to have the doc show well.
 def fill_diagonal(a, val):
    """
-    Returns a copy of an array with all
-    elements of the main diagonal set to a specified scalar value.
+    Returns a copy of an array with all elements of the main diagonal set to a
+    specified scalar value.

    .. versionadded:: 0.6

@@ -984,18 +979,18 @@ def fill_diagonal(a, val):
        Rectangular array of at least two dimensions.
    val
        Scalar value to fill the diagonal whose type must be
-        compatible with that of array 'a' (i.e. 'val' cannot be viewed
-        as an upcast of 'a').
+        compatible with that of array `a` (i.e. `val` cannot be viewed
+        as an upcast of `a`).

    Returns
    -------
    array
-        An array identical to 'a' except that its main diagonal
-        is filled with scalar 'val'. (For an array 'a' with a.ndim >=
-        2, the main diagonal is the list of locations a[i, i, ..., i]
+        An array identical to `a` except that its main diagonal
+        is filled with scalar `val`. (For an array `a` with ``a.ndim >=
+        2``, the main diagonal is the list of locations ``a[i, i, ..., i]``
        (i.e. with indices all identical).)

-    Support rectangular matrix and tensor with more than 2 dimensions
+    Support rectangular matrix and tensor with more than two dimensions
    if the later have all dimensions are equals.


@@ -1134,8 +1129,8 @@ def fill_diagonal_offset(a, val, offset):
        Rectangular array of two dimensions.
    val
        Scalar value to fill the diagonal whose type must be
-        compatible with that of array 'a' (i.e. 'val' cannot be viewed
-        as an upcast of 'a').
+        compatible with that of array `a` (i.e. `val` cannot be viewed
+        as an upcast of `a`).
    offset
        Scalar value Offset of the diagonal from the main
        diagonal. Can be positive or negative integer.
@@ -1143,8 +1138,8 @@ def fill_diagonal_offset(a, val, offset):
    Returns
    -------
    array
-        An array identical to 'a' except that its offset diagonal
-        is filled with scalar 'val'. The output is unwrapped.
+        An array identical to `a` except that its offset diagonal
+        is filled with scalar `val`. The output is unwrapped.

    """
    return fill_diagonal_offset_(a, val, offset)
@@ -1153,21 +1148,21 @@ def fill_diagonal_offset(a, val, offset):
 def to_one_hot(y, nb_class, dtype=None):
    """
    Return a matrix where each row correspond to the one hot
-    encoding of each element in y.
+    encoding of each element in `y`.

    Parameters
    ----------
    y
-        A vector of integer value between 0 and nb_class - 1.
+        A vector of integer values between ``0`` and ``nb_class - 1``.
    nb_class : int
-        The number of class in y.
+        The number of classes in `y`.
    dtype : data-type
-        The dtype of the returned matrix. Default floatX.
+        The dtype of the returned matrix. Default ``aesara.config.floatX``.

    Returns
    -------
    object
-        A matrix of shape (y.shape[0], nb_class), where each row ``i`` is
+        A matrix of shape ``(y.shape[0], nb_class)``, where each row ``i`` is
        the one hot encoding of the corresponding ``y[i]`` value.

    """
@@ -1178,7 +1173,7 @@ def to_one_hot(y, nb_class, dtype=None):

 class Unique(Op):
    """
-    Wraps numpy.unique. This op is not implemented on the GPU.
+    Wraps `numpy.unique`. This `Op` is not implemented on the GPU.

    Examples
    --------
@@ -1368,9 +1363,9 @@ def unravel_index(indices, dims, order="C"):
    ----------
    indices : Aesara or NumPy array
        An integer array whose elements are indices into the flattened
-        version of an array of dimensions ``dims``.
+        version of an array of dimensions `dims`.
    dims : tuple of ints
-        The shape of the array to use for unraveling ``indices``.
+        The shape of the array to use for unraveling `indices`.
    order : {'C', 'F'}, optional
        Determines whether the indices should be viewed as indexing in
        row-major (C-style) or column-major (Fortran-style) order.
@@ -1378,7 +1373,7 @@ def unravel_index(indices, dims, order="C"):
    Returns
    -------
    unraveled_coords : tuple of ndarray
-        Each array in the tuple has the same shape as the ``indices``
+        Each array in the tuple has the same shape as the `indices`
        array.

    See Also
@@ -1455,7 +1450,7 @@ def ravel_multi_index(multi_index, dims, mode="raise", order="C"):

    Returns
    -------
-    raveled_indices : Aesara array
+    raveled_indices : TensorVariable
        An array of indices into the flattened version of an array
        of dimensions ``dims``.

@@ -1481,7 +1476,7 @@ def broadcast_shape(*arrays, **kwargs):
    arrays_are_shapes: bool (Optional)
        Indicates whether or not the `arrays` contains shape tuples.
        If you use this approach, make sure that the broadcastable dimensions
-        are (scalar) constants with the value `1` or `1` exactly.
+        are (scalar) constants with the value ``1`` exactly.

    """
    return broadcast_shape_iter(arrays, **kwargs)
@@ -1500,7 +1495,7 @@ def broadcast_shape_iter(arrays, **kwargs):
    arrays_are_shapes: bool (Optional)
        Indicates whether or not the `arrays` contains shape tuples.
        If you use this approach, make sure that the broadcastable dimensions
-        are (scalar) constants with the value `1` or `1` exactly.
+        are (scalar) constants with the value ``1`` exactly.

    """
    one = aesara.scalar.ScalarConstant(aesara.scalar.int64, 1)
@@ -1625,7 +1620,7 @@ def broadcast_arrays(*args: TensorVariable) -> Tuple[TensorVariable, ...]:

    Parameters
    ----------
-    `*args` : array_likes
+    *args
        The arrays to broadcast.

    """

--- a/aesara/tensor/subtensor.py
+++ b/aesara/tensor/subtensor.py
@@ -112,7 +112,7 @@ def indices_from_subtensor(


 def as_index_constant(a):
-    r"""Convert Python literals to Aesara constants--when possible--in Subtensor arguments.
+    r"""Convert Python literals to Aesara constants--when possible--in `Subtensor` arguments.

    This will leave `Variable`\s untouched.
    """

--- a/doc/conf.py
+++ b/doc/conf.py
@@ -102,7 +102,7 @@ exclude_dirs = ["images", "scripts", "sandbox"]

 # The reST default role (used for this markup: `text`) to use for all
 # documents.
-# default_role = None
+default_role = "py:obj"

 # If true, '()' will be appended to :func: etc. cross-reference text.
 # add_function_parentheses = True

--- a/doc/extending/extending_aesara.rst
+++ b/doc/extending/extending_aesara.rst

 .. _extending_aesara:

-Creating a new Op: Python implementation
-========================================
+Creating a new :class:`Op`: Python implementation
+=================================================

 So suppose you have looked through the library documentation and you don't see
 a function that does what you want.

-If you can implement something in terms of an existing ``Op``, you should do that.
+If you can implement something in terms of an existing :class:`Op`, you should do that.
 Odds are your function that uses existing Aesara expressions is short,
 has no bugs, and potentially profits from optimizations that have already been
 implemented.

-However, if you cannot implement an ``Op`` in terms of an existing ``Op``, you have to
+However, if you cannot implement an :class:`Op` in terms of an existing :class:`Op`, you have to
 write a new one. Don't worry, Aesara was designed to make it easy to add a new
-``Op``, ``Type``, and ``Optimization``.
+:class:`Op`, :class:`Type`, and :class:`Optimization`.

 .. These first few pages will walk you through the definition of a new :ref:`type`,
-.. ``double``, and a basic arithmetic :ref:`operations <op>` on that `Type`.
+.. ``double``, and a basic arithmetic :ref:`operations <op>` on that :class:`Type`.

 As an illustration, this tutorial shows how to write a simple Python-based
 :ref:`operations <op>` which performs operations on
 :ref:`type`, ``double<Double>``.
 .. It also shows how to implement tests that
-.. ensure the proper working of an ``Op``.
+.. ensure the proper working of an :class:`Op`.

 .. note::

@@ -34,12 +34,12 @@ As an illustration, this tutorial shows how to write a simple Python-based
    ``output_storage`` of the :func:`perform` function. See
    :ref:`views_and_inplace` for an explanation on how to do this.

-    If your ``Op`` returns a view or changes the value of its inputs
+    If your :class:`Op` returns a view or changes the value of its inputs
    without doing as prescribed in that page, Aesara will run, but will
    return correct results for some graphs and wrong results for others.

    It is recommended that you run your tests in DebugMode (Aesara *flag*
-    ``mode=DebugMode``) since it verifies if your ``Op`` behaves correctly in this
+    ``mode=DebugMode``) since it verifies if your :class:`Op` behaves correctly in this
    regard.


@@ -52,12 +52,12 @@ Aesara Graphs refresher
 Aesara represents symbolic mathematical computations as graphs. Those graphs
 are bi-partite graphs (graphs with 2 types of nodes), they are composed of
 interconnected :ref:`apply` and :ref:`variable` nodes.
-:ref:`variable` nodes represent data in the graph, either inputs, outputs or
-intermediary values. As such, Inputs and Outputs of a graph are lists of Aesara
-:ref:`variable` nodes. :ref:`apply` nodes perform computation on these
-variables to produce new variables. Each :ref:`apply` node has a link to an
-instance of :ref:`Op` which describes the computation to perform. This tutorial
-details how to write such an ``Op`` instance. Please refers to
+:class:`Variable` nodes represent data in the graph, either inputs, outputs or
+intermediary values. As such, inputs and outputs of a graph are lists of Aesara
+:class:`Variable` nodes. :class:`Apply` nodes perform computation on these
+variables to produce new variables. Each :class:`Apply` node has a link to an
+instance of :class:`Op` which describes the computation to perform. This tutorial
+details how to write such an :class:`Op` instance. Please refer to
 :ref:`graphstructures` for a more detailed explanation about the graph
 structure.

@@ -65,9 +65,9 @@ structure.
 Op's basic methods
 ------------------

-An ``Op`` is any Python object which inherits from :class:`Op`.
+An :class:`Op` is any Python object which inherits from :class:`Op`.
 This section provides an overview of the basic methods you typically have to
-implement to make a new ``Op``.  It does not provide extensive coverage of all the
+implement to make a new :class:`Op`.  It does not provide extensive coverage of all the
 possibilities you may encounter or need.  For that refer to
 :ref:`op_contract`.

@@ -119,14 +119,14 @@ possibilities you may encounter or need.  For that refer to
        def infer_shape(self, fgraph, node, input_shapes):
            pass

-An ``Op`` has to implement some methods defined in the the interface of
-:class:`Op`. More specifically, it is mandatory for an ``Op`` to define either
+An :class:`Op` has to implement some methods defined in the interface of
+:class:`Op`. More specifically, it is mandatory for an :class:`Op` to define either
 the method :func:`make_node` or :attr:`itypes`, :attr:`otypes` and one of the
 implementation methods, either :func:`perform`, :meth:`COp.c_code`
 or :func:`make_thunk`.

  :func:`make_node` method creates an Apply node representing the application
-  of the ``Op`` on the inputs provided. This method is responsible for three things:
+  of the :class:`Op` on the inputs provided. This method is responsible for three things:

    - it first checks that the input :class:`Variable`\s types are compatible
      with the current :class:`Op`. If the :class:`Op` cannot be applied on the provided
@@ -136,29 +136,29 @@ or :func:`make_thunk`.
      the symbolic output :class:`Variable`\s. It creates output :class:`Variable`\s of a suitable
      symbolic :class:`Type` to serve as the outputs of this :class:`Op`'s
      application.
-    - it creates an Apply instance with the input and output ``Variable``, and
-      return the Apply instance.
+    - it creates an :class:`Apply` instance with the input and output :class:`Variable`\s, and
+      returns the :class:`Apply` instance.



-  :func:`perform` method defines the Python implementation of an ``Op``.
+  :func:`perform` method defines the Python implementation of an :class:`Op`.
  It takes several arguments:

    - ``node`` is a reference to an Apply node which was previously
      obtained via the :func:`make_node` method. It is typically not
-      used in a simple ``Op``, but it contains symbolic information that
-      could be required by a complex ``Op``.
+      used in a simple :class:`Op`, but it contains symbolic information that
+      could be required by a complex :class:`Op`.
    - ``inputs`` is a list of references to data which can be operated on using
      non-symbolic statements, (i.e., statements in Python, Numpy).
    - ``output_storage`` is a list of storage cells where the output
-      is to be stored. There is one storage cell for each output of the ``Op``.
+      is to be stored. There is one storage cell for each output of the :class:`Op`.
      The data put in ``output_storage`` must match the type of the
      symbolic output. It is forbidden to change the length of the list(s)
      contained in ``output_storage``.
      A function Mode may allow ``output_storage`` elements to persist
      between evaluations, or it may reset ``output_storage`` cells to
      hold a value of ``None``.  It can also pre-allocate some memory
-      for the ``Op`` to use.  This feature can allow ``perform`` to reuse
+      for the :class:`Op` to use.  This feature can allow ``perform`` to reuse
      memory between calls, for example. If there is something
      preallocated in the ``output_storage``, it will be of the good
      dtype, but can have the wrong shape and have any stride pattern.
@@ -166,20 +166,19 @@ or :func:`make_thunk`.
  :func:`perform` method must be determined by the inputs. That is to say,
  when applied to identical inputs the method must return the same outputs.

-  :class:`Op` allows some other way to define the ``Op`` implementation.
-  For instance, it is possible to define :meth:`COp.c_code` to provide a
-  C-implementation to the ``Op``. Please refers to tutorial
-  :ref:`extending_aesara_c` for a description of :meth:`COp.c_code` and other
-  related c_methods. Note that an ``Op`` can provide both Python and C
-  implementation.
+  An :class:`Op`'s implementation can be defined in other ways, as well.
+  For instance, it is possible to define a C-implementation via :meth:`COp.c_code`.
+  Please refer to the tutorial :ref:`extending_aesara_c` for a description of
+  :meth:`COp.c_code` and other related ``c_**`` methods. Note that an
+  :class:`Op` can provide both Python and C implementations.

  :func:`make_thunk` method is another alternative to :func:`perform`.
  It returns a thunk. A thunk is defined as a zero-arguments
  function which encapsulates the computation to be performed by an
-  ``Op`` on the arguments of its corresponding node. It takes several parameters:
+  :class:`Op` on the arguments of its corresponding node. It takes several parameters:

-    - ``node`` is the Apply instance for which a thunk is requested,
-    - ``storage_map`` is a dict of lists which  maps variables to a one-element
+    - ``node`` is the :class:`Apply` instance for which a thunk is requested,
+    - ``storage_map`` is a ``dict`` of lists which maps variables to one-element
      lists holding the variable's current value. The one-element list acts as
      pointer to the value and allows sharing that "pointer" with other nodes
      and instances.
@@ -191,28 +190,28 @@ or :func:`make_thunk`.
      is 2 the variable has been garbage-collected and is no longer
      valid, but shouldn't be required anymore for this call.
      The returned function must ensure that it sets the computed
-      variables as computed in the `compute_map`.
+      variables as computed in the :obj:`compute_map`.
    - ``impl`` allow to select between multiple implementation.
-      It should have a default value of None.
+      It should have a default value of ``None``.

  :func:`make_thunk` is useful if you want to generate code and compile
  it yourself.

-  If :func:`make_thunk()` is defined by an ``Op``, it will be used by Aesara
-  to obtain the ``Op``'s implementation.
+  If :func:`make_thunk()` is defined by an :class:`Op`, it will be used by Aesara
+  to obtain the :class:`Op`'s implementation.
  :func:`perform` and :meth:`COp.c_code` will be ignored.

  If :func:`make_node` is not defined, the :attr:`itypes` and :attr:`otypes`
-  are used by the ``Op``'s :func:`make_node` method to implement the functionality
+  are used by the :class:`Op`'s :func:`make_node` method to implement the functionality
  of :func:`make_node` method mentioned above.

-Op's auxiliary methods
----------------------
+:class:`Op`'s auxiliary methods
+-------------------------------

-There are other methods that can be optionally defined by the ``Op``:
+There are other methods that can be optionally defined by the :class:`Op`:

  The :func:`__str__` method provides a meaningful string representation of
-  your ``Op``.
+  your :class:`Op`.

  :func:`__eq__` and :func:`__hash__` define respectivelly equality
  between two :class:`Op`\s and the hash of an :class:`Op` instance.
@@ -222,11 +221,10 @@ There are other methods that can be optionally defined by the ``Op``:
  Two :class:`Op`\s that are equal according :func:`__eq__`
  should return the same output when they are applied on the same inputs.

-  The :attr:`__props__` lists the properties
-  that influence how the computation is performed (Usually these are those
-  that you set in  :func:`__init__`). It must be a tuple.
+  The :attr:`__props__` attribute lists the properties that influence how the computation
+  is performed (usually these are set in :func:`__init__`). It must be a tuple.
  If you don't have any properties, then you should set this attribute to the
-  empty tuple `()`.
+  empty tuple ``()``.

  :attr:`__props__` enables the  automatic generation of appropriate
  :func:`__eq__` and :func:`__hash__`.
@@ -236,10 +234,10 @@ There are other methods that can be optionally defined by the ``Op``:
  Given to the method :func:`__hash__` automatically generated from
  :attr:`__props__`, two :class:`Op`\s will be have the same hash if they have the same
  values for all the properties listed in :attr:`__props__`.
-  :attr:`__props__` will also generate a  suitable :func:`__str__` for your ``Op``.
+  :attr:`__props__` will also generate a  suitable :func:`__str__` for your :class:`Op`.
  This requires development version after September 1st, 2014 or version 0.7.

-  The :func:`infer_shape` method allows an `Op` to infer the shape of its
+  The :func:`infer_shape` method allows an :class:`Op` to infer the shape of its
  output variables without actually computing them.
  It takes as input ``fgraph``, a :class:`FunctionGraph`; ``node``, a reference
  to the :class:`Op`'s :class:`Apply` node;
@@ -247,12 +245,12 @@ There are other methods that can be optionally defined by the ``Op``:
  which are the dimensions of the :class:`Op` input :class:`Variable`\s.
  :func:`infer_shape` returns a list where each element is a tuple representing
  the shape of one output.
-  This could be helpful if one only
-  needs the shape of the output instead of the actual outputs, which
-  can be useful, for instance, for optimization procedures.
+  This could be helpful if one only needs the shape of the output instead of the
+  actual outputs, which can be useful, for instance, for optimization
+  procedures.

  The :func:`grad` method is required if you want to differentiate some cost
-  whose expression includes your ``Op``. The gradient may be
+  whose expression includes your :class:`Op`. The gradient may be
  specified symbolically in this method. It takes two arguments ``inputs`` and
  ``output_gradients``, which are both lists of :class:`Variable`\s, and
  those must be operated on using Aesara's symbolic language. The :func:`grad`
@@ -261,28 +259,28 @@ There are other methods that can be optionally defined by the ``Op``:
  to that input computed based on the symbolic gradients with respect
  to each output.
  If the output is not differentiable with respect to an input then
-  this method should be defined to return a variable of type NullType
+  this method should be defined to return a variable of type ``NullType``
  for that input. Likewise, if you have not implemented the grad
  computation for some input, you may return a variable of type
-  NullType for that input. Please refer to :func:`grad` for a more detailed
+  ``NullType`` for that input. Please refer to :func:`grad` for a more detailed
  view.

  The :func:`R_op` method is needed if you want ``aesara.gradient.Rop`` to
-  work with your `Op`.
+  work with your :class:`Op`.
  This function implements the application of the R-operator on the
-  function represented by your `Op`. Let assume that function is :math:`f`,
+  function represented by your :class:`Op`. Let assume that function is :math:`f`,
  with input :math:`x`, applying the R-operator means computing the
  Jacobian of :math:`f` and right-multiplying it by :math:`v`, the evaluation
  point, namely: :math:`\frac{\partial f}{\partial x} v`.

  The optional boolean :attr:`check_input` attribute is used to specify
-  if you want the types used in your ``COp`` to check their inputs in their
-  ``COp.c_code``. It can be used to speed up compilation, reduce overhead
+  if you want the types used in your :class:`COp` to check their inputs in their
+  :meth:`COp.c_code`. It can be used to speed up compilation, reduce overhead
  (particularly for scalars) and reduce the number of generated C files.


-Example: Op definition
----------------------
+Example: :class:`Op` definition
+-------------------------------

 .. testcode:: example

@@ -357,12 +355,12 @@ At a high level, the code fragment declares a class (e.g., ``DoubleOp1``) and th
 creates one instance of it (e.g., ``doubleOp1``).

 We often gloss over this distinction, but will be precise here:
-``doubleOp1`` (the instance) is an ``Op``, not ``DoubleOp1`` (the class which is a
-subclass of ``Op``). You can call ``doubleOp1(tensor.vector())`` on a
+``doubleOp1`` (the instance) is an :class:`Op`, not ``DoubleOp1`` (the class which is a
+subclass of :class:`Op`). You can call ``doubleOp1(tensor.vector())`` on a
 ``Variable`` to build an expression, and in the expression there will be
 a ``.op`` attribute that refers to ``doubleOp1``.

-.. The first two methods in the ``Op`` are relatively boilerplate: ``__eq__``
+.. The first two methods in the :class:`Op` are relatively boilerplate: ``__eq__``
 .. and ``__hash__``.
 .. When two :class:`Op`\s are equal, Aesara will merge their outputs if they are applied to the same inputs.
 .. The base class says two objects are equal if (and only if)
@@ -386,32 +384,30 @@ a ``.op`` attribute that refers to ``doubleOp1``.
 .. see wrong calculation.

 The ``make_node`` method creates a node to be included in the expression graph.
-It runs when we apply our ``Op`` (``doubleOp1``) to the ``Variable`` (``x``), as
+It runs when we apply our :class:`Op` (``doubleOp1``) to the ``Variable`` (``x``), as
 in ``doubleOp1(tensor.vector())``.
-When an ``Op`` has multiple inputs, their order in the inputs argument to ``Apply``
+When an :class:`Op` has multiple inputs, their order in the inputs argument to ``Apply``
 is important:  Aesara will call ``make_node(*inputs)`` to copy the graph,
 so it is important not to change the semantics of the expression by changing
 the argument order.

 All the ``inputs`` and ``outputs`` arguments to :class:`Apply` must be :class:`Variable`\s.
 A common and easy way to ensure inputs are variables is to run them through
-``as_tensor_variable``. This function leaves TensorType variables alone, raises
-an error for non-TensorType variables, and copies any ``numpy.ndarray`` into
-the storage for a TensorType Constant. The ``make_node`` method dictates the
-appropriate `Type` for all output variables.
+``as_tensor_variable``. This function leaves :class:`TensorType` variables alone, raises
+an error for non-:class:`TensorType` variables, and copies any ``numpy.ndarray`` into
+the storage for a :class:`TensorType` :class:`Constant`. The :func:`make_node` method dictates the
+appropriate :class:`Type` for all output variables.

-The ``perform`` method implements the ``Op``'s mathematical logic in Python.
+The :func:`perform` method implements the :class:`Op`'s mathematical logic in Python.
 The inputs (here ``x``) are passed by value, but a single output is returned
 indirectly as the first element of single-element lists.  If ``doubleOp1`` had
 a second output, it would be stored in ``output_storage[1][0]``.

-.. jpt: DOn't understand the following
-
 In some execution modes, the output storage might contain the return value of
 a previous call.  That old value can be reused to avoid memory re-allocation,
-but it must not influence the semantics of the ``Op`` output.
+but it must not influence the semantics of the :class:`Op` output.

-You can try the new ``Op`` as follows:
+You can try the new :class:`Op` as follows:

 .. testcode:: example

@@ -477,8 +473,8 @@ You can try the new ``Op`` as follows:
     [ 0.48165539  0.98642904  0.4913309   0.30702264]]


-Example: __props__ definition
-----------------------------
+Example: :attr:`__props__` definition
+-------------------------------------

 We can modify the previous piece of code in order to demonstrate
 the usage of the :attr:`__props__` attribute.
@@ -551,13 +547,13 @@ How To Test it
 --------------

 Aesara has some functionalities to simplify testing. These help test the
-``infer_shape``, ``grad`` and ``R_op`` methods. Put the following code
+:meth:`infer_shape`, :meth:`grad` and :meth:`R_op` methods. Put the following code
 in a file and execute it with the ``pytest`` program.

 Basic Tests
 ^^^^^^^^^^^

-Basic tests are done by you just by using the ``Op`` and checking that it
+Basic tests are done by you just by using the :class:`Op` and checking that it
 returns the right answer. If you detect an error, you must raise an
 *exception*. You can use the ``assert`` keyword to automatically raise an
 ``AssertionError``.
@@ -593,32 +589,32 @@ comparison.
 Testing the infer_shape
 ^^^^^^^^^^^^^^^^^^^^^^^

-When a class inherits from the ``InferShapeTester`` class, it gets the
-``self._compile_and_check`` method that tests the ``Op``'s ``infer_shape``
-method. It tests that the ``Op`` gets optimized out of the graph if only
+When a class inherits from the :class:`InferShapeTester` class, it gets the
+:meth:`InferShapeTester._compile_and_check` method that tests the :meth:`Op.infer_shape`
+method. It tests that the :class:`Op` gets optimized out of the graph if only
 the shape of the output is needed and not the output
 itself. Additionally, it checks that the optimized graph computes
 the correct shape, by comparing it to the actual shape of the computed
 output.

-``self._compile_and_check`` compiles an Aesara function. It takes as
+:meth:`InferShapeTester._compile_and_check` compiles an Aesara function. It takes as
 parameters the lists of input and output Aesara variables, as would be
-provided to ``aesara.function``, and a list of real values to pass to the
-compiled function. It also takes the ``Op`` class as a parameter
+provided to :func:`aesara.function`, and a list of real values to pass to the
+compiled function. It also takes the :class:`Op` class as a parameter
 in order to verify that no instance of it appears in the shape-optimized graph.

 If there is an error, the function raises an exception. If you want to
-see it fail, you can implement an incorrect ``infer_shape``.
+see it fail, you can implement an incorrect :meth:`Op.infer_shape`.

 When testing with input values with shapes that take the same value
-over different dimensions (for instance, a square matrix, or a tensor3
-with shape (n, n, n), or (m, n, m)), it is not possible to detect if
+over different dimensions (for instance, a square matrix, or a ``tensor3``
+with shape ``(n, n, n)``, or ``(m, n, m)``), it is not possible to detect if
 the output shape was computed correctly, or if some shapes with the
 same value have been mixed up. For instance, if the infer_shape uses
 the width of a matrix instead of its height, then testing with only
 square matrices will not detect the problem. This is why the
-``self._compile_and_check`` method prints a warning in such a case. If
-your ``Op`` works only with such matrices, you can disable the warning with the
+:meth:`InferShapeTester._compile_and_check` method prints a warning in such a case. If
+your :class:`Op` works only with such matrices, you can disable the warning with the
 ``warn=False`` parameter.

 .. testcode:: tests
@@ -642,7 +638,7 @@ Testing the gradient
 ^^^^^^^^^^^^^^^^^^^^

 The function :ref:`verify_grad <validating_grad>`
-verifies the gradient of an ``Op`` or Aesara graph. It compares the
+verifies the gradient of an :class:`Op` or Aesara graph. It compares the
 analytic (symbolically computed) gradient and the numeric
 gradient (computed through the Finite Difference Method).

@@ -664,9 +660,9 @@ Testing the Rop
 The class :class:`RopLop_checker` defines the functions
 :func:`RopLop_checker.check_mat_rop_lop`, :func:`RopLop_checker.check_rop_lop` and
 :func:`RopLop_checker.check_nondiff_rop`. These allow to test the
-implementation of the Rop method of a particular ``Op``.
+implementation of the :meth:`R_op` method of a particular :class:`Op`.

-For instance, to verify the Rop method of the DoubleOp, you can use this:
+For instance, to verify the :meth:`R_op` method of the ``DoubleOp``, you can use this:

 .. testcode:: tests

@@ -689,8 +685,8 @@ In-file

 One may also add a block of code similar to the following at the end
 of the file containing a specific test of interest and run the
-file. In this example, the test *TestDoubleRop* in the class
-*test_double_op* would be performed.
+file. In this example, the test ``TestDoubleRop`` in the class
+``test_double_op`` would be performed.

 .. testcode:: tests

@@ -710,13 +706,13 @@ file. This can be done by adding this at the end of your test files:
 Exercise
 """"""""

-Run the code of the *DoubleOp* example above.
+Run the code of the ``DoubleOp`` example above.

-Modify and execute to compute: x * y.
+Modify and execute to compute: ``x * y``.

-Modify and execute the example to return two outputs: x + y and x - y.
+Modify and execute the example to return two outputs: ``x + y`` and ``x - y``.

-You can omit the Rop functions. Try to implement the testing apparatus
+You can omit the :meth:`Rop` functions. Try to implement the testing apparatus
 described above.

 (Notice that Aesara's current *elemwise fusion* optimization is
@@ -758,21 +754,21 @@ signature:
        # ...
        return output_shapes

-  - `input_shapes` and `output_shapes` are lists of tuples that
-    represent the shape of the corresponding inputs/outputs, and `fgraph`
-    is a `FunctionGraph`.
+  - :obj:`input_shapes` and :obj:`output_shapes` are lists of tuples that
+    represent the shape of the corresponding inputs/outputs, and :obj:`fgraph`
+    is a :class:`FunctionGraph`.

-.. note::
+.. warning::

-    Not providing the `infer_shape` method prevents shape-related
-    optimizations from working with this ``Op``. For example
-    `your_op(inputs, ...).shape` will need the ``Op`` to be executed just
+    Not providing an :obj:`infer_shape` method prevents shape-related
+    optimizations from working with this :class:`Op`. For example
+    ``your_op(inputs, ...).shape`` will need the :class:`Op` to be executed just
    to get the shape.

 .. note::

    As no grad is defined, this means you won't be able to
-    differentiate paths that include this ``Op``.
+    differentiate paths that include this :class:`Op`.

 .. note::

@@ -780,11 +776,11 @@ signature:
    inputs Aesara variables that were declared.

 .. note::
-    The python function wrapped by the `as_op` decorator needs to return a new
+    The python function wrapped by the :func:`as_op` decorator needs to return a new
    data allocation, no views or in place modification of the input.

-as_op Example
-^^^^^^^^^^^^^
+:func:`as_op` Example
+^^^^^^^^^^^^^^^^^^^^^

 .. testcode:: asop

@@ -817,7 +813,7 @@ You can try it as follows:
 Exercise
 ^^^^^^^^

-Run the code of the *``numpy_dot``* example above.
+Run the code of the ``numpy_dot`` example above.

 Modify and execute to compute: ``numpy.add`` and ``numpy.subtract``.

@@ -830,18 +826,18 @@ Documentation and Coding Style
 Please always respect the :ref:`quality_contributions` or your contribution
 will not be accepted.

-NanGuardMode and AllocEmpty
---------------------------
+:class:`NanGuardMode` and :class:`AllocEmpty`
+---------------------------------------------

-``NanGuardMode`` help users find where in the graph NaN appear. But
+:class:`NanGuardMode` helps users find where in the graph NaNs appear. But
 sometimes, we want some variables to not be checked. For example, in
-the old GPU back-end, we use a float32 CudaNdarray to store the MRG
-random number generator state (they are integers). So if ``NanGuardMode``
+the old GPU back-end, we use a float32 :class:`CudaNdarray` to store the MRG
+random number generator state (they are integers). So if :class:`NanGuardMode`
 checks it, it will generate false positives. Another case is related to
-``[Gpu]AllocEmpty`` or some computation on it (like done by ``Scan``).
+:class:`[Gpu]AllocEmpty` or some computation on it (like done by :class:`Scan`).

-You can tell ``NanGuardMode`` to do not check a variable with:
-``variable.tag.nan_guard_mode_check``. Also, this tag automatically
+You can tell :class:`NanGuardMode` not to check a variable with:
+:attr:`variable.tag.nan_guard_mode_check`. Also, this tag automatically
 follows that variable during optimization. This means if you tag a
 variable that gets replaced by an inplace version, it will keep that
 tag.

--- a/doc/extending/graphstructures.rst
+++ b/doc/extending/graphstructures.rst
@@ -91,7 +91,7 @@ output. You can now print the name of the op that is applied to get
 >>> y.owner.op.name
 'Elemwise{mul,no_inplace}'

-Hence, an elementwise multiplication is used to compute *y*. This
+Hence, an element-wise multiplication is used to compute *y*. This
 multiplication is done between the inputs:

 >>> len(y.owner.inputs)

--- a/doc/extending/op.rst
+++ b/doc/extending/op.rst

-===============================
-Making arithmetic Ops on double
-===============================
+=========================================
+Making arithmetic :class:`Op`\s on double
+=========================================

 .. testsetup:: *

@@ -41,8 +41,8 @@ computations. We'll start by defining multiplication.

 .. _op_contract:

-Op's contract
-=============
+:class:`Op`'s contract
+======================

 An `Op` is any object which inherits from :class:`Op`.  It has to
 define the following methods.
@@ -53,32 +53,32 @@ define the following methods.
  suitable symbolic `Type` to serve as the outputs of this :Class:`Op`'s
  application.  The :class:`Variable`\s found in ``*inputs`` must be operated on
  using Aesara's symbolic language to compute the symbolic output
-  Variables. This method should put these outputs into an Apply
-  instance, and return the Apply instance.
+  :class:`Variable`\s. This method should put these outputs into an :class:`Apply`
+  instance, and return the :class:`Apply` instance.

-  This method creates an Apply node representing the application of
+  This method creates an :class:`Apply` node representing the application of
  the `Op` on the inputs provided. If the `Op` cannot be applied to these
  inputs, it must raise an appropriate exception.

-  The inputs of the Apply instance returned by this call must be
+  The inputs of the :class:`Apply` instance returned by this call must be
  ordered correctly: a subsequent ``self.make_node(*apply.inputs)``
  must produce something equivalent to the first ``apply``.

 .. function:: perform(node, inputs, output_storage)

-  This method computes the function associated to this `Op`. ``node`` is
-  an Apply node created by the Op's ``make_node`` method. ``inputs``
+  This method computes the function associated to this :class:`Op`. ``node`` is
+  an :class:`Apply` node created by the :class:`Op`'s :meth:`Op.make_node` method. ``inputs``
  is a list of references to data to operate on using non-symbolic
-  statements, (i.e., statements in Python, Numpy). ``output_storage``
+  statements, (i.e., statements in Python, NumPy). ``output_storage``
  is a list of storage cells where the variables of the computation
  must be put.

  More specifically:

-    - ``node``: This is a reference to an Apply node which was previously
-      obtained via the ``Op``'s ``make_node`` method. It is typically not
-      used in simple Ops, but it contains symbolic information that
-      could be required for complex Ops.
+    - ``node``: This is a reference to an :class:`Apply` node which was previously
+      obtained via the :meth:`Op.make_node` method. It is typically not
+      used in simple :class:`Op`\s, but it contains symbolic information that
+      could be required for complex :class:`Op`\s.

    - ``inputs``: This is a list of data from which the values stored in ``output_storage``
      are to be computed using non-symbolic language.
@@ -86,16 +86,16 @@ define the following methods.
    - ``output_storage``: This is a list of storage cells where the output is to be stored.
      A storage cell is a one-element list. It is forbidden to change
      the length of the list(s) contained in ``output_storage``.
-      There is one storage cell for each output of the `Op`.
+      There is one storage cell for each output of the :class:`Op`.

      The data put in ``output_storage`` must match the type of the
      symbolic output. This is a situation where the ``node`` argument
      can come in handy.

-      A function Mode may allow ``output_storage`` elements to persist
+      A function :class:`Mode` may allow ``output_storage`` elements to persist
      between evaluations, or it may reset ``output_storage`` cells to
      hold a value of ``None``.  It can also pre-allocate some memory
-      for the `Op` to use.  This feature can allow ``perform`` to reuse
+      for the :class:`Op` to use.  This feature can allow :meth:`Op.perform` to reuse
      memory between calls, for example. If there is something
      preallocated in the ``output_storage``, it will be of the good
      dtype, but can have the wrong shape and have any stride pattern.
@@ -107,41 +107,42 @@ define the following methods.

  You must be careful about aliasing outputs to inputs, and making
  modifications to any of the inputs. See :ref:`Views and inplace
-  operations <views_and_inplace>` before writing a ``perform``
+  operations <views_and_inplace>` before writing a :meth:`Op.perform`
  implementation that does either of these things.

 Instead of (or in addition to) ``perform()``, you can also provide a
 :ref:`C implementation <cop>`. For more details, refer to the
-documentation for :ref:`op`.
+documentation for :class:`Op`.

 .. function:: __eq__(other)

-  ``other`` is also an `Op`.
+  ``other`` is also an :class:`Op`.

  Returning ``True`` here is a promise to the optimization system
-  that the other `Op` will produce exactly the same graph effects
+  that the other :class:`Op` will produce exactly the same graph effects
  (from perform) as this one, given identical inputs. This means it
  will produce the same output values, it will destroy the same
-  inputs (same destroy_map), and will alias outputs to the same
-  inputs (same view_map). For more details, see
+  inputs (same ``destroy_map``), and will alias outputs to the same
+  inputs (same ``view_map``). For more details, see
  :ref:`views_and_inplace`.

-   .. note::
+  .. note::
+
+      If you set ``__props__``, this will be automatically generated.

-     If you set `__props__`, this will be automatically generated.

 .. function:: __hash__()

-  If two `Op` instances compare equal, then they **must** return the
+  If two :class:`Op` instances compare equal, then they **must** return the
  same hash value.

  Equally important, this hash value must not change during the
-  lifetime of self.  `Op` instances should be immutable in this
+  lifetime of self.  :class:`Op` instances should be immutable in this
  sense.

-   .. note::
+  .. note::

-     If you set `__props__`, this will be automatically generated.
+      If you set ``__props__``, this will be automatically generated.

 .. _op_optional:

@@ -154,8 +155,8 @@ Optional methods or attributes

  Must be a tuple.  Lists the name of the attributes which influence
  the computation performed.  This will also enable the automatic
-  generation of appropriate __eq__, __hash__ and __str__ methods.
-  Should be set to `()` if you have no attributes that are relevant to
+  generation of appropriate ``__eq__``, ``__hash__`` and ``__str__`` methods.
+  Should be set to ``()`` if you have no attributes that are relevant to
  the computation to generate the methods.

  .. versionadded:: 0.7
@@ -167,7 +168,7 @@ Optional methods or attributes
  If this member variable is an integer, then the default
  implementation of ``__call__`` will return
  ``node.outputs[self.default_output]``, where ``node`` was returned
-  by ``make_node``.  Otherwise, the entire list of outputs will be
+  by :meth:`Op.make_node`.  Otherwise, the entire list of outputs will be
  returned, unless it is of length 1, where the single element will be
  returned by itself.

@@ -175,9 +176,9 @@ Optional methods or attributes

   This function must return a thunk, that is a zero-arguments
   function that encapsulates the computation to be performed by this
-   op on the arguments of the node.
+   :class:`Op` on the arguments of the node.

-   :param node: Apply instance
+   :param node: :class:`Apply` instance
     The node for which a thunk is requested.
   :param storage_map: dict of lists
     This maps variables to a one-element lists holding the variable's
@@ -208,18 +209,18 @@ Optional methods or attributes
   :meth:`make_node` with the supplied arguments and returns the
   result indexed by `default_output`.  This can be overridden by
   subclasses to do anything else, but must return either an Aesara
-   Variable or a list of Variables.
+   :class:`Variable` or a list of :class:`Variable`\s.

   If you feel the need to override `__call__` to change the graph
   based on the arguments, you should instead create a function that
-   will use your `Op` and build the graphs that you want and call that
-   instead of the `Op` instance directly.
+   will use your :class:`Op` and build the graphs that you want and call that
+   instead of the :class:`Op` instance directly.

 .. function:: infer_shape(fgraph, node, shapes)

   This function is needed for shape optimization. ``shapes`` is a
-   list with one tuple for each input of the Apply node (which corresponds
-   to the inputs of the op).  Each tuple contains as many elements as the
+   list with one tuple for each input of the :class:`Apply` node (which corresponds
+   to the inputs of the :class:`Op`).  Each tuple contains as many elements as the
   number of dimensions of the corresponding input. The value of each element
   is the shape (number of items) along the corresponding dimension of that
   specific input.
@@ -245,8 +246,8 @@ Optional methods or attributes
 .. function:: __str__()

   This allows you to specify a more informative string representation of your
-   `Op`. If an `Op` has parameters, it is highly recommended to have the
-   ``__str__`` method include the name of the op and the Op's parameters'
+   :class:`Op`. If an `Op` has parameters, it is highly recommended to have the
+   ``__str__`` method include the name of the :class:`Op` and the :class:`Op`'s parameters'
   values.

   .. note::
@@ -259,13 +260,13 @@ Optional methods or attributes
   *Default:* Return True

   By default when optimizations are enabled, we remove during
-   function compilation Apply nodes whose inputs are all constants.
-   We replace the Apply node with an Aesara constant variable.
-   This way, the Apply node is not executed at each function
-   call. If you want to force the execution of an op during the
+   function compilation :class:`Apply` nodes whose inputs are all constants.
+   We replace the :class:`Apply` node with an Aesara constant variable.
+   This way, the :class:`Apply` node is not executed at each function
+   call. If you want to force the execution of an :class:`Op` during the
   function call, make do_constant_folding return False.

-   As done in the Alloc op, you can return False only in some cases by
+   As done in the Alloc :class:`Op`, you can return False only in some cases by
   analyzing the graph from the node parameter.

 .. function:: debug_perform(node, inputs, output_storage)
@@ -277,69 +278,69 @@ Optional methods or attributes
   DebugMode, but others may also use it in the future).  It has the
   same signature and contract as :func:`perform`.

-   This enables ops that cause trouble with DebugMode with their
+   This enables :class:`Op`\s that cause trouble with DebugMode with their
   normal behaviour to adopt a different one when run under that
-   mode. If your op doesn't have any problems, don't implement this.
+   mode. If your :class:`Op` doesn't have any problems, don't implement this.

-If you want your op to work with gradient.grad() you also need to
-implement the functions described below.
+If you want your :class:`Op` to work with :func:`aesara.gradient.grad` you also
+need to implement the functions described below.

 Gradient
 ========

-These are the function required to work with gradient.grad().
+These are the functions required to work with :func:`aesara.gradient.grad`.

 .. function:: grad(inputs, output_gradients)

-  If the `Op` being defined is differentiable, its gradient may be
+  If the :class:`Op` being defined is differentiable, its gradient may be
  specified symbolically in this method. Both ``inputs`` and
-  ``output_gradients`` are lists of symbolic Aesara Variables and
-  those must be operated on using Aesara's symbolic language. The grad
-  method must return a list containing one Variable for each
-  input. Each returned Variable represents the gradient with respect
+  ``output_gradients`` are lists of symbolic Aesara :class:`Variable`\s and
+  those must be operated on using Aesara's symbolic language. The :meth:`Op.grad`
+  method must return a list containing one :class:`Variable` for each
+  input. Each returned :class:`Variable` represents the gradient with respect
  to that input computed based on the symbolic gradients with respect
  to each output.

  If the output is not differentiable with respect to an input then
-  this method should be defined to return a variable of type NullType
-  for that input. Likewise, if you have not implemented the grad
+  this method should be defined to return a variable of type :class:`NullType`
+  for that input. Likewise, if you have not implemented the gradient
  computation for some input, you may return a variable of type
-  NullType for that input. aesara.gradient contains convenience
+  :class:`NullType` for that input. :mod:`aesara.gradient` contains convenience
  methods that can construct the variable for you:
  :func:`aesara.gradient.grad_undefined` and
  :func:`aesara.gradient.grad_not_implemented`, respectively.

-  If an element of output_gradient is of type
-  `aesara.gradient.DisconnectedType`, it means that the cost is not a
-  function of this output. If any of the `Op`'s inputs participate in
-  the computation of only disconnected outputs, then `Op.grad` should
-  return `DisconnectedType` variables for those inputs.
+  If an element of ``output_gradient`` is of type
+  :class:`aesara.gradient.DisconnectedType`, it means that the cost is not a
+  function of this output. If any of the :class:`Op`'s inputs participate in
+  the computation of only disconnected outputs, then :meth:`Op.grad` should
+  return :class:`DisconnectedType` variables for those inputs.

-  If the `Op.grad` method is not defined, then Aesara assumes it has been
+  If the :meth:`Op.grad` method is not defined, then Aesara assumes it has been
  forgotten.  Symbolic differentiation will fail on a graph that
-  includes this `Op`.
+  includes this :class:`Op`.

-  It must be understood that the `Op`'s `grad` method is not meant to
-  return the gradient of the `Op`'s output. `aesara.grad` computes
-  gradients; `Op.grad` is a helper function that computes terms that
+  It must be understood that the :meth:`Op.grad` method is not meant to
+  return the gradient of the :class:`Op`'s output. :func:`aesara.grad` computes
+  gradients; :meth:`Op.grad` is a helper function that computes terms that
  appear in gradients.

-  If an `Op` has a single vector-valued output ``y`` and a single
-  vector-valued input ``x``, then the grad method will be passed ``x`` and a
+  If an :class:`Op` has a single vector-valued output ``y`` and a single
+  vector-valued input ``x``, then the :meth:`Op.grad` method will be passed ``x`` and a
  second vector ``z``. Define ``J`` to be the Jacobian of ``y`` with respect to
-  ``x``. The `Op`'s `grad` method should return ``dot(J.T,z)``. When
-  `aesara.grad` calls the grad method, it will set ``z`` to be the
-  gradient of the cost ``C`` with respect to ``y``. If this `Op` is the only `Op`
+  ``x``. The :meth:`Op.grad` method should return ``dot(J.T,z)``. When
+  :func:`aesara.grad` calls the :meth:`Op.grad` method, it will set ``z`` to be the
+  gradient of the cost ``C`` with respect to ``y``. If this :class:`Op` is the only :class:`Op`
  that acts on ``x``, then ``dot(J.T,z)`` is the gradient of C with respect to
-  ``x``.  If there are other `Op`s that act on ``x``, `aesara.grad` will
+  ``x``.  If there are other :class:`Op`\s that act on ``x``, :func:`aesara.grad` will
  have to add up the terms of ``x``'s gradient contributed by the other
-  `Op`'s grad method.
+  :meth:`Op.grad` method.

-  In practice, an `Op`'s input and output are rarely implemented as
-  single vectors.  Even if an op's output consists of a list
+  In practice, an :class:`Op`'s input and output are rarely implemented as
+  single vectors.  Even if an :class:`Op`'s output consists of a list
  containing a scalar, a sparse matrix, and a 4D tensor, you can think
  of these objects as being formed by rearranging a vector. Likewise
-  for the input. In this view, the values computed by the grad method
+  for the input. In this view, the values computed by the :meth:`Op.grad` method
  still represent a Jacobian-vector product.

  In practice, it is probably not a good idea to explicitly construct
@@ -347,21 +348,21 @@ These are the function required to work with gradient.grad().
  the returned value should be equal to the Jacobian-vector product.

  So long as you implement this product correctly, you need not
-  understand what `aesara.gradient.grad` is doing, but for the curious the
+  understand what :func:`aesara.gradient.grad` is doing, but for the curious the
  mathematical justification is as follows:

-  In essence, the grad method must simply implement through symbolic
-  Variables and operations the chain rule of differential
+  In essence, the :meth:`Op.grad` method must simply implement through symbolic
+  :class:`Variable`\s and operations the chain rule of differential
  calculus. The chain rule is the mathematical procedure that allows
  one to calculate the total derivative :math:`\frac{d C}{d x}` of the
  final scalar symbolic `Variable` ``C`` with respect to a primitive
-  symbolic Variable x found in the list ``inputs``.  The grad method
+  symbolic :class:`Variable` x found in the list ``inputs``.  The :meth:`Op.grad` method
  does this using ``output_gradients`` which provides the total
  derivative :math:`\frac{d C}{d f}` of ``C`` with respect to a symbolic
-  Variable that is returned by the `Op` (this is provided in
+  :class:`Variable` that is returned by the `Op` (this is provided in
  ``output_gradients``), as well as the knowledge of the total
  derivative :math:`\frac{d f}{d x}` of the latter with respect to the
-  primitive Variable (this has to be computed).
+  primitive :class:`Variable` (this has to be computed).

  In mathematics, the total derivative of a scalar variable (C) with
  respect to a vector of scalar variables (x), i.e. the gradient, is
@@ -377,16 +378,16 @@ These are the function required to work with gradient.grad().

  Here, the chain rule must be implemented in a similar but slightly
  more complex setting: Aesara provides in the list
-  ``output_gradients`` one gradient for each of the Variables returned
-  by the `Op`. Where f is one such particular Variable, the
+  ``output_gradients`` one gradient for each of the :class:`Variable`\s returned
+  by the `Op`. Where f is one such particular :class:`Variable`, the
  corresponding gradient found in ``output_gradients`` and
  representing :math:`\frac{d C}{d f}` is provided with a shape
  similar to f and thus not necessarily as a row vector of scalars.
-  Furthermore, for each Variable x of the Op's list of input variables
+  Furthermore, for each :class:`Variable` x of the Op's list of input variables
  ``inputs``, the returned gradient representing :math:`\frac{d C}{d
-  x}` must have a shape similar to that of Variable x.
+  x}` must have a shape similar to that of :class:`Variable` x.

-  If the output list of the op is :math:`[f_1, ... f_n]`, then the
+  If the output list of the :class:`Op` is :math:`[f_1, ... f_n]`, then the
  list ``output_gradients`` is :math:`[grad_{f_1}(C), grad_{f_2}(C),
  ... , grad_{f_n}(C)]`.  If ``inputs`` consists of the list
  :math:`[x_1, ..., x_m]`, then `Op.grad` should return the list
@@ -394,137 +395,137 @@ These are the function required to work with gradient.grad().
  :math:`(grad_{y}(Z))_i = \frac{\partial Z}{\partial y_i}` (and
  :math:`i` can stand for multiple dimensions).

-  In other words, :func:`grad` does not return :math:`\frac{d f_i}{d
+  In other words, :meth:`Op.grad` does not return :math:`\frac{d f_i}{d
  x_j}`, but instead the appropriate dot product specified by the
  chain rule: :math:`\frac{d C}{d x_j} = \frac{d C}{d f_i} \cdot
  \frac{d f_i}{d x_j}`.  Both the partial differentiation and the
-  multiplication have to be performed by :func:`grad`.
+  multiplication have to be performed by :meth:`Op.grad`.

  Aesara currently imposes the following constraints on the values
-  returned by the grad method:
+  returned by the :meth:`Op.grad` method:

-  1) They must be Variable instances.
+  1) They must be :class:`Variable` instances.
  2) When they are types that have dtypes, they must never have an integer dtype.

  The output gradients passed *to* `Op.grad` will also obey these constraints.

  Integers are a tricky subject. Integers are the main reason for
-  having DisconnectedType, NullType or zero gradient. When you have an
-  integer as an argument to your grad method, recall the definition of
+  having :class:`DisconnectedType`, :class:`NullType` or zero gradient. When you have an
+  integer as an argument to your :meth:`Op.grad` method, recall the definition of
  a derivative to help you decide what value to return:

  :math:`\frac{d f}{d x} = \lim_{\epsilon \rightarrow 0} (f(x+\epsilon)-f(x))/\epsilon`.

  Suppose your function f has an integer-valued output. For most
-  functions you're likely to implement in aesara, this means your
-  gradient should be zero, because f(x+epsilon) = f(x) for almost all
-  x. (The only other option is that the gradient could be undefined,
+  functions you're likely to implement in Aesara, this means your
+  gradient should be zero, because :math:`f(x+\epsilon) = f(x)` for almost all
+  :math:`x`. (The only other option is that the gradient could be undefined,
  if your function is discontinuous everywhere, like the rational
  indicator function)

-  Suppose your function f has an integer-valued input. This is a
+  Suppose your function :math:`f` has an integer-valued input. This is a
  little trickier, because you need to think about what you mean
  mathematically when you make a variable integer-valued in
-  aesara. Most of the time in machine learning we mean "f is a
-  function of a real-valued x, but we are only going to pass in
-  integer-values of x". In this case, f(x+epsilon) exists, so the
-  gradient through f should be the same whether x is an integer or a
-  floating point variable. Sometimes what we mean is "f is a function
-  of an integer-valued x, and f is only defined where x is an
-  integer." Since f(x+epsilon) doesn't exist, the gradient is
-  undefined.  Finally, many times in aesara, integer valued inputs
+  Aesara. Most of the time in machine learning we mean ":math:`f` is a
+  function of a real-valued :math:`x`, but we are only going to pass in
+  integer-values of :math:`x`". In this case, :math:`f(x+\epsilon)` exists, so the
+  gradient through :math:`f` should be the same whether :math:`x` is an integer or a
+  floating point variable. Sometimes what we mean is ":math:`f` is a function
+  of an integer-valued :math:`x`, and :math:`f` is only defined where :math:`x` is an
+  integer." Since :math:`f(x+\epsilon)` doesn't exist, the gradient is
+  undefined.  Finally, many times in Aesara, integer valued inputs
  don't actually affect the elements of the output, only its shape.

-  If your function f has both an integer-valued input and an
+  If your function :math:`f` has both an integer-valued input and an
  integer-valued output, then both rules have to be combined:

-  - If f is defined at (x+epsilon), then the input gradient is
-    defined. Since f(x+epsilon) would be equal to f(x) almost
-    everywhere, the gradient should be 0 (first rule).
+  - If :math:`f` is defined at :math:`x + \epsilon`, then the input gradient is
+    defined. Since :math:`f(x+\epsilon)` would be equal to :math:`f(x)` almost
+    everywhere, the gradient should be zero (first rule).

-  - If f is only defined where x is an integer, then the gradient
+  - If :math:`f` is only defined where :math:`x` is an integer, then the gradient
    is undefined, regardless of what the gradient with respect to the
    output is.

  Examples:

-  1) f(x,y) = dot product between x and y. x and y are integers.
-        Since the output is also an integer, f is a step function.
-        Its gradient is zero almost everywhere, so `Op.grad` should return
-        zeros in the shape of x and y.
-  2) f(x,y) = dot product between x and y. x is floating point and y is an integer.
-        In this case the output is floating point. It doesn't matter
-        that y is an integer.  We consider f to still be defined at
-        f(x,y+epsilon). The gradient is exactly the same as if y were
-        floating point.
-  3) f(x,y) = argmax of x along axis y.
-        The gradient with respect to y is undefined, because f(x,y) is
-        not defined for floating point y. How could you take an argmax
-        along a fraActional axis?  The gradient with respect to x is
-        0, because f(x+epsilon, y) = f(x) almost everywhere.
-  4) f(x,y) = a vector with y elements, each of which taking on the value x
-        The grad method should return DisconnectedType()() for y,
-        because the elements of f don't depend on y. Only the shape of
-        f depends on y. You probably also want to implement a
-        connection_pattern method to encode this.
-  5) f(x) = int(x) converts float x into an int. g(y) = float(y) converts an integer y into a float.
-        If the final cost C = 0.5 * g(y) = 0.5 g(f(x)), then the
-        gradient with respect to y will be 0.5, even if y is an
-        integer. However, the gradient with respect to x will be 0,
-        because the output of f is integer-valued.
+  1) :math:`f(x,y)` is a dot product between x and y. x and y are integers.
+     Since the output is also an integer, f is a step function.
+     Its gradient is zero almost everywhere, so :meth:`Op.grad` should return
+     zeros in the shape of x and y.
+  2) :math:`f(x,y)` is a dot product between x and y. x is floating point and y is an integer.
+     In this case the output is floating point. It doesn't matter
+     that y is an integer.  We consider f to still be defined at
+     :math:`f(x,y+\epsilon)`. The gradient is exactly the same as if y were
+     floating point.
+  3) :math:`f(x,y)` is the argmax of x along axis y.
+     The gradient with respect to y is undefined, because :math:`f(x,y)` is
+     not defined for floating point y. How could you take an argmax
+     along a fractional axis?  The gradient with respect to x is
+     0, because :math:`f(x+\epsilon, y) = f(x)` almost everywhere.
+  4) :math:`f(x,y)` is a vector with y elements, each of which taking on the value x
+     The :meth:`Op.grad` method should return a :class:`DisconnectedType` variable for y,
+     because the elements of f don't depend on y. Only the shape of
+     f depends on y. You probably also want to implement a
+     connection_pattern method to encode this.
+  5) :math:`f(x) = int(x)` converts float x into an int. :math:`g(y) = float(y)`
+     converts an integer y into a float.  If the final cost :math:`C = 0.5 *
+     g(y) = 0.5 g(f(x))`, then the gradient with respect to y will be 0.5,
+     even if y is an integer. However, the gradient with respect to x will be
+     0, because the output of f is integer-valued.

 .. function:: connection_pattern(node):

-  Sometimes needed for proper operation of gradient.grad().
+  Sometimes needed for proper operation of :func:`aesara.gradient.grad`.

-  Returns a list of list of bools.
+  Returns a list of lists of booleans.

  ``Op.connection_pattern[input_idx][output_idx]`` is true if the
-  elements of inputs[input_idx] have an effect on the elements of
-  outputs[output_idx].
+  elements of ``inputs[input_idx]`` have an effect on the elements of
+  ``outputs[output_idx]``.

  The ``node`` parameter is needed to determine the number of
-  inputs. Some ops such as Subtensor take a variable number of
+  inputs. Some :class:`Op`\s such as :class:`Subtensor` take a variable number of
  inputs.

-  If no connection_pattern is specified, gradient.grad will
+  If no connection_pattern is specified, :func:`aesara.gradient.grad` will
  assume that all inputs have some elements connected to some
  elements of all outputs.

  This method conveys two pieces of information that are otherwise
-  not part of the aesara graph:
+  not part of the Aesara graph:

-  1) Which of the op's inputs are truly ancestors of each of the
-     op's outputs. Suppose an op has two inputs, x and y, and
-     outputs f(x) and g(y). y is not really an ancestor of f, but
-     it appears to be so in the aesara graph.
+  1) Which of the :class:`Op`'s inputs are truly ancestors of each of the
+     :class:`Op`'s outputs. Suppose an :class:`Op` has two inputs, ``x`` and ``y``, and
+     outputs ``f(x)`` and ``g(y)``. ``y`` is not really an ancestor of ``f``, but
+     it appears to be so in the Aesara graph.
  2) Whether the actual elements of each input/output are relevant
     to a computation.
-     For example, the shape op does not read its input's elements,
-     only its shape metadata. d shape(x) / dx should thus raise
+     For example, the shape :class:`Op` does not read its input's elements,
+     only its shape metadata. :math:`\frac{d shape(x)}{dx}` should thus raise
     a disconnected input exception (if these exceptions are
     enabled).
-     As another example, the elements of the Alloc op's outputs
-     are not affected by the shape arguments to the Alloc op.
+     As another example, the elements of the :class:`Alloc` :class:`Op`'s outputs
+     are not affected by the shape arguments to the :class:`Alloc` :class:`Op`.

-  Failing to implement this function for an op that needs it can
+  Failing to implement this function for an :class:`Op` that needs it can
  result in two types of incorrect behavior:

-  1) gradient.grad erroneously raising a TypeError reporting that
+  1) :func:`aesara.gradient.grad` erroneously raising a ``TypeError`` reporting that
     a gradient is undefined.
-  2) gradient.grad failing to raise a ValueError reporting that
+  2) :func:`aesara.gradient.grad` failing to raise a ``ValueError`` reporting that
     an input is disconnected.

  Even if connection_pattern is not implemented correctly, if
-  gradient.grad returns an expression, that expression will be
+  :func:`aesara.gradient.grad` returns an expression, that expression will be
  numerically correct.

 .. function:: R_op(inputs, eval_points)

-   Optional, to work with gradient.R_op().
+   Optional, to work with :func:`aesara.gradient.R_op`.

   This function implements the application of the R-operator on the
-   function represented by your op. Let assume that function is :math:`f`,
+   function represented by your :class:`Op`. Let us assume that function is :math:`f`,
   with input :math:`x`, applying the R-operator means computing the
   Jacobian of :math:`f` and right-multiplying it by :math:`v`, the evaluation
   point, namely: :math:`\frac{\partial f}{\partial x} v`.
@@ -534,10 +535,10 @@ These are the function required to work with gradient.grad().
   are the symbolic variables corresponding to the value you want to
   right multiply the jacobian with.

-   Same conventions as for the grad method hold. If your op is not
-   differentiable, you can return None. Note that in contrast to
-   the method :func:`grad`, for :func:`R_op` you need to return the
-   same number of outputs as there are outputs of the op. You can think
+   Same conventions as for the :meth:`Op.grad` method hold. If your :class:`Op`
+   is not differentiable, you can return None. Note that in contrast to the
+   method :meth:`Op.grad`, for :meth:`Op.R_op` you need to return the
+   same number of outputs as there are outputs of the :class:`Op`. You can think
   of it in the following terms. You have all your inputs concatenated
   into a single vector :math:`x`. You do the same with the evaluation
   points (which are as many as inputs and of the shame shape) and obtain
@@ -546,17 +547,17 @@ These are the function required to work with gradient.grad().
   multiply it by :math:`v`. As a last step you reshape each of these
   vectors you obtained for each outputs (that have the same shape as
   the outputs) back to their corresponding shapes and return them as the
-   output of the :func:`R_op` method.
+   output of the :meth:`Op.R_op` method.

   :ref:`List of op with r op support <R_op_list>`.

-Defining an Op: ``mul``
-=======================
+Defining an :class:`Op`: ``mul``
+================================

 We'll define multiplication as a *binary* operation, even though a
 multiplication `Op` could take an arbitrary number of arguments.

-First, we'll instantiate a ``mul`` Op:
+First, we'll instantiate a ``mul`` :class:`Op`:

 .. testcode:: mul

@@ -572,7 +573,7 @@ This function must take as many arguments as the operation we are
 defining is supposed to take as inputs---in this example that would be
 two.  This function ensures that both inputs have the ``double`` type.
 Since multiplying two doubles yields a double, this function makes an
-Apply node with an output Variable of type ``double``.
+:class:`Apply` node with an output :class:`Variable` of type ``double``.

 .. testcode:: mul

@@ -583,20 +584,20 @@ Apply node with an output Variable of type ``double``.
   mul.make_node = make_node


-The first two lines make sure that both inputs are Variables of the
+The first two lines make sure that both inputs are :class:`Variable`\s of the
 ``double`` type that we created in the previous section. We would not
 want to multiply two arbitrary types, it would not make much sense
 (and we'd be screwed when we implement this in C!)

-The last line is the meat of the definition. There we create an Apply
-node representing the application of `Op` ``mul`` to inputs ``x`` and
-``y``, giving a Variable instance of type ``double`` as the output.
+The last line is the meat of the definition. There we create an :class:`Apply`
+node representing the application of the `Op` ``mul`` to inputs ``x`` and
+``y``, giving a :class:`Variable` instance of type ``double`` as the output.

 .. note::

-   Aesara relies on the fact that if you call the ``make_node`` method
-   of Apply's first argument on the inputs passed as the Apply's
-   second argument, the call will not fail and the returned Apply
+   Aesara relies on the fact that if you call the :meth:`Op.make_node` method
+   of :class:`Apply`'s first argument on the inputs passed as the :class:`Apply`'s
+   second argument, the call will not fail and the returned :class:`Apply`
   instance will be equivalent.  This is how graphs are copied.

 **perform**
@@ -621,21 +622,21 @@ Here, ``z`` is a list of one element. By default, ``z == [None]``.

   It is possible that ``z`` does not contain ``None``. If it contains
   anything else, Aesara guarantees that whatever it contains is what
-   ``perform`` put there the last time it was called with this
+   :meth:`Op.perform` put there the last time it was called with this
   particular storage. Furthermore, Aesara gives you permission to do
   whatever you want with ``z``'s contents, chiefly reusing it or the
   memory allocated for it. More information can be found in the
-   :ref:`op` documentation.
+   :class:`Op` documentation.

 .. warning::

-   We gave ``z`` the Aesara type ``double`` in ``make_node``, which means
+   We gave ``z`` the Aesara type ``double`` in :meth:`Op.make_node`, which means
   that a Python ``float`` must be put there. You should not put, say, an
-   ``int`` in ``z[0]`` because Aesara assumes Ops handle typing properly.
+   ``int`` in ``z[0]`` because Aesara assumes :class:`Op`\s handle typing properly.


-Trying out our new Op
-=====================
+Trying out our new :class:`Op`
+==============================

 In the following code, we use our new `Op`:

@@ -668,7 +669,7 @@ Automatic Constant Wrapping
 ---------------------------

 Well, OK. We'd like our `Op` to be a bit more flexible. This can be done
-by modifying ``make_node`` to accept Python ``int`` or ``float`` as
+by modifying :meth:`Op.make_node` to accept Python ``int`` or ``float`` as
 ``x`` and/or ``y``:

 .. testcode:: mul
@@ -683,8 +684,8 @@ by modifying ``make_node`` to accept Python ``int`` or ``float`` as
       return Apply(mul, [x, y], [double()])
   mul.make_node = make_node

-Whenever we pass a Python int or float instead of a Variable as ``x`` or
-``y``, ``make_node`` will convert it to :ref:`constant` for us. ``Constant``
+Whenever we pass a Python int or float instead of a :class:`Variable` as ``x`` or
+``y``, :meth:`Op.make_node` will convert it to :ref:`constant` for us. ``Constant``
 is a :ref:`variable` we statically know the value of.

 .. doctest:: mul
@@ -701,10 +702,10 @@ is a :ref:`variable` we statically know the value of.
 Now the code works the way we want it to.

 .. note::
-   Most Aesara Ops follow this convention of up-casting literal
-   make_node arguments to Constants.
+   Most Aesara :class:`Op`\s follow this convention of up-casting literal
+   :meth:`Op.make_node` arguments to :class:`Constant`\s.
   This makes typing expressions more natural.  If you do
-   not want a constant somewhere in your graph, you have to pass a Variable
+   not want a constant somewhere in your graph, you have to pass a :class:`Variable`
   (like ``double('x')`` here).


@@ -713,8 +714,8 @@ Final version
 =============

 The above example is pedagogical.  When you define other basic arithmetic
-operations ``add``, ``sub`` and ``div``, code for ``make_node`` can be
-shared between these Ops. Here is revised implementation of these four
+operations ``add``, ``sub`` and ``div``, code for :meth:`Op.make_node` can be
+shared between these :class:`Op`\s. Here is revised implementation of these four
 arithmetic operators:

 .. testcode::
@@ -763,9 +764,9 @@ arithmetic operators:
 Instead of working directly on an instance of `Op`, we create a subclass of
 `Op` that we can parametrize. All the operations we define are binary. They
 all work on two inputs with type ``double``. They all return a single
-Variable of type ``double``. Therefore, ``make_node`` does the same thing
+:class:`Variable` of type ``double``. Therefore, :meth:`Op.make_node` does the same thing
 for all these operations, except for the `Op` reference ``self`` passed
-as first argument to Apply.  We define ``perform`` using the function
+as first argument to :class:`Apply`.  We define :meth:`Op.perform` using the function
 ``fn`` passed in the constructor.

 This design is a flexible way to define basic operations without
@@ -773,7 +774,7 @@ duplicating code. The same way a `Type` subclass represents a set of
 structurally similar types (see previous section), an `Op` subclass
 represents a set of structurally similar operations: operations that
 have the same input/output types, operations that only differ in one
-small detail, etc. If you see common patterns in several Ops that you
+small detail, etc. If you see common patterns in several :class:`Op`\s that you
 want to define, it can be a good idea to abstract out what you can.
 Remember that an `Op` is just an object which satisfies the contract
 described above on this page and that you should use all the tools at

--- a/doc/extending/optimization.rst
+++ b/doc/extending/optimization.rst
@@ -5,12 +5,12 @@
 Graph optimization
 ==================

-In this section we will define a couple optimizations on doubles.
+In this document we will explain how optimizations work and construct a couple of examples.

 .. todo::

   This tutorial goes way too far under the hood, for someone who just wants
-   to add yet another pattern to the libraries in `tensor.basic_opt` for example.
+   to add yet another pattern to the libraries in :py:mod:`aesara.tensor.basic_opt` for example.

   We need another tutorial that covers the decorator syntax, and explains how
   to register your optimization right away.  That's what you need to get
@@ -21,23 +21,22 @@ In this section we will define a couple optimizations on doubles.

 .. note::

-   The optimization tag `cxx_only` is used for optimizations that insert
-   Ops which have no Python implementation (so they only have C code).
+   The optimization tag ``cxx_only`` is used for optimizations that insert
+   :class:`Op`\s which have no Python implementation (so they only have C code).
   Optimizations with this tag are skipped when there is no C++ compiler
   available.

-Global and local optimizations
+Global and Local Optimizations
 ==============================

 First, let's lay out the way optimizations work in Aesara. There are
 two types of optimizations: *global* optimizations and *local*
-optimizations. A global optimization takes a ``FunctionGraph`` object (a
-FunctionGraph is a wrapper around a whole computation graph, you can see its
-:class:`documentation <FunctionGraph>` for more details) and navigates through it
-in a suitable way, replacing some Variables by others in the process. A
+optimizations. A global optimization takes a :class:`FunctionGraph` object (see its
+:doc:`documentation </library/graph/fgraph>` for more details) and navigates through it
+in a suitable way, replacing some :class:`Variable`\s by others in the process. A
 local optimization, on the other hand, is defined as a function on a
 *single* :ref:`apply` node and must return either ``False`` (to mean that
-nothing is to be done) or a list of new Variables that we would like to
+nothing is to be done) or a list of new :class:`Variable`\s that we would like to
 replace the node's outputs with. A :ref:`navigator` is a special kind
 of global optimization which navigates the computation graph in some
 fashion (in topological order, reverse-topological order, random
@@ -61,13 +60,13 @@ methods:

    .. method:: apply(fgraph)

-      This method takes a FunctionGraph object which contains the computation graph
+      This method takes a ``FunctionGraph`` object which contains the computation graph
      and does modifications in line with what the optimization is meant
      to do. This is one of the main methods of the optimizer.

    .. method:: add_requirements(fgraph)

-      This method takes a FunctionGraph object and adds :ref:`features
+      This method takes a ``FunctionGraph`` object and adds :ref:`features
      <libdoc_graph_fgraphfeature>` to it. These features are "plugins" that are needed
      for the ``apply`` method to do its job properly.

@@ -75,7 +74,7 @@ methods:

      This is the interface function called by Aesara.

-      *Default:* this is defined by GlobalOptimizer as ``add_requirement(fgraph);
+      *Default:* this is defined by ``GlobalOptimizer`` as ``add_requirement(fgraph);
      apply(fgraph)``.

 See the section about :class:`FunctionGraph` to understand how to define these
@@ -91,7 +90,7 @@ A local optimization is an object which defines the following methods:

    .. method:: transform(fgraph, node)

-      This method takes a :class:`FunctionGraph` and an :ref:`Apply` node and
+      This method takes a :class:`FunctionGraph` and an :class:`Apply` node and
      returns either ``False`` to signify that no changes are to be done or a
      list of :class:`Variable`\s which matches the length of the node's ``outputs``
      list. When the :class:`LocalOptimizer` is applied by a :class:`NavigatorOptimizer`, the outputs
@@ -110,7 +109,7 @@ For starters, let's define the following simplification:
   \frac{xy}{y} = x

 We will implement it in three ways: using a global optimization, a
-local optimization with a Navigator and then using the PatternSub
+local optimization with a :class:`NavigatorOptimizer` and then using the :class:`PatternSub`
 facility.

 Global optimization
@@ -147,22 +146,22 @@ simplification described above:
   What is add_requirements? Why would we know to do this? Are there other
   requirements we might want to  know about?

-Here's how it works: first, in ``add_requirements``, we add the
-``ReplaceValidate`` :ref:`libdoc_graph_fgraphfeature` located in
-:ref:`libdoc_graph_features`. This feature adds the ``replace_validate``
-method to ``fgraph``, which is an enhanced version of ``replace`` that
+Here's how it works: first, in :meth:`add_requirements`, we add the
+:class:`ReplaceValidate` :ref:`libdoc_graph_fgraphfeature` located in
+:ref:`libdoc_graph_features`. This feature adds the :meth:`replace_validate`
+method to ``fgraph``, which is an enhanced version of :meth:`replace` that
 does additional checks to ensure that we are not messing up the
-computation graph (note: if ``ReplaceValidate`` was already added by
+computation graph (note: if :class:`ReplaceValidate` was already added by
 another optimizer, ``extend`` will do nothing). In a nutshell,
-``features.ReplaceValidate`` grants access to ``fgraph.replace_validate``,
-and ``fgraph.replace_validate`` allows us to replace a Variable with
+:class:`ReplaceValidate` grants access to :meth:`fgraph.replace_validate`,
+and :meth:`fgraph.replace_validate` allows us to replace a :class:`Variable` with
 another while respecting certain validation constraints. You can
 browse the list of :ref:`libdoc_graph_fgraphfeaturelist` and see if some of
 them might be useful to write optimizations with. For example, as an
-exercise, try to rewrite Simplify using :class:`NodeFinder`. (Hint: you
+exercise, try to rewrite ``Simplify`` using :class:`NodeFinder`. (Hint: you
 want to use the method it publishes instead of the call to toposort!)

-Then, in ``apply`` we do the actual job of simplification. We start by
+Then, in :meth:`apply` we do the actual job of simplification. We start by
 iterating through the graph in topological order. For each node
 encountered, we check if it's a ``div`` node. If not, we have nothing
 to do here. If so, we put in ``x``, ``y`` and ``z`` the numerator,
@@ -172,7 +171,7 @@ so we check for that. If the numerator is a multiplication we put the
 two operands in ``a`` and ``b``, so
 we can now say that ``z == (a*b)/y``. If ``y==a`` then ``z==b`` and if
 ``y==b`` then ``z==a``. When either case happens then we can replace
-``z`` by either ``a`` or ``b`` using ``fgraph.replace_validate`` - else we do
+``z`` by either ``a`` or ``b`` using :meth:`fgraph.replace_validate` - else we do
 nothing. You might want to check the documentation about :ref:`variable`
 and :ref:`apply` to get a better understanding of the
 pointer-following game you need to get ahold of the nodes of interest
@@ -212,8 +211,8 @@ optimization you wrote. For example, consider the following:
 Nothing happened here. The reason is: ``add(y, z) != add(y,
 z)``. That is the case for efficiency reasons. To fix this problem we
 first need to merge the parts of the graph that represent the same
-computation, using the ``MergeOptimizer`` defined in
-``aesara.graph.opt``.
+computation, using the :class:`MergeOptimizer` defined in
+:mod:`aesara.graph.opt`.

 >>> from aesara.graph.opt import MergeOptimizer
 >>> MergeOptimizer().optimize(e)  # doctest: +ELLIPSIS
@@ -239,11 +238,11 @@ for this somewhere in the future.
   phase. It is used internally by function and is rarely
   exposed to the end user. You can use it to test out optimizations,
   etc. if you are comfortable with it, but it is recommended to use
-   the function frontend and to interface optimizations with
+   the function front-end and interface optimizations with
   :class:`optdb` (we'll see how to do that soon).


-Local optimization
+Local Optimization
 ------------------

 The local version of the above code would be the following:
@@ -272,18 +271,20 @@ The local version of the above code would be the following:

 .. todo::

-    Fix up previous example... it's bad and incomplete.
+    Fix up previous example.
+

-The definition of transform is the inner loop of the global optimizer,
-where the node is given as argument. If no changes are to be made,
-``False`` must be returned. Else, a list of what to replace the node's
-outputs with must be returned. This list must have the same length as
-node.outputs. If one of node.outputs don't have clients(it is not used
-in the graph), you can put None in the returned list to remove it.
+The definition of the transform is the inner loop of the global optimizer,
+where the node is given as an argument. If no changes are to be made,
+``False`` must be returned; otherwise, a list of replacements for the node's
+outputs must be returned. This list must have the same length as
+:attr:`node.outputs`. If one of :attr:`node.outputs` doesn't have clients
+(i.e. it is not used in the graph), you can put ``None`` in the returned
+list to remove it.

 In order to apply the local optimizer we must use it in conjunction
-with a :ref:`navigator`. Basically, a :ref:`navigator` is a global
-optimizer that loops through all nodes in the graph (or a well-defined
+with a :class:`NavigatorOptimizer`. Basically, a :class:`NavigatorOptimizer` is
+a global optimizer that loops through all nodes in the graph (or a well-defined
 subset of them) and applies one or several local optimizers on them.

 >>> x = float64('x')
@@ -299,21 +300,21 @@ subset of them) and applies one or several local optimizers on them.
 >>> e
 [add(z, mul(x, true_div(z, x)))]

-OpSub, OpRemove, PatternSub
-+++++++++++++++++++++++++++
+:class:`OpSub`, :class:`OpRemove`, :class:`PatternSub`
++++++++++++++++++++++++++++++++++++++++++++++++++++++

-Aesara defines some shortcuts to make LocalOptimizers:
+Aesara defines some shortcuts to make :class:`LocalOptimizer`\s:

 .. function:: OpSub(op1, op2)

-  Replaces all uses of *op1* by *op2*. In other
-  words, the outputs of all :ref:`apply` involving *op1* by the outputs
-  of Apply nodes involving *op2*, where their inputs are the same.
+  Replaces all uses of `op1` by `op2`. In other
+  words, the outputs of all :ref:`apply` involving `op1` by the outputs
+  of :class:`Apply` nodes involving `op2`, where their inputs are the same.

 .. function:: OpRemove(op)

-  Removes all uses of *op* in the following way:
-  if ``y = op(x)`` then ``y`` is replaced by ``x``. *op* must have as many
+  Removes all uses of `op` in the following way:
+  if ``y = op(x)`` then ``y`` is replaced by ``x``. `op` must have as many
  outputs as it has inputs. The first output becomes the first input,
  the second output becomes the second input, and so on.

@@ -347,86 +348,89 @@ Aesara defines some shortcuts to make LocalOptimizers:

 .. note::

-   ``OpSub``, ``OpRemove`` and ``PatternSub`` produce local optimizers, which
+   :class:`OpSub`, :class:`OpRemove` and :class:`PatternSub` produce local optimizers, which
   means that everything we said previously about local optimizers
-   apply: they need to be wrapped in a Navigator, etc.
+   apply: they need to be wrapped in a :class:`NavigatorOptimizer`, etc.

 .. todo::

-   wtf is a navigator?
+   Explain what a :class:`NavigatorOptimizer` is.

-When an optimization can be naturally expressed using ``OpSub``, ``OpRemove``
-or ``PatternSub``, it is highly recommended to use them.
+When an optimization can be naturally expressed using :class:`OpSub`, :class:`OpRemove`
+or :class:`PatternSub`, it is highly recommended to use them.

-WRITEME: more about using PatternSub (syntax for the patterns, how to
-use constraints, etc. - there's some decent doc at
-:class:`PatternSub` for those interested)
+.. todo::

+   More about using :class:`PatternSub` (syntax for the patterns, how to use
+   constraints, etc. - there's some decent doc at :class:`PatternSub` for those
+   interested)


 .. _optdb:

-The optimization database (optdb)
-=================================
+The optimization database (:obj:`optdb`)
+========================================

-Aesara exports a symbol called ``optdb`` which acts as a sort of
+Aesara exports a symbol called :obj:`optdb` which acts as a sort of
 ordered database of optimizations. When you make a new optimization,
 you must insert it at the proper place in the database. Furthermore,
 you can give each optimization in the database a set of tags that can
 serve as a basis for filtering.

-The point of optdb is that you might want to apply many optimizations
+The point of :obj:`optdb` is that you might want to apply many optimizations
 to a computation graph in many unique patterns. For example, you might
-want to do optimization X, then optimization Y, then optimization
-Z. And then maybe optimization Y is an EquilibriumOptimizer containing
-LocalOptimizers A, B and C which are applied on every node of the
-graph until they all fail to change it. If some optimizations act up,
-we want an easy way to turn them off. Ditto if some optimizations are
-very CPU-intensive and we don't want to take the time to apply them.
-
-The optdb system allows us to tag each optimization with a unique name
+want to do optimization X, then optimization Y, then optimization Z. And then
+maybe optimization Y is an :class:`EquilibriumOptimizer` containing :class:`LocalOptimizer`\s A, B
+and C which are applied on every node of the graph until they all fail to change
+it. If some optimizations act up, we want an easy way to turn them off. Ditto if
+some optimizations are very CPU-intensive and we don't want to take the time to
+apply them.
+
+The :obj:`optdb` system allows us to tag each optimization with a unique name
 as well as informative tags such as 'stable', 'buggy' or
 'cpu_intensive', all this without compromising the structure of the
 optimizations.


-Definition of optdb
-------------------
+Definition of :obj:`optdb`
+--------------------------

-optdb is an object which is an instance of
+:obj:`optdb` is an object which is an instance of
 :class:`SequenceDB <optdb.SequenceDB>`,
 itself a subclass of :class:`OptimizationDatabase <optdb.OptimizationDatabase>`.
-There exist (for now) two types of OptimizationDatabase, SequenceDB and EquilibriumDB.
-When given an appropriate OptimizationQuery, OptimizationDatabase objects build an Optimizer matching
+There exist (for now) two types of :class:`OptimizationDatabase`, :class:`SequenceDB` and :class:`EquilibriumDB`.
+When given an appropriate :class:`OptimizationQuery`, :class:`OptimizationDatabase` objects build an :class:`Optimizer` matching
 the query.

-A SequenceDB contains Optimizer or OptimizationDatabase objects. Each of them
+A :class:`SequenceDB` contains :class:`Optimizer` or :class:`OptimizationDatabase` objects. Each of them
 has a name, an arbitrary number of tags and an integer representing their order
-in the sequence. When a OptimizationQuery is applied to a SequenceDB, all Optimizers whose
-tags match the query are inserted in proper order in a SequenceOptimizer, which
-is returned. If the SequenceDB contains OptimizationDatabase instances, the OptimizationQuery will be passed
-to them as well and the optimizers they return will be put in their places.
-
-An EquilibriumDB contains LocalOptimizer or OptimizationDatabase objects. Each of them
-has a name and an arbitrary number of tags. When a OptimizationQuery is applied to
-an EquilibriumDB, all LocalOptimizers that match the query are
-inserted into an EquilibriumOptimizer, which is returned. If the
-SequenceDB contains OptimizationDatabase instances, the OptimizationQuery will be passed to them as
-well and the LocalOptimizers they return will be put in their places
-(note that as of yet no OptimizationDatabase can produce LocalOptimizer objects, so this
+in the sequence. When an :class:`OptimizationQuery` is applied to a :class:`SequenceDB`, all :class:`Optimizer`\s whose
+tags match the query are inserted in proper order in a :class:`SequenceOptimizer`, which
+is returned. If the :class:`SequenceDB` contains :class:`OptimizationDatabase`
+instances, the :class:`OptimizationQuery` will be passed to them as well and the
+optimizers they return will be put in their places.
+
+An :class:`EquilibriumDB` contains :class:`LocalOptimizer` or :class:`OptimizationDatabase` objects. Each of them
+has a name and an arbitrary number of tags. When an :class:`OptimizationQuery` is applied to
+an :class:`EquilibriumDB`, all :class:`LocalOptimizer`\s that match the query are
+inserted into an :class:`EquilibriumOptimizer`, which is returned. If the
+:class:`EquilibriumDB` contains :class:`OptimizationDatabase` instances, the
+:class:`OptimizationQuery` will be passed to them as well and the
+:class:`LocalOptimizer`\s they return will be put in their places
+(note that as of yet no :class:`OptimizationDatabase` can produce :class:`LocalOptimizer` objects, so this
 is a moot point).

-Aesara contains one principal OptimizationDatabase object, :class:`optdb`, which
+Aesara contains one principal :class:`OptimizationDatabase` object, :obj:`optdb`, which
 contains all of Aesara's optimizers with proper tags. It is
-recommended to insert new Optimizers in it. As mentioned previously,
-optdb is a SequenceDB, so, at the top level, Aesara applies a sequence
+recommended to insert new :class:`Optimizer`\s in it. As mentioned previously,
+:obj:`optdb` is a :class:`SequenceDB`, so, at the top level, Aesara applies a sequence
 of global optimizations to the computation graphs.


 :class:`OptimizationQuery`
 --------------------------

-A OptimizationQuery is built by the following call:
+An :class:`OptimizationQuery` is built by the following call:

 .. code-block:: python

@@ -437,37 +441,37 @@ A OptimizationQuery is built by the following call:
    .. attribute:: include

       A set of tags (a tag being a string) such that every
-       optimization obtained through this OptimizationQuery must have **one** of the tags
+       optimization obtained through this :class:`OptimizationQuery` must have **one** of the tags
       listed. This field is required and basically acts as a starting point
       for the search.

    .. attribute:: require

       A set of tags such that every optimization obtained
-       through this OptimizationQuery must have **all** of these tags.
+       through this :class:`OptimizationQuery` must have **all** of these tags.

    .. attribute:: exclude

       A set of tags such that every optimization obtained
-       through this OptimizationQuery must have **none** of these tags.
+       through this :class:`OptimizationQuery` must have **none** of these tags.

    .. attribute:: subquery

-       optdb can contain sub-databases; subquery is a
-       dictionary mapping the name of a sub-database to a special OptimizationQuery.
-       If no subquery is given for a sub-database, the original OptimizationQuery will be
+       :obj:`optdb` can contain sub-databases; subquery is a
+       dictionary mapping the name of a sub-database to a special :class:`OptimizationQuery`.
+       If no subquery is given for a sub-database, the original :class:`OptimizationQuery` will be
       used again.

-Furthermore, a OptimizationQuery object includes three methods, ``including``,
-``requiring`` and ``excluding`` which each produce a new OptimizationQuery object
-with include, require and exclude sets refined to contain the new [WRITEME]
+Furthermore, an :class:`OptimizationQuery` object includes three methods, :meth:`including`,
+:meth:`requiring` and :meth:`excluding`, which each produce a new :class:`OptimizationQuery` object
+with the include, require, and exclude sets refined to contain the new entries.


 Examples
 --------

-Here are a few examples of how to use a OptimizationQuery on optdb to produce an
-Optimizer:
+Here are a few examples of how to use an :class:`OptimizationQuery` on :obj:`optdb` to produce an
+:class:`Optimizer`:

 .. testcode::

@@ -488,35 +492,35 @@ Optimizer:
                                           exclude=['inplace']))


-Registering an Optimizer
------------------------
+Registering an :class:`Optimizer`
+---------------------------------

 Let's say we have a global optimizer called ``simplify``. We can add
-it to ``optdb`` as follows:
+it to :obj:`optdb` as follows:

 .. testcode::

   # optdb.register(name, optimizer, order, *tags)
   optdb.register('simplify', simplify, 0.5, 'fast_run')

-Once this is done, the FAST_RUN mode will automatically include your
-optimization (since you gave it the 'fast_run' tag). Of course,
+Once this is done, the ``FAST_RUN`` mode will automatically include your
+optimization (since you gave it the ``'fast_run'`` tag). Of course,
 already-compiled functions will see no change. The 'order' parameter
 (what it means and how to choose it) will be explained in
 :ref:`optdb-structure` below.



-Registering a LocalOptimizer
----------------------------
+Registering a :class:`LocalOptimizer`
+-------------------------------------

-LocalOptimizers may be registered in two ways:
+:class:`LocalOptimizer`\s may be registered in two ways:

-* Wrap them in a Navigator and insert them like a global optimizer
+* Wrap them in a :class:`NavigatorOptimizer` and insert them like a global optimizer
  (see previous section).
-* Put them in an EquilibriumDB.
+* Put them in an :class:`EquilibriumDB`.

-Aesara defines two EquilibriumDBs where you can put local
+Aesara defines two :class:`EquilibriumDB`\s in which one can put local
 optimizations:


@@ -543,7 +547,7 @@ optimizations:


 For each group, all optimizations of the group that are selected by
-the OptimizationQuery will be applied on the graph over and over again until none
+the :class:`OptimizationQuery` will be applied on the graph over and over again until none
 of them is applicable, so keep that in mind when designing it: check
 carefully that your optimization leads to a fixpoint (a point where it
 cannot apply anymore) at which point it returns ``False`` to indicate its
@@ -554,10 +558,10 @@ two or more states and nothing will get done.

 .. _optdb-structure:

-optdb structure
---------------
+:obj:`optdb` structure
+----------------------

-optdb contains the following Optimizers and sub-DBs, with the given
+:obj:`optdb` contains the following :class:`Optimizer`\s and sub-DBs, with the given
 priorities and tags:

 +-------+---------------------+------------------------------+
@@ -605,8 +609,8 @@ under the assumption there are no inplace operations.

 .. _navigator:

-Navigator
-------------------
+:class:`NavigatorOptimizer`
+---------------------------

 WRITEME

@@ -651,12 +655,12 @@ return. The C code generation and compilation is cached, so the first
 time you compile a function and the following ones could take different
 amount of execution time.

-Detailed profiling of Aesara optimizer
--------------------------------------
+Detailed profiling of Aesara optimizations
+------------------------------------------

 You can get more detailed profiling information about the Aesara
-optimizer phase by setting to `True` the Aesara flags
-:attr:`config.profile_optimizer` (this require `config.profile` to be `True`
+optimizer phase by setting the Aesara flag
+:attr:`config.profile_optimizer` to ``True`` (this requires ``config.profile`` to be ``True``
 as well).

 This will output something like this:
@@ -852,15 +856,15 @@ This will output something like this:
 To understand this profile here is some explanation of how optimizations work:

 * Optimizations are organized in an hierarchy. At the top level, there
-  is a ``SeqOptimizer`` (Sequence Optimizer). It contains other optimizers,
+  is a :class:`SeqOptimizer`. It contains other optimizers,
  and applies them in the order they were specified. Those sub-optimizers can be
  of other types, but are all *global* optimizers.

-* Each Optimizer in the hierarchy will print some stats about
+* Each :class:`Optimizer` in the hierarchy will print some stats about
  itself. The information that it prints depends of the type of the
  optimizer.

-* The SeqOptimizer will print some stats at the start:
+* The :class:`SeqOptimizer` will print some stats at the start:

    .. code-block:: none

@@ -881,10 +885,12 @@ To understand this profile here is some explanation of how optimizations work:
  * 0.028s means it spent that time calls to ``fgraph.validate()``
  * 0.131s means it spent that time for callbacks. This is a mechanism that can trigger other execution when there is a change to the FunctionGraph.
  * ``time      - (name, class, index) - validate time`` tells how the information for each sub-optimizer get printed.
-  * All other instances of ``SeqOptimizer`` are described like this. In particular, some sub-optimizer from OPT_FAST_RUN that are also ``SeqOptimizer``.
+  * All other instances of :class:`SeqOptimizer` are described like this. In
+    particular, this applies to some sub-optimizers from ``OPT_FAST_RUN`` that
+    are also :class:`SeqOptimizer`\s.


-* The ``SeqOptimizer`` will print some stats at the start:
+* The :class:`SeqOptimizer` will print some stats at the start:

    .. code-block:: none

@@ -955,14 +961,14 @@ To understand this profile here is some explanation of how optimizations work:
             0.000s - local_subtensor_merge

  * ``0.751816s - ('canonicalize', 'EquilibriumOptimizer', 4) - 0.004s``
-    This line is from ``SeqOptimizer``, and indicates information related
+    This line is from :class:`SeqOptimizer`, and indicates information related
    to a sub-optimizer. It means that this sub-optimizer took
    a total of .7s. Its name is ``'canonicalize'``. It is an
-    ``EquilibriumOptimizer``. It was executed at index 4 by the
-    ``SeqOptimizer``. It spent 0.004s in the *validate* phase.
-  * All other lines are from the profiler of the ``EquilibriumOptimizer``.
+    :class:`EquilibriumOptimizer`. It was executed at index 4 by the
+    :class:`SeqOptimizer`. It spent 0.004s in the *validate* phase.
+  * All other lines are from the profiler of the :class:`EquilibriumOptimizer`.

-  * An ``EquilibriumOptimizer`` does multiple passes on the Apply nodes from
+  * An :class:`EquilibriumOptimizer` does multiple passes on the Apply nodes from
    the graph, trying to apply local and global optimizations.
    Conceptually, it tries to execute all global optimizations,
    and to apply all local optimizations on all
@@ -977,29 +983,29 @@ To understand this profile here is some explanation of how optimizations work:
    was 117.

  * Then it prints some global timing information: it spent 0.029s in
-    ``io_toposort``, all local optimizers took 0.687s together for all
+    :func:`io_toposort`, all local optimizers took 0.687s together for all
    passes, and global optimizers took a total of 0.010s.

  * Then we print the timing for each pass, the optimization that
    got applied, and the number of time they got applied. For example,
-    in pass 0, the ``local_dimshuffle_lift`` optimizer changed the graph 9
+    in pass 0, the :func:`local_dimshuffle_lift` optimizer changed the graph 9
    time.

  * Then we print the time spent in each optimizer, the number of times
    they changed the graph and the number of nodes they introduced in
    the graph.

-  * Optimizations with that pattern `local_op_lift` means that a node
+  * Optimizations named with the pattern ``local_op_lift`` mean that a node
    with that op will be replaced by another node, with the same op,
    but will do computation closer to the inputs of the graph.
    For instance, ``local_op(f(x))`` getting replaced by ``f(local_op(x))``.

-  * Optimization with that pattern `local_op_sink` is the opposite of
-    `lift`. For instance ``f(local_op(x))`` getting replaced by ``local_op(f(x))``.
+  * Optimizations named with the pattern ``local_op_sink`` are the opposite of
+    "lift". For instance, ``f(local_op(x))`` getting replaced by ``local_op(f(x))``.

  * Local optimizers can replace any arbitrary node in the graph, not
    only the node it received as input. For this, it must return a
-    dict. The keys being nodes to replace and the
+    ``dict``, the keys being nodes to replace and the
    values being the corresponding replacement.

    This is useful to replace a client of the node received as

--- a/doc/extending/pipeline.rst
+++ b/doc/extending/pipeline.rst
@@ -24,10 +24,10 @@ in the :ref:`graphstructures` article.
 Compilation of the computation graph
 ------------------------------------

-Once the user has built a computation graph, she can use
-``aesara.function`` in order to make one or more functions that
+Once the user has built a computation graph, they can use
+:func:`aesara.function` in order to make one or more functions that
 operate on real data. function takes a list of input :ref:`Variables
-<variable>` as well as a list of output Variables that define a
+<variable>` as well as a list of output :class:`Variable`\s that define a
 precise subgraph corresponding to the function(s) we want to define,
 compile that subgraph and produce a callable.

@@ -35,32 +35,32 @@ Here is an overview of the various steps that are done with the
 computation graph in the compilation phase:


-Step 1 - Create a FunctionGraph
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Step 1 - Create a :class:`FunctionGraph`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 The subgraph given by the end user is wrapped in a structure called
-*FunctionGraph*. That structure defines several hooks on adding and
+:class:`FunctionGraph`. That structure defines several hooks on adding and
 removing (pruning) nodes as well as on modifying links between nodes
 (for example, modifying an input of an :ref:`apply` node) (see the
 article about :ref:`libdoc_graph_fgraph` for more information).

-FunctionGraph provides a method to change the input of an Apply node from one
-Variable to another and a more high-level method to replace a Variable
+:class:`FunctionGraph` provides a method to change the input of an :class:`Apply` node from one
+:class:`Variable` to another and a more high-level method to replace a :class:`Variable`
 with another. This is the structure that :ref:`Optimizers
 <optimization>` work on.

 Some relevant :ref:`Features <libdoc_graph_fgraphfeature>` are typically added to the
-FunctionGraph, namely to prevent any optimization from operating inplace on
+:class:`FunctionGraph`, namely to prevent any optimization from operating inplace on
 inputs declared as immutable.


-Step 2 - Execute main Optimizer
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Step 2 - Execute main :class:`Optimizer`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Once the FunctionGraph is made, an :term:`optimizer` is produced by
-the :term:`mode` passed to ``function`` (the Mode basically has two
-important fields, ``linker`` and ``optimizer``). That optimizer is
-applied on the FunctionGraph using its optimize() method.
+Once the :class:`FunctionGraph` is made, an :term:`optimizer` is produced by
+the :term:`mode` passed to :func:`function` (the :class:`Mode` basically has two
+important fields, :attr:`linker` and :attr:`optimizer`). That optimizer is
+applied on the :class:`FunctionGraph` using its :meth:`Optimizer.optimize` method.

 The optimizer is typically obtained through :attr:`optdb`.

@@ -69,11 +69,10 @@ Step 3 - Execute linker to obtain a thunk
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Once the computation graph is optimized, the :term:`linker` is
-extracted from the Mode. It is then called with the FunctionGraph as
-argument to
-produce a ``thunk``, which is a function with no arguments that
+extracted from the :class:`Mode`. It is then called with the :class:`FunctionGraph` as
+argument to produce a ``thunk``, which is a function with no arguments that
 returns nothing. Along with the thunk, one list of input containers (a
-`aesara.link.basic.Container` is a sort of object that wraps another and does
+:class:`aesara.link.basic.Container` is a sort of object that wraps another and does
 type casting) and one list of output containers are produced,
 corresponding to the input and output :class:`Variable`\s as well as the updates
 defined for the inputs when applicable. To perform the computations,
@@ -83,18 +82,18 @@ where the thunk put them.

 Typically, the linker calls the ``toposort`` method in order to obtain
 a linear sequence of operations to perform. How they are linked
-together depends on the Linker used. The `CLinker` produces a single
-block of C code for the whole computation, whereas the `OpWiseCLinker`
+together depends on the Linker used. The :class:`CLinker` produces a single
+block of C code for the whole computation, whereas the :class:`OpWiseCLinker`
 produces one thunk for each individual operation and calls them in
 sequence.

 The linker is where some options take effect: the ``strict`` flag of
 an input makes the associated input container do type checking. The
-``borrow`` flag of an output, if False, adds the output to a
+``borrow`` flag of an output, if ``False``, adds the output to a
 ``no_recycling`` list, meaning that when the thunk is called the
 output containers will be cleared (if they stay there, as would be the
-case if ``borrow`` was True, the thunk would be allowed to reuse (or
-"recycle") the storage).
+case if ``borrow`` was ``True``, the thunk would be allowed to reuse--or
+"recycle"--the storage).

 .. note::

@@ -119,6 +118,6 @@ Step 4 - Wrap the thunk in a pretty package
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 The thunk returned by the linker along with input and output
-containers is unwieldy. ``function`` hides that complexity away so
+containers is unwieldy. :func:`aesara.function` hides that complexity away so
 that it can be used like a normal function with arguments and return
 values.
--- a/doc/extending/tips.rst
+++ b/doc/extending/tips.rst
-
-
 ====
 Tips
 ====
@@ -8,15 +6,15 @@ Tips
 Reusing outputs
 ===============

-WRITEME
+.. todo:: Write this.


-Don't define new Ops unless you have to
-=======================================
+Don't define new :class:`Op`\s unless you have to
+=================================================

-It is usually not useful to define Ops that can be easily
-implemented using other already existing Ops. For example, instead of
-writing a "sum_square_difference" Op, you should probably just write a
+It is usually not useful to define :class:`Op`\s that can be easily
+implemented using other already existing :class:`Op`\s. For example, instead of
+writing a "sum_square_difference" :class:`Op`, you should probably just write a
 simple function:

 .. testcode::
@@ -33,23 +31,23 @@ a custom implementation would probably only bother to support
 contiguous vectors/matrices of doubles...


-Use Aesara's high order Ops when applicable
-===========================================
+Use Aesara's high order :class:`Op`\s when applicable
+=====================================================

-Aesara provides some generic Op classes which allow you to generate a
-lot of Ops at a lesser effort. For instance, Elemwise can be used to
-make :term:`elemwise` operations easily whereas DimShuffle can be
-used to make transpose-like transformations. These higher order Ops
-are mostly Tensor-related, as this is Aesara's specialty.
+Aesara provides some generic :class:`Op` classes which allow you to generate a
+lot of :class:`Op`\s at a lesser effort. For instance, :class:`Elemwise` can be used to
+make :term:`elemwise` operations easily, whereas :class:`DimShuffle` can be
+used to make transpose-like transformations. These higher order :class:`Op`\s
+are mostly tensor-related, as this is Aesara's specialty.


 .. _opchecklist:

-Op Checklist
-============
+:class:`Op` Checklist
+=====================

 Use this list to make sure you haven't forgotten anything when
-defining a new Op. It might not be exhaustive but it covers a lot of
+defining a new :class:`Op`. It might not be exhaustive but it covers a lot of
 common mistakes.

-WRITEME
+.. todo:: Write a list.
--- a/doc/extending/type.rst
+++ b/doc/extending/type.rst
 .. _aesara_type:

-======================
-Making the double type
-======================
+===============================
+Making the double :class:`Type`
+===============================


 .. _type_contract:

-Type's contract
-===============
+:class:`Type`'s contract
+========================

-In Aesara's framework, a ``Type`` (:class:`Type`)
-is any object which defines the following
-methods. To obtain the default methods described below, the Type should
-be an instance of ``Type`` or should be an instance of a
-subclass of ``Type``. If you will write all methods yourself,
-you need not use an instance of ``Type``.
+In Aesara's framework, a :class:`Type` is any object which defines the following
+methods. To obtain the default methods described below, the :class:`Type` should be an
+instance of `Type` or should be an instance of a subclass of `Type`. If you
+will write all methods yourself, you need not use an instance of `Type`.

 Methods with default arguments must be defined with the same signature,
 i.e.  the same default argument names and values. If you wish to add
@@ -26,8 +24,8 @@ default values.

    .. method:: filter(value, strict=False, allow_downcast=None)

-      This casts a value to match the Type and returns the
-      cast value. If ``value`` is incompatible with the Type,
+      This casts a value to match the :class:`Type` and returns the
+      cast value. If ``value`` is incompatible with the :class:`Type`,
      the method must raise an exception. If ``strict`` is True, ``filter`` must return a
      reference to ``value`` (i.e. casting prohibited).
      If ``strict`` is False, then casting may happen, but downcasting should
@@ -55,9 +53,9 @@ default values.

    .. method:: is_valid_value(value)

-      Returns True iff the value is compatible with the Type. If
+      Returns True iff the value is compatible with the :class:`Type`. If
      ``filter(value, strict = True)`` does not raise an exception, the
-      value is compatible with the Type.
+      value is compatible with the :class:`Type`.

      *Default:* True iff ``filter(value, strict=True)`` does not raise
      an exception.
@@ -71,19 +69,19 @@ default values.
    .. method:: values_eq_approx(a, b)

      Returns True iff ``a`` and ``b`` are approximately equal, for a
-      definition of "approximately" which varies from Type to Type.
+      definition of "approximately" which varies from :class:`Type` to :class:`Type`.

      *Default:* ``values_eq(a, b)``

    .. method:: make_variable(name=None)

-      Makes a :term:`Variable` of this Type with the specified name, if
-      ``name`` is not ``None``. If ``name`` is ``None``, then the Variable does
-      not have a name. The Variable will have its ``type`` field set to
-      the Type object.
+      Makes a :term:`Variable` of this :class:`Type` with the specified name, if
+      ``name`` is not ``None``. If ``name`` is ``None``, then the `Variable` does
+      not have a name. The `Variable` will have its ``type`` field set to
+      the :class:`Type` object.

-      *Default:* there is a generic definition of this in Type. The
-      Variable's ``type`` will be the object that defines this method (in
+      *Default:* there is a generic definition of this in `Type`. The
+      `Variable`'s ``type`` will be the object that defines this method (in
      other words, ``self``).

    .. method:: __call__(name=None)
@@ -94,13 +92,13 @@ default values.

    .. method:: __eq__(other)

-      Used to compare Type instances themselves
+      Used to compare :class:`Type` instances themselves

      *Default:* ``object.__eq__``

    .. method:: __hash__()

-      Types should not be mutable, so it should be OK to define a hash
+      :class:`Type`\s should not be mutable, so it should be OK to define a hash
      function.  Typically this function should hash all of the terms
      involved in ``__eq__``.

@@ -108,7 +106,7 @@ default values.

    .. method:: get_shape_info(obj)

-      Optional. Only needed to profile the memory of this Type of object.
+      Optional. Only needed to profile the memory of this :class:`Type` of object.

      Return the information needed to compute the memory size of ``obj``.

@@ -124,7 +122,7 @@ default values.
      ``get_size()`` will be called on the output of this function
      when printing the memory profile.

-      :param obj: The object that this Type represents during execution
+      :param obj: The object that this :class:`Type` represents during execution
      :return: Python object that ``self.get_size()`` understands


@@ -132,7 +130,7 @@ default values.

        Number of bytes taken by the object represented by shape_info.

-        Optional. Only needed to profile the memory of this Type of object.
+        Optional. Only needed to profile the memory of this :class:`Type` of object.

        :param shape_info: the output of the call to get_shape_info()
        :return: the number of bytes taken by the object described by
@@ -150,16 +148,16 @@ default values.

    .. method:: may_share_memory(a, b)

-        Optional to run, but mandatory for DebugMode. Return True if the Python
-        objects `a` and `b` could share memory. Return False
-        otherwise. It is used to debug when Ops did not declare memory
+        Optional to run, but mandatory for `DebugMode`. Return ``True`` if the Python
+        objects `a` and `b` could share memory. Return ``False``
+        otherwise. It is used to debug when :class:`Op`\s did not declare memory
        aliasing between variables. Can be a static method.
-        It is highly recommended to use and is mandatory for Type in Aesara
-        as our buildbot runs in DebugMode.
+        It is highly recommended to use and is mandatory for :class:`Type` in Aesara
+        as our buildbot runs in `DebugMode`.

-For each method, the *default* is what ``Type`` defines
-for you. So, if you create an instance of ``Type`` or an
-instance of a subclass of ``Type``, you
+For each method, the *default* is what `Type` defines
+for you. So, if you create an instance of `Type` or an
+instance of a subclass of `Type`, you
 must define ``filter``. You might want to override ``values_eq_approx``,
 as well as ``values_eq``. The other defaults generally need not be
 overridden.
@@ -189,7 +187,7 @@ with it as argument.
 Defining double
 ===============

-We are going to base Type ``double`` on Python's ``float``. We
+We are going to base :class:`Type` ``double`` on Python's ``float``. We
 must define ``filter`` and shall override ``values_eq_approx``.


@@ -219,7 +217,7 @@ must define ``filter`` and shall override ``values_eq_approx``.

 If ``strict`` is True we need to return ``x``. If ``strict`` is True and ``x`` is not a
 ``float`` (for example, ``x`` could easily be an ``int``) then it is
-incompatible with our Type and we must raise an exception.
+incompatible with our :class:`Type` and we must raise an exception.

 If ``strict is False`` then we are allowed to cast ``x`` to a ``float``,
 so if ``x`` is an ``int`` it we will return an equivalent ``float``.
@@ -238,7 +236,7 @@ when ``allow_downcast`` is False, i.e. no precision loss is allowed.
       return abs(x - y) / (abs(x) + abs(y)) < tolerance

 The second method we define is ``values_eq_approx``. This method
-allows approximate comparison between two values respecting our Type's
+allows approximate comparison between two values respecting our :class:`Type`'s
 constraints. It might happen that an optimization changes the computation
 graph in such a way that it produces slightly different variables, for
 example because of numerical instability like rounding errors at the
@@ -259,9 +257,9 @@ chose to be 1e-4.
 **Putting them together**

 What we want is an object that respects the aforementioned
-contract. Recall that Type defines default implementations for all
+contract. Recall that :class:`Type` defines default implementations for all
 required methods of the interface, except ``filter``. One way to make
-the Type is to instantiate a plain Type and set the needed fields:
+the :class:`Type` is to instantiate a plain :class:`Type` and set the needed fields:

 .. testcode::

@@ -272,7 +270,7 @@ the Type is to instantiate a plain Type and set the needed fields:
   double.values_eq_approx = values_eq_approx


-Another way to make this Type is to make a subclass of ``Type``
+Another way to make this :class:`Type` is to make a subclass of `Type`
 and define ``filter`` and ``values_eq_approx`` in the subclass:

 .. code-block:: python
@@ -291,12 +289,12 @@ and define ``filter`` and ``values_eq_approx`` in the subclass:

   double = Double()

-``double`` is then an instance of Type ``Double``, which in turn is a
-subclass of ``Type``.
+``double`` is then an instance of :class:`Type` `Double`, which in turn is a
+subclass of `Type`.

 There is a small issue with defining ``double`` this way. All
-instances of ``Double`` are technically the same Type. However, different
-``Double`` Type instances do not compare the same:
+instances of `Double` are technically the same :class:`Type`. However, different
+`Double` :class:`Type` instances do not compare the same:

 .. testsetup::

@@ -335,9 +333,9 @@ instances of ``Double`` are technically the same Type. However, different
 >>> double1 == double2
 False

-Aesara compares Types using ``==`` to see if they are the same.
-This happens in DebugMode.  Also, Ops can (and should) ensure that their inputs
-have the expected Type by checking something like ``if x.type == lvector``.
+Aesara compares :class:`Type`\s using ``==`` to see if they are the same.
+This happens in :class:`DebugMode`.  Also, :class:`Op`\s can (and should) ensure that their inputs
+have the expected :class:`Type` by checking something like ``if x.type == lvector``.

 There are several ways to make sure that equality testing works properly:

@@ -349,48 +347,48 @@ There are several ways to make sure that equality testing works properly:
        def __eq__(self, other):
            return type(self) is Double and type(other) is Double

- #. Override ``Double.__new__`` to always return the same instance.
+ #. Override :meth:`Double.__new__` to always return the same instance.
 #. Hide the Double class and only advertise a single instance of it.

 Here we will prefer the final option, because it is the simplest.
-Ops in the Aesara code often define the ``__eq__`` method though.
+:class:`Op`\s in the Aesara code often define the :meth:`__eq__` method though.


 Untangling some concepts
 ========================

-Initially, confusion is common on what an instance of Type is versus
-a subclass of Type or an instance of Variable. Some of this confusion is
-syntactic. A Type is any object which has fields corresponding to the
-functions defined above. The Type class provides sensible defaults for
-all of them except ``filter``, so when defining new Types it is natural
-to subclass Type. Therefore, we often end up with Type subclasses and
+Initially, confusion is common on what an instance of :class:`Type` is versus
+a subclass of :class:`Type` or an instance of :class:`Variable`. Some of this confusion is
+syntactic. A :class:`Type` is any object which has fields corresponding to the
+functions defined above. The :class:`Type` class provides sensible defaults for
+all of them except ``filter``, so when defining new :class:`Type`\s it is natural
+to subclass :class:`Type`. Therefore, we often end up with :class:`Type` subclasses and
 it is can be confusing what these represent semantically. Here is an
 attempt to clear up the confusion:


-* An **instance of Type** (or an instance of a subclass)
+* An **instance of** :class:`Type` (or an instance of a subclass)
  is a set of constraints on real data. It is
  akin to a primitive type or class in C. It is a *static*
  annotation.

-* An **instance of Variable** symbolizes data nodes in a data flow
+* An **instance of** :class:`Variable` symbolizes data nodes in a data flow
  graph. If you were to parse the C expression ``int x;``, ``int``
-  would be a Type instance and ``x`` would be a Variable instance of
-  that Type instance. If you were to parse the C expression ``c = a +
-  b;``, ``a``, ``b`` and ``c`` would all be Variable instances.
+  would be a :class:`Type` instance and ``x`` would be a :class:`Variable` instance of
+  that :class:`Type` instance. If you were to parse the C expression ``c = a +
+  b;``, ``a``, ``b`` and ``c`` would all be :class:`Variable` instances.

-* A **subclass of Type** is a way of implementing
-  a set of Type instances that share
+* A **subclass of** :class:`Type` is a way of implementing
+  a set of :class:`Type` instances that share
  structural similarities. In the ``double`` example that we are doing,
-  there is actually only one Type in that set, therefore the subclass
+  there is actually only one :class:`Type` in that set, therefore the subclass
  does not represent anything that one of its instances does not. In this
  case it is a singleton, a set with one element. However, the
  :class:`TensorType`
-  class in Aesara (which is a subclass of Type)
+  class in Aesara (which is a subclass of :class:`Type`)
  represents a set of types of tensors
  parametrized by their data type or number of dimensions. We could say
-  that subclassing Type builds a hierarchy of Types which is based upon
+  that subclassing :class:`Type` builds a hierarchy of :class:`Type`\s which is based upon
  structural similarity rather than compatibility.



--- a/doc/extending/unittest.rst
+++ b/doc/extending/unittest.rst
@@ -12,11 +12,11 @@ stressed enough!

 Unit Testing revolves around the following principles:

-* ensuring correctness: making sure that your Op, Type or Optimization
-  works in the way you intended it to work. It is important for this
-  testing to be as thorough as possible: test not only the obvious
-  cases, but more importantly the corner cases which are more likely
-  to trigger bugs down the line.
+* ensuring correctness: making sure that your :class:`Op`, :class:`Type` or
+  optimization works in the way you intended it to work. It is important for
+  this testing to be as thorough as possible: test not only the obvious cases,
+  but more importantly the corner cases which are more likely to trigger bugs
+  down the line.

 * test all possible failure paths. This means testing that your code
  fails in the appropriate manner, by raising the correct errors when
@@ -30,39 +30,43 @@ Unit Testing revolves around the following principles:
  that person to produce a fix. If this sounds like too much of a
  burden... then good! APIs aren't meant to be changed on a whim!

-This page is in no way meant to replace tutorials on Python's unittest
-module, for this we refer the reader to the `official documentation
-<http://docs.python.org/library/unittest.html>`_.  We will however
-address certain specifics about how unittests relate to aesara.

-PyTest Primer
-===============
+We use `pytest <https://docs.pytest.org>`_.  New tests should
+generally take the form of a test function, and each check within a test should
+involve an assertion of some kind.

-We use pytest now! New tests should mostly be functions, with assertions
+.. note::
+
+  Tests that check for a lack of failures (e.g. that ``Exception``\s aren't
+  raised) are generally *not* good tests.  Instead, assert something more
+  relevant and explicit about the expected outputs or side-effects of the code
+  being tested.

-How to Run Unit Tests ?
-----------------------

-Mostly `pytest aesara/`
+How to Run Unit Tests
+---------------------
+
+Mostly ``pytest aesara/``

 Folder Layout
 -------------

-Files containing unittests should be prefixed with the word "test".
+Files containing unit tests should be prefixed with the word "test".

-Optimally every python module should have a unittest file associated
+Ideally, every Python module should have a unit test file associated
 with it, as shown below. Unit tests that test functionality of module
 ``<module>.py`` should therefore be stored in
 ``tests/<sub-package>/test_<module>.py``::

    Aesara/aesara/tensor/basic.py
-    Aesara/aesara/tensor/elemwise.py
    Aesara/tests/tensor/test_basic.py
+
+    Aesara/aesara/tensor/elemwise.py
    Aesara/tests/tensor/test_elemwise.py


-How to Write a Unittest
-=======================
+How to Write a Unit Test
+========================

 Test Cases and Methods
 ----------------------
@@ -74,7 +78,7 @@ concept.
 Test cases should be functions or classes prefixed with the word "test".

 Test methods should be as specific as possible and cover a particular
-aspect of the problem. For example, when testing the ``Dot`` ``Op``, one
+aspect of the problem. For example, when testing the :class:`Dot` :class:`Op`, one
 test method could check for validity, while another could verify that
 the proper errors are raised when inputs have invalid dimensions.

@@ -101,11 +105,11 @@ Example:
        assert np.array_equal(f(self.avals, self.bvals), numpy.dot(self.avals, self.bvals))


-Creating an Op Unit Test
-========================
+Creating an :class:`Op` Unit Test
+=================================

 A few tools have been developed to help automate the development of
-unit tests for Aesara Ops.
+unit tests for Aesara :class:`Op`\s.


 .. _validating_grad:
@@ -113,8 +117,8 @@ unit tests for Aesara Ops.
 Validating the Gradient
 -----------------------

-The ``verify_grad`` function can be used to validate that the ``grad``
-function of your Op is properly implemented. ``verify_grad`` is based
+The :func:`verify_grad` function can be used to validate that the :meth:`Op.grad`
+method of your :class:`Op` is properly implemented. :func:`verify_grad` is based
 on the Finite Difference Method where the derivative of function ``f``
 at point ``x`` is approximated as:

@@ -132,24 +136,24 @@ at point ``x`` is approximated as:
 * compares the two values. The tests passes if they are equal to
  within a certain tolerance.

-Here is the prototype for the verify_grad function.
+Here is the prototype for the :func:`verify_grad` function.

 .. code-block:: python

    def verify_grad(fun, pt, n_tests=2, rng=None, eps=1.0e-7, abs_tol=0.0001, rel_tol=0.0001):

-``verify_grad`` raises an Exception if the difference between the analytic gradient and
+:func:`verify_grad` raises an ``Exception`` if the difference between the analytic gradient and
 numerical gradient (computed through the Finite Difference Method) of a random
-projection of the fun's output to a scalar  exceeds
-both the given absolute and relative tolerances.
+projection of ``fun``'s output to a scalar exceeds both the given absolute and
+relative tolerances.

 The parameters are as follows:

 * ``fun``: a Python function that takes Aesara variables as inputs,
  and returns an Aesara variable.
-  For instance, an Op instance with a single output is such a function.
+  For instance, an :class:`Op` instance with a single output is such a function.
  It can also be a Python function that calls an op with some of its
-  inputs being fixed to specific values, or that combine multiple ops.
+  inputs being fixed to specific values, or that combines multiple :class:`Op`\s.

 * ``pt``: the list of numpy.ndarrays to use as input values

@@ -181,7 +185,7 @@ symbolic variable:

        aesara.gradient.verify_grad(fun, [x_val, y_val, z_val], rng=rng)

-Here is an example showing how to use ``verify_grad`` on an Op instance:
+Here is an example showing how to use :func:`verify_grad` on an :class:`Op` instance:

 .. testcode::

@@ -193,9 +197,9 @@ Here is an example showing how to use ``verify_grad`` on an Op instance:
        aesara.gradient.verify_grad(tensor.Flatten(), [a_val], rng=rng)

 Here is another example, showing how to verify the gradient w.r.t. a subset of
-an Op's inputs. This is useful in particular when the gradient w.r.t. some of
+an :class:`Op`'s inputs. This is useful in particular when the gradient w.r.t. some of
 the inputs cannot be computed by finite difference (e.g. for discrete inputs),
-which would cause ``verify_grad`` to crash.
+which would cause :func:`verify_grad` to crash.

 .. testcode::

@@ -224,15 +228,15 @@ which would cause ``verify_grad`` to crash.
 makeTester and makeBroadcastTester
 ==================================

-Most Op unittests perform the same function. All such tests must
-verify that the op generates the proper output, that the gradient is
-valid, that the Op fails in known/expected ways. Because so much of
+Most :class:`Op` unit tests perform the same function. All such tests must
+verify that the :class:`Op` generates the proper output, that the gradient is
+valid, and that the :class:`Op` fails in known/expected ways. Because so much of
 this is common, two helper functions exists to make your lives easier:
-``makeTester`` and ``makeBroadcastTester`` (defined in module
-``tests.tensor.utils``).
+:func:`makeTester` and :func:`makeBroadcastTester` (defined in module
+:mod:`tests.tensor.utils`).

-Here is an example of ``makeTester`` generating testcases for the Dot
-product op:
+Here is an example of :func:`makeTester` generating test cases for the dot
+product :class:`Op`:

 .. testcode::

@@ -253,34 +257,34 @@ product op:
                                           bad2 = (rand(5, 7), rand(8,3))),
                         grad = dict())

-In the above example, we provide a name and a reference to the op we
+In the above example, we provide a name and a reference to the :class:`Op` we
 want to test. We then provide in the ``expected`` field, a function
-which ``makeTester`` can use to compute the correct values. The
+which :func:`makeTester` can use to compute the correct values. The
 following five parameters are dictionaries which contain:

 * checks: dictionary of validation functions (dictionary key is a
  description of what each function does). Each function accepts two
  parameters and performs some sort of validation check on each
-  op-input/op-output value pairs.  If the function returns False, an
-  Exception is raised containing the check's description.
+  :class:`Op`-input/:class:`Op`-output value pairs.  If the function returns ``False``, an
+  ``Exception`` is raised containing the check's description.

 * good: contains valid input values, for which the output should match
-  the expected output. Unittest will fail if this is not the case.
+  the expected output. Unit tests will fail if this is not the case.

-* bad_build: invalid parameters which should generate an Exception
-  when attempting to build the graph (call to ``make_node`` should
-  fail).  Fails unless an Exception is raised.
+* bad_build: invalid parameters which should generate an ``Exception``
+  when attempting to build the graph (call to :meth:`Op.make_node` should
+  fail).  Fails unless an ``Exception`` is raised.

-* bad_runtime: invalid parameters which should generate an Exception
+* bad_runtime: invalid parameters which should generate an ``Exception``
  at runtime, when trying to compute the actual output values (call to
-  ``perform`` should fail). Fails unless an Exception is raised.
+  :meth:`Op.perform` should fail). Fails unless an ``Exception`` is raised.

 * grad: dictionary containing input values which will be used in the
-  call to ``verify_grad``
+  call to :func:`verify_grad`


-``makeBroadcastTester`` is a wrapper function for makeTester.  If an
+:func:`makeBroadcastTester` is a wrapper function for :func:`makeTester`.  If an
 ``inplace=True`` parameter is passed to it, it will take care of
 adding an entry to the ``checks`` dictionary. This check will ensure
-that inputs and outputs are equal, after the Op's perform function has
+that inputs and outputs are equal, after the :class:`Op`'s perform function has
 been applied.
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -19,7 +19,7 @@ Glossary
    Broadcasting
        Broadcasting is a mechanism which allows tensors with
        different numbers of dimensions to be used in element-by-element
-        (elementwise) computations.  It works by
+        (i.e. element-wise) computations.  It works by
        (virtually) replicating the smaller tensor along
        the dimensions that it is lacking.

@@ -41,9 +41,9 @@ Glossary

    Elemwise
        An element-wise operation ``f`` on two tensor variables ``M`` and ``N``
-        is one such that:
+        is one such that::

-        ``f(M, N)[i, j] == f(M[i, j], N[i, j])``
+          f(M, N)[i, j] == f(M[i, j], N[i, j])

        In other words, each element of an input matrix is combined
        with the corresponding element of the other(s). There are no
@@ -52,6 +52,8 @@ Glossary
        operation generalized along several dimensions.  Element-wise
        operations are defined for tensors of different numbers of dimensions by
        :term:`broadcasting` the smaller ones.
+        The :class:`Op` responsible for performing element-wise computations
+        is :class:`Elemwise`.

    Expression
        See :term:`Apply`
@@ -68,14 +70,14 @@ Glossary
    Destructive
        An :term:`Op` is destructive (of particular input[s]) if its
        computation requires that one or more inputs be overwritten or
-        otherwise invalidated.  For example, :term:`inplace` Ops are
-        destructive.  Destructive Ops can sometimes be faster than
+        otherwise invalidated.  For example, :term:`inplace` :class:`Op`\s are
+        destructive.  Destructive :class:`Op`\s can sometimes be faster than
        non-destructive alternatives.  Aesara encourages users not to put
-        destructive Ops into graphs that are given to :term:`aesara.function`,
+        destructive :class:`Op`\s into graphs that are given to :term:`aesara.function`,
        but instead to trust the optimizations to insert destructive ops
        judiciously.

-        Destructive Ops are indicated via a ``destroy_map`` Op attribute. (See
+        Destructive :class:`Op`\s are indicated via a :attr:`Op.destroy_map` attribute. See
        :class:`Op`.


@@ -86,7 +88,7 @@ Glossary
        Inplace computations are computations that destroy their inputs as a
        side-effect.  For example, if you iterate over a matrix and double
        every element, this is an inplace operation because when you are done,
-        the original input has been overwritten.  Ops representing inplace
+        the original input has been overwritten.  :class:`Op`\s representing inplace
        computations are :term:`destructive`, and by default these can only be
        inserted by optimizations, not user code.

@@ -102,9 +104,9 @@ Glossary
    Op
        The ``.op`` of an :term:`Apply`, together with its symbolic inputs
        fully determines what kind of computation will be carried out for that
-        ``Apply`` at run-time.  Mathematical functions such as addition
-        (``T.add``) and indexing  ``x[i]`` are Ops in Aesara.  Much of the
-        library documentation is devoted to describing the various Ops that
+        :class:`Apply` at run-time.  Mathematical functions such as addition
+        (``T.add``) and indexing  ``x[i]`` are :class:`Op`\s in Aesara.  Much of the
+        library documentation is devoted to describing the various :class:`Op`\s that
        are provided with Aesara, but you can add more.

        See also :term:`Variable`, :term:`Type`, and :term:`Apply`,
@@ -122,7 +124,7 @@ Glossary
        An :term:`Op` is *pure* if it has no :term:`destructive` side-effects.

    Storage
-        The memory that is used to store the value of a Variable.  In most
+        The memory that is used to store the value of a :class:`Variable`.  In most
        cases storage is internal to a compiled function, but in some cases
        (such as :term:`constant` and :term:`shared variable <shared variable>` the storage is not internal.

@@ -150,19 +152,18 @@ Glossary
        >>> x = aet.ivector()
        >>> y = -x**2

-        ``x`` and ``y`` are both `Variables`, i.e. instances of the :class:`Variable` class.
+        ``x`` and ``y`` are both :class:`Variable`\s, i.e. instances of the :class:`Variable` class.

        See also :term:`Type`, :term:`Op`, and :term:`Apply`,
        or read more about :ref:`graphstructures`.

    View
-        Some Tensor Ops (such as Subtensor and Transpose) can be computed in
-        constant time by simply re-indexing their inputs.   The outputs from
-        [the Apply instances from] such Ops are called `Views` because their
+        Some tensor :class:`Op`\s (such as :class:`Subtensor` and :class:`DimShuffle`) can be computed in
+        constant time by simply re-indexing their inputs.   The outputs of
+        such :class:`Op`\s are views because their
        storage might be aliased to the storage of other variables (the inputs
-        of the Apply).  It is important for Aesara to know which Variables are
+        of the :class:`Apply`).  It is important for Aesara to know which :class:`Variable`\s are
        views of which other ones in order to introduce :term:`Destructive`
-        Ops correctly.
+        :class:`Op`\s correctly.

-        View Ops are indicated via a ``view_map`` Op attribute. (See
-        :class:`Op`.
+        :class:`Op`\s that are views have their :attr:`Op.view_map` attributes set.
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -6,12 +6,10 @@ Aesara is a Python library that allows you to define, optimize, and
 evaluate mathematical expressions involving multi-dimensional
 arrays efficiently. Aesara features:

-* **tight integration with NumPy** -- Use `numpy.ndarray` in Aesara-compiled functions.
-* **transparent use of a GPU** -- Perform data-intensive computations much faster than on a CPU.
-* **efficient symbolic differentiation** -- Aesara does your derivatives for functions with one or many inputs.
-* **speed and stability optimizations** -- Get the right answer for ``log(1+x)`` even when ``x`` is really tiny.
-* **dynamic C code generation** -- Evaluate expressions faster.
-* **extensive unit-testing and self-verification** -- Detect and diagnose many types of errors.
+* **Tight integration with NumPy** -- Use ``numpy.ndarray`` in Aesara-compiled functions.
+* **Efficient symbolic differentiation** -- Aesara does your derivatives for functions with one or many inputs.
+* **Speed and stability optimizations** -- Get the right answer for ``log(1+x)`` even when ``x`` is really tiny.
+* **Dynamic C/JAX/Numba code generation** -- Evaluate expressions faster.

 Aesara is based on `Theano`_, which has been powering large-scale computationally
 intensive scientific investigations since 2007.

--- a/doc/introduction.rst
+++ b/doc/introduction.rst
@@ -6,8 +6,8 @@ Aesara at a Glance
 ==================

 Aesara is a Python library that lets you define, optimize, and evaluate
-mathematical expressions, especially ones with multi-dimensional arrays
-(numpy.ndarray).  Using Aesara it is
+mathematical expressions, especially ones involving multi-dimensional arrays
+(e.g. :class:`numpy.ndarray`\s).  Using Aesara it is
 possible to attain speeds rivaling hand-crafted C implementations for problems
 involving large amounts of data.

@@ -16,7 +16,7 @@ optimizing compiler. It can also generate customized C code for many
 mathematical operations.  This combination of CAS with optimizing compilation
 is particularly useful for tasks in which complicated mathematical expressions
 are evaluated repeatedly and evaluation speed is critical.  For situations
-where many different expressions are each evaluated once Aesara can minimize
+where many different expressions are each evaluated once, Aesara can minimize
 the amount of compilation/analysis overhead, but still provide symbolic
 features such as automatic differentiation.

@@ -29,11 +29,12 @@ limited to:
 * arithmetic simplification (e.g. ``x*y/x -> y``, ``--x -> x``)
 * inserting efficient BLAS_ operations (e.g. ``GEMM``) in a variety of
  contexts
-* using memory aliasing to avoid calculation
-* using inplace operations wherever it does not interfere with aliasing
-* loop fusion for elementwise sub-expressions
+* using memory aliasing to avoid unnecessary calculations
+* using in-place operations wherever it does not interfere with aliasing
+* loop fusion for element-wise sub-expressions
 * improvements to numerical stability (e.g.  :math:`\log(1+\exp(x))` and :math:`\log(\sum_i \exp(x[i]))`)
-* for a complete list, see :ref:`optimizations`
+
+For more information see :ref:`optimizations`.

 The library that Aesara is based on, Theano, was written at the LISA_ lab to
 support rapid development of efficient machine learning algorithms. Theano was
@@ -45,7 +46,7 @@ Sneak peek
 ==========

 Here is an example of how to use Aesara. It doesn't show off many of
-Aesara's features, but it illustrates concretely what Aesara is.
+its features, but it illustrates concretely what Aesara is.


 .. If you modify this code, also change :
@@ -75,35 +76,33 @@ Aesara is not a programming language in the normal sense because you
 write a program in Python that builds expressions for Aesara. Still it
 is like a programming language in the sense that you have to

- declare variables (``a, b``) and give their types
-
- build expressions for how to put those variables together
-
- compile expression graphs to functions in order to use them for computation.
+- declare variables ``a`` and ``b`` and give their types,
+- build expression graphs using those variables,
+- compile the expression graphs into functions that can be used for computation.

-It is good to think of ``aesara.function`` as the interface to a
+It is good to think of :func:`aesara.function` as the interface to a
 compiler which builds a callable object from a purely symbolic graph.
-One of Aesara's most important features is that ``aesara.function``
+One of Aesara's most important features is that :func:`aesara.function`
 can optimize a graph and even compile some or all of it into native
 machine instructions.


-What does it do that they don't?
-================================
+What does it do that NumPy doesn't?
+===================================

-Aesara is a Python library and optimizing compiler for manipulating
-and evaluating expressions, especially matrix-valued
-ones. Manipulation of matrices is typically done using the numpy
-package, so what does Aesara do that Python and numpy do not?
+Aesara is essentially an optimizing compiler for manipulating
+and evaluating expressions, especially tensor-valued
+ones. Manipulation of tensors is typically done using the NumPy
+package, so what does Aesara do that Python and NumPy don't do?

- *execution speed optimizations*: Aesara can use `g++` or `nvcc` to compile
+- *execution speed optimizations*: Aesara can use C, Numba, or JAX to compile
  parts your expression graph into CPU or GPU instructions, which run
  much faster than pure Python.

 - *symbolic differentiation*: Aesara can automatically build symbolic graphs
  for computing gradients.

- *stability optimizations*: Aesara can recognize [some] numerically unstable
+- *stability optimizations*: Aesara can recognize some numerically unstable
  expressions and compute them with more stable algorithms.

 The closest Python package to Aesara is sympy_.

--- a/doc/library/config.rst
+++ b/doc/library/config.rst
@@ -175,7 +175,7 @@ import ``aesara`` and print the config variable, as in:

    Default: ``'ignore'``

-    This option determines what's done when a ``TensorVariable`` with dtype
+    This option determines what's done when a :class:`TensorVariable` with dtype
    equal to ``float64`` is created.
    This can be used to help find upcasts to ``float64`` in user code.

@@ -185,10 +185,10 @@ import ``aesara`` and print the config variable, as in:

    Default: ``'default'``

-    If ``more``, sometimes Aesara will select ``Op`` implementations that
+    If ``more``, sometimes Aesara will select :class:`Op` implementations that
    are more "deterministic", but slower. In particular, on the GPU,
    Aesara will avoid using ``AtomicAdd``. Sometimes Aesara will still use
-    non-deterministic implementations, e.g. when there isn't a GPU ``Op``
+    non-deterministic implementations, e.g. when there isn't a GPU :class:`Op`
    implementation that is deterministic. See the ``dnn.conv.algo*``
    flags for more cases.

@@ -216,7 +216,7 @@ import ``aesara`` and print the config variable, as in:

    Default: ``True``

-    This enables, or disables, an optimization in ``Scan`` that tries to
+    This enables, or disables, an optimization in :class:`Scan` that tries to
    pre-allocate memory for its outputs. Enabling the optimization can give a
    significant speed up at the cost of slightly increased memory usage.

@@ -230,7 +230,7 @@ import ``aesara`` and print the config variable, as in:

    If :attr:`config.allow_gc` is ``True``, but :attr:`config.scan__allow_gc` is
    ``False``, then Aesara will perform garbage collection during the inner
-    operations of a ``Scan`` after each iterations.
+    operations of a :class:`Scan` after each iteration.

 .. attribute:: config.scan__debug

@@ -238,7 +238,7 @@ import ``aesara`` and print the config variable, as in:

    Default: ``False``

-    If ``True``, Aesara will print extra ``Scan`` debug information.
+    If ``True``, Aesara will print extra :class:`Scan` debug information.

 .. attribute:: cycle_detection

@@ -376,7 +376,7 @@ import ``aesara`` and print the config variable, as in:

    Positive int value, default: 20.

-    The number of ``Apply`` nodes to print in the profiler output.
+    The number of :class:`Apply` nodes to print in the profiler output.

 .. attribute:: config.profiling__n_ops

@@ -388,7 +388,7 @@ import ``aesara`` and print the config variable, as in:

    Positive int value, default: 1024.

-    During memory profiling, do not print ``Apply`` nodes if the size
+    During memory profiling, do not print :class:`Apply` nodes if the size
    of their outputs (in bytes) is lower than this value.

 .. attribute:: config.profiling__min_peak_memory
@@ -540,7 +540,7 @@ import ``aesara`` and print the config variable, as in:

    Default: ``'ignore'``

-    If there is a CPU ``Op`` in the computational graph, depending on its value,
+    If there is a CPU :class:`Op` in the computational graph, depending on its value,
    this flag can either raise a warning, an exception or drop into the frame
    with ``pdb``.

@@ -550,7 +550,7 @@ import ``aesara`` and print the config variable, as in:

    Default: ``'warn'``

-    When an exception is raised while inferring the shape of an ``Apply``
+    When an exception is raised while inferring the shape of an :class:`Apply`
    node, either warn the user and use a default value (i.e. ``'warn'``), or
    raise the exception (i.e. ``'raise'``).

@@ -856,10 +856,10 @@ import ``aesara`` and print the config variable, as in:
    Default: ``''``

    A list of kinds of preallocated memory to use as output buffers for
-    each ``Op``'s computations, separated by ``:``. Implemented modes are:
+    each :class:`Op`'s computations, separated by ``:``. Implemented modes are:

    * ``"initial"``: initial storage present in storage map
-      (for instance, it can happen in the inner function of Scan),
+      (for instance, it can happen in the inner function of :class:`Scan`),
    * ``"previous"``: reuse previously-returned memory,
    * ``"c_contiguous"``: newly-allocated C-contiguous memory,
    * ``"f_contiguous"``: newly-allocated Fortran-contiguous memory,
@@ -883,7 +883,7 @@ import ``aesara`` and print the config variable, as in:
    Bool value, default: ``True``

    Generate a warning when a ``destroy_map`` or ``view_map`` says that an
-    ``Op`` will work inplace, but the ``Op`` does not reuse the input for its
+    :class:`Op` will work inplace, but the :class:`Op` does not reuse the input for its
    output.

 .. attribute:: config.NanGuardMode__nan_is_error
@@ -923,7 +923,7 @@ import ``aesara`` and print the config variable, as in:
    numpy.random.rand(5, 4)``).

    When not ``'off'``, the value of this option dictates what happens when
-    an ``Op``'s inputs do not provide appropriate test values:
+    an :class:`Op`'s inputs do not provide appropriate test values:

        - ``'ignore'`` will do nothing
        - ``'warn'`` will raise a ``UserWarning``
@@ -956,7 +956,7 @@ import ``aesara`` and print the config variable, as in:

    If ``'low'``, the text of exceptions will generally refer to apply nodes
    with short names such as ``'Elemwise{add_no_inplace}'``. If ``'high'``,
-    some exceptions will also refer to ``Apply`` nodes with long descriptions
+    some exceptions will also refer to :class:`Apply` nodes with long descriptions
    like:

    ::
@@ -970,7 +970,7 @@ import ``aesara`` and print the config variable, as in:

    Bool value, default: ``False``

-    If ``True``, will print a warning when compiling one or more ``Op`` with C
+    If ``True``, will print a warning when compiling one or more :class:`Op` with C
    code that can't be cached because there is no ``c_code_cache_version()``
    function associated to at least one of those :class:`Op`\s.

@@ -1028,7 +1028,7 @@ import ``aesara`` and print the config variable, as in:
    Int value, default: 0

    The verbosity level of the meta-optimizer: ``0`` for silent, ``1`` to only
-    warn when Aesara cannot meta-optimize an ``Op``, ``2`` for full output (e.g.
+    warn when Aesara cannot meta-optimize an :class:`Op`, ``2`` for full output (e.g.
    timings and the optimizations selected).



--- a/doc/library/tensor/basic.rst
+++ b/doc/library/tensor/basic.rst
@@ -1238,7 +1238,7 @@ The six usual equality and inequality operators share the same interface.
  :Parameter:  *a* - symbolic Tensor (or compatible)
  :Parameter:  *b* - symbolic Tensor (or compatible)
  :Return type: symbolic Tensor
-  :Returns: a symbolic tensor representing the application of the logical elementwise operator.
+  :Returns: a symbolic tensor representing the application of the logical :class:`Elemwise` operator.

  .. note::


--- a/doc/library/tensor/nnet/basic.rst
+++ b/doc/library/tensor/nnet/basic.rst
@@ -106,7 +106,7 @@
   Returns the softplus nonlinearity applied to x
    :Parameter: *x* - symbolic Tensor (or compatible)
    :Return type: same as x
-    :Returns: elementwise softplus: :math:`softplus(x) = \log_e{\left(1 + \exp(x)\right)}`.
+    :Returns: element-wise softplus: :math:`softplus(x) = \log_e{\left(1 + \exp(x)\right)}`.

   .. note:: The underlying code will return an exact 0 if an element of x is too small.

@@ -162,7 +162,7 @@
       * *output* - symbolic Tensor (or compatible)

    :Return type: same as target
-    :Returns: a symbolic tensor, where the following is applied elementwise :math:`crossentropy(t,o) = -(t\cdot log(o) + (1 - t) \cdot log(1 - o))`.
+    :Returns: a symbolic tensor, where the following is applied element-wise :math:`crossentropy(t,o) = -(t\cdot log(o) + (1 - t) \cdot log(1 - o))`.

   The following block implements a simple auto-associator with a
   sigmoid nonlinearity and a reconstruction error which corresponds
@@ -187,7 +187,7 @@
       * *output* - symbolic Tensor (or compatible)

    :Return type: same as target
-    :Returns: a symbolic tensor, where the following is applied elementwise :math:`crossentropy(o,t) = -(t\cdot log(sigmoid(o)) + (1 - t) \cdot log(1 - sigmoid(o)))`.
+    :Returns: a symbolic tensor, where the following is applied element-wise :math:`crossentropy(o,t) = -(t\cdot log(sigmoid(o)) + (1 - t) \cdot log(1 - sigmoid(o)))`.

   It is equivalent to `binary_crossentropy(sigmoid(output), target)`,
   but with more efficient and numerically stable computation, especially when

--- a/doc/library/tensor/random/basic.rst
+++ b/doc/library/tensor/random/basic.rst
@@ -9,8 +9,8 @@
   :synopsis: symbolic random variables


-The `aesara.tensor.random` module provides random-number drawing functionality
-that closely resembles the `numpy.random` module.
+The :mod:`aesara.tensor.random` module provides random-number drawing functionality
+that closely resembles the :mod:`numpy.random` module.

 Reference
 =========
@@ -30,15 +30,16 @@ Reference

 .. class:: RandomStateType(Type)

-    A `Type` for variables that will take ``numpy.random.RandomState``
+    A :class:`Type` for variables that will take :class:`numpy.random.RandomState`
    values.

 .. function:: random_state_type(name=None)

-    Return a new Variable whose ``.type`` is ``random_state_type``.
+    Return a new :class:`Variable` whose :attr:`Variable.type` is an instance of
+    :class:`RandomStateType`.

 .. class:: RandomVariable(Op)

-    `Op` that draws random numbers from a `numpy.random.RandomState` object.
-    This `Op` is parameterized to draw numbers from many possible
+    :class:`Op` that draws random numbers from a :class:`numpy.random.RandomState` object.
+    This :class:`Op` is parameterized to draw numbers from many possible
    distributions.
--- a/doc/sandbox/elemwise_compiler.rst
+++ b/doc/sandbox/elemwise_compiler.rst
 .. _sandbox_elemwise:

-=================
-Elemwise compiler
-=================
+==========================
+:class:`Elemwise` compiler
+==========================

-'''Stale specification page.  Upgrade this to provide useful developer doc. 2008.09.04'''
-== Definitions ==
+.. todo:: Stale specification page.  Upgrade this to provide useful developer doc. 2008.09.04

-The elementwise compiler takes inputs {{{(in0, in1, in2, ...)}}}, outputs {{{(out0, out1, out2, ...)}}}, broadcast modes {{{(mod0, mod1, mod2, ...)}}} where each mode corresponds to an output as well as {{{order}}} which determines if we broadcast/accumulate over the first or last dimensions (the looping order, basically, but some operations are only valid for one particular order!).
+Definitions
+===========
+
+The element-wise compiler takes inputs {{{(in0, in1, in2, ...)}}}, outputs {{{(out0, out1, out2, ...)}}}, broadcast modes {{{(mod0, mod1, mod2, ...)}}} where each mode corresponds to an output as well as {{{order}}} which determines if we broadcast/accumulate over the first or last dimensions (the looping order, basically, but some operations are only valid for one particular order!).

 The broadcast mode serves to calculate the rank of the corresponding output and how to map each input element to an output element:

@@ -38,7 +40,8 @@ Point of clarification: the order discussed here corresponds to a set of broadca

 Question: does it make sense to apply the order to the loop, or is this broadcast order something which will be local to each input argument.  What happens when the elemwise compiler deals with more complex subgraphs with multiple inputs and outputs?

-== The loop ==
+The loop
+========

 Here is the loop for {{{order == c}}}. Check for errors!

@@ -70,7 +73,8 @@ When {{{order == f}}}, the iterators ''ideally'' (but not necessarily) iterate i

 An Optimizer should look at the operations in the graph and figure out whether to allocate C_CONTIGUOUS (ideal for {{{order == c}}}) or F_CONTIGUOUS (ideal for {{{order == f}}}) arrays.

-== Gradient ==
+Gradient
+========

 The input ranks become the output ranks and gradients of the same rank as the outputs are added to the input list. If an output was given mode {{{broadcast}}}, then all inputs used to calculate it had to be broadcasted to that shape, so we must sum over the broadcasted dimensions on the gradient. The mode that we give to those inputs is therefore {{{(accumulate, sum)}}}. Inversely, if an output was given mode {{{(accumulate, sum)}}}, then all inputs used to calculate it had to be summed over those dimensions. Therefore, we give them mode {{{broadcast}}} in grad. Other accumulators than sum might prove more difficult. For example, the ith gradient for product is grad*product/x_i. Not sure how to handle that automatically.
 * I don't exactly follow this paragraph, but I think I catch the general idea and it seems to me like it will work very well.
@@ -80,5 +84,3 @@ The input ranks become the output ranks and gradients of the same rank as the ou
 * Could you explain why the accumulator gradient (e.g. product) can be trickier?

  * I thought about it and I figured that the general case is {{{g_accum[N-i+1], g_m[i] = grad_fn(accum[i-1], m[i], g_accum[N-i])}}} where {{{g_accum}}} is the accumulated gradient wrt the accumulator {{{accum}}}. It can be short-circuited in sum and product's case: for sum, grad_fn is the identity on its last argument so {{{g_m[i] == g_accum[i] == g_accum[0] == g_z for all i}}}. In product's case, {{{accum[i-1] == product(m[1:i-1]) and g_accum[N-i] == g_z * product(m[i+1:N])}}}, multiply them together and you obtain {{{g_z * product(m)/m[i]}}} where obviously we only need to compute {{{product(m)}}} once. It's worth handling those two special cases, for the general case I don't know.
-
-
--- a/doc/sandbox/sandbox.rst
+++ b/doc/sandbox/sandbox.rst
@@ -8,7 +8,7 @@ or correct documentation.
 How do you define the grad function?
 ======================================

-Let's talk about defining the `grad()` function in an Op, using an
+Let's talk about defining the :meth:`Op.grad` method in an :class:`Op`, using an
 illustrative example.

 In Poisson regression (Ranzato and Szummer, 2008), the target *t* is
@@ -19,15 +19,15 @@ In the negative log likelihood of the Poisson regressor, there is a term:

    \log(t!)

-Let's say we write a logfactorial Op. We then compute the gradient
+Let's say we write a logfactorial :class:`Op`. We then compute the gradient

 You should define gradient, even if it is undefined.
 [give log factorial example]

-If an Op does not define ``grad``, but this Op does not appear in the path when
+If an :class:`Op` does not define ``grad``, but this :class:`Op` does not appear in the path when
 you compute the gradient, then there is no problem.

-If an Op does not define ``grad``, and this Op *does* appear in the path when
+If an :class:`Op` does not define ``grad``, and this :class:`Op` *does* appear in the path when
 you compute the gradient, **WRITEME**.

 Gradients for a particular variable can be one of four kinds:
@@ -45,26 +45,26 @@ currently, there is no way for a ``grad()`` method to distinguish between cases
 and 4
 but the distinction is important because graphs with type-3 gradients are ok
 to run, whereas graphs with type-4 gradients are not.
-so I suggested that Joseph return a type-4 gradient by defining an Op with no
+so I suggested that Joseph return a type-4 gradient by defining an :class:`Op` with no
 perform method.
 the idea would be that this would suit the graph-construction phase, but would
 prevent linking.
 how does that sound to you?

-**This documentation is useful when we show users how to write Ops.**
+**This documentation is useful when we show users how to write** :class:`Op`\s.

 ======================================
 What is staticmethod, st_impl?
 ======================================

-``st_impl`` is an optional method in an Op.
+``st_impl`` is an optional method in an :class:`Op`.
 ``@staticmethod`` is a Python decorator for a class method that does not
 implicitly take the class instance as a first argument. Hence, st_impl
-can be used for Op implementations when no information from the Op
+can be used for :class:`Op` implementations when no information from the :class:`Op`
 instance is needed. This can be useful for testing an implementation.
 See the ``XlogX`` class below for an example.

-**This documentation is useful when we show users how to write Ops.
+**This documentation is useful when we show users how to write :class:`Op`\s.
 Olivier says this behavior should be discouraged but I feel that st_impl
 should be encouraged where possible.**

@@ -74,7 +74,7 @@ how do we write scalar ops and upgrade them to tensor ops?

 **Olivier says that** :class:`~aesara.tensor.xlogx.XlogX` **gives a good example. In fact, I would
 like to beef xlogx up into our running example for demonstrating how to
-write an Op:**
+write an :class:`Op`:**

 .. code-block:: python

@@ -111,10 +111,10 @@ UnaryScalarOp is the same as scalar.ScalarOp with member variable nin=1.
 **give an example of this**

 =======================================================
-How to use the PrintOp
+How to use the `PrintOp`
 =======================================================

-** This is also useful in the How to write an Op tutorial. **
+** This is also useful in the How to write an :class:`Op` tutorial. **

 =======================================================
 Mammouth

--- a/doc/tutorial/examples.rst
+++ b/doc/tutorial/examples.rst
@@ -370,15 +370,15 @@ Here's a brief example.  The setup code is:
    g = function([], rv_n, no_default_updates=True)    #Not updating rv_n.rng
    nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

-Here, 'rv_u' represents a random stream of 2x2 matrices of draws from a uniform
-distribution.  Likewise,  'rv_n' represents a random stream of 2x2 matrices of
+Here, ``rv_u`` represents a random stream of 2x2 matrices of draws from a uniform
+distribution.  Likewise,  ``rv_n`` represents a random stream of 2x2 matrices of
 draws from a normal distribution.  The distributions that are implemented are
 defined as :class:`RandomVariable`\s
 in :ref:`basic<libdoc_tensor_random_basic>`. They only work on CPU.
 See `Other Implementations`_ for GPU version.


-Now let's use these objects.  If we call f(), we get random uniform numbers.
+Now let's use these objects.  If we call ``f()``, we get random uniform numbers.
 The internal state of the random number generator is automatically updated,
 so we get different random numbers every time.