Update docstrings and type hints for Op's gradient methods

1d369b55 · Brandon T. Willard · Brandon T. Willard · c8eea207 · 1d369b55
--- a/aesara/graph/op.py
+++ b/aesara/graph/op.py
@@ -320,53 +320,73 @@ class Op(MetaObject):
    add_tag_trace = staticmethod(add_tag_trace)

    def grad(
-        self, inputs: List[Variable], output_grads: List[Variable]
+        self, inputs: Sequence[Variable], output_grads: Sequence[Variable]
    ) -> List[Variable]:
-        """Construct a graph for the gradient with respect to each input variable.
+        r"""Construct a graph for the gradient with respect to each input variable.

        Each returned `Variable` represents the gradient with respect to that
        input computed based on the symbolic gradients with respect to each
        output. If the output is not differentiable with respect to an input,
-        then this method should return an instance of type ``NullType`` for that
+        then this method should return an instance of type `NullType` for that
        input.

+        Using the reverse-mode AD characterization given in [1]_, for a
+        :math:`C = f(A, B)` representing the function implemented by the `Op`
+        and its two arguments :math:`A` and :math:`B`, given by the
+        `Variable`\s in `inputs`, the values returned by `Op.grad` represent
+        the quantities :math:`\bar{A} \equiv \frac{\partial S_O}{\partial A}` and
+        :math:`\bar{B}`, for some scalar output term :math:`S_O` of :math:`C`
+        in
+
+        .. math::
+
+            \operatorname{Tr}\left(\bar{C}^\top dC\right) =
+                \operatorname{Tr}\left(\bar{A}^\top dA\right) +
+                \operatorname{Tr}\left(\bar{B}^\top dB\right)
+
+
        Parameters
        ----------
-        inputs : list of Variable
+        inputs
            The input variables.
-        output_grads : list of Variable
+        output_grads
            The gradients of the output variables.

        Returns
        -------
-        grads : list of Variable
+        grads
            The gradients with respect to each `Variable` in `inputs`.

+        .. [1] Giles, Mike. 2008. “An Extended Collection of Matrix Derivative Results for Forward and Reverse Mode Automatic Differentiation.”
+
        """
        raise NotImplementedError()

    def L_op(
        self,
-        inputs: List[Variable],
-        outputs: List[Variable],
-        output_grads: List[Variable],
+        inputs: Sequence[Variable],
+        outputs: Sequence[Variable],
+        output_grads: Sequence[Variable],
    ) -> List[Variable]:
        r"""Construct a graph for the L-operator.

-        This method is primarily used by `Lop` and dispatches to
-        :meth:`Op.grad` by default.
+        The L-operator computes a row vector times the Jacobian.
+
+        This method dispatches to :meth:`Op.grad` by default.  In one sense,
+        this method provides the original outputs when they're needed to
+        compute the return value, whereas `Op.grad` doesn't.

-        The L-operator computes a *row* vector times the Jacobian. The
-        mathematical relationship is
-        :math:`v \frac{\partial f(x)}{\partial x}`.
-        The L-operator is also supported for generic tensors (not only for
-        vectors).
+        See `Op.grad` for a mathematical explanation of the inputs and outputs
+        of this method.

        Parameters
        ----------
-        inputs : list of Variable
-        outputs : list of Variable
-        output_grads : list of Variable
+        inputs
+            The inputs of the `Apply` node using this `Op`.
+        outputs
+            The outputs of the `Apply` node using this `Op`.
+        output_grads
+            The gradients with respect to each `Variable` in `outputs`.

        """
        return self.grad(inputs, output_grads)
@@ -378,8 +398,6 @@ class Op(MetaObject):

        This method is primarily used by `Rop`.

-        Suppose the `Op` outputs ``[ f_1(inputs), ..., f_n(inputs) ]``.
-
        Parameters
        ----------
        inputs