Commit c58166ca authored by abergeron, committed by GitHub

Merge pull request #6165 from lamblin/docstrings

Docstring improvements
@@ -14,8 +14,8 @@
 from theano.gradient import *
 Symbolic gradient is usually computed from :func:`gradient.grad`, which offers a
-more convenient syntax for the common case of wanting the gradient in some
-expressions with respect to a scalar cost. The :func:`grad_sources_inputs`
+more convenient syntax for the common case of wanting the gradient of some
+scalar cost with respect to some input expressions. The :func:`grad_sources_inputs`
 function does the underlying work, and is more flexible, but is also more
 awkward to use when :func:`gradient.grad` can do the job.
@@ -44,6 +44,7 @@ grad_time = 0
 def format_as(use_list, use_tuple, outputs):
     """
     Formats the outputs according to the flags `use_list` and `use_tuple`.
+
     If `use_list` is True, `outputs` is returned as a list (if `outputs`
     is not a list or a tuple then it is converted into a one-element list).
     If `use_tuple` is True, `outputs` is returned as a tuple (if `outputs`
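A minimal sketch of the two documented cases (not part of the patch; it assumes `use_list` and `use_tuple` are mutually exclusive, as the flags suggest):

>>> from theano.gradient import format_as
>>> import theano.tensor as T
>>> x = T.scalar()
>>> isinstance(format_as(True, False, x), list)   # wrapped in a one-element list
True
>>> isinstance(format_as(False, True, x), tuple)  # wrapped in a one-element tuple
True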
@@ -163,20 +164,23 @@ disconnected_type = DisconnectedType()
 def Rop(f, wrt, eval_points, disconnected_outputs="raise",
         return_disconnected="zero"):
     """
-    Computes the R operation on `f` wrt to `wrt` evaluated at points given
-    in `eval_points`. Mathematically this stands for the jacobian of `f` wrt
+    Computes the R operation on `f` wrt to `wrt` at `eval_points`.
+
+    Mathematically this stands for the jacobian of `f` wrt
     to `wrt` right multiplied by the eval points.
 
-    :type f: Variable or list of Variables
+    Parameters
+    ----------
+    f : :class:`~theano.gof.graph.Variable` or list of Variables
         `f` stands for the output of the computational graph to which you
         want to apply the R operator
-    :type wrt: Variable or list of Variables
+    wrt : :class:`~theano.gof.graph.Variable` or list of Variables
         variables for which you compute the R operator of the expression
         described by `f`
-    :type eval_points: Variable or list of Variables
+    eval_points : :class:`~theano.gof.graph.Variable` or list of Variables
        evaluation points for each of the variables in `wrt`
-    :type disconnected_outputs: str
+    disconnected_outputs : str
        Defines the behaviour if some of the variables in `f`
        have no dependency on any of the variables in `wrt` (or if
        all links are non-differentiable). The possible values are:
@@ -184,16 +188,18 @@ def Rop(f, wrt, eval_points, disconnected_outputs="raise",
        - 'warn': consider the gradient zero, and print a warning.
        - 'raise': raise DisconnectedInputError.
-    :type return_disconnected: {'zero', 'None', 'Disconnected'}
+    return_disconnected : {'zero', 'None', 'Disconnected'}
        - 'zero' : If wrt[i] is disconnected, return value i will be
                   wrt[i].zeros_like()
        - 'None' : If wrt[i] is disconnected, return value i will be
                   None
        - 'Disconnected' : returns variables of type DisconnectedType
-    :rtype: :class:`~theano.gof.Variable` or list/tuple of Variables depending on type of f
-    :return: symbolic expression such that
-        R_op[i] = sum_j ( d f[i] / d wrt[j]) eval_point[j]
+    Returns
+    -------
+    :class:`~theano.gof.graph.Variable` or list/tuple of Variables depending on type of f
+        Symbolic expression such that
+        R_op[i] = sum_j (d f[i] / d wrt[j]) eval_point[j]
        where the indices in that expression are magic multidimensional
        indices that specify both the position within a list and all
        coordinates of the tensor element in the list.
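A usage sketch adapted from the Theano tutorial's Jacobian-times-vector example (not part of the patch; the evaluation point `V` must have the same shape as `W`):

>>> import theano
>>> import theano.tensor as T
>>> W = T.dmatrix('W')
>>> V = T.dmatrix('V')  # evaluation point, same shape as W
>>> x = T.dvector('x')
>>> y = T.dot(x, W)
>>> JV = theano.gradient.Rop(y, W, V)
>>> f = theano.function([W, V, x], JV)
>>> f([[1, 1], [1, 1]], [[2, 2], [2, 2]], [0, 1])
array([ 2.,  2.])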
@@ -349,22 +355,27 @@ def Rop(f, wrt, eval_points, disconnected_outputs="raise",
 def Lop(f, wrt, eval_points, consider_constant=None,
         disconnected_inputs='raise'):
     """
-    Computes the L operation on `f` wrt to `wrt` evaluated at points given
-    in `eval_points`. Mathematically this stands for the jacobian of `f` wrt
+    Computes the L operation on `f` wrt to `wrt` at `eval_points`.
+
+    Mathematically this stands for the jacobian of `f` wrt
     to `wrt` left multiplied by the eval points.
 
-    :type f: Variable or list of Variables
+    Parameters
+    ----------
+    f : :class:`~theano.gof.graph.Variable` or list of Variables
        `f` stands for the output of the computational graph to which you
        want to apply the L operator
-    :type wrt: Variable or list of Variables
+    wrt : :class:`~theano.gof.graph.Variable` or list of Variables
        variables for which you compute the L operator of the expression
        described by `f`
-    :type eval_points: Variable or list of Variables
+    eval_points : :class:`~theano.gof.graph.Variable` or list of Variables
        evaluation points for each of the variables in `f`
-    :rtype: :class:`~theano.gof.Variable` or list/tuple of Variables depending on type of f
-    :return: symbolic expression such that
-        L_op[i] = sum_i ( d f[i] / d wrt[j]) eval_point[i]
+    Returns
+    -------
+    :class:`~theano.gof.Variable` or list/tuple of Variables depending on type of f
+        Symbolic expression such that
+        L_op[j] = sum_i (d f[i] / d wrt[j]) eval_point[i]
        where the indices in that expression are magic multidimensional
        indices that specify both the position within a list and all
        coordinates of the tensor element in the list
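The mirror-image sketch for the vector-times-Jacobian case, again adapted from the Theano tutorial (not part of the patch; `W` need not be an input of the compiled function because the resulting gradient expression does not depend on its value):

>>> import theano
>>> import theano.tensor as T
>>> W = T.dmatrix('W')
>>> v = T.dvector('v')  # evaluation point, same shape as y
>>> x = T.dvector('x')
>>> y = T.dot(x, W)
>>> VJ = theano.gradient.Lop(y, W, v)
>>> f = theano.function([v, x], VJ)
>>> f([2, 2], [0, 1])
array([[ 0.,  0.],
       [ 2.,  2.]])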
@@ -405,8 +416,7 @@ def grad(cost, wrt, consider_constant=None,
          known_grads=None, return_disconnected='zero',
          null_gradients='raise'):
     """
-    Return symbolic gradients for one or more variables with respect to some
-    cost.
+    Return symbolic gradients of one cost with respect to one or more variables.
 
     For more information about how automatic differentiation works in Theano,
     see :mod:`gradient`. For information on how to implement the gradient of
@@ -414,13 +424,13 @@ def grad(cost, wrt, consider_constant=None,
     Parameters
     ----------
-    cost : :class:`~theano.gof.Variable` scalar (0-dimensional) tensor variable or None
-        Value with respect to which we are differentiating. May be
-        `None` if known_grads is provided.
-    wrt : :class:`~theano.gof.Variable` or list of Variables
-        term[s] for which we want gradients
+    cost : :class:`~theano.gof.graph.Variable` scalar (0-dimensional) tensor variable or ``None``
+        Value that we are differentiating (that we want the gradient of).
+        May be `None` if `known_grads` is provided.
+    wrt : :class:`~theano.gof.graph.Variable` or list of Variables
+        Term[s] with respect to which we want gradients
     consider_constant : list of variables
-        expressions not to backpropagate through
+        Expressions not to backpropagate through
     disconnected_inputs : {'ignore', 'warn', 'raise'}
         Defines the behaviour if some of the variables in `wrt` are
         not part of the computational graph computing `cost` (or if
@@ -453,7 +463,7 @@ def grad(cost, wrt, consider_constant=None,
     Returns
     -------
     variable or list/tuple of variables (matches `wrt`)
-        symbolic expression of gradient of `cost` with respect to each
+        Symbolic expression of gradient of `cost` with respect to each
         of the `wrt` terms. If an element of `wrt` is not
         differentiable with respect to the output, then a zero
         variable is returned.
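A minimal end-to-end sketch of the documented signature (not part of the patch):

>>> import theano
>>> import theano.tensor as T
>>> x = T.dscalar('x')
>>> cost = x ** 2
>>> gx = theano.grad(cost, x)        # symbolic d(x**2)/dx = 2*x
>>> f = theano.function([x], gx)
>>> float(f(4.0))
8.0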
@@ -670,32 +680,28 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
             next_grad = dict(zip(grad_ends[i], next_grad))
             param_grads.extend(param_grad)
 
-    :type wrt: list of variables
-    :param wrt:
+    Parameters
+    ----------
+    wrt : list of variables
        Gradients are computed with respect to `wrt`.
-    :type end: list of variables
-    :param end:
+    end : list of variables
        Theano variables at which to end gradient descent (they are
        considered constant in theano.grad). For convenience, the
        gradients with respect to these variables are also returned.
-    :type start: dictionary of variables
-    :param start:
+    start : dictionary of variables
        If not None, a dictionary mapping variables to their
        gradients. This is useful when the gradient on some variables
        are known. These are used to compute the gradients backwards up
        to the variables in `end` (they are used as known_grad in
        theano.grad).
-    :type cost: :class:`~theano.gof.Variable` scalar (0-dimensional) variable
-    :param cost:
+    cost : :class:`~theano.gof.Variable` scalar (0-dimensional) variable
        Additional costs for which to compute the gradients. For
        example, these could be weight decay, an l1 constraint, MSE,
-        NLL, etc. May optionally be None if start is provided. Warning
-        : If the gradients of `cost` with respect to any of the `start`
-        variables is already part of the `start` dictionary, then it may
-        be counted twice with respect to `wrt` and `end`.
+        NLL, etc. May optionally be None if start is provided.
 
     .. warning::
@@ -703,18 +709,18 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
        variables is already part of the `start` dictionary, then it
        may be counted twice with respect to `wrt` and `end`.
 
-    :type details: bool
-    :param details:
+    details : bool
        When True, additionally returns the list of gradients from
        `start` and of `cost`, respectively, with respect to `wrt` (not
        `end`).
 
-    :rtype: Tuple of 2 or 4 Lists of Variables
-    :return: Returns lists of gradients with respect to `wrt` and `end`,
+    Returns
+    -------
+    Tuple of 2 or 4 Lists of Variables
+        Returns lists of gradients with respect to `wrt` and `end`,
        respectively.
 
     .. versionadded:: 0.7
     '''
     assert ((cost is not None) or (start is not None))
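A hedged single-call sketch (not part of the patch; the variables `x`, `w`, `mid`, and `cost` are illustrative):

>>> import theano
>>> import theano.tensor as T
>>> x = T.dvector('x')
>>> w = T.dvector('w')
>>> mid = x * w                        # the 'end' point of the subgraph
>>> cost = T.tanh(mid).sum()
>>> param_grads, end_grads = theano.gradient.subgraph_grad(
...     wrt=[w], end=[mid], cost=cost)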
@@ -808,18 +814,21 @@ def _populate_var_to_app_to_idx(outputs, wrt, consider_constant):
     """
     Helper function for grad function.
 
-    outputs: a list of variables we want to take gradients of
-
-    wrt: a list of variables we want to take the gradient with
-        respect to.
-
-    consider_constant: a list of variables not to backpropagate
-        through.
-
-    returns:
+    Parameters
+    ----------
+    outputs
+        a list of variables we want to take gradients of
+    wrt
+        a list of variables we want to take the gradient with
+        respect to.
+    consider_constant
+        a list of variables not to backpropagate through.
 
+    Returns
+    -------
     var_to_app_to_idx:
        A dictionary mapping a variable to a second dictionary.
        The second dictionary maps apply nodes acting on this
        variable to the variable's index in the apply node's
@@ -967,30 +976,35 @@ class DisconnectedInputError(ValueError):
 def _populate_grad_dict(var_to_app_to_idx,
                         grad_dict, wrt, cost_name=None):
-    """
-    Helper function for grad function.
+    """Helper function for grad function.
 
-    var_to_app_to_idx: a dictionary mapping a variable to
-        a second dictionary.
+    Parameters
+    ----------
+    var_to_app_to_idx : dict
+        a dictionary mapping a variable to a second dictionary.
        the second dictionary maps apply nodes acting on
        this variable to the variable's index in the apply
        node's input list
-    grad_dict: A dictionary mapping variables to their gradients.
+    grad_dict : dict
+        A dictionary mapping variables to their gradients.
        Should be populated by grad function, which should:
-        -Set the gradient with respect to the cost to 1
-        -Load all gradients from known_grads, possibly
+
+        - Set the gradient with respect to the cost to 1
+        - Load all gradients from known_grads, possibly
          overriding the cost
-        -Set the gradient for disconnected
+        - Set the gradient for disconnected
          inputs to a variable with type DisconnectedType()
-    wrt: the minimal set of variables that must be included in grad_dict
-    cost_name: The name of the cost being differentiated, optional.
-        used to name the grad with respect to x as
-        (d<cost_name>/dx)
+    wrt : list of Variables
+        the minimal set of variables that must be included in `grad_dict`
+    cost_name : string
+        The name of the cost being differentiated, optional.
+        Used to name the grad with respect to x as (d<cost_name>/dx)
 
-    returns: a list of gradients corresponding to wrt
+    Returns
+    -------
+    list of Variables
+        A list of gradients corresponding to `wrt`
     """
 
     # build a dict mapping node to the terms node contributes to each of
@@ -1421,17 +1435,21 @@ class numeric_grad(object):
     def __init__(self, f, pt, eps=None, out_type=None):
         """Return the gradient of f at pt.
 
-        :param f: a differentiable function such that f(*pt) is a scalar
-        :param pt: an ndarray, a list of ndarrays or tuple of ndarrays
-        :param out_type: dtype of output, if complex (i.e. 'complex32' or
-            'complex64')
-
         This function computes the gradient by a one-sided finite
         differences of a fixed step size (eps).
 
-        :param eps: the stepsize for the finite differencing. None means
-            input dtype-dependent. See `type_eps`.
+        Parameters
+        ----------
+        f : a differentiable function such that f(*pt) is a scalar
+            The function to compute the gradient of.
+            It is assumed that f(...) will return a scalar.
+            It is assumed that all f's inputs are numpy.ndarray objects.
+        pt : an ndarray, a list of ndarrays or tuple of ndarrays
+            The point where to evaluate the gradient
+        out_type : string
+            dtype of output, if complex (i.e. 'complex32' or 'complex64')
+        eps : float, optional
+            The stepsize for the finite differencing. None means
+            input dtype-dependent. See `type_eps`.
         """
@@ -1515,6 +1533,7 @@
         Formulas used:
             abs_err = abs(a - b)
             rel_err = abs_err / max(abs(a) + abs(b), 1e-8)
+
         The denominator is clipped at 1e-8 to avoid dividing by 0 when a and b
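As a worked example (not part of the patch): for a = 1.0 and b = 1.01, abs_err = 0.01 and rel_err = 0.01 / max(2.01, 1e-8) ≈ 0.005.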
@@ -1609,44 +1628,54 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
                 no_debug_ref=True):
     """Test a gradient by Finite Difference Method. Raise error on failure.
 
-    Example:
-        >>> verify_grad(theano.tensor.tanh,
-        ...             (np.asarray([[2,3,4], [-1, 3.3, 9.9]]),),
-        ...             rng=np.random)
-
     Raises an Exception if the difference between the analytic gradient and
     numerical gradient (computed through the Finite Difference Method) of a
     random projection of the fun's output to a scalar exceeds the given
     tolerance.
 
-    :param fun: a Python function that takes Theano variables as inputs,
-        and returns a Theano variable. For instance, an Op instance with
-        a single output.
-    :param pt: the list of numpy.ndarrays to use as input values.
-        These arrays must be either float16, float32, or float64 arrays.
-    :param n_tests: number of times to run the test
-    :param rng: random number generator used to sample u, we test gradient
-        of sum(u * fun) at pt
-    :param eps: stepsize used in the Finite Difference Method (Default
-        None is type-dependent)
-        Raising the value of eps can raise or lower the absolute and
-        relative errors of the verification depending on the
-        Op. Raising eps does not lower the verification quality
-        for linear operations. It
-        is better to raise eps than raising abs_tol or rel_tol.
-    :param out_type: dtype of output, if complex (i.e. 'complex32' or
-        'complex64')
-    :param abs_tol: absolute tolerance used as threshold for gradient
-        comparison
-    :param rel_tol: relative tolerance used as threshold for gradient
-        comparison
-    :param cast_to_output_type: if the output is float32 and
-        cast_to_output_type is True, cast the random projection to
-        float32. Otherwise it is float64. float16 is not handled here.
-    :param no_debug_ref: Don't use DebugMode for the numerical
-        gradient function.
-
-    :note: This function does not support multiple outputs. In
+    Examples
+    --------
+    >>> verify_grad(theano.tensor.tanh,
+    ...             (np.asarray([[2, 3, 4], [-1, 3.3, 9.9]]),),
+    ...             rng=np.random)
+
+    Parameters
+    ----------
+    fun : a Python function
+        `fun` takes Theano variables as inputs, and returns a Theano variable.
+        For instance, an Op instance with a single output.
+    pt : list of numpy.ndarrays
+        Input values, points where the gradient is estimated.
+        These arrays must be either float16, float32, or float64 arrays.
+    n_tests : int
+        number of times to run the test
+    rng : numpy.random.RandomState, optional
+        random number generator used to sample the output random projection `u`,
+        we test gradient of sum(u * fun) at `pt`
+    eps : float, optional
+        stepsize used in the Finite Difference Method (Default
+        None is type-dependent).
+        Raising the value of eps can raise or lower the absolute
+        and relative errors of the verification depending on the
+        Op. Raising eps does not lower the verification quality for
+        linear operations. It is better to raise `eps` than raising
+        `abs_tol` or `rel_tol`.
+    out_type : string
+        dtype of output, if complex (i.e., 'complex32' or 'complex64')
+    abs_tol : float
+        absolute tolerance used as threshold for gradient comparison
+    rel_tol : float
+        relative tolerance used as threshold for gradient comparison
+    cast_to_output_type : bool
+        if the output is float32 and cast_to_output_type is True, cast
+        the random projection to float32. Otherwise it is float64.
+        float16 is not handled here.
+    no_debug_ref : bool
+        Don't use DebugMode for the numerical gradient function.
+
+    Note
+    ----
+    This function does not support multiple outputs. In
     tests/test_scan.py there is an experimental verify_grad that
     covers that case as well by using random projections.
@@ -1813,26 +1842,33 @@ verify_grad.E_grad = GradientError
 def jacobian(expression, wrt, consider_constant=None,
              disconnected_inputs='raise'):
     """
-    :type expression: Vector (1-dimensional) Variable
-    :type wrt: Variable or list of Variables
-    :param consider_constant: a list of expressions not to backpropagate
-        through
+    Compute the full Jacobian, row by row.
+
+    Parameters
+    ----------
+    expression : Vector (1-dimensional) :class:`~theano.gof.graph.Variable`
+        Values that we are differentiating (that we want the Jacobian of)
+    wrt : :class:`~theano.gof.graph.Variable` or list of Variables
+        Term[s] with respect to which we compute the Jacobian
+    consider_constant : list of variables
+        Expressions not to backpropagate through
 
-    :type disconnected_inputs: string
-    :param disconnected_inputs: Defines the behaviour if some of the variables
-        in ``wrt`` are not part of the computational graph computing ``cost``
+    disconnected_inputs : string
+        Defines the behaviour if some of the variables
+        in `wrt` are not part of the computational graph computing `cost`
        (or if all links are non-differentiable). The possible values are:
        - 'ignore': considers that the gradient on these parameters is zero.
        - 'warn': consider the gradient zero, and print a warning.
        - 'raise': raise an exception.
 
-    :return: either an instance of Variable or list/tuple of Variables
-        (depending upon `wrt`) representing the jacobian of `expression`
-        with respect to (elements of) `wrt`. If an element of `wrt` is not
-        differentiable with respect to the output, then a zero
-        variable is returned. The return value is of same type
-        as `wrt`: a list/tuple or TensorVariable in all cases.
+    Returns
+    -------
+    :class:`~theano.gof.graph.Variable` or list/tuple of Variables (depending upon `wrt`)
+        The Jacobian of `expression` with respect to (elements of) `wrt`.
+        If an element of `wrt` is not differentiable with respect to the
+        output, then a zero variable is returned. The return value is
+        of same type as `wrt`: a list/tuple or TensorVariable in all cases.
     """
     from theano.tensor import arange
     # Check inputs have the right format
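A usage sketch with the standard diagonal example (not part of the patch):

>>> import theano
>>> import theano.tensor as T
>>> x = T.dvector('x')
>>> y = x ** 2
>>> J = theano.gradient.jacobian(y, x)
>>> f = theano.function([x], J)
>>> f([4, 4])
array([[ 8.,  0.],
       [ 0.,  8.]])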
@@ -1886,27 +1922,29 @@ def jacobian(expression, wrt, consider_constant=None,
 def hessian(cost, wrt, consider_constant=None,
             disconnected_inputs='raise'):
     """
-    :type cost: Scalar (0-dimensional) Variable.
-    :type wrt: Vector (1-dimensional tensor) 'Variable' or list of
+    Parameters
+    ----------
+    cost : Scalar (0-dimensional) variable.
+    wrt : Vector (1-dimensional tensor) 'Variable' or list of
        vectors (1-dimensional tensors) Variables
-
-    :param consider_constant: a list of expressions not to backpropagate
-        through
-
-    :type disconnected_inputs: string
-    :param disconnected_inputs: Defines the behaviour if some of the variables
+    consider_constant :
+        a list of expressions not to backpropagate through
+    disconnected_inputs : string
+        Defines the behaviour if some of the variables
        in ``wrt`` are not part of the computational graph computing ``cost``
        (or if all links are non-differentiable). The possible values are:
        - 'ignore': considers that the gradient on these parameters is zero.
        - 'warn': consider the gradient zero, and print a warning.
        - 'raise': raise an exception.
 
-    :return: either an instance of Variable or list/tuple of Variables
-        (depending upon `wrt`) representing the Hessian of the `cost`
-        with respect to (elements of) `wrt`. If an element of `wrt` is not
-        differentiable with respect to the output, then a zero
-        variable is returned. The return value is of same type
-        as `wrt`: a list/tuple or TensorVariable in all cases.
+    Returns
+    -------
+    :class:`~theano.gof.graph.Variable` or list/tuple of Variables
+        The Hessian of the `cost` with respect to (elements of) `wrt`.
+        If an element of `wrt` is not differentiable with respect to the
+        output, then a zero variable is returned. The return value is
+        of same type as `wrt`: a list/tuple or TensorVariable in all cases.
     """
     from theano.tensor import arange
     # Check inputs have the right format
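The matching sketch for second derivatives (not part of the patch):

>>> import theano
>>> import theano.tensor as T
>>> x = T.dvector('x')
>>> cost = (x ** 2).sum()
>>> H = theano.gradient.hessian(cost, x)
>>> f = theano.function([x], H)
>>> f([4, 4])
array([[ 2.,  0.],
       [ 0.,  2.]])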
@@ -2034,10 +2072,16 @@ def zero_grad(x):
     through with a value of zero. In other words, the gradient of
     the expression is truncated to 0.
 
-    :param x: A Theano expression whose gradient should be truncated.
+    Parameters
+    ----------
+    x : :class:`~theano.gof.graph.Variable`
+        A Theano expression whose gradient should be truncated.
 
-    :return: The expression is returned unmodified, but its gradient
-        is now truncated to 0.
+    Returns
+    -------
+    :class:`~theano.gof.graph.Variable`
+        An expression equivalent to ``x``, with its gradient
+        truncated to 0.
     """
     return zero_grad_(x)
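A one-liner sketch of the effect (not part of the patch):

>>> import theano
>>> import theano.tensor as T
>>> from theano.gradient import zero_grad
>>> x = T.dscalar('x')
>>> y = zero_grad(x) ** 2          # forward value is still x**2
>>> g = theano.grad(y, x)          # but the gradient path is truncated
>>> float(theano.function([x], g)(3.0))
0.0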
@@ -2058,18 +2102,24 @@ undefined_grad_ = UndefinedGrad()
 def undefined_grad(x):
     """
-    Consider the gradient of this variable undefined and
-    generate an error message if its gradient is taken.
+    Consider the gradient of this variable undefined.
+
+    This will generate an error message if its gradient is taken.
 
     The expression itself is unaffected, but when its gradient is
     computed, or the gradient of another expression that this
     expression is a subexpression of, an error message will be generated
     specifying such gradient is not defined.
 
-    :param x: A Theano expression whose gradient should be undefined.
+    Parameters
+    ----------
+    x : :class:`~theano.gof.graph.Variable`
+        A Theano expression whose gradient should be undefined.
 
-    :return: The expression is returned unmodified, but its gradient
-        is now undefined.
+    Returns
+    -------
+    :class:`~theano.gof.graph.Variable`
+        An expression equivalent to ``x``, with its gradient undefined.
     """
     return undefined_grad_(x)
@@ -2090,8 +2140,9 @@ disconnected_grad_ = DisconnectedGrad()
 def disconnected_grad(x):
     """
-    Consider an expression constant when computing gradients,
-    while effectively not backpropagating through it.
+    Consider an expression constant when computing gradients.
+
+    It will effectively not be backpropagated through.
 
     The expression itself is unaffected, but when its gradient is
     computed, or the gradient of another expression that this
@@ -2101,11 +2152,17 @@ def disconnected_grad(x):
     has to go through the underlying computational graph related to the
     expression.
 
-    :param x: A Theano expression whose gradient should not be
+    Parameters
+    ----------
+    x : :class:`~theano.gof.graph.Variable`
+        A Theano expression whose gradient should not be
        backpropagated through.
 
-    :return: The expression is returned unmodified, but its gradient
-        is now effectively truncated to 0.
+    Returns
+    -------
+    :class:`~theano.gof.graph.Variable`
+        An expression equivalent to ``x``, with its gradient
+        now effectively truncated to 0.
     """
     return disconnected_grad_(x)
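A sketch of the effect (not part of the patch): only the path that bypasses `disconnected_grad` contributes to the gradient.

>>> import theano
>>> import theano.tensor as T
>>> from theano.gradient import disconnected_grad
>>> x = T.dscalar('x')
>>> y = disconnected_grad(x ** 2) + x   # only the `+ x` path is differentiated
>>> g = theano.grad(y, x)
>>> float(theano.function([x], g)(3.0))
1.0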
@@ -2133,22 +2190,27 @@ def grad_clip(x, lower_bound, upper_bound):
     This is an elemwise operation.
 
-    :param x: the variable we want its gradient inputs clipped
-    :param lower_bound: The lower bound of the gradient value
-    :param upper_bound: The upper bound of the gradient value.
-
-    :examples:
-
-        x = theano.tensor.scalar()
-        z = theano.tensor.grad(grad_clip(x, -1, 1)**2, x)
-        z2 = theano.tensor.grad(x**2, x)
-        f = theano.function([x], outputs = [z, z2])
-        print(f(2.0))  # output (1.0, 4.0)
-
-    :note: We register an opt in tensor/opt.py that removes the GradClip.
+    Parameters
+    ----------
+    x:
+        The variable whose gradient inputs we want clipped
+    lower_bound:
+        The lower bound of the gradient value
+    upper_bound:
+        The upper bound of the gradient value.
+
+    Examples
+    --------
+    >>> x = theano.tensor.scalar()
+    >>> z = theano.tensor.grad(grad_clip(x, -1, 1)**2, x)
+    >>> z2 = theano.tensor.grad(x**2, x)
+    >>> f = theano.function([x], outputs=[z, z2])
+    >>> print(f(2.0))
+    [array(1.0), array(4.0)]
+
+    Note
+    ----
+    We register an opt in tensor/opt.py that removes the GradClip,
     so it has zero cost in the forward pass and only does work in the grad.
     """
@@ -2167,21 +2229,25 @@ def grad_scale(x, multiplier):
     """
     This op scales or inverts the gradient in backpropagation.
 
-    :param x: the variable we want its gradient inputs scaled
-    :param multiplier: scale of the gradient
-
-    :examples:
-
-        x = theano.tensor.fscalar()
-        fx = theano.tensor.sin(x)
-        fp = theano.tensor.grad(fx, wrt=x)
-        fprime = theano.function([x], fp)
-        print(fprime(2))  # -0.416
-
-        f_inverse = grad_scale(fx, -1.)
-        fpp = theano.tensor.grad(f_inverse, wrt=x)
-        fpprime = theano.function([x], fpp)
-        print(fpprime(2))  # 0.416
+    Parameters
+    ----------
+    x:
+        The variable whose gradient inputs we want scaled
+    multiplier:
+        Scale of the gradient
+
+    Examples
+    --------
+    >>> x = theano.tensor.fscalar()
+    >>> fx = theano.tensor.sin(x)
+    >>> fp = theano.tensor.grad(fx, wrt=x)
+    >>> fprime = theano.function([x], fp)
+    >>> print(fprime(2))  # doctest: +ELLIPSIS
+    -0.416...
+    >>> f_inverse = grad_scale(fx, -1.)
+    >>> fpp = theano.tensor.grad(f_inverse, wrt=x)
+    >>> fpprime = theano.function([x], fpp)
+    >>> print(fpprime(2))  # doctest: +ELLIPSIS
+    0.416...
     """
     return GradScale(multiplier)(x)
@@ -84,11 +84,11 @@ class Cholesky(Op):
     """
     Cholesky decomposition reverse-mode gradient update.
 
-    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_
+    Symbolic expression for reverse-mode Cholesky gradient taken from [#]_
 
     References
     ----------
-    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
+    .. [#] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
     """
@@ -158,12 +158,12 @@ class CholeskyGrad(Op):
     def perform(self, node, inputs, outputs):
         """
-        Implements the "reverse-mode" gradient [1]_ for the
+        Implements the "reverse-mode" gradient [#]_ for the
         Cholesky factorization of a positive-definite matrix.
 
         References
         ----------
-        .. [1] S. P. Smith. "Differentiation of the Cholesky Algorithm".
+        .. [#] S. P. Smith. "Differentiation of the Cholesky Algorithm".
           Journal of Computational and Graphical Statistics,
           Vol. 4, No. 2 (Jun., 1995), pp. 134-147
           http://www.jstor.org/stable/1390762
@@ -268,13 +268,13 @@ class Solve(Op):
     def grad(self, inputs, output_gradients):
         """
-        Reverse-mode gradient updates for matrix solve operation c = A \ b.
+        Reverse-mode gradient updates for matrix solve operation c = A \\ b.
 
-        Symbolic expression for updates taken from [1]_.
+        Symbolic expression for updates taken from [#]_.
 
         References
         ----------
-        ..[1] M. B. Giles, "An extended collection of matrix derivative results
+        .. [#] M. B. Giles, "An extended collection of matrix derivative results
           for forward and reverse mode automatic differentiation",
           http://eprints.maths.ox.ac.uk/1079/