Merge pull request #3293 from harlouci/numpydoc_tensor

Numpydoc tensor

Merge pull request #3293 from harlouci/numpydoc_tensor
6304a061 · abergeron · 5e536853 · 8e88a292 · 6304a061 · 6304a061
--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
 """A `Type` and `Op` classes to work with numpy.ndarrays symbolically."""
 import sys
 import warnings
@@ -63,8 +62,10 @@ class ShapeError(Exception):
 def check_equal_numpy(x, y):
    """
-    Returns True iff x and y are equal (checks the dtype and
+    Return True iff x and y are equal.
-    shape if x and y are numpy.ndarray instances).
+    Checks the dtype and shape if x and y are numpy.ndarray instances.
    """
    if isinstance(x, numpy.ndarray) and isinstance(y, numpy.ndarray):
        return (x.dtype == y.dtype and x.shape == y.shape and
@@ -89,6 +90,7 @@ def constructor(f):
    Make `f` appear as a constructor in the oplist (`gen_oplist`,
    doc/oplist.txt).
    """
    __oplist_constructor_list.append(f)
    return f
@@ -107,8 +109,7 @@ if 0:
    # - JB 20100226
    def as_cuda_or_tensor_variable(x, name=None, ndim=None):
        """
-        This function do the same as_tensor_variable, but don't
+        Do the same as `as_tensor_variable`, but do not transfer the value to the GPU.
-        transfert the value on the gpu
        """
        if hasattr(x, '_as_CudaNdarrayVariable'):
            # TODO: pass name and ndim arguments
@@ -117,29 +118,31 @@ if 0:
 def as_tensor_variable(x, name=None, ndim=None):
-    """Return `x`, transformed into a `TensorType`
+    """Return `x`, transformed into a `TensorType`.
-    This function is often used by `make_node` methods of `Op`
+    This function is often used by `make_node` methods of `Op` subclasses
-    subclasses to turn ndarrays, numbers, `Scalar` instances, `Apply`
+    to turn ndarrays, numbers, `Scalar` instances, `Apply` instances and
-    instances and `TensorType` instances into valid input list
+    `TensorType` instances into valid input list elements.
-    elements.
+    Parameters
-    :Parameters:
+    ----------
-     - `x`: Apply instance, Variable instance, numpy.ndarray, or number
+    x : Apply instance, Variable instance, numpy.ndarray, or number
-       This thing will be transformed into a `Variable` in a sensible way.  An
+        This thing will be transformed into a `Variable` in a sensible way. An
-       ndarray argument will not be copied, but a list of numbers will be
+        ndarray argument will not be copied, but a list of numbers will be
-       copied to make an ndarray.
+        copied to make an ndarray.
-     - `name`: str or None
+    name : str or None
-       If a new `Variable` instance is created, it will be named with this
+        If a new `Variable` instance is created, it will be named with this
-       string.
+        string.
-     - `ndim`: None or integer
+    ndim : None or integer
-       Return a Variable with this many dimensions.  Raise TypeError if it's
+        Return a Variable with this many dimensions. Raise TypeError if it's
-       not possible.
+        not possible.
-    :Exceptions:
+    Raises
-     - `ValueError`: raised if an `Apply` with more then one output is fetched
+    ------
-     - `AsTensorError`: raised if `x` cannot be converted to a TensorType
+    ValueError
-       Variable
+        If an `Apply` with more than one output is fetched.
+    AsTensorError
+        If `x` cannot be converted to a TensorType Variable.
    """
    if hasattr(x, '_as_TensorVariable'):
@@ -231,16 +234,18 @@ class NumpyAutocaster(object):
              float32);
            - if no data type can represent `x` without loss of precision, then
              the last data type in the tuple will be used.
+    Parameters
+    ----------
+    dtypes : tuple of strings
+        The ordered list of preferred data types (only used when
+        `config.cast_policy` is set to 'custom', see the `NumpyAutocaster`
+        help for details).
    """
-    def __init__(self, dtypes):
-        """
-        Constructor.
-        :type dtypes: Tuple of strings.
+    def __init__(self, dtypes):
-        :param dtypes: The ordered list of preferred data types (only used when
-        `config.cast_policy` is set to 'custom', see the `NumpyAutocaster` help
-        for details).
-        """
        self.dtypes = tuple(dtypes)
    def __call__(self, x):
@@ -312,17 +317,20 @@ autocast_float = NumpyAutocaster(('float16', 'float32', 'float64'))
 #
 class autocast_float_as(object):
    """
+    Temporarily adjust autocasting behavior.
    This class makes it possible to temporarily and locally adjust autocasting
    behavior when `config.cast_policy` is set to 'custom'.
    If `config.cast_policy` is not 'custom', an exception is raised.
+    This class might be convenient in some code, but it definitely
+    helps to test the autocasting mechanism.
-    For example:
+    Examples
+    --------
    >>> with autocast_float_as('float32'):
    ...    assert (fvector() + 1.1).dtype == 'float32'  # temporary downcasting
    >>> assert (fvector() + 1.1).dtype == 'float64' # back to default behaviour
-    This class might be convenient in some code, but it definitely
-    helps to test the autocasting mechanism.
    """
    def __init__(self, *dtypes):
        self.dtypes = dtypes
@@ -339,11 +347,14 @@ class autocast_float_as(object):
 def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
-    """Return a symbolic `Constant` with value `x`
+    """Return a symbolic `Constant` with value `x`.
-    :Exceptions:
+    Raises
-     - `TypeError`: `x` could not be converted to a numpy.ndarray
+    ------
-     - `ValueError`: `x` could not be expanded to have ndim dimensions
+    TypeError
+        `x` could not be converted to a numpy.ndarray.
+    ValueError
+        `x` could not be expanded to have ndim dimensions.
    """
    if dtype is not None:
@@ -507,8 +518,13 @@ class EmptyConstantError(NotScalarConstantError):
 def numpy_scalar(data):
-    """ Return a scalar stored in a numpy ndarray, or raise
+    """ Return a scalar stored in a numpy ndarray.
-    NotScalarConstantError if the numpy ndarray is not a scalar
+    Raises
+    ------
+    NotScalarConstantError
+        If the numpy ndarray is not a scalar.
    """
    # handle case where data is numpy.array([])
@@ -536,26 +552,29 @@ get_scalar_constant_value_elemwises = (
 def get_scalar_constant_value(orig_v, elemwise=True,
                              only_process_constants=False):
-    """return the constant scalar(0-D) value underlying variable `v`
+    """Return the constant scalar(0-D) value underlying variable `v`.
-    If v is the output of dimshuffles, fills, allocs, rebroadcasts,
+    If `v` is the output of dimshuffles, fills, allocs, rebroadcasts,
-    cast, OutputGuard, DeepCopyOp, ScalarFromTensor, ScalarOp,
+    cast, OutputGuard, DeepCopyOp, ScalarFromTensor, ScalarOp, Elemwise
-    Elemwise and some pattern with Subtensor,
+    and some pattern with Subtensor, this function digs through them.
-    this function digs through them.
    If `v` is not some view of constant scalar data, then raise a
    NotScalarConstantError.
-    :param elemwise: If False, we won't try to go into elemwise.
+    Parameters
-        So this call is faster.
+    ----------
+    elemwise : bool
+        If False, we won't try to go into elemwise. So this call is faster.
+    only_process_constants : bool
+        If True, we only attempt to obtain the value of `orig_v` if it's
+        directly constant and don't try to dig through dimshuffles, fills,
+        allocs, and other ops to figure out its value.
-    :param only_process_constants: If True, we only attempt to obtain
+    Notes
-            the value of `orig_v` if it's directly constant and don't
+    -----
-            try to dig through dimshuffles, fills, allocs, and other to figure
+    There may be another function similar to this one in the code,
-            out its value.
+    but I'm not sure where it is.
-    :note: There may be another function similar to this one in the
-        code, but I'm not sure where it is.
    """
    v = orig_v
    while True:
@@ -773,8 +792,14 @@ lscalar = TensorType('int64', ())
 def scalar(name=None, dtype=None):
    """Return a symbolic scalar variable.
-    :param dtype: numeric type (None means to use theano.config.floatX)
-    :param name: a name to attach to this variable
+    Parameters
+    ----------
+    dtype : numeric type
+        None means to use theano.config.floatX.
+    name
+        A name to attach to this variable.
    """
    if dtype is None:
        dtype = config.floatX
@@ -803,8 +828,14 @@ lvector = TensorType('int64', (False, ))
 def vector(name=None, dtype=None):
    """Return a symbolic vector variable.
-    :param dtype: numeric type (None means to use theano.config.floatX)
-    :param name: a name to attach to this variable
+    Parameters
+    ----------
+    dtype : numeric type
+        None means to use theano.config.floatX.
+    name
+        A name to attach to this variable.
    """
    if dtype is None:
        dtype = config.floatX
@@ -830,8 +861,14 @@ lmatrix = TensorType('int64', (False, False))
 def matrix(name=None, dtype=None):
    """Return a symbolic matrix variable.
-    :param dtype: numeric type (None means to use theano.config.floatX)
-    :param name: a name to attach to this variable
+    Parameters
+    ----------
+    dtype : numeric type
+        None means to use theano.config.floatX.
+    name
+        A name to attach to this variable.
    """
    if dtype is None:
        dtype = config.floatX
@@ -857,8 +894,14 @@ lrow = TensorType('int64', (True, False))
 def row(name=None, dtype=None):
    """Return a symbolic row variable (ndim=2, broadcastable=[True,False]).
-    :param dtype: numeric type (None means to use theano.config.floatX)
-    :param name: a name to attach to this variable
+    Parameters
+    ----------
+    dtype : numeric type
+        None means to use theano.config.floatX.
+    name
+        A name to attach to this variable.
    """
    if dtype is None:
        dtype = config.floatX
@@ -878,8 +921,14 @@ lcol = TensorType('int64', (False, True))
 def col(name=None, dtype=None):
    """Return a symbolic column variable (ndim=2, broadcastable=[False,True]).
-    :param dtype: numeric type (None means to use theano.config.floatX)
-    :param name: a name to attach to this variable
+    Parameters
+    ----------
+    dtype : numeric type
+        None means to use theano.config.floatX.
+    name
+        A name to attach to this variable.
    """
    if dtype is None:
        dtype = config.floatX
@@ -899,8 +948,14 @@ ltensor3 = TensorType('int64', ((False,) * 3))
 def tensor3(name=None, dtype=None):
    """Return a symbolic 3-D variable.
-    :param dtype: numeric type (None means to use theano.config.floatX)
-    :param name: a name to attach to this variable
+    Parameters
+    ----------
+    dtype : numeric type
+        None means to use theano.config.floatX.
+    name
+        A name to attach to this variable.
    """
    if dtype is None:
        dtype = config.floatX
@@ -922,8 +977,14 @@ ltensor4 = TensorType('int64', ((False,) * 4))
 def tensor4(name=None, dtype=None):
    """Return a symbolic 4-D variable.
-    :param dtype: numeric type (None means to use theano.config.floatX)
-    :param name: a name to attach to this variable
+    Parameters
+    ----------
+    dtype : numeric type
+        None means to use theano.config.floatX.
+    name
+        A name to attach to this variable.
    """
    if dtype is None:
        dtype = config.floatX
@@ -957,6 +1018,7 @@ def _scal_elemwise_with_nfunc(nfunc, nin, nout):
    **destination** inputs it takes. That is, the function should
    take nin+nout inputs. nout == 0 means that the numpy function
    does not take a numpy array argument to put its result in.
    """
    def construct(symbol):
        symbolname = symbol.__name__
@@ -1183,7 +1245,9 @@ def cast(x, dtype):
 class MaxAndArgmax(Op):
-    """Calculate the max and argmax over a given axis or over all axes.
+    """
+    Calculate the max and argmax over a given axis or over all axes.
    """
    nin = 2  # tensor, axis
    nout = 2  # max val, max idx
@@ -1418,6 +1482,7 @@ def makeKeepDims(x, y, axis):
    Reintroduces in y with length one the axes of x which have been left out
    in a prior reduction of x. With this option, the resulting tensor will
    broadcast correctly against the original tensor x.
    """
    x = as_tensor_variable(x)
    y = as_tensor_variable(y)
@@ -1453,14 +1518,18 @@ def makeKeepDims(x, y, axis):
 def max_and_argmax(a, axis=None, keepdims=False):
    """
    Returns maximum elements and their indices obtained by iterating over
-    given axis
+    given axis.
    When axis is None (the default value), the max is performed
    over the flattened tensor.
-    keepdims: If this is set to True, the axes which are reduced are left in
+    Parameters
+    ----------
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in
        the result as dimensions with size one. With this option, the result
        will broadcast correctly against the original tensor.
    """
    out, argout = _max_and_argmax(a, axis)
@@ -1474,16 +1543,22 @@ def max_and_argmax(a, axis=None, keepdims=False):
 @constructor
 def max(x, axis=None, keepdims=False):
    """
-    Returns maximum elements obtained by iterating over given axis
+    Returns maximum elements obtained by iterating over given axis.
    When axis is None (the default value), the max is performed
    over the flattened tensor.
-    keepdims: If this is set to True, the axes which are reduced are left in
+    Parameters
+    ----------
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in
        the result as dimensions with size one. With this option, the result
        will broadcast correctly against the original tensor.
-    :note: we return an error as numpy when we reduce a dim with a shape of 0
+    Notes
+    -----
+    Like numpy, we raise an error when reducing a dimension of size 0.
    """
    # We have a choice of implementing this call with the
@@ -1511,14 +1586,18 @@ def max(x, axis=None, keepdims=False):
 @constructor
 def argmax(x, axis=None, keepdims=False):
    """
-    Returns indices of maximum elements obtained by iterating over given axis
+    Returns indices of maximum elements obtained by iterating over given axis.
    When axis is None (the default value), the argmax is performed
    over the flattened tensor.
-    keepdims: If this is set to True, the axes which are reduced are left in
+    Parameters
+    ----------
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in
        the result as dimensions with size one. With this option, the result
        will broadcast correctly against the original tensor.
    """
    # In python (using MaxAndArgmax.perform()) this leads to a wasteful
    # implementation that goes through the data twice instead of once
@@ -1534,14 +1613,18 @@ def argmax(x, axis=None, keepdims=False):
 @constructor
 def min(x, axis=None, keepdims=False):
    """
-    Returns minimum elements obtained by iterating over given axis
+    Returns minimum elements obtained by iterating over given axis.
    When axis is None (the default value), the min is performed
    over the flattened tensor.
-    keepdims: If this is set to True, the axes which are reduced are left in
+    Parameters
+    ----------
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in
        the result as dimensions with size one. With this option, the result
        will broadcast correctly against the original tensor.
    """
    x = as_tensor_variable(x)
    str_x_type = str(x.dtype)
@@ -1555,14 +1638,18 @@ def min(x, axis=None, keepdims=False):
 @constructor
 def argmin(x, axis=None, keepdims=False):
    """
-    Returns indices of minimum elements obtained by iterating over given axis
+    Returns indices of minimum elements obtained by iterating over given axis.
    When axis is None (the default value), the argmin is performed
    over the flattened tensor.
-    keepdims: If this is set to True, the axes which are reduced are left in
+    Parameters
+    ----------
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in
        the result as dimensions with size one. With this option, the result
        will broadcast correctly against the original tensor.
    """
    x = as_tensor_variable(x)
    str_x_type = str(x.dtype)
@@ -1579,6 +1666,7 @@ def smallest(*args):
    Return the [elementwise] smallest of a variable number of arguments.
    Like python's min.
    """
    if len(args) == 2:
        a, b = args
@@ -1593,6 +1681,7 @@ def largest(*args):
    Return the [elementwise] largest of a variable number of arguments.
    Like python's max.
    """
    if len(args) == 2:
        a, b = args
@@ -1647,31 +1736,34 @@ def isinf(a):
 def allclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
    """
-    Implements Numpy's ``allclose`` on tensors.
+    Implement Numpy's ``allclose`` on tensors.
    ``absolute(a - b) <= (atol + rtol * absolute(b))``
-    :note: Not a symmetric equation. See Numpy's documentation.
+    Parameters
+    ----------
-    :param a: input to compare
+    a : tensor
-    :type a: tensor
+        Input to compare.
+    b : tensor
-    :param b: input to compare
+        Input to compare.
-    :type b: tensor
+    rtol : float
+        The relative tolerance parameter.
-    :param rtol: the relative tolerance parameter
+    atol : float
-    :type rtol: float
+        The absolute tolerance parameter.
+    equal_nan : bool
+        Whether to consider nan's in the same place to be close.
-    :param atol: the absolute tolerance parameter
+    Returns
-    :type atol: float
+    -------
+    bool
+        A boolean value (of type int8 returned by the tensor elementwise `all`
+        function) whether all elements in a and b are in the tolerance range
+        defined above.
-    :param equal_nan: whether to consider nan's in the same place to be close
+    Notes
-    :type equal_nan: bool
+    -----
+    Not a symmetric equation. See Numpy's documentation.
-    :returns: a boolean value (of type int8 returned by the tensor
-            elementwise `all` function) whether all elements in a and b are in
-            the tolerance range defined above.
-    :rtype: int8
    """
    return all(isclose(a, b, rtol, atol, equal_nan))
@@ -1680,34 +1772,38 @@ def isclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
    """
    Implements Numpy's ``isclose`` on tensors.
-    The tolerance values are positive, typically very small numbers.  The
+    The tolerance values are positive, typically very small numbers. The
    relative difference (`rtol` * abs(`b`)) and the absolute difference
    `atol` are added together to compare against the absolute difference
    between `a` and `b`.
    ``absolute(a - b) <= (atol + rtol * absolute(b))``
-    :note: Not a symmetric equation. See Numpy's documentation.
+    Parameters
+    ----------
-    :param a: input to compare
+    a : tensor
-    :type a: tensor
+        Input to compare.
+    b : tensor
-    :param b: input to compare
+        Input to compare.
-    :type b: tensor
+    rtol : float
+        The relative tolerance parameter.
-    :param rtol: the relative tolerance parameter
+    atol : float
-    :type rtol: float
+        The absolute tolerance parameter.
+    equal_nan : bool
-    :param atol: the absolute tolerance parameter
+        Whether to consider nan's in the same place to be close.
-    :type atol: float
-    :param equal_nan: whether to consider nan's in the same place to be close
+    Returns
-    :type equal_nan: bool
+    -------
+    int8
+        A boolean (int8) array where two arrays are element-wise equal
+        within a tolerance.
-    :returns: returns a boolean (int8) array where two arrays are element-wise
+    Notes
-            equal within a tolerance.
+    -----
-    :rtype: int8
+    Not a symmetric equation. See Numpy's documentation.
+    Examples
+    --------
    >>> import theano
    >>> import numpy as np
    >>> a = theano._asarray([1e10, 1e-7], dtype="float64")
@@ -1738,6 +1834,7 @@ def isclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
    >>> b = theano._asarray([1.0, np.inf], dtype="float64")
    >>> theano.tensor.isclose(a, b).eval()
    array([1, 1], dtype==int8)
    """
    # close will be an int8 array of 1 where within tolerance
    # and 0 where not within tolerance or there was a nan or inf value.
@@ -2164,8 +2261,8 @@ class Nonzero(gof.Op):
    Returns
    -------
-    result : matrix
+    matrix
-        matrix containing the indices of the non-zero elements of a.
+        Matrix containing the indices of the non-zero elements of a.
    See Also
    --------
@@ -2220,14 +2317,13 @@ def nonzero(a, return_matrix=False):
    ----------
    a : array_like
        Input array.
    return_matrix : bool
        If True, returns a symbolic matrix. If False, returns a tuple of
        arrays. Defaults to False.
    Returns
    -------
-    result : tuple of vectors or matrix
+    tuple of vectors or matrix
    See Also
    --------
@@ -2260,7 +2356,7 @@ def flatnonzero(a):
    Returns
    -------
-    res : vector
+    vector
        Output vector, containing the indices of the elements of `a.flatten()`
        that are non-zero.
@@ -2268,6 +2364,7 @@ def flatnonzero(a):
    --------
    nonzero : Return the indices of the non-zero elements of the input array.
    nonzero_values : Return the non-zero elements of the input array
    """
    if a.ndim == 0:
        raise ValueError('Nonzero only supports non-scalar arrays.')
@@ -2299,7 +2396,7 @@ def nonzero_values(a):
    Returns
    -------
-    res : vector
+    vector
        Output vector, containing the non-zero elements of a.
    See Also
@@ -2307,6 +2404,7 @@ def nonzero_values(a):
    nonzero : Return the indices of the non-zero elements of the input array.
    flatnonzero : Return the indices of the non-zero elements of the
        flattened input array.
    """
    return a.flatten()[flatnonzero(a)]
@@ -2362,9 +2460,10 @@ def tri(N, M=None, k=0, dtype=None):
    Returns
    -------
-    tri : Array of shape (N, M)
+    Array of shape (N, M)
        Array with its lower triangle filled with ones and zero elsewhere;
        in other words ``T[i,j] == 1`` for ``i <= j + k``, 0 otherwise.
    """
    if dtype is None:
        dtype = config.floatX
@@ -2390,12 +2489,13 @@ def tril(m, k=0):
    Returns
    -------
-    tril : array, shape (M, N)
+    array, shape (M, N)
        Lower triangle of `m`, of same shape and data-type as `m`.
    See Also
    --------
-    triu : same thing, only for the upper triangle
+    triu : Same thing, only for the upper triangle.
    """
    return m * tri(m.shape[0], m.shape[1], k=k, dtype=m.dtype)
@@ -2411,7 +2511,8 @@ def triu(m, k=0):
    See Also
    --------
-    tril : lower triangle of an array
+    tril : Lower triangle of an array.
    """
    return m * (1 - tri(m.shape[0], m.shape[1], k=k - 1, dtype=m.dtype))
@@ -2456,21 +2557,22 @@ def eye(n, m=None, k=0, dtype=None):
    Parameters
    ----------
    n : int
-      Number of rows in the output.
+        Number of rows in the output.
    m : int, optional
-      Number of columns in the output. If None, defaults to `N`.
+        Number of columns in the output. If None, defaults to `n`.
    k : int, optional
-      Index of the diagonal: 0 (the default) refers to the main diagonal,
+        Index of the diagonal: 0 (the default) refers to the main diagonal,
-      a positive value refers to an upper diagonal, and a negative value
+        a positive value refers to an upper diagonal, and a negative value
-      to a lower diagonal.
+        to a lower diagonal.
    dtype : data-type, optional
-      Data-type of the returned array.
+        Data-type of the returned array.
    Returns
    -------
-    I : ndarray of shape (N,M)
+    ndarray of shape (N,M)
-      An array where all elements are equal to zero, except for the `k`-th
+        An array where all elements are equal to zero, except for the `k`-th
-      diagonal, whose values are equal to one.
+        diagonal, whose values are equal to one.
    """
    if dtype is None:
        dtype = config.floatX
@@ -2485,7 +2587,7 @@ def identity_like(x):
 class Alloc(gof.Op):
-    """Create a Tensor from an initial value and a desired shape
+    """Create a Tensor from an initial value and a desired shape.
    alloc(value, shape0, shape1, ..., shapeN)
@@ -2500,6 +2602,7 @@ class Alloc(gof.Op):
    This Op is used to replace fill() during optimizations because after shapes
    are lifted, the first argument to fill can often be pruned from the graph.
    """
    __props__ = ()
@@ -2642,6 +2745,7 @@ class Alloc(gof.Op):
        for size mismatches.
        If you always want an Alloc node, call make_node.
        """
        ret = super(Alloc, self).__call__(val, *shapes, **kwargs)
        try:
@@ -2709,18 +2813,22 @@ pprint.assign(tensor_copy, printing.IgnorePrinter())
 @constructor
 def sum(input, axis=None, dtype=None, keepdims=False, acc_dtype=None):
    """
-    Computes the sum along the given axis(es) of a tensor `input`
+    Computes the sum along the given axis(es) of a tensor `input`.
    When axis is None (the default value), the sum is performed
    over the flattened tensor.
-    keepdims: If this is set to True, the axes which are reduced are left in
-        the result as dimensions with size one. With this option, the result
-        will broadcast correctly against the original tensor.
    For full documentation see ``tensor.elemwise.Sum``.
    In particular please pay attention to the important warning when using
    a custom acc_dtype.
+    Parameters
+    ----------
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in
+        the result as dimensions with size one. With this option, the result
+        will broadcast correctly against the original tensor.
    """
    out = elemwise.Sum(axis=axis, dtype=dtype, acc_dtype=acc_dtype)(input)
@@ -2736,16 +2844,20 @@ pprint.assign(Sum(), printing.FunctionPrinter('sum'))
 def prod(input, axis=None, dtype=None, keepdims=False, acc_dtype=None,
         no_zeros_in_input=False):
    """
-    Computes the product along the given axis(es) of a tensor `input`
+    Computes the product along the given axis(es) of a tensor `input`.
    When axis is None (the default value), the product is performed
    over the flattened tensor.
-    keepdims: If this is set to True, the axes which are reduced are left in
+    For full documentation see ``tensor.elemwise.Prod``.
+    Parameters
+    ----------
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in
        the result as dimensions with size one. With this option, the result
        will broadcast correctly against the original tensor.
-    For full documentation see ``tensor.elemwise.Prod``.
    """
    out = elemwise.Prod(axis, dtype=dtype, acc_dtype=acc_dtype,
@@ -2803,31 +2915,32 @@ class Mean(elemwise.CAReduce):
 def mean(input, axis=None, dtype=None, op=False, keepdims=False,
         acc_dtype=None):
    """
-    Computes the mean value along the given axis(es) of a tensor `input`
+    Computes the mean value along the given axis(es) of a tensor `input`.
-    :param axis: compute the mean along this axis of the tensor.
+    Parameters
-                 None means all axes (like numpy).
+    ----------
-    :type axis: None or int or (list of int) (see `Sum`)
+    axis : None or int or (list of int) (see `Sum`)
+        Compute the mean along this axis of the tensor.
-    :param dtype: dtype to cast the result of the inner summation into.
+        None means all axes (like numpy).
+    dtype : None or string
+        Dtype to cast the result of the inner summation into.
        For instance, by default, a sum of a float32 tensor will be
        done in float64 (acc_dtype would be float64 by default),
        but that result will be casted back in float32.
-    :type dtype: None or string
+    keepdims : bool
+        If this is set to True, the axes which are reduced are
-    :param keepdims: If this is set to True, the axes which are reduced are
        left in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the original tensor.
+    acc_dtype : None or string
+        Dtype to use for the inner summation. This will not
+        necessarily be the dtype of the output (in particular
+        if it is a discrete (int/uint) dtype, the output will
+        be in a float type). If None, then we use the same rules as `sum()`.
-    :param acc_dtype: dtype to use for the inner summation. This will not
+    Notes
-                  necessarily be the dtype of the output (in particular
+    -----
-                  if it is a discrete (int/uint) dtype, the output will
+    For gpu, if you specify dtype=float32, everything will be done on the gpu.
-                  be in a float type).
-                  If None, then we use the same rules as `sum()`.
-    :type acc_dtype: None or string
-    :note: for gpu, if you specify dtype=float32, everything will be done
-           on the gpu.
    """
    if op:
@@ -2896,18 +3009,23 @@ def var(input, axis=None, keepdims=False):
    """
    Computes the variance along the given axis(es) of a tensor `input`.
-    :param axis: Compute the variance along this axis of the tensor.
+    Parameters
-                 None means all axes (like numpy).
+    ----------
-    :type axis: None or int or (list of int) (see `Sum`)
+    axis : None or int or (list of int) (see `Sum`)
+        Compute the variance along this axis of the tensor.
-    :param keepdims: If this is set to True, the axes which are reduced are
+        None means all axes (like numpy).
+    keepdims : bool
+        If this is set to True, the axes which are reduced are
        left in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the original tensor.
-    :note: It uses the two-pass algorithm for more stable results.
+    Notes
-        https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
+    -----
-        There exist other implementations that are even more stable, but
+    It uses the two-pass algorithm for more stable results.
-        probably slower.
+    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
+    There exist other implementations that are even more stable, but probably
+    slower.
    """
    input_ndim = input.type.ndim
@@ -2933,26 +3051,26 @@ def var(input, axis=None, keepdims=False):
 @constructor
 def std(input, axis=None, keepdims=False):
    """
-    Computes the standard deviation along the given axis(es)
+    Computes the standard deviation along the given axis(es) of a tensor `input`.
-    of a tensor `input`.
-    :param axis: Compute the standard deviation along this
+    Parameters
-                axis of the tensor.
+    ----------
-                 None means all axes (like numpy).
+    axis : None or int or (list of int) (see `Sum`)
-    :type axis: None or int or (list of int) (see `Sum`)
+        Compute the standard deviation along this axis of the tensor.
+        None means all axes (like numpy).
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the
+        result as dimensions with size one. With this option, the result will
+        broadcast correctly against the original tensor.
-    :param keepdims: If this is set to True, the axes
+    Notes
-        which are reduced are
+    -----
-        left in the result as dimensions with size one.
+    It calls `var()` and `var()` uses the two-pass algorithm for more stable
-        With this option,
+    results.
-        the result will broadcast correctly against the
+    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
-        original tensor.
+    There exist other implementations that are even more stable, but probably
+    slower.
-    :note: It calls `var()` and `var()` uses the two-pass algorithm for more
-        stable results.
-        https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
-        There exist other implementations that are even more stable, but
-        probably slower.
    """
    return sqrt(var(input=input, axis=axis, keepdims=keepdims))
@@ -2960,10 +3078,12 @@ def std(input, axis=None, keepdims=False):
 class Default(gof.Op):
    """
-    Takes an input x and a default value. If the input is not None, a
+    Takes an input x and a default value.
-    reference to it is returned. If the input is None, a copy of the
-    default value is returned instead. The input and the default must
+    If the input is not None, a reference to it is returned.
-    have exactly the same type.
+    If the input is None, a copy of the default value is returned instead.
+    The input and the default must have exactly the same type.
    """
    view_map = {0: [0]}
    __props__ = ()
@@ -2994,15 +3114,13 @@ setdefault = default  # legacy
 ##########################
 @_scal_elemwise_with_nfunc('maximum', 2, 1)
 def maximum(x, y):
-    """elemwise maximum. See max for the maximum in one tensor
+    """elemwise maximum. See max for the maximum in one tensor"""
-    """
    # see decorator for function body
 @_scal_elemwise_with_nfunc('minimum', 2, 1)
 def minimum(x, y):
-    """elemwise minimum. See min for the minimum in one tensor
+    """elemwise minimum. See min for the minimum in one tensor"""
-    """
    # see decorator for function body
@@ -3058,6 +3176,7 @@ def ceil_intdiv(a, b):
    Safely compute ceil(float_division(a, b)).
    Works for all dtypes, but mostly useful when a and b are int.
    """
    # If a and b are int with not many significant bits, we could
    # cast them to float to avoid doing the modulo. We do not know if this
@@ -3099,13 +3218,17 @@ def pow(a, b):
 # So we do not use @scal_elemwise_with_nfunc('clip', 3, 1)
 @_scal_elemwise
 def clip(x, min, max):
-    """clip x to be between min and max.
+    """
+    Clip x to be between min and max.
+    Notes
+    -----
+    When `x` is equal to the boundaries, the output is considered
+    to be `x`, so at these points, the gradient of the cost wrt the output
+    will be propagated to `x`, not to `min` nor `max`. In other words,
+    on these points, the gradient wrt `x` will be equal to the gradient wrt
+    the output, and the gradient wrt `min` and `max` will be zero.
-    :note: When `x` is equal to the boundaries, the output is considered
-        to be `x`, so at these points, the gradient of the cost wrt the output
-        will be propagated to `x`, not to `min` nor `max`. In other words,
-        on these points, the gradient wrt `x` will be equal to the gradient wrt
-        the output, and the gradient wrt `min` and `max` will be zero.
    """
    # see decorator for function body
    # for grep: clamp, bound
@@ -3125,14 +3248,16 @@ pprint.assign(pow, printing.OperatorPrinter('**', 1, 'right'))
 def extract_constant(x, elemwise=True):
-    '''
+    """
-     This function is basically a call to tensor.get_scalar_constant_value. The
+    This function is basically a call to tensor.get_scalar_constant_value.
-     main difference is the behaviour in case of failure. While
-     get_scalar_constant_value raises an TypeError, this function returns x,
+    The main difference is the behaviour in case of failure. While
-     as a tensor if possible. If x is a ScalarVariable from a
+    get_scalar_constant_value raises a TypeError, this function returns x,
-     scalar_from_tensor, we remove the conversion. If x is just a
+    as a tensor if possible. If x is a ScalarVariable from a
-     ScalarVariable, we convert it to a tensor with tensor_from_scalar.
+    scalar_from_tensor, we remove the conversion. If x is just a
-    '''
+    ScalarVariable, we convert it to a tensor with tensor_from_scalar.
+    """
    try:
        x = get_scalar_constant_value(x, elemwise=elemwise)
    except NotScalarConstantError:
@@ -3150,8 +3275,7 @@ def transpose(x, axes=None):
    """
    Reorder the dimensions of x. (Default: reverse them)
-    This is a macro around dimshuffle that matches the numpy.transpose
+    This is a macro around dimshuffle that matches the numpy.transpose function.
-    function.
    """
    if axes is None:
@@ -3164,18 +3288,33 @@ def transpose(x, axes=None):
 def batched_dot(x, y):
    """
-    :param x: A Tensor with sizes e.g.: for  3D (dim1, dim3, dim2)
-    :param y: A Tensor with sizes e.g.: for 3D (dim1, dim2, dim4)
    This function computes the dot product between the two tensors, by
    iterating over the first dimension using scan.
-    Returns a tensor of size e.g. if it is 3D: (dim1, dim3, dim4)
-    Example:
+    Parameters
+    ----------
+    x : tensor
+        A Tensor with sizes e.g.: for 3D (dim1, dim3, dim2).
+    y : tensor
+        A Tensor with sizes e.g.: for 3D (dim1, dim2, dim4).
+    Returns
+    -------
+    tensor
+        A tensor of size e.g. if it is 3D: (dim1, dim3, dim4).
+    Notes
+    -----
+    This is a subset of numpy.einsum, but we do not provide it for now.
+    But numpy einsum is slower than dot or tensordot:
+    http://mail.scipy.org/pipermail/numpy-discussion/2012-October/064259.html
+    Examples
+    --------
    >>> first = tensor.tensor3('first')
    >>> second = tensor.tensor3('second')
    >>> result = batched_dot(first, second)
-    :note:  This is a subset of numpy.einsum, but we do not provide it for now.
-    But numpy einsum is slower than dot or tensordot:
-    http://mail.scipy.org/pipermail/numpy-discussion/2012-October/064259.html
    """
    result, updates = theano.scan(
        fn=lambda x_mat, y_mat:
@@ -3188,11 +3327,22 @@ def batched_dot(x, y):
 def batched_tensordot(x, y, axes=2):
    """
-    :param x: A Tensor with sizes e.g.: for 3D (dim1, dim3, dim2)
+    Compute the tensordot product.
-    :param y: A Tensor with sizes e.g.: for 3D (dim1, dim2, dim4)
-    :param axes: an integer or array. If an integer, the number of axes
+    A hybrid of batched_dot and tensordot, this function computes the
-        to sum over. If an array, it must have two array
+    tensordot product between the two tensors, by iterating over the
-        elements containing the axes to sum over in each tensor.
+    first dimension using scan to perform a sequence of tensordots.
+    Parameters
+    ----------
+    x : tensor
+        A Tensor with sizes e.g.: for 3D (dim1, dim3, dim2).
+    y : tensor
+        A Tensor with sizes e.g.: for 3D (dim1, dim2, dim4).
+    axes : int or array-like of length 2
+        If an integer, the number of axes to sum over.
+        If an array, it must have two array elements containing the axes to sum
+        over in each tensor.
        If an integer i, it is converted to an array containing
        the last i dimensions of the first tensor and the first
@@ -3206,11 +3356,7 @@ def batched_tensordot(x, y, axes=2):
        (Remember axes are zero-indexed!) The 2nd axis of a and the
        3rd axis of b must have the same shape; the same is true for
        the 3rd axis of a and the 5th axis of b.
-    :type axes: int or array-like of length 2
-    A hybrid of batch_dot and tensordot, this function computes the
-    tensordot product between the two tensors, by iterating over the
-    first dimension using scan to perform a sequence of tensordots.
    """
    if isinstance(axes, (list, numpy.ndarray)):
        if isinstance(axes, list):
@@ -3239,20 +3385,17 @@ def split(x, splits_size, n_splits, axis=0):
 class Split(Op):
    """Partition a `TensorVariable` along some axis.
-    .. python::
+    Examples
+    --------
-        x = vector()
+    >>> x = vector()
-        splits = lvector()
+    >>> splits = lvector()
-        # you have to declare right away how many split_points there will be.
+    You have to declare right away how many split_points there will be.
-        ra, rb, rc = split(x, splits, n_splits = 3, axis = 0)
+    >>> ra, rb, rc = split(x, splits, n_splits = 3, axis = 0)
+    >>> f = function([x, splits], [ra, rb, rc])
-        f = function([x, splits], [ra, rb, rc])
+    >>> a, b, c = f([0,1,2,3,4,5], [3, 2, 1])
+    a == [0,1,2]
-        a, b, c = f([0,1,2,3,4,5], [3, 2, 1])
+    b == [3, 4]
+    c == [5]
-        #a == [0,1,2]
-        #b == [3, 4]
-        #c == [5]
    """
@@ -3370,6 +3513,7 @@ class Split(Op):
 def addbroadcast(x, *axes):
    """
    Make the input broadcastable in the specified axes.
    For example, addbroadcast(x, 0) will make the first dimension of
    x broadcastable. When performing the function, if the length of
    x along that dimension is not 1, a ValueError will be raised.
@@ -3377,20 +3521,19 @@ def addbroadcast(x, *axes):
    We apply the opt here not to pollute the graph especially during
    the gpu optimization
-    Parameters:
+    Parameters
-    ------------
-        x : tensor_like
-            Input theano tensor.
-        axis : an int or an iterable object such as list or tuple
-               of int values
-               The dimension along which the tensor x should be
-               broadcastable.  if the length of x along these
-               dimensions is not 1, a ValueError will be raised.
-    returns:
    ----------
-        a theano tensor, which is broadcastable along the specified dimensions.
+    x : tensor_like
+        Input theano tensor.
+    axes : an int or an iterable object such as list or tuple of int values
+        The dimension along which the tensor x should be broadcastable.
+        If the length of x along these dimensions is not 1, a ValueError will
+        be raised.
+    Returns
+    -------
+    tensor
+        A theano tensor, which is broadcastable along the specified dimensions.
    """
    rval = Rebroadcast(*[(axis, True) for axis in axes])(x)
@@ -3400,6 +3543,7 @@ def addbroadcast(x, *axes):
 def unbroadcast(x, *axes):
    """
    Make the input impossible to broadcast in the specified axes.
    For example, addbroadcast(x, 0) will make the first dimension
    of x broadcastable. When performing the function, if the length
    of x along that dimension is not 1, a ValueError will be raised.
@@ -3407,20 +3551,19 @@ def unbroadcast(x, *axes):
    We apply the opt here not to pollute the graph especially during
    the gpu optimization
-    Parameters:
+    Parameters
-    ------------
-        x : tensor_like
-            Input theano tensor.
-        axis : an int or an iterable object such as list or tuple
-               of int values
-               The dimension along which the tensor x should be
-               unbroadcastable.  if the length of x along these
-               dimensions is not 1, a ValueError will be raised.
-    returns:
    ----------
-        a theano tensor, which is unbroadcastable along the specified dimensions.
+    x : tensor_like
+        Input theano tensor.
+    axes : an int or an iterable object such as list or tuple of int values
+        The dimension along which the tensor x should be unbroadcastable.
+        If the length of x along these dimensions is not 1, a ValueError will
+        be raised.
+    Returns
+    -------
+    tensor
+        A theano tensor, which is unbroadcastable along the specified dimensions.
    """
    rval = Rebroadcast(*[(axis, False) for axis in axes])(x)
@@ -3430,7 +3573,8 @@ def unbroadcast(x, *axes):
 def patternbroadcast(x, broadcastable):
    """
    Make the input adopt a specific broadcasting pattern.
-    broadcastable must be iterable. For example,
+    `broadcastable` must be iterable. For example,
    patternbroadcast(x, (True, False)) will make the first
    dimension of x broadcastable and the second dimension
    not broadcastable, so x will now be a row.
@@ -3438,21 +3582,20 @@ def patternbroadcast(x, broadcastable):
    We apply the opt here not to pollute the graph especially during the gpu
    optimization.
-    Parameters:
+    Parameters
-    ------------
+    ----------
-        x : tensor_like
+    x : tensor_like
-            Input theano tensor.
+        Input theano tensor.
-        broadcastable : an iterable object such as list or tuple
+    broadcastable : an iterable object such as list or tuple of bool values
-                        of bool values
+        A set of boolean values indicating whether a dimension should be
+        broadcastable or not. If the length of x along these dimensions is
+        not 1, a ValueError will be raised.
-            a set of boolean values indicating whether a dimension
+    Returns
-            should be broadcastable or not.
+    -------
-            if the length of x along these dimensions is not 1,
+    tensor
-            a ValueError will be raised.
+        A theano tensor, which has the specified broadcasting pattern.
-    returns:
-    ----------
-        a theano tensor, which is unbroadcastable along the specified dimensions.
    """
    rval = Rebroadcast(*[(i, broadcastable[i])
                         for i in xrange(len(broadcastable))])(x)
@@ -3468,31 +3611,39 @@ class Join(Op):
    Of course, TensorVariable instances do not have a shape, so this error
    cannot be caught until runtime.  See `perform()`.
-    For joins involving scalar values, see @stack.
+    See Also
+    --------
+    stack : For joins involving scalar values
-    .. python::
+    Examples
+    --------
+    >>> x, y, z = tensor.matrix(), tensor.matrix(), tensor.matrix()
+    >>> u = tensor.vector()
-        x, y, z = tensor.matrix(), tensor.matrix(), tensor.matrix()
+    >>> r = join(0, x, y, z)
-        u = tensor.vector()
+    >>> c = join(1, x, y, z)
+    >>> join(2, x, y, z)    # WRONG: the axis has to be an index into the shape
+    >>> join(0, x, u)       # WRONG: joined tensors must have the same rank
-        r = join(0, x, y, z)
-        c = join(1, x, y, z)
-        join(2, x, y, z)    # WRONG: the axis has to be an index into the shape
-        join(0, x, u)       # WRONG: joined tensors must have the same rank
    """
    check_input = False
    __props__ = ()
    def make_node(self, *axis_and_tensors):
        """
-        :param axis: an Int or integer-valued Variable
+        Parameters
+        ----------
-        :param tensors: a variable number (but not zero) of tensors to
+        axis : an Int or integer-valued Variable
-          concatenate along the specified axis.  These tensors must have
+        tensors
-          the same shape along all dimensions other than this axis.
+            A variable number (but not zero) of tensors to
+            concatenate along the specified axis.  These tensors must have
-        :returns: a symbolic Variable.  It has the same ndim as the
+            the same shape along all dimensions other than this axis.
-            input tensors, and the most inclusive dtype.
+        Returns
+        -------
+        A symbolic Variable
+            It has the same ndim as the input tensors, and the most inclusive
+            dtype.
        """
        axis, tensors = axis_and_tensors[0], axis_and_tensors[1:]
@@ -3709,26 +3860,25 @@ class Join(Op):
 """
    Convenience function to concatenate `TensorType`s along the given axis.
-    :Parameters:
+    Parameters
-     - `tensors` : list of tensors (or list-like)
+    ----------
-       A list of tensors to be concatenated along the given axis.
+    tensors : list of tensors (or list-like)
-     - `axis` : int (symbolic or literal)
+        A list of tensors to be concatenated along the given axis.
+        The shapes of the tensors to be concatenated must be all
-       On which dimension should the tensors be joined?  The `axis`
+        identical, except in the dimension (`axis`) on which they are to
-       must be a valid index into the shape of the tensors to be
+        be joined.
-       concatenated.
+    axis : int (symbolic or literal)
+        On which dimension should the tensors be joined?  The `axis`
-       The `axis` parameter may either be an integer or an object that
+        must be a valid index into the shape of the tensors to be
-       can be converted to a scalar using `as_scalar`(`axis`). In the
+        concatenated.
-       former case, the axis is fixed at construction, while in the
+        The `axis` parameter may either be an integer or an object that
-       latter it may vary over time depending on the value of the
+        can be converted to a scalar using `as_scalar`(`axis`). In the
-       `axis` variable.
+        former case, the axis is fixed at construction, while in the
+        latter it may vary over time depending on the value of the
+        `axis` variable.
-    The shapes of the tensors to be concatenated must be all
+"""
-    identical, except in the dimension (`axis`) on which they are to
-    be joined.
-    """
 join = Join()
 pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Join),
@@ -3738,7 +3888,8 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Join),
 def roll(x, shift, axis=None):
    """
    Convenience function to roll `TensorType`s along the given axis.
-    Syntax copies numpy.roll function
+    Syntax copies numpy.roll function.
    Parameters
    ----------
@@ -3746,15 +3897,16 @@ def roll(x, shift, axis=None):
        Input tensor.
    shift : int (symbolic or literal)
        The number of places by which elements are shifted.
-    axis : int (symbolic or literal) (optional)
+    axis : int (symbolic or literal), optional
        The axis along which elements are shifted. By default, the array
        is flattened before shifting, after which the original
        shape is restored.
    Returns
    -------
-    res : tensor
+    tensor
        Output tensor, with the same shape as `x`.
    """
    if axis is None:
        if x.ndim > 1:
@@ -3780,9 +3932,13 @@ def roll(x, shift, axis=None):
 @constructor
 def shape_padleft(t, n_ones=1):
-    """Reshape `t` by left-padding the shape with `n_ones` 1s
+    """Reshape `t` by left-padding the shape with `n_ones` 1s.
+    See Also
+    --------
+    shape_padright
+    Dimshuffle
-    See also: `shape_padright` and `Dimshuffle`
    """
    _t = as_tensor_variable(t)
@@ -3792,9 +3948,13 @@ def shape_padleft(t, n_ones=1):
 @constructor
 def shape_padright(t, n_ones=1):
-    """Reshape `t` by right-padding the shape with `n_ones` 1s
+    """Reshape `t` by right-padding the shape with `n_ones` 1s.
+    See Also
+    --------
+    shape_padleft
+    Dimshuffle
-    See also: `shape_padleft` and `Dimshuffle`
    """
    _t = as_tensor_variable(t)
@@ -3808,6 +3968,7 @@ def stack(*tensors):
    The size in dimension 0 of the result will be equal to the number
    of tensors passed.
    """
    if len(tensors) == 0:
        raise Exception('theano.tensor.stack(*tensors) must have at least'
@@ -3843,9 +4004,10 @@ def concatenate(tensor_list, axis=0):
    This function is similar to `join`, but uses the signature of
    numpy's concatenate function.
-    This function
+    Raises
-    :Exceptions:
+    ------
-     - `TypeError` : the tensor_list must be a tuple or list
+    TypeError
+        The tensor_list must be a tuple or list.
    """
    # Check someone did not make the common mistake to do something like:
@@ -3863,16 +4025,20 @@ def concatenate(tensor_list, axis=0):
 def get_vector_length(v):
    """Return the run-time length of a symbolic vector.
-    :Parameters:
+    Parameters
-     - `v` : A rank-1 TensorType variable.
+    ----------
+    v
-    :Exceptions:
+        A rank-1 TensorType variable.
-     - `TypeError` : `v` hasn't the proper type.
-     - `ValueError` : No special case applies, the length is not known.
-    In general this is not possible, but for a number of special cases
+    Raises
-    the length can be determined at compile / graph-construction time.
+    ------
-    This function implements these special cases.
+    TypeError
+        `v` does not have the proper type.
+    ValueError
+        No special case applies, the length is not known.
+        In general this is not possible, but for a number of special cases
+        the length can be determined at compile / graph-construction time.
+        This function implements these special cases.
    """
    v = as_tensor_variable(v)
@@ -3909,9 +4075,11 @@ def get_vector_length(v):
 def horizontal_stack(*args):
    """
    Horizontally stack two L{TensorType}s.
    Stack two L{TensorType}s along the second axis (column wise). These
    L{TensorType}s must have the same shape along all dimensions but the
    second.
    """
    # Note: 'horizontal_stack' and 'vertical_stack' do not behave exactly like
    # Numpy's hstack and vstack functions. This is intended, because Numpy's
@@ -3937,7 +4105,9 @@ class Reshape(Op):
    """Perform a reshape operation of the input x to the new shape shp.
    The number of dimensions to which to reshape to (ndim) must be
-    known at graph build time."""
+    known at graph build time.
+    """
    view_map = {0: [0]}  # output 0 is potentially aliased to inputs [0]
    _f16_ok = True
@@ -4131,8 +4301,11 @@ def reshape(x, newshape, ndim=None, name=None):
 class Flatten(Op):
    """
+    Flatten a tensor.
    Flattens a tensor to `outdim` dimensions by preserving the leading
    outdim - 1 shape components.
    """
    view_map = {0: [0]}
@@ -4305,16 +4478,19 @@ def flatten(x, outdim=1):
 class Tile(Op):
    """
-    DEPRECATED: use tile() instead.
    Construct an array by repeating the input x according to reps pattern.
+    .. note:: Deprecated
+              Use tile() instead.
    Tiles its input according to reps. The length of reps is the number of
    dimension of x and contains the number of times to tile x in each
    dimension.
-    :see: `numpy.tile
+    See Also
-    <http://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html>`_
+    --------
+    numpy.tile : http://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html
    """
    __props__ = ("ndim",)
@@ -4377,13 +4553,15 @@ class Tile(Op):
 def tile(x, reps, ndim=None):
    """
-    Tile input array `x` according to `reps`. See the docstring of `numpy.tile`
+    Tile input array `x` according to `reps`.
-    for details.
+    See the docstring of `numpy.tile` for details.
    Currently, x.ndim and len(reps) must be equal, and, if specified, 'ndim'
    must be equal to both.
    TODO: expand this.
    """
    try:
@@ -4420,6 +4598,7 @@ class ARange(Op):
    """Create an array containing evenly spaced values within a given interval.
    Parameters and behaviour are the same as numpy.arange().
    """
    __props__ = ("dtype",)
@@ -4550,13 +4729,13 @@ class _nd_grid(object):
    to their numpy equivalents.
    Parameters
-    ==========
+    ----------
-        sparse : boolean, optional, default=True
+    sparse : boolean, optional, default=True
-            Specifying False leads to the equivalent of numpy's mgrid
+        Specifying False leads to the equivalent of numpy's mgrid functionality.
-            functionality. Specifying True leads to the equivalent of ogrid.
+        Specifying True leads to the equivalent of ogrid.
    Examples
-    ========
+    --------
    >>> a = T.mgrid[0:5, 0:3]
    >>> a[0].eval()
    array([[0, 0, 0],
@@ -4570,7 +4749,6 @@ class _nd_grid(object):
           [0, 1, 2],
           [0, 1, 2],
           [0, 1, 2]], dtype=int8)
    >>> b = T.ogrid[0:5, 0:3]
    >>> b[0].eval()
    array([[0],
@@ -4580,6 +4758,7 @@ class _nd_grid(object):
           [4]], dtype=int8)
    >>> b[1].eval()
    array([[0, 1, 2, 3]], dtype=int8)
    """
    def __init__(self, sparse=False):
@@ -4689,11 +4868,19 @@ class PermuteRowElements(Op):
        The terminal case is reached when the current tensors are vector,
        then the permutation contained in y is applied to x.
-        :param x: The input tensor, on which the permutation is applied
+        Parameters
-        :param y: Tensor containing the permutations to apply
+        ----------
-        :param out: Tensor storing the output result
+        x : tensor
-        :param curdim: Counter of the current depth of recursion
+            The input tensor, on which the permutation is applied.
-        :param inverse: Wether to apply permutations or their inverse
+        y : tensor
+            Tensor containing the permutations to apply.
+        out : tensor
+            Tensor storing the output result.
+        curdim : int
+            Counter of the current depth of recursion.
+        inverse
+            Whether to apply permutations or their inverse.
        """
        if len(x.shape) == 1:
            # Numpy advanced indexing works in this case
@@ -4817,7 +5004,9 @@ def permute_row_elements(x, y, inverse=0):
 def inverse_permutation(perm):
    """Computes the inverse of permutations.
    Each row of input should contain a permutation of the first integers.
    """
    return permute_row_elements(
        arange(perm.shape[-1], dtype=perm.dtype),
@@ -4840,14 +5029,14 @@ class Dot(Op):
    equivalent to matrix multiplication. For two vectors, this is the inner
    product.
-    :note: matrix-matrix products are sometimes optimized to Dot22 or Gemm ops.
+    Notes
-    (see tensor.blas)
+    -----
+    Matrix-matrix products are sometimes optimized to Dot22 or Gemm ops
-    :note: vector-vector products are sometimes optimized to Ger or CGer.  (see
+    (see tensor.blas).
-    tensor.blas)
+    Vector-vector products are sometimes optimized to Ger or CGer (see
+    tensor.blas).
-    :note: matrix-vector products are sometimes optimized to Gemv, CGemv (see
+    Matrix-vector products are sometimes optimized to Gemv, CGemv (see
-    tensor.blas)
+    tensor.blas).
    """
    __props__ = ()
@@ -5031,10 +5220,12 @@ pprint.assign(_dot, printing.OperatorPrinter(printing.special['middle_dot'],
 def dot(a, b):
    """
-    Computes the dot product of two variables. For two matrices, this is
+    Computes the dot product of two variables.
-    equivalent to matrix multiplication. For two vectors, this is the inner
-    product. When one variable is a scalar, this is like elementwise
+    For two matrices, this is equivalent to matrix multiplication.
-    multiplication.  For N dimensions, this is a sum product over the last axis
+    For two vectors, this is the inner product.
+    When one variable is a scalar, this is like elementwise multiplication.
+    For N dimensions, this is a sum product over the last axis
    of the first array and the second-to-last axis of the second array:
        dot(a, b)[i,j,k,m] = sum(a[i,j,:] * b[k,:,m])
@@ -5054,14 +5245,14 @@ def dot(a, b):
        3.  If both a and b have either 1 or 2 dimensions, it calls Theano's
            Dot op on a and b.
-    :note: matrix-matrix products are sometimes optimized to Dot22 or Gemm ops.
+    Notes
-    (see tensor.blas)
+    -----
+    Matrix-matrix products are sometimes optimized to Dot22 or Gemm ops
-    :note: vector-vector products are sometimes optimized to Ger or CGer.  (see
+    (see tensor.blas).
-    tensor.blas)
+    Vector-vector products are sometimes optimized to Ger or CGer (see
+    tensor.blas).
-    :note: matrix-vector products are sometimes optimized to Gemv, CGemv (see
+    Matrix-vector products are sometimes optimized to Gemv, CGemv (see
-    tensor.blas)
+    tensor.blas).
    """
    a, b = as_tensor_variable(a), as_tensor_variable(b)
@@ -5080,46 +5271,52 @@ def dot(a, b):
 def tensordot(a, b, axes=2):
    """
-    Given two tensors a and b,tensordot computes a generalized dot product over
+    Compute a generalized dot product over provided axes.
+    Given two tensors a and b, tensordot computes a generalized dot product over
    the provided axes. Theano's implementation reduces all expressions to
    matrix or vector dot products and is based on code from Tijmen Tieleman's
    gnumpy (http://www.cs.toronto.edu/~tijmen/gnumpy.html).
-    :param a: the first tensor variable
+    Parameters
-    :type a: symbolic tensor
+    ----------
+    a : symbolic tensor
-    :param b: the second tensor variable
+        The first tensor variable.
-    :type b: symbolic tensor
+    b : symbolic tensor
+        The second tensor variable.
-    :param axes: an integer or array. If an integer, the number of axes
+    axes : int or array-like of length 2
-                 to sum over. If an array, it must have two array
+        If an integer, the number of axes to sum over.
-                 elements containing the axes to sum over in each tensor.
+        If an array, it must have two array elements containing the axes
+        to sum over in each tensor.
-                 Note that the default value of 2 is not guaranteed to work
-                 for all values of a and b, and an error will be raised if
+        Note that the default value of 2 is not guaranteed to work
-                 that is the case. The reason for keeping the default is to
+        for all values of a and b, and an error will be raised if
-                 maintain the same signature as numpy's tensordot function
+        that is the case. The reason for keeping the default is to
-                 (and np.tensordot raises analogous errors for non-compatible
+        maintain the same signature as numpy's tensordot function
-                 inputs).
+        (and np.tensordot raises analogous errors for non-compatible
+        inputs).
-                 If an integer i, it is converted to an array containing
-                 the last i dimensions of the first tensor and the first
-                 i dimensions of the second tensor:
-                     axes = [list(range(a.ndim - i, b.ndim)), list(range(i))]
-                 If an array, its two elements must contain compatible axes
-                 of the two tensors. For example, [[1, 2], [2, 0]] means sum
-                 over the 2nd and 3rd axes of a and the 3rd and 1st axes of b.
-                 (Remember axes are zero-indexed!) The 2nd axis of a and the
-                 3rd axis of b must have the same shape; the same is true for
-                 the 3rd axis of a and the 1st axis of b.
-    :type axes: int or array-like of length 2
-    :returns: a tensor with shape equal to the concatenation of a's shape
-              (less any dimensions that were summed over) and b's shape
-              (less any dimensions that were summed over).
-    :rtype: symbolic tensor
+        If an integer i, it is converted to an array containing
+        the last i dimensions of the first tensor and the first
+        i dimensions of the second tensor:
+            axes = [list(range(a.ndim - i, a.ndim)), list(range(i))]
+        If an array, its two elements must contain compatible axes
+        of the two tensors. For example, [[1, 2], [2, 0]] means sum
+        over the 2nd and 3rd axes of a and the 3rd and 1st axes of b.
+        (Remember axes are zero-indexed!) The 2nd axis of a and the
+        3rd axis of b must have the same shape; the same is true for
+        the 3rd axis of a and the 1st axis of b.
+    Returns
+    -------
+    symbolic tensor
+        A tensor with shape equal to the concatenation of a's shape
+        (less any dimensions that were summed over) and b's shape
+        (less any dimensions that were summed over).
+    Examples
+    --------
    It may be helpful to consider an example to see what tensordot does.
    Theano's implementation is identical to NumPy's. Here a has shape (2, 3, 4)
    and b has shape (5, 6, 4, 3). The axes to sum over are [[1, 2], [3, 2]] --
@@ -5127,29 +5324,30 @@ def tensordot(a, b, axes=2):
    are compatible. The resulting tensor will have shape (2, 5, 6) -- the
    dimensions that are not being summed:
-        a = np.random.random((2,3,4))
+    >>> a = np.random.random((2,3,4))
-        b = np.random.random((5,6,4,3))
+    >>> b = np.random.random((5,6,4,3))
-        #tensordot
+    >>> # tensordot
-        c = np.tensordot(a, b, [[1,2],[3,2]])
+    >>> c = np.tensordot(a, b, [[1,2],[3,2]])
-        #loop replicating tensordot
+    >>> # loop replicating tensordot
-        a0, a1, a2 = a.shape
+    >>> a0, a1, a2 = a.shape
-        b0, b1, _, _ = b.shape
+    >>> b0, b1, _, _ = b.shape
-        cloop = np.zeros((a0,b0,b1))
+    >>> cloop = np.zeros((a0,b0,b1))
-        #loop over non-summed indices -- these exist
+    >>> # loop over non-summed indices -- these exist
-        #in the tensor product.
+    >>> # in the tensor product.
-        for i in range(a0):
+    >>> for i in range(a0):
-            for j in range(b0):
+    ...     for j in range(b0):
-                for k in range(b1):
+    ...         for k in range(b1):
-                    #loop over summed indices -- these don't exist
+    ...             #loop over summed indices -- these don't exist
-                    #in the tensor product.
+    ...             #in the tensor product.
-                    for l in range(a1):
+    ...             for l in range(a1):
-                        for m in range(a2):
+    ...                 for m in range(a2):
-                            cloop[i,j,k] += a[i,l,m] * b[j,k,m,l]
+    ...                     cloop[i,j,k] += a[i,l,m] * b[j,k,m,l]
-        np.allclose(c, cloop) #true
+    >>> np.allclose(c, cloop)
+    True
    This specific implementation avoids a loop by transposing a and b such that
    the summed axes of a are last and the summed axes of b are first. The
@@ -5160,12 +5358,16 @@ def tensordot(a, b, axes=2):
    In an extreme case, no axes may be specified. The resulting tensor
    will have shape equal to the concatenation of the shapes of a and b:
-        c = np.tensordot(a, b, 0)
+    >>> c = np.tensordot(a, b, 0)
-        print(a.shape) #(2,3,4)
+    >>> print(a.shape)
-        print(b.shape) #(5,6,4,3)
+    (2, 3, 4)
-        print(c.shape) #(2,3,4,5,6,4,3)
+    >>> print(b.shape)
+    (5, 6, 4, 3)
+    >>> print(c.shape)
+    (2, 3, 4, 5, 6, 4, 3)
    See the documentation of numpy.tensordot for more examples.
    """
    a, b = as_tensor_variable(a), as_tensor_variable(b)
@@ -5275,6 +5477,7 @@ def outer(x, y):
    """Return vector-vector outer product.
    If an input isn't a vector, we flatten it first.
    """
    if x.ndim != 1:
        x = x.flatten()
@@ -5310,9 +5513,16 @@ del x
 class Diagonal(Op):
    """Return specified diagonals.
-    :param x: A tensor variable with x.ndim >= 2.
+    Parameters
+    ----------
+    x
+        A tensor variable with x.ndim >= 2.
+    Returns
+    -------
+    vector
+        A vector representing the diagonal elements.
-    :return: A vector representing the diagonal elements.
    """
    __props__ = ("offset", "axis1", "axis2")
@@ -5402,6 +5612,8 @@ def stacklists(arg):
    This function can create a tensor from a shaped list of scalars:
+    Examples
+    --------
    >>> from theano.tensor import stacklists, scalars, matrices
    >>> from theano import function
    >>> a, b, c, d = scalars('abcd')
@@ -5421,6 +5633,7 @@ def stacklists(arg):
    >>> x = ones((4, 4), 'float32')
    >>> f(x, x, x, x).shape
    (2, 2, 4, 4)
    """
    if isinstance(arg, (tuple, list)):
        return stack(*list(map(stacklists, arg)))
@@ -5434,12 +5647,18 @@ def ptp(a, axis=None):
    The name of the function comes from the acronym for peak to peak.
-    :param a : Input tensor.
+    Parameters
+    ----------
+    a
+        Input tensor.
+    axis
+        Axis along which to find the peaks. By default, flatten the array.
-    :param axis : Axis along which to find the peaks. By default,
+    Returns
-                flatten the array.
+    -------
+    array
+        A new array holding the result.
-    :return : A new array holding the result.
    """
    a = as_tensor_variable(a)
@@ -5495,28 +5714,36 @@ def choose(a, choices, out=None, mode='raise'):
      negative integers are mapped to 0; values greater than n-1 are mapped
      to n-1; and then the new array is constructed as above.
-    :Parameter: *a* - int array
+    Parameters
+    ----------
+    a : int array
        This array must contain integers in [0, n-1], where n is the number of
        choices, unless mode=wrap or mode=clip, in which cases any integers
        are permissible.
-    :Parameter: *choices* - sequence of arrays
+    choices : sequence of arrays
        Choice arrays. a and all of the choices must be broadcastable to
        the same shape. If choices is itself an array (not recommended),
        then its outermost dimension (i.e., the one corresponding to
        choices.shape[0]) is taken as defining the ``sequence``.
-    :Parameter: *out* - array, optional
+    out : array, optional
        If provided, the result will be inserted into this array.
        It should be of the appropriate shape and dtype.
-    :Parameter: *mode* - {``raise`` (default), ``wrap``, ``clip``}, optional
+    mode : {``raise`` (default), ``wrap``, ``clip``}, optional
        Specifies how indices outside [0, n-1] will be treated:
        ``raise`` : an exception is raised
        ``wrap`` : value becomes value mod n
        ``clip`` : values < 0 are mapped to 0, values > n-1 are mapped to n-1
-    :Returns: merged_array - array
+    Returns
+    -------
+    merged_array : array
        The merged result.
-    :Raises:
-        ValueError - shape mismatch
+    Raises
+    ------
+    ValueError - shape mismatch
        If a and each choice array are not all broadcastable to the same shape.
    """
    # This is done to keep the same function signature then NumPy.
    assert out is None
@@ -5609,6 +5836,7 @@ class Choose(Op):
 class AllocEmpty(gof.Op):
    """Implement Alloc on the cpu, but without initializing memory."""
    __props__ = ("dtype",)
    # specify the type of the data

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -17,10 +17,12 @@ There are four kinds of BLAS Ops in Theano:
    - C-based (blas_c)
    - CUDA-based (theano.sandbox.cuda.blas)
-:note: Unfortunately (because it's confusing) this file currently contains Ops
+Notes
-    that contain both Python and C versions.  I think it would be better to
+-----
-    move the C implementations to blas_c so that this file is pure Python.
+Unfortunately (because it's confusing) this file currently contains Ops
-    -JB
+that contain both Python and C versions.  I think it would be better to
+move the C implementations to blas_c so that this file is pure Python.
+-JB
 Ops
@@ -121,7 +123,6 @@ Specialize Gemm to Gemv
 If arguments to GEMM are dimshuffled vectors, then we can use GEMV
 instead. This optimization is `local_gemm_to_gemv`.
 """
 from __future__ import print_function
 import copy
@@ -359,7 +360,9 @@ class Gemv(Op):
    x, y are vectors
    alpha, beta are scalars
    output is a vector that can be inplace on y
    """
    __props__ = ("inplace",)
    def __init__(self, inplace):
@@ -443,12 +446,13 @@ class Ger(Op):
    for matrix A, scalar alpha, vectors x and y.
    This interface to GER allows non-destructive operation on A via the
-    `destructive`
+    `destructive` argument to the constructor.
-    argument to the constructor.
    :TODO: Create better classes ScipyGer and CGer that inherit from this class
    and override the make_thunk() method to use Scipy and C respectively.
    """
    __props__ = ("destructive",)
    def __init__(self, destructive):
@@ -508,16 +512,22 @@ def ldflags(libs=True, flags=False, libs_dir=False, include_dir=False):
    It returns a list of libraries against which an Op's object file
    should be linked to benefit from a BLAS implementation.
-    :type libs: bool, defaults to True
+    Parameters
-    :param libs: extract flags starting with "-l"
+    ----------
-    :type libs_dir: bool, defaults to False
+    libs : bool, optional
-    :param libs_dir: extract flags starting with "-L"
+        Extract flags starting with "-l" (the default is True).
-    :type include_dir: bool, defaults to False
+    libs_dir : bool, optional
-    :param include_dir: extract flags starting with "-I"
+        Extract flags starting with "-L" (the default is False).
-    :type flags: bool, defaults to False
+    include_dir : bool, optional
-    :param flags: extract all the other flags
+        Extract flags starting with "-I" (the default is False).
-    :rtype: list of strings
+    flags : bool, optional
-    :returns: extracted flags
+        Extract all the other flags (the default is False).
+    Returns
+    -------
+    list of strings
+        Extracted flags.
    """
    ldflags_str = theano.config.blas.ldflags
    return _ldflags(ldflags_str=ldflags_str,
@@ -533,19 +543,25 @@ def _ldflags(ldflags_str, libs, flags, libs_dir, include_dir):
    Depending on the options, different type of flags will be kept.
-    :type ldflags_str: string
+    Parameters
-    :param ldflags_str: the string to process. Typically, this will
+    ----------
-        be the content of `theano.config.blas.ldflags`
+    ldflags_str : string
-    :type libs: bool
+        The string to process. Typically, this will be the content of
-    :param libs: extract flags starting with "-l"
+        `theano.config.blas.ldflags`.
-    :type libs_dir: bool
+    libs : bool
-    :param libs_dir: extract flags starting with "-L"
+        Extract flags starting with "-l".
-    :type include_dir: bool
+    flags : bool
-    :param include_dir: extract flags starting with "-I"
+        Extract all the other flags.
-    :type flags: bool
+    libs_dir : bool
-    :param flags: extract all the other flags
+        Extract flags starting with "-L".
-    :rtype: list of strings
+    include_dir : bool
-    :returns: extracted flags
+        Extract flags starting with "-I".
+    Returns
+    -------
+    list of strings
+        Extracted flags.
    """
    rval = []
    if libs_dir:
@@ -598,10 +614,12 @@ def _ldflags(ldflags_str, libs, flags, libs_dir, include_dir):
 class GemmRelated(Op):
-    """Base class for Gemm and Dot22
+    """Base class for Gemm and Dot22.
    This class provides a kind of templated gemm Op.
    """
    __props__ = ()
    def c_support_code(self):
@@ -915,7 +933,7 @@ class GemmRelated(Op):
 class Gemm(GemmRelated):
-    """In-place version of matrix-matrix multiplication (with accumulation):
+    """In-place version of matrix-matrix multiplication (with accumulation).
    When a and b are scalars and x, y, and z are matrices, then
@@ -936,6 +954,7 @@ class Gemm(GemmRelated):
    optimized linear algebra operations.)
    """
    E_rank = 'gemm only works for rank 2'
    E_scalar = 'gemm requires scalar argument'
    E_z_uniq = 'argument z aliased to x or y'  # TODO: justify / delete this
@@ -1430,9 +1449,10 @@ def _factor_canonicalized(lst):
 def _gemm_from_factored_list(lst):
-    """Returns None, or a list to replace node.outputs
    """
+    Returns None, or a list to replace node.outputs.
+    """
    lst2 = []
    # Remove the tuple that can't be cast correctly.
    # This can happen when we try to cast a complex to a real
@@ -1524,7 +1544,7 @@ def _gemm_from_node2(node):
 class GemmOptimizer(Optimizer):
-    """Graph optimizer for inserting Gemm operations"""
+    """Graph optimizer for inserting Gemm operations."""
    def __init__(self):
        Optimizer.__init__(self)
        self.warned = False
@@ -1645,8 +1665,11 @@ class GemmOptimizer(Optimizer):
 class Dot22(GemmRelated):
    """Compute a matrix-matrix product.
-    This is a specialization of the more general Dot()
+    This is a specialization of the more general Dot().
    """
    def make_node(self, x, y):
        dtypes = ('float32', 'float64', 'complex64', 'complex128')
        if x.type.ndim != 2 or x.type.dtype not in dtypes:
@@ -1780,8 +1803,7 @@ def local_inplace_ger(node):
 @local_optimizer([gemm_no_inplace])
 def local_gemm_to_gemv(node):
-    """GEMM acting on row or column matrices -> GEMV
+    """GEMM acting on row or column matrices -> GEMV."""
-    """
    if node.op == gemm_no_inplace:
        z, a, x, y, b = node.inputs
        if z.broadcastable == x.broadcastable == (True, False):
@@ -1794,8 +1816,7 @@ def local_gemm_to_gemv(node):
 @local_optimizer([gemm_no_inplace])
 def local_gemm_to_ger(node):
-    """GEMM computing an outer-product -> GER
+    """GEMM computing an outer-product -> GER."""
-    """
    if node.op == gemm_no_inplace:
        z, a, x, y, b = node.inputs
        if x.broadcastable[1] and y.broadcastable[0]:
@@ -1825,8 +1846,7 @@ def local_gemm_to_ger(node):
 #      working
 @local_optimizer([_dot22])
 def local_dot22_to_ger_or_gemv(node):
-    """dot22 computing an outer-product -> GER
+    """dot22 computing an outer-product -> GER."""
-    """
    if node.op == _dot22:
        x, y = node.inputs
        xb = x.broadcastable
@@ -1904,11 +1924,14 @@ optdb.register('InplaceBlasOpt',
 class Dot22Scalar(GemmRelated):
    """Compute a matrix-matrix product.
    This is a specialization of the more general Dot()
    Used to call optimized gemm implementation.
    Also used to generate a gemm later.
-    compute scalar*dot(x,y)
+    Compute scalar*dot(x,y).
    """
    def make_node(self, x, y, a):
        if a.ndim != 0:
            raise TypeError(Gemm.E_scalar, a)
@@ -1996,25 +2019,27 @@ _dot22scalar = Dot22Scalar()
 @local_optimizer([T.mul])
 def local_dot22_to_dot22scalar(node):
    """
-    :note: Previous attempts to alter this optimization to replace dot22 with
+    Notes
-        gemm instead of dot22scalar resulted in some Scan nodes being
+    -----
-        duplicated and the ScanSaveMem optimization never running on them,
+    Previous attempts to alter this optimization to replace dot22 with
-        resulting in highly increased memory usage. Until this issue is
+    gemm instead of dot22scalar resulted in some Scan nodes being
-        resolved, this optimization should keep using dot22scalar instead of
+    duplicated and the ScanSaveMem optimization never running on them,
-        gemm.
+    resulting in highly increased memory usage. Until this issue is
+    resolved, this optimization should keep using dot22scalar instead of
-    :note: we upcast the scalar if after the multiplication with the
+    gemm.
-        dot this give the same type.
+    We upcast the scalar if after the multiplication with the dot this gives
-    .. note: We execute this optimizer after the gemm optimizer. This
+    the same type.
-        allow to give more priority to gemm that give more speed up
-        then this optimizer, but allow the gemm optimizer to ignore
+    We execute this optimizer after the gemm optimizer. This
-        this op.
+    allows giving more priority to gemm, which gives more speed up
+    than this optimizer, but allows the gemm optimizer to ignore
+    this op.
    TODO: support when we can reorder the mul to generate a
    dot22scalar or fix the canonizer to merge them(1 mul with multiple
    inputs)
    """
    if node.op != T.mul:
        return False
@@ -2102,7 +2127,6 @@ def local_dot22_to_dot22scalar(node):
        return [T.mul(_dot22scalar(d.owner.inputs[0],
                                   d.owner.inputs[1], a), *o)]
 # must happen after gemm as the gemm optimizer don't understant
 # dot22scalar and gemm give more speed up then dot22scalar
 blas_optdb.register('local_dot22_to_dot22scalar',

--- a/theano/tensor/blas_headers.py
+++ b/theano/tensor/blas_headers.py
 """ Header text for the C and Fortran BLAS interfaces.
 There is no standard name or location for this header, so we just insert it
-ourselves into the C code
+ourselves into the C code.
 """
 import logging
 import textwrap
@@ -32,6 +33,7 @@ def detect_macos_sdot_bug():
          detected. Its value is returned by the function
        - detect_macos_sdot_bug.fix_works will be set to True if the fix was
          attempted, and succeeded.
    """
    _logger.debug('Starting detection of bug in Mac OS BLAS sdot_ routine')
    if detect_macos_sdot_bug.tested:

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -62,67 +62,70 @@ class DimShuffle(Op):
    dimension and a numerical index represents the dimension of the same
    rank in the tensor passed to perform.
-    Examples:
+    Parameters
-      DimShuffle((False, False, False), ['x', 2, 'x', 0, 1])
+    ----------
+    input_broadcastable
-       This op will only work on 3d tensors with no broadcastable
+        The expected broadcastable pattern of the input.
-       dimensions.  The first dimension will be broadcastable,
+    new_order
-       then we will have the third dimension of the input tensor as
+        A list representing the relationship between the input's
-       the second of the resulting tensor, etc. If the tensor has
+        dimensions and the output's dimensions. Each element of the
-       shape (20, 30, 40), the resulting tensor will have dimensions
+        list can either be an index or 'x'. Indices must be encoded
-       (1, 40, 1, 20, 30). (AxBxC tensor is mapped to 1xCx1xAxB tensor)
+        as python integers, not theano symbolic integers.
+    inplace : bool, optional
-      DimShuffle((True, False), [1])
+        If True, the output will be a view of the input.
+        If False (default), the output will be a copy of the input.
-       This op will only work on 2d tensors with the first dimension
-       broadcastable.
+    If j = new_order[i] is an index, the output's ith dimension
-       The second dimension of the input tensor will be the first dimension of
+    will be the input's jth dimension.
-       the resulting tensor.
+    If new_order[i] is 'x', the output's ith dimension will
-       If the tensor has shape (1, 20), the resulting tensor will have shape
+    be 1 and Broadcast operations will be allowed to do broadcasting
-       (20, ).
+    over that dimension.
-    More examples:
+    If input.broadcastable[i] == False then i must be found in new_order.
-      DimShuffle((), ['x']) -> make a 0d (scalar) into a 1d vector
+    Broadcastable dimensions, on the other hand, can be discarded.
-      DimShuffle((False, False), [0, 1]) -> identity
-      DimShuffle((False, False), [1, 0]) -> inverts the 1st and 2nd dimensions
+    Examples
-      DimShuffle((False,), ['x', 0]) -> make a row out
+    ----------------
-                                        of a 1d vector (N to 1xN)
+    DimShuffle((False, False, False), ['x', 2, 'x', 0, 1])
-      DimShuffle((False,), [0, 'x']) -> make a column
-                                        out of a 1d vector (N to Nx1)
+    This op will only work on 3d tensors with no broadcastable
-      DimShuffle((False, False, False), [2, 0, 1]) -> AxBxC to CxAxB
+    dimensions.  The first dimension will be broadcastable,
-      DimShuffle((False, False), [0, 'x', 1]) -> AxB to Ax1xB
+    then we will have the third dimension of the input tensor as
-      DimShuffle((False, False), [1, 'x', 0]) -> AxB to Bx1xA
+    the second of the resulting tensor, etc. If the tensor has
+    shape (20, 30, 40), the resulting tensor will have dimensions
+    (1, 40, 1, 20, 30). (AxBxC tensor is mapped to 1xCx1xAxB tensor)
+    DimShuffle((True, False), [1])
+    This op will only work on 2d tensors with the first dimension
+    broadcastable.
+    The second dimension of the input tensor will be the first dimension of
+    the resulting tensor.
+    If the tensor has shape (1, 20), the resulting tensor will have shape
+    (20, ).
+    More examples:
+    DimShuffle((), ['x']) -> make a 0d (scalar) into a 1d vector
+    DimShuffle((False, False), [0, 1]) -> identity
+    DimShuffle((False, False), [1, 0]) -> inverts the 1st and 2nd dimensions
+    DimShuffle((False,), ['x', 0]) -> make a row out
+                                      of a 1d vector (N to 1xN)
+    DimShuffle((False,), [0, 'x']) -> make a column
+                                      out of a 1d vector (N to Nx1)
+    DimShuffle((False, False, False), [2, 0, 1]) -> AxBxC to CxAxB
+    DimShuffle((False, False), [0, 'x', 1]) -> AxB to Ax1xB
+    DimShuffle((False, False), [1, 'x', 0]) -> AxB to Bx1xA
    The reordering of the dimensions can be done in numpy with the
    transpose function.
    Adding, subtracting dimensions can be done with reshape.
    """
    _f16_ok = True
    check_input = False
    def __init__(self, input_broadcastable, new_order, inplace=False):
-        """
-        Usage: DimShuffle(input_broadcastable, new_order, inplace = False)
-        - input_broadcastable: the expected broadcastable pattern of the
-                               input
-        - new_order: a list representing the relationship between the
-                     input's dimensions and the output's dimensions. Each
-                     element of the list can either be an index or 'x'.
-                     Indices must be encoded as python integers, not
-                     theano symbolic integers.
-        - inplace: if True, the output will be a view of the input.
-                   If False, the output will be a copy of the input.
-        If j = new_order[i] is an index, the output's ith dimension
-          will be the input's jth dimension.
-        If new_order[i] is 'x', the output's ith dimension will
-          be 1 and Broadcast operations will be allowed to do broadcasting
-          over that dimension.
-        If input.broadcastable[i] == False then i must be found in new_order.
-        Broadcastable dimensions, on the other hand, can be discarded.
-        """
        input_broadcastable = tuple(input_broadcastable)
        self.input_broadcastable = input_broadcastable
        new_order = tuple(new_order)
@@ -456,36 +459,40 @@ class Elemwise(OpenMPOp):
    be the same as the corresponding input type (see the doc of
    scalar.ScalarOp to get help about controlling the output type)
-    Examples:
+    Parameters
-      Elemwise(add) # represents + on tensors (x + y)
+    ----------
-      Elemwise(add, {0 : 0}) # represents the += operation (x += y)
+    scalar_op
-      Elemwise(add, {0 : 1}) # represents += on the second argument (y += x)
+        An instance of a subclass of scalar.ScalarOp which works uniquely
-      Elemwise(mul)(rand(10, 5), rand(1, 5)) # the second input is completed
+        on scalars.
-        # along the first dimension to match the first input
+    inplace_pattern
-      Elemwise(true_div)(rand(10, 5), rand(10, 1)) # same but along the
+        A dictionary that maps the index of an output to the
-        # second dimension
+        index of an input so the output is calculated inplace using
-      Elemwise(int_div)(rand(1, 5), rand(10, 1)) # the output has size (10, 5)
+        the input's storage. (Just like destroymap, but without the lists.)
-      Elemwise(log)(rand(3, 4, 5))
+    nfunc_spec
+        Either None or a tuple of three elements,
+        (nfunc_name, nin, nout) such that getattr(numpy, nfunc_name)
+        implements this operation, takes nin inputs and nout outputs.
+        Note that nin cannot always be inferred from the scalar op's
+        own nin field because that value is sometimes 0 (meaning a
+        variable number of inputs), whereas the numpy function may
+        not have varargs.
+    Examples
+    --------
+    Elemwise(add) # represents + on tensors (x + y)
+    Elemwise(add, {0 : 0}) # represents the += operation (x += y)
+    Elemwise(add, {0 : 1}) # represents += on the second argument (y += x)
+    Elemwise(mul)(rand(10, 5), rand(1, 5)) # the second input is completed
+    # along the first dimension to match the first input
+    Elemwise(true_div)(rand(10, 5), rand(10, 1)) # same but along the
+    # second dimension
+    Elemwise(int_div)(rand(1, 5), rand(10, 1)) # the output has size (10, 5)
+    Elemwise(log)(rand(3, 4, 5))
    """
    def __init__(self, scalar_op, inplace_pattern=None, name=None,
                 nfunc_spec=None, openmp=None):
-        """
-        Usage: Elemwise(scalar_op, inplace_pattern = {})
-        * scalar_op: an instance of a subclass of scalar.ScalarOp which works
-            uniquely on scalars
-        * inplace_pattern: a dictionary that maps the index of an output to the
-            index of an input so the output is calculated inplace using
-            the input's storage. (Just like destroymap, but without the lists.)
-        * nfunc_spec: either None or a tuple of three elements,
-            (nfunc_name, nin, nout) such that getattr(numpy, nfunc_name)
-            implements this operation, takes nin inputs and nout outputs.
-            Note that nin cannot always be inferred from the scalar op's
-            own nin field because that value is sometimes 0 (meaning a
-            variable number of inputs), whereas the numpy function may
-            not have varargs.
-        """
        if inplace_pattern is None:
            inplace_pattern = {}
        self.name = name
@@ -1252,14 +1259,25 @@ class CAReduce(Op):
    dimensions. It will contain the variable of accumulating all values
    over the reduced dimensions using the specified scalar op.
-    Examples:
+    Parameters
-     CAReduce(add) -> sum (ie, acts like the numpy sum operation)
+    ----------
-     CAReduce(mul) -> product
+    scalar_op
-     CAReduce(maximum) -> max
+        A binary scalar op with only one output.
-     CAReduce(minimum) -> min
+        It must be commutative and associative.
-     CAReduce(or_) -> any # not lazy
+    axis
-     CAReduce(and_) -> all # not lazy
+        - The dimension along which we want to reduce
-     CAReduce(xor) -> a bit at 1 tell that there was an odd number of bit at
+        - List of dimensions that we want to reduce
+        - If None, all dimensions are reduced
+    Examples
+    --------
+    CAReduce(add) -> sum (ie, acts like the numpy sum operation)
+    CAReduce(mul) -> product
+    CAReduce(maximum) -> max
+    CAReduce(minimum) -> min
+    CAReduce(or_) -> any # not lazy
+    CAReduce(and_) -> all # not lazy
+    CAReduce(xor) -> a bit at 1 tells that there was an odd number of bits at
                      that position that where 1.
                      0 it was an even number ...
@@ -1270,18 +1288,10 @@ class CAReduce(Op):
    operation represented by the reduction must be both commutative
    and associative (eg add, multiply, maximum, binary or/and/xor - but not
    subtract, divide or power).
    """
    def __init__(self, scalar_op, axis=None):
-        """
-        Usage: CAReduce(scalar_op, axis = None)
-        * scalar_op: a binary scalar op with only one output.
-                     It must be commutative and associative.
-        * axis: - the dimension along which we want to reduce
-                - list of dimensions that we want to reduce
-                - if None, all dimensions are reduced
-        """
        if scalar_op.nin not in [-1, 2] or scalar_op.nout != 1:
            raise NotImplementedError((
                "CAReduce only supports binary functions with a single "
@@ -1656,8 +1666,10 @@ class All(CAReduce):
    """ Applies `bitwise and` to all the values of a tensor along the
    specified axis(es).
-    Equivalent to CAReduce(scalar.and_, axis=axis)
+    Equivalent to CAReduce(scalar.and_, axis=axis).
    """
    def __init__(self, axis=None):
        CAReduce.__init__(self, scalar.and_, axis)
@@ -1686,8 +1698,10 @@ class Any(CAReduce):
    """ Applies `bitwise or` to all the values of a tensor along the
    specified axis(es).
-    Equivalent to CAReduce(scalar.or_, axis=axis)
+    Equivalent to CAReduce(scalar.or_, axis=axis).
    """
    def __init__(self, axis=None):
        CAReduce.__init__(self, scalar.or_, axis)
@@ -1727,40 +1741,42 @@ class CAReduceDtype(CAReduce):
    If no dtype is provided, one will be inferred so as not to lose
    too much precision.
+    Parameters
+    ----------
+    scalar_op
+        A binary scalar op with only one output.
+        It must be commutative and associative.
+    axis
+        - the dimension along which we want to reduce
+        - list of dimensions that we want to reduce
+        - if None, all dimensions are reduced
+    dtype
+        The dtype of the returned tensor. If None, then we use the default
+        dtype which is the same as the input tensor's dtype except when:
+        - the input dtype is a signed integer of precision < 64 bit, in
+        which case we use int64
+        - the input dtype is an unsigned integer of precision < 64 bit, in
+        which case we use uint64
+        This default dtype does _not_ depend on the value of "acc_dtype".
+        This behavior is similar in spirit to that of numpy (except numpy
+        uses the default machine integer while we always use 64 bit
+        integers to avoid platform-dependent behavior).
+    acc_dtype
+        The dtype of the internal accumulator.
+        If None (default), we use the dtype in the list below,
+        or the input dtype if its precision is higher:
+        - for int dtypes, we use at least int64;
+        - for uint dtypes, we use at least uint64;
+        - for float dtypes, we use at least float64;
+        - for complex dtypes, we use at least complex128.
    """
    def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
-        """
-        Usage: CAReduceDtype(scalar_op, axis=None, dtype=None, acc_dtype=None)
-        :param scalar_op: a binary scalar op with only one output.
-                     It must be commutative and associative.
-        :param axis: - the dimension along which we want to reduce
-                     - list of dimensions that we want to reduce
-                     - if None, all dimensions are reduced
-        :param dtype: The dtype of the returned
-            tensor. If None, then we use the default dtype which is the same
-            as the input tensor's dtype except when:
-            - the input dtype is a signed integer of precision < 64 bit, in
-              which case we use int64
-            - the input dtype is an unsigned integer of precision < 64 bit, in
-              which case we use uint64
-            This default dtype does _not_ depend on the value of "acc_dtype".
-            This behavior is similar in spirit to that of numpy (except numpy
-            uses the default machine integer while we always use 64 bit
-            integers to avoid platform-dependent behavior).
-        :param acc_dtype: The dtype of the internal accumulator.
-            If None (default), we use the dtype in the list below,
-            or the input dtype if its precision is higher:
-            - for int dtypes, we use at least int64;
-            - for uint dtypes, we use at least uint64;
-            - for float dtypes, we use at least float64;
-            - for complex dtypes, we use at least complex128.
-        """
        CAReduce.__init__(self, scalar_op, axis=axis)
        self.dtype = dtype
        self.acc_dtype = acc_dtype
@@ -1888,33 +1904,36 @@ class Sum(CAReduceDtype):
    Equivalent to CAReduceDtype(scalar.add, axis=axis, dtype=dtype),
    with the difference that this defines the gradient of sum wrt its
    tensor input.
-    """
-    def __init__(self, axis=None, dtype=None, acc_dtype=None):
+    Parameters
-        """
+    ----------
-        Constructor.
+    axis
+        Axis(es) along which the tensor should be summed
-        :param axis: Axis(es) along which the tensor should be summed
        (use None to sum over all axes, and a list or tuple to sum along more
        than one axis).
-        :param dtype: The dtype of the internal accumulator and returned
+    dtype
+        The dtype of the internal accumulator and returned
        tensor. If None, then we use the default dtype which is the same as the
        input tensor's dtype except when:
-            - the input dtype is a signed integer of precision < 64 bit, in
+        - the input dtype is a signed integer of precision < 64 bit, in
-              which case we use int64
+          which case we use int64
-            - the input dtype is an unsigned integer of precision < 64 bit, in
+        - the input dtype is an unsigned integer of precision < 64 bit, in
-              which case we use uint64
+          which case we use uint64
-            This value does not depend on the value of "acc_dtype".
+        This value does not depend on the value of "acc_dtype".
-        :param acc_dtype: The dtype of the internal accumulator.
+    acc_dtype
-            If None (default), we use the dtype in the list below,
+        The dtype of the internal accumulator.
-            or the input dtype if its precision is higher:
+        If None (default), we use the dtype in the list below,
-            - for int dtypes, we use at least int64;
+        or the input dtype if its precision is higher:
-            - for uint dtypes, we use at least uint64;
+        - for int dtypes, we use at least int64;
-            - for float dtypes, we use at least float64;
+        - for uint dtypes, we use at least uint64;
-            - for complex dtypes, we use at least complex128.
+        - for float dtypes, we use at least float64;
-        """
+        - for complex dtypes, we use at least complex128.
+    """
+    def __init__(self, axis=None, dtype=None, acc_dtype=None):
        CAReduceDtype.__init__(self, scalar.add, axis=axis,
                               dtype=dtype, acc_dtype=acc_dtype)
@@ -1960,7 +1979,9 @@ class Prod(CAReduceDtype):
    Equivalent to CAReduce(scalar.prod, axis = axis), with the
    difference that this defines the gradient of prod wrt its tensor
    input.
    """
    def __init__(self, axis=None, dtype=None, acc_dtype=None,
                 no_zeros_in_input=False):
        CAReduceDtype.__init__(self, scalar.mul, axis=axis,
@@ -1982,7 +2003,7 @@ class Prod(CAReduceDtype):
                hash(self.no_zeros_in_input))
    def grad(self, inp, grads):
-        '''
+        """
        The grad of this Op could be very easy, if it is was not for the case
        where zeros are present in a given "group" (ie. elements reduced
        together to form the product).
@@ -2026,7 +2047,8 @@ class Prod(CAReduceDtype):
        I do this by first counting the number of zeros in each group (see
        the "T.eq()" bits), then taking this or that behavior (see T.switch)
        based on the result of this count.
-        '''
+        """
        prod_in, = inp
        gz, = grads

--- a/theano/tensor/elemwise_cgen.py
+++ b/theano/tensor/elemwise_cgen.py
@@ -5,8 +5,8 @@ import theano
 def make_declare(loop_orders, dtypes, sub):
    """
    Produce code to declare all necessary variables.
-    """
+    """
    decl = ""
    for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
        var = sub['lv%i' % i]  # input name corresponding to ith loop variable
@@ -117,8 +117,11 @@ def make_checks(loop_orders, dtypes, sub):
 def make_alloc(loop_orders, dtype, sub, fortran='0'):
    """Generate C code to allocate outputs.
-    :param fortran: a string included in the generated code. If it
+    Parameters
-        evaludate to non-zero, an ndarray in fortran order will be
+    ----------
+    fortran : str
+        A string included in the generated code. If it
+        evaluates to non-zero, an ndarray in fortran order will be
        created, otherwise it will be c order.
    """
@@ -179,25 +182,24 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub, openmp=None):
    Make a nested loop over several arrays and associate specific code
    to each level of nesting.
-    @type loop_orders: list of N tuples of length M.
+    Parameters
-    @param loop_orders: Each value of each
+    ----------
-      tuple can be either the index of a dimension to loop over or
+    loop_orders : list of N tuples of length M
-      the letter 'x' which means there is no looping to be done
+        Each value of each tuple can be either the index of a dimension to
-      over that variable at that point (in other words we broadcast
+        loop over or the letter 'x' which means there is no looping to be done
-      over that dimension). If an entry is an integer, it will become
+        over that variable at that point (in other words we broadcast
-      an alias of the entry of that rank.
+        over that dimension). If an entry is an integer, it will become
+        an alias of the entry of that rank.
-    @type loop_tasks: list of M+1 pieces of code.
+    loop_tasks : list of M+1 pieces of code
-    @param loop_tasks: The ith loop_task is a pair of strings, the first
+        The ith loop_task is a pair of strings, the first
-      string is code to be executed before the ith loop starts, the second
+        string is code to be executed before the ith loop starts, the second
-      one contains code to be executed just before going to the next element
+        one contains code to be executed just before going to the next element
-      of the ith dimension.
+        of the ith dimension.
-      The last element if loop_tasks is a single string, containing code
+        The last element of loop_tasks is a single string, containing code
-      to be executed at the very end.
+        to be executed at the very end.
+    sub : dictionary
-    @type sub: a dictionary.
+        Maps 'lv#' to a suitable variable name.
-    @param sub: Maps 'lv#' to a suitable variable name.
+        The 'lvi' variable corresponds to the ith element of loop_orders.
-      The 'lvi' variable corresponds to the ith element of loop_orders.
    """
    def loop_over(preloop, code, indices, i):
@@ -244,8 +246,9 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub, openmp=None):
    return "{%s}" % s
-def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub, openmp=None):
+def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub,
-    '''A bit like make_loop, but when only the inner-most loop executes code.
+                        openmp=None):
+    """A bit like make_loop, but when only the inner-most loop executes code.
    All the loops will be reordered so that the loops over the output tensor
    are executed with memory access as contiguous as possible.
@@ -253,7 +256,8 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub, op
    will be on its rows; if it's f_contiguous, it will be on its columns.
    The output tensor's index among the loop variables is indicated by olv_index.
-    '''
+    """
    # Number of variables
    nvars = len(init_loop_orders)
@@ -338,6 +342,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub, op
        Returns a list containing a C expression representing the
        stride for each dimension of the ith variable, in the
        specified loop_order.
        """
        var = sub["lv%i" % i]
        r = []
@@ -463,25 +468,25 @@ def make_loop_careduce(loop_orders, dtypes, loop_tasks, sub):
    Make a nested loop over several arrays and associate specific code
    to each level of nesting.
-    @type loop_orders: list of N tuples of length M.
+    Parameters
-    @param loop_orders: Each value of each
+    ----------
-      tuple can be either the index of a dimension to loop over or
+    loop_orders : list of N tuples of length M
-      the letter 'x' which means there is no looping to be done
+        Each value of each tuple can be either the index of a dimension to
-      over that variable at that point (in other words we broadcast
+        loop over or the letter 'x' which means there is no looping to be done
-      over that dimension). If an entry is an integer, it will become
+        over that variable at that point (in other words we broadcast
-      an alias of the entry of that rank.
+        over that dimension). If an entry is an integer, it will become
+        an alias of the entry of that rank.
-    @type loop_tasks: list of M+1 pieces of code.
+    loop_tasks : list of M+1 pieces of code
-    @param loop_tasks: The ith loop_task is a pair of strings, the first
+        The ith loop_task is a pair of strings, the first
-      string is code to be executed before the ith loop starts, the second
+        string is code to be executed before the ith loop starts, the second
-      one contains code to be executed just before going to the next element
+        one contains code to be executed just before going to the next element
-      of the ith dimension.
+        of the ith dimension.
-      The last element if loop_tasks is a single string, containing code
+        The last element of loop_tasks is a single string, containing code
-      to be executed at the very end.
+        to be executed at the very end.
+    sub : dictionary
-    @type sub: a dictionary.
+        Maps 'lv#' to a suitable variable name.
-    @param sub: Maps 'lv#' to a suitable variable name.
+        The 'lvi' variable corresponds to the ith element of loop_orders.
-      The 'lvi' variable corresponds to the ith element of loop_orders.
    """
    def loop_over(preloop, code, indices, i):

--- a/theano/tensor/extra_ops.py
+++ b/theano/tensor/extra_ops.py
@@ -14,8 +14,9 @@ tensor = basic
 class CpuContiguous(theano.Op):
    """
    Check to see if the input is c-contiguous,
-    if it is, do nothing, else return a contiguous array
+    if it is, do nothing, else return a contiguous array.
    """
    __props__ = ()
    view_map = {0: [0]}
@@ -171,12 +172,16 @@ def cumsum(x, axis=None):
    Wraping of numpy.cumsum.
-    :param x: Input tensor variable.
+    Parameters
+    ----------
-    :param axis: The axis along which the cumulative sum is computed.
+    x
+        Input tensor variable.
+    axis
+        The axis along which the cumulative sum is computed.
        The default (None) is to compute the cumsum over the flattened array.
    .. versionadded:: 0.7
    """
    return CumsumOp(axis=axis)(x)
@@ -291,18 +296,24 @@ def cumprod(x, axis=None):
    Wraping of numpy.cumprod.
-    :param x: Input tensor variable.
+    Parameters
+    ----------
+    x
+        Input tensor variable.
-    :param axis: The axis along which the cumulative product is computed.
+    axis
+        The axis along which the cumulative product is computed.
        The default (None) is to compute the cumprod over the flattened array.
    .. versionadded:: 0.7
    """
    return CumprodOp(axis=axis)(x)
 class DiffOp(theano.Op):
    # See function diff for docstring
    __props__ = ("n", "axis")
    def __init__(self, n=1, axis=-1):
@@ -354,23 +365,29 @@ def diff(x, n=1, axis=-1):
    along the given axis, higher order differences are calculated by
    using diff recursively. Wraping of numpy.diff.
-    :param x: Input tensor variable.
+    Parameters
+    ----------
+    x
+        Input tensor variable.
-    :param n: The number of times values are differenced, default is 1.
+    n
+        The number of times values are differenced, default is 1.
-    :param axis: The axis along which the difference is taken,
+    axis
-        default is the last axis.
+        The axis along which the difference is taken, default is the last axis.
    .. versionadded:: 0.6
    """
    return DiffOp(n=n, axis=axis)(x)
 class BinCountOp(theano.Op):
    """
-    DEPRECATED: use bincount() instead.
+    .. note:: Deprecated
+              Use bincount() instead.
+              See function bincount for docstring.
-    See function bincount for docstring
    """
    compatible_type = ('int8', 'int16', 'int32', 'int64',
                       'uint8', 'uint16', 'uint32', 'uint64')
@@ -473,17 +490,19 @@ def bincount(x, weights=None, minlength=None, assert_nonneg=False):
    specified the input array is weighted by it, i.e. if a value n
    is found at position i, out[n] += weight[i] instead of out[n] += 1.
-    :param x: 1 dimension, nonnegative ints
+    Parameters
+    ----------
-    :param weights: array of the same shape as x with corresponding weights.
+    x : 1 dimension, nonnegative ints
+    weights : array of the same shape as x with corresponding weights.
        Optional.
-    :param minlength: A minimum number of bins for the output array.
+    minlength : A minimum number of bins for the output array.
        Optional.
-    :param assert_nonneg: A flag that inserts an assert_op to check if
+    assert_nonneg : A flag that inserts an assert_op to check if
        every input x is nonnegative.
        Optional.
    .. versionadded:: 0.6
    """
    compatible_type = ('int8', 'int16', 'int32', 'int64',
                       'uint8', 'uint16', 'uint32')
@@ -520,18 +539,25 @@ def bincount(x, weights=None, minlength=None, assert_nonneg=False):
 def squeeze(x):
-    """Remove broadcastable dimensions from
+    """
-    the shape of an array.
+    Remove broadcastable dimensions from the shape of an array.
    It returns the input array, but with the
    broadcastable dimensions removed. This is
    always `x` itself or a view into `x`.
-    :param x: Input data, tensor variable.
+    .. versionadded:: 0.6
+    Parameters
+    ----------
+    x
+        Input data, tensor variable.
-    :return: `x` without its broadcastable dimensions.
+    Returns
+    -------
+    object
+        `x` without its broadcastable dimensions.
-    .. versionadded:: 0.6
    """
    view = x.dimshuffle([i for i in range(x.ndim)
                         if not x.broadcastable[i]])
@@ -539,20 +565,28 @@ def squeeze(x):
 def compress(condition, x, axis=None):
-    """Return selected slices of an array along given axis.
+    """
+    Return selected slices of an array along given axis.
    It returns the input tensor, but with selected slices along a given axis
-    retained. If no axis is provided, the tensor is flattened
+    retained. If no axis is provided, the tensor is flattened.
    Corresponds to numpy.compress
-    :param x: Input data, tensor variable
+    .. versionadded:: 0.7
-    :param condition: 1 dimensional array of non-zero and zero values
+    Parameters
-        corresponding to indices of slices along a selected axis
+    ----------
+    x
+        Input data, tensor variable.
+    condition
+        1 dimensional array of non-zero and zero values
+        corresponding to indices of slices along a selected axis.
-    :return: `x` with selected slices
+    Returns
+    -------
+    object
+        `x` with selected slices.
-    .. versionadded:: 0.7
    """
    indices = theano.tensor.basic.flatnonzero(condition)
    return x.take(indices, axis=axis)
@@ -560,6 +594,7 @@ def compress(condition, x, axis=None):
 class RepeatOp(theano.Op):
    # See the repeat function for docstring
    __props__ = ("axis",)
    def __init__(self, axis=None):
@@ -678,14 +713,19 @@ def repeat(x, repeats, axis=None):
    The number of repetitions for each element is `repeat`.
    `repeats` is broadcasted to fit the length of the given `axis`.
-    :param x: Input data, tensor variable.
+    Parameters
-    :param repeats: int, scalar or tensor variable.
+    ----------
+    x
-    :param axis: int, optional.
+        Input data, tensor variable.
+    repeats : int, scalar or tensor variable
+    axis : int, optional
-    :see: :func:`tensor.tile <tensor.tile>`
+    See Also
+    --------
+    tensor.tile
    .. versionadded:: 0.6
    """
    repeats = tensor.as_tensor_variable(repeats)
@@ -763,21 +803,27 @@ bartlett_ = Bartlett()
 # I create a function only to have the doc show well.
 def bartlett(M):
-    """An instance of this class returns the Bartlett spectral window in the
+    """
+    An instance of this class returns the Bartlett spectral window in the
    time-domain. The Bartlett window is very similar to a triangular window,
    except that the end points are at zero. It is often used in signal
    processing for tapering a signal, without generating too much ripple in
    the frequency domain.
-    :param M: (integer scalar) Number of points in the output
+    .. versionadded:: 0.6
-        window. If zero or less, an empty vector is returned.
-    :return: (vector of doubles) The triangular window, with the
+    Parameters
-        maximum value normalized to one (the value one appears only if
+    ----------
-        the number of samples is odd), with the first and last samples
+    M : integer scalar
-        equal to zero.
+        Number of points in the output window. If zero or less,
+        an empty vector is returned.
-    .. versionadded:: 0.6
+    Returns
+    -------
+    vector of doubles
+        The triangular window, with the maximum value normalized to one
+        (the value one appears only if the number of samples is odd), with
+        the first and last samples equal to zero.
    """
    return bartlett_(M)
@@ -823,8 +869,10 @@ class FillDiagonal(gof.Op):
    def grad(self, inp, cost_grad):
        """
-        Note: The gradient is currently implemented for matrices
+        Notes
-        only.
+        -----
+        The gradient is currently implemented for matrices only.
        """
        a, val = inp
        grad = cost_grad[0]
@@ -843,15 +891,25 @@ fill_diagonal_ = FillDiagonal()
 # I create a function only to have the doc show well.
 def fill_diagonal(a, val):
-    """ Returns a copy of an array with all
+    """
+    Returns a copy of an array with all
    elements of the main diagonal set to a specified scalar value.
-    :param a: Rectangular array of at least two dimensions.
+    .. versionadded:: 0.6
-    :param val: Scalar value to fill the diagonal whose type must be
+    Parameters
+    ----------
+    a
+        Rectangular array of at least two dimensions.
+    val
+        Scalar value to fill the diagonal whose type must be
        compatible with that of array 'a' (i.e. 'val' cannot be viewed
        as an upcast of 'a').
-    :return: An array identical to 'a' except that its main diagonal
+    Returns
+    -------
+    array
+        An array identical to 'a' except that its main diagonal
        is filled with scalar 'val'. (For an array 'a' with a.ndim >=
        2, the main diagonal is the list of locations a[i, i, ..., i]
        (i.e. with indices all identical).)
@@ -859,7 +917,8 @@ def fill_diagonal(a, val):
    Support rectangular matrix and tensor with more than 2 dimensions
    if the later have all dimensions are equals.
-    .. versionadded:: 0.6
    """
    return fill_diagonal_(a, val)
@@ -902,13 +961,16 @@ class FillDiagonalOffset(gof.Op):
        height, width = a.shape
        """
-        Note: The fill_diagonal only support rectangular matrix. The output
+        Notes
+        -----
+        The fill_diagonal only supports rectangular matrices. The output
        of tall matrix is "wrapped", which is an option in numpy 1.9.0
        but was regarded as a bug in numpy 1.6.2. Here I implement the
        fill_diagonal_offset with unwrapped output, so fill_diagonal_offset
        supports tall matrix.(This make a little difference between the output
        of fill_diagonal and fill_diagonal_offset only in the case of tall
        matrix)
        """
        if offset >= 0:
            start = offset
@@ -925,8 +987,9 @@ class FillDiagonalOffset(gof.Op):
    def grad(self, inp, cost_grad):
        """
-        Note: The gradient is currently implemented for matrices
+        Notes
-        only.
+        -----
+        The gradient is currently implemented for matrices only.
        """
        a, val, offset = inp
        grad = cost_grad[0]
@@ -972,31 +1035,49 @@ def fill_diagonal_offset(a, val, offset):
    Returns a copy of an array with all
    elements of the main diagonal set to a specified scalar value.
-      :param a: Rectangular array of two dimensions.
+    Parameters
-      :param val: Scalar value to fill the diagonal whose type must be
+    ----------
-          compatible with that of array 'a' (i.e. 'val' cannot be viewed
+    a
-          as an upcast of 'a').
+        Rectangular array of two dimensions.
-      :param offset: Scalar value Offset of the diagonal from the main
+    val
-          diagonal. Can be positive or negative integer.
+        Scalar value to fill the diagonal whose type must be
-      :return: An array identical to 'a' except that its offset diagonal
+        compatible with that of array 'a' (i.e. 'val' cannot be viewed
-          is filled with scalar 'val'. The output is unwrapped.
+        as an upcast of 'a').
+    offset
+        Scalar value. Offset of the diagonal from the main
+        diagonal. Can be positive or negative integer.
+    Returns
+    -------
+    array
+        An array identical to 'a' except that its offset diagonal
+        is filled with scalar 'val'. The output is unwrapped.
    """
    return fill_diagonal_offset_(a, val, offset)
 def to_one_hot(y, nb_class, dtype=None):
-    """Return a matrix where each row correspond to the one hot
+    """
+    Return a matrix where each row corresponds to the one hot
    encoding of each element in y.
-        :param y: A vector of integer value between 0 and nb_class - 1.
+    Parameters
-        :param nb_class: The number of class in y.
+    ----------
-        :param dtype: The dtype of the returned matrix. Default floatX.
+    y
+        A vector of integer values between 0 and nb_class - 1.
+    nb_class : int
+        The number of classes in y.
+    dtype : data-type
+        The dtype of the returned matrix. Default floatX.
+    Returns
+    -------
+    object
+        A matrix of shape (y.shape[0], nb_class), where each row ``i`` is
+        the one hot encoding of the corresponding ``y[i]`` value.
-        :return: A matrix of shape (y.shape[0], nb_class), where each
+    """
-          row ``i`` is the one hot encoding of the corresponding ``y[i]``
-          value.
-   """
    ret = theano.tensor.zeros((y.shape[0], nb_class),
                              dtype=dtype)
    ret = theano.tensor.set_subtensor(ret[theano.tensor.arange(y.shape[0]), y],
@@ -1006,11 +1087,10 @@ def to_one_hot(y, nb_class, dtype=None):
 class Unique(theano.Op):
    """
-    Wraps numpy.unique.
+    Wraps numpy.unique. This op is not implemented on the GPU.
-    This op is not implemented on the GPU.
    Examples
-    ========
+    --------
    >>> import numpy as np
    >>> x = theano.tensor.vector()
@@ -1022,7 +1102,9 @@ class Unique(theano.Op):
    >>> g = theano.function([y], Unique(True, True, False)(y))
    >>> g([[1, 1, 1.0], (2, 3, 3.0)])
    [array([ 1.,  2.,  3.]), array([0, 3, 4]), array([0, 0, 0, 1, 2, 2])]
    """
    __props__ = ("return_index", "return_inverse", "return_counts")
    def __init__(self, return_index=False, return_inverse=False,

--- a/theano/tensor/io.py
+++ b/theano/tensor/io.py
@@ -11,13 +11,18 @@ import theano
 class LoadFromDisk(Op):
    """
-    An operation to load an array from disk
+    An operation to load an array from disk.
    See Also
-        load
+    --------
+    load
+    Notes
+    -----
+    Non-differentiable.
-    @note: Non-differentiable.
    """
    __props__ = ("dtype", "broadcastable", "mmap_mode")
    def __init__(self, dtype, broadcastable, mmap_mode=None):
@@ -53,18 +58,26 @@ def load(path, dtype, broadcastable, mmap_mode=None):
    """
    Load an array from an .npy file.
-    :param path: A Generic symbolic variable, that will contain a string
+    Parameters
-    :param dtype: The data type of the array to be read.
+    ----------
-    :param broadcastable: The broadcastable pattern of the loaded array,
+    path
-      for instance, (False,) for a vector, (False, True) for a column,
+        A Generic symbolic variable, that will contain a string
-      (False, False) for a matrix.
+    dtype : data-type
-    :param mmap_mode: How the file will be loaded. None means that the
+        The data type of the array to be read.
-      data will be copied into an array in memory, 'c' means that the file
+    broadcastable
-      will be mapped into virtual memory, so only the parts that are
+        The broadcastable pattern of the loaded array, for instance,
-      needed will be actually read from disk and put into memory.
+        (False,) for a vector, (False, True) for a column,
-      Other modes supported by numpy.load ('r', 'r+', 'w+') cannot
+        (False, False) for a matrix.
-      be supported by Theano.
+    mmap_mode
+        How the file will be loaded. None means that the
+        data will be copied into an array in memory, 'c' means that the file
+        will be mapped into virtual memory, so only the parts that are
+        needed will be actually read from disk and put into memory.
+        Other modes supported by numpy.load ('r', 'r+', 'w+') cannot
+        be supported by Theano.
+    Examples
+    --------
    >>> from theano import *
    >>> path = Variable(Generic())
    >>> x = tensor.load(path, 'int64', (False,))
@@ -72,6 +85,7 @@ def load(path, dtype, broadcastable, mmap_mode=None):
    >>> fn = function([path], y)
    >>> fn("stored-array.npy")
    array([0, 2, 4, 6, 8], dtype=int64)
    """
    return LoadFromDisk(dtype, broadcastable, mmap_mode)(path)
@@ -91,14 +105,19 @@ else:
 class MPIRecv(Op):
    """
-    An operation to asynchronously receive an array to a remote host using MPI
+    An operation to asynchronously receive an array from a remote host
+    using MPI.
    See Also
-       MPIRecv
+    --------
-       MPIWait
+    MPISend
+    MPIRecvWait
+    Notes
+    -----
+    Non-differentiable.
-    @note: Non-differentiable.
    """
    __props__ = ("source", "tag", "shape", "dtype")
    def __init__(self, source, tag, shape, dtype):
@@ -134,13 +153,18 @@ class MPIRecv(Op):
 class MPIRecvWait(Op):
    """
-    An operation to wait on a previously received array using MPI
+    An operation to wait on a previously received array using MPI.
    See Also
-       MPIRecv
+    --------
+    MPIRecv
+    Notes
+    -----
+    Non-differentiable.
-    @note: Non-differentiable.
    """
    __props__ = ("tag",)
    def __init__(self, tag):
@@ -168,14 +192,19 @@ class MPIRecvWait(Op):
 class MPISend(Op):
    """
-    An operation to asynchronously Send an array to a remote host using MPI
+    An operation to asynchronously send an array to a remote host using MPI.
    See Also
-       MPIRecv
+    --------
-       MPISendWait
+    MPIRecv
+    MPISendWait
+    Notes
+    -----
+    Non-differentiable.
-    @note: Non-differentiable.
    """
    __props__ = ("dest", "tag")
    def __init__(self, dest, tag):
@@ -202,12 +231,16 @@ class MPISend(Op):
 class MPISendWait(Op):
    """
-    An operation to wait on a previously sent array using MPI
+    An operation to wait on a previously sent array using MPI.
+    See Also
+    --------
+    MPISend
-    See Also:
+    Notes
-       MPISend
+    -----
+    Non-differentiable.
-    @note: Non-differentiable.
    """
    __props__ = ("tag",)
@@ -227,35 +260,35 @@ class MPISendWait(Op):
 def isend(var, dest, tag):
    """
-    Non blocking send
+    Non-blocking send.
    """
    return MPISend(dest, tag)(var)
 def send(var, dest, tag):
    """
-    blocking send
+    Blocking send.
    """
    return MPISendWait(tag)(*isend(var, dest, tag))
 def irecv(shape, dtype, source, tag):
    """
-    non-blocking receive
+    Non-blocking receive.
    """
    return MPIRecv(source, tag, shape, dtype)()
 def recv(shape, dtype, source, tag):
    """
-    blocking receive
+    Blocking receive.
    """
    return MPIRecvWait(tag)(*irecv(shape, dtype, source, tag))
 # Ordering keys for scheduling
 def mpi_send_wait_key(a):
-    """ Wait as long as possible on Waits, Start Send/Recvs early """
+    """Wait as long as possible on Waits, Start Send/Recvs early."""
    if isinstance(a.op, (MPIRecvWait, MPISendWait)):
        return 1
    if isinstance(a.op, (MPIRecv, MPISend)):
@@ -264,7 +297,7 @@ def mpi_send_wait_key(a):
 def mpi_tag_key(a):
-    """ Break MPI ties by using the variable tag - prefer lower tags first """
+    """Break MPI ties by using the variable tag - prefer lower tags first."""
    if isinstance(a.op, (MPISend, MPIRecv, MPIRecvWait, MPISendWait)):
        return a.op.tag
    else:

--- a/theano/tensor/nlinalg.py
+++ b/theano/tensor/nlinalg.py
@@ -17,17 +17,18 @@ logger = logging.getLogger(__name__)
 class MatrixPinv(Op):
    """Computes the pseudo-inverse of a matrix :math:`A`.
-    The pseudo-inverse of a matrix A, denoted :math:`A^+`, is
+    The pseudo-inverse of a matrix :math:`A`, denoted :math:`A^+`, is
    defined as: "the matrix that 'solves' [the least-squares problem]
    :math:`Ax = b`," i.e., if :math:`\\bar{x}` is said solution, then
    :math:`A^+` is that matrix such that :math:`\\bar{x} = A^+b`.
    Note that :math:`Ax=AA^+b`, so :math:`AA^+` is close to the identity matrix.
-    This method is not faster then `matrix_inverse`. Its strength comes from
+    This method is not faster than `matrix_inverse`. Its strength comes from
    that it works for non-square matrices.
    If you have a square matrix though, `matrix_inverse` can be both more
    exact and faster to compute. Also this op does not get optimized into a
    solve op.
    """
    __props__ = ()
@@ -55,8 +56,11 @@ class MatrixInverse(Op):
    matrix :math:`A_{inv}` such that the dot product :math:`A \cdot A_{inv}`
    and :math:`A_{inv} \cdot A` equals the identity matrix :math:`I`.
-    :note: When possible, the call to this op will be optimized to the call
+    Notes
-           of ``solve``.
+    -----
+    When possible, the call to this op will be optimized to the call
+    of ``solve``.
    """
    __props__ = ()
@@ -82,7 +86,7 @@ class MatrixInverse(Op):
        where :math:`V` corresponds to ``g_outputs`` and :math:`X` to
        ``inputs``. Using the `matrix cookbook
        <http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=3274>`_,
-        once can deduce that the relation corresponds to
+        one can deduce that the relation corresponds to
            .. math:: (X^{-1} \cdot V^{T} \cdot X^{-1})^T.
@@ -99,9 +103,9 @@ class MatrixInverse(Op):
            .. math:: \frac{\partial X^{-1}}{\partial X}V,
        where :math:`V` corresponds to ``g_outputs`` and :math:`X` to
-        ``inputs``.  Using the `matrix cookbook
+        ``inputs``. Using the `matrix cookbook
        <http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=3274>`_,
-        once can deduce that the relation corresponds to
+        one can deduce that the relation corresponds to
            .. math:: X^{-1} \cdot V \cdot X^{-1}.
@@ -120,11 +124,12 @@ matrix_inverse = MatrixInverse()
 def matrix_dot(*args):
-    """ Shorthand for product between several dots
+    """Shorthand for product between several dots.
    Given :math:`N` matrices :math:`A_0, A_1, .., A_N`, ``matrix_dot`` will
    generate the matrix product between all in the given order, namely
    :math:`A_0 \cdot A_1 \cdot A_2 \cdot .. \cdot A_N`.
    """
    rval = args[0]
    for a in args[1:]:
@@ -163,10 +168,14 @@ alloc_diag = AllocDiag()
 class ExtractDiag(Op):
-    """ Return the diagonal of a matrix.
+    """Return the diagonal of a matrix.
+    Notes
+    -----
+    Works on the GPU.
-    :note: work on the GPU.
    """
    __props__ = ("view",)
    def __init__(self, view=False):
@@ -246,14 +255,18 @@ def trace(X):
    """
    Returns the sum of diagonal elements of matrix X.
-    :note: work on GPU since 0.6rc4.
+    Notes
+    -----
+    Works on GPU since 0.6rc4.
    """
    return extract_diag(X).sum()
 class Det(Op):
-    """Matrix determinant
+    """
-    Input should be a square matrix
+    Matrix determinant. Input should be a square matrix.
    """
    __props__ = ()
@@ -287,9 +300,11 @@ det = Det()
 class Eig(Op):
-    """Compute the eigenvalues and right eigenvectors of a square array.
+    """
+    Compute the eigenvalues and right eigenvectors of a square array.
    """
    _numop = staticmethod(numpy.linalg.eig)
    __props__ = ()
@@ -317,6 +332,7 @@ class Eigh(Eig):
    Return the eigenvalues and eigenvectors of a Hermitian or symmetric matrix.
    """
    _numop = staticmethod(numpy.linalg.eigh)
    __props__ = ('UPLO',)
@@ -363,6 +379,7 @@ class Eigh(Eig):
           .. math:: \frac{\partial\,v_{kn}}
                          {\partial a_{ij}} =
                \sum_{m\ne n}\frac{v_{km}v_{jn}}{w_n-w_m}
        """
        x, = inputs
        w, v = self(x)
@@ -383,9 +400,11 @@ def _zero_disconnected(outputs, grads):
 class EighGrad(Op):
-    """Gradient of an eigensystem of a Hermitian matrix.
+    """
+    Gradient of an eigensystem of a Hermitian matrix.
    """
    __props__ = ('UPLO',)
    def __init__(self, UPLO='L'):
@@ -414,6 +433,7 @@ class EighGrad(Op):
        """
        Implements the "reverse-mode" gradient for the eigensystem of
        a square matrix.
        """
        x, w, v, W, V = inputs
        N = x.shape[0]
@@ -453,10 +473,13 @@ def eigh(a, UPLO='L'):
 class QRFull(Op):
    """
    Full QR Decomposition.
    Computes the QR decomposition of a matrix.
    Factor the matrix a as qr, where q is orthonormal
    and r is upper-triangular.
    """
    _numop = staticmethod(numpy.linalg.qr)
    __props__ = ('mode',)
@@ -484,9 +507,12 @@ class QRFull(Op):
 class QRIncomplete(Op):
    """
    Incomplete QR Decomposition.
    Computes the QR decomposition of a matrix.
    Factor the matrix a as qr and return a single matrix.
    """
    _numop = staticmethod(numpy.linalg.qr)
    __props__ = ('mode',)
@@ -513,15 +539,12 @@ def qr(a, mode="full"):
    Factor the matrix a as qr, where q
    is orthonormal and r is upper-triangular.
-    :type a:
+    Parameters
-        array_like, shape (M, N)
+    ----------
-    :param a:
+    a : array_like, shape (M, N)
        Matrix to be factored.
-    :type mode:
+    mode : {'reduced', 'complete', 'r', 'raw', 'full', 'economic'}, optional
-        one of 'reduced', 'complete', 'r', 'raw', 'full' and
-        'economic', optional
-    :keyword mode:
        If K = min(M, N), then
        'reduced'
@@ -558,19 +581,18 @@ def qr(a, mode="full"):
           both doing the same thing in the new numpy version but only
           full works on the old previous numpy version.
-    :rtype q:
+    Returns
-      matrix of float or complex, optional
+    -------
-    :return q:
+    q : matrix of float or complex, optional
-      A matrix with orthonormal columns. When mode = 'complete' the
+        A matrix with orthonormal columns. When mode = 'complete' the
-      result is an orthogonal/unitary matrix depending on whether or
+        result is an orthogonal/unitary matrix depending on whether or
-      not a is real/complex. The determinant may be either +/- 1 in
+        not a is real/complex. The determinant may be either +/- 1 in
-      that case.
+        that case.
+    r : matrix of float or complex, optional
-    :rtype r:
+        The upper-triangular matrix.
-      matrix of float or complex, optional
-    :return r:
-      The upper-triangular matrix.
    """
    x = [[2, 1], [3, 4]]
    if isinstance(numpy.linalg.qr(x, mode), tuple):
        return QRFull(mode)(a)
@@ -579,22 +601,26 @@ def qr(a, mode="full"):
 class SVD(Op):
+    """
+    Parameters
+    ----------
+    full_matrices : bool, optional
+        If True (default), u and v have the shapes (M, M) and (N, N),
+        respectively.
+        Otherwise, the shapes are (M, K) and (K, N), respectively,
+        where K = min(M, N).
+    compute_uv : bool, optional
+        Whether or not to compute u and v in addition to s.
+        True by default.
+    """
    # See doc in the docstring of the function just after this class.
    _numop = staticmethod(numpy.linalg.svd)
    __props__ = ('full_matrices', 'compute_uv')
    def __init__(self, full_matrices=True, compute_uv=True):
-        """
-        full_matrices : bool, optional
-            If True (default), u and v have the shapes (M, M) and (N, N),
-            respectively.
-            Otherwise, the shapes are (M, K) and (K, N), respectively,
-            where K = min(M, N).
-        compute_uv : bool, optional
-            Whether or not to compute u and v in addition to s.
-            True by default.
-        """
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
@@ -619,18 +645,21 @@ def svd(a, full_matrices=1, compute_uv=1):
    """
    This function performs the SVD on CPU.
-    :type full_matrices: bool, optional
+    Parameters
-    :param full_matrices:
+    ----------
+    full_matrices : bool, optional
        If True (default), u and v have the shapes (M, M) and (N, N),
        respectively.
        Otherwise, the shapes are (M, K) and (K, N), respectively,
        where K = min(M, N).
-    :type compute_uv: bool, optional
+    compute_uv : bool, optional
-    :param compute_uv:
        Whether or not to compute u and v in addition to s.
        True by default.
-    :returns: U, V and D matrices.
+    Returns
+    -------
+    U, V, D : matrices
    """
    return SVD(full_matrices, compute_uv)(a)

--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
@@ -44,8 +44,13 @@ from theano.gradient import grad_undefined
 # the output function is only defined when dr, dc, dt are natural numbers.
 class Conv3D(theano.Op):
-    """ 3D `convolution` of multiple filters on a minibatch
+    """
-        :note: does not flip the kernel, moves kernel with a user specified stride
+    3D `convolution` of multiple filters on a minibatch.
+    Notes
+    -----
+    Does not flip the kernel, moves kernel with a user specified stride.
    """
    __props__ = ()
@@ -54,10 +59,17 @@ class Conv3D(theano.Op):
    def make_node(self, V, W, b, d):
        """
-            :param V: Visible unit, input(batch,row,column,time,in channel)
+        Parameters
-            :param W: Weights, filter(out channel,row,column,time,in channel)
+        ----------
-            :param b: bias, shape == (W.shape[0],)
+        V
-            :param d: strides when moving the filter over the input(dx,dy,dt)
+            Visible unit, input(batch,row,column,time,in channel)
+        W
+            Weights, filter(out channel,row,column,time,in channel)
+        b
+            bias, shape == (W.shape[0],)
+        d
+            strides when moving the filter over the input(dx,dy,dt)
        """
        V_ = T.as_tensor_variable(V)
@@ -539,28 +551,39 @@ _conv3D = Conv3D()
 def conv3D(V, W, b, d):
    """
-    3D "convolution" of multiple filters on a minibatch
+    3D "convolution" of multiple filters on a minibatch.
    (does not flip the kernel, moves kernel with a user specified stride)
-    :param V: Visible unit, input.
+    Parameters
-        dimensions: (batch, row, column, time, in channel)
+    ----------
-    :param W: Weights, filter.
+    V
-        dimensions: (out channel, row, column, time ,in channel)
+        Visible unit, input.
-    :param b: bias, shape == (W.shape[0],)
+        Dimensions: (batch, row, column, time, in channel).
-    :param d: strides when moving the filter over the input(dx, dy, dt)
+    W
+        Weights, filter.
-    :note: The order of dimensions does not correspond to the one in `conv2d`.
+        Dimensions: (out channel, row, column, time, in channel).
-           This is for optimization.
+    b
+        Bias, shape == (W.shape[0],).
-    :note: The GPU implementation is very slow. You should use
+    d
-           :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>` or
+        Strides when moving the filter over the input(dx, dy, dt).
-           :func:`conv3d_fft <theano.sandbox.cuda.fftconv.conv3d_fft>` for a
-           GPU graph instead.
+    Notes
+    -----
-    :see: Someone made a script that shows how to swap the axes
+    The order of dimensions does not correspond to the one in `conv2d`.
-          between both 3d convolution implementations in Theano. See
+    This is for optimization.
-          the last `attachment
-          <https://groups.google.com/d/msg/theano-users/1S9_bZgHxVw/0cQR9a4riFUJ>`_.
+    The GPU implementation is very slow. You should use
+    :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>` or
+    :func:`conv3d_fft <theano.sandbox.cuda.fftconv.conv3d_fft>` for a
+    GPU graph instead.
+    See Also
+    --------
+    Someone made a script that shows how to swap the axes
+    between both 3d convolution implementations in Theano. See
+    the last `attachment <https://groups.google.com/d/msg/theano-users/1S9_bZgHxVw/0cQR9a4riFUJ>`_
 """
    return _conv3D(V, W, b, d)

--- a/theano/tensor/nnet/ConvGrad3D.py
+++ b/theano/tensor/nnet/ConvGrad3D.py
@@ -13,7 +13,11 @@ from theano.gradient import DisconnectedType
 # than visiting each weight gradient element once and passing through whole video
 class ConvGrad3D(theano.Op):
-    """ Gradient of Conv3D with respect to W """
+    """
+    Gradient of Conv3D with respect to W.
+    """
    __props__ = ()
    def c_code_cache_version(self):

--- a/theano/tensor/nnet/ConvTransp3D.py
+++ b/theano/tensor/nnet/ConvTransp3D.py
@@ -11,7 +11,12 @@ from theano.gradient import DisconnectedType
 class ConvTransp3D(theano.Op):
-    """ "Transpose" of Conv3D (Conv3D implements multiplication by an implicitly defined matrix W. This implements multiplication by its transpose) """
+    """
+    "Transpose" of Conv3D (Conv3D implements multiplication by an implicitly
+    defined matrix W. This implements multiplication by its transpose).
+    """
    __props__ = ()
    def c_code_cache_version(self):
@@ -19,10 +24,17 @@ class ConvTransp3D(theano.Op):
    def make_node(self, W, b, d, H, RShape=None):
        """
-        :param W: Weights, filter
+        Parameters
-        :param b: bias, shape == (W.shape[0],)
+        ----------
-        :param d: strides when moving the filter over the input
+        W
-        :param H: The output of Conv3D
+            Weights, filter.
+        b
+            Bias, shape == (W.shape[0],).
+        d
+            Strides when moving the filter over the input.
+        H
+            The output of Conv3D.
        """
        W_ = T.as_tensor_variable(W)
        b_ = T.as_tensor_variable(b)

--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -36,61 +36,58 @@ _logger = logging.getLogger("theano.tensor.nnet.conv")
 def conv2d(input, filters, image_shape=None, filter_shape=None,
           border_mode='valid', subsample=(1, 1), **kargs):
-    """This function will build the symbolic graph for convolving a stack of
+    """
+    This function will build the symbolic graph for convolving a stack of
    input images with a set of filters. The implementation is modelled after
    Convolutional Neural Networks (CNN). It is simply a wrapper to the ConvOp
    but provides a much cleaner interface.
-    :type input: symbolic 4D tensor
+    Parameters
-    :param input: mini-batch of feature map stacks, of shape
+    ----------
-                  (batch size, stack size, nb row, nb col)
+    input : symbolic 4D tensor
-                  see the optional parameter image_shape
+        Mini-batch of feature map stacks, of shape
+        (batch size, stack size, nb row, nb col)
-    :type filters: symbolic 4D tensor
+        see the optional parameter image_shape
-    :param filters: set of filters used in CNN layer of shape
+    filters : symbolic 4D tensor
-                    (nb filters, stack size, nb row, nb col)
+        Set of filters used in CNN layer of shape
-                    see the optional parameter filter_shape
+        (nb filters, stack size, nb row, nb col)
+        see the optional parameter filter_shape
-    :param border_mode:
+    border_mode : {'valid', 'full'}
-       'valid'-- only apply filter to complete patches of the image. Generates
+       'valid': only apply filter to complete patches of the image. Generates
-                 output of shape: image_shape - filter_shape + 1
+       output of shape: image_shape - filter_shape + 1.
-       'full' -- zero-pads image to multiple of filter shape to generate output
+       'full': zero-pads image to multiple of filter shape to generate output
-                 of shape: image_shape + filter_shape - 1
+       of shape: image_shape + filter_shape - 1.
+    subsample : tuple of len 2
-    :type subsample: tuple of len 2
+        Factor by which to subsample the output. Also called strides elsewhere.
-    :param subsample: factor by which to subsample the output.
+    image_shape : None, tuple/list of len 4 of int, None or Constant variable
-                      Also called strides elsewhere.
+        The shape of the input parameter.
+        Optional, used for optimization like loop unrolling
-    :type image_shape: None, tuple/list of len 4 of int, None or
+        You can put None for any element of the list to tell that this element
-                       Constant variable
+        is not constant.
-    :param image_shape: The shape of the input parameter.
+    filter_shape : None, tuple/list of len 4 of int, None or Constant variable
-                        Optional, used for optimization like loop unrolling
+        Optional, used for optimization like loop unrolling
-                        You can put None for any element of the list
+        You can put None for any element of the list
-                        to tell that this element is not constant.
+        to tell that this element is not constant.
-    :type filter_shape: None, tuple/list of len 4 of int, None or
+    kwargs
-                        Constant variable
+        Kwargs are passed onto ConvOp. Can be used to set the following:
-    :param filter_shape: Optional, used for optimization like loop unrolling
+        unroll_batch, unroll_kern, unroll_patch, openmp (see ConvOp doc).
-                         You can put None for any element of the list
-                         to tell that this element is not constant.
+        openmp: By default have the same value as
-    :param kwargs: kwargs are passed onto ConvOp.
+                config.openmp. For small image, filter,
-                   Can be used to set the following:
+                batch size, nkern and stack size, it can be
-                   unroll_batch, unroll_kern, unroll_patch,
+                faster to disable manually openmp. A fast and
-                   openmp (see ConvOp doc)
+                incomplete test show that with image size
+                6x6, filter size 4x4, batch size==1,
-                   openmp: By default have the same value as
+                n kern==1 and stack size==1, it is faster
-                           config.openmp. For small image, filter,
+                to disable it in valid mode. But if we
-                           batch size, nkern and stack size, it can be
+                grow the batch size to 10, it is faster
-                           faster to disable manually openmp. A fast and
+                with openmp on a core 2 duo.
-                           incomplete test show that with image size
-                           6x6, filter size 4x4, batch size==1,
+    Returns
-                           n kern==1 and stack size==1, it is faster
+    -------
-                           to disable it in valid mode. But if we
+    symbolic 4D tensor
-                           grow the batch size to 10, it is faster
+        Set of feature maps generated by convolutional layer. Tensor is
-                           with openmp on a core 2 duo.
+        of shape (batch size, nb filters, output row, output col).
-    :rtype: symbolic 4D tensor
-    :return: set of feature maps generated by convolutional layer. Tensor is
-        of shape (batch size, nb filters, output row, output col)
    """
@@ -171,6 +168,97 @@ class ConvOp(OpenMPOp):
        output[b,k,:,:] = \sum_i input[b,i,:,:] * filter[k,i,:,:] \forall b,k
    where b is the mini-batch index, k the filter index and * is the
    convolution operator.
+    The constructor initializes a ConvOp with given output_mode (full/valid).
+    All other parameters are optional and are only used to generate more
+    optimized c code, or to enable graph optimizers to optimally replace the
+    ConvOp.
+    NOTES ON OPTIMIZATION:
+    There are two types of optimization. The first is the selection of the
+    fastest algo when bsize and nkern are provided with imshp and kshp.
+    By default we try to select the fastest version. You can specify it
+    with the unroll_batch, unroll_kern, and unroll_patch parameter.
+    The second type of optimization is hardcoding some dimensions into the
+    code when all shapes are known.
+    This makes a significant difference for the 'full' output_mode.
+    Sometimes, the fastest implementation on x86-64 uses
+    {unroll_batch=4, unroll_kern=4, unroll_patch=False}
+    with all other shape parameters being provided.
+    For optimizing other architectures, see:
+    Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance
+    Matrix Multiplication, (mr x nr). ACM Transactions on Mathematical
+    Software, May 2008.
+    Figure 12: (mr x nr). For x86 use 2x4, itanium 8x8, etc.
+    Parameters
+    ----------
+    output_mode : {'valid', 'full'}
+        'valid' gives an output smaller than the image.
+        'full' gives an output bigger than the image.
+        See 'border_mode' in conv2d's doc.
+    Optional parameters: (will generate more optimal c code)
+    imshp : tuple of len 2 or 3: 2 for 2d image, 3 for a stack of 2d images.
+        Stacksize, nb image row, nb image col.
+    kshp : tuple of len 2
+        Nb kernel row, nb kernel col.
+    nkern : int
+        The number of kernels.
+    bsize : int
+        The size of the minibatch.
+    dx : int
+        Patch stride rows.
+    dy : int
+        Patch stride cols.
+    Params which select the version of code used:
+    unroll_patch : bool
+        Use a version of c_code that unrolls the patch loop and does not
+        require all shape information to work; if all shape information
+        is present, it is used to hardcode the values in the code for
+        faster code.
+    unroll_batch : int
+        Use a version of c_code that unrolls the batch (by unroll_batch)
+        and the nkern (by unroll_kern) loop. The size must be a multiple
+        of bsize or nkern respectively.
+    unroll_kern : int
+        Use a version of c_code that unrolls the batch
+        (by unroll_batch) and the nkern (by unroll_kern) loop. The size
+        must be a multiple of bsize or nkern respectively.
+    verbose : int
+        Passed to GpuConv.
+    version : int or str
+        Passed to GpuConv. If version='no_fft', fft
+        optimization will be deactivated at the op level.
+    direction_hint : {'forward', 'bprop weights', 'bprop inputs'}
+        Passed to GpuConv, used by graph optimizers to aid algorithm choice.
+    The 3 following parameters are used internally when we generate
+    the gradient when dx!=1 or dy!=1.
+    imshp_logical
+        Default None. None value is equivalent to imshp value.
+        When imshp_logical != imshp, it tells us we need to insert 0 in
+        the image before we do the convolution. For example, when dx==dy==2
+        and the image is [[1, 2], [3, 4]], we should act as if the image
+        was [[1, 0, 2, 0], [0, 0, 0, 0], [3, 0, 4, 0], [0, 0, 0, 0]].
+        Our python code inserts the zeros, but the c code optimizes it.
+        imshp_logical != imshp when taking the grad w.r.t. the weights or
+        the image when the output_mode is full and `dx != 1` or `dy != 1`.
+    kshp_logical
+        Same, but for kshp; used for the grad w.r.t. the
+        weights when the output_mode is valid and `dx != 1` or `dy != 1`.
+    kshp_logical_top_aligned
+        Used in the same case. Default to True.
+        Set to False in the grad w.r.t. the weights when the
+        output_mode is full.
    """
    __attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode',
@@ -257,10 +345,20 @@ class ConvOp(OpenMPOp):
        with kernels of shape "kshp". Accepts symbolic or integer shapes.
        Propagates `None`s (for unknown shapes).
-        :param inshp: (rows,cols) of input image
+        Parameters
-        :param kshp: (rows,cols) of filters
+        ----------
-        :param mode: 'valid' or 'full' (see 'border_mode' in conv2d's doc)
+        inshp
-        :return: (rows,cols) of output image
+            (rows,cols) of input image.
+        kshp
+            (rows,cols) of filters.
+        mode : {'valid', 'full'}
+            See 'border_mode' in conv2d's doc.
+        Returns
+        -------
+        object
+            (rows,cols) of output image.
        """
        # The formula would be ceil((i + s * k - s * 1) / float(d)),
        # with s=1 for mode=='full' and s=-1 for mode=='valid'.
@@ -284,92 +382,6 @@ class ConvOp(OpenMPOp):
                 version=-1,
                 direction_hint='forward',
                 openmp=None):
-        """
-        Initializes a ConvOp with given output_mode (full/valid). All other
-        parameters are optional and are only used to generate more optimized c
-        code, or to enable graph optimizers to optimally replace the ConvOp.
-        NOTES ON OPTIMIZATION:
-        Their is two type of optimization. The first is the selection of the
-        fastest algo when bsize and nkern are probided with imshp and kshp.
-        By default we try to select the fastest version. You can specify it
-        with the unroll_batch, unroll_kern, and unroll_patch parameter.
-        The second type of optimization is hardcoding some dimensions into the
-        code when all shape are know.
-        This make a significant difference for the 'full' output_mode.
-        Some times, the fastest implementation on x86-64 uses
-        {unroll_batch=4, unroll_kern=4, unroll_patch=False}
-        with all other shape parameters being provided.
-        For optimizing other architectures, see:
-        Kazushige Goto and Robert A. Van De Geijn, Anatomy of High-Performance
-        Matrix Multiplication, (mr x nr). ACM Transactions on Mathematical
-        Software, May 2008.
-        Figure 12: (mr x nr). For x86 use 2x4, itanium 8x8, etc.
-        :type output_mode: string
-        :param output_mode: 'valid' -- gives an output smaller then the image
-                            'full' -- gives an output bigger then the image
-        Optional parameters: (will generate more optimal c code)
-        :type imshp: tuple of len 2 or 3: 2 for 2d image,
-                                          3 for a stack of 2d images.
-        :param imshp: (stacksize, nb image row, nb image col)
-        :type kshp: tuple of len 2
-        :param kshp: (nb kernel row, nb kernel col)
-        :type nkern: int
-        :param nkern: the number of kernel
-        :type bsize: int
-        :param bsize: the size of the minibatch
-        :type dx: int
-        :param dx: patch stride rows
-        :type dy: int
-        :param dy: patch stride cols
-        Params which select the version of code used:
-        :type unroll_patch: bool
-        :param unroll_patch: use a version of c_code that unroll the patch loop
-            that don't request all shape information to work, but if all shape
-            information are present, will
-        use it to hardcode the value in the code for faster code.
-        :type unroll_batch:int
-        :param unroll_batch: use a version of c_code that unroll the batch
-            (by unroll_batch) and the nkern(by unroll_kern) loop. The size
-            must by a multiple of bsize or nkern respectively.
-        :type unroll_kern:int
-        :param unroll_kern: use a version of c_code that unroll the batch
-            (by unroll_batch) and the nkern(by unroll_kern) loop. The size
-            must by a multiple of bsize or nkern
-        respectively.
-        :type verbose: int
-        :param verbose: passed to GpuConv
-        :type version: int or str
-        :param version: passed to GpuConv, if version='no_fft', fft
-            optimization will be desactivated at the op level.
-        :param direction_hint: 'forward', 'bprop weights' or 'bprop inputs'.
-            Passed to GpuConv, used by graph optimizers to aid algorithm choice
-        The 3 following parameters are used internally when we generate
-        the gradient when dx!=1 or dy!=1.
-        :param imshp_logical: Default None. None value is equivalent to imshp
-            value. When imshp_logical != imshp, it tell we need to insert 0 in
-            the image before we do the convolution. For example, when dx==dy==2
-            and the image is [[1, 2], [3, 4]], we should make as if the image
-            was [[1, 0, 2, 0], [0, 0, 0, 0], [3, 0, 4, 0], [0, 0, 0, 0]].
-            Our python code insert the zero, but the c code optimize it.
-            imshp_logical != imshp when taking the grad again the weights or
-            the image when the output_mode is full and `dx != 1` or `dy != 1`.
-        :param kshp_logical: idem but for kshp and used for the grad again the
-            weights when the output_mode is valid and `dx != 1` or `dy != 1`.
-        :param kshp_logical_top_aligned: Used in the same case.Default to True.
-            Set to False in the grad again the weight when the
-            output_mode is full.
-        """
        # Deactivate fft_optimization at the op level if specified
        if version == "no_fft":
            self.fft_opt = False
@@ -587,7 +599,10 @@ class ConvOp(OpenMPOp):
                                    for a in self.__attrnames) + "}"
    def flops(self, inputs, outputs):
-        """ Useful with the hack in profilemode to print the MFlops"""
+        """
+        Useful with the hack in profilemode to print the MFlops.
+        """
        images, kerns = inputs
        out, = outputs
        assert images[1] == kerns[1]
@@ -608,8 +623,13 @@ class ConvOp(OpenMPOp):
    def make_node(self, inputs, kerns):
        # TODO: find a way to make ConvOp work for N-D (after NIPS09)
        """
-        inputs - 4 dim: batches x stacksize x rows x cols
+        Parameters
-        kerns - 4 dim: nkern x stackidx x rows x cols
+        ----------
+        inputs
+            4 dim: batches x stacksize x rows x cols.
+        kerns
+            4 dim: nkern x stackidx x rows x cols.
        """
        _inputs = as_tensor_variable(inputs)
        _kerns = as_tensor_variable(kerns)
@@ -655,7 +675,8 @@ class ConvOp(OpenMPOp):
    def perform(self, node, inp, out):
        """
-        By default if len(img2d.shape)==3, we
+        By default if len(img2d.shape)==3, we TODO
        """
        img2d, filtersflipped = inp
        z, = out
@@ -1818,7 +1839,9 @@ Py_XDECREF(img2d);
 def gen_conv_code_unroll_batch_kern(d, unroll_bsize=1, unroll_ksize=1):
-    """ c_code for ConvOp that unroll the batch size loop
+    """
+    c_code for ConvOp that unrolls the batch size loop.
    """
    assert unroll_bsize > 0 and unroll_ksize > 0
    if "unroll_bsize" in d or "unroll_ksize" in d or "unroll_iter" in d or "unroll_biter" in d or "unroll_kiter" in d:

--- a/theano/tensor/nnet/conv3d2d.py
+++ b/theano/tensor/nnet/conv3d2d.py
@@ -6,10 +6,13 @@ import theano.sandbox.cuda as cuda
 def get_diagonal_subtensor_view(x, i0, i1):
-    """Helper function for DiagonalSubtensor and
+    """
-    IncDiagonalSubtensor
+    Helper function for DiagonalSubtensor and IncDiagonalSubtensor.
+    Notes
+    -----
+    It returns a partial view of x, not a partial copy.
-    :note: it return a partial view of x, not a partial copy.
    """
    # We have to cast i0 and i0 to int because python 2.4 (and maybe later)
    # do not support indexing with 0-dim, 'int*' ndarrays.
@@ -27,13 +30,24 @@ def get_diagonal_subtensor_view(x, i0, i1):
 class DiagonalSubtensor(Op):
-    """Return a form a nd diagonal subtensor.
+    """
+    Return a form of n-d diagonal subtensor.
-    :param x: n-d tensor
-    :param i0: axis index in x
+    Parameters
-    :param i1: axis index in x
+    ----------
-    :note: Work on the GPU.
+    x
+        n-d tensor
+    i0
+        Axis index in x
+    i1
+        Axis index in x
+    Notes
+    -----
+    Works on the GPU.
+    Extended summary
+    ----------------
    ``x`` is some n-dimensional tensor, but this Op only deals with a
    matrix-shaped slice, using axes i0 and i1. Without loss of
    generality, suppose that ``i0`` picks out our ``row`` dimension,
@@ -73,6 +87,7 @@ class DiagonalSubtensor(Op):
    see what's necessary at that point.
    """
    __props__ = ("inplace",)
    def __str__(self):
@@ -111,8 +126,10 @@ diagonal_subtensor = DiagonalSubtensor(False)
 class IncDiagonalSubtensor(Op):
    """
-    The gradient of DiagonalSubtensor
+    The gradient of DiagonalSubtensor.
    """
    __props__ = ("inplace",)
    def __str__(self):
@@ -153,26 +170,39 @@ inc_diagonal_subtensor = IncDiagonalSubtensor(False)
 def conv3d(signals, filters,
           signals_shape=None, filters_shape=None,
           border_mode='valid'):
-    """Convolve spatio-temporal filters with a movie.
+    """
+    Convolve spatio-temporal filters with a movie.
    It flips the filters.
-    :param signals: timeseries of images whose pixels have color channels.
+    Parameters
-            shape: [Ns, Ts, C, Hs, Ws]
+    ----------
-    :param filters: spatio-temporal filters
+    signals
-            shape: [Nf, Tf, C, Hf, Wf]
+        Timeseries of images whose pixels have color channels.
-    :param signals_shape: None or a tuple/list with the shape of signals
+        Shape: [Ns, Ts, C, Hs, Ws].
-    :param filters_shape: None or a tuple/list with the shape of filters
+    filters
-    :param border_mode: The only one tested is 'valid'.
+        Spatio-temporal filters.
+        Shape: [Nf, Tf, C, Hf, Wf].
-    :note: Another way to define signals: (batch,  time, in channel, row, column)
+    signals_shape
-           Another way to define filters: (out channel,time,in channel, row, column)
+        None or a tuple/list with the shape of signals.
-    :note: For the GPU, you can use this implementation or
+    filters_shape
-           :func:`conv3d_fft <theano.sandbox.cuda.fftconv.conv3d_fft>`.
+        None or a tuple/list with the shape of filters.
+    border_mode
-    :see: Someone made a script that shows how to swap the axes between
+        The only one tested is 'valid'.
-          both 3d convolution implementations in Theano. See the last
-          `attachment <https://groups.google.com/d/msg/theano-users/1S9_bZgHxVw/0cQR9a4riFUJ>`_.
+    Notes
+    -----
+    Another way to define signals: (batch, time, in channel, row, column).
+    Another way to define filters: (out channel, time, in channel, row, column).
+    For the GPU, you can use this implementation or
+    :func:`conv3d_fft <theano.sandbox.cuda.fftconv.conv3d_fft>`.
+    See Also
+    --------
+    Someone made a script that shows how to swap the axes between
+    both 3d convolution implementations in Theano. See the last
+    `attachment <https://groups.google.com/d/msg/theano-users/1S9_bZgHxVw/0cQR9a4riFUJ>`_
    """
@@ -264,7 +294,8 @@ def conv3d(signals, filters,
 def make_gpu_optimizer(op, to_gpu):
-    """This function create optimizer that move some inputs to the GPU
+    """
+    Create an optimizer that moves some inputs to the GPU
    for op that work on both CPU and GPU.
    The op object is created by calling op(), so good default value
@@ -272,8 +303,12 @@ def make_gpu_optimizer(op, to_gpu):
    We suppose the same op work with CPU and GPU inputs.
-    :param op: the op that support GPU inputs
+    Parameters
-    :param to_gpu: a list of op inputs that are moved to the GPU.
+    ----------
+    op
+        The op that supports GPU inputs.
+    to_gpu
+        A list of op inputs that are moved to the GPU.
    """
    @theano.gof.local_optimizer([op, cuda.gpu_from_host])
@@ -281,6 +316,7 @@ def make_gpu_optimizer(op, to_gpu):
        """
        op(host_from_gpu()) -> host_from_gpu(op)
        gpu_from_host(op) -> op(gpu_from_host)
        """
        if isinstance(node.op, op):
            # op(host_from_gpu()) -> host_from_gpu(op)
@@ -314,7 +350,7 @@ if cuda.cuda_available:
 @theano.gof.local_optimizer([DiagonalSubtensor, IncDiagonalSubtensor])
 def local_inplace_DiagonalSubtensor(node):
-    """ also work for IncDiagonalSubtensor """
+    """Also works for IncDiagonalSubtensor."""
    if (isinstance(node.op, (DiagonalSubtensor, IncDiagonalSubtensor)) and
            not node.op.inplace):
        new_op = node.op.__class__(inplace=True)

--- a/theano/tensor/nnet/neighbours.py
+++ b/theano/tensor/nnet/neighbours.py
@@ -13,23 +13,29 @@ from theano.gradient import grad_undefined
 class Images2Neibs(Op):
+    """
+    Parameters
+    ----------
+    mode : {'valid', 'ignore_borders', 'wrap_centered'}
+        'valid': Requires an input that is a multiple of the
+            pooling factor (in each direction).
+        'ignore_borders': Same as valid, but will ignore the borders
+            if the shape(s) of the input is not a multiple of the pooling
+            factor(s).
+        'wrap_centered' : ?? TODO comment
+    Returns
+    -------
+    object
+        Reshapes the input as a 2D tensor where each row is a
+        pooling example.
+    """
    __props__ = ("mode",)
    def __init__(self, mode='valid'):
-        """
-        :type mode: str
-        :param mode: Possible values:
-            'valid': Requires an input that is a multiple of the
-                pooling factor (in each direction)
-            'ignore_borders': Same as valid, but will ignore the borders
-                if the shape(s) of the input
-                is not a multiple of the pooling factor(s)
-            'wrap_centered' : ?? TODO comment
-        :return:
-            Reshapes the input as a 2D tensor where each row is an
-            pooling example
-        """
        if mode not in ['valid', 'wrap_centered', 'ignore_borders']:
            raise NotImplementedError("Only the mode valid, ignore_borders"
                                      " and wrap_centered have been"
@@ -46,20 +52,22 @@ class Images2Neibs(Op):
    def make_node(self, ten4, neib_shape, neib_step=None):
        """
-        :param ten4:     a list of lists of images
+        Parameters
-                         ten4 is of shape (list 1 dim, list 2 dim,
+        ----------
-                                           row, col)
+        ten4 : a list of lists of images
-        :param neib_shape: (r,c) where r is the height of the neighborhood
+            ten4 is of shape (list 1 dim, list 2 dim, row, col).
-                        in rows and c is the width of the neighborhood
+        neib_shape
-                        in columns
+            (r,c) where r is the height of the neighborhood in rows and c is
-        :param neib_step: (dr,dc) where dr is the number of rows to
+            the width of the neighborhood in columns.
-                          skip between patch and dc is the number of
+        neib_step
-                          columns. When None, this is the same as
+            (dr,dc) where dr is the number of rows to skip between patch and dc
-                          neib_shape(patch are disjoint)
+            is the number of columns. When None, this is the same as neib_shape
+            (patches are disjoint).
-        output:
-            a 2D matrix, written using the following pattern
+        Returns
+        -------
+        matrix
+            A 2D matrix, written using the following pattern
            idx = 0
            for i in xrange(list 1 dim)
                for j in xrange(list 2 dim)
@@ -68,9 +76,10 @@ class Images2Neibs(Op):
                            output[idx,:]
                                 = flattened version of ten4[i,j,l:l+r,k:k+c]
                            idx += 1
-            (note: the op isn't necessarily implemented internally with these
+            .. note:: The op isn't necessarily implemented internally with these
            for loops, they're just the easiest way to describe the output
-            pattern)
+            pattern.
        """
        ten4 = T.as_tensor_variable(ten4)
        neib_shape = T.as_tensor_variable(neib_shape)
@@ -420,61 +429,46 @@ def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
    """
    Function :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`
    allows to apply a sliding window operation to a tensor containing
-    images
+    images or other two-dimensional objects.
-    or other two-dimensional objects.
+    The sliding window operation loops over points in input data and stores
-    The sliding window operation loops
+    a rectangular neighbourhood of each point.
-    over points in input data and stores a rectangular neighbourhood of
+    It is possible to assign a step of selecting patches (parameter `neib_step`).
-    each point.
-    It is possible to assign a step of selecting patches (parameter
+    Parameters
-    `neib_step`).
+    ----------
+    ten4 : A 4d tensor-like
-    :param ten4:     A 4-dimensional tensor which represents
+        A 4-dimensional tensor which represents a list of lists of images.
-                     a list of lists of images.a list of lists of images.
+        It should have shape (list 1 dim, list 2 dim, row, col). The first
-                     It should have shape (list 1 dim, list 2 dim,
+        two dimensions can be useful to store different channels and batches.
-                     row, col). The first two dimensions can be
+    neib_shape : A 1d tensor-like of 2 values
-                     useful to store different channels and batches.
+        A tuple containing two values: height and width of the neighbourhood.
-    :type ten4:      A 4d tensor-like.
+        It should have shape (r,c) where r is the height of the neighborhood
-    :param neib_shape: A tuple containing two
+        in rows and c is the width of the neighborhood in columns.
-                    values: height and width of the neighbourhood.
+    neib_step : A 1d tensor-like of 2 values
-                    It should have shape (r,c) where r is the height of the
+        (dr,dc) where dr is the number of rows to skip between patch and dc is
-                    neighborhood in rows and c is the width of the neighborhood
+        the number of columns. The parameter should be a tuple of two elements:
-                    in columns
+        number of rows and number of columns to skip each iteration.
-    :type neib_shape: A 1d tensor-like of 2 values.
+        Basically, when the step is 1, the neighbourhood of every first element
-    :param neib_step: (dr,dc) where dr is the number of rows to
+        is taken and every possible rectangular subset is returned.
-                      skip between patch and dc is the number of
+        By default it is equal to `neib_shape`; in other words, the patches are
-                      columns. The parameter should be a tuple of two elements:
+        disjoint. When the step is greater than `neib_shape`, some elements are
-                      number
+        omitted. When None, this is the same as neib_shape (patch are disjoint).
-                      of rows and number of columns to skip each iteration.
+        .. note:: Currently the step size should be chosen in the way that the
-                      Basically, when the step is 1, the neighbourhood of every
+            corresponding dimension :math:`i` (width or height) is equal to
-                      first element is taken and every possible rectangular
+            :math:`n * step\_size_i + neib\_shape_i` for some :math:`n`
-                      subset is returned. By default it is equal to
+    mode : {'valid', 'ignore_borders', 'wrap_centered'}
-                      `neib_shape` in other words, the
-                      patches are disjoint. When the step is greater than
-                      `neib_shape`, some elements are omitted. When None, this
-                      is the same as
-                      neib_shape(patch are disjoint)
-                      .. note:: Currently the step size should be chosen in the way that the
-                         corresponding dimension :math:`i` (width or height) is equal to
-                         :math:`n * step\_size_i + neib\_shape_i` for some :math:`n`
-    :type neib_step: A 1d tensor-like of 2 values.
-    :param mode:
-        Possible values:
        ``valid``
-           Requires an input that is a multiple of the
+        Requires an input that is a multiple of the
-           pooling factor (in each direction)
+        pooling factor (in each direction).
        ``ignore_borders``
-           Same as valid, but will ignore the borders
+        Same as valid, but will ignore the borders if the shape(s) of
-           if the shape(s) of the input
+        the input is not a multiple of the pooling factor(s).
-           is not a multiple of the pooling factor(s)
        ``wrap_centered``
-           ?? TODO comment
+        ?? TODO comment
-    :type mode: str
+    Returns
-    :return:
+    -------
+    object
        Reshapes the input as a 2D tensor where each row is an
        pooling example. Pseudo-code of the output:
@@ -493,7 +487,8 @@ def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
             these for loops, they're just the easiest way to describe the
             output pattern.
-    Example:
+    Examples
+    --------
    .. code-block:: python
@@ -512,6 +507,7 @@ def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
    .. note:: The underlying code will construct a 2D tensor of disjoint
       patches 5x5. The output has shape 4x25.
    """
    return Images2Neibs(mode)(ten4, neib_shape, neib_step)
@@ -524,25 +520,37 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
    the output of :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`
    and reconstructs its input.
-    :param neibs: matrix like the one obtained by
+    Parameters
-                  :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`
+    ----------
-    :param neib_shape: `neib_shape` that was used in
+    neibs : matrix
-                  :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`
+        Like the one obtained by
-    :param original_shape: original shape of the 4d tensor given to
+        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`.
-                  :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`
+    neib_shape
+        `neib_shape` that was used in
-    :return: Reconstructs the input of
+        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`.
-                  :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`,
+    original_shape
-                  a 4d tensor of shape `original_shape`.
+        Original shape of the 4d tensor given to
+        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`.
-    .. note:: Currently, the function doesn't support tensors created with
-       `neib_step` different from default value. This means that it may be
+    Returns
-       impossible to compute the gradient of a variable gained by
+    -------
-       :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` w.r.t.
+    object
-       its inputs in this case, because it uses
+        Reconstructs the input of
-       :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` for
+        :func:`images2neibs <theano.sandbox.neighbours.images2neibs>`,
-       gradient computation.
+        a 4d tensor of shape `original_shape`.
+    Notes
+    -----
+    Currently, the function doesn't support tensors created with
+    `neib_step` different from default value. This means that it may be
+    impossible to compute the gradient of a variable gained by
+    :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` w.r.t.
+    its inputs in this case, because it uses
+    :func:`images2neibs <theano.sandbox.neigbours.neibs2images>` for
+    gradient computation.
+    Examples
+    --------
    Example, which uses a tensor gained in example for
    :func:`images2neibs <theano.sandbox.neigbours.neibs2images>`:
@@ -555,6 +563,7 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
        im_new_val = inv_window(neibs_val)
    .. note:: The code will output the initial image array.
    """
    neibs = T.as_tensor_variable(neibs)
    neib_shape = T.as_tensor_variable(neib_shape)

--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
-"""Provides neural-network specific Ops.
+"""
+Provides neural-network specific Ops.
-:note: TODO: factor this out into a neural-network toolbox.
+Notes
+-----
+TODO: factor this out into a neural-network toolbox.
-:note: We register all optimization with the gpu tag as we don't
+We register all optimization with the gpu tag as we don't
-    implement all the intermediate case on the GPU (in particular
+implement all the intermediate cases on the GPU (in particular
-    AdvancedSubtensor). So to make sure it run well on the gpu with
+AdvancedSubtensor). So to make sure it runs well on the gpu with
-    fast_compile, we register them as needed for the GPU. This can be
+fast_compile, we register them as needed for the GPU. This can be
-    revisited later when all the intermediate part are on the GPU.
+revisited later when all the intermediate parts are on the GPU.
 """
 import logging
@@ -38,13 +41,16 @@ class SoftmaxWithBias(gof.Op):
    """
    An L{Op} for the output of neural-net multiclass classifiers.
-    @type x: is a matrix of floats (32 or 64)
+    Attributes
-    @type b: is a [row] vector of floats (32 or 64),
+    ----------
-             length is number of cols in x
+    x : a matrix of floats (32 or 64)
+    b : a [row] vector of floats (32 or 64), length is number of cols in x
    This L{Op}'s output is softmax(x+b).
    softmax(x[i]) is the i'th distribution over len(x[i]) options.
    """
    nin = 2
    nout = 1
    __props__ = ()
@@ -270,7 +276,11 @@ softmax_with_bias = SoftmaxWithBias()
 class SoftmaxGrad(gof.Op):
-    """Gradient wrt x of the Softmax Op"""
+    """
+    Gradient wrt x of the Softmax Op.
+    """
    nin = 2
    nout = 1
    __props__ = ()
@@ -391,6 +401,7 @@ class Softmax(gof.Op):
    \\frac{e^{\mathbf{x}_j}}{\sum_{k=1}^K e^{\mathbf{x}_k}}`
    where :math:`K` is the total number of neurons in the layer. This
    activation function gets applied row-wise.
    """
    nin = 1
@@ -584,7 +595,9 @@ def softmax(c):
 @opt.register_specialize('fast_compile_gpu')
 @gof.local_optimizer([softmax_op])
 def local_softmax_with_bias(node):
-    """Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias)
+    """
+    Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias).
    """
    if node.op == softmax_op:
        x, = node.inputs
@@ -789,15 +802,19 @@ if 0:
 class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
-    """A special compound L{Op} for the output of neural-net classifiers.
+    """
+    A special compound L{Op} for the output of neural-net classifiers.
-    :type x: is a matrix of floats (32 or 64)
+    Parameters
-    :type b: is a [row] vector of floats (32 or 64),
+    ----------
-             length is number of cols in x
+    x : a matrix of floats (32 or 64)
-    :type y_idx: a [column] vector of int (32 or 64),
+    b : a [row] vector of floats (32 or 64), length is number of cols in x
-                 length is number of rows in x
+    y_idx : a [column] vector of int (32 or 64), length is number of rows in x
-    :returns:  row-wise NLL, softmax(x+b), row-wise argmax of (x+b)
+    Returns
+    -------
+    object
+        row-wise NLL, softmax(x+b), row-wise argmax of (x+b).
    @precondition: every entry in y_idx is a valid (non-negative)
                   column index into x
@@ -816,6 +833,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
    i'th example.
    """
    nin = 3
    nout = 3
    __props__ = ()
@@ -846,7 +864,8 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
        return Apply(self, [x, b, y_idx], [nll, sm, am])
    def perform(self, node, input_storage, output_storage):
-        """The math, where x is an input vector, and t is a target index:
+        """
+        The math, where x is an input vector, and t is a target index:
            softmax(x)[i] = exp(x[i]) / sum_j(exp(x[j]))
            nll(x,t) = -log(softmax(x)[t])
@@ -1037,12 +1056,15 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
 class CrossentropySoftmax1HotWithBiasDx(gof.Op):
+    """
+    Gradient wrt x of the CrossentropySoftmaxArgmax1HotWithBias Op.
+    """
    nin = 3
    nout = 1
    __props__ = ()
-    """Gradient wrt x of the CrossentropySoftmaxArgmax1HotWithBias Op"""
    def make_node(self, dy, sm, y_idx, **kwargs):
        dy = tensor.as_tensor_variable(dy)
        sm = tensor.as_tensor_variable(sm)
@@ -1217,15 +1239,19 @@ def crossentropy_softmax_1hot(x, y_idx, **kwargs):
 def crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx, **kwargs):
    """
-    @return: The cross-entropy, the softmax output, the max probability,
+    Returns
-             and the argmax index
+    -------
+    object
+        The cross-entropy, the softmax output, the max probability,
+        and the argmax index.
-    @todo: Since we are recomputing the argmax,
+    TODO: Since we are recomputing the argmax,
           we might as well assert that it is correct.
-    @todo: Make this entire function is
+    TODO: Is this entire function
    unnecessary? e.g. CrossentropySoftmaxArgmax1HotWithBias should return
    the appropriate information (i.e. the max probability)?
    """
    (xent, softmax) = crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs)
    (max_pr, argmax) = tensor.max_and_argmax(softmax, axis=-1)
@@ -1262,29 +1288,34 @@ crossentropy_categorical_1hot_grad = CrossentropyCategorical1HotGrad()
 class CrossentropyCategorical1Hot(gof.Op):
+    """
-    """Compute the cross entropy between a coding distribution and
+    Compute the cross entropy between a coding distribution and
-    a true distribution of the form [0, 0, ... 0, 1, 0, ..., 0]
+    a true distribution of the form [0, 0, ... 0, 1, 0, ..., 0].
    .. math::
        y[i] = - \log(coding_dist[i, one_of_n[i])
+    Notes
-    :note: In the case that the coding distribution is the output of a
+    -----
-           softmax, an application of this Op will probably be optimized
+    In the case that the coding distribution is the output of a
-           away in favour of one with a C implementation.
+    softmax, an application of this Op will probably be optimized
+    away in favour of one with a C implementation.
    """
    __props__ = ()
    def make_node(self, coding_dist, true_one_of_n):
        """
-        :type coding_dist: dense matrix
+        Parameters
+        ----------
+        coding_dist : dense matrix
+        true_one_of_n : lvector
-        :type true_one_of_n: lvector
+        Returns
+        -------
+        dvector
-        :rtype: dvector
        """
        _coding_dist = tensor.as_tensor_variable(coding_dist)
        _true_one_of_n = tensor.as_tensor_variable(true_one_of_n)
@@ -1332,10 +1363,13 @@ crossentropy_categorical_1hot = CrossentropyCategorical1Hot()
 @opt.register_specialize('fast_compile_gpu')
 @gof.optimizer
 def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
-    """This is a stabilization optimization
+    """
+    This is a stabilization optimization.
-    :note: not a local optimization because we are replacing outputs
+    Notes
-    from several nodes at once
+    -----
+    Not a local optimization because we are replacing outputs
+    from several nodes at once.
    """
@@ -1362,16 +1396,19 @@ def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
 @gof.optimizer
 def crossentropy_to_crossentropy_with_softmax(fgraph):
-    """This is a stabilization optimization that is more general then
+    """
-    crossentropy_to_crossentropy_with_softmax_with_bias
+    This is a stabilization optimization that is more general than
+    crossentropy_to_crossentropy_with_softmax_with_bias.
    It must be executed after local_softmax_with_bias optimization in
-    specialize
+    specialize.
-    :todo: This is a stabilization optimization! How to make this more cleanly?
+    TODO : This is a stabilization optimization! How to do this more cleanly?
-    :note: not a local optimization because we are replacing outputs
+    Notes
-           from several nodes at once
+    -----
+    Not a local optimization because we are replacing outputs from several
+    nodes at once.
    """
@@ -1460,11 +1497,13 @@ def local_argmax_pushdown(node):
 def _check_rows_is_arange_len_labels(rows, labels):
-    '''Check that 'rows' is the same node as T.arange(labels.shape[0])
+    """
+    Check that 'rows' is the same node as T.arange(labels.shape[0]).
    Also considers the case where labels.shape[0] is constant and equal
    to 1, and T.arange(labels.shape[0]) has been constant-folded into 0.
-    '''
+    """
    if labels.owner and hasattr(labels.owner.fgraph, 'shape_feature'):
        shape_of = labels.owner.fgraph.shape_feature.shape_of
@@ -1795,10 +1834,11 @@ def graph_merge_softmax_with_crossentropy_softmax(node):
 @gof.local_optimizer([CrossentropySoftmax1HotWithBiasDx])
 def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
    """
-    Replaces a CrossentropySoftmax1HotWithBiasDx op, whose incoming gradient is
+    Replace a CrossentropySoftmax1HotWithBiasDx op, whose incoming gradient is
    an `alloc` of a scalar variable or one that has either broadcastable or
    matching dimensions with the output variable, by one that skips the
    intermediate `alloc`.
    """
    if isinstance(node.op, CrossentropySoftmax1HotWithBiasDx):
        dy, sm, y_idx = node.inputs
@@ -1850,30 +1890,38 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
 def binary_crossentropy(output, target):
    """
-    Compute the crossentropy of binary random variables
+    Compute the crossentropy of binary random variables.
-    output and target are each expectations of binary random
+    Output and target are each expectations of binary random
    variables; target may be exactly 0 or 1 but output must
    lie strictly between 0 and 1.
-    @note: we could use the x log y op to support output=0
-    @ and output=1. The gradient would still be undefined though.
+    Notes
-    @note: We do not sum, crossentropy is computed by component.
+    -----
-    @todo: Rewrite as a scalar, and then broadcast to tensor.
+    We could use the x log y op to support output=0 and output=1.
+    The gradient would still be undefined though.
+    We do not sum, crossentropy is computed by component.
+    TODO : Rewrite as a scalar, and then broadcast to tensor.
    """
    return -(target * tensor.log(output) + (1.0 - target) * tensor.log(1.0 - output))
 def categorical_crossentropy(coding_dist, true_dist):
    """
-    WARNING: THIS FUNCTION IS UNNECESSARILY POLYMORPHIC.
+    Return the cross-entropy between an approximating distribution and a true
-    We ultimately don't want the polymorphism, and will move this function to pylearn.algorithms.cost.
+    distribution.
-    The 1hot version will be removed.
-    The length of the documentation here is a form of code smell.
-    Return the cross-entropy between an approximating distribution and a true distribution
+    .. warning:: THIS FUNCTION IS UNNECESSARILY POLYMORPHIC.
+        We ultimately don't want the polymorphism, and will move this function
+        to pylearn.algorithms.cost. The 1hot version will be removed.
+        The length of the documentation here is a form of code smell.
-    The cross entropy between two probability distributions measures the average number of bits
+    The cross entropy between two probability distributions measures the average
-    needed to identify an event from a set of possibilities, if a coding scheme is used based
+    number of bits needed to identify an event from a set of possibilities, if a
-    on a given probability distribution q, rather than the "true" distribution p.
+    coding scheme is used based on a given probability distribution q, rather
+    than the "true" distribution p.
    Mathematically it is defined as follows:
@@ -1881,20 +1929,25 @@ def categorical_crossentropy(coding_dist, true_dist):
        H(p,q) = - \sum_x p(x) \log(q(x))
-    :type coding_dist: a dense matrix.
+    Parameters
-    :param coding_dist: Each slice along axis represents one distribution.
+    ----------
+    coding_dist : a dense matrix
-    :type true_dist: a dense matrix or sparse matrix or integer vector.
+        Each slice along axis represents one distribution.
-    :param coding_dist: In the case of a matrix argument, each slice along axis represents one
+    true_dist : a dense matrix or sparse matrix or integer vector
-    distribution.  In the case of an integer vector argument, each element represents the
+        In the case of a matrix argument, each slice along axis represents one
-    position of the '1' in a 1-of-N encoding.
+        distribution. In the case of an integer vector argument, each element
+        represents the position of the '1' in a 1-of-N encoding.
-    :type axis: int
-    :param axis: the dimension over which each distribution runs. (1 for row distributions, 0
+    Returns
-    for column distributions)
+    -------
+    tensor of rank one-less-than `coding_dist`
-    :rtype: tensor of rank one-less-than `coding_dist`
+        The cross entropy between each coding and true distribution.
-    :returns: the cross entropy between each coding and true distribution.
+    Notes
+    -----
+    axis : int
+        The dimension over which each distribution runs
+        (1 for row distributions, 0 for column distributions).
    """
    if true_dist.ndim == coding_dist.ndim:
@@ -2036,23 +2089,27 @@ def relu(x, alpha=0):
    """
    Compute the element-wise rectified linear activation function.
-    :type x: symbolic tensor
+    Parameters
-    :param x: Tensor to compute the activation function for.
+    ----------
+    x : symbolic tensor
-    :type alpha: scalar or tensor, optional
+        Tensor to compute the activation function for.
-    :param alpha: Slope for negative input, usually between 0 and 1. The
+    alpha : scalar or tensor, optional
-        default value of 0 will lead to the standard rectifier, 1 will lead to
+        Slope for negative input, usually between 0 and 1. The default value
+        of 0 will lead to the standard rectifier, 1 will lead to
        a linear activation function, and any value in between will give a
        leaky rectifier. A shared variable (broadcastable against `x`) will
        result in a parameterized rectifier with learnable slope(s).
-    :rtype: symbolic tensor
+    Returns
-    :return: element-wise rectifier applied to `x`
+    -------
+    symbolic tensor
+        Element-wise rectifier applied to `x`.
-    .. note:: This is numerically equivalent to
+    Notes
-        ``T.switch(x > 0, x, alpha * x)``
+    -----
-        (or ``T.maximum(x, alpha * x)`` for ``alpha < 1``), but uses a faster
+    This is numerically equivalent to ``T.switch(x > 0, x, alpha * x)``
-        formulation or an optimized Op, so we encourage to use this function.
+    (or ``T.maximum(x, alpha * x)`` for ``alpha < 1``), but uses a faster
+    formulation or an optimized Op, so we encourage to use this function.
    """
    # This is probably the fastest implementation for GPUs. Both the forward

--- a/theano/tensor/nnet/sigm.py
+++ b/theano/tensor/nnet/sigm.py
-"""Ops and optimizations: sigmoid, softplus
+"""
+Ops and optimizations: sigmoid, softplus.
+These functions implement special cases of exp and log to improve numerical
+stability.
-These functions implement special cases of exp and log to improve numerical stability.
 """
 from __future__ import print_function
@@ -25,6 +28,7 @@ from theano.tensor import elemwise, opt, NotScalarConstantError
 class ScalarSigmoid(scalar.UnaryScalarOp):
    """
    This is just speed opt. Not for stability.
    """
    @staticmethod
    def st_impl(x):
@@ -126,7 +130,8 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
    @staticmethod
    def gen_graph():
        """
-        This method was used to generate the graph: sigmoid_prec.png in the doc
+        This method was used to generate the graph: sigmoid_prec.png in the doc.
        """
        data = numpy.arange(-15, 15, .1)
        val = 1 / (1 + numpy.exp(-data))
@@ -173,6 +178,7 @@ pprint.assign(sigmoid, printing.FunctionPrinter('sigmoid'))
 class UltraFastScalarSigmoid(scalar.UnaryScalarOp):
    """
    This is just speed opt. Not for stability.
    """
    @staticmethod
    def st_impl(x):
@@ -245,7 +251,7 @@ def local_ultra_fast_sigmoid(node):
    When enabled, change all sigmoid to ultra_fast_sigmoid.
    For example do mode.including('local_ultra_fast_sigmoid')
-    or use the Theano flag optimizer_including=local_ultra_fast_sigmoid
+    or use the Theano flag optimizer_including=local_ultra_fast_sigmoid.
    This speeds up the sigmoid op by using an approximation.
@@ -269,11 +275,12 @@ theano.compile.optdb['uncanonicalize'].register("local_ultra_fast_sigmoid",
 def hard_sigmoid(x):
-    """An approximation of sigmoid.
+    """
+    An approximation of sigmoid.
    More approximate and faster than ultra_fast_sigmoid.
-    Approx in 3 parts: 0, scaled linear, 1
+    Approx in 3 parts: 0, scaled linear, 1.
    Removing the slope and shift does not make it faster.
@@ -375,7 +382,13 @@ logsigm_to_softplus = gof.PatternSub(
 def _is_1(expr):
-    """rtype bool. True iff expr is a constant close to 1
+    """
+    Returns
+    -------
+    bool
+        True iff expr is a constant close to 1.
    """
    try:
        v = opt.get_scalar_constant_value(expr)
@@ -405,8 +418,13 @@ opt.register_stabilize(log1pexp_to_softplus, name='log1pexp_to_softplus')
 def is_1pexp(t):
    """
-    If 't' is of the form (1+exp(x)), return (False, x).
-    Else return None.
+    Returns
+    -------
+    object
+        If 't' is of the form (1+exp(x)), return (False, x).
+        Else return None.
    """
    if t.owner and t.owner.op == tensor.add:
        scalars, scalar_inputs, nonconsts = \
@@ -449,11 +467,18 @@ def is_exp(var):
    """
    Match a variable with either of the `exp(x)` or `-exp(x)` patterns.
-    :param var: The Variable to analyze.
+    Parameters
+    ----------
+    var
+        The Variable to analyze.
+    Returns
+    -------
+    tuple
+        A pair (b, x) with `b` a boolean set to True if `var` is of the
+        form `-exp(x)` and False if `var` is of the form `exp(x)`. If `var`
+        cannot be cast into either form, then return `None`.
-    :return: A pair (b, x) with `b` a boolean set to True if `var` is of the
-    form `-exp(x)` and False if `var` is of the form `exp(x)`. If `var` cannot
-    be cast into either form, then return `None`.
    """
    neg = False
    neg_info = is_neg(var)
@@ -468,10 +493,17 @@ def is_mul(var):
    """
    Match a variable with `x * y * z * ...`.
-    :param var: The Variable to analyze.
+    Parameters
+    ----------
+    var
+        The Variable to analyze.
+    Returns
+    -------
+    object
+        A list [x, y, z, ...] if `var` is of the form `x * y * z * ...`,
+        or None if `var` cannot be cast into this form.
-    :return: A list [x, y, z, ...] if `var` is of the form `x * y * z * ...`,
-    or None if `var` cannot be cast into this form.
    """
    if var.owner and var.owner.op == tensor.mul:
        return var.owner.inputs
@@ -504,9 +536,16 @@ def is_neg(var):
    """
    Match a variable with the `-x` pattern.
-    :param var: The Variable to analyze.
+    Parameters
+    ----------
+    var
+        The Variable to analyze.
+    Returns
+    -------
+    object
+        `x` if `var` is of the form `-x`, or None otherwise.
-    :return: `x` if `var` is of the form `-x`, or None otherwise.
    """
    apply = var.owner
    if not apply:
@@ -538,8 +577,10 @@ def is_neg(var):
 @opt.register_stabilize
 @gof.local_optimizer([tensor.true_div])
 def local_exp_over_1_plus_exp(node):
-    """exp(x)/(1+exp(x)) -> sigm(x)
+    """
+    exp(x)/(1+exp(x)) -> sigm(x)
    c/(1+exp(x)) -> c*sigm(-x)
    """
    # this optimization should be done for numerical stability
    # so we don't care to check client counts
@@ -585,20 +626,28 @@ def parse_mul_tree(root):
    """
    Parse a tree of multiplications starting at the given root.
-    :param root: The variable at the root of the tree.
+    Parameters
+    ----------
-    :return: A tree where each non-leaf node corresponds to a multiplication
+    root
-    in the computation of `root`, represented by the list of its inputs. Each
+        The variable at the root of the tree.
-    input is a pair [n, x] with `n` a boolean value indicating whether
-    sub-tree `x` should be negated.
+    Returns
+    -------
-    Examples:
+    object
+        A tree where each non-leaf node corresponds to a multiplication
+        in the computation of `root`, represented by the list of its inputs.
+        Each input is a pair [n, x] with `n` a boolean value indicating whether
+        sub-tree `x` should be negated.
+    Examples
+    --------
        x * y               -> [False, [[False, x], [False, y]]]
        -(x * y)            -> [True, [[False, x], [False, y]]]
        -x * y              -> [False, [[True, x], [False, y]]]
        -x                  -> [True, x]
        (x * y) * -z        -> [False, [[False, [[False, x], [False, y]]],
                                        [True, z]]]
    """
    # Is it a multiplication?
    mul_info = is_mul(root)
@@ -619,29 +668,36 @@ def parse_mul_tree(root):
 def replace_leaf(arg, leaves, new_leaves, op, neg):
    """
-    Attempts to replace a leaf of a multiplication tree.
+    Attempt to replace a leaf of a multiplication tree.
    We search for a leaf in `leaves` whose argument is `arg`, and if we find
    one, we remove it from `leaves` and add to `new_leaves` a leaf with
    argument `arg` and variable `op(arg)`.
-    :param arg: The argument of the leaf we are looking for.
+    Parameters
+    ----------
-    :param leaves: List of leaves to look into. Each leaf should be a pair
+    arg
-    (x, l) with `x` the argument of the Op found in the leaf, and `l` the
+        The argument of the leaf we are looking for.
-    actual leaf as found in a multiplication tree output by `parse_mul_tree`
+    leaves
-    (i.e. a pair [boolean, variable]).
+        List of leaves to look into. Each leaf should be a pair
+        (x, l) with `x` the argument of the Op found in the leaf, and `l` the
-    :param new_leaves: If a replacement occurred, then the leaf is removed from
+        actual leaf as found in a multiplication tree output by `parse_mul_tree`
-    `leaves` and added to the list `new_leaves` (after being modified by `op`).
+        (i.e. a pair [boolean, variable]).
+    new_leaves
-    :param op: A function that, when applied to `arg`, returns the Variable
+        If a replacement occurred, then the leaf is removed from `leaves`
-    we want to replace the original leaf variable with.
+        and added to the list `new_leaves` (after being modified by `op`).
+    op
-    :param neg: If True, then the boolean value associated to the leaf should
+        A function that, when applied to `arg`, returns the Variable
-    be swapped. If False, then this value should remain unchanged.
+        we want to replace the original leaf variable with.
+    neg : bool
+        If True, then the boolean value associated to the leaf should
+        be swapped. If False, then this value should remain unchanged.
+    Returns
+    -------
+    bool
+        True if a replacement occurred, or False otherwise.
-    :return: True if a replacement occurred, or False otherwise.
    """
    for idx, x in enumerate(leaves):
        if x[0] == arg:
@@ -657,12 +713,19 @@ def simplify_mul(tree):
    """
    Simplify a multiplication tree.
-    :param tree: A multiplication tree (as output by `parse_mul_tree`).
+    Parameters
+    ----------
+    tree
+        A multiplication tree (as output by `parse_mul_tree`).
+    Returns
+    -------
+    object
+        A multiplication tree computing the same output as `tree` but without
+        useless multiplications by 1 nor -1 (identified by leaves of the form
+        [False, None] or [True, None] respectively). Useless multiplications
+        (with less than two inputs) are also removed from the tree.
-    :return: A multiplication tree computing the same output as `tree` but
-    without useless multiplications by 1 nor -1 (identified by leaves of the
-    form [False, None] or [True, None] respectively). Useless multiplications
-    (with less than two inputs) are also removed from the tree.
    """
    neg, inputs = tree
    if isinstance(inputs, list):
@@ -694,12 +757,18 @@ def compute_mul(tree):
    Compute the Variable that is the output of a multiplication tree.
    This is the inverse of the operation performed by `parse_mul_tree`, i.e.
-        compute_mul(parse_mul_tree(tree)) == tree
+    compute_mul(parse_mul_tree(tree)) == tree.
+    Parameters
+    ----------
+    tree
+        A multiplication tree (as output by `parse_mul_tree`).
-    :param tree: A multiplication tree (as output by `parse_mul_tree`).
+    Returns
+    -------
+    object
+        A Variable that computes the multiplication represented by the tree.
-    :return: A Variable that computes the multiplication represented by the
-    tree.
    """
    neg, inputs = tree
    if inputs is None:
@@ -727,32 +796,38 @@ def perform_sigm_times_exp(tree, exp_x=None, exp_minus_x=None, sigm_x=None,
    by replacing matching pairs (exp, sigmoid) with the desired optimized
    version.
-    :param tree: The sub-tree to operate on.
+    Parameters
+    ----------
-    :exp_x: List of arguments x so that `exp(x)` exists somewhere in the whole
+    tree
-    multiplication tree. Each argument is a pair (x, leaf) with `x` the
+        The sub-tree to operate on.
-    argument of the exponential, and `leaf` the corresponding leaf in the
+    exp_x
-    multiplication tree (of the form [n, exp(x)] -- see `parse_mul_tree`).
+        List of arguments x so that `exp(x)` exists somewhere in the whole
-    If None, this argument is initialized to an empty list.
+        multiplication tree. Each argument is a pair (x, leaf) with `x` the
+        argument of the exponential, and `leaf` the corresponding leaf in the
-    :param exp_minus_x: Similar to `exp_x`, but for `exp(-x)`.
+        multiplication tree (of the form [n, exp(x)] -- see `parse_mul_tree`).
+        If None, this argument is initialized to an empty list.
+    exp_minus_x
+        Similar to `exp_x`, but for `exp(-x)`.
+    sigm_x
+        Similar to `exp_x`, but for `sigmoid(x)`.
+    sigm_minus_x
+        Similar to `exp_x`, but for `sigmoid(-x)`.
+    parent
+        Parent of `tree` (None if `tree` is the global root).
+    child_idx
+        Index of `tree` in its parent's inputs (None if `tree` is the global
+        root).
+    full_tree
+        The global multiplication tree (should not be set except by recursive
+        calls to this function). Used for debugging only.
+    Returns
+    -------
+    bool
+        True if a modification was performed somewhere in the whole multiplication
+        tree, or False otherwise.
-    :param sigm_x: Similar to `exp_x`, but for `sigmoid(x)`.
-    :param sigm_minus_x: Similar to `exp_x`, but for `sigmoid(-x)`.
-    :param parent: Parent of `tree` (None if `tree` is the global root).
-    :param child_idx: Index of `tree` in its parent's inputs (None if `tree` is
-    the global root).
-    :param full_tree: The global multiplication tree (should not be set except
-    by recursive calls to this function). Used for debugging only.
-    :return: True if a modification was performed somewhere in the whole
-    multiplication tree, or False otherwise.
    """
    if exp_x is None:
        exp_x = []
    if exp_minus_x is None:
@@ -836,6 +911,7 @@ def local_sigm_times_exp(node):
    """
    exp(x) * sigm(-x) -> sigm(x)
    exp(-x) * sigm(x) -> sigm(-x)
    """
    # Bail early if it is not a multiplication.
    if node.op != tensor.mul:
@@ -859,6 +935,7 @@ def local_sigm_times_exp(node):
 def local_inv_1_plus_exp(node):
    """
    1/(1+exp(x)) -> sigm(-x)
    """
    # this optimization should be done for numerical stability
    # so we don't care to check client counts
@@ -883,6 +960,7 @@ def local_inv_1_plus_exp(node):
 def local_1msigmoid(node):
    """
    1-sigm(x) -> sigm(-x)
    """
    if node.op == tensor.sub:
        sub_l, sub_r = node.inputs

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
 """
-Tensor optimizations addressing the ops in basic.py
+Tensor optimizations addressing the ops in basic.py.
 """
 from __future__ import print_function
 # TODO: intelligent merge for mul/add
@@ -68,15 +68,20 @@ def copy_stack_trace(from_var, to_var):
    Copies the stack trace from one or more tensor variables to
    one or more tensor variables.
-    :param from_var: tensor variable or list of tensor variables to
+    Parameters
-                     copy stack traces from.
+    ----------
-    :param to_var: tensor variable or list of tensor variables to
+    from_var
-                     copy stack traces to.
+        Tensor variable or list of tensor variables to copy stack traces from.
+    to_var
+        Tensor variable or list of tensor variables to copy stack traces to.
-    .. note:: The stacktrace is assumed to be of the form of a list of lists
+    Notes
+    -----
+    The stacktrace is assumed to be of the form of a list of lists
    of tuples. Each tuple contains the filename, line number, function name
    and so on. Each list of tuples contains the truples belonging to a
    particular variable.
    """
    # Store stack traces from from_var
@@ -151,11 +156,20 @@ def _fill_chain(new_out, orig_inputs):
 def encompasses_broadcastable(b1, b2):
    """
-    Returns True if the broadcastable patterns b1 and b2 are such that b2 is
-    broadcasted to b1's shape and not the opposite.
-    :param b1: the broadcastable attribute of a tensor type
+    Parameters
-    :param b2: the broadcastable attribute of a tensor type
+    ----------
+    b1
+        The broadcastable attribute of a tensor type.
+    b2
+        The broadcastable attribute of a tensor type.
+    Returns
+    -------
+    bool
+        True if the broadcastable patterns b1 and b2 are such that b2 is
+        broadcasted to b1's shape and not the opposite.
    """
    if len(b1) < len(b2):
        return False
@@ -184,7 +198,8 @@ def scalarconsts_rest(inputs):
 def broadcast_like(value, template, fgraph, dtype=None):
-    """Return a Variable with the same shape and dtype as the template,
+    """
+    Return a Variable with the same shape and dtype as the template,
    filled by broadcasting value through it. `value` will be cast as
    necessary.
@@ -240,9 +255,11 @@ def inplace_elemwise_optimizer_op(OP):
        see if it can operate inplace on that input. If so, makes the
        change and go to the next output or Broadcast Op.
-        Examples:
+        Examples
-          x + y + z -> x += y += z
+        --------
-          (x + y) * (x * y) -> (x += y) *= (x * y) or (x + y) *= (x *= y)
+        x + y + z -> x += y += z
+        (x + y) * (x * y) -> (x += y) *= (x * y) or (x + y) *= (x *= y)
        """
        # We should not validate too often as this takes too much time to
        # execute!
@@ -507,6 +524,7 @@ def local_dimshuffle_lift(node):
    After this transform, clusters of Elemwise operations are
    void of DimShuffle operations.
    """
    op = node.op
    if not isinstance(op, DimShuffle):
@@ -556,6 +574,7 @@ def local_lift_transpose_through_dot(node):
    The transformation should be apply whether or not the transpose is
    inplace.  The newly-introduced transpositions are not inplace, this will
    be taken care of in a later optimization phase.
    """
    if not (isinstance(node.op, T.DimShuffle) and node.op.new_order == (1, 0)):
        return False
@@ -639,11 +658,12 @@ def local_scalar_tensor_scalar(node):
 class MakeVector(T.Op):
-    """Concatenate a number of scalars together into a vector
+    """Concatenate a number of scalars together into a vector.
    This is a simple version of stack() that introduces far less cruft
    into the graph. Should work with 0 inputs. The constant_folding
    optimization will remove it.
    """
    __props__ = ("dtype",)
@@ -755,7 +775,7 @@ T.pprint.assign(lambda pstate, r: r.owner and
 class ShapeFeature(object):
-    """Graph optimizer for removing all calls to shape()
+    """Graph optimizer for removing all calls to shape().
    This optimizer replaces all Shapes and Subtensors of Shapes with
    Shape_i and MakeVector Ops.
@@ -791,7 +811,6 @@ class ShapeFeature(object):
    For example the infer_shape for a matrix-matrix product would accept
    input_shapes=((x0,x1), (y0,y1)) and return ((x0, y1),).
    Inferring the shape of internal nodes in the graph is important
    for doing size-driven optimizations.  If we know how big various
    intermediate results will be, we can estimate the cost of many Ops
@@ -800,18 +819,18 @@ class ShapeFeature(object):
    In cases where you cannot figure out the shape, raise a ShapeError.
-    .. note::
+    Notes
+    -----
-        Right now there is only the ConvOp that could really take
+    Right now there is only the ConvOp that could really take
-        advantage of this shape inference, but it is worth it even
+    advantage of this shape inference, but it is worth it even
-        just for the ConvOp.  All that's necessary to do shape
+    just for the ConvOp.  All that's necessary to do shape
-        inference is 1) to mark shared inputs as having a particular
+    inference is 1) to mark shared inputs as having a particular
-        shape, either via a .tag or some similar hacking; and 2) to
+    shape, either via a .tag or some similar hacking; and 2) to
-        add an optional Param() argument to promise that inputs will
+    add an optional Param() argument to promise that inputs will
-        have a certain shape (or even to have certain shapes in
+    have a certain shape (or even to have certain shapes in
-        certain dimensions). We can't automatically infer the shape of
+    certain dimensions). We can't automatically infer the shape of
-        shared variables as they can change of shape during the
+    shared variables as they can change of shape during the
-        execution by default.  (NOT IMPLEMENTED YET, BUT IS IN TRAC)
+    execution by default.  (NOT IMPLEMENTED YET, BUT IS IN TRAC)
    Using Shape information in Optimizations
@@ -842,7 +861,7 @@ class ShapeFeature(object):
    """
    def shape_ir(self, i, r):
-        """Return symbolic r.shape[i] for tensor variable r, int i"""
+        """Return symbolic r.shape[i] for tensor variable r, int i."""
        if hasattr(r.type, "broadcastable") and r.type.broadcastable[i]:
            return self.lscalar_one
        else:
@@ -855,7 +874,7 @@ class ShapeFeature(object):
            return s
    def shape_tuple(self, r):
-        """Return a tuple of symbolic shape vars for tensor variable r"""
+        """Return a tuple of symbolic shape vars for tensor variable r."""
        if not hasattr(r, 'ndim'):
            # This happen for NoneConst.
            return None
@@ -867,6 +886,7 @@ class ShapeFeature(object):
        This function is used for Ops that don't implement infer_shape.
        Ops that do implement infer_shape should use the i_shapes parameter,
        but this default implementation ignores it.
        """
        rval = []
        for r in node.outputs:
@@ -880,6 +900,7 @@ class ShapeFeature(object):
        """Return a symbolic integer scalar for the shape element s_i.
        The s_i argument was produced by the infer_shape() of an Op subclass.
        """
        # unpack the s_i that the Op returned
        assert s_i is not None
@@ -933,8 +954,11 @@ class ShapeFeature(object):
    def set_shape(self, r, s):
        """Assign the shape `s` to previously un-shaped variable `r`.
-        :type r: a variable
+        Parameters
-        :type s: None or a tuple of symbolic integers
+        ----------
+        r : a variable
+        s : None or a tuple of symbolic integers
        """
        assert r not in self.shape_of, 'r already in shape_of'
        if s is None:
@@ -972,11 +996,12 @@ class ShapeFeature(object):
                self.shape_of_reverse_index.setdefault(sv, set()).add(r)
    def update_shape(self, r, other_r):
-        '''Replace shape of r by shape of other_r.
+        """Replace shape of r by shape of other_r.
        If, on some dimensions, the shape of other_r is not informative,
        keep the shape of r on those dimensions.
-        '''
+        """
        # other_r should already have a shape
        assert other_r in self.shape_of, ('other_r not in shape_of', other_r)
        other_shape = self.shape_of[other_r]
@@ -1303,8 +1328,7 @@ class ShapeFeature(object):
 class ShapeOptimizer(Optimizer):
-    """Optimizer that serves to add ShapeFeature as an fgraph feature.
+    """Optimizer that serves to add ShapeFeature as an fgraph feature."""
-    """
    def __init__(self):
        Optimizer.__init__(self)
@@ -1392,6 +1416,7 @@ def local_useless_alloc(node):
    If the input type is the same as the output type (dtype and broadcast)
    there is no change in the shape of the input. So this is just a simple copy
    of the input. This is not needed.
    """
    if node.op == T.alloc:
        if node.inputs[0].type == node.outputs[0].type:
@@ -1438,14 +1463,15 @@ def local_track_shape_i(node):
 @gof.local_optimizer([Subtensor, AdvancedSubtensor1])
 def local_subtensor_make_vector(node):
    """
-    replace all subtensor(make_vector) like:
+    Replace all subtensor(make_vector) like:
    [a,b,c][0] -> a
    [a,b,c][0:2] -> [a,b]
-    replace all AdvancedSubtensor1(make_vector) like:
+    Replace all AdvancedSubtensor1(make_vector) like:
    [a,b,c][[0,2]] -> [a,c]
-    we can do this for constant indexes
+    We can do this for constant indexes.
    """
    x = node.inputs[0]
    if not x.owner or x.owner.op != make_vector:
@@ -1514,7 +1540,6 @@ def local_subtensor_make_vector(node):
 @gof.local_optimizer([T.Elemwise])
 def local_useless_elemwise(node):
    """
    eq(x,x) -> 1
    neq(x,x) -> 0
    mul(x) -> x
@@ -1559,8 +1584,7 @@ def local_useless_elemwise(node):
 @register_specialize
 @gof.local_optimizer([T.Elemwise])
 def local_alloc_unary(node):
-    """unary(alloc(x, shp)) -> alloc(unary(x), shp)
+    """unary(alloc(x, shp)) -> alloc(unary(x), shp)"""
-    """
    if isinstance(node.op, T.Elemwise) and len(node.inputs) == 1:
        a = node.inputs[0]
        if a.owner and isinstance(a.owner.op, T.Alloc):
@@ -1587,6 +1611,7 @@ def local_cast_cast(node):
    dtype1 == dtype2
    TODO: the base dtype is the same (int, uint, float, complex)
          and the first cast cause an upcast.
    """
    if (not isinstance(node.op, T.Elemwise) or
            not isinstance(node.op.scalar_op, scalar.Cast)):
@@ -1607,9 +1632,9 @@ def local_cast_cast(node):
 def local_func_inv(node):
    """
    Check for two consecutive operations that are functional inverses
-    and remove them from the function graph
+    and remove them from the function graph.
-    """
+    """
    inv_pairs = (
        (basic.Deg2Rad, basic.Rad2Deg),
        (basic.Cosh, basic.ArcCosh),
@@ -1641,9 +1666,9 @@ def local_func_inv(node):
 def is_inverse_pair(node_op, prev_op, inv_pair):
    """
    Given two consecutive operations, check if they are the
-    provided pair of inverse functions
+    provided pair of inverse functions.
-    """
+    """
    node_is_op0 = isinstance(node_op, inv_pair[0])
    node_is_op1 = isinstance(node_op, inv_pair[1])
    prev_is_op0 = isinstance(prev_op, inv_pair[0])
@@ -1659,20 +1684,24 @@ class Assert(T.Op):
    Returns the first parameter if the condition is true, otherwise, triggers
    AssertionError.
-    Example:
+    Notes
-      T = theano.tensor
+    -----
-      x = T.vector('x')
-      assert_op = T.opt.Assert()
-      func = theano.function([x], assert_op(x, x.size<2))
-    Notes:
    This Op is a debugging feature. It can be removed from the graph
    because of optimizations, and can hide some possible optimizations to
    the optimizer. Specifically, removing happens if it can be determined
    that condition will always be true. Also, the output of the Op must be
    used in the function computing the graph, but it doesn't have to be
    returned.
+    Examples
+    --------
+    T = theano.tensor
+    x = T.vector('x')
+    assert_op = T.opt.Assert()
+    func = theano.function([x], assert_op(x, x.size<2))
    """
    __props__ = ('msg',)
    view_map = {0: [0]}
@@ -1770,7 +1799,9 @@ def local_remove_all_assert(node):
    """An optimization disabled by default that removes all asserts from
    the graph.
-    :note: See the :ref:`unsafe` section to know how to enable it.
+    Notes
+    -----
+    See the :ref:`unsafe` section to know how to enable it.
    """
    if not isinstance(node.op, Assert):
@@ -1804,11 +1835,12 @@ def local_elemwise_alloc_op(ElemwiseOP, AllocOP, DimShuffleOP):
        BROADCAST CONDITION: the condition is that the one input that are
        not to be optimized to have the same broadcast pattern as the
-        output
+        output.
+        We can replace the alloc by a dimshuffle as the elemwise
+        already has the shape info.  The dimshuffle will be faster
+        to execute.
-             We can change the alloc by a dimshuffle as the elemwise
-             already have the shape info.  The dimshuffle will be faster
-             to exec
        """
        if not isinstance(node.op, ElemwiseOP):
            return False
@@ -1969,6 +2001,7 @@ def local_upcast_elemwise_constant_inputs(node):
    those Ops do implicit upcasting anyway.
    Rationale: it helps merge things like (1-x) and (1.0 - x).
    """
    if len(node.outputs) > 1:
        return
@@ -2033,7 +2066,8 @@ def local_upcast_elemwise_constant_inputs(node):
 @register_specialize
 @gof.local_optimizer([IncSubtensor])
 def local_useless_inc_subtensor(node):
-    """Remove IncSubtensor, when we overwrite the full inputs with the
+    """
+    Remove IncSubtensor, when we overwrite the full inputs with the
    new value.
    """
@@ -2082,6 +2116,7 @@ def local_set_to_inc_subtensor(node):
    """
    AdvancedIncSubtensor1(x, x[ilist]+other, ilist, set_instead_of_inc=True) ->
    AdvancedIncSubtensor1(x, other, ilist, set_instead_of_inc=False)
    """
    if (isinstance(node.op, AdvancedIncSubtensor1) and
            node.op.set_instead_of_inc and
@@ -2144,6 +2179,7 @@ def local_useless_subtensor(node):
    AdvancedSubtensor1 case, the full input is taken when the indices are
    equivalent to `arange(0, input.shape[0], 1)` using either an explicit
    list/vector or the ARange op.
    """
    # This optimization needs ShapeOpt and fgraph.shape_feature
    if not hasattr(node.fgraph, 'shape_feature'):
@@ -2261,6 +2297,7 @@ def local_subtensor_lift(node):
    elemwise(x,...)[idx] -> elemwise(x[idx],...)
      when x,... are broadcasted scalar or not broadcasted at all
    rebroadcast(x)[idx] => rebroadcast(x[idx])
    """
    if isinstance(node.op, Subtensor):
        u = node.inputs[0]
@@ -2327,7 +2364,7 @@ def local_subtensor_lift(node):
 def merge_two_slices(slice1, len1, slice2, len2):
-    '''
+    """
     This function merges two slices into a single slice. The code works on
     the assumption that:
          a) slice1 is actually a slice and not an index, while slice2
@@ -2340,7 +2377,7 @@ def merge_two_slices(slice1, len1, slice2, len2):
    the two consecutive slices.
    ``len1`` is the length of the tensor **before** applying the first slice,
    while ``len2`` is the length **after** applying the first slice.
-    '''
+    """
    list_opt = [local_abs_merge, local_mul_switch_sink,
                local_upcast_elemwise_constant_inputs,
                local_remove_switch_const_cond, constant_folding]
@@ -2466,6 +2503,7 @@ def local_subtensor_merge(node):
    Refactored optimization to deal with all cases of tensor merging.
    Given a subgraph of the form Subtensor(Subtensor(u)), the optimization
    expresses all slices in a canonical form, and then merges them together.
    """
    if isinstance(node.op, Subtensor):
@@ -2601,7 +2639,8 @@ def local_subtensor_of_dot(node):
    idxs_a is the first A.ndim-1 entries of idxs,
    and idxs_b is the remaining entries of idxs (if any),
    modified to skip the second-to-last dimension of B
-    (because dot sums over this dimension)
+    (because dot sums over this dimension).
    """
    if not isinstance(node.op, Subtensor):
        return
@@ -2715,7 +2754,8 @@ compile.optdb.register('pre_local_IncSubtensor_serialize',
 @gof.local_optimizer([IncSubtensor], inplace=True)
 def local_inplace_setsubtensor(node):
    """
-    Also work for GpuIncSubtensor
+    Also works for GpuIncSubtensor.
    """
    if isinstance(node.op, IncSubtensor) and not node.op.inplace:
        new_op = node.op.__class__(
@@ -2734,7 +2774,10 @@ compile.optdb.register('local_inplace_setsubtensor',
 @gof.local_optimizer([AdvancedIncSubtensor1], inplace=True)
 def local_inplace_incsubtensor1(node):
-    """ also work for GpuAdvancedIncSubtensor1 """
+    """
+    Also works for GpuAdvancedIncSubtensor1.
+    """
    if isinstance(node.op, AdvancedIncSubtensor1) and not node.op.inplace:
        new_op = node.op.clone_inplace()
        new_node = new_op(*node.inputs)
@@ -2756,6 +2799,7 @@ compile.optdb.register('local_inplace_incsubtensor1',
 def local_incsubtensor_of_zeros(node):
    """
    IncSubtensor(x, zeros, idx) -> x
    """
    if (isinstance(node.op, (IncSubtensor,
                             AdvancedIncSubtensor,
@@ -2784,6 +2828,7 @@ def local_setsubtensor_of_constants(node):
    SetSubtensor(x, x[idx], idx) -> x
    when x is constant or alloc.
    """
    if isinstance(node.op, IncSubtensor) and node.op.set_instead_of_inc:
        x = node.inputs[0]
@@ -2813,14 +2858,16 @@ def local_setsubtensor_of_constants(node):
 @register_stabilize
 @gof.local_optimizer([AdvancedSubtensor1])
 def local_adv_sub1_adv_inc_sub1(node):
-    """Optimize the possible AdvSub1(AdvIncSub1(...), ...)
+    """Optimize the possible AdvSub1(AdvIncSub1(...), ...).
    AdvancedSubtensor1(AdvancedIncSubtensor1(0s, y, idx), idx) -> y
    AdvancedSubtensor1(AdvancedSetSubtensor1(x, y, idx), idx) -> y
-    :note: This opt add AssertOp. Otherwise, it would remove shape and
+    Notes
-        index error. If you want to get rid of them, see the
+    -----
-        :ref:`unsafe_optimization` section.
+    This opt adds an AssertOp. Otherwise, it would remove shape and
+    index errors. If you want to get rid of them, see the
+    :ref:`unsafe_optimization` section.
    """
    if not isinstance(node.op, AdvancedSubtensor1):
@@ -2862,6 +2909,7 @@ def local_useless_inc_subtensor_alloc(node):
    Replaces an [Advanced]IncSubtensor[1], whose increment is an `alloc` of
    a fully or partially broadcastable variable, by one that skips the
    intermediate `alloc` where possible.
    """
    if isinstance(node.op, (IncSubtensor,
                            AdvancedIncSubtensor,
@@ -2962,7 +3010,8 @@ def local_useless_inc_subtensor_alloc(node):
 @gof.local_optimizer([T.Rebroadcast])
 def local_useless_rebroadcast(node):
    """
-    Remove Rebroadcast if id does not actually change the broadcasting pattern
+    Remove Rebroadcast if it does not actually change the broadcasting pattern.
    """
    if isinstance(node.op, T.Rebroadcast):
        x = node.inputs[0]
@@ -2992,6 +3041,7 @@ def local_rebroadcast_lift(node):
    Rebroadcast(Elemwise(x)) => Elemwise(Rebroadcast(x))
    Rebroadcast(Rebroadcast(x)) => Rebroadcast(x)
    """
    op = node.op
    if not isinstance(op, T.Rebroadcast):
@@ -3023,8 +3073,14 @@ def apply_rebroadcast_opt(rval):
    Apply as many times as required the optimization local_useless_rebroadcast
    and local_rebroadcast_lift.
-    :param rval: a Variable
+    Parameters
-    :return: a Variable (the same if no optimization can be applied)
+    ----------
+    rval : a Variable
+    Returns
+    -------
+    A Variable (the same if no optimization can be applied)
    """
    changed = True
@@ -3056,6 +3112,7 @@ def local_join_1(node):
    """Join(i, x) => x
    Remove Join() when only one element is joined.
    """
    if not isinstance(node.op, T.Join):
        return
@@ -3070,7 +3127,8 @@ def local_join_1(node):
 def local_join_empty(node):
    """Join(i, x, y, empty) => Join(i, x, y)
-    remove empty inputs to joins. The empty inputs can be anywhere.
+    Remove empty inputs to joins. The empty inputs can be anywhere.
    """
    if not isinstance(node.op, T.Join):
        return
@@ -3147,6 +3205,7 @@ def local_remove_switch_const_cond(node):
        T.switch(cond,left,right) -->
               if cond is constant and cond == 0: right
               if cond is constant and cond != 0: left
    """
    if (isinstance(node.op, T.Elemwise) and
            isinstance(node.op.scalar_op, scalar.basic.Switch)):
@@ -3183,7 +3242,9 @@ def local_mul_switch_sink(node):
    This is useful because A and B may not be numerically stable and give
    NaN or inf values for cases where the switch returns 0.
    With this optimization T.grad(T.switch(...)) has the right behavior.
-    Exemple:
+    Examples
+    --------
      x -> f(x)
      x -> g(x)
      y = T.switch(cond,f(x),g(x))
@@ -3193,6 +3254,7 @@ def local_mul_switch_sink(node):
      T.grad(y,x) -> switch(cond,grad(f(x),x), 0) + switch(cond,0,grad(g(x),x))
    This will be particularly useful for the lazyif because we skip
    an entire part of the graph.
    """
    if node.op != T.mul:
        return False
@@ -3234,6 +3296,7 @@ def local_div_switch_sink(node):
    This is useful because A may not be numerically stable and give
    NaN or inf values for cases where the switch returns 0.
    See local_mul_switch_sink for more details.
    """
    if (node.op != T.true_div and node.op != T.int_div):
        return False
@@ -3308,6 +3371,7 @@ def local_useless_split(node):
    """ Split{n_splits=1}(x, y) -> x
    Remove Split with only 1 split.
    """
    if isinstance(node.op, T.Split):
        if node.op.len_splits == 1:
@@ -3329,6 +3393,7 @@ def local_flatten_lift(node):
    This optimization is needed by optimization
    nnet/sigm.py:log1msigm_to_softplus to get applied when there is a flatten.
    """
    if (isinstance(node.op, T.Flatten) and
            node.inputs[0].owner and
@@ -3347,6 +3412,7 @@ def local_flatten_lift(node):
 def local_reshape_chain(node):
    """
    Reshape(Reshape(shape1),shape2) -> Reshape(shape2)
    """
    if not opt.check_chain(node, T.Reshape, T.Reshape):
        return False
@@ -3378,6 +3444,7 @@ def local_reshape_lift(node):
    This optimization is needed by optimization
    nnet/sigm.py:log1msigm_to_softplus to get applied when there is a reshape.
    """
    if (isinstance(node.op, T.Reshape) and
            node.inputs[0].owner and
@@ -3526,26 +3593,32 @@ class Canonizer(gof.LocalOptimizer):
    Usage: Canonizer(main, inverse, reciprocal, calculate)
-    * main: a suitable Op class that is commutative, associative and
+    Parameters
-            takes one to an arbitrary number of inputs, e.g. add or
+    ----------
-            mul
+    main
-    * inverse: an Op class such that inverse(main(x, y), y) == x
+        A suitable Op class that is commutative, associative and
-               e.g. sub or true_div
+        takes one to an arbitrary number of inputs, e.g. add or
-    * reciprocal: a function such that main(x, reciprocal(y)) ==
+        mul
-                  inverse(x, y) e.g. neg or inv
+    inverse
+        An Op class such that inverse(main(x, y), y) == x
-    * calculate: function that takes a list of numpy.ndarray instances
+        e.g. sub or true_div
-                 for the numerator, another list for the denumerator,
+    reciprocal
-                 and calculates inverse(main(*num), main(*denum)). It
+        A function such that main(x, reciprocal(y)) == inverse(x, y)
-                 takes a keyword argument, aslist. If True, the value
+        e.g. neg or inv
-                 should be returned as a list of one element, unless
+    calculate
-                 the value is such that value = main(). In that case,
+        Function that takes a list of numpy.ndarray instances
-                 the return value should be an empty list.
+        for the numerator, another list for the denumerator,
+        and calculates inverse(main(*num), main(*denum)). It
+        takes a keyword argument, aslist. If True, the value
+        should be returned as a list of one element, unless
+        the value is such that value = main(). In that case,
+        the return value should be an empty list.
    The variable is a local_optimizer. It is best used with a TopoOptimizer in
    in_to_out order.
-    Examples:
+    Examples
+    --------
      T = theano.tensor
      add_canonizer = Canonizer(T.add, T.sub, T.neg,
                                lambda n, d: sum(n) - sum(d))
@@ -3563,6 +3636,7 @@ class Canonizer(gof.LocalOptimizer):
      2 * x / 2 -> x
      x * y * z -> Elemwise(T.mul){x,y,z} #only one pass over the memory.
                !-> Elemwise(T.mul){x,Elemwise(T.mul){y,z}}
    """
    def __init__(self, main, inverse, reciprocal, calculate,
@@ -3747,8 +3821,13 @@ class Canonizer(gof.LocalOptimizer):
    @staticmethod
    def get_constant(v):
        """
-        Returns a numeric constant if v is a Constant or, well, a
-        numeric constant. If v is a plain Variable, returns None.
+        Returns
+        -------
+        object
+            A numeric constant if v is a Constant or, well, a
+            numeric constant. If v is a plain Variable, returns None.
        """
        if isinstance(v, Variable):
            try:
@@ -3762,6 +3841,7 @@ class Canonizer(gof.LocalOptimizer):
        """
        Shorthand for:
        self.simplify_constants(*self.simplify_factors(num, denum))
        """
        rval = self.simplify_constants(*self.simplify_factors(num, denum),
                                       out_type=out_type)
@@ -3781,6 +3861,7 @@ class Canonizer(gof.LocalOptimizer):
        [x], [x] -> [], []
        [x, y], [x] -> [y], []
        [a, b], [c, d] -> [a, b], [c, d]
        """
        for v in list(num):
            if v in denum:
@@ -3790,18 +3871,22 @@ class Canonizer(gof.LocalOptimizer):
    def simplify_constants(self, orig_num, orig_denum, out_type=None):
        """
+        Find all constants and put them together into a single constant.
        Finds all constants in orig_num and orig_denum (using
        get_constant) and puts them together into a single
        constant. The constant is inserted as the first element of the
        numerator. If the constant is the neutral element, it is
-        removed from the numerator. Examples:
+        removed from the numerator.
+        Examples
+        --------
        Let main be multiplication:
        [2, 3, x], [] -> [6, x], []
        [x, y, 2], [4, z] -> [0.5, x, y], [z]
        [x, 2, y], [z, 2] -> [x, y], [z]
        """
        # Lists representing the numerator and denumerator
@@ -3969,13 +4054,15 @@ register_canonicalize(local_neg_to_mul)
 @register_specialize
 @gof.local_optimizer([T.Sum, T.elemwise.Prod])
 def local_sum_prod_mul_by_scalar(node):
-    """sum(scalar * smth) -> scalar * sum(smth)
+    """
-       sum(-smth) -> -sum(smth)
+    sum(scalar * smth) -> scalar * sum(smth)
+    sum(-smth) -> -sum(smth)
+    or
-       or
+    prod(scalar * smth) -> scalar ** size(smth) * prod(smth)
+    prod(-smth) -> -1 ** size(smth) * prod(smth)
-       prod(scalar * smth) -> scalar ** size(smth) * prod(smth)
-       prod(-smth) -> -1 ** size(smth) * prod(smth)
    """
    # TODO: if the the thing inside the Sum is a division,
    # we should get at the numerator....
@@ -4040,8 +4127,11 @@ def local_elemwise_sub_zeros(node):
 @register_specialize
 @gof.local_optimizer([T.Sum])
 def local_sum_div_dimshuffle(node):
-    '''sum(a / dimshuffle{...}(b), axis=l) -> sum(a, axis={...}) / b,
+    """
-    if dimension l of the DimShuffle is 'x'.'''
+    sum(a / dimshuffle{...}(b), axis=l) -> sum(a, axis={...}) / b,
+    if dimension l of the DimShuffle is 'x'.
+    """
    # TODO: extend it to product, and quotient of products
    # It does not make much sense now to extend it to the case where the
@@ -4128,8 +4218,10 @@ def local_sum_div_dimshuffle(node):
 @register_canonicalize
 @gof.local_optimizer([T.Sum, T.elemwise.Prod])
 def local_sum_prod_all_to_none(node):
-    """Sum{0,1,...N} -> Sum{} or
+    """
-       Prod{0,1,...N} -> Prod{}
+    Sum{0,1,...N} -> Sum{} or
+    Prod{0,1,...N} -> Prod{}
    """
    if isinstance(node.op, T.Sum) or isinstance(node.op, T.elemwise.Prod):
        opt_type = T.Sum if isinstance(node.op, T.Sum) else T.elemwise.Prod
@@ -4148,6 +4240,7 @@ def local_op_of_op(node):
    Prod(Prod()) -> single Prod()
    or
    Sum(Sum()) -> single Sum()
    """
    if isinstance(node.op, T.elemwise.Prod) or isinstance(node.op, T.Sum):
        opt_type = T.Sum if isinstance(node.op, T.Sum) else T.elemwise.Prod
@@ -4219,14 +4312,16 @@ ALL_REDUCE = [T.elemwise.CAReduce, T.elemwise.All, T.elemwise.Any,
 @register_uncanonicalize  # Needed for MaxAndArgmax -> CAReduce
 @gof.local_optimizer(ALL_REDUCE)
 def local_reduce_join(node):
-    """Reduce{scalar.op}(Join(axis=0, a, b), axis=0) -> Elemwise{scalar.op}(a, b)
+    """
+    Reduce{scalar.op}(Join(axis=0, a, b), axis=0) -> Elemwise{scalar.op}(a, b)
-    :note: supported scalar.op are Maximum, Mimimum in some cases and
+    Notes
-        Add and Mul in all cases.
+    -----
+    Supported scalar.op are Maximum, Minimum in some cases and Add and Mul in
+    all cases.
-    :note: Currently we must reduce on axis 0. It is probably
+    Currently we must reduce on axis 0. It is probably extensible to the case
-        extensible to the case where we join and reduce on the same
+    where we join and reduce on the same set of axes.
-        set of axis.
    """
    if (isinstance(node.op, T.CAReduce) and
@@ -4312,7 +4407,7 @@ def local_cut_useless_reduce(node):
 @register_specialize
 @gof.local_optimizer(ALL_REDUCE)
 def local_reduce_broadcastable(node):
-    """Remove reduction over broadcastable dimensions"""
+    """Remove reduction over broadcastable dimensions."""
    if isinstance(node.op, T.CAReduce):
        reduced, = node.inputs
        odtype = node.outputs[0].dtype
@@ -4351,9 +4446,11 @@ def local_reduce_broadcastable(node):
 @register_specialize
 @gof.local_optimizer([T.Sum, T.elemwise.Prod])
 def local_opt_alloc(node):
-    """ sum(alloc(constant,shapes...)) => constant*prod(shapes)
+    """
-        or
+    sum(alloc(constant,shapes...)) => constant*prod(shapes)
-        prod(alloc(constant,shapes...)) => constant**prod(shapes)
+    or
+    prod(alloc(constant,shapes...)) => constant**prod(shapes)
    """
    if isinstance(node.op, T.Sum) or isinstance(node.op, T.elemwise.Prod):
        node_inps, = node.inputs
@@ -4406,9 +4503,11 @@ def local_neg_neg(node):
 @register_specialize
 @gof.local_optimizer([T.neg])
 def local_neg_div_neg(node):
-    """- (-a / b) -> a / b
+    """
+    - (-a / b) -> a / b
    Also performs - (c / b) -> ((-c) / b) when c is a scalar constant.
    """
    if node.op == T.neg:
        if node.inputs[0].owner and node.inputs[0].owner.op == T.true_div:
@@ -4427,8 +4526,10 @@ def local_neg_div_neg(node):
 @gof.local_optimizer([T.mul])
 def local_mul_zero(node):
-    """As part of canonicalization, we replace multiplication by zero
+    """
+    As part of canonicalization, we replace multiplication by zero
    with zero.
    """
    if node.op == T.mul:
        otype = node.outputs[0].type
@@ -4489,10 +4590,12 @@ register_canonicalize(local_pow_canonicalize)
 @register_specialize
 @gof.local_optimizer([T.mul])
 def local_mul_to_sqr(node):
-    """x*x -> sqr(x)
+    """
+    x*x -> sqr(x)
    This is faster on the GPU when memory fetching is a big part of
    the computation time.
    """
    if node.op == T.mul:
        if len(node.inputs) == 2:
@@ -4620,7 +4723,8 @@ def local_pow_specialize_device(node):
 @gof.local_optimizer([T.mul])
 def local_mul_specialize(node):
-    """Remove special-case constants from mul arguments and useless neg in inputs.
+    """
+    Remove special-case constants from mul arguments and useless neg in inputs.
    mul(-1, x) -> neg(x)
    mul(1, x, y) -> mul(x, y)
@@ -4629,6 +4733,7 @@ def local_mul_specialize(node):
    This is not done if we would add more nodes in the graph, like with:
    mul(-1, x, y) -/-> neg(mul(x, y))
    """
    # here, we are past the point of canonicalization, so we don't
    # want to put in un-necessary fills.
@@ -4766,8 +4871,9 @@ local_mul_canonizer.add_simplifier(check_for_x_over_absX, 'X_over_absX')
 @gof.local_optimizer([T.abs_])
 def local_abs_lift(node):
    """
-    move the abs toward the input. This is needed for
+    Move the abs toward the input.
-    check_for_x_over_absX to apply in more case.
+    This is needed for check_for_x_over_absX to apply in more cases.
    """
    if node.op == T.abs_ and node.inputs[0].owner:
@@ -4783,7 +4889,7 @@ def local_abs_lift(node):
 @gof.local_optimizer([T.mul, T.true_div])
 def local_abs_merge(node):
    """
-    merge abs generated by local_abs_lift when the canonizer don't
+    Merge abs generated by local_abs_lift when the canonizer doesn't
    need it anymore
    """
@@ -4968,6 +5074,8 @@ def attempt_distribution(factor, num, denum, out_type):
 @gof.local_optimizer([T.mul, T.true_div, T.inv])
 def local_greedy_distributor(node):
    """
+    Optimize by reducing the number of multiplications and/or divisions.
    This optimization tries to apply distributivity of multiplication
    to addition in order to reduce the number of multiplications
    and/or divisions that must be done. The algorithm weighs division
@@ -4985,6 +5093,7 @@ def local_greedy_distributor(node):
    This optimization aims to reduce computational cost. It may also
    increase numerical stability, e.g. when x and/or y tend to 0 in
    example 1.
    """
    out = node.outputs[0]
@@ -5083,7 +5192,13 @@ def constant_folding(node):
 def _is_1(expr):
-    """rtype bool. True iff expr is a constant close to 1
+    """
+    Returns
+    -------
+    bool
+        True iff expr is a constant close to 1.
    """
    try:
        v = get_scalar_constant_value(expr)
@@ -5093,7 +5208,13 @@ def _is_1(expr):
 def _is_minus1(expr):
-    """rtype bool. True iff expr is a constant close to -1
+    """
+    Returns
+    -------
+    bool
+        True iff expr is a constant close to -1.
    """
    try:
        v = get_scalar_constant_value(expr)
@@ -5103,13 +5224,19 @@ def _is_minus1(expr):
 def get_clients(node):
-    "Used by erf/erfc opt to track less frequent op"
+    """
+    Used by erf/erfc opt to track less frequent op.
+    """
    return [c for c, i in node.outputs[0].clients
            if c != "output"]
 def get_clients2(node):
-    "Used by erf/erfc opt to track less frequent op"
+    """
+    Used by erf/erfc opt to track less frequent op.
+    """
    l = []
    for c, i in node.outputs[0].clients:
        if c != "output":
@@ -5622,18 +5749,22 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 32,
    """
    We parametrize it to make it work for Elemwise and GpuElemwise op.
-    :param OP: GpuElemwise or Elemwise class (the one that we want to fuse)
+    Parameters
+    ----------
-    :param max_input_fct: a function that returns the maximum number of inputs
+    OP
-                          that this elemwise can take (useful for GpuElemwise).
+        GpuElemwise or Elemwise class (the one that we want to fuse)
-                          GPU kernel currently has a limit of 256 bytes for
+    max_input_fct
-                          the size of all parameters passed to it. As currently
+        A function that returns the maximum number of inputs
-                          we pass many information only by parameter, we must
+        that this elemwise can take (useful for GpuElemwise).
-                          limit how many ops we fuse together to avoid busting
+        GPU kernel currently has a limit of 256 bytes for
-                          that 256 limit.
+        the size of all parameters passed to it. As currently
+        we pass much information only by parameter, we must
+        limit how many ops we fuse together to avoid busting
+        that 256 limit.
+        On the CPU we limit to 32 input variables
+        since that is the maximum numpy supports.
-                          On the CPU we limit to 32 input variables
-                          since that is the maximum numpy support.
    """
    if maker is None:
        def maker(node, scalar_op):
@@ -5647,6 +5778,7 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 32,
        For mixed dtype, we let the Composite op do the cast. It lets the C
        compiler do the cast.
        The number of dimensions is validated at call time by theano itself.
        """
        # META TODO:  PUT THESE THINGS IN TRAC, NOT TODO NOTES!!
        # TODO: use broadcast flag?
@@ -5862,7 +5994,7 @@ local_elemwise_fusion = local_elemwise_fusion_op(T.Elemwise,
 class FusionOptimizer(Optimizer):
-    """Graph optimizer for Fusion of elemwise operations"""
+    """Graph optimizer for Fusion of elemwise operations."""
    def __init__(self, local_optimizer):
        Optimizer.__init__(self)
        self.optimizer = local_optimizer

--- a/theano/tensor/opt_uncanonicalize.py
+++ b/theano/tensor/opt_uncanonicalize.py
@@ -28,8 +28,8 @@ problem.
 Also, we should make the fgraph refuse optimization that break the
 canonization of the graph in the optimizations phases where the graph is
 supposed to be canonical.
-"""
+"""
 # TODO: intelligent merge for mul/add
 # TODO: 0*x -> 0
 import logging
@@ -72,12 +72,15 @@ def local_max_and_argmax(node):
 @gof.local_optimizer([T.neg])
 def local_max_to_min(node):
    """
-    change -(max(-x)) to min
+    Change -(max(-x)) to min.
+    This is tested in tensor/tests/test_basic.py:test_min_max.
-    This is tested in tensor/tests/test_basic.py:test_min_max
+    Notes
+    -----
+    We don't need an opt that will do the reverse as by default
+    the interface puts only MaxAndArgmax into the graph.
-    :note: we don't need an opt that will do the reverse as by default
-           the interface put only MaxAndArgmax into the graph.
    """
    if node.op == T.neg and node.inputs[0].owner:
        max = node.inputs[0]

--- a/theano/tensor/raw_random.py
+++ b/theano/tensor/raw_random.py
@@ -19,7 +19,8 @@ __docformat__ = "restructuredtext en"
 class RandomStateType(gof.Type):
-    """A Type wrapper for numpy.random.RandomState
+    """
+    A Type wrapper for numpy.random.RandomState.
    The reason this exists (and `Generic` doesn't suffice) is that
    RandomState objects that would appear to be equal do not compare
@@ -99,35 +100,36 @@ random_state_type = RandomStateType()
 class RandomFunction(gof.Op):
-    """Op that draws random numbers from a numpy.random.RandomState object
    """
-    __props__ = ("fn", "outtype", "inplace", "ndim_added")
+    Op that draws random numbers from a numpy.random.RandomState object.
-    def __init__(self, fn, outtype, inplace=False, ndim_added=0):
+    Parameters
-        """
+    ----------
-        :param fn: a member function of numpy.random.RandomState
+    fn : string or function reference
+        A member function of numpy.random.RandomState. A string will
+        be interpreted as the name of a member function of
+        numpy.random.RandomState.
        Technically, any function with a signature like the ones in
-        numpy.random.RandomState will do.  This function must accept
+        numpy.random.RandomState will do. This function must accept
        the shape (sometimes called size) of the output as the last
        positional argument.
+    outtype
+        The theano Type of the output.
+    args
+        A list of default arguments for the function
+    kwargs
+        If the 'inplace' key is there, its value will be used to
+        determine if the op operates inplace or not.
+        If the 'ndim_added' key is there, its value indicates how
+        many more dimensions this op will add to the output, in
+        addition to the shape's dimensions (used in multinomial and
+        permutation).
-        :type fn: string or function reference.  A string will
+    """
-        be interpreted as the name of a member function of
-        numpy.random.RandomState.
-        :param outtype: the theano Type of the output
-        :param args: a list of default arguments for the function
+    __props__ = ("fn", "outtype", "inplace", "ndim_added")
-        :param kwargs:
+    def __init__(self, fn, outtype, inplace=False, ndim_added=0):
-            If the 'inplace' key is there, its value will be used to
-            determine if the op operates inplace or not.
-            If the 'ndim_added' key is there, its value indicates how
-            many more dimensions this op will add to the output, in
-            addition to the shape's dimensions (used in multinomial and
-            permutation).
-        """
        self.__setstate__([fn, outtype, inplace, ndim_added])
    def __getstate__(self):
@@ -151,30 +153,33 @@ class RandomFunction(gof.Op):
    def make_node(self, r, shape, *args):
        """
-        :param r: a numpy.random.RandomState instance, or a Variable of Type
+        Parameters
-        RandomStateType that will contain a RandomState instance.
+        ----------
+        r
-        :param shape: an lvector with a shape defining how many samples
+            A numpy.random.RandomState instance, or a Variable of Type
-        to draw.  In the case of scalar distributions, it is the shape
+            RandomStateType that will contain a RandomState instance.
-        of the tensor output by this Op.  In that case, at runtime, the
+        shape
-        value associated with this lvector must have a length equal to
+            An lvector with a shape defining how many samples
-        the number of dimensions promised by `self.outtype`.
+            to draw.  In the case of scalar distributions, it is the shape
-        In a more general case, the number of output dimensions,
+            of the tensor output by this Op.  In that case, at runtime, the
-        len(self.outtype), is equal to len(shape)+self.ndim_added.
+            value associated with this lvector must have a length equal to
-        The special case where len(shape) == 0 means that the smallest
+            the number of dimensions promised by `self.outtype`.
-        shape compatible with the argument's shape will be used.
+            In a more general case, the number of output dimensions,
+            len(self.outtype), is equal to len(shape)+self.ndim_added.
-        :param args: the values associated with these variables will
+            The special case where len(shape) == 0 means that the smallest
-        be passed to the RandomState function during perform as extra
+            shape compatible with the argument's shape will be used.
-        "*args"-style arguments.  These should be castable to variables
+        args
-        of Type TensorType.
+            The values associated with these variables will be passed to the
+            RandomState function during perform as extra "*args"-style
-        :rtype: Apply
+            arguments. These should be castable to variables of Type TensorType.
-        :return: Apply with two outputs.  The first output is a
+        Returns
-        gof.generic Variable from which to draw further random numbers.
+        -------
-        The second output is the outtype() instance holding the random
+        Apply
-        draw.
+            Apply with two outputs. The first output is a gof.generic Variable
+            from which to draw further random numbers.
+            The second output is the outtype() instance holding the random
+            draw.
        """
        shape_ = tensor.as_tensor_variable(shape, ndim=1)
@@ -289,12 +294,15 @@ def _infer_ndim_bcast(ndim, shape, *args):
    """
    Infer the number of dimensions from the shape or the other arguments.
-    :rtype: (int, variable, tuple) triple, where the variable is an integer
+    Returns
-    vector, and the tuple contains Booleans.
+    -------
-    :returns: the first element returned is the inferred number of dimensions.
+    (int, variable, tuple) triple, where the variable is an integer vector,
-    The second element is the shape inferred (combining symbolic and constant
+    and the tuple contains Booleans
-    informations from shape and args).
+        The first element returned is the inferred number of dimensions.
-    The third element is a broadcasting pattern corresponding to that shape.
+        The second element is the shape inferred (combining symbolic and
+        constant information from shape and args).
+        The third element is a broadcasting pattern corresponding to that shape.
    """
    # Find the minimum value of ndim required by the *args
@@ -390,7 +398,7 @@ def _infer_ndim_bcast(ndim, shape, *args):
 def _generate_broadcasting_indices(out_shape, *shapes):
-    '''
+    """
    Return indices over each shape that broadcast them to match out_shape.
    The first returned list is equivalent to numpy.ndindex(out_shape),
@@ -400,7 +408,8 @@ def _generate_broadcasting_indices(out_shape, *shapes):
    The shapes should have the same length as out_shape. If they are longer,
    the right-most dimensions are ignored.
-    '''
+    """
    all_shapes = (out_shape,) + shapes
    # Will contain the return value: a list of indices for each argument
    ret_indices = [[()] for shape in all_shapes]
@@ -447,6 +456,7 @@ def uniform(random_state, size=None, low=0.0, high=1.0, ndim=None, dtype=None):
    If dtype is not specified, it will be inferred from the dtype of
    low and high, but will be at least as precise as floatX.
    """
    low = tensor.as_tensor_variable(low)
    high = tensor.as_tensor_variable(high)
@@ -471,6 +481,7 @@ def normal(random_state, size=None, avg=0.0, std=1.0, ndim=None, dtype=None):
    If dtype is not specified, it will be inferred from the dtype of
    avg and std, but will be at least as precise as floatX.
    """
    avg = tensor.as_tensor_variable(avg)
    std = tensor.as_tensor_variable(std)
@@ -493,6 +504,7 @@ def binomial(random_state, size=None, n=1, p=0.5, ndim=None,
    If size is None, the output shape will be determined by the shapes
    of n and prob.
    """
    if prob is not None:
        p = prob
@@ -514,12 +526,13 @@ def binomial(random_state, size=None, n=1, p=0.5, ndim=None,
 def random_integers_helper(random_state, low, high, size):
-    '''
+    """
    Helper function to draw random integers.
    This is a generalization of numpy.random.random_integers to the case where
    low and high are tensors.
-    '''
+    """
    # Figure out the output shape
    if size is not None:
        out_ndim = len(size)
@@ -570,6 +583,7 @@ def random_integers(random_state, size=None, low=0, high=1, ndim=None,
    If size is None, the output shape will be determined by the shapes
    of low and high.
    """
    low = tensor.as_tensor_variable(low)
    high = tensor.as_tensor_variable(high)
@@ -580,11 +594,13 @@ def random_integers(random_state, size=None, low=0, high=1, ndim=None,
 def choice_helper(random_state, a, replace, p, size):
-    """Helper function to draw random numbers using numpy's choice function.
+    """
+    Helper function to draw random numbers using numpy's choice function.
    This is a generalization of numpy.random.choice that coerces
    `replace` to a bool and replaces `p` with None when p is a vector
    of 0 elements.
    """
    if a.ndim > 1:
        raise ValueError('a.ndim (%i) must be 0 or 1' % a.ndim)
@@ -608,6 +624,7 @@ def choice(random_state, size=None, a=2, replace=True, p=None, ndim=None,
    may be a plain integer to supplement the missing information.
    If size is None, a scalar will be returned.
    """
    # numpy.random.choice is only available for numpy versions >= 1.7
    major, minor, _ = numpy.version.short_version.split('.')
@@ -631,17 +648,21 @@ def poisson(random_state, size=None, lam=1.0, ndim=None, dtype='int64'):
    """
    Draw samples from a Poisson distribution.
-    The Poisson distribution is the limit of the Binomial distribution for large N.
+    The Poisson distribution is the limit of the Binomial distribution for
+    large N.
-    :param lam: float or ndarray-like of the same shape as size parameter
+    Parameters
+    ----------
+    lam : float or ndarray-like of the same shape as size parameter
        Expectation of interval, should be >= 0.
+    size : int or tuple of ints, optional
+        Output shape. If the given shape is, e.g., (m, n, k), then m * n * k
+        samples are drawn.
+    dtype
+        The dtype of the return value (which will represent counts).
-    :param size: int or tuple of ints, optional
+    size or ndim must be given.
-        Output shape. If the given shape is, e.g., (m, n, k), then m * n * k samples are drawn.
-    :param dtype: the dtype of the return value (which will represent counts)
-    size or ndim must be given
    """
    lam = tensor.as_tensor_variable(lam)
@@ -653,7 +674,8 @@ def poisson(random_state, size=None, lam=1.0, ndim=None, dtype='int64'):
 def permutation_helper(random_state, n, shape):
-    """Helper function to generate permutations from integers.
+    """
+    Helper function to generate permutations from integers.
    permutation_helper(random_state, n, (1,)) will generate a permutation of
    integers 0..n-1.
@@ -666,6 +688,7 @@ def permutation_helper(random_state, n, shape):
    This is a generalization of numpy.random.permutation to tensors.
    Otherwise it behaves the same.
    """
    # n should be a 0-dimension array
    assert n.shape == ()
@@ -688,17 +711,20 @@ def permutation_helper(random_state, n, shape):
 def permutation(random_state, size=None, n=1, ndim=None, dtype='int64'):
    """
-    Returns permutations of the integers between 0 and n-1, as many times
+    Return permutations of the integers between 0 and n-1.
-    as required by size. For instance, if size=(p,q), p*q permutations
-    will be generated, and the output shape will be (p,q,n), because each
+    Returns them as many times as required by size. For instance, if size=(p,q),
-    permutation is of size n.
+    p*q permutations will be generated, and the output shape will be (p,q,n),
+    because each permutation is of size n.
    Theano tries to infer the number of dimensions from the length of
    the size argument and the shape of n, but you may always specify it
    with the `ndim` parameter.
-    :note:
+    Notes
-        Note that the output will then be of dimension ndim+1.
+    -----
+    Note that the output will then be of dimension ndim+1.
    """
    if size is None or size == ():
        if not(ndim is None or ndim == 1):
@@ -718,12 +744,13 @@ def permutation(random_state, size=None, n=1, ndim=None, dtype='int64'):
 def multinomial_helper(random_state, n, pvals, size):
-    '''
+    """
    Helper function drawing from multinomial distributions.
    This is a generalization of numpy.random.multinomial to the case where
    n and pvals are tensors.
-    '''
+    """
    # Figure out the shape if it's None
    # Note: the output ndim will be ndim+1, because the multinomial
    # adds a dimension. The length of that dimension is pvals.shape[-1].
@@ -791,31 +818,40 @@ def multinomial_helper(random_state, n, pvals, size):
 def multinomial(random_state, size=None, n=1, pvals=[0.5, 0.5],
                ndim=None, dtype='int64'):
-    """Sample from one or more multinomial distributions defined by
+    """
+    Sample from one or more multinomial distributions defined by
    one-dimensional slices in pvals.
-    :param pvals: a tensor of shape "nmulti+(L,)" describing each multinomial
+    Parameters
+    ----------
+    pvals
+        A tensor of shape "nmulti+(L,)" describing each multinomial
        distribution.  This tensor must have the property that
        numpy.allclose(pvals.sum(axis=-1), 1) is true.
+    size
-    :param size: a vector of shape information for the output; this can also
+        A vector of shape information for the output; this can also
        specify the "nmulti" part of pvals' shape.  A -1 in the k'th position
        from the right means to borrow the k'th position from the
        right in nmulti. (See examples below.)
        Default ``None`` means size=nmulti.
+    n
-    :param n: the number of experiments to simulate for each
+        The number of experiments to simulate for each
        multinomial. This can be a scalar, or tensor, it will be
        broadcasted to have shape "nmulti".
+    dtype
-    :param dtype: the dtype of the return value (which will represent counts)
+        The dtype of the return value (which will represent counts)
-    :returns: tensor of len(size)+1 dimensions, and shape[-1]==L, with
+    Returns
-        the specified ``dtype``, with the experiment counts.  See
+    -------
+    tensor
+        Tensor of len(size)+1 dimensions, and shape[-1]==L, with
+        the specified ``dtype``, with the experiment counts. See
        examples to understand the shape of the return value, which is
-        derived from both size and pvals.shape.  In return value rval,
+        derived from both size and pvals.shape. In return value rval,
        "numpy.allclose(rval.sum(axis=-1), n)" will be true.
+    Examples
+    --------
    For example, to simulate n experiments from each multinomial in a batch of
    size B:
@@ -881,8 +917,8 @@ class RandomStreamsBase(object):
        return the number of successes.
        If the size argument is ambiguous on the number of dimensions,
-        ndim may be a plain integer to supplement the missing
+        ndim may be a plain integer to supplement the missing information.
-        information.
        """
        if prob is not None:
            p = prob
@@ -895,8 +931,8 @@ class RandomStreamsBase(object):
        distribution between low and high.
        If the size argument is ambiguous on the number of dimensions,
-        ndim may be a plain integer to supplement the missing
+        ndim may be a plain integer to supplement the missing information.
-        information.
        """
        return self.gen(uniform, size, low, high, ndim=ndim, dtype=dtype)
@@ -906,8 +942,8 @@ class RandomStreamsBase(object):
        the specified standard deviation (std).
        If the size argument is ambiguous on the number of dimensions,
-        ndim may be a plain integer to supplement the missing
+        ndim may be a plain integer to supplement the missing information.
-        information.
        """
        return self.gen(normal, size, avg, std, ndim=ndim, dtype=dtype)
@@ -917,8 +953,8 @@ class RandomStreamsBase(object):
        Sample a random integer between low and high, both inclusive.
        If the size argument is ambiguous on the number of dimensions,
-        ndim may be a plain integer to supplement the missing
+        ndim may be a plain integer to supplement the missing information.
-        information.
        """
        return self.gen(random_integers, size, low, high, ndim=ndim,
                        dtype=dtype)
@@ -926,13 +962,14 @@ class RandomStreamsBase(object):
    def choice(self, size=None, a=2, replace=True, p=None, ndim=None,
               dtype='int64'):
        """
-        Choose values from `a` with or without replacement. `a` can be a 1-D
+        Choose values from `a` with or without replacement.
-        array or a positive scalar. If `a` is a scalar, the samples are drawn
-        from the range 0,...,a-1.
+        `a` can be a 1-D array or a positive scalar.
+        If `a` is a scalar, the samples are drawn from the range 0,...,a-1.
        If the size argument is ambiguous on the number of dimensions,
-        ndim may be a plain integer to supplement the missing
+        ndim may be a plain integer to supplement the missing information.
-        information.
        """
        return self.gen(choice, size, a, replace, p, ndim=ndim, dtype=dtype)
@@ -940,27 +977,32 @@ class RandomStreamsBase(object):
        """
        Draw samples from a Poisson distribution.
-        The Poisson distribution is the limit of the Binomial distribution for large N.
+        The Poisson distribution is the limit of the Binomial distribution for
+        large N.
        If the size argument is ambiguous on the number of dimensions,
-        ndim may be a plain integer to supplement the missing
+        ndim may be a plain integer to supplement the missing information.
-        information.
        """
        return self.gen(poisson, size, lam, ndim=ndim, dtype=dtype)
    def permutation(self, size=None, n=1, ndim=None, dtype='int64'):
        """
-        Returns permutations of the integers between 0 and n-1, as many times
+        Return permutations of the integers between 0 and n-1.
-        as required by size. For instance, if size=(p,q), p*q permutations
-        will be generated, and the output shape will be (p,q,n), because each
+        Returns them as many times as required by size. For instance,
+        if size=(p,q), p*q permutations will be generated,
+        and the output shape will be (p,q,n), because each
        permutation is of size n.
        Theano tries to infer the number of dimensions from the length
        of the size argument and the shape of n, but you may always
        specify it with the `ndim` parameter.
-        .. note::
+        Notes
-            Note that the output will then be of dimension ndim+1.
+        -----
+        Note that the output will then be of dimension ndim+1.
        """
        return self.gen(permutation, size, n, ndim=ndim, dtype=dtype)
@@ -976,16 +1018,20 @@ class RandomStreamsBase(object):
        of the size argument and the shapes of n and pvals, but you may
        always specify it with the `ndim` parameter.
-        .. note::
+        Notes
-            Note that the output will then be of dimension ndim+1.
+        -----
+        Note that the output will then be of dimension ndim+1.
        """
        return self.gen(multinomial, size, n, pvals, ndim=ndim, dtype=dtype)
    def shuffle_row_elements(self, input):
-        """Return a variable with every row (rightmost index) shuffled.
+        """
+        Return a variable with every row (rightmost index) shuffled.
        This uses permutation random variable internally, available via
        the ``.permutation`` attribute of the return value.
        """
        perm = self.permutation(size=input.shape[:-1], n=input.shape[-1],
                                ndim=input.ndim - 1)

--- a/theano/tensor/shared_randomstreams.py
+++ b/theano/tensor/shared_randomstreams.py
-"""Define RandomStreams, providing random number variables for Theano
-graphs.
 """
+Define RandomStreams, providing random number variables for Theano
+graphs.
+"""
 import copy
 import numpy
@@ -20,7 +21,10 @@ class RandomStateSharedVariable(SharedVariable):
 @shared_constructor
 def randomstate_constructor(value, name=None, strict=False,
                            allow_downcast=None, borrow=False):
-    """SharedVariable Constructor for RandomState"""
+    """
+    SharedVariable Constructor for RandomState.
+    """
    if not isinstance(value, numpy.random.RandomState):
        raise TypeError
    if not borrow:
@@ -37,20 +41,20 @@ class RandomStreams(raw_random.RandomStreamsBase):
    """
    Module component with similar interface to numpy.random
    (numpy.random.RandomState)
+    Parameters
+    ----------
+    seed : None or int
+        A default seed to initialize the RandomState
+        instances after build.  See `RandomStreamsInstance.__init__`
+        for more details.
    """
    def updates(self):
        return list(self.state_updates)
    def __init__(self, seed=None):
-        """
-        :type seed: None or int
-        :param seed: a default seed to initialize the RandomState
-        instances after build.  See `RandomStreamsInstance.__init__`
-        for more details.
-        """
        super(RandomStreams, self).__init__()
        # A list of pairs of the form (input_r, output_r).  This will be
        # over-ridden by the module instance to contain stream generators.
@@ -62,14 +66,18 @@ class RandomStreams(raw_random.RandomStreamsBase):
        self.gen_seedgen = numpy.random.RandomState(seed)
    def seed(self, seed=None):
-        """Re-initialize each random stream
+        """
+        Re-initialize each random stream.
-        :param seed: each random stream will be assigned a unique
-        state that depends deterministically on this value.
-        :type seed: None or integer in range 0 to 2**30
+        Parameters
+        ----------
+        seed : None or integer in range 0 to 2**30
+            Each random stream will be assigned a unique state that depends
+            deterministically on this value.
-        :rtype: None
+        Returns
+        -------
+        None
        """
        if seed is None:
@@ -82,54 +90,72 @@ class RandomStreams(raw_random.RandomStreamsBase):
                            borrow=True)
    def __getitem__(self, item):
-        """Retrieve the numpy RandomState instance associated with a
+        """
-        particular stream
+        Retrieve the numpy RandomState instance associated with a particular
+        stream.
-        :param item: a variable of type RandomStateType, associated
+        Parameters
-        with this RandomStream
+        ----------
+        item
+            A variable of type RandomStateType, associated
+            with this RandomStream.
-        :rtype: numpy RandomState (or None, before initialize)
+        Returns
+        -------
+        numpy RandomState (or None, before initialize)
-        :note: This is kept for compatibility with
+        Notes
-        `tensor.randomstreams.RandomStreams`.  The simpler syntax
+        -----
-        ``item.rng.get_value()`` is also valid.
+        This is kept for compatibility with `tensor.randomstreams.RandomStreams`.
+        The simpler syntax ``item.rng.get_value()`` is also valid.
        """
        return item.get_value(borrow=True)
    def __setitem__(self, item, val):
-        """Set the numpy RandomState instance associated with a
+        """
-        particular stream
+        Set the numpy RandomState instance associated with a particular stream.
-        :param item: a variable of type RandomStateType, associated
+        Parameters
-        with this RandomStream
+        ----------
+        item
+            A variable of type RandomStateType, associated with this
+            RandomStream.
-        :param val: the new value
+        val : numpy RandomState
-        :type val: numpy RandomState
+            The new value.
-        :rtype:  None
+        Returns
+        -------
+        None
-        :note: This is kept for compatibility with
+        Notes
-        `tensor.randomstreams.RandomStreams`.  The simpler syntax
+        -----
-        ``item.rng.set_value(val)`` is also valid.
+        This is kept for compatibility with `tensor.randomstreams.RandomStreams`.
+        The simpler syntax ``item.rng.set_value(val)`` is also valid.
        """
        item.set_value(val, borrow=True)
    def gen(self, op, *args, **kwargs):
-        """Create a new random stream in this container.
+        """
+        Create a new random stream in this container.
-        :param op: a RandomFunction instance to
+        Parameters
-        :param args: interpreted by `op`
+        ----------
+        op
-        :param kwargs: interpreted by `op`
+            A RandomFunction instance.
+        args
-        :returns: The symbolic random draw part of op()'s return
+            Interpreted by `op`.
-        value.  This function stores the updated RandomStateType
+        kwargs
-        Variable for use at `build` time.
+            Interpreted by `op`.
-        :rtype: TensorVariable
+        Returns
+        -------
+        TensorVariable
+            The symbolic random draw part of op()'s return value.
+            This function stores the updated RandomStateType Variable
+            for use at `build` time.
        """
        seed = int(self.gen_seedgen.randint(2 ** 30))

--- a/theano/tensor/sharedvar.py
+++ b/theano/tensor/sharedvar.py
@@ -8,9 +8,12 @@ from theano.compile import shared_constructor, SharedVariable
 def load_shared_variable(val):
-    """This function is only here to keep some pickles loading
+    """
+    This function is only here to keep some pickles loading
    after a failed fix done in August 2011.
-    It can be removed after sufficient time has passed."""
+    It can be removed after sufficient time has passed.
+    """
    return tensor_constructor(val)
@@ -22,13 +25,15 @@ class TensorSharedVariable(_tensor_py_operators, SharedVariable):
 @shared_constructor
 def tensor_constructor(value, name=None, strict=False, allow_downcast=None,
                       borrow=False, broadcastable=None):
-    """SharedVariable Constructor for TensorType
+    """
+    SharedVariable Constructor for TensorType.
-    :note: Regarding the inference of the broadcastable pattern...
+    Notes
+    -----
+    Regarding the inference of the broadcastable pattern...
    The default is to assume that the value might be resized in any
-    dimension, so the default broadcastable is
+    dimension, so the default broadcastable is ``(False,)*len(value.shape)``.
-    ``(False,)*len(value.shape)``.  The optional `broadcastable`
+    The optional `broadcastable` argument will override this default.
-    argument will override this default.
    """
    if not isinstance(value, numpy.ndarray):
@@ -61,13 +66,16 @@ class ScalarSharedVariable(_tensor_py_operators, SharedVariable):
 @shared_constructor
 def scalar_constructor(value, name=None, strict=False, allow_downcast=None,
                       borrow=False):
-    """SharedVariable constructor for scalar values. Default: int64 or float64.
+    """
+    SharedVariable constructor for scalar values. Default: int64 or float64.
-    :note: We implement this using 0-d tensors for now.
+    Notes
+    -----
+    We implement this using 0-d tensors for now.
-    :note: We ignore the borrow parameter as we convert ``value`` to an
+    We ignore the borrow parameter as we convert ``value`` to an
-      ndarray (this is a new object). This respects the semantic of
+    ndarray (this is a new object). This respects the semantic of
-      borrow, as it is a hint to Theano that we can reuse it.
+    borrow, as it is a hint to Theano that we can reuse it.
    """
    if not isinstance(value, (numpy.number, float, int, complex)):

--- a/theano/tensor/signal/conv.py
+++ b/theano/tensor/signal/conv.py
 """
 Contains a wrapper function for tensor.nnet.ConvOp, which can be used to perform
 generic 2D convolution.
-"""
+"""
 __docformat__ = "restructuredtext en"
 import warnings
@@ -25,20 +25,29 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
    Shape parameters are optional and will result in faster execution.
-    :type input: dmatrix of dtensor3
+    Parameters
-    :param input: symbolic variable for images to be filtered
+    ----------
-    :type filters: dmatrix of dtensor3
+    input : dmatrix or dtensor3
-    :param filters: symbolic variable containing filter values
+        Symbolic variable for images to be filtered.
-    :param border_mode: 'valid' or 'full'. see scipy.signal.convolve2d
+    filters : dmatrix or dtensor3
-    :param subsample: factor by which to subsample output
+        Symbolic variable containing filter values.
-    :type image_shape: tuple of length 2 or 3
+    border_mode : {'valid', 'full'}
-    :param image_shape: ([number images,] image height, image width)
+        See scipy.signal.convolve2d.
-    :type filter_shape: tuple of length 2 or 3
+    subsample
-    :param filter_shape: ([number filters,] filter height, filter width)
+        Factor by which to subsample output.
-    :param kwargs: see theano.tensor.nnet.conv.conv2d
+    image_shape : tuple of length 2 or 3
-    :rtype: symbolic 2D,3D or 4D tensor
+        ([number images,] image height, image width).
-    :return: tensor of filtered images, with shape
+    filter_shape : tuple of length 2 or 3
-             ([number images,] [number filters,] image height, image width)
+        ([number filters,] filter height, filter width).
+    kwargs
+        See theano.tensor.nnet.conv.conv2d.
+    Returns
+    -------
+    symbolic 2D,3D or 4D tensor
+        Tensor of filtered images, with shape
+        ([number images,] [number filters,] image height, image width).
    """
    assert input.ndim in (2, 3)
    assert filters.ndim in (2, 3)

--- a/theano/tensor/signal/downsample.py
+++ b/theano/tensor/signal/downsample.py
-""" Ops for downsampling images.
+""" 
+Ops for downsampling images.
 Planned:
 DownsampleFactorMax, DownsampleAvg, DownsampleSoftmax.
@@ -29,12 +30,14 @@ def max_pool_2d_same_size(input, patch_size):
    keeping only the maximum values. The output has the same dimensions as
    the input.
-    :type input: 4-D theano tensor of input images.
+    Parameters
-    :param input: input images. Max pooling will be done over the 2 last
+    ----------
-        dimensions.
+    input : 4-D theano tensor of input images
-    :type patch_size: tuple of length 2
+        Input images. Max pooling will be done over the 2 last dimensions.
-    :param patch_size: size of the patch (patch height, patch width).
+    patch_size : tuple of length 2
+        Size of the patch (patch height, patch width).
        (2,2) will retain only one non-zero value per patch of 4 values.
    """
    output = DownsampleFactorMax(patch_size, True)(input)
    outs = MaxPoolGrad(patch_size, True)(input, output, output)
@@ -48,29 +51,29 @@ def max_pool_2d(input, ds, ignore_border=False, st=None, padding=(0, 0),
    the specified factor, by keeping only the maximum value of non-overlapping
    patches of size (ds[0],ds[1])
-    :type input: N-D theano tensor of input images.
+    Parameters
-    :param input: input images. Max pooling will be done over the 2 last
+    ----------
-        dimensions.
+    input : N-D theano tensor of input images
-    :type ds: tuple of length 2
+        Input images. Max pooling will be done over the 2 last dimensions.
-    :param ds: factor by which to downscale (vertical ds, horizontal ds).
+    ds : tuple of length 2
+        Factor by which to downscale (vertical ds, horizontal ds).
        (2,2) will halve the image in each dimension.
-    :type ignore_border: bool
+    ignore_border : bool
-    :param ignore_border: When True, (5,5) input with ds=(2,2)
+        When True, (5,5) input with ds=(2,2) will generate a (2,2) output.
-        will generate a (2,2) output. (3,3) otherwise.
+        (3,3) otherwise.
-    :type st: tuple of lenght 2
+    st : tuple of length 2
-    :param st: stride size, which is the number of shifts
+        Stride size, which is the number of shifts over rows/cols to get the
-        over rows/cols to get the the next pool region.
+        next pool region. If st is None, it is considered equal to ds
-        if st is None, it is considered equal to ds
+        (no overlap on pooling regions).
-        (no overlap on pooling regions)
+    padding : tuple of two ints
-    :param padding: (pad_h, pad_w), pad zeros to extend beyond four borders
+        (pad_h, pad_w), pad zeros to extend beyond four borders
            of the images, pad_h is the size of the top and bottom margins,
            and pad_w is the size of the left and right margins.
-    :type padding: tuple of two ints
+    mode : {'max', 'sum', 'average_inc_pad', 'average_exc_pad'}
-    :param mode: 'max', 'sum', 'average_inc_pad' or 'average_exc_pad'.
+        Operation executed on each window. `max` and `sum` always exclude
-        Operation executed on each window.  `max` and `sum` always exclude
        the padding in the computation. `average` gives you the choice to
        include or exclude it.
-    :type mode: string
    """
    if input.ndim < 2:
        raise NotImplementedError('max_pool_2d requires a dimension >= 2')
@@ -104,44 +107,69 @@ def max_pool_2d(input, ds, ignore_border=False, st=None, padding=(0, 0),
 class DownsampleFactorMax(Op):
-    """For N-dimensional tensors, consider that the last two
+    """
-    dimensions span images.  This Op downsamples these images by
+    For N-dimensional tensors, consider that the last two dimensions span
-    taking the max, sum or average over different patch.
+    images. This Op downsamples these images by taking the max, sum or average
+    over different patch.
+    The constructor takes the max, sum or average of different input patches.
+    Parameters
+    ----------
+    ds : list or tuple of two ints
+        Downsample factor over rows and column.
+        ds indicates the pool region size.
+    ignore_border : bool
+        If ds doesn't divide imgshape, do we include an extra row/col of partial
+        downsampling (False) or ignore it (True).
+    st : list or tuple of two ints or None
+        Stride size, which is the number of shifts over rows/cols to get the
+        next pool region. If st is None, it is considered equal to ds
+        (no overlap on pooling regions).
+    padding : tuple of two ints
+        (pad_h, pad_w), pad zeros to extend beyond four borders of the images,
+        pad_h is the size of the top and bottom margins, and pad_w is the size
+        of the left and right margins.
+    mode : {'max', 'sum', 'average_inc_pad', 'average_exc_pad'}
+        ('average_inc_pad' includes the padding in the count,
+        'average_exc_pad' excludes it).
    """
    __props__ = ('ds', 'ignore_border', 'st', 'padding', 'mode')
    @staticmethod
    def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
-        """Return the shape of the output from this op, for input of given
+        """
+        Return the shape of the output from this op, for input of given
        shape and flags.
-        :param imgshape: the shape of a tensor of images. The last two elements
+        Parameters
-            are interpreted as the number of rows, and the number of cols.
+        ----------
-        :type imgshape: tuple, list, or similar of integer or
+        imgshape : tuple, list, or similar of integer or scalar Theano variable
-            scalar Theano variable.
+            The shape of a tensor of images. The last two elements are
+            interpreted as the number of rows, and the number of cols.
-        :param ds: downsample factor over rows and columns
+        ds : list or tuple of two ints
-                   this parameter indicates the size of the pooling region
+            Downsample factor over rows and columns; this parameter indicates
-        :type ds: list or tuple of two ints
+            the size of the pooling region.
+        st : list or tuple of two ints
-        :param st: the stride size. This is the distance between the pooling
+            The stride size. This is the distance between the pooling regions.
-                   regions. If it's set to None, in which case it equlas ds.
+            If it's set to None, it equals ds.
-        :type st: list or tuple of two ints
+        ignore_border : bool
+            If ds doesn't divide imgshape, do we include an extra row/col of
-        :param ignore_border: if ds doesn't divide imgshape, do we include an
+            partial downsampling (False) or ignore it (True).
-            extra row/col of partial downsampling (False) or ignore it (True).
+        padding : tuple of two ints
-        :type ignore_border: bool
+            (pad_h, pad_w), pad zeros to extend beyond four borders
-        :param padding: (pad_h, pad_w), pad zeros to extend beyond four borders
            of the images, pad_h is the size of the top and bottom margins,
            and pad_w is the size of the left and right margins.
-        :type padding: tuple of two ints
-        :rtype: list
+        Returns
-        :returns: the shape of the output from this op, for input of given
+        -------
-            shape.  This will have the same length as imgshape, but with last
+        list
-            two elements reduced as per the downsampling & ignore_border flags.
+            The shape of the output from this op, for input of given shape.
+            This will have the same length as imgshape, but with last two
+            elements reduced as per the downsampling & ignore_border flags.
        """
        if len(imgshape) < 2:
            raise TypeError('imgshape must have at least two elements '
@@ -190,33 +218,6 @@ class DownsampleFactorMax(Op):
    def __init__(self, ds, ignore_border=False, st=None, padding=(0, 0),
                 mode='max'):
-        """ Take the max, sum or average or different input patches.
-        :param ds: downsample factor over rows and column.
-                   ds indicates the pool region size.
-        :type ds: list or tuple of two ints
-        :param ignore_border: if ds doesn't divide imgshape, do we include
-            an extra row/col of partial downsampling (False) or
-            ignore it (True).
-        :type ignore_border: bool
-        : param st: stride size, which is the number of shifts
-            over rows/cols to get the the next pool region.
-            if st is None, it is considered equal to ds
-            (no overlap on pooling regions)
-        : type st: list or tuple of two ints or None
-        :param padding: (pad_h, pad_w), pad zeros to extend beyond four borders
-            of the images, pad_h is the size of the top and bottom margins,
-            and pad_w is the size of the left and right margins.
-        :type padding: tuple of two ints
-        :param mode: 'max', 'sum', 'average_inc_pad', 'average_exc_pad'.
-            ('average_inc_pad' excludes the padding from the count,
-            'average_exc_pad' include it)
-        """
        self.ds = tuple(ds)
        if not all([isinstance(d, int) for d in ds]):
            raise ValueError(
@@ -876,35 +877,36 @@ class DownsampleFactorMaxGradGrad(Op):
    @staticmethod
    def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
-        """Return the shape of the output from this op, for input of given
+        """
+        Return the shape of the output from this op, for input of given
        shape and flags.
-        :param imgshape: the shape of a tensor of images. The last two elements
+        Parameters
+        ----------
+        imgshape : tuple, list, or similar of integer or scalar Theano variable
+            The shape of a tensor of images. The last two elements
            are interpreted as the number of rows, and the number of cols.
-        :type imgshape: tuple, list, or similar of integer or
+        ds : list or tuple of two ints
-            scalar Theano variable.
+            Downsample factor over rows and columns; this parameter indicates
+            the size of the pooling region.
-        :param ds: downsample factor over rows and columns
+        st : list or tuple of two ints
-                   this parameter indicates the size of the pooling region
+            The stride size. This is the distance between the pooling regions.
-        :type ds: list or tuple of two ints
+            If it's set to None, it equals ds.
+        ignore_border : bool
-        :param st: the stride size. This is the distance between the pooling
+            If ds doesn't divide imgshape, do we include an
-                   regions. If it's set to None, in which case it equlas ds.
-        :type st: list or tuple of two ints
-        :param ignore_border: if ds doesn't divide imgshape, do we include an
            extra row/col of partial downsampling (False) or ignore it (True).
-        :type ignore_border: bool
+        padding : tuple of two ints
+            (pad_h, pad_w), pad zeros to extend beyond four borders
-        :param padding: (pad_h, pad_w), pad zeros to extend beyond four borders
            of the images, pad_h is the size of the top and bottom margins,
            and pad_w is the size of the left and right margins.
-        :type padding: tuple of two ints
-        :rtype: list
+        Returns
-        :returns: the shape of the output from this op, for input of given
+        -------
-            shape.  This will have the same length as imgshape, but with last
+        list
-            two elements reduced as per the downsampling & ignore_border flags.
+            The shape of the output from this op, for input of given shape.
+            This will have the same length as imgshape, but with last two
+            elements reduced as per the downsampling & ignore_border flags.
        """
        if len(imgshape) < 2:
            raise TypeError('imgshape must have at least two elements '

--- a/theano/tensor/slinalg.py
+++ b/theano/tensor/slinalg.py
@@ -31,9 +31,10 @@ MATRIX_STRUCTURES = (
 class Cholesky(Op):
    """
-    Return a triangular matrix square root of positive semi-definite `x`
+    Return a triangular matrix square root of positive semi-definite `x`.
+    L = cholesky(X, lower=True) implies dot(L, L.T) == X.
-    L = cholesky(X, lower=True) implies dot(L, L.T) == X
    """
    # TODO: inplace
    # TODO: for specific dtypes
@@ -90,13 +91,16 @@ class CholeskyGrad(Op):
        return Apply(self, [x, l, dz], [x.type()])
    def perform(self, node, inputs, outputs):
-        """Implements the "reverse-mode" gradient [1]_ for the
+        """
+        Implements the "reverse-mode" gradient [1]_ for the
        Cholesky factorization of a positive-definite matrix.
+        References
+        ----------
        .. [1] S. P. Smith. "Differentiation of the Cholesky Algorithm".
-               Journal of Computational and Graphical Statistics,
+           Journal of Computational and Graphical Statistics,
-               Vol. 4, No. 2 (Jun.,1995), pp. 134-147
+           Vol. 4, No. 2 (Jun.,1995), pp. 134-147
-               http://www.jstor.org/stable/1390762
+           http://www.jstor.org/stable/1390762
        """
        x = inputs[0]
@@ -133,7 +137,10 @@ class CholeskyGrad(Op):
 class Solve(Op):
-    """Solve a system of linear equations"""
+    """
+    Solve a system of linear equations.
+    """
    __props__ = ('A_structure', 'lower', 'overwrite_A', 'overwrite_b')
@@ -195,7 +202,9 @@ solve = Solve()  # general solve
 class Eigvalsh(Op):
-    """Generalized eigenvalues of a Hermetian positive definite eigensystem
+    """
+    Generalized eigenvalues of a Hermitian positive definite eigensystem.
    """
    __props__ = ('lower',)
@@ -243,8 +252,10 @@ class Eigvalsh(Op):
 class EigvalshGrad(Op):
-    """Gradient of generalized eigenvalues of a Hermetian positive definite
+    """
-    eigensystem
+    Gradient of generalized eigenvalues of a Hermitian positive definite
+    eigensystem.
    """
    # Note: This Op (EigvalshGrad), should be removed and replaced with a graph
@@ -303,17 +314,24 @@ def eigvalsh(a, b, lower=True):
 def kron(a, b):
-    """ Kronecker product
+    """ Kronecker product.
    Same as scipy.linalg.kron(a, b).
-    :note: numpy.kron(a, b) != scipy.linalg.kron(a, b)!
+    Parameters
-        They don't have the same shape and order when
+    ----------
-        a.ndim != b.ndim != 2.
+    a : array_like
+    b : array_like
-    :param a: array_like
+    Returns
-    :param b: array_like
+    -------
-    :return: array_like with a.ndim + b.ndim - 2 dimensions.
+    array_like with a.ndim + b.ndim - 2 dimensions
+    Notes
+    -----
+    numpy.kron(a, b) != scipy.linalg.kron(a, b)!
+    They don't have the same shape and order when
+    a.ndim != b.ndim != 2.
    """
    a = tensor.as_tensor_variable(a)
@@ -336,7 +354,9 @@ def kron(a, b):
 class Expm(Op):
-    """Compute the matrix exponential of a square array
+    """
+    Compute the matrix exponential of a square array.
    """
    __props__ = ()
@@ -365,7 +385,9 @@ class Expm(Op):
 class ExpmGrad(Op):
-    """Gradient of the matrix exponential of a square array.
+    """
+    Gradient of the matrix exponential of a square array.
    """
    __props__ = ()

--- a/theano/tensor/sort.py
+++ b/theano/tensor/sort.py
@@ -5,7 +5,8 @@ from theano.tensor.basic import mul, arange
 class SortOp(theano.Op):
    """
-    This class is a wrapper for numpy sort function
+    This class is a wrapper for numpy sort function.
    """
    __props__ = ("kind", "order")
@@ -62,12 +63,15 @@ class SortOp(theano.Op):
        return index_val
    def __get_argsort_indices(self, a, axis):
-        """Calculates indices which can be used to reverse
+        """
-        sorting operation of "a" tensor along "axis"
+        Calculates indices which can be used to reverse sorting operation of
+        "a" tensor along "axis".
+        Returns
+        -------
+        1d array if axis is None
+        list of length len(a.shape) otherwise
-        returns:
-          1d array if axis is None
-          list of lenght len(a.shape) otherwise
        """
        # The goal is to get gradient wrt input from gradient
@@ -99,24 +103,26 @@ class SortOp(theano.Op):
 def sort(a, axis=-1, kind='quicksort', order=None):
    """
-    Return a sorted copy of an array.
-    a : Tensor
-    Tensor to be sorted
+    Parameters
+    ----------
+    a : Tensor
+        Tensor to be sorted
    axis : Tensor
-        Axis along which to sort. If None, the array is
+        Axis along which to sort. If None, the array is flattened before
-        flattened before sorting.
+        sorting.
    kind : {'quicksort', 'mergesort', 'heapsort'}, optional
        Sorting algorithm. Default is 'quicksort'.
    order : list, optional
        When `a` is a structured array, this argument specifies which
        fields to compare first, second, and so on. This list does not
        need to include all of the fields.
+    Returns
+    -------
+    array
+        A sorted copy of an array.
    """
    if axis is None:
        a = a.flatten()
@@ -126,7 +132,8 @@ def sort(a, axis=-1, kind='quicksort', order=None):
 class ArgSortOp(theano.Op):
    """
-    This class is a wrapper for numpy argsort function
+    This class is a wrapper for numpy argsort function.
    """
    __props__ = ("kind", "order")
@@ -196,6 +203,7 @@ def argsort(a, axis=-1, kind='quicksort', order=None):
    specified by the kind keyword.  It returns an array of indices of
    the same shape as a that index data along the given axis in sorted
    order.
    """
    if axis is None:
        a = a.flatten()

--- a/theano/tensor/subtensor.py
+++ b/theano/tensor/subtensor.py
@@ -39,6 +39,7 @@ sparse_module_ref = None
 class AdvancedIndexingError(TypeError):
    """
    Raised when Subtensor is asked to perform advanced indexing.
    """
    def __init__(self, *args):
@@ -52,6 +53,7 @@ class AdvancedIndexingError(TypeError):
 def make_constant(args):
    """
    Convert python litterals to theano constants in subtensor arguments.
    """
    def conv(a):
            if a is None:
@@ -68,13 +70,14 @@ def make_constant(args):
 def get_idx_list(inputs, idx_list, get_count=False):
-    '''
+    """
    Given a list of inputs to the subtensor and its idx_list reorders
    the inputs according to the idx list to get the right values.
    If get_counts=True, instead returns the number of inputs consumed
    during this process.
-    '''
+    """
    # The number of indices
    n = len(inputs) - 1
@@ -102,14 +105,15 @@ def get_idx_list(inputs, idx_list, get_count=False):
 def get_canonical_form_slice(theslice, length):
-    '''
+    """
    Given a slice [start:stop:step] transform it into a canonical form
    that respects the conventions imposed by python and numpy.
    In a canonical form a slice is represented by a canonical form slice,
    in which 0 <= start <= stop <= length and step > 0, and a flag which says
    if the resulting set of numbers needs to be reversed or not.
-    '''
+    """
    from theano.tensor import switch, lt, ge, sgn
    if isinstance(theslice, slice):
@@ -252,7 +256,8 @@ def get_canonical_form_slice(theslice, length):
 class Subtensor(Op):
-    """Return a subtensor view
+    """
+    Return a subtensor view.
    The inputs array is the tensor x, followed by scalar integer types.
    TODO: WRITEME: how are the scalar integer variables formatted?
@@ -297,12 +302,16 @@ class Subtensor(Op):
    @staticmethod
    def collapse(idxs, cond):
        """
+        Parameters
+        ----------
+        idxs : a list of indices or slices.
+        cond : a callable that returns a bool
-        idxs: a list of indices or slices.
+        Returns
-        cond: a callable that returns a bool
+        -------
+        list
-        returns: idxs, with the slices flattened out into a list.
+            idxs, with the slices flattened out into a list.
-                if cond is true for an entry, does not flatten it.
+            If cond is true for an entry, does not flatten it.
        """
        ret = []
@@ -323,12 +332,14 @@ class Subtensor(Op):
    @staticmethod
    def convert(entry, slice_ok=True):
        """
+        Change references to Variables into references to Types.
        The "idx_list" field is unique to each Subtensor instance.
        It is not unique to each Apply node, so it should not refer to
-        specific Variables. This method changes references to Variables
+        specific Variables.
-        into references to Types.
        TODO: WRITEME: This method also accepts "entry" already being a Type;
            when would that happen?
        """
        invalid_scal_types = [scal.float64, scal.float32, scal.float16]
        scal_types = [scal.int64, scal.int32, scal.int16, scal.int8]
@@ -389,30 +400,33 @@ class Subtensor(Op):
                         only_process_constants=False):
        """
        Return the idx_list with constant inputs replaced by their
-        python scalar equivalent.  May raise
+        python scalar equivalent.
-        `theano.tensor.NotScalarConstantError` if the idx contains
+        May raise `theano.tensor.NotScalarConstantError` if the idx contains
        non-constant entries.
-        If allow_partial is True, then entries that are not constant
+        If allow_partial is True, then entries that are not constant will
-        will stay as their input variable rather than raising an
+        stay as their input variable rather than raising an exception.
-        exception.
        None entries are always left as-is.
-        Example usage (where v, a are appropriately typed theano variables):
+        Parameters
+        ----------
-            >>> b = a[v, 1:3]
+        only_process_constants
-            >>> b.owner.op.idx_list
+            If True, we only attempt to obtain the value of an index/slice if
-            (Scalar(int64), slice(Scalar(int64), Scalar(int64), None))
+            it's directly constant and don't try to dig through dimshuffles,
-            >>> b.owner.op.get_constant_idx(b.owner.inputs, allow_partial=True)
+            fills, allocs, and other to figure out its value.
-            [v, slice(1, 3, None)]
-            >>> b.owner.op.get_constant_idx(b.owner.inputs)
+        Examples
-            NotScalarConstantError: v
+        --------
+        Example usage where v, a are appropriately typed theano variables:
+        >>> b = a[v, 1:3]
+        >>> b.owner.op.idx_list
+        (Scalar(int64), slice(Scalar(int64), Scalar(int64), None))
+        >>> b.owner.op.get_constant_idx(b.owner.inputs, allow_partial=True)
+        [v, slice(1, 3, None)]
+        >>> b.owner.op.get_constant_idx(b.owner.inputs)
+        NotScalarConstantError: v
-        :param only_process_constants: If True, we only attempt to obtain
-            the value of an index/slice if it's directly constant and don't
-            try to dig through dimshuffles, fills, allocs, and other to figure
-            out its value.
        """
        real_idx = get_idx_list(inputs, self.idx_list)
@@ -451,8 +465,13 @@ class Subtensor(Op):
    def make_node(self, x, *inputs):
        """
-            x: the tensor to take a subtensor of
+        Parameters
-            inputs: a list of theano Scalars
+        ----------
+        x
+            The tensor to take a subtensor of.
+        inputs
+            A list of theano Scalars.
        """
        x = theano.tensor.as_tensor_variable(x)
        inputs = tuple(self.my_as_scalar(a) for a in inputs)
@@ -607,8 +626,8 @@ class Subtensor(Op):
    @staticmethod
    def default_helper_c_code_args():
        """
-        Returns a dictionary of default arguments to
+        Returns a dictionary of default arguments to helper_c_code.
-        helper_c_code
        """
        return {"c_prefix": "PyArray",
@@ -622,7 +641,8 @@ class Subtensor(Op):
        The parameters c_prefix are there to allow reusing this
        function on PyArray and CudaNdarray object.
-        This fct take as input the x,
+        This function takes x as input.
        """
        default_args = Subtensor.default_helper_c_code_args()
@@ -986,16 +1006,25 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Subtensor),
 def set_subtensor(x, y, inplace=False,
                  tolerate_inplace_aliasing=False):
-    """Return x with the given subtensor overwritten by y.
+    """
+    Return x with the given subtensor overwritten by y.
-    Example: To replicate the numpy expression "r[10:] = 5", type
+    Parameters
+    ----------
+    x
+        Symbolic variable for the lvalue of = operation.
+    y
+        Symbolic variable for the rvalue of = operation.
+    tolerate_inplace_aliasing
+        See inc_subtensor for documentation.
+    Examples
+    --------
+    To replicate the numpy expression "r[10:] = 5", type
    >>> r = ivector()
    >>> new_r = set_subtensor(r[10:], 5)
-    :param x: symbolic variable for the lvalue of = operation
-    :param y: symbolic variable for the rvalue of = operation
-    :param tolerate_inplace_aliasing: see inc_subtensor for documentation.
    """
    return inc_subtensor(x, y, inplace, set_instead_of_inc=True,
                         tolerate_inplace_aliasing=tolerate_inplace_aliasing)
@@ -1003,22 +1032,32 @@ def set_subtensor(x, y, inplace=False,
 def inc_subtensor(x, y, inplace=False, set_instead_of_inc=False,
                  tolerate_inplace_aliasing=False):
-    """Return x with the given subtensor incremented by y.
+    """
+    Return x with the given subtensor incremented by y.
-    :param x: the symbolic result of a Subtensor operation.
-    :param y: the amount by which to increment ths subtensor in question
+    Parameters
-    :param inplace: Don't use. Theano will do it when possible.
+    ----------
-    :param set_instead_of_inc: If True, do a set_subtensor instead.
+    x
-    :param tolerate_inplace_aliasing: allow x and y to be views of a single
+        The symbolic result of a Subtensor operation.
-        underlying array even while working inplace.  For correct results,
+    y
-        x and y must not be overlapping views; if they overlap, the result
+        The amount by which to increment the subtensor in question.
-        of this Op will generally be incorrect. This value has no effect if
+    inplace
-        inplace=False.
+        Don't use. Theano will do it when possible.
+    set_instead_of_inc
-    Example: To replicate the numpy expression "r[10:] += 5", type
+        If True, do a set_subtensor instead.
+    tolerate_inplace_aliasing
+        Allow x and y to be views of a single underlying array even while
+        working inplace. For correct results, x and y must not be overlapping
+        views; if they overlap, the result of this Op will generally be
+        incorrect. This value has no effect if inplace=False.
+    Examples
+    --------
+    To replicate the numpy expression "r[10:] += 5", type
    >>> r = ivector()
    >>> new_r = inc_subtensor(r[10:], 5)
    """
    # First of all, y cannot have a higher dimension than x,
    # nor have non-broadcastable dimensions where x is broadcastable.
@@ -1159,7 +1198,8 @@ def inc_subtensor(x, y, inplace=False, set_instead_of_inc=False,
 class IncSubtensor(Op):
-    """Increment a subtensor.
+    """
+    Increment a subtensor.
    This is like numpy's
@@ -1167,8 +1207,12 @@ class IncSubtensor(Op):
    It is used internally to implement the gradient on SubTensor.
-    :param set_instead_of_inc: if True set the subtensor to the value instead
+    Parameters
-    of incrementing it by that value.
+    ----------
+    set_instead_of_inc
+        If True set the subtensor to the value instead of incrementing it by
+        that value.
    """
    check_input = False
@@ -1225,9 +1269,14 @@ class IncSubtensor(Op):
    def make_node(self, x, y, *inputs):
        """
-            x: the tensor to increment
+        Parameters
-            y: the value to increment by
+        ----------
-            inputs: TODO WRITEME
+        x
+            The tensor to increment.
+        y
+            The value to increment by.
+        inputs: TODO WRITEME
        """
        x, y = map(theano.tensor.as_tensor_variable, [x, y])
        if y.ndim > x.ndim:
@@ -1411,8 +1460,10 @@ class IncSubtensor(Op):
                )
    def do_type_checking(self, node):
-        """ Should raise NotImplementedError if c_code does not support
+        """
+        Should raise NotImplementedError if c_code does not support
        the types involved in this node.
        """
        if not isinstance(node.inputs[0].type, theano.tensor.TensorType):
@@ -1427,13 +1478,19 @@ class IncSubtensor(Op):
    def copy_of_x(self, x):
        """
-            :param x: a string giving the name of a C variable
+        Parameters
-                pointing to an array
+        ----------
+        x
+            A string giving the name of a C variable pointing to an array.
+        Returns
+        -------
+        object
+            C code expression to make a copy of x.
-            :return: C code expression to make a copy of x
+        Base class uses PyArrayObject *, subclasses may override for
+        different types of arrays.
-            Base class uses PyArrayObject *, subclasses may override for
-            different types of arrays.
        """
        # Parameters of PyArrary_FromAny are:
        # array
@@ -1448,12 +1505,16 @@ class IncSubtensor(Op):
    def make_view_array(self, x, view_ndim):
        """
-            :param x: a string identifying an array to be viewed
+        Parameters
-            :param view_ndim: a string specifying the number of dimensions
+        ----------
-                to have in the view
+        x
+            A string identifying an array to be viewed.
+        view_ndim
+            A string specifying the number of dimensions to have in the view.
+        This doesn't need to actually set up the view with the right indexing;
+        we'll do that manually later.
-            This doesn't need to actually set up the view with the
-            right indexing; we'll do that manually later.
        """
        return """Py_INCREF(PyArray_DESCR(%(x)s));
@@ -1471,22 +1532,35 @@ class IncSubtensor(Op):
        """ % locals()
    def get_helper_c_code_args(self):
-        """ Return a dictionary of arguments to pass to helper_c_code."""
+        """
+        Return a dictionary of arguments to pass to helper_c_code.
+        """
        return Subtensor.default_helper_c_code_args()
    def copy_into(self, view, source):
        """
-            view: string, C code expression for an array
+        Parameters
-            source: string, C code expression for an array
+        ----------
+        view : string
+            C code expression for an array.
+        source : string
+            C code expression for an array.
+        Returns
+        -------
+        object
+            C code expression to copy source into view; it yields 0 on success.
-            returns a C code expression to copy source into view, and
-            return 0 on success
        """
        return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals()
    def add_to_zview(self, name, x, fail):
-        """ Return C code to add x to zview. Should DECREF zview if the
+        """
-        add fails."""
+        Return C code to add x to zview. Should DECREF zview if the
+        add fails.
+        """
        return """
            PyArrayObject * add_rval = (PyArrayObject*)PyNumber_InPlaceAdd(
@@ -1551,11 +1625,13 @@ class IncSubtensor(Op):
 def _sum_grad_over_bcasted_dims(x, gx):
-    """Sum of gx over dimensions to reproduce x.broadcastable.
+    """
+    Sum of gx over dimensions to reproduce x.broadcastable.
    This is useful to sum gradients over certain dimensions when
    x has been broadcasted, and we need to sum the gradient contributions
    over all duplications.
    """
    if gx.broadcastable != x.broadcastable:
        x_dim_added = gx.ndim - x.ndim
@@ -1592,7 +1668,10 @@ def _sum_grad_over_bcasted_dims(x, gx):
 class AdvancedSubtensor1(Op):
-    """Implement x[ilist] where ilist is a vector of integers."""
+    """
+    Implement x[ilist] where ilist is a vector of integers.
+    """
    # sparse_grad doesn't go in here since it only affects the output
    # of the grad() method.
    __props__ = ()
@@ -1777,7 +1856,11 @@ advanced_subtensor1 = AdvancedSubtensor1()
 class AdvancedIncSubtensor1(Op):
-    """Increments a subtensor using advanced slicing (list of index)"""
+    """
+    Increments a subtensor using advanced slicing (list of index).
+    """
    __props__ = ('inplace', 'set_instead_of_inc')
    def __init__(self, inplace=False, set_instead_of_inc=False):
@@ -1828,13 +1911,19 @@ class AdvancedIncSubtensor1(Op):
    def copy_of_x(self, x):
        """
-            :param x: a string giving the name of a C variable
+        Parameters
-                pointing to an array
+        ----------
+        x : string
+            Gives the name of a C variable pointing to an array.
+        Returns
+        -------
+        object
+            C code expression to make a copy of x.
-            :return: C code expression to make a copy of x
+        Base class uses PyArrayObject *, subclasses may override for
+        different types of arrays.
-            Base class uses PyArrayObject *, subclasses may override for
-            different types of arrays.
        """
        # Parameters of PyArrary_FromAny are:
        # array
@@ -1994,6 +2083,7 @@ def adv_index_broadcastable_pattern(a, idx):
    For this, we make a fake ndarray and a fake idx and call use ask numpy
    the output. From this, we find the output broadcast pattern.
    """
    def replace_slice(v):
@@ -2021,8 +2111,11 @@ def adv_index_broadcastable_pattern(a, idx):
 class AdvancedSubtensor(Op):
-    """Return a subtensor copy, using advanced indexing.
    """
+    Return a subtensor copy, using advanced indexing.
+    """
    # Should be used by __getitem__ and __getslice__, as follow:
    # AdvancedSubtensor()(self, *args),
    # if args contains and advanced indexing pattern
@@ -2094,13 +2187,16 @@ advanced_subtensor = AdvancedSubtensor()
 class AdvancedIncSubtensor(Op):
-    """Increments a subtensor using advanced indexing.
+    """
+    Increments a subtensor using advanced indexing.
-    :note: We need the numpy.inplace_increment() function currently
+    Notes
-        numpy's PR 326 to be able to make an inplace version of this
+    -----
-        op.
+    We need the numpy.inplace_increment() function (currently
+    numpy's PR 326) to be able to make an inplace version of this op.
    """
    __props__ = ("inplace", "set_instead_of_inc")
    def __init__(self, inplace=False, set_instead_of_inc=False):

--- a/theano/tensor/type.py
+++ b/theano/tensor/type.py
@@ -12,7 +12,27 @@ _logger = logging.getLogger("theano.tensor.type")
 class TensorType(Type):
-    """Symbolic `Type` representing a numpy.ndarray value."""
+    """
+    Symbolic `Type` representing a numpy.ndarray value.
+    Initialize self.dtype and self.broadcastable.
+    Parameters
+    ----------
+    dtype: str
+        Corresponding to numpy dtype (e.g., 'int64')
+        The value (ndarray) associated to a `Variable` of this `Type` will
+        have this dtype.
+    broadcastable: tuple, list, or array of boolean values
+        This argument serves two purposes. First, the True elements of this
+        list indicate the dimensions where the shape of an associated value
+        must be 1. Secondly, the length of this list is the number of
+        dimensions that an associated value must have. See
+        :doc:`broadcasting` for an explanation of how this list is used.
+    name : str
+        Optional name for this type.
+    """
    filter_checks_isfinite = False
    """
@@ -21,21 +41,6 @@ class TensorType(Type):
    """
    def __init__(self, dtype, broadcastable, name=None, sparse_grad=False):
-        """Initialize self.dtype and self.broadcastable.
-        :Parameters:
-         - `dtype`: str corresponding to numpy dtype (e.g., 'int64')
-           The value (ndarray) associated to a `Variable` of this `Type` will
-           have this dtype.
-         - `broadcastable`: tuple, list, or array of boolean values
-           This argument serves two purposes.  First, the True elements of this
-           list indicate the dimensions where the shape of an associated value
-           must be 1.  Secondly, the length of this list is the number of
-           dimensions that an associated value must have.  See
-           :doc:`broadcasting` for an explanation of how this list is used.
-         - `name`: str
-           Optional name for this type.
-        """
        self.dtype = str(dtype)
        if self.dtype == 'floatX':
            self.dtype = config.floatX
@@ -56,6 +61,7 @@ class TensorType(Type):
        """
        Return a copy of the type optionally with a new dtype or
        broadcastable pattern.
        """
        if dtype is None:
            dtype = self.dtype
@@ -65,11 +71,13 @@ class TensorType(Type):
                              sparse_grad=self.sparse_grad)
    def filter(self, data, strict=False, allow_downcast=None):
-        """Convert `data` to something which can be associated to a
+        """
+        Convert `data` to something which can be associated to a
        `TensorVariable`.
-        This function is not meant to be called in user code.  It is for
+        This function is not meant to be called in user code. It is for
        `Linker` instances to use when running a compiled graph.
        """
        # Explicit error message when one accidentally uses a Variable as
        # input (typical mistake, especially with shared variables).
@@ -191,11 +199,13 @@ class TensorType(Type):
        return data
    def filter_variable(self, other, allow_convert=True):
-        """Convert a symbolic Variable into a TensorType, if compatible.
+        """
+        Convert a symbolic Variable into a TensorType, if compatible.
        For the moment, only a TensorType or CudaNdarrayType will be
        converted, provided they have the same number of dimensions,
        broadcastable pattern, and dtype.
        """
        if hasattr(other, '_as_TensorVariable'):
            other = other._as_TensorVariable()
@@ -230,10 +240,12 @@ class TensorType(Type):
        return "value is valid"
    def dtype_specs(self):
-        """Return a tuple (python type, c type, numpy typenum) that corresponds
+        """
+        Return a tuple (python type, c type, numpy typenum) that corresponds
        to self.dtype.
        This function is used internally as part of C code generation.
        """
        # TODO: add more type correspondances for e.g. int32, int64, float32,
        # complex64, etc.
@@ -261,7 +273,10 @@ class TensorType(Type):
        return scal.get_scalar_type(dtype=self.dtype)
    def __eq__(self, other):
-        """Compare True iff other is the same kind of TensorType"""
+        """
+        Compare True iff other is the same kind of TensorType.
+        """
        return type(self) == type(other) and other.dtype == self.dtype \
            and other.broadcastable == self.broadcastable
@@ -305,14 +320,19 @@ class TensorType(Type):
    def values_eq_approx(a, b, allow_remove_inf=False, allow_remove_nan=False,
                         rtol=None, atol=None):
        """
-        :param allow_remove_inf: If True, when there is an inf in a,
+        Parameters
-                                 we allow any value in b in that position.
+        ----------
-                                 Event -inf
+        allow_remove_inf
-        :param allow_remove_nan: If True, when there is a nan in a,
+            If True, when there is an inf in a, we allow any value in b in
-                                 we allow any value in b in that position.
+            that position. Even -inf.
-                                 Event +-inf
+        allow_remove_nan
-        :param rtol: relative tolerance, passed to _allclose
+            If True, when there is a nan in a, we allow any value in b in
-        :param atol: absolute tolerance, passed to _allclose
+            that position. Even +-inf.
+        rtol
+            Relative tolerance, passed to _allclose.
+        atol
+            Absolute tolerance, passed to _allclose.
        """
        if isinstance(a, numpy.ndarray) and isinstance(b, numpy.ndarray):
            if a.shape != b.shape:
@@ -389,7 +409,8 @@ class TensorType(Type):
    ndim = property(lambda self: len(self.broadcastable),
                    doc="number of dimensions")
-    """Number of dimensions
+    """
+    Number of dimensions.
    This read-only property is the preferred way to get the number of
    dimensions of a `TensorType`.
@@ -397,12 +418,15 @@ class TensorType(Type):
    """
    def make_variable(self, name=None):
-        """Return a `TensorVariable` of this type
+        """
+        Return a `TensorVariable` of this type.
+        Parameters
+        ----------
+        name : str
+            A pretty name to identify this `Variable` when printing and
+            debugging
-        :Parameters:
-         - `name`: str
-           A pretty name to identify this `Variable` when printing and
-           debugging
        """
        return self.Variable(self, name=name)
@@ -430,7 +454,10 @@ class TensorType(Type):
        # "TensorType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
    def c_declare(self, name, sub, check_input=True):
-        """Override `CLinkerType.c_declare` """
+        """
+        Override `CLinkerType.c_declare`.
+        """
        if(check_input):
            check = """
            typedef %(dtype)s dtype_%(name)s;
@@ -444,13 +471,19 @@ class TensorType(Type):
        return declaration + check
    def c_init(self, name, sub):
-        """Override `CLinkerType.c_init` """
+        """
+        Override `CLinkerType.c_init`.
+        """
        return """
        %(name)s = NULL;
        """ % dict(sub, name=name, type_num=self.dtype_specs()[2])
    def c_extract(self, name, sub, check_input=True):
-        """Override `CLinkerType.c_extract` """
+        """
+        Override `CLinkerType.c_extract`.
+        """
        if(check_input):
            check = """
            %(name)s = NULL;
@@ -509,7 +542,10 @@ class TensorType(Type):
        """ % dict(sub, name=name, type_num=self.dtype_specs()[2])
    def c_cleanup(self, name, sub):
-        """Override `CLinkerType.c_cleanup` """
+        """
+        Override `CLinkerType.c_cleanup`.
+        """
        return """
        if (%(name)s) {
            Py_XDECREF(%(name)s);
@@ -517,7 +553,10 @@ class TensorType(Type):
        """ % locals()
    def c_sync(self, name, sub):
-        """Override `CLinkerType.c_sync` """
+        """
+        Override `CLinkerType.c_sync`.
+        """
        fail = sub['fail']
        type_num = self.dtype_specs()[2]
        return """
@@ -558,7 +597,10 @@ class TensorType(Type):
        """ % locals()
    def c_headers(self):
-        """Override `CLinkerObject.c_headers` """
+        """
+        Override `CLinkerObject.c_headers`.
+        """
        return scal.get_scalar_type(self.dtype).c_headers()
    def c_libraries(self):
@@ -568,7 +610,10 @@ class TensorType(Type):
        return scal.get_scalar_type(self.dtype).c_compile_args()
    def c_support_code(self):
-        """Override `CLinkerObject.c_support_code` """
+        """
+        Override `CLinkerObject.c_support_code`.
+        """
        return scal.get_scalar_type(self.dtype).c_support_code()
    def c_init_code(self):
@@ -584,6 +629,7 @@ class TensorType(Type):
    def value_zeros(self, shape):
        """
        Create an numpy ndarray full of 0 values.
        """
        return numpy.zeros(shape, dtype=self.dtype)
@@ -604,17 +650,33 @@ class TensorType(Type):
        ``get_size()`` will be called on the output of this function
        when printing the memory profile.
-        :param obj: The object that this Type represents during execution
+        Parameters
-        :return: Python object that ``self.get_size()`` understands
+        ----------
+        obj
+            The object that this Type represents during execution.
+        Returns
+        -------
+        object
+            Python object that ``self.get_size()`` understands.
        """
        return obj.shape
    def get_size(self, shape_info):
-        """ Number of bytes taken by the object represented by shape_info.
+        """
+        Number of bytes taken by the object represented by shape_info.
+        Parameters
+        ----------
+        shape_info
+            The output of the call to get_shape_info().
+        Returns
+        -------
+        int
+            The number of bytes taken by the object described by ``shape_info``.
-        :param shape_info: the output of the call to get_shape_info()
-        :return: the number of bytes taken by the object described by
-            ``shape_info``.
        """
        if shape_info:
            return numpy.prod(shape_info) * numpy.dtype(self.dtype).itemsize

--- a/theano/tensor/type_other.py
+++ b/theano/tensor/type_other.py
@@ -105,6 +105,7 @@ SliceType.Constant = SliceConstant
 class NoneTypeT(Generic):
    """
    Inherit from Generic to have c code working.
    """
    def filter(self, x, strict=False, allow_downcast=None):

--- a/theano/tensor/utils.py
+++ b/theano/tensor/utils.py
@@ -6,7 +6,8 @@ from theano.gof.utils import hash_from_code
 def hash_from_ndarray(data):
-    """Return a hash from an ndarray
+    """
+    Return a hash from an ndarray.
    It takes care of the data, shapes, strides and dtype.
@@ -32,23 +33,31 @@ def hash_from_ndarray(data):
 def shape_of_variables(fgraph, input_shapes):
    """
-    Compute the numeric shape of all intermediate variables given input shapes
+    Compute the numeric shape of all intermediate variables given input shapes.
-    Inputs:
+    Parameters
-        fgraph - the theano.FunctionGraph in question
+    ----------
-        input_shapes - a dict mapping input to shape
+    fgraph
+        The theano.FunctionGraph in question.
+    input_shapes : dict
+        A dict mapping input to shape.
-    Outputs:
+    Returns
-        shapes - a dict mapping variable to shape
+    -------
+    shapes : dict
+        A dict mapping variable to shape
-    WARNING : This modifies the fgraph. Not pure.
+    .. warning:: This modifies the fgraph. Not pure.
+    Examples
+    --------
    >>> import theano
    >>> x = theano.tensor.matrix('x')
    >>> y = x[512:]; y.name = 'y'
    >>> fgraph = theano.FunctionGraph([x], [y], clone=False)
    >>> shape_of_variables(fgraph, {x: (1024, 1024)})
    {y: (512, 1024), x: (1024, 1024)}
    """
    if not hasattr(fgraph, 'shape_feature'):

--- a/theano/tensor/var.py
+++ b/theano/tensor/var.py
@@ -22,8 +22,9 @@ def equal_slices(s1, s2):
 class AsTensorError(TypeError):
-    """Raised when as_tensor_variable isn't able to create a
+    """
-    TensorVariable.
+    Raised when as_tensor_variable isn't able to create a TensorVariable.
    """
    pass
@@ -254,8 +255,11 @@ class _tensor_py_operators:
    def transpose(self, *axes):
        """
-        Return `tensor.transpose(self, axes)`
-        or `tensor.transpose(self, axes[0])`
+        Returns
+        -------
+        object
+            `tensor.transpose(self, axes)` or `tensor.transpose(self, axes[0])`.
        If only one `axes` argument is provided and it is iterable, then it is
        assumed to be the entire axes tuple, and passed intact to
@@ -298,16 +302,18 @@ class _tensor_py_operators:
    def reshape(self, shape, ndim=None):
        """Return a reshaped view/copy of this variable.
-        :param shape: something that can be converted to a symbolic vector of
+        Parameters
-            integers
+        ----------
+        shape
+            Something that can be converted to a symbolic vector of integers.
+        ndim
+            The length of the shape. Passing None here means for
+            Theano to try and guess the length of `shape`.
-        :param ndim: the length of the shape.  Passing None here means for
+        .. warning:: This has a different signature than numpy's
-            theano to try and guess the length of `shape`.
+                     ndarray.reshape!
+                     In numpy you do not need to wrap the shape arguments
-        * warning-- this has a different signature than numpy's
+                     in a tuple, in theano you do need to.
-                    ndarray.reshape!
-                    in numpy you do not need to wrap the shape arguments
-                    in a tuple, in theano you do need to
        """
@@ -323,21 +329,29 @@ class _tensor_py_operators:
        Reorder the dimensions of this variable, optionally inserting
        broadcasted dimensions.
-        :param pattern: list/tuple of int mixed with 'x' for broadcastable
+        Parameters
-            dimensions
+        ----------
+        pattern
+            List/tuple of int mixed with 'x' for broadcastable dimensions.
+        Examples
+        --------
        For example, to create a 3D view of a [2D] matrix, call
        ``dimshuffle([0,'x',1])``.  This will create a 3D view such that the
        middle dimension is an implicit broadcasted dimension.  To do the same
-        thing on the transpose of that matrix, call
+        thing on the transpose of that matrix, call ``dimshuffle([1, 'x', 0])``.
-        ``dimshuffle([1, 'x', 0])``.
+        Notes
+        -----
        This function supports the pattern passed as a tuple, or as a
        variable-length argument (e.g. ``a.dimshuffle(pattern)`` is equivalent
        to ``a.dimshuffle(*pattern)`` where ``pattern`` is a list/tuple of ints
        mixed with 'x' characters).
-        For more information, see `DimShuffle`.
+        See Also
+        --------
+        DimShuffle
        """
        if (len(pattern) == 1) and (isinstance(pattern[0], (list, tuple))):
            pattern = pattern[0]
@@ -524,13 +538,17 @@ class _tensor_py_operators:
    """The rank of this tensor."""
    broadcastable = property(lambda self: self.type.broadcastable)
-    """The broadcastable signature of this tensor.
+    """
+    The broadcastable signature of this tensor.
+    See Also
+    --------
+    broadcasting
-    See :doc:`broadcasting` for details.
    """
    dtype = property(lambda self: self.type.dtype)
-    """ The dtype of this tensor.  """
+    """The dtype of this tensor."""
    # extra pseudo-operator symbols
    def __dot__(left, right):
@@ -542,13 +560,13 @@ class _tensor_py_operators:
    dot = __dot__
    def sum(self, axis=None, dtype=None, keepdims=False, acc_dtype=None):
-        """See `theano.tensor.sum`"""
+        """See `theano.tensor.sum`."""
        return theano.tensor.basic.sum(self, axis=axis,
                                       dtype=dtype, keepdims=keepdims,
                                       acc_dtype=acc_dtype)
    def prod(self, axis=None, dtype=None, keepdims=False, acc_dtype=None):
-        """See `theano.tensor.prod`"""
+        """See `theano.tensor.prod`."""
        return theano.tensor.basic.prod(self, axis=axis,
                                        dtype=dtype, keepdims=keepdims,
                                        acc_dtype=acc_dtype)
@@ -564,49 +582,49 @@ class _tensor_py_operators:
                theano.tensor.basic.abs_(self), L).sum(axis=axis), 1.0 / L)
    def mean(self, axis=None, dtype=None, keepdims=False, acc_dtype=None):
-        """See `theano.tensor.mean`"""
+        """See `theano.tensor.mean`."""
        return theano.tensor.basic.mean(self, axis=axis,
                                        dtype=dtype, keepdims=keepdims,
                                        acc_dtype=acc_dtype)
    def var(self, axis=None, keepdims=False):
-        """See `theano.tensor.var`"""
+        """See `theano.tensor.var`."""
        return theano.tensor.basic.var(self, axis, keepdims=keepdims)
    def std(self, axis=None, keepdims=False):
-        """See `theano.tensor.std`"""
+        """See `theano.tensor.std`."""
        return theano.tensor.basic.std(self, axis, keepdims=keepdims)
    def min(self, axis=None, keepdims=False):
-        """See `theano.tensor.min`"""
+        """See `theano.tensor.min`."""
        return theano.tensor.basic.min(self, axis, keepdims=keepdims)
    def max(self, axis=None, keepdims=False):
-        """See `theano.tensor.max`"""
+        """See `theano.tensor.max`."""
        return theano.tensor.basic.max(self, axis, keepdims=keepdims)
    def argmin(self, axis=None, keepdims=False):
-        """See `theano.tensor.argmin`"""
+        """See `theano.tensor.argmin`."""
        return theano.tensor.basic.argmin(self, axis, keepdims=keepdims)
    def argmax(self, axis=None, keepdims=False):
-        """See `theano.tensor.argmax`"""
+        """See `theano.tensor.argmax`."""
        return theano.tensor.basic.argmax(self, axis, keepdims=keepdims)
    def nonzero(self, return_matrix=False):
-        """See `theano.tensor.nonzero`"""
+        """See `theano.tensor.nonzero`."""
        return theano.tensor.basic.nonzero(self, return_matrix=return_matrix)
    def nonzero_values(self):
-        """See `theano.tensor.nonzero_values`"""
+        """See `theano.tensor.nonzero_values`."""
        return theano.tensor.basic.nonzero_values(self)
    def sort(self, axis=-1, kind='quicksort', order=None):
-        """See `theano.tensor.sort`"""
+        """See `theano.tensor.sort`."""
        return theano.tensor.sort(self, axis, kind, order)
    def argsort(self, axis=-1, kind='quicksort', order=None):
-        """See `theano.tensor.argsort`"""
+        """See `theano.tensor.argsort`."""
        return theano.tensor.argsort(self, axis, kind, order)
    def clip(self, a_min, a_max):
@@ -614,17 +632,17 @@ class _tensor_py_operators:
        return theano.tensor.basic.clip(self, a_min, a_max)
    def conj(self):
-        """See `theano.tensor.conj`"""
+        """See `theano.tensor.conj`."""
        return theano.tensor.basic.conj(self)
    conjugate = conj
    def repeat(self, repeats, axis=None):
-        """See `theano.tensor.repeat`"""
+        """See `theano.tensor.repeat`."""
        return theano.tensor.extra_ops.repeat(self, repeats, axis)
    def round(self, mode="half_away_from_zero"):
-        """See `theano.tensor.round`"""
+        """See `theano.tensor.round`."""
        return theano.tensor.basic.round(self, mode)
    def trace(self):
@@ -646,12 +664,13 @@ class _tensor_py_operators:
        return theano.tensor.extra_ops.cumprod(self, axis)
    def ptp(self, axis=None):
-        """see 'theano.tensor.ptp'"""
+        """See `theano.tensor.ptp`."""
        return theano.tensor.ptp(self, axis)
    def swapaxes(self, axis1, axis2):
-        """Return 'tensor.swapaxes(self, axis1, axis2)
+        """
+        Return `tensor.swapaxes(self, axis1, axis2)`.
        If a matrix is provided with the right axes, its transpose
        will be returned.
@@ -660,32 +679,38 @@ class _tensor_py_operators:
        return theano.tensor.basic.swapaxes(self, axis1, axis2)
    def fill(self, value):
-        """Fill inputted tensor with the assigned value"""
+        """Fill inputted tensor with the assigned value."""
        return theano.tensor.basic.fill(self, value)
    def choose(self, a, choices, out=None, mode='raise'):
-        """Construct an array from an index array and a set of arrays to choose from."""
+        """
+        Construct an array from an index array and a set of arrays to choose
+        from.
+        """
        return theano.tensor.basic.choose(self, a, choices, out=None,
                                          mode='raise')
    def squeeze(self):
-        """Remove broadcastable dimensions from
+        """
-        the shape of an array.
+        Remove broadcastable dimensions from the shape of an array.
+        It returns the input array, but with the broadcastable dimensions
+        removed. This is always the tensor itself or a view into it.
-        It returns the input array, but with the
-        broadcastable dimensions removed. This is
-        always `x` itself or a view into `x`.
        """
        return theano.tensor.extra_ops.squeeze(self)
    def compress(self, a, axis=None):
-        """Return selected slices only
+        """Return selected slices only."""
-        """
        return theano.tensor.extra_ops.compress(self, a, axis=axis)
 class TensorVariable(_tensor_py_operators, Variable):
-    """Subclass to add the tensor operators to the basic `Variable` class."""
+    """
+    Subclass to add the tensor operators to the basic `Variable` class.
+    """
    def __init__(self, type, owner=None, index=None, name=None):
        super(TensorVariable, self).__init__(type, owner=owner,
@@ -721,9 +746,11 @@ TensorType.Variable = TensorVariable
 class TensorConstantSignature(tuple):
-    """A Signature object for comparing TensorConstant instances
+    """
+    A Signature object for comparing TensorConstant instances.
    An instance is a pair: (Type instance, ndarray).
    """
    def __eq__(self, other):
        if type(self) != type(other):
@@ -814,6 +841,7 @@ class TensorConstant(_tensor_py_operators, Constant):
    """Subclass to add the tensor operators to the basic `Constant` class.
    To create a TensorConstant, use the `constant` function in this module.
    """
    def __init__(self, type, data, name=None):
        Constant.__init__(self, type, data, name)

--- a/theano/tensor/xlogx.py
+++ b/theano/tensor/xlogx.py
@@ -7,6 +7,7 @@ from theano import scalar
 class XlogX(scalar.UnaryScalarOp):
    """
    Compute X * log(X), with special case 0 log(0) = 0.
    """
    @staticmethod
    def st_impl(x):
@@ -39,6 +40,7 @@ xlogx = Elemwise(scalar_xlogx, name='xlogx')
 class XlogY0(scalar.BinaryScalarOp):
    """
    Compute X * log(Y), with special case 0 log(0) = 0.
    """
    @staticmethod
    def st_impl(x, y):