Merge pull request #401 from nouiz/doc_op

Doc op

Merge pull request #401 from nouiz/doc_op
d6ed790a · lamblin · 41103b5d · 14a0070c · d6ed790a · d6ed790a
--- a/NEWS.txt
+++ b/NEWS.txt
--- a/doc/library/gradient.txt
+++ b/doc/library/gradient.txt
@@ -16,57 +16,5 @@ function does the underlying work, and is more flexible, but is also more
 awkward to use when :func:`tensor.grad` can do the job.


-.. function:: grad_sources_inputs(sources, graph_inputs, warn_type=True)
-
-    A gradient source is a pair (``v``, ``g_v``), in which ``v`` is
-    a `Variable`, and ``g_v`` is a `Variable` that is a gradient wrt
-    ``v``. More specifically, ``g_v`` is the gradient of an external
-    scalar cost, ``cost`` (that is not explicitly used), wrt ``v``.
-
-    This function traverses the graph backward from the ``r`` sources,
-    calling ``op.grad(...)`` for all ops with some non-None gradient
-    on an output, to compute gradients of ``cost`` wrt intermediate
-    variables and ``graph_inputs``.
-
-    The ``op.grad(...)`` functions are called like this:
-
-    .. code-block:: python
-
-        op.grad(op.inputs[:], [total_gradient(v) for v in op.outputs])
-
-    This call to ``op.grad`` should return a list or tuple: one symbolic
-    gradient per input. These gradients represent the gradients of
-    the same implicit ``cost`` mentionned above, wrt ``op.inputs``.  Note
-    that this is **not** the same as the gradient of ``op.outputs`` wrt
-    ``op.inputs``.
-
-    If ``op`` has a single input, then ``op.grad`` should return a list
-    or tuple of length 1.
-    For each input wrt to which ``op`` is not differentiable, it should
-    return ``None`` instead of a `Variable` instance.
-
-    If a source ``r`` receives a gradient from another source ``r2``,
-    then the effective gradient on ``r`` is the sum of both gradients.
-
-
-    :type sources: list of pairs of Variable: (v, gradient-on-v) to 
-                   initialize the total_gradient dictionary
-
-    :param sources: gradients to back-propagate using chain rule
-
-    :param warn_type: True will trigger warnings via the logging module when
-       the gradient on an expression has a different type than the original
-       expression
-
-    :type warn_type: bool
-
-    :type graph_inputs: list of Variable
-
-    :param graph_inputs: variables considered to be constant 
-                         (do not backpropagate through them)
-
-    :rtype: dictionary whose keys and values are of type `Variable`
-
-    :returns: mapping from each Variable encountered in the backward traversal to its [total] gradient.
-
-
+.. automodule:: theano.gradient
+    :members:
--- a/doc/library/sandbox/cuda/index.txt
+++ b/doc/library/sandbox/cuda/index.txt
@@ -15,6 +15,4 @@

    var
    type
-
-
-
+    op
--- a/doc/library/sandbox/cuda/op.txt
+++ b/doc/library/sandbox/cuda/op.txt
+.. _libdoc_cuda_op:
+
+=======================================================
+:mod:`sandbox.cuda` -- List of implemented CUDA GPU Ops
+=======================================================
+
+.. moduleauthor:: LISA
+
+Normally you should not call these Ops directly! Theano should automatically transform cpu ops to their gpu equivalents, so this list is mainly useful to let people know what is implemented on the gpu.
+
+Basic Op
+========
+
+.. automodule:: theano.sandbox.cuda.basic_ops
+    :members:
+
+Blas Op
+=======
+
+.. automodule:: theano.sandbox.cuda.blas
+    :members:
+
+Nnet Op
+=======
+
+.. automodule:: theano.sandbox.cuda.nnet
+    :members:
+
+Curand Op
+=========
+
+Random generator based on the CURAND libraries. It is not inserted automatically.
+
+.. automodule:: theano.sandbox.cuda.rng_curand
+    :members:
--- a/doc/tutorial/gradients.txt
+++ b/doc/tutorial/gradients.txt
@@ -94,9 +94,14 @@ of symbolic differentiation).
 Computing the Jacobian
 ======================

-In order to compute the Jacobian of some function ``y`` with respect to some
-parameter ``x`` we need to use the ``scan``. What we do is to loop over the
-entries in ``y`` and compute the gradient of ``y[i]`` with respect to ``x``.
+Theano implements the :func:`theano.gradient.jacobian` macro, which does
+all that is needed to compute the Jacobian. The following text explains how
+to do it manually.
+
+In order to manually compute the Jacobian of some function ``y`` with
+respect to some parameter ``x`` we need to use ``scan``. What we
+do is to loop over the entries in ``y`` and compute the gradient of
+``y[i]`` with respect to ``x``.

 .. note::
    
@@ -129,12 +134,17 @@ matrix, which corresponds to the Jacobian.
    seems possible. The reason is that ``y_i`` will not be a function of
    ``x`` anymore, while ``y[i]`` still is. 

+
 Computing the Hessian
 =====================

-Similar to computing the Jacobian we can also compute the Hessian. The only
+Theano implements the :func:`theano.gradient.hessian` macro, which does
+all that is needed to compute the Hessian. The following text explains how
+to do it manually.
+
+You can compute the Hessian manually in the same way as the Jacobian. The only
 difference is that now, instead of computing the Jacobian of some expression
-``y``, we compute the jacobian of ``T.grad(cost,x)``, where ``cost`` is some
+``y``, we compute the Jacobian of ``T.grad(cost,x)``, where ``cost`` is some
 scalar. 



--- a/theano/gradient.py
+++ b/theano/gradient.py
@@ -58,14 +58,50 @@ def format_as(use_list, use_tuple, outputs):

 def grad_sources_inputs(sources, graph_inputs, warn_type=True):
    """
-    :type sources: list of pairs of Variable: (v, gradient-on-v)
+    A gradient source is a pair (``v``, ``g_v``), in which ``v`` is
+    a `Variable`, and ``g_v`` is a `Variable` that is a gradient wrt
+    ``v``. More specifically, ``g_v`` is the gradient of an external
+    scalar cost, ``cost`` (that is not explicitly used), wrt ``v``.
+
+    This function traverses the graph backward from the ``r`` sources,
+    calling ``op.grad(...)`` for all ops with some non-None gradient
+    on an output, to compute gradients of ``cost`` wrt intermediate
+    variables and ``graph_inputs``.
+
+    The ``op.grad(...)`` functions are called like this:
+
+    .. code-block:: python
+
+        op.grad(op.inputs[:], [total_gradient(v) for v in op.outputs])
+
+    This call to ``op.grad`` should return a list or tuple: one symbolic
+    gradient per input. These gradients represent the gradients of
+    the same implicit ``cost`` mentioned above, wrt ``op.inputs``.  Note
+    that this is **not** the same as the gradient of ``op.outputs`` wrt
+    ``op.inputs``.
+
+    If ``op`` has a single input, then ``op.grad`` should return a list
+    or tuple of length 1.
+    For each input wrt which ``op`` is not differentiable, it should
+    return ``None`` instead of a `Variable` instance.
+
+    If a source ``r`` receives a gradient from another source ``r2``,
+    then the effective gradient on ``r`` is the sum of both gradients.
+
+
+
+    :type sources: list of pairs of Variable: (v, gradient-on-v) to
+                   initialize the total_gradient dictionary
    :param sources: gradients to back-propagate using chain rule
    :type graph_inputs: list of Variable
    :param graph_inputs: variables considered to be constant
        (do not backpropagate through them)
+    :type warn_type: bool
+    :param warn_type: True will trigger warnings via the logging module when
+       the gradient on an expression has a different type than the original
+       expression

-    :rtype: dictionary whose keys and values are of type `Variable`
-
+    :rtype: dictionary whose keys and values are of type Variable
    :return: mapping from each Variable encountered in the backward
        traversal to the gradient with respect to that Variable.

@@ -73,9 +109,6 @@ def grad_sources_inputs(sources, graph_inputs, warn_type=True):
    sources, so that for each v, gradient-on-v is the gradient of J with
    respect to v

-
-
-
    """
    gmap = {}
    for (r, g_r) in sources:
@@ -182,23 +215,22 @@ def Rop(f, wrt, eval_points):
    in `eval_points`. Mathematically this stands for the jacobian of `f` wrt
    to `wrt` right muliplied by the eval points.

-    :type f: `Variable` or list of `Variable`s
-        `f` stands for the output of the computational graph to which you
-        want to apply the R operator
-    :type wrt: `Variable` or list of `Variables`s
-        variables for which you compute the R operator of the expression
-        described by `f`
-    :type eval_points: `Variable` or list of `Variable`s
-        evalutation points for each of the variables in `wrt`
-
-    :rtype: `Variable` or list/tuple of `Variable`s depending on type of f
+    :type f: Variable or list of Variables
+             `f` stands for the output of the computational graph to which you
+             want to apply the R operator
+    :type wrt: Variable or list of Variables
+               variables for which you compute the R operator of the expression
+               described by `f`
+    :type eval_points: Variable or list of Variables
+                       evaluation points for each of the variables in `wrt`
+    :rtype: Variable or list/tuple of Variables depending on type of f
    :return: symbolic expression such that
        R_op[i] = sum_j ( d f[i] / d wrt[j]) eval_point[j]
        where the indices in that expression are magic multidimensional
        indices that specify both the position within a list and all
        coordinates of the tensor element in the last.
        If `wrt` is a list/tuple, then return a list/tuple with the results.
-        """
+    """
    from theano.tensor import as_tensor_variable
    using_list = isinstance(f, list)
    using_tuple = isinstance(f, tuple)
@@ -295,16 +327,16 @@ def Lop(f, wrt, eval_points, consider_constant=None, warn_type=False,
    in `eval_points`. Mathematically this stands for the jacobian of `f` wrt
    to `wrt` left muliplied by the eval points.

-    :type f: `Variable` or list of `Variable`s
+    :type f: Variable or list of Variables
        `f` stands for the output of the computational graph to which you
        want to apply the L operator
-    :type wrt: `Variable` or list of `Variables`s
+    :type wrt: Variable or list of Variables
        variables for which you compute the L operator of the expression
        described by `f`
-    :type eval_points: `Variable` or list of `Variable`s
-        evalutation points for each of the variables in `f`
+    :type eval_points: Variable or list of Variables
+                        evaluation points for each of the variables in `f`

-    :rtype: `Variable` or list/tuple of `Variable`s depending on type of f
+    :rtype: Variable or list/tuple of Variables depending on type of f
    :return: symbolic expression such that
        L_op[i] = sum_i ( d f[i] / d wrt[j]) eval_point[i]
        where the indices in that expression are magic multidimensional
@@ -374,9 +406,9 @@ def Lop(f, wrt, eval_points, consider_constant=None, warn_type=False,
 def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False,
         disconnected_inputs='raise'):
    """
-    :type cost: Scalar (0-dimensional) `Variable`
-    :type wrt: `Variable` or list of `Variable`s.
-    :type g_cost: Scalar `Variable`, or None
+    :type cost: Scalar (0-dimensional) Variable.
+    :type wrt: Variable or list of Variables.
+    :type g_cost: Scalar Variable, or None.
    :param g_cost: an expression for the gradient through cost.  The default is
        ``ones_like(cost)``.
    :param consider_constant: a list of expressions not to backpropagate
@@ -393,7 +425,7 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False,
        - 'warn': consider the gradient zero, and print a warning.
        - 'raise': raise an exception.

-    :rtype: `Variable` or list/tuple of `Variable`s (depending upon `wrt`)
+    :rtype: Variable or list/tuple of Variables (depending upon `wrt`)

    :return: symbolic expression of gradient of `cost` with respect to `wrt`.
             If an element of `wrt` is not differentiable with respect
@@ -672,9 +704,9 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
    """ Test a gradient by Finite Difference Method. Raise error on failure.

    Example:
-    >>> verify_grad(theano.tensor.tanh,
-                    (numpy.asarray([[2,3,4], [-1, 3.3, 9.9]]),),
-                    rng=numpy.random)
+        >>> verify_grad(theano.tensor.tanh,
+                        (numpy.asarray([[2,3,4], [-1, 3.3, 9.9]]),),
+                        rng=numpy.random)

    Raises an Exception if the difference between the analytic gradient and
    numerical gradient (computed through the Finite Difference Method) of a
@@ -841,8 +873,8 @@ verify_grad.E_grad = GradientError
 def jacobian(expression, wrt, consider_constant=None, warn_type=False,
             disconnected_inputs='raise'):
    """
-    :type expression: Vector (1-dimensional) `Variable`
-    :type wrt: 'Variable' or list of `Variables`s
+    :type expression: Vector (1-dimensional) Variable
+    :type wrt: Variable or list of Variables

    :param consider_constant: a list of expressions not to backpropagate
        through
@@ -858,7 +890,7 @@ def jacobian(expression, wrt, consider_constant=None, warn_type=False,
        - 'warn': consider the gradient zero, and print a warning.
        - 'raise': raise an exception.

-    :return: either a instance of `Variable` or list/tuple of `Variable`s
+    :return: either an instance of Variable or list/tuple of Variables
            (depending upon `wrt`) repesenting the jacobian of `expression`
            with respect to (elements of) `wrt`. If an element of `wrt` is not
            differentiable with respect to the output, then a zero
@@ -914,9 +946,9 @@ def jacobian(expression, wrt, consider_constant=None, warn_type=False,
 def hessian(cost, wrt, consider_constant=None, warn_type=False,
             disconnected_inputs='raise'):
    """
-    :type cost: Scalar (0-dimensional) `Variable`
+    :type cost: Scalar (0-dimensional) Variable.
    :type wrt: Vector (1-dimensional tensor) 'Variable' or list of
-            vectors (1-dimensional tensors) `Variable`s
+               vectors (1-dimensional tensors) Variables

    :param consider_constant: a list of expressions not to backpropagate
        through
@@ -932,7 +964,7 @@ def hessian(cost, wrt, consider_constant=None, warn_type=False,
        - 'warn': consider the gradient zero, and print a warning.
        - 'raise': raise an exception.

-    :return: either a instance of `Variable` or list/tuple of `Variable`s
+    :return: either an instance of Variable or list/tuple of Variables
            (depending upon `wrt`) repressenting the Hessian of the `cost`
            with respect to (elements of) `wrt`. If an element of `wrt` is not
            differentiable with respect to the output, then a zero

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -34,6 +34,9 @@ def as_cuda_array(obj):
        raise TypeError("Don't know how to cast to a CudaNdarray object")

 class HostFromGpu(Op):
+    """
+    Implement the transfer from gpu to the cpu.
+    """
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
@@ -63,6 +66,9 @@ class HostFromGpu(Op):
 host_from_gpu = HostFromGpu()

 class GpuFromHost(Op):
+    """
+    Implement the transfer from cpu to the gpu.
+    """
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
@@ -93,6 +99,9 @@ class GpuFromHost(Op):
 gpu_from_host = GpuFromHost()

 class GpuElemwise(Op):
+    """
+    Implement a generic elemwise on the gpu.
+    """
    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)

@@ -200,6 +209,9 @@ class GpuElemwise(Op):
        return self.src_generator.cache_version

 class GpuDimShuffle(Op):
+    """
+    Implement DimShuffle on the gpu.
+    """
    def __init__(self, input_broadcastable, new_order):
        input_broadcastable = tuple(input_broadcastable)
        self.input_broadcastable = input_broadcastable
@@ -403,7 +415,7 @@ class GpuSum(Op):
      - reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor.

    :note: any reduce_mask of all zeros is a sort of 'copy', and may be removed during graph
-    optimization
+           optimization

    """
    def __init__(self, reduce_mask):
@@ -1706,6 +1718,9 @@ class GpuSum(Op):
        return sio.getvalue()

 class GpuReshape(tensor.Reshape):
+    """
+    Implement Reshape on the gpu.
+    """
    # __hash__, __eq__, __str__ come from tensor.Subtensor
    def make_node(self, x, shp):
        host_reshaped = host_from_gpu(x).reshape(shp,ndim=self.ndim)
@@ -1719,6 +1734,9 @@ class GpuReshape(tensor.Reshape):
        out[0] = x.reshape(tuple(shp))

 class GpuSubtensor(tensor.Subtensor):
+    """
+    Implement subtensor on the gpu.
+    """
    # __hash__, __eq__, __str__ come from tensor.Subtensor
    def make_node(self, x, *inputs):
        assert isinstance(x.type, CudaNdarrayType)
@@ -1747,6 +1765,9 @@ class GpuSubtensor(tensor.Subtensor):
        out[0] = x.__getitem__(cdata)

 class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1):
+    """
+    Implement AdvancedSubtensor1 on the gpu.
+    """
    def make_node(self, x, ilist):
        x_ = as_cuda_ndarray_variable(x)
        ilist_ = tensor.as_tensor_variable(ilist)
@@ -1770,6 +1791,9 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1):
        out[0] = o

 class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1):
+    """
+    Implement AdvancedIncSubtensor1 on the gpu.
+    """
    def make_node(self, x, y, ilist):
        x_ = as_cuda_ndarray_variable(x)
        y_ = as_cuda_ndarray_variable(y)
@@ -1795,6 +1819,9 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1):
        # so we use the parent version that loop on each indices.

 class GpuIncSubtensor(tensor.IncSubtensor):
+    """
+    Implement IncSubtensor on the gpu.
+    """
    def make_node(self, x, y, *inputs):
        assert isinstance(x.type, CudaNdarrayType)
        assert isinstance(y.type, CudaNdarrayType)
@@ -1802,6 +1829,9 @@ class GpuIncSubtensor(tensor.IncSubtensor):
        return Apply(self, [x,y]+rval.inputs[2:], [x.type()])

 class GpuFlatten(tensor.Flatten):
+    """
+    Implement Flatten on the gpu.
+    """
    def make_node(self, x ):
        assert isinstance(x.type, CudaNdarrayType)
        rval = tensor.Flatten.make_node(self, x)
@@ -1810,11 +1840,17 @@ class GpuFlatten(tensor.Flatten):
        return Apply(self, [x], [out_type()])

 class GpuShape(tensor.Shape):
+    """
+    Implement Shape on the gpu.
+    """
    def make_node(self, x):
        return Apply(self, [x], [tensor.lvector()])
 gpu_shape = GpuShape()

 class GpuJoin(tensor.Join):
+    """
+    Implement Join on the gpu.
+    """
    def make_node(self, *axis_and_tensors):
        axis, tensors = axis_and_tensors[0], axis_and_tensors[1:]
        if not tensors:
@@ -1889,6 +1925,9 @@ class GpuJoin(tensor.Join):
 gpu_join = GpuJoin()

 class GpuAlloc(Op):
+    """
+    Implement Alloc on the gpu.
+    """
    def __init__(self):
        pass

@@ -1967,7 +2006,12 @@ class GpuAlloc(Op):

 gpu_alloc = GpuAlloc()

+
 class GpuContiguous(Op):
+    """
+    Always return a c contiguous output. Copy the input only if it is
+    not already c contiguous.
+    """
    view_map = {0: [0]}

    def __eq__(self, other):

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -6,6 +6,9 @@ import cuda_ndarray.cuda_ndarray as cuda
 from theano.sandbox.cuda.type import CudaNdarrayType

 class GpuDot22(Op):
+    """
+    Implement dot(2d, 2d) on the gpu.
+    """
    def __str__(self):
        return 'GpuDot22'
    def __eq__(self, other):
@@ -74,6 +77,9 @@ class GpuDot22(Op):
 gpu_dot22 = GpuDot22()

 class GpuDot22Scalar(Op):
+    """
+    Implement dot(2d, 2d) * scalar on the gpu.
+    """
    def __str__(self):
        return 'GpuDot22Scalar'
    def __eq__(self, other):
@@ -434,6 +440,7 @@ gpu_ger_no_inplace = GpuGer(inplace=False)
 gpu_ger_inplace = GpuGer(inplace=True)

 class GpuOuter(Op):
+    """ Implement outer on the gpu."""
    def make_node(self, x, y):
        # we suppose type checking has been done, but make sure.
        assert (x.type.ndim == 1 and y.type.ndim == 1 and
@@ -526,6 +533,9 @@ gpu_outer = GpuOuter()
 # Not really a BLAS operation, but whatever.
 #
 class GpuConv(Op):
+    """
+    Implement the batched and stacked 2d convolution on the gpu.
+    """
    @staticmethod
    def logical_output_shape_2d(imshp, kshp, mode):
        if mode == 'valid':
@@ -689,6 +699,9 @@ class GpuConv(Op):


 class GpuDownsampleFactorMax(Op):
+    """
+    Implement downsample with max on the gpu.
+    """
    def __init__(self, ds, ignore_border=False):
        self.ds = tuple(ds)
        self.ignore_border = ignore_border
@@ -846,6 +859,9 @@ class GpuDownsampleFactorMax(Op):
        """ % locals()

 class GpuDownsampleFactorMaxGrad(Op):
+    """
+    Implement the grad of downsample with max on the gpu.
+    """
    def __init__(self, ds, ignore_border):
        self.ds = tuple(ds)
        self.ignore_border = ignore_border

--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -6,7 +6,11 @@ from theano.sandbox.cuda.type import CudaNdarrayType

 from theano.sandbox.cuda.kernel_codegen import nvcc_kernel, inline_reduce_max, inline_reduce_sum, inline_softmax

+
 class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
+    """
+    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
+    """
    nin=3
    nout=3
    def __eq__(self, other):
@@ -177,6 +181,9 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()

 class GpuCrossentropySoftmax1HotWithBiasDx (Op):
+    """
+    Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
+    """
    nin=3
    nout=1
    """Gradient wrt x of the CrossentropySoftmax1Hot Op"""
@@ -296,7 +303,9 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
 gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()

 class GpuSoftmax (Op):
-    """Writeme"""
+    """
+    Implement Softmax on the gpu.
+    """
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
@@ -392,7 +401,9 @@ class GpuSoftmax (Op):
 gpu_softmax = GpuSoftmax()

 class GpuSoftmaxWithBias (Op):
-    """Writeme"""
+    """
+    Implement SoftmaxWithBias on the gpu.
+    """
    nin = 2
    nout = 1
    def __eq__(self, other):

--- a/theano/sandbox/cuda/rng_curand.py
+++ b/theano/sandbox/cuda/rng_curand.py
@@ -247,7 +247,8 @@ class CURAND_Uniform(CURAND_Base):


 class CURAND_RandomStreams(object):
-    """RandomStreams instance that creates CURAND-based random variables.
+    """
+    RandomStreams instance that creates CURAND-based random variables.

    One caveat is that generators are not serializable.
    """

--- a/theano/sandbox/linalg/ops.py
+++ b/theano/sandbox/linalg/ops.py
@@ -535,7 +535,7 @@ class MatrixInverse(Op):
    and :math:`A_{inv} \cdot A` equals the identity matrix :math:`I`.

    :note: When possible, the call to this op will be optimized to the call
-    of ``solve``.
+           of ``solve``.
    """

    def __init__(self):