Commit 4e5b2223 authored by Kelvin Xu, committed by Kelvin Xu

subgrad documentation moved from tensor to theano.gradient

Parent 4829418f
...@@ -1658,100 +1658,6 @@ Gradient / Differentiation
:rtype: variable or list of variables (matching `wrt`)
:returns: gradients of the cost with respect to each of the `wrt` terms
.. function:: subgraph_grad(wrt, end, start=None, cost=None, details=False)
With respect to `wrt`, computes gradients of `cost` and/or from existing
`start` gradients, up to the `end` variables of a symbolic digraph.
In other words, computes gradients for a subgraph of the
symbolic Theano graph. All disconnected inputs are ignored.
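A minimal sketch of a single call; the names and shapes below are illustrative assumptions, not part of the original documentation:
.. code-block:: python
import numpy as np
import theano
import theano.tensor as T
x = T.fvector('x')
W = theano.shared(np.random.randn(3, 4))
h = T.tanh(T.dot(x, W))
cost = T.sqr(h).sum()
# Gradients of `cost` with respect to W, treating x as the boundary of the
# subgraph; the gradient with respect to x is returned as well.
w_grads, end_grads = theano.subgraph_grad(wrt=[W], end=[x], cost=cost)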
This can be useful when one needs to perform gradient descent
iteratively (e.g. one layer at a time in an MLP), or when a particular
operation is not differentiable in Theano (e.g. stochastic sampling
from a multinomial). In the latter case, the gradient of the
non-differentiable process could be approximated by a user-defined
formula, which could be calculated using the gradients of a cost
with respect to the samples (0s and 1s). These gradients are obtained
by calling subgraph_grad from the `cost` or previously known gradients
(`start`) up to the outputs of the stochastic process (`end`).
A dictionary mapping variables to the gradients obtained from the
user-defined differentiation of the process could then be fed into
another subgraph_grad as `start`, together with any other `cost`
(e.g. weight decay), as sketched below.
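For illustration only, here is a hedged sketch of that pattern. The names (`probs`, `samples`, `downstream_cost`) and the straight-through gradient rule are placeholder assumptions, and a hard threshold stands in for the non-differentiable sampling step:
.. code-block:: python
import numpy as np
import theano
import theano.tensor as T
x = T.fmatrix('x')
W = theano.shared(np.random.randn(3, 4).astype('float32'))
probs = T.nnet.sigmoid(T.dot(x, W))         # differentiable part
samples = T.cast(probs > 0.5, 'float32')    # stand-in for a non-differentiable sampler
downstream_cost = T.sqr(samples - 1).sum()  # some cost of the sampled 0/1 values
# 1) gradients of the downstream cost up to the outputs of the "sampler"
_, end_grads = theano.subgraph_grad(wrt=[], end=[samples], cost=downstream_cost)
# 2) user-defined rule: pass the gradient straight through to `probs`
#    (a deliberately naive placeholder approximation)
start = {probs: end_grads[0]}
# 3) resume backpropagation below the sampler, adding a weight-decay cost
weight_decay = T.sqr(W).sum()
w_grads, _ = theano.subgraph_grad(wrt=[W], end=[x], start=start, cost=weight_decay)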
In an MLP, we could use subgraph_grad to iteratively backpropagate:
.. testcode:: subgraph_grad
import theano
import numpy as np
x, t = theano.tensor.fvector('x'), theano.tensor.fvector('t')
w1 = theano.shared(np.random.randn(3,4))
w2 = theano.shared(np.random.randn(4,2))
a1 = theano.tensor.tanh(theano.tensor.dot(x,w1))
a2 = theano.tensor.tanh(theano.tensor.dot(a1,w2))
cost2 = theano.tensor.sqr(a2 - t).sum()
cost2 += theano.tensor.sqr(w2.sum())
cost1 = theano.tensor.sqr(w1.sum())
params = [[w2],[w1]]
costs = [cost2,cost1]
grad_ends = [[a1], [x]]
next_grad = None
param_grads = []
for i in range(2):
    param_grad, next_grad = theano.subgraph_grad(
        wrt=params[i], end=grad_ends[i],
        start=next_grad, cost=costs[i]
    )
    next_grad = dict(zip(grad_ends[i], next_grad))
    param_grads.extend(param_grad)
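As a follow-up sketch (not part of the original example), the collected gradients could then drive a plain SGD step; the learning rate `lr` is an assumed constant:
.. code-block:: python
lr = 0.01
updates = [(p, p - lr * g) for p, g in zip([w2, w1], param_grads)]
train = theano.function(inputs=[x, t], outputs=costs, updates=updates)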
:type wrt: list of variables
:param wrt:
Gradients are computed with respect to `wrt`.
:type end: list of variables
:param end:
Theano variables at which to stop the backward gradient computation
(they are considered constant in theano.grad). For convenience, the
gradients with respect to these variables are also returned.
:type start: dictionary of variables
:param start:
If not None, a dictionary mapping variables to their
gradients. This is useful when the gradients of some variables
are already known. These are used to compute the gradients backwards up
to the variables in `end` (they are used as known_grads in
theano.grad).
:type cost: scalar (0-dimensional) variable
:param cost:
Additional costs for which to compute the gradients. For
example, these could be weight decay, an l1 constraint, MSE,
NLL, etc. May optionally be None if start is provided.
.. warning::
If the gradient of `cost` with respect to any of the `start`
variables is already part of the `start` dictionary, it
may be counted twice with respect to `wrt` and `end`.
:type details: bool
:param details:
When True, additionally returns the list of gradients from
`start` and of `cost`, respectively, with respect to `wrt` (not
`end`).
:rtype: Tuple of 2 or 4 Lists of Variables
:return: Returns lists of gradients with respect to `wrt` and `end`,
respectively.
.. versionadded:: 0.6.1
.. _R_op_list:
...
...@@ -567,6 +567,33 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
subgraph_grad as `start` with any other `cost` (e.g. weight
decay).
In an MLP, we could use subgraph_grad to iteratively backpropagate:
.. code-block:: python
import theano
import numpy as np
x, t = theano.tensor.fvector('x'), theano.tensor.fvector('t')
w1 = theano.shared(np.random.randn(3,4))
w2 = theano.shared(np.random.randn(4,2))
a1 = theano.tensor.tanh(theano.tensor.dot(x,w1))
a2 = theano.tensor.tanh(theano.tensor.dot(a1,w2))
cost2 = theano.tensor.sqr(a2 - t).sum()
cost2 += theano.tensor.sqr(w2.sum())
cost1 = theano.tensor.sqr(w1.sum())
params = [[w2],[w1]]
costs = [cost2,cost1]
grad_ends = [[a1], [x]]
next_grad = None
param_grads = []
for i in range(2):
    param_grad, next_grad = theano.subgraph_grad(
        wrt=params[i], end=grad_ends[i],
        start=next_grad, cost=costs[i]
    )
    next_grad = dict(zip(grad_ends[i], next_grad))
    param_grads.extend(param_grad)
:type wrt: list of variables
:param wrt:
Gradients are computed with respect to `wrt`.
...@@ -593,7 +620,14 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
: If the gradients of `cost` with respect to any of the `start`
variables is already part of the `start` dictionary, then it may
be counted twice with respect to `wrt` and `end`.
.. warning::
If the gradient of `cost` with respect to any of the `start`
variables is already part of the `start` dictionary, it
may be counted twice with respect to `wrt` and `end`.
:type details: bool
:param details:
When True, additionally returns the list of gradients from
...@@ -605,6 +639,7 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
:return: Returns lists of gradients with respect to `wrt` and `end`,
respectively.
.. versionadded:: 0.6.1
'''
assert ((cost is not None) or (start is not None))
assert isinstance(end, list)
...