Merge pull request #1762 from nicholas-leonard/master

Add subgraph_grad

Merge pull request #1762 from nicholas-leonard/master
79b1e64f · abergeron · 8df7d385 · c2247cd1 · 79b1e64f · 79b1e64f
--- a/doc/library/tensor/basic.txt
+++ b/doc/library/tensor/basic.txt
@@ -1563,6 +1563,86 @@ Gradient / Differentiation

    :rtype: variable or list of variables (matching `wrt`)
    :returns: gradients of the cost with respect to each of the `wrt` terms
+    
+.. function:: subgraph_grad(wrt, end, start=None, cost=None, details=False)
+
+    With respect to `wrt`, computes gradients of cost and/or from existing 
+    `start` gradients, up to the `end` variables of a symbolic digraph. 
+    In other words, computes gradients for a subgraph of the
+    symbolic theano function. Ignores all disconnected inputs.
+    
+    This can be useful when one needs to perform the gradient descent 
+    iteratively (e.g. one layer at a time in an MLP), or when a particular 
+    operation is not differentiable in theano (e.g. stochastic sampling 
+    from a multinomial). In the latter case, the gradient of the 
+    non-differentiable process could be approximated by user-defined 
+    formula, which could be calculated using the gradients of a cost 
+    with respect to samples (0s and 1s). These gradients are obtained 
+    by performing a subgraph_grad from the `cost` or previously known gradients 
+    (`start`) up to the outputs of the stochastic process (`end`). 
+    A dictionary mapping variables to the gradients obtained from the
+    user-defined differentiation of the process could then be fed into
+    another subgraph_grad as `start` with any other `cost` (e.g. weight decay).
+    
+    In an MLP, we could use subgraph_grad to iteratively backpropagate:
+    >>> x, t = theano.tensor.fvector('x'), theano.tensor.fvector('t')
+    >>> w1 = theano.shared(np.random.randn(3,4))
+    >>> w2 = theano.shared(np.random.randn(4,2))
+    >>> a1 = theano.tensor.tanh(theano.tensor.dot(x,w1))
+    >>> a2 = theano.tensor.tanh(theano.tensor.dot(a1,w2))
+    >>> cost2 = theano.tensor.sqr(a2 - t).sum() 
+    >>> cost2 += theano.tensor.sqr(w2.sum())
+    >>> cost1 = theano.tensor.sqr(w1.sum())
+    
+    >>> params = [[w2],[w1]]
+    >>> costs = [cost2,cost1]
+    >>> grad_ends = [[a1], [x]]
+    
+    >>> next_grad = None
+    >>> param_grads = []
+    >>> for i in xrange(2):
+    ...     param_grad, next_grad = theano.subgraph_grad(
+    ...         wrt=params[i], end=grad_ends[i],
+    ...         start=next_grad, cost=costs[i]
+    ...     )
+    ...     next_grad = dict(zip(grad_ends[i], next_grad))
+    ...     param_grads.extend(param_grad)
+    
+    :type wrt: List of Variables.
+    :param wrt: Gradients are computed with respect to `wrt`.
+    
+    :type end: List of Variables.
+    :param end: Theano variables at which to end the backpropagation
+        (they are considered constant in theano.grad).
+        For convenience, the gradients with respect to these variables
+        are also returned.
+    
+    :type start: Dictionary of Variables
+    :param start: If not None, a dictionary mapping variables to
+            their gradients. This is useful when the gradients of some
+            variables are known. These are used to compute the gradients
+            backwards up to the variables in `end`
+            (they are used as known_grads in theano.grad).
+    
+    :type cost: Scalar (0-dimensional) Variable.
+    :param cost: 
+            Additional costs for which to compute the gradients.  
+            For example, these could be weight decay, an l1 constraint,
+            MSE, NLL, etc. May optionally be None if start is provided.
+            Warning : If the gradients of `cost` with respect to any
+            of the `start` variables are already part of the `start`
+            dictionary, then they may be counted twice with respect to `wrt`
+            and `end`.
+    
+    :type details: bool.
+    :param details: When True, additionally returns the 
+        list of gradients from `start` and of `cost`, respectively, 
+        with respect to `wrt` (not `end`).
+    
+    :rtype: Tuple of 2 or 4 Lists of Variables
+    
+    :return: Returns lists of gradients with respect to `wrt` and `end`, 
+            respectively.


 .. _R_op_list:

--- a/theano/__init__.py
+++ b/theano/__init__.py
@@ -79,7 +79,7 @@ from theano.updates import Updates, OrderedUpdates
 #we don't import by default as we don't want to force having scipy installed.
 #import sparse

-from theano.gradient import Rop, Lop, grad
+from theano.gradient import Rop, Lop, grad, subgraph_grad

 if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
    import theano.sandbox.cuda

--- a/theano/gradient.py
+++ b/theano/gradient.py
@@ -544,6 +544,109 @@ def grad(cost, wrt, consider_constant=None,
        rval, = rval
    return rval

+def subgraph_grad(wrt, end, start=None, cost=None, details=False):
+    '''
+    With respect to `wrt`, computes gradients of cost and/or from existing
+    `start` gradients, up to the `end` variables of a symbolic digraph.
+    In other words, computes gradients for a subgraph of the
+    symbolic theano function. Ignores all disconnected inputs.
+
+    This can be useful when one needs to perform the gradient descent
+    iteratively (e.g. one layer at a time in an MLP), or when a particular
+    operation is not differentiable in theano (e.g. stochastic sampling
+    from a multinomial). In the latter case, the gradient of the
+    non-differentiable process could be approximated by user-defined
+    formula, which could be calculated using the gradients of a cost
+    with respect to samples (0s and 1s). These gradients are obtained
+    by performing a subgraph_grad from the `cost` or previously known
+    gradients (`start`) up to the outputs of the stochastic process (`end`).
+    A dictionary mapping variables to the gradients obtained from the
+    user-defined differentiation of the process could then be fed into
+    another subgraph_grad as `start` with any other `cost` (e.g. weight decay).
+
+    :type wrt: List of Variables.
+    :param wrt: Gradients are computed with respect to `wrt`.
+
+    :type end: List of Variables.
+    :param end: Theano variables at which to end the backpropagation
+        (they are passed as consider_constant to theano.grad).
+        For convenience, the gradients with respect to these variables
+        are also returned.
+
+    :type start: Dictionary of Variables
+    :param start: If not None, a dictionary mapping variables to
+        their gradients. This is useful when the gradients of some
+        variables are known. These are used to compute the gradients
+        backwards up to the variables in `end`
+        (they are passed as known_grads to theano.grad).
+
+    :type cost: Scalar (0-dimensional) Variable.
+    :param cost:
+        Additional costs for which to compute the gradients.
+        For example, these could be weight decay, an l1 constraint,
+        MSE, NLL, etc. May optionally be None if start is provided.
+        Warning : If the gradients of `cost` with respect to any
+        of the `start` variables are already part of the `start`
+        dictionary, then they may be counted twice with respect to `wrt`
+        and `end`.
+
+    :type details: bool.
+    :param details: When True, additionally returns the
+        list of gradients from `start` and of `cost`, respectively,
+        with respect to `wrt` (not `end`). Each of these lists is None
+        when the corresponding argument is None.
+
+    :rtype: Tuple of 2 or 4 Lists of Variables
+
+    :return: Returns lists of gradients with respect to `wrt` and `end`,
+        respectively.
+    '''
+    assert (cost is not None) or (start is not None), (
+        "at least one of `cost` and `start` must be provided")
+    assert isinstance(end, list)
+    assert isinstance(wrt, list)
+    if start is not None:
+        assert isinstance(start, dict)
+
+    # Differentiate once per unique variable; the set order is arbitrary,
+    # so results are always looked up by variable, never by position.
+    params = list(set(wrt + end))
+
+    def grads_by_param(**grad_kwargs):
+        # Maps each variable of `params` to its gradient from theano.grad.
+        grads = theano.grad(
+            wrt=params, consider_constant=end,
+            disconnected_inputs='ignore', **grad_kwargs)
+        return OrderedDict(zip(params, grads))
+
+    start_grads = None
+    cost_grads = None
+    if start is not None:
+        start_grads = grads_by_param(cost=None, known_grads=start)
+    if cost is not None:
+        cost_grads = grads_by_param(cost=cost)
+
+    if start_grads is None:
+        pgrads = cost_grads
+    elif cost_grads is None:
+        pgrads = start_grads
+    else:
+        # Build a new mapping instead of accumulating into start_grads,
+        # so the `details` output keeps the two contributions separate.
+        pgrads = OrderedDict(
+            (p, start_grads[p] + cost_grads[p]) for p in params)
+
+    # separate wrt from end grads:
+    wrt_grads = [pgrads[k] for k in wrt]
+    end_grads = [pgrads[k] for k in end]
+
+    # Per the documented contract, the detail lists are restricted to `wrt`.
+    if details:
+        return (wrt_grads, end_grads,
+                None if start_grads is None else [start_grads[k] for k in wrt],
+                None if cost_grads is None else [cost_grads[k] for k in wrt])
+
+    return wrt_grads, end_grads

 def _node_to_pattern(node):
    """ given an apply node, obtain its connection pattern

--- a/theano/tests/test_gradient.py
+++ b/theano/tests/test_gradient.py
@@ -554,6 +554,52 @@ def test_disconnected_cost_grad():
        except theano.gradient.DisconnectedInputError:
            return
        raise AssertionError("A disconnected gradient has been ignored.")
+        
+def test_subgraph_grad():
+    # Tests that the grad method with no known_grads
+    # matches what happens if you use successive subgraph_grads
+    from theano.gof.python25 import OrderedDict
+
+    # Seeded generator, so that weights and inputs are reproducible.
+    rng = np.random.RandomState([2012, 11, 15])
+
+    x = theano.tensor.fvector('x')
+    t = theano.tensor.fvector('t')
+    w1 = theano.shared(rng.randn(3, 4))
+    w2 = theano.shared(rng.randn(4, 2))
+    a1 = theano.tensor.tanh(theano.tensor.dot(x, w1))
+    a2 = theano.tensor.tanh(theano.tensor.dot(a1, w2))
+    cost2 = theano.tensor.sqr(a2 - t).sum()
+    cost2 += theano.tensor.sqr(w2.sum())
+    cost1 = theano.tensor.sqr(w1.sum())
+
+    params = [[w2], [w1]]
+    costs = [cost2, cost1]
+    grad_ends = [[a1], [x]]
+
+    inputs = [t, x]
+    values = [rng.randn(2), rng.randn(3)]
+    values = [np.cast[ipt.dtype](value) for ipt, value in zip(inputs, values)]
+
+    # Reference: gradients of the total cost computed in a single pass.
+    true_grads = theano.grad(cost2 + cost1, [w2, w1])
+    true_grads = theano.function(inputs, true_grads)(*values)
+
+    next_grad = None
+    param_grads = []
+    for layer_params, layer_cost, layer_ends in zip(params, costs, grad_ends):
+        param_grad, end_grad = theano.subgraph_grad(
+            wrt=layer_params, end=layer_ends,
+            start=next_grad, cost=layer_cost)
+        # The gradients at this layer's ends seed the next (lower) layer.
+        next_grad = OrderedDict(zip(layer_ends, end_grad))
+        param_grads.extend(param_grad)
+
+    pgrads = theano.function(inputs, param_grads)(*values)
+
+    assert len(pgrads) == len(true_grads)
+    for true_grad, pgrad in zip(true_grads, pgrads):
+        assert np.sum(np.abs(true_grad - pgrad)) < 0.00001


 class TestConsiderConstant(unittest.TestCase):