Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
1b1e2ec3
提交
1b1e2ec3
authored
3月 11, 2014
作者:
Nicholas Leonard
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
initial commit
上级
a915ac04
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
129 行增加
和
1 行删除
+129
-1
__init__.py
theano/__init__.py
+1
-1
gradient.py
theano/gradient.py
+75
-0
test_gradient.py
theano/tests/test_gradient.py
+53
-0
没有找到文件。
theano/__init__.py
浏览文件 @
1b1e2ec3
...
...
@@ -79,7 +79,7 @@ from theano.updates import Updates, OrderedUpdates
#we don't import by default as we don't want to force having scipy installed.
#import sparse
from
theano.gradient
import
Rop
,
Lop
,
grad
from
theano.gradient
import
Rop
,
Lop
,
grad
,
subgrad
if
config
.
device
.
startswith
(
'gpu'
)
or
config
.
init_gpu_device
.
startswith
(
'gpu'
):
import
theano.sandbox.cuda
...
...
theano/gradient.py
浏览文件 @
1b1e2ec3
...
...
@@ -543,6 +543,81 @@ def grad(cost, wrt, consider_constant=None,
rval
,
=
rval
return
rval
def subgrad(wrt, grad_end, known_grads=None, cost=None, details=False):
    """Compute gradients for a subgraph of a symbolic Theano graph.

    With respect to `wrt`, computes the gradients of `known_grads`,
    `cost`, or both, backpropagating only down to the variables listed
    in `grad_end` (which are treated as constants). All disconnected
    inputs are ignored.

    This can be useful when one needs to perform gradient descent
    iteratively (e.g. one layer at a time in an MLP), or when a
    particular operation is not differentiable in Theano (e.g.
    stochastic sampling from a multinomial). In the latter case, the
    gradient of the non-differentiable process can be approximated by a
    user-defined formula computed from the gradients at the outputs of
    the process; these are obtained by performing a subgrad from the
    cost (or previously known_grads) up to the outputs of the process
    (`grad_end`). The user-defined gradients of the process can then be
    fed into another subgrad as `known_grads`, together with any other
    cost function (e.g. weight decay), and so on.

    Parameters
    ----------
    wrt : list
        Variables with respect to which the gradients are computed.
    grad_end : list
        Theano variables at which to stop the backpropagation of
        gradients (passed as ``consider_constant`` to ``theano.grad``).
    known_grads : dict, optional
        Mapping of variable -> gradient in the forward part (near the
        cost) of the graph for which gradients are already known. These
        are propagated backwards up to the variables in `grad_end`.
    cost : Theano scalar, optional
        Additional cost for which to compute gradients, e.g. weight
        decay or an L1 constraint on an output.
    details : bool, optional
        When True, also return the individual gradient lists derived
        from `known_grads` and `cost` (in the same order as `wrt`).

    Returns
    -------
    list
        Gradients of `wrt` (same order), summing the `known_grads` and
        `cost` contributions. When `details` is True, returns the tuple
        ``(grads, kg_grads, cost_grads)`` where the last two are the
        separate contributions (either may be None when the
        corresponding source was not supplied).
    """
    # At least one gradient source must be supplied.
    assert ((cost is not None) or (known_grads is not None))
    assert isinstance(grad_end, list)
    assert isinstance(wrt, list)
    if known_grads is not None:
        assert isinstance(known_grads, dict)

    # Contribution backpropagated from the user-supplied gradients.
    kg_grads = None
    if known_grads is not None:
        kg_grads = list(theano.grad(cost=None, wrt=wrt,
                                    known_grads=known_grads,
                                    consider_constant=grad_end,
                                    disconnected_inputs='ignore'))

    # Contribution backpropagated from the (optional) extra cost.
    cost_grads = None
    if cost is not None:
        cost_grads = list(theano.grad(cost=cost, wrt=wrt,
                                      consider_constant=grad_end,
                                      disconnected_inputs='ignore'))

    if kg_grads is None:
        grads = cost_grads
    else:
        grads = kg_grads
        if cost_grads is not None:
            # Sum both contributions elementwise, per wrt variable.
            for i in range(len(grads)):
                grads[i] += cost_grads[i]

    if details:
        return grads, kg_grads, cost_grads
    return grads
def
_node_to_pattern
(
node
):
""" given an apply node, obtain its connection pattern
...
...
theano/tests/test_gradient.py
浏览文件 @
1b1e2ec3
...
...
@@ -553,6 +553,59 @@ def test_disconnected_cost_grad():
except
theano
.
gradient
.
DisconnectedInputError
:
return
raise
AssertionError
(
"A disconnected gradient has been ignored."
)
def test_subgrad():
    # Tests that theano.grad with no known_grads matches what happens
    # if you chain successive subgrads, one "layer" at a time.
    x = theano.tensor.fvector('x')
    t = theano.tensor.fvector('t')
    w1 = theano.shared(np.random.randn(3, 4))
    w2 = theano.shared(np.random.randn(4, 2))

    # Two-layer tanh network; squared error plus penalty terms.
    a1 = theano.tensor.tanh(theano.tensor.dot(x, w1))
    a2 = theano.tensor.tanh(theano.tensor.dot(a1, w2))
    cost2 = theano.tensor.sqr(a2 - t).sum()
    cost2 += theano.tensor.sqr(w2.sum())
    cost1 = theano.tensor.sqr(w1.sum())

    # Per-layer parameter groups, costs and backprop stop points,
    # ordered from the output layer back towards the input.
    params = [[w2, a1], [w1, x]]
    costs = [cost2, cost1]
    grad_ends = [[a1], [x]]

    inputs = [t, x]
    rng = np.random.RandomState([2012, 11, 15])
    values = [rng.randn(2), rng.randn(3)]
    values = [np.cast[ipt.dtype](value)
              for ipt, value in zip(inputs, values)]

    # Reference: gradients of the full cost computed in a single pass.
    wrt = [w2, a1, w1, x]
    cost = cost2 + cost1
    true_grads = theano.grad(cost, wrt)
    true_grads = theano.function(inputs, true_grads)
    true_grads = true_grads(*values)

    # Chain subgrads: each layer's gradients feed the next call as
    # known_grads. (Was a `for i in xrange(2)` index loop whose `cost`
    # variable shadowed the reference cost above; also dropped the
    # leftover debug prints.)
    from theano.gof.python25 import OrderedDict
    known_grad = None
    params2 = []
    for param, layer_cost, grad_end in zip(params, costs, grad_ends):
        pgrad = theano.subgrad(wrt=param, grad_end=grad_end,
                               known_grads=known_grad, cost=layer_cost)
        known_grad = OrderedDict(zip(param, pgrad))
        params2.extend(pgrad)
    pgrads = theano.function(inputs, params2)
    pgrads = pgrads(*values)

    # Chained subgrads must reproduce the one-pass gradients.
    for true_grad, pgrad in zip(true_grads, pgrads):
        assert (np.sum(np.abs(true_grad - pgrad)) < 0.00001)


if __name__ == '__main__':
    unittest.main()
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论