提交 1b1e2ec3 authored 作者: Nicholas Leonard's avatar Nicholas Leonard

initial commit

上级 a915ac04
...@@ -79,7 +79,7 @@ from theano.updates import Updates, OrderedUpdates ...@@ -79,7 +79,7 @@ from theano.updates import Updates, OrderedUpdates
#we don't import by default as we don't want to force having scipy installed. #we don't import by default as we don't want to force having scipy installed.
#import sparse #import sparse
from theano.gradient import Rop, Lop, grad from theano.gradient import Rop, Lop, grad, subgrad
if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'): if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
import theano.sandbox.cuda import theano.sandbox.cuda
......
...@@ -543,6 +543,81 @@ def grad(cost, wrt, consider_constant=None, ...@@ -543,6 +543,81 @@ def grad(cost, wrt, consider_constant=None,
rval, = rval rval, = rval
return rval return rval
def subgrad(wrt, grad_end, known_grads=None, cost=None, details=False):
    '''
    With respect to wrt, computes gradients of known_grads, cost,
    or both, up to grad_end theano variables in the theano digraph.

    In other words, computes gradients for a subgraph of the
    symbolic theano function. Ignores all disconnected inputs.

    This can be useful when one needs to perform the gradient descent
    iteratively (e.g. one layer at a time in an MLP), or when a particular
    operation is not differentiable in theano (e.g. stochastic sampling
    from a multinomial). In the latter case, the gradient of the
    non-differentiable process could be approximated by a user-defined
    formula which could be calculated using the gradients at the
    output of the process. These are obtained by performing a subgrad
    from the cost or previously known_grads up to the outputs of the
    process (grad_end). The gradients obtained from the user-defined
    gradient of the process can then be fed into another subgrad as
    known_grads with any other cost functions (e.g. weight decay), and
    so on.

    Parameters
    ----------
    wrt : list
        gradients are computed with regard to (wrt) these variables.
    grad_end : list
        theano variables where to stop the backpropagation of gradients
        (they will be considered constant in theano.grad).
    known_grads : dict
        variable, gradient (key, value) pairs in the forward part
        (near cost) of the graph for which gradients are known.
        These will be used to compute the gradients backwards
        up to the variables in grad_end.
    cost : theano scalar
        additional cost for which to compute the gradients. For
        example, this could be weight decay, or an l1 constraint on
        the output.
    details : bool
        when True, additionally return the lists of gradients derived
        from known_grads and from cost, respectively (in the same order
        as wrt).

    Returns
    -------
    list
        The gradients of wrt (summing the known_grads- and cost-derived
        contributions when both are provided). When details is True,
        returns the tuple (grads, kg_grads, cost_grads) instead.
    '''
    # At least one gradient source is required.
    assert ((cost is not None) or (known_grads is not None))
    assert isinstance(grad_end, list)
    assert isinstance(wrt, list)
    if known_grads is not None:
        assert isinstance(known_grads, dict)

    kg_grads = None
    cost_grads = None
    if known_grads is not None:
        # Backpropagate the externally supplied gradients down to wrt,
        # treating grad_end as constants so the walk stops there.
        kg_grads = list(theano.grad(cost=None, wrt=wrt,
                                    known_grads=known_grads,
                                    consider_constant=grad_end,
                                    disconnected_inputs='ignore'))
    if cost is not None:
        # Gradients contributed by the (optional) additional cost.
        cost_grads = list(theano.grad(cost=cost, wrt=wrt,
                                      consider_constant=grad_end,
                                      disconnected_inputs='ignore'))

    if kg_grads is None:
        grads = cost_grads
    elif cost_grads is None:
        grads = kg_grads
    else:
        # Sum the two contributions into a NEW list. The original code
        # aliased grads to kg_grads and then did grads[i] += cost_grads[i],
        # which mutated kg_grads in place and corrupted the kg_grads list
        # returned when details=True.
        grads = [kg + cg for kg, cg in zip(kg_grads, cost_grads)]

    if details:
        return grads, kg_grads, cost_grads
    return grads
def _node_to_pattern(node): def _node_to_pattern(node):
""" given an apply node, obtain its connection pattern """ given an apply node, obtain its connection pattern
......
...@@ -554,5 +554,58 @@ def test_disconnected_cost_grad(): ...@@ -554,5 +554,58 @@ def test_disconnected_cost_grad():
return return
raise AssertionError("A disconnected gradient has been ignored.") raise AssertionError("A disconnected gradient has been ignored.")
def test_subgrad():
    # Check that a single theano.grad over the whole graph agrees with
    # chaining two successive subgrad calls (one layer at a time), where
    # each pass feeds its gradients into the next as known_grads.
    from theano.gof.python25 import OrderedDict

    x = theano.tensor.fvector('x')
    t = theano.tensor.fvector('t')
    w1 = theano.shared(np.random.randn(3,4))
    w2 = theano.shared(np.random.randn(4,2))

    # Two-layer tanh MLP, squared-error cost plus per-layer penalties.
    a1 = theano.tensor.tanh(theano.tensor.dot(x,w1))
    a2 = theano.tensor.tanh(theano.tensor.dot(a1,w2))
    cost2 = theano.tensor.sqr(a2 - t).sum()
    cost2 += theano.tensor.sqr(w2.sum())
    cost1 = theano.tensor.sqr(w1.sum())

    # One (wrt, cost, grad_end) triple per layer, output side first.
    layer_wrts = [[w2,a1],[w1,x]]
    layer_costs = [cost2,cost1]
    layer_ends = [[a1], [x]]

    inputs = [t, x]
    rng = np.random.RandomState([2012, 11, 15])
    values = [rng.randn(2), rng.randn(3)]
    values = [np.cast[ipt.dtype](value) for ipt, value in zip(inputs, values)]

    # Reference gradients: everything in one theano.grad call.
    wrt = [w2, a1, w1, x]
    total = cost2 + cost1
    ref_fn = theano.function(inputs, theano.grad(total, wrt))
    ref_grads = ref_fn(*values)

    # Chained subgrad passes.
    carried = None
    collected = []
    for lw, lc, le in zip(layer_wrts, layer_costs, layer_ends):
        layer_grads = theano.subgrad(
            wrt=lw, grad_end=le,
            known_grads=carried, cost=lc
        )
        carried = OrderedDict(zip(lw,layer_grads))
        collected.extend(layer_grads)
    sub_fn = theano.function(inputs, collected)
    sub_grads = sub_fn(*values)

    print(sub_grads)
    print(ref_grads)
    for ref_grad, sub_grad in zip(ref_grads, sub_grads):
        print(ref_grad, sub_grad)
        assert(np.sum(np.abs(ref_grad - sub_grad)) < 0.00001)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论