提交 d8c5ab33 authored 作者: Razvan Pascanu's avatar Razvan Pascanu

Merge pull request #239 from goodfeli/to_rebase

rebased Razvan's Rop/Lop pull request It was checked by me and Ian.
"""Driver for gradient calculations.""" """Driver for gradient calculations."""
__authors__ = "James Bergstra, Razvan Pascanu" __authors__ = "James Bergstra, Razvan Pascanu"
__copyright__ = "(c) 2011, Universite de Montreal" __copyright__ = "(c) 2011, Universite de Montreal"
__license__ = "3-clause BSD License" __license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>" __contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
...@@ -11,7 +11,7 @@ import __builtin__ ...@@ -11,7 +11,7 @@ import __builtin__
import logging import logging
import warnings import warnings
import numpy #for numeric_grad import numpy # for numeric_grad
import theano import theano
from theano.tensor import TensorType, TensorVariable, ones_like, \ from theano.tensor import TensorType, TensorVariable, ones_like, \
...@@ -26,6 +26,7 @@ _logger = logging.getLogger('theano.tensor.tensor_grad') ...@@ -26,6 +26,7 @@ _logger = logging.getLogger('theano.tensor.tensor_grad')
# R Operator # R Operator
######################## ########################
def Rop(f, wrt, eval_points): def Rop(f, wrt, eval_points):
""" """
Computes the R operation on `f` wrt to `wrt` evaluated at points given Computes the R operation on `f` wrt to `wrt` evaluated at points given
...@@ -50,16 +51,16 @@ def Rop(f, wrt, eval_points): ...@@ -50,16 +51,16 @@ def Rop(f, wrt, eval_points):
If `wrt` is a list/tuple, then return a list/tuple with the results. If `wrt` is a list/tuple, then return a list/tuple with the results.
""" """
using_list = isinstance(wrt, list) using_list = isinstance(f, list)
using_tuple = isinstance(wrt, tuple) using_tuple = isinstance(f, tuple)
if not (using_list or using_tuple): if not isinstance(wrt, (list, tuple)):
wrt = [ wrt ] wrt = [wrt]
if not isinstance(eval_points, (list, tuple)): if not isinstance(eval_points, (list, tuple)):
eval_points = [ eval_points ] eval_points = [eval_points]
if not isinstance(f, (list,tuple)): if not isinstance(f, (list, tuple)):
f = [f] f = [f]
assert len(wrt) == len(eval_points) assert len(wrt) == len(eval_points)
...@@ -84,7 +85,7 @@ def Rop(f, wrt, eval_points): ...@@ -84,7 +85,7 @@ def Rop(f, wrt, eval_points):
if node is None: if node is None:
return None return None
else: else:
op = node.op op = node.op
inputs = node.inputs inputs = node.inputs
if not hasattr(op, 'R_op'): if not hasattr(op, 'R_op'):
raise Exception((' R_op was not implemented for %s' raise Exception((' R_op was not implemented for %s'
...@@ -95,23 +96,24 @@ def Rop(f, wrt, eval_points): ...@@ -95,23 +96,24 @@ def Rop(f, wrt, eval_points):
local_eval_points = [] local_eval_points = []
for inp in inputs: for inp in inputs:
if inp in wrt: if inp in wrt:
local_eval_points.append( eval_points[wrt.index(inp)] ) local_eval_points.append(eval_points[wrt.index(inp)])
elif inp.owner is None: elif inp.owner is None:
local_eval_points.append( zeros_like(inp) ) local_eval_points.append(zeros_like(inp))
elif inp.owner in seen_nodes: elif inp.owner in seen_nodes:
local_eval_points.append( local_eval_points.append(
seen_nodes[inp.owner][inp.owner.outputs.index(inp) ] ) seen_nodes[inp.owner][inp.owner.outputs.index(inp)])
else: else:
# We actually need to compute the R_op for this node # We actually need to compute the R_op for this node
_traverse(inp.owner) _traverse(inp.owner)
local_eval_points.append( local_eval_points.append(
seen_nodes[inp.owner][inp.owner.outputs.index(inp) ]) seen_nodes[inp.owner][inp.owner.outputs.index(inp)])
for x,y in zip(inputs, local_eval_points): for x, y in zip(inputs, local_eval_points):
if y is not None: if y is not None:
assert (as_tensor_variable(x).type == as_tensor_variable(y).type) assert (as_tensor_variable(x).type ==
as_tensor_variable(y).type)
seen_nodes[node] = op.R_op(node.inputs, local_eval_points) seen_nodes[node] = op.R_op(node.inputs, local_eval_points)
return None return None
...@@ -123,12 +125,12 @@ def Rop(f, wrt, eval_points): ...@@ -123,12 +125,12 @@ def Rop(f, wrt, eval_points):
rval = [] rval = []
for out in f: for out in f:
if out in wrt: if out in wrt:
rval.append( eval_points[wrt.index(out)]) rval.append(eval_points[wrt.index(out)])
elif seen_nodes[out.owner][out.owner.outputs.index(out)] is None: elif seen_nodes[out.owner][out.owner.outputs.index(out)] is None:
raise ValueError(( 'The function is not differentiable with ' raise ValueError(('The function is not differentiable with '
'respect to the provided inputs !')) 'respect to the provided inputs !'))
else: else:
rval.append(seen_nodes[out.owner][out.owner.outputs.index(out)] ) rval.append(seen_nodes[out.owner][out.owner.outputs.index(out)])
if len(rval) == 1: if len(rval) == 1:
if using_list: if using_list:
...@@ -170,14 +172,16 @@ def Lop(f, wrt, eval_points, consider_constant=None, warn_type=False, ...@@ -170,14 +172,16 @@ def Lop(f, wrt, eval_points, consider_constant=None, warn_type=False,
consider_constant = [] consider_constant = []
if not isinstance(f, TensorVariable): if not isinstance(f, TensorVariable):
raise TypeError('In tensor.Lop(), cost argument should be a TensorVariable.', f) raise TypeError(('In tensor.Lop(), cost argument should be '
'a TensorVariable.'), f)
if type(eval_points) not in (list, tuple): if type(eval_points) not in (list, tuple):
eval_points = [eval_points] eval_points = [eval_points]
using_list = isinstance(f, list) using_list = isinstance(wrt, list)
using_tuple = isinstance(f, tuple) using_tuple = isinstance(wrt, tuple)
if not (using_list or using_tuple):
if not isinstance(f, (list, tuple)):
f = [f] f = [f]
inputs = gof.graph.inputs(f) inputs = gof.graph.inputs(f)
...@@ -193,7 +197,8 @@ def Lop(f, wrt, eval_points, consider_constant=None, warn_type=False, ...@@ -193,7 +197,8 @@ def Lop(f, wrt, eval_points, consider_constant=None, warn_type=False,
# such subtle cases can be fixed by a more careful implementation of the # such subtle cases can be fixed by a more careful implementation of the
# gradient, but for now Theano needs to throw an exception, and make the # gradient, but for now Theano needs to throw an exception, and make the
# user aware that it does not know how to compute that gradient # user aware that it does not know how to compute that gradient
if not isinstance(wrt, (list, tuple)):
if not (using_list or using_tuple):
wrt = [wrt] wrt = [wrt]
ret = [] ret = []
for p in wrt: for p in wrt:
...@@ -241,10 +246,11 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False, ...@@ -241,10 +246,11 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False,
:type g_cost: Scalar `Variable`, or None :type g_cost: Scalar `Variable`, or None
:param g_cost: an expression for the gradient through cost. The default is :param g_cost: an expression for the gradient through cost. The default is
``ones_like(cost)``. ``ones_like(cost)``.
:param consider_constant: a list of expressions not to backpropagate through :param consider_constant: a list of expressions not to backpropagate
through
:param warn_type: a value of True will cause warnings to be logged for any Op that emits a :param warn_type: a value of True will cause warnings to be logged for any
gradient that does not match its input type. Op that emits a gradient that does not match its input type.
:type disconnected_inputs: string :type disconnected_inputs: string
:param disconnected_inputs: Defines the behaviour if some of the variables :param disconnected_inputs: Defines the behaviour if some of the variables
...@@ -285,7 +291,8 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False, ...@@ -285,7 +291,8 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False,
if not isinstance(cost, TensorVariable): if not isinstance(cost, TensorVariable):
raise TypeError('In tensor.grad(), cost argument should be a TensorVariable.', cost) raise TypeError(('In tensor.grad(), cost argument should be '
'a TensorVariable.'), cost)
if cost.type.ndim: if cost.type.ndim:
raise TypeError( raise TypeError(
...@@ -302,7 +309,6 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False, ...@@ -302,7 +309,6 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, warn_type=False,
list(inputs) + list(consider_constant), list(inputs) + list(consider_constant),
warn_type=warn_type) warn_type=warn_type)
# Note : If p is not in gmap there can be several reasons, among which # Note : If p is not in gmap there can be several reasons, among which
# is the fact that p might not be part of the computational graph. A # is the fact that p might not be part of the computational graph. A
# simple example is that for a+b for e.g. a[0] is not part of the graph, # simple example is that for a+b for e.g. a[0] is not part of the graph,
...@@ -357,28 +363,29 @@ class numeric_grad(object): ...@@ -357,28 +363,29 @@ class numeric_grad(object):
# #
# There is a relationship between the step size and the function value and # There is a relationship between the step size and the function value and
# the measurement error that is incurred due to rounding. The finite # the measurement error that is incurred due to rounding. The finite
# difference we measure is delta = f(x0) - f(x0+eps) # difference we measure is
# delta = f(x0) - f(x0+eps)
# #
# For maximum precision, f should be close to zero. # For maximum precision, f should be close to zero.
# For every power of 2 that f departs from zero, we lose a bit of # For every power of 2 that f departs from zero, we lose a bit of precision
# precision in delta. # in delta.
# #
# Even in this case of maximum accuracy, there is a tradeoff between # Even in this case of maximum accuracy, there is a tradeoff between
# stepsize and measurement error. Taking small steps allows us to measure # stepsize and measurement error.
# large derivatives accuractly, but longer steps are required to measure # Taking small steps allows us to measure large derivatives accuractly,
# small derivatives accurately. However longer steps introduce bias into # but longer steps are required to measure small derivatives accurately.
# our measurement in general for non-linear functions. # However longer steps introduce bias into our measurement in general
# for non-linear functions.
# #
# It would be interesting to have a version of numeric grad that used an # It would be interesting to have a version of numeric grad that used an
# adaptive stepsize. # adaptive stepsize.
# #
# For now, we use a heuristic that catches very bad gradients, but is not # For now, we use a heuristic that catches very bad gradients, but is not
# perfectly accurate. # perfectly accurate.
type_eps = {'float64': 1e-7, type_eps = {'float64': 1e-7,
'float32': 3e-4, 'float32': 3e-4,
numpy.dtype('float64'):1e-7, numpy.dtype('float64'): 1e-7,
numpy.dtype('float32'):3e-4} numpy.dtype('float32'): 3e-4}
def __init__(self, f, pt, eps=None): def __init__(self, f, pt, eps=None):
"""Return the gradient of f at pt. """Return the gradient of f at pt.
...@@ -413,13 +420,15 @@ class numeric_grad(object): ...@@ -413,13 +420,15 @@ class numeric_grad(object):
dtypes = [str(p.dtype) for p in apt] dtypes = [str(p.dtype) for p in apt]
# TODO: remove this eventually (why was this here in the first place ?) # TODO: remove this eventually (why was this here in the first place ?)
# In the case of CSM, the arguments are a mixture of floats and integers... # In the case of CSM, the arguments are a mixture of floats and
#if not dtypes == [dtypes[0]] * len(apt): # integers...
#raise TypeError('All function arguments must have same dtype') # if not dtypes == [dtypes[0]] * len(apt):
# raise TypeError('All function arguments must have same dtype')
total_size = __builtin__.sum(prod(sh) for sh in shapes) total_size = __builtin__.sum(prod(sh) for sh in shapes)
working_dtype = __builtin__.min((self.type_eps[dt], dt) for dt in dtypes)[1] working_dtype = __builtin__.min((self.type_eps[dt], dt)
for dt in dtypes)[1]
#create un-initialized memory #create un-initialized memory
x = numpy.ndarray((total_size,), dtype=working_dtype) x = numpy.ndarray((total_size,), dtype=working_dtype)
...@@ -428,16 +437,15 @@ class numeric_grad(object): ...@@ -428,16 +437,15 @@ class numeric_grad(object):
if eps is None: if eps is None:
eps = __builtin__.max(self.type_eps[dt] for dt in dtypes) eps = __builtin__.max(self.type_eps[dt] for dt in dtypes)
#set up aliases so that apt[i] is backed by memory in x #set up aliases so that apt[i] is backed by memory in x
# and self.gf is backed by memory in gx # and self.gf is backed by memory in gx
cur_pos = 0 cur_pos = 0
self.gf = [] self.gf = []
for i,p in enumerate(apt): for i, p in enumerate(apt):
p_size = prod(p.shape) p_size = prod(p.shape)
# set up alias # set up alias
apt[i] = x[cur_pos:cur_pos+p_size].reshape(p.shape) apt[i] = x[cur_pos: cur_pos + p_size].reshape(p.shape)
self.gf.append(gx[cur_pos:cur_pos+p_size].reshape(p.shape)) self.gf.append(gx[cur_pos: cur_pos + p_size].reshape(p.shape))
# initialize with p's value # initialize with p's value
apt[i][...] = p apt[i][...] = p
cur_pos += p_size cur_pos += p_size
...@@ -452,7 +460,7 @@ class numeric_grad(object): ...@@ -452,7 +460,7 @@ class numeric_grad(object):
x[i] += eps x[i] += eps
f_eps = f(*apt) f_eps = f(*apt)
gx[i] = numpy.asarray((f_eps - f_x)/eps) gx[i] = numpy.asarray((f_eps - f_x) / eps)
if packed_pt: if packed_pt:
self.gf = self.gf[0] self.gf = self.gf[0]
...@@ -536,7 +544,7 @@ class numeric_grad(object): ...@@ -536,7 +544,7 @@ class numeric_grad(object):
def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
rel_tol=None, mode=None, cast_to_output_type=False): rel_tol=None, mode=None, cast_to_output_type=False):
""" Test a gradient by Finite Difference Method. Raise error on failure. """ Test a gradient by Finite Difference Method. Raise error on failure.
Example: Example:
...@@ -571,7 +579,7 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, ...@@ -571,7 +579,7 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
there is an experimental verify_grad that covers that case as well by there is an experimental verify_grad that covers that case as well by
using random projections. using random projections.
""" """
assert isinstance(pt, (list,tuple)) assert isinstance(pt, (list, tuple))
pt = [numpy.array(p) for p in pt] pt = [numpy.array(p) for p in pt]
for i, p in enumerate(pt): for i, p in enumerate(pt):
...@@ -579,7 +587,7 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, ...@@ -579,7 +587,7 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
raise TypeError(('verify_grad can work only with floating point ' raise TypeError(('verify_grad can work only with floating point '
'inputs, but input %i has dtype "%s".') % (i, p.dtype)) 'inputs, but input %i has dtype "%s".') % (i, p.dtype))
_type_tol = dict( # relativ error tolerances for different types _type_tol = dict( # relativ error tolerances for different types
float32=1e-2, float32=1e-2,
float64=1e-4) float64=1e-4)
...@@ -589,10 +597,11 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, ...@@ -589,10 +597,11 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
rel_tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt) rel_tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt)
if rng is None: if rng is None:
raise TypeError('rng be instance of numpy.random.RandomState', ( raise TypeError(('rng should be a valid instance of '
' hint: Maybe you meant to call' 'numpy.random.RandomState. You may '
' theano.tests.unittest_tools.verify_grad instead of' 'want to use theano.tests.unittest'
' theano.tensor.verify_grad.')) '_tools.verify_grad instead of '
'theano.tensor.verify_grad.'))
# We allow input downcast in function, because numeric_grad works in the # We allow input downcast in function, because numeric_grad works in the
# most precise dtype used among the inputs, so we may need to cast some. # most precise dtype used among the inputs, so we may need to cast some.
...@@ -614,9 +623,10 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, ...@@ -614,9 +623,10 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
o_output = fun(*tensor_pt) o_output = fun(*tensor_pt)
if isinstance(o_output, list): if isinstance(o_output, list):
raise NotImplementedError('verify gradient on multiple outputs') raise NotImplementedError(('cant (yet) autotest gradient of fun '
# we could make loop over outputs making random projections R for 'with multiple outputs'))
# each, but this doesn't handle the case where not all the outputs are # we could make loop over outputs making random projections R for each,
# but this doesn't handle the case where not all the outputs are
# differentiable... so I leave this as TODO for now -JB. # differentiable... so I leave this as TODO for now -JB.
o_fn = function(tensor_pt, o_output) o_fn = function(tensor_pt, o_output)
...@@ -630,15 +640,17 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, ...@@ -630,15 +640,17 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
# random_projection should not have elements too small, # random_projection should not have elements too small,
# otherwise too much precision is lost in numerical gradient # otherwise too much precision is lost in numerical gradient
def random_projection(): def random_projection():
plain = rng.rand(*o_fn_out.shape) + 0.5 plain = rng.rand(*o_fn_out.shape) + 0.5
if cast_to_output_type: if cast_to_output_type:
return numpy.array(plain,o_output.dtype) return numpy.array(plain, o_output.dtype)
return plain return plain
t_r = shared(random_projection()) t_r = shared(random_projection())
#random projection of o onto t_r # random projection of o onto t_r
# This sum() is defined above, it's not the builtin sum.
cost = theano.tensor.sum(t_r * o_output) cost = theano.tensor.sum(t_r * o_output)
cost_fn = function(tensor_pt, cost) cost_fn = function(tensor_pt, cost)
#todo-- determine if this is actually needed #todo-- determine if this is actually needed
...@@ -654,7 +666,6 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, ...@@ -654,7 +666,6 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None,
for test_num in xrange(n_tests): for test_num in xrange(n_tests):
num_grad = numeric_grad(cost_fn, [p.copy() for p in pt], eps) num_grad = numeric_grad(cost_fn, [p.copy() for p in pt], eps)
analytic_grad = grad_fn(*[p.copy() for p in pt]) analytic_grad = grad_fn(*[p.copy() for p in pt])
# Since `tensor_pt` is a list, `analytic_grad` should be one too. # Since `tensor_pt` is a list, `analytic_grad` should be one too.
...@@ -682,7 +693,6 @@ class GradientError(Exception): ...@@ -682,7 +693,6 @@ class GradientError(Exception):
self.abs_tol = abs_tol self.abs_tol = abs_tol
self.rel_tol = rel_tol self.rel_tol = rel_tol
def __str__(self): def __str__(self):
# args may have been inserted by e.g. makeTester # args may have been inserted by e.g. makeTester
args_msg = ", ".join(str(a) for a in self.args) args_msg = ", ".join(str(a) for a in self.args)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论