提交 40bbb7da authored 作者: David Warde-Farley's avatar David Warde-Farley

Merge pull request #1068 from goodfeli/fix_consider_constant

Fixes several issues with gradients and some other bugs
...@@ -249,6 +249,8 @@ following methods: ...@@ -249,6 +249,8 @@ following methods:
1) They must be Variable instances. 1) They must be Variable instances.
2) When they are types that have dtypes, they must never have an integer dtype. 2) When they are types that have dtypes, they must never have an integer dtype.
The output gradients passed *to* Op.grad will also obey these constraints.
Integers are a tricky subject. Integers are the main reason for having DisconnectedType, Integers are a tricky subject. Integers are the main reason for having DisconnectedType,
NullType or zero gradient. When you have an integer as an argument to your grad method, NullType or zero gradient. When you have an integer as an argument to your grad method,
recall the definition of a derivative to help you decide what value to return: recall the definition of a derivative to help you decide what value to return:
......
...@@ -55,9 +55,12 @@ class OpFromGraph(gof.Op): ...@@ -55,9 +55,12 @@ class OpFromGraph(gof.Op):
if grad_depth > 0: if grad_depth > 0:
output_grads = [t() for t in self.output_types] output_grads = [t() for t in self.output_types]
gd = G.grad_sources_inputs(zip(self.outputs, output_grads), # OpFromGraph doesn't implement a connection_pattern, so for now we regard
self.inputs) # all inputs and outputs as connected. This will compute the right numerical
gs = map(gd.get, self.inputs) # value for the gradients but could fail to raise the disconnected inputs error
# in some cases.
gs = G.grad(cost=None, known_grads=dict(zip(self.outputs, output_grads)),
wrt=self.inputs, disconnected_inputs='ignore')
self.grad_ops = [] self.grad_ops = []
for g in gs: for g in gs:
if g is None: if g is None:
......
...@@ -13,9 +13,11 @@ import warnings ...@@ -13,9 +13,11 @@ import warnings
_logger = logging.getLogger('theano.gradient') _logger = logging.getLogger('theano.gradient')
import numpy # for numeric_grad import numpy # for numeric_grad
np = numpy
import theano import theano
from itertools import izip
from theano import gof from theano import gof
from theano.gof import Variable from theano.gof import Variable
from theano.gof.python25 import all from theano.gof.python25 import all
...@@ -317,9 +319,6 @@ def Lop(f, wrt, eval_points, consider_constant=None, ...@@ -317,9 +319,6 @@ def Lop(f, wrt, eval_points, consider_constant=None,
coordinates of the tensor element in the last coordinates of the tensor element in the last
If `f` is a list/tuple, then return a list/tuple with the results. If `f` is a list/tuple, then return a list/tuple with the results.
""" """
if consider_constant is None:
consider_constant = []
if type(eval_points) not in (list, tuple): if type(eval_points) not in (list, tuple):
eval_points = [eval_points] eval_points = [eval_points]
...@@ -333,50 +332,15 @@ def Lop(f, wrt, eval_points, consider_constant=None, ...@@ -333,50 +332,15 @@ def Lop(f, wrt, eval_points, consider_constant=None,
f = list(f) f = list(f)
grads = list(eval_points) grads = list(eval_points)
for elem in consider_constant:
assert elem not in f
f.append(elem)
grads.append(elem.zeros_like())
if not isinstance(wrt, (list, tuple)): if not isinstance(wrt, (list, tuple)):
wrt = [wrt] wrt = [wrt]
arg1 = zip(f, eval_points) assert len(f) == len(grads)
arg2 = list(wrt) known = dict(izip(f, grads))
gmap = grad_sources_inputs( ret = grad(cost=None, known_grads=known,
arg1, consider_constant=consider_constant, wrt=wrt,
arg2) disconnected_inputs=disconnected_inputs)
# Note : If p is not in gmap there can be several reasons, among which
# is the fact that p might not be part of the computational graph. A
# simple example is that for a+b for e.g. a[0] is not part of the graph,
# so Theano does not know how to compute TT.grad(TT.sum(a+b), a[0])
# such subtle cases can be fixed by a more careful implementation of the
# gradient, but for now Theano needs to throw an exception, and make the
# user aware that it does not know how to compute that gradient
ret = []
for p in wrt:
if p in gmap:
ret.append(gmap[p])
else:
message = (
"Lop method was asked to compute the gradient "
"with respect to a variable that is not part of "
"the computational graph of the cost, or is used "
"only by a non-differentiable operator: %s" % p)
if disconnected_inputs == 'ignore':
pass
elif disconnected_inputs == 'warn':
warnings.warn(message, stacklevel=1)
elif disconnected_inputs == 'raise':
raise ValueError(message)
else:
raise ValueError(
"Invalid value for keyword "
"'disconnected_inputs', valid values are "
"'ignore', 'warn' and 'raise'.")
ret.append(p.zeros_like())
return format_as(using_list, using_tuple, ret) return format_as(using_list, using_tuple, ret)
...@@ -386,9 +350,11 @@ def Lop(f, wrt, eval_points, consider_constant=None, ...@@ -386,9 +350,11 @@ def Lop(f, wrt, eval_points, consider_constant=None,
######################### #########################
def grad(cost, wrt, g_cost=None, consider_constant=None, def grad(cost, wrt, g_cost=None, consider_constant=None,
disconnected_inputs='raise', add_names=True): disconnected_inputs='raise', add_names=True,
known_grads=None, return_disconnected='zero'):
""" """
:type cost: Scalar (0-dimensional) Variable. :type cost: Scalar (0-dimensional) Variable.
May optionally be None if known_grads is provided.
:type wrt: Variable or list of Variables. :type wrt: Variable or list of Variables.
:type g_cost: Scalar Variable, or None. :type g_cost: Scalar Variable, or None.
:param g_cost: an expression for the gradient through cost. The default is :param g_cost: an expression for the gradient through cost. The default is
...@@ -409,6 +375,20 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, ...@@ -409,6 +375,20 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
(d<cost.name>/d<wrt.name>) provided that both cost and wrt have (d<cost.name>/d<wrt.name>) provided that both cost and wrt have
names names
:type known_grads: dict
:param known_grads: If not None, a dictionary mapping variables to their
gradients. This is useful in the case where you know the
gradient on some variables but do not know the original
cost.
:type return_disconnected: string
:param return_disconnected:
'zero' : If wrt[i] is disconnected, return value i will be
wrt[i].zeros_like()
'None' : If wrt[i] is disconnected, return value i will be
None
'Disconnected' : returns variables of type DisconnectedType
:rtype: Variable or list/tuple of Variables (depending upon `wrt`) :rtype: Variable or list/tuple of Variables (depending upon `wrt`)
:return: symbolic expression of gradient of `cost` with respect to `wrt`. :return: symbolic expression of gradient of `cost` with respect to `wrt`.
...@@ -422,29 +402,17 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, ...@@ -422,29 +402,17 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
if tensor is None: if tensor is None:
from theano import tensor from theano import tensor
if isinstance(cost.type, NullType): if cost is None:
assert known_grads is not None
if cost is not None and isinstance(cost.type, NullType):
raise ValueError("Can't differentiate a NaN cost." raise ValueError("Can't differentiate a NaN cost."
"cost is NaN because " + \ "cost is NaN because " + \
cost.type.why_null) cost.type.why_null)
if cost.ndim != 0: if cost is not None and cost.ndim != 0:
raise TypeError("cost must be a scalar.") raise TypeError("cost must be a scalar.")
if consider_constant is None:
consider_constant = []
else:
# error checking on consider_constant: verify that it is a collection
# of theano variables
# this is important, if someone accidentally passes a nested data
# structure with theano variables at the leaves, only the root will
# be properly considered constant
if not hasattr(consider_constant, '__iter__'):
raise TypeError('consider_constant must be an iterable collection,'
' got ' + str(type(consider_constant)))
for elem in consider_constant:
if not isinstance(elem, gof.Variable):
raise TypeError('Elements of consider_constant must be '
'variables, but got ' + str(type(elem)))
if isinstance(wrt, set): if isinstance(wrt, set):
raise TypeError("wrt must not be a set. sets have no defined " raise TypeError("wrt must not be a set. sets have no defined "
...@@ -461,7 +429,14 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, ...@@ -461,7 +429,14 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
raise TypeError("Expected Variable, got " + str(elem) + raise TypeError("Expected Variable, got " + str(elem) +
" of type "+str(type(elem))) " of type "+str(type(elem)))
var_to_node_to_idx = _populate_var_to_node_to_idx([cost], wrt) outputs = []
if cost is not None:
outputs.append(cost)
if known_grads is not None:
outputs.extend(known_grads.keys())
var_to_node_to_idx = _populate_var_to_node_to_idx(
outputs, wrt, consider_constant)
# build a dict mapping var to the gradient of cost with respect to var # build a dict mapping var to the gradient of cost with respect to var
grad_dict = {} grad_dict = {}
...@@ -469,49 +444,57 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, ...@@ -469,49 +444,57 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
# The gradient of the cost should default to 1 if the cost is of a # The gradient of the cost should default to 1 if the cost is of a
# continuous dtype (float, for the moment, as complex are unsupported), # continuous dtype (float, for the moment, as complex are unsupported),
# and should always be 0 if the cost is of discrete (integer) dtype. # and should always be 0 if the cost is of discrete (integer) dtype.
if getattr(cost.type, 'dtype', None) not in tensor.float_dtypes: if cost is not None:
if g_cost is None:
g_cost = _float_ones_like(cost)
# g_cost may be Disconnected or NullType. A creative use of the function,
# sure, but nonetheless one we can and should support. So before we try
# to cast it make sure it even has a dtype
if hasattr(g_cost.type, 'dtype') and cost.type.dtype not in tensor.discrete_dtypes:
# Here we enforce the constraint that floating point variables have
# the same dtype as their gradient.
g_cost = g_cost.astype(cost.type.dtype)
# DO NOT enforce g_cost to be 0 if cost is an integer.
# This is to be enforced by the Op.grad method for the Op that outputs cost.
assert g_cost not in tensor.discrete_dtypes
grad_dict[cost] = g_cost
else:
if g_cost is not None: if g_cost is not None:
try: raise ValueError("No cost node was specified, but a gradient"
cval = theano.get_constant_value(g_cost) " on it was.")
if cval == 0:
g_cost_is_zero = True
else:
g_cost_is_zero = False
except TypeError:
g_cost_is_zero = False
if not g_cost_is_zero:
raise ValueError("The gradient of a cost of non-continuous "
"dtype (here, %s), if it is defined, should be 0. "
"However, a value of %s was provided in the 'g_cost' "
"argument of theano.grad(). To remove this error, "
"you can simply omit the 'g_cost' argument, or "
"give it the default value of None." % (
getattr(g_cost.type, 'dtype', 'no dtype defined'),
g_cost))
g_cost = tensor.zeros_like(cost)
elif g_cost is None:
# cost.type.dtype is in tensor.float_dtypes at that point
g_cost = tensor.ones_like(cost)
else: if known_grads is not None:
# Cast the provided gradient so that it has the same dtype for var in known_grads:
# as the cost. g_var = known_grads[var]
g_cost = g_cost.astype(cost.type.dtype)
if not hasattr(g_var, 'type'):
raise TypeError('output grads must be theano variables.'
'Ambiguous whether %s should be made into tensor'
' or sparse theano variable' % str(type(g_var)))
if g_var.type not in [NullType, DisconnectedType] and 'float' \
not in str(g_var.type.dtype):
raise TypeError("Gradients must always be NullType, "
"DisconnectedType, or continuous, but grad was "
"given a known_grad of type "+str(g_var.type))
# DO NOT check that these gradients are equal to 0 if var is int
# The gradient is allowed to be non-zero on var in that case
# Ops outputing var should not backpropagate its gradient further
# but that is enforced elsewhere (grep for only_connected_to_int)
grad_dict[var] = g_var
grad_dict[cost] = g_cost
# the gradient of the constants is 0
for const in consider_constant:
grad_dict[const] = DisconnectedType()()
# variables that do not influence the cost have zero gradient. # variables that do not influence the cost have zero gradient.
# if wrt is such a variable, populate the grad_dict with this info # if wrt is such a variable, populate the grad_dict with this info
# so that wrt not being in var_to_node_to_idx won't cause an error below # so that wrt not being in var_to_node_to_idx won't cause an error below
# according to the flag, possibly raise an error if wrt is disconnected # according to the flag, possibly raise an error if wrt is disconnected
for elem in wrt: for elem in wrt:
if elem not in var_to_node_to_idx and elem is not cost: if elem not in var_to_node_to_idx and elem is not cost \
and elem not in grad_dict:
message = ("grad method was asked to compute the gradient " message = ("grad method was asked to compute the gradient "
"with respect to a variable that is not part of " "with respect to a variable that is not part of "
"the computational graph of the cost, or is used " "the computational graph of the cost, or is used "
...@@ -529,15 +512,15 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, ...@@ -529,15 +512,15 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
grad_dict[elem] = DisconnectedType()() grad_dict[elem] = DisconnectedType()()
cost_name = None cost_name = None
if add_names: if add_names and cost is not None:
cost_name = cost.name cost_name = cost.name
# Make sure we didn't initialize the grad_dict with any ints # Make sure we didn't initialize the grad_dict with any ints
# for non-int outputs # The gradient may NEVER be an int, even if the variable is an int.
# Read the Op contract and talk to Ian Goodfellow before changing this!
for var in grad_dict: for var in grad_dict:
g = grad_dict[var] g = grad_dict[var]
if (hasattr(g.type, 'dtype') and if hasattr(g.type, 'dtype'):
getattr(var.type, 'dtype', '') in tensor.float_dtypes):
assert g.type.dtype in tensor.float_dtypes assert g.type.dtype in tensor.float_dtypes
rval = _populate_grad_dict(var_to_node_to_idx, rval = _populate_grad_dict(var_to_node_to_idx,
...@@ -545,7 +528,12 @@ def grad(cost, wrt, g_cost=None, consider_constant=None, ...@@ -545,7 +528,12 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
for i in xrange(len(rval)): for i in xrange(len(rval)):
if isinstance(rval[i].type, DisconnectedType): if isinstance(rval[i].type, DisconnectedType):
rval[i] = _float_zeros_like(wrt[i]) if return_disconnected == 'zero':
rval[i] = _float_zeros_like(wrt[i])
elif return_disconnected == 'None':
rval[i] = None
else:
assert return_disconnected == 'Disconnected'
if using_tuple: if using_tuple:
rval = tuple(rval) rval = tuple(rval)
...@@ -592,15 +580,18 @@ def _node_to_pattern(node): ...@@ -592,15 +580,18 @@ def _node_to_pattern(node):
return connection_pattern return connection_pattern
def _populate_var_to_node_to_idx(outputs, wrt): def _populate_var_to_node_to_idx(outputs, wrt, consider_constant):
""" """
Common code shared between grad and grad_sources_inputs Helper function for grad function.
outputs: a list of variables we want to take gradients of outputs: a list of variables we want to take gradients of
wrt: a list of variables we want to take the gradient with wrt: a list of variables we want to take the gradient with
respect to. respect to.
consider_constant: a list of variables not to backpropagate
through.
returns: returns:
var_to_app_to_idx: var_to_app_to_idx:
...@@ -622,8 +613,30 @@ def _populate_var_to_node_to_idx(outputs, wrt): ...@@ -622,8 +613,30 @@ def _populate_var_to_node_to_idx(outputs, wrt):
This set is exactly the set of variables that connect This set is exactly the set of variables that connect
the variables in wrt to the cost being differentiated. the variables in wrt to the cost being differentiated.
(A variable in consider_constant is not a function of
anything)
""" """
# Validate and format consider_constant
if consider_constant is None:
consider_constant = []
else:
# error checking on consider_constant: verify that it is a collection
# of theano variables
# this is important, if someone accidentally passes a nested data
# structure with theano variables at the leaves, only the root will
# be properly considered constant
try:
iter(consider_constant)
except TypeError:
raise TypeError('consider_constant must be an iterable collection,'
' got ' + str(type(consider_constant)))
for elem in consider_constant:
if not isinstance(elem, gof.Variable):
raise TypeError('Elements of consider_constant must be '
'variables, but got ' + str(type(elem)))
# var_to_app_to_idx[var][node] = [i,j] means node has # var_to_app_to_idx[var][node] = [i,j] means node has
# var as input at positions i and j # var as input at positions i and j
var_to_app_to_idx = {} var_to_app_to_idx = {}
...@@ -638,9 +651,17 @@ def _populate_var_to_node_to_idx(outputs, wrt): ...@@ -638,9 +651,17 @@ def _populate_var_to_node_to_idx(outputs, wrt):
accounted_for = set([]) accounted_for = set([])
def account_for(var): def account_for(var):
# Don't visit the same variable twice
if var in accounted_for: if var in accounted_for:
return return
accounted_for.add(var) accounted_for.add(var)
# Constants are not a function of anything
if var in consider_constant:
return
# Recursively add the variables that this variable is
# a function of.
if var.owner is not None: if var.owner is not None:
app = var.owner app = var.owner
...@@ -699,11 +720,16 @@ def _populate_var_to_node_to_idx(outputs, wrt): ...@@ -699,11 +720,16 @@ def _populate_var_to_node_to_idx(outputs, wrt):
return var_to_app_to_idx return var_to_app_to_idx
class NullTypeGradError(TypeError):
    """Signals that the gradient computation produced a NullType.

    Raised when grad encounters a NullType.
    """
def _populate_grad_dict(var_to_node_to_idx, def _populate_grad_dict(var_to_node_to_idx,
grad_dict, wrt, cost_name=None): grad_dict, wrt, cost_name=None):
""" """
Common code shared between grad_sources_inputs and grad Helper function for grad function.
var_to_node_to_idx: a dictionary mapping a variable to var_to_node_to_idx: a dictionary mapping a variable to
a second dictionary. a second dictionary.
...@@ -712,7 +738,7 @@ def _populate_grad_dict(var_to_node_to_idx, ...@@ -712,7 +738,7 @@ def _populate_grad_dict(var_to_node_to_idx,
node's input list node's input list
grad_dict: a dictionary mapping variables to their gradients grad_dict: a dictionary mapping variables to their gradients
should be populated by grad or grad_sources_inputs should be populated by grad function.
grad should set gradients to DisconnectedType()() for grad should set gradients to DisconnectedType()() for
variables to be considered constant, set the variables to be considered constant, set the
...@@ -779,38 +805,46 @@ def _populate_grad_dict(var_to_node_to_idx, ...@@ -779,38 +805,46 @@ def _populate_grad_dict(var_to_node_to_idx,
inputs = [try_to_copy_if_needed(ipt) for ipt in inputs] inputs = [try_to_copy_if_needed(ipt) for ipt in inputs]
# Build a list of output gradients with the same dtype as # Build a list of output gradients with the same dtype as
# the corresponding output variable. # the corresponding output variable.
# If an output is of a float dtype, we want to cast the # If an output is of a float dtype, we want to cast the
# output gradient into the same dtype, to avoid having a # output gradient into the same dtype, to avoid having a
# gradient graph with double precision (taking more memory, # gradient graph with double precision (taking more memory,
# and more computation). # and more computation).
# If an output is of an integer dtype, then we ensure the # If an output is of an integer dtype, then we just leave it
# output gradient is zero, and that zero can be represented # alone.
# in the same int dtype. # DO NOT force integer variables to have zero grad. This causes
# If an output gradient is a NullType or DisconnectedType, # bugs where we fail to detect disconnected or undefined gradients.
# then it will not have a dtype, and it will not be changed. # DO NOT force integer variables to have integer dtype. This is
# a violation of the op contract.
new_output_grads = [] new_output_grads = []
for o, og in zip(node.outputs, output_grads): for o, og in zip(node.outputs, output_grads):
o_dt = getattr(o.type, 'dtype', None) o_dt = getattr(o.type, 'dtype', None)
og_dt = getattr(og.type, 'dtype', None) og_dt = getattr(og.type, 'dtype', None)
if og_dt and o_dt in theano.tensor.discrete_dtypes: if o_dt not in theano.tensor.discrete_dtypes and og_dt and o_dt != og_dt:
new_output_grads.append(o.zeros_like())
elif o_dt and og_dt and o_dt != og_dt:
new_output_grads.append(og.astype(o_dt)) new_output_grads.append(og.astype(o_dt))
else: else:
new_output_grads.append(og) new_output_grads.append(og)
# Make sure that, if new_output_grads[i] has a dtype: # Make sure that, if new_output_grads[i] has a floating point dtype,
# - it is the same dtype as outputs[i] # it is the same dtype as outputs[i]
# - if the dtype is an int, then new_output_grads[i] is 0.
for o, ng in zip(node.outputs, new_output_grads): for o, ng in zip(node.outputs, new_output_grads):
o_dt = getattr(o.type, 'dtype', None) o_dt = getattr(o.type, 'dtype', None)
ng_dt = getattr(ng.type, 'dtype', None) ng_dt = getattr(ng.type, 'dtype', None)
if ng_dt: if ng_dt is not None and o_dt not in theano.tensor.discrete_dtypes:
assert ng_dt == o_dt assert ng_dt == o_dt
if ng_dt in theano.tensor.discrete_dtypes:
assert theano.get_constant_value(ng) == 0 # Someone who had obviously not read the Op contract tried
# to modify this part of the function.
# If you ever think it is a good idea to make an integer
# valued gradient, please
# 1) Read the Op contract again
# 2) Talk to Ian Goodfellow
# (Both of these sources will tell you not to do it)
for ng in new_output_grads:
assert getattr(ng.type, 'dtype', None) not in theano.tensor.discrete_dtypes
input_grads = node.op.grad(inputs, new_output_grads) input_grads = node.op.grad(inputs, new_output_grads)
...@@ -863,6 +897,7 @@ def _populate_grad_dict(var_to_node_to_idx, ...@@ -863,6 +897,7 @@ def _populate_grad_dict(var_to_node_to_idx,
'the grad_undefined or grad_unimplemented helper ' 'the grad_undefined or grad_unimplemented helper '
'functions.') % node.op) 'functions.') % node.op)
if not isinstance(term.type, if not isinstance(term.type,
(NullType, DisconnectedType)): (NullType, DisconnectedType)):
if term.type.dtype not in theano.tensor.float_dtypes: if term.type.dtype not in theano.tensor.float_dtypes:
...@@ -875,14 +910,9 @@ def _populate_grad_dict(var_to_node_to_idx, ...@@ -875,14 +910,9 @@ def _populate_grad_dict(var_to_node_to_idx,
# it's not undefined or disconnected # it's not undefined or disconnected
# The only other valid thing it can be is 0 # The only other valid thing it can be is 0
no_constant_value = True is_zero = _is_zero(term)
try: assert is_zero in ['yes', 'no', 'maybe']
constant_value = theano.get_constant_value(term) if is_zero == 'maybe':
no_constant_value = False
except TypeError:
pass
if no_constant_value:
msg = "%s.grad returned %s of type %s for input" msg = "%s.grad returned %s of type %s for input"
msg += " %d. This input's only connections to " msg += " %d. This input's only connections to "
msg += "the cost through this op are via " msg += "the cost through this op are via "
...@@ -896,8 +926,7 @@ def _populate_grad_dict(var_to_node_to_idx, ...@@ -896,8 +926,7 @@ def _populate_grad_dict(var_to_node_to_idx,
msg = msg % (str(node.op), str(term), msg = msg % (str(node.op), str(term),
str(type(term)), i) str(type(term)), i)
raise ValueError(msg) if is_zero == 'no':
if constant_value != 0:
msg = "%s.grad returned %s of type %s for input" msg = "%s.grad returned %s of type %s for input"
msg += " %d. Since this input is only connected " msg += " %d. Since this input is only connected "
msg += "to integer-valued outputs, it should " msg += "to integer-valued outputs, it should "
...@@ -905,7 +934,7 @@ def _populate_grad_dict(var_to_node_to_idx, ...@@ -905,7 +934,7 @@ def _populate_grad_dict(var_to_node_to_idx,
msg += "%s." msg += "%s."
msg % (str(node.op), str(term), str(type(term)), msg % (str(node.op), str(term), str(type(term)),
i, str(constant_value)) i, str(theano.get_constant_value(term)))
raise ValueError(msg) raise ValueError(msg)
...@@ -961,7 +990,7 @@ def _populate_grad_dict(var_to_node_to_idx, ...@@ -961,7 +990,7 @@ def _populate_grad_dict(var_to_node_to_idx,
type(term))) type(term)))
if isinstance(term.type, NullType): if isinstance(term.type, NullType):
raise TypeError("tensor.grad " raise NullTypeGradError("tensor.grad "
"encountered a NaN. " +\ "encountered a NaN. " +\
term.type.why_null) term.type.why_null)
...@@ -997,113 +1026,6 @@ def _populate_grad_dict(var_to_node_to_idx, ...@@ -997,113 +1026,6 @@ def _populate_grad_dict(var_to_node_to_idx,
return rval return rval
def grad_sources_inputs(sources, graph_inputs):
    """
    Used to compute the gradient of a cost with respect to all the
    variables between graph_input and cost, but in the special
    case where you don't know the cost, you only know its gradient
    on a set of intermediate values.

    A gradient source is a pair (``v``, ``g_v``), in which ``v`` is
    a `Variable`, and ``g_v`` is a `Variable` that is a gradient wrt
    ``v``. More specifically, ``g_v`` is the gradient of an external
    scalar cost, ``cost`` (that is not explicitly used), wrt ``v``.

    This function traverses the graph backward from the ``r`` sources,
    calling ``op.grad(...)`` for all ops with some non-None gradient
    on an output, to compute gradients of ``cost`` wrt intermediate
    variables and ``graph_inputs``.

    The ``op.grad(...)`` functions are called like this:

    .. code-block:: python

        op.grad(op.inputs[:], [total_gradient(v) for v in op.outputs])

    This call to ``op.grad`` should return a list or tuple: one symbolic
    gradient per input. These gradients represent the gradients of
    the same implicit ``cost`` mentioned above, wrt ``op.inputs``. Note
    that this is **not** the same as the gradient of ``op.outputs`` wrt
    ``op.inputs``.

    If ``op`` has a single input, then ``op.grad`` should return a list
    or tuple of length 1.

    For each input wrt to which ``op`` is not differentiable, it should
    return ``None`` instead of a `Variable` instance.

    If a source ``r`` receives a gradient from another source ``r2``,
    then the effective gradient on ``r`` is the sum of both gradients.

    :type sources: list of pairs of Variable: (v, gradient-on-v) to
        initialize the total_gradient dictionary
    :param sources: gradients to back-propagate using chain rule

    :type graph_inputs: list of Variable
    :param graph_inputs: variables considered to be constant
        (do not backpropagate through them)

    :rtype: dictionary whose keys and values are of type Variable
    :return: mapping from each Variable encountered in the backward
        traversal to the gradient with respect to that Variable.

    It is assumed that there is some objective J shared between all members
    of sources, so that for each v, gradient-on-v is the gradient of J with
    respect to v.
    """
    outputs, output_grads = zip(*sources)

    # Each supplied gradient must already be a theano Variable; a raw
    # python/numpy value would be ambiguous (tensor vs. sparse), so we
    # reject it up front rather than guessing.
    for output_grad in output_grads:
        if not hasattr(output_grad, 'type'):
            raise TypeError('output grads must be theano variables.'
                    'Ambiguous whether %s should be made into tensor'
                    ' or sparse theano variable' % str(type(output_grad)))

    # Default: differentiate with respect to every graph input reachable
    # from the source outputs.
    if graph_inputs is None:
        graph_inputs = gof.graph.inputs(outputs)

    wrt = graph_inputs

    # Map each variable to the apply nodes (and input positions) through
    # which it connects the wrt variables to the outputs.
    var_to_node_to_idx = _populate_var_to_node_to_idx(outputs, wrt)

    # build a dict mapping var to the gradient of cost with respect to var
    grad_dict = {}

    for output, output_grad in sources:
        # The gradient of the cost should always be 0 if the cost is of
        # discrete (integer) dtype.
        if getattr(output.type, 'dtype', '') not in theano.tensor.float_dtypes:
            output_grad = output.zeros_like()
        else:
            # Cast the provided gradient so that it has the same dtype
            # as the cost.
            output_grad = output_grad.astype(output.type.dtype)

        grad_dict[output] = output_grad

    # variables that do not influence the cost have zero gradient.
    # if wrt is such a variable, populate the grad_dict with this info
    # so that wrt not being in var_to_node_to_idx won't cause an error below
    # according to the flag, possibly raise an error if wrt is disconnected
    for elem in wrt:
        if elem not in var_to_node_to_idx and elem not in outputs:
            grad_dict[elem] = DisconnectedType()()

    # Back-propagate through the graph, filling grad_dict in place.
    _populate_grad_dict(var_to_node_to_idx,
            grad_dict, wrt)

    # post-process out the DisconnectedTypes: callers of this function
    # expect concrete (zero) gradients, not DisconnectedType markers
    for key in grad_dict:
        if isinstance(grad_dict[key].type, DisconnectedType):
            if hasattr(key, 'zeros_like'):
                grad_dict[key] = _float_zeros_like(key)

    return grad_dict
def _float_zeros_like(x): def _float_zeros_like(x):
""" Like zeros_like, but forces the object to have a """ Like zeros_like, but forces the object to have a
a floating point dtype """ a floating point dtype """
...@@ -1634,3 +1556,32 @@ def hessian(cost, wrt, consider_constant=None, ...@@ -1634,3 +1556,32 @@ def hessian(cost, wrt, consider_constant=None,
"script that generated the error)") "script that generated the error)")
hessians.append(hess) hessians.append(hess)
return format_as(using_list, using_tuple, hessians) return format_as(using_list, using_tuple, hessians)
def _is_zero(x):
"""
Returns 'yes', 'no', or 'maybe' indicating whether x
is always 0.
'maybe' means that x is an expression that is complicated enough
that we can't tell that it simplifies to 0.
"""
if not hasattr(x, 'type'):
return np.all(x == 0.)
if isinstance(x.type, NullType):
return 'no'
if isinstance(x.type, DisconnectedType):
return 'yes'
no_constant_value = True
try:
constant_value = theano.get_constant_value(x)
no_constant_value = False
except TypeError:
pass
if no_constant_value:
return 'maybe'
if constant_value != 0.:
return 'no'
return 'yes'
...@@ -221,7 +221,8 @@ class Scan(PureOp): ...@@ -221,7 +221,8 @@ class Scan(PureOp):
'following error has been encountered: The ' 'following error has been encountered: The '
'%s %s (argument number %d) has dtype ' '%s %s (argument number %d) has dtype '
'%s and %d dimension(s). The corresponding slice %s ' '%s and %d dimension(s). The corresponding slice %s '
'however has dtype %s and %d dimension(s). This ' 'however has dtype %s and %d dimension(s) (it should '
'have the same dtype and one fewer dimensions). This '
'should never happen, please ' 'should never happen, please '
'report to theano-dev mailing list' 'report to theano-dev mailing list'
) )
...@@ -1261,11 +1262,9 @@ class Scan(PureOp): ...@@ -1261,11 +1262,9 @@ class Scan(PureOp):
if x in diff_inputs] if x in diff_inputs]
for x in consider_inps: for x in consider_inps:
try: try:
_gmp = gradient.grad_sources_inputs( gmp[x] = gradient.grad(cost=None,
[(y, g_y)], known_grads={y: g_y}, wrt=x)
[x]) except gradient.NullTypeGradError:
gmp[x] = _gmp[x]
except TypeError:
# It means the gradient is undefined (which implies # It means the gradient is undefined (which implies
# is connected) # is connected)
gmp[x] = x gmp[x] = x
...@@ -1374,11 +1373,21 @@ class Scan(PureOp): ...@@ -1374,11 +1373,21 @@ class Scan(PureOp):
self.inner_nitsot_outs(self_outputs)) self.inner_nitsot_outs(self_outputs))
def compute_gradient(y, g_y): def compute_gradient(y, g_y):
gmp = gradient.grad_sources_inputs( if 'int' in str(g_y.dtype):
[(y, g_y)], raise TypeError("Gradients may never be integers but g_y "
[x for x in theano.gof.graph.inputs([y]) "has type "+str(g_y.type))
if x in diff_inputs])
return [gmp.get(p, None) for p in diff_inputs] wrt = [x for x in theano.gof.graph.inputs([y])
if x in diff_inputs]
grads = gradient.grad(
cost = None,
known_grads = {y : g_y },
wrt=wrt, consider_constant=wrt,
disconnected_inputs='ignore',
return_disconnected='None')
gmp = dict(zip(wrt, grads))
rval = [gmp.get(p, None) for p in diff_inputs]
return rval
dC_dinps_t = [None for inp in diff_inputs] dC_dinps_t = [None for inp in diff_inputs]
disconnected_dC_dinps_t = [True for inp in diff_inputs] disconnected_dC_dinps_t = [True for inp in diff_inputs]
dC_dXts = [] dC_dXts = []
......
...@@ -462,13 +462,27 @@ def _allclose(a, b, rtol=None, atol=None): ...@@ -462,13 +462,27 @@ def _allclose(a, b, rtol=None, atol=None):
return numpy.allclose(a, b, atol=atol_, rtol=rtol_) return numpy.allclose(a, b, atol=atol_, rtol=rtol_)
class NotConstantError(TypeError):
"""
Raised by get_constant_value if called on something that is
not constant.
For now it is a TypeError, to maintain the old interface
that get_constant_value should raise a TypeError in this
situation. However, this is unsafe because get_constant_value
could inadvertently raise a TypeError if it has a bug.
So we should eventually make NotConstantError derive
from Exception directly, and modify all code that uses
get_constant_value to catch this more specific exception.
"""
pass
def get_constant_value(v): def get_constant_value(v):
"""return the constant scalar(0-D) value underlying variable `v` """return the constant scalar(0-D) value underlying variable `v`
If v is the output of dimshuffles, fills, allocs, rebroadcasts, cast If v is the output of dimshuffles, fills, allocs, rebroadcasts, cast
this function digs through them. this function digs through them.
If `v` is not some view of constant data, then raise a TypeError. If `v` is not some view of constant data, then raise a NotConstantError.
:note: There may be another function similar to this one in the :note: There may be another function similar to this one in the
code, but I'm not sure where it is. code, but I'm not sure where it is.
...@@ -488,7 +502,7 @@ def get_constant_value(v): ...@@ -488,7 +502,7 @@ def get_constant_value(v):
numpy.complex(data) # works for all numeric scalars numpy.complex(data) # works for all numeric scalars
return data return data
except Exception: except Exception:
raise TypeError( raise NotConstantError(
'v.data is non-numeric, non-scalar, or has more than one' 'v.data is non-numeric, non-scalar, or has more than one'
' unique value', v) ' unique value', v)
if v.owner: if v.owner:
...@@ -516,9 +530,17 @@ def get_constant_value(v): ...@@ -516,9 +530,17 @@ def get_constant_value(v):
v.owner.op.perform(v.owner, [const], ret) v.owner.op.perform(v.owner, [const], ret)
return ret[0][0] return ret[0][0]
if isinstance(v.owner.op, Subtensor) and v.ndim == 0: if isinstance(v.owner.op, Subtensor) and v.ndim == 0:
if isinstance(v.owner.inputs[0], TensorConstant): # This condition depends on Subtensor always embedding constant
return v.owner.inputs[0].data.__getitem__( # indices in the Op rather than making them inputs to the Apply node
if isinstance(v.owner.inputs[0], TensorConstant) and \
len(v.owner.inputs) == 1:
try:
return v.owner.inputs[0].data.__getitem__(
tuple(v.owner.op.idx_list)) tuple(v.owner.op.idx_list))
except IndexError:
raise IndexError(str(tuple(v.owner.op.idx_list))+" is not a valid index into " + \
str(v.owner.inputs[0].data))
# The index list 'idx_list' should have length the same # The index list 'idx_list' should have length the same
# shape as the input. # shape as the input.
...@@ -3780,7 +3802,7 @@ class AdvancedIndexingError(TypeError): ...@@ -3780,7 +3802,7 @@ class AdvancedIndexingError(TypeError):
class Subtensor(Op): class Subtensor(Op):
"""Return a subtensor view """Return a subtensor view
The inputs array is the tensor x, followed by scalar integer variables. The inputs array is the tensor x, followed by scalar integer types.
TODO: WRITEME: how are the scalar integer variables formatted? TODO: WRITEME: how are the scalar integer variables formatted?
This class uses a relatively complex internal representation of the inputs This class uses a relatively complex internal representation of the inputs
...@@ -3789,7 +3811,7 @@ class Subtensor(Op): ...@@ -3789,7 +3811,7 @@ class Subtensor(Op):
idx_list: instance variable TODO: WRITEME: is this a list or a tuple? idx_list: instance variable TODO: WRITEME: is this a list or a tuple?
(old docstring gives two conflicting (old docstring gives two conflicting
descriptions) descriptions)
elements are either integers, theano scalars, or slices. elements are either integers, theano scalar types, or slices.
one element per "explicitly named dimension" one element per "explicitly named dimension"
TODO: WRITEME: what is an "explicitly named dimension" ? TODO: WRITEME: what is an "explicitly named dimension" ?
...@@ -3798,7 +3820,11 @@ class Subtensor(Op): ...@@ -3798,7 +3820,11 @@ class Subtensor(Op):
if slice: if slice:
start/stop/step members of each slice are integer indices start/stop/step members of each slice are integer indices
into the inputs array or None into the inputs array or None
integer indices be actual integers or theano scalars integer indices be actual integers or theano scalar types
Note that the idx_list defines the Op, so two Subtensor instances are
considered to be different Ops if they have different idx_list fields.
This means that the entries in it are theano Types, not theano Variables.
@todo: add support for advanced tensor indexing (in Subtensor_dx too). @todo: add support for advanced tensor indexing (in Subtensor_dx too).
...@@ -3816,6 +3842,17 @@ class Subtensor(Op): ...@@ -3816,6 +3842,17 @@ class Subtensor(Op):
@staticmethod @staticmethod
def collapse(idxs, cond): def collapse(idxs, cond):
"""
idxs: a list of indices or slices.
cond: a callable that returns a bool
returns: idxs, with the slices flattened out into a list.
if cond is true for an entry, does not flatten it.
"""
ret = [] ret = []
def helper(entry): def helper(entry):
...@@ -3828,10 +3865,20 @@ class Subtensor(Op): ...@@ -3828,10 +3865,20 @@ class Subtensor(Op):
for idx in idxs: for idx in idxs:
helper(idx) helper(idx)
return ret return ret
@staticmethod @staticmethod
def convert(entry, slice_ok=True): def convert(entry, slice_ok=True):
"""
The "idx_list" field is unique to each Subtensor instance.
It is not unique to each Apply node, so it should not refer to
specific Variables. This method changes references to Variables
into references to Types.
TODO: WRITEME: This method also accepts "entry" already being a Type;
when would that happen?
"""
invalid_scal_types = [scal.float64, scal.float32] invalid_scal_types = [scal.float64, scal.float32]
scal_types = [scal.int64, scal.int32, scal.int16, scal.int8] scal_types = [scal.int64, scal.int32, scal.int16, scal.int8]
tensor_types = [lscalar, iscalar, wscalar, bscalar] tensor_types = [lscalar, iscalar, wscalar, bscalar]
......
...@@ -801,10 +801,9 @@ class ConvOp(OpenMPOp): ...@@ -801,10 +801,9 @@ class ConvOp(OpenMPOp):
# mimic what happens inside theano.grad: get the input gradient # mimic what happens inside theano.grad: get the input gradient
# of the final cost wrt all variables involved. # of the final cost wrt all variables involved.
tmp_gmap = theano.gradient.grad_sources_inputs( return theano.gradient.grad(cost=None,
[(node, gz)], [inputs, kerns]) known_grads={node: gz}, wrt=[inputs, kerns])
return [tmp_gmap[inputs], tmp_gmap[kerns]]
if self.dx not in (1, 2) or self.dy not in (1, 2): if self.dx not in (1, 2) or self.dy not in (1, 2):
raise NotImplementedError( raise NotImplementedError(
......
...@@ -6,7 +6,6 @@ import unittest ...@@ -6,7 +6,6 @@ import unittest
import theano import theano
from theano import gof from theano import gof
from theano.gradient import grad_sources_inputs
from theano import gradient from theano import gradient
from theano.tensor.nnet.Conv3D import conv3D from theano.tensor.nnet.Conv3D import conv3D
from theano import config from theano import config
...@@ -16,6 +15,16 @@ from theano.gof.null_type import NullType ...@@ -16,6 +15,16 @@ from theano.gof.null_type import NullType
one = theano.tensor.as_tensor_variable(1.) one = theano.tensor.as_tensor_variable(1.)
def grad_sources_inputs(sources, inputs):
"""
This implements the old grad_sources_inputs function in terms of
the new interface so the tests don't need to be rewritten.
"""
if inputs is None:
inputs = theano.gof.graph.inputs([source[0] for source in sources])
return dict(zip(inputs,theano.gradient.grad(cost=None, known_grads=dict(sources),
wrt=inputs, consider_constant=inputs)))
class testgrad_sources_inputs(unittest.TestCase): class testgrad_sources_inputs(unittest.TestCase):
def test_retNone1(self): def test_retNone1(self):
...@@ -369,35 +378,6 @@ class test_grad(unittest.TestCase): ...@@ -369,35 +378,6 @@ class test_grad(unittest.TestCase):
# If we made it to here without an exception, then the # If we made it to here without an exception, then the
# connection_pattern functionality worked correctly # connection_pattern functionality worked correctly
def test_sum_disconnected(self):
# Tests that we can add DisconnectedType to other terms correctly
x = theano.tensor.scalar()
y = x * 2.
z = x + 1.
cost = y + z
theano.tensor.grad(cost, x, consider_constant=[y, z])
# In an earlier version of theano, the above line would have failed
# while trying to add two DisconnectedTypes
def test_output_grad_on_int(self):
# If the g_cost argument is specified when x has a discrete dtype,
# g_cost should be equivalent to 0.
x = theano.tensor.iscalar('x')
y = x * 2
# Should work:
c0 = theano.tensor.constant(0)
theano.grad(y, x, g_cost=c0)
theano.grad(y, x, g_cost=y.zeros_like())
theano.grad(y, x, g_cost=y.zeros_like().astype('float64'))
# Should raise ValueError
c1 = theano.tensor.constant(1)
self.assertRaises(ValueError, theano.grad, y, x, g_cost=c1)
s0 = theano.shared(np.zeros((), dtype='int8'))
self.assertRaises(ValueError, theano.grad, y, x, g_cost=s0)
def test_downcast_dtype(self): def test_downcast_dtype(self):
# Test that the gradient of a cost wrt a float32 variable does not # Test that the gradient of a cost wrt a float32 variable does not
# get upcasted to float64. # get upcasted to float64.
...@@ -418,6 +398,124 @@ class test_grad(unittest.TestCase): ...@@ -418,6 +398,124 @@ class test_grad(unittest.TestCase):
# be downcasted to float32, so dc_dx should also be float32 # be downcasted to float32, so dc_dx should also be float32
assert dc_dx.dtype == 'float32' assert dc_dx.dtype == 'float32'
def test_grad_constant(self):
# Test that the gradient handles Constants and consider_constant variables
# consistently
x = theano.tensor.scalar()
y = theano.tensor.scalar()
z_x = x + y
z_one = one + y
g_x = theano.tensor.grad(z_x, x, consider_constant=[x])
g_one = theano.tensor.grad(z_one, one)
f = theano.function([x, y],[g_x, g_one])
g_x, g_one = f(1, .5)
if not np.allclose(g_x, g_one):
raise AssertionError("Gradient using consider constant is " + str(g_x)\
+ " but gradient with respect to the same Constant is " + \
str(g_one))
def test_known_grads():
# Tests that the grad method with no known_grads
# matches what happens if you put its own known_grads
# in for each variable
full_range = theano.tensor.arange(10)
x = theano.tensor.scalar('x')
t = theano.tensor.iscalar('t')
ft = full_range[t]
ft.name = 'ft'
coeffs = theano.tensor.vector('c')
ct = coeffs[t]
ct.name = 'ct'
p = x ** ft
p.name = 'p'
y = ct * p
y.name = 'y'
cost = theano.tensor.sqr(y)
cost.name = 'cost'
layers = [
[cost],
[y],
[ct,p],
[ct, x, ft],
[coeffs, t, full_range, x]
]
inputs = [coeffs, t, x]
rng = np.random.RandomState([2012, 11, 15])
values = [rng.randn(10), rng.randint(10), rng.randn() ]
values = [np.cast[ipt.dtype](value) for ipt, value in zip(inputs, values)]
true_grads = theano.tensor.grad(cost, inputs, disconnected_inputs='ignore')
true_grads = theano.function(inputs, true_grads)
true_grads = true_grads(*values)
for layer in layers:
print 'Testing by separately computing ',layer
first = theano.tensor.grad(cost, layer, disconnected_inputs='ignore')
known = dict(zip(layer, first))
full = theano.tensor.grad(cost=None,
known_grads=known,wrt=inputs, disconnected_inputs='ignore')
full = theano.function(inputs, full)
full = full(*values)
assert len(true_grads) == len(full)
for a, b, var in zip(true_grads, full, inputs):
if not np.allclose(a, b):
print 'Failure'
print a
print b
print var
print layer
for v in known:
print v,':',theano.function(inputs,known[v])(*values)
assert False
def test_dxdx():
# Tests that the gradient of a scalar with respect to itself is 1
# I use an integer in this case because people keep changing this
# gradient to be 0 on integers but according to our interpretation
# of the gradient as defined in the Op contract, it should be 1.
# If you feel the need to change this unit test you are probably
# modifying the Op contract and should definitely get the approval
# of multiple people on theano-dev.
x = theano.tensor.iscalar()
g = theano.tensor.grad(x, x)
g = g.eval({ x : 12 })
assert np.allclose(g,1.)
def test_known_grads_integers():
# Tests that known_grads works on integers
x = theano.tensor.iscalar()
g_expected = theano.tensor.scalar()
g_grad = theano.gradient.grad(cost=None,
known_grads={x : g_expected},
wrt=x)
f = theano.function([g_expected],g_grad)
x = -3
gv = np.cast[theano.config.floatX](.6)
g_actual = f(gv)
assert np.allclose(g_actual, gv)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -341,15 +341,9 @@ class test_RopLop(RopLop_checker): ...@@ -341,15 +341,9 @@ class test_RopLop(RopLop_checker):
rop_out2 = tensor.Rop((m, v, m + v), [m, v], [m_, v_]) rop_out2 = tensor.Rop((m, v, m + v), [m, v], [m_, v_])
assert isinstance(rop_out2, tuple) assert isinstance(rop_out2, tuple)
assert len(rop_out2) == 3 assert len(rop_out2) == 3
lop_out1 = tensor.Lop([m, v, m + v], (m, v), [m_, v_])
assert isinstance(lop_out1, tuple)
assert len(lop_out1) == 2
lop_out2 = tensor.Lop((m, v, m + v), [m, v], [m_, v_])
assert isinstance(lop_out2, list)
assert len(lop_out2) == 2
all_outs = [] all_outs = []
for o in rop_out1, rop_out2, lop_out1, lop_out2: for o in rop_out1, rop_out2:
all_outs.extend(o) all_outs.extend(o)
f = theano.function([m, v, m_, v_], all_outs) f = theano.function([m, v, m_, v_], all_outs)
f(mval, vval, m_val, v_val) f(mval, vval, m_val, v_val)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论