提交 40bbb7da 作者: David Warde-Farley

Merge pull request #1068 from goodfeli/fix_consider_constant

Fixes several issues with gradients and some other bugs
...@@ -249,6 +249,8 @@ following methods: ...@@ -249,6 +249,8 @@ following methods:
1) They must be Variable instances. 1) They must be Variable instances.
2) When they are types that have dtypes, they must never have an integer dtype. 2) When they are types that have dtypes, they must never have an integer dtype.
The output gradients passed *to* Op.grad will also obey these constraints.
Integers are a tricky subject. Integers are the main reason for having DisconnectedType, Integers are a tricky subject. Integers are the main reason for having DisconnectedType,
NullType or zero gradient. When you have an integer as an argument to your grad method, NullType or zero gradient. When you have an integer as an argument to your grad method,
recall the definition of a derivative to help you decide what value to return: recall the definition of a derivative to help you decide what value to return:
......
...@@ -55,9 +55,12 @@ class OpFromGraph(gof.Op): ...@@ -55,9 +55,12 @@ class OpFromGraph(gof.Op):
if grad_depth > 0: if grad_depth > 0:
output_grads = [t() for t in self.output_types] output_grads = [t() for t in self.output_types]
gd = G.grad_sources_inputs(zip(self.outputs, output_grads), # OpFromGraph doesn't implement a connection_pattern, so for now we regard
self.inputs) # all inputs and outputs as connected. This will compute the right numerical
gs = map(gd.get, self.inputs) # value for the gradients but could fail to raise the disconnected inputs error
# in some cases.
gs = G.grad(cost=None, known_grads=dict(zip(self.outputs, output_grads)),
wrt=self.inputs, disconnected_inputs='ignore')
self.grad_ops = [] self.grad_ops = []
for g in gs: for g in gs:
if g is None: if g is None:
......
差异被折叠。
...@@ -221,7 +221,8 @@ class Scan(PureOp): ...@@ -221,7 +221,8 @@ class Scan(PureOp):
'following error has been encountered: The ' 'following error has been encountered: The '
'%s %s (argument number %d) has dtype ' '%s %s (argument number %d) has dtype '
'%s and %d dimension(s). The corresponding slice %s ' '%s and %d dimension(s). The corresponding slice %s '
'however has dtype %s and %d dimension(s). This ' 'however has dtype %s and %d dimension(s) (it should '
'have the same dtype and one fewer dimensions). This '
'should never happen, please ' 'should never happen, please '
'report to theano-dev mailing list' 'report to theano-dev mailing list'
) )
...@@ -1261,11 +1262,9 @@ class Scan(PureOp): ...@@ -1261,11 +1262,9 @@ class Scan(PureOp):
if x in diff_inputs] if x in diff_inputs]
for x in consider_inps: for x in consider_inps:
try: try:
_gmp = gradient.grad_sources_inputs( gmp[x] = gradient.grad(cost=None,
[(y, g_y)], known_grads={y: g_y}, wrt=x)
[x]) except gradient.NullTypeGradError:
gmp[x] = _gmp[x]
except TypeError:
# It means the gradient is undefined (which implies # It means the gradient is undefined (which implies
# is connected) # is connected)
gmp[x] = x gmp[x] = x
...@@ -1374,11 +1373,21 @@ class Scan(PureOp): ...@@ -1374,11 +1373,21 @@ class Scan(PureOp):
self.inner_nitsot_outs(self_outputs)) self.inner_nitsot_outs(self_outputs))
def compute_gradient(y, g_y): def compute_gradient(y, g_y):
gmp = gradient.grad_sources_inputs( if 'int' in str(g_y.dtype):
[(y, g_y)], raise TypeError("Gradients may never be integers but g_y "
[x for x in theano.gof.graph.inputs([y]) "has type "+str(g_y.type))
if x in diff_inputs])
return [gmp.get(p, None) for p in diff_inputs] wrt = [x for x in theano.gof.graph.inputs([y])
if x in diff_inputs]
grads = gradient.grad(
cost = None,
known_grads = {y : g_y },
wrt=wrt, consider_constant=wrt,
disconnected_inputs='ignore',
return_disconnected='None')
gmp = dict(zip(wrt, grads))
rval = [gmp.get(p, None) for p in diff_inputs]
return rval
dC_dinps_t = [None for inp in diff_inputs] dC_dinps_t = [None for inp in diff_inputs]
disconnected_dC_dinps_t = [True for inp in diff_inputs] disconnected_dC_dinps_t = [True for inp in diff_inputs]
dC_dXts = [] dC_dXts = []
......
...@@ -462,13 +462,27 @@ def _allclose(a, b, rtol=None, atol=None): ...@@ -462,13 +462,27 @@ def _allclose(a, b, rtol=None, atol=None):
return numpy.allclose(a, b, atol=atol_, rtol=rtol_) return numpy.allclose(a, b, atol=atol_, rtol=rtol_)
class NotConstantError(TypeError):
    """Signal that ``get_constant_value`` was called on a non-constant.

    The class currently subclasses ``TypeError`` to preserve the old
    interface, under which ``get_constant_value`` raised a ``TypeError``
    in this situation.  That inheritance is unsafe: a ``TypeError``
    raised inadvertently by a bug inside ``get_constant_value`` would be
    mistaken for this signal.  Eventually this class should derive from
    ``Exception`` directly, and all code that uses ``get_constant_value``
    should be changed to catch this more specific exception.
    """
def get_constant_value(v): def get_constant_value(v):
"""return the constant scalar(0-D) value underlying variable `v` """return the constant scalar(0-D) value underlying variable `v`
If v is the output of dimshuffles, fills, allocs, rebroadcasts, cast If v is the output of dimshuffles, fills, allocs, rebroadcasts, cast
this function digs through them. this function digs through them.
If `v` is not some view of constant data, then raise a TypeError. If `v` is not some view of constant data, then raise a NotConstantError.
:note: There may be another function similar to this one in the :note: There may be another function similar to this one in the
code, but I'm not sure where it is. code, but I'm not sure where it is.
...@@ -488,7 +502,7 @@ def get_constant_value(v): ...@@ -488,7 +502,7 @@ def get_constant_value(v):
numpy.complex(data) # works for all numeric scalars numpy.complex(data) # works for all numeric scalars
return data return data
except Exception: except Exception:
raise TypeError( raise NotConstantError(
'v.data is non-numeric, non-scalar, or has more than one' 'v.data is non-numeric, non-scalar, or has more than one'
' unique value', v) ' unique value', v)
if v.owner: if v.owner:
...@@ -516,9 +530,17 @@ def get_constant_value(v): ...@@ -516,9 +530,17 @@ def get_constant_value(v):
v.owner.op.perform(v.owner, [const], ret) v.owner.op.perform(v.owner, [const], ret)
return ret[0][0] return ret[0][0]
if isinstance(v.owner.op, Subtensor) and v.ndim == 0: if isinstance(v.owner.op, Subtensor) and v.ndim == 0:
if isinstance(v.owner.inputs[0], TensorConstant): # This condition depends on Subtensor always embedding constant
return v.owner.inputs[0].data.__getitem__( # indices in the Op rather than making them inputs to the Apply node
if isinstance(v.owner.inputs[0], TensorConstant) and \
len(v.owner.inputs) == 1:
try:
return v.owner.inputs[0].data.__getitem__(
tuple(v.owner.op.idx_list)) tuple(v.owner.op.idx_list))
except IndexError:
raise IndexError(str(tuple(v.owner.op.idx_list))+" is not a valid index into " + \
str(v.owner.inputs[0].data))
# The index list 'idx_list' should have length the same # The index list 'idx_list' should have length the same
# shape as the input. # shape as the input.
...@@ -3780,7 +3802,7 @@ class AdvancedIndexingError(TypeError): ...@@ -3780,7 +3802,7 @@ class AdvancedIndexingError(TypeError):
class Subtensor(Op): class Subtensor(Op):
"""Return a subtensor view """Return a subtensor view
The inputs array is the tensor x, followed by scalar integer variables. The inputs array is the tensor x, followed by scalar integer types.
TODO: WRITEME: how are the scalar integer variables formatted? TODO: WRITEME: how are the scalar integer variables formatted?
This class uses a relatively complex internal representation of the inputs This class uses a relatively complex internal representation of the inputs
...@@ -3789,7 +3811,7 @@ class Subtensor(Op): ...@@ -3789,7 +3811,7 @@ class Subtensor(Op):
idx_list: instance variable TODO: WRITEME: is this a list or a tuple? idx_list: instance variable TODO: WRITEME: is this a list or a tuple?
(old docstring gives two conflicting (old docstring gives two conflicting
descriptions) descriptions)
elements are either integers, theano scalars, or slices. elements are either integers, theano scalar types, or slices.
one element per "explicitly named dimension" one element per "explicitly named dimension"
TODO: WRITEME: what is an "explicitly named dimension" ? TODO: WRITEME: what is an "explicitly named dimension" ?
...@@ -3798,7 +3820,11 @@ class Subtensor(Op): ...@@ -3798,7 +3820,11 @@ class Subtensor(Op):
if slice: if slice:
start/stop/step members of each slice are integer indices start/stop/step members of each slice are integer indices
into the inputs array or None into the inputs array or None
integer indices be actual integers or theano scalars integer indices be actual integers or theano scalar types
Note that the idx_list defines the Op, so two Subtensor instances are
considered to be different Ops if they have different idx_list fields.
This means that the entries in it are theano Types, not theano Variables.
@todo: add support for advanced tensor indexing (in Subtensor_dx too). @todo: add support for advanced tensor indexing (in Subtensor_dx too).
...@@ -3816,6 +3842,17 @@ class Subtensor(Op): ...@@ -3816,6 +3842,17 @@ class Subtensor(Op):
@staticmethod @staticmethod
def collapse(idxs, cond): def collapse(idxs, cond):
"""
idxs: a list of indices or slices.
cond: a callable that returns a bool
returns: idxs, with the slices flattened out into a list.
if cond is true for an entry, does not flatten it.
"""
ret = [] ret = []
def helper(entry): def helper(entry):
...@@ -3828,10 +3865,20 @@ class Subtensor(Op): ...@@ -3828,10 +3865,20 @@ class Subtensor(Op):
for idx in idxs: for idx in idxs:
helper(idx) helper(idx)
return ret return ret
@staticmethod @staticmethod
def convert(entry, slice_ok=True): def convert(entry, slice_ok=True):
"""
The "idx_list" field is unique to each Subtensor instance.
It is not unique to each Apply node, so it should not refer to
specific Variables. This method changes references to Variables
into references to Types.
TODO: WRITEME: This method also accepts "entry" already being a Type;
when would that happen?
"""
invalid_scal_types = [scal.float64, scal.float32] invalid_scal_types = [scal.float64, scal.float32]
scal_types = [scal.int64, scal.int32, scal.int16, scal.int8] scal_types = [scal.int64, scal.int32, scal.int16, scal.int8]
tensor_types = [lscalar, iscalar, wscalar, bscalar] tensor_types = [lscalar, iscalar, wscalar, bscalar]
......
...@@ -801,10 +801,9 @@ class ConvOp(OpenMPOp): ...@@ -801,10 +801,9 @@ class ConvOp(OpenMPOp):
# mimic what happens inside theano.grad: get the input gradient # mimic what happens inside theano.grad: get the input gradient
# of the final cost wrt all variables involved. # of the final cost wrt all variables involved.
tmp_gmap = theano.gradient.grad_sources_inputs( return theano.gradient.grad(cost=None,
[(node, gz)], [inputs, kerns]) known_grads={node: gz}, wrt=[inputs, kerns])
return [tmp_gmap[inputs], tmp_gmap[kerns]]
if self.dx not in (1, 2) or self.dy not in (1, 2): if self.dx not in (1, 2) or self.dy not in (1, 2):
raise NotImplementedError( raise NotImplementedError(
......
...@@ -6,7 +6,6 @@ import unittest ...@@ -6,7 +6,6 @@ import unittest
import theano import theano
from theano import gof from theano import gof
from theano.gradient import grad_sources_inputs
from theano import gradient from theano import gradient
from theano.tensor.nnet.Conv3D import conv3D from theano.tensor.nnet.Conv3D import conv3D
from theano import config from theano import config
...@@ -16,6 +15,16 @@ from theano.gof.null_type import NullType ...@@ -16,6 +15,16 @@ from theano.gof.null_type import NullType
one = theano.tensor.as_tensor_variable(1.) one = theano.tensor.as_tensor_variable(1.)
def grad_sources_inputs(sources, inputs):
    """Emulate the old ``grad_sources_inputs`` on top of ``theano.gradient.grad``.

    Kept so that the existing tests do not need to be rewritten for the
    new interface.

    sources: sequence of (variable, gradient) pairs, forwarded to ``grad``
        as its ``known_grads`` dictionary.
    inputs: the variables to differentiate with respect to, or None to use
        the graph inputs of the source variables.
    returns: a dict pairing each input with the gradient ``grad`` returned
        for it.
    """
    if inputs is None:
        source_vars = [pair[0] for pair in sources]
        inputs = theano.gof.graph.inputs(source_vars)
    gradients = theano.gradient.grad(
        cost=None,
        known_grads=dict(sources),
        wrt=inputs,
        consider_constant=inputs)
    return dict(zip(inputs, gradients))
class testgrad_sources_inputs(unittest.TestCase): class testgrad_sources_inputs(unittest.TestCase):
def test_retNone1(self): def test_retNone1(self):
...@@ -369,35 +378,6 @@ class test_grad(unittest.TestCase): ...@@ -369,35 +378,6 @@ class test_grad(unittest.TestCase):
# If we made it to here without an exception, then the # If we made it to here without an exception, then the
# connection_pattern functionality worked correctly # connection_pattern functionality worked correctly
def test_sum_disconnected(self):
    # Checks that DisconnectedType terms can be added to other terms.
    # An earlier version of theano failed in the grad call below while
    # trying to add two DisconnectedTypes.
    x = theano.tensor.scalar()
    doubled = x * 2.
    shifted = x + 1.
    total = doubled + shifted
    theano.tensor.grad(total, x, consider_constant=[doubled, shifted])
def test_output_grad_on_int(self):
    # When g_cost is given and x has a discrete dtype, g_cost is expected
    # to be equivalent to 0.
    x = theano.tensor.iscalar('x')
    y = x * 2

    # Zero-valued output gradients: each of these calls should work.
    accepted = (
        theano.tensor.constant(0),
        y.zeros_like(),
        y.zeros_like().astype('float64'),
    )
    for zero_grad in accepted:
        theano.grad(y, x, g_cost=zero_grad)

    # A non-zero constant and an integer shared variable: each of these
    # calls should raise ValueError.
    rejected = (
        theano.tensor.constant(1),
        theano.shared(np.zeros((), dtype='int8')),
    )
    for bad_grad in rejected:
        self.assertRaises(ValueError, theano.grad, y, x, g_cost=bad_grad)
def test_downcast_dtype(self): def test_downcast_dtype(self):
# Test that the gradient of a cost wrt a float32 variable does not # Test that the gradient of a cost wrt a float32 variable does not
# get upcasted to float64. # get upcasted to float64.
...@@ -418,6 +398,124 @@ class test_grad(unittest.TestCase): ...@@ -418,6 +398,124 @@ class test_grad(unittest.TestCase):
# be downcasted to float32, so dc_dx should also be float32 # be downcasted to float32, so dc_dx should also be float32
assert dc_dx.dtype == 'float32' assert dc_dx.dtype == 'float32'
def test_grad_constant(self):
    # Checks that a variable listed in consider_constant and a genuine
    # Constant are handled consistently by the gradient.
    x = theano.tensor.scalar()
    y = theano.tensor.scalar()
    sum_with_x = x + y
    sum_with_one = one + y

    sym_g_x = theano.tensor.grad(sum_with_x, x, consider_constant=[x])
    sym_g_one = theano.tensor.grad(sum_with_one, one)

    compiled = theano.function([x, y], [sym_g_x, sym_g_one])
    g_x, g_one = compiled(1, .5)

    if np.allclose(g_x, g_one):
        return
    message = ("Gradient using consider constant is " + str(g_x)
               + " but gradient with respect to the same Constant is "
               + str(g_one))
    raise AssertionError(message)
def test_known_grads():
    """Check that ``known_grads`` reproduces the ordinary gradient.

    For each layer of the graph built below, the gradients of the cost
    with respect to that layer are computed first and then supplied back
    through ``known_grads`` (with ``cost=None``).  The gradients with
    respect to the inputs obtained that way must match the ones obtained
    by differentiating the cost directly.

    Raises AssertionError, carrying the mismatching values, the input, the
    layer and the evaluated known gradients, on the first mismatch.
    """
    full_range = theano.tensor.arange(10)
    x = theano.tensor.scalar('x')
    t = theano.tensor.iscalar('t')
    ft = full_range[t]
    ft.name = 'ft'
    coeffs = theano.tensor.vector('c')
    ct = coeffs[t]
    ct.name = 'ct'
    p = x ** ft
    p.name = 'p'
    y = ct * p
    y.name = 'y'
    cost = theano.tensor.sqr(y)
    cost.name = 'cost'

    # Successive groups of variables, starting at the cost and moving
    # towards the variables the graph was built from.
    layers = [
        [cost],
        [y],
        [ct, p],
        [ct, x, ft],
        [coeffs, t, full_range, x],
    ]

    inputs = [coeffs, t, x]

    rng = np.random.RandomState([2012, 11, 15])
    values = [rng.randn(10), rng.randint(10), rng.randn()]
    # Cast each value to the dtype of the input it feeds.
    values = [np.asarray(value, dtype=ipt.dtype)
              for ipt, value in zip(inputs, values)]

    true_grads = theano.tensor.grad(cost, inputs,
                                    disconnected_inputs='ignore')
    true_grads = theano.function(inputs, true_grads)
    true_grads = true_grads(*values)

    for layer in layers:
        # A single-string print produces the same output whether print is
        # a statement or a function.
        print('Testing by separately computing  ' + str(layer))
        first = theano.tensor.grad(cost, layer,
                                   disconnected_inputs='ignore')
        known = dict(zip(layer, first))
        full = theano.tensor.grad(cost=None, known_grads=known,
                                  wrt=inputs, disconnected_inputs='ignore')
        full = theano.function(inputs, full)
        full = full(*values)
        assert len(true_grads) == len(full)
        for a, b, var in zip(true_grads, full, inputs):
            if np.allclose(a, b):
                continue
            # Gather the diagnostics into the exception itself so they are
            # reported with the failure instead of only going to stdout.
            details = ['Failure', str(a), str(b), str(var), str(layer)]
            for v in known:
                known_value = theano.function(inputs, known[v])(*values)
                details.append('%s : %s' % (v, known_value))
            raise AssertionError('\n'.join(details))
def test_dxdx():
    # Checks that the gradient of a scalar with respect to itself is 1.
    # An integer is used on purpose: people keep changing this gradient to
    # be 0 on integers, but under our interpretation of the gradient as
    # defined in the Op contract it should be 1.  If you feel the need to
    # change this unit test you are probably modifying the Op contract and
    # should definitely get the approval of multiple people on theano-dev.
    x = theano.tensor.iscalar()
    dx_dx = theano.tensor.grad(x, x)
    evaluated = dx_dx.eval({x: 12})
    assert np.allclose(evaluated, 1.)
def test_known_grads_integers():
    """Check that ``known_grads`` works when the variable is an integer.

    The gradient supplied for the integer scalar ``x`` through
    ``known_grads`` is requested back with ``wrt=x``; the compiled function
    must return the value it was given.
    """
    x = theano.tensor.iscalar()
    g_expected = theano.tensor.scalar()
    g_grad = theano.gradient.grad(cost=None,
                                  known_grads={x: g_expected},
                                  wrt=x)

    # Only g_expected is an input of the compiled function; no value for x
    # is ever supplied.
    f = theano.function([g_expected], g_grad)

    gv = np.asarray(.6, dtype=theano.config.floatX)
    g_actual = f(gv)

    assert np.allclose(g_actual, gv)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -341,15 +341,9 @@ class test_RopLop(RopLop_checker): ...@@ -341,15 +341,9 @@ class test_RopLop(RopLop_checker):
rop_out2 = tensor.Rop((m, v, m + v), [m, v], [m_, v_]) rop_out2 = tensor.Rop((m, v, m + v), [m, v], [m_, v_])
assert isinstance(rop_out2, tuple) assert isinstance(rop_out2, tuple)
assert len(rop_out2) == 3 assert len(rop_out2) == 3
lop_out1 = tensor.Lop([m, v, m + v], (m, v), [m_, v_])
assert isinstance(lop_out1, tuple)
assert len(lop_out1) == 2
lop_out2 = tensor.Lop((m, v, m + v), [m, v], [m_, v_])
assert isinstance(lop_out2, list)
assert len(lop_out2) == 2
all_outs = [] all_outs = []
for o in rop_out1, rop_out2, lop_out1, lop_out2: for o in rop_out1, rop_out2:
all_outs.extend(o) all_outs.extend(o)
f = theano.function([m, v, m_, v_], all_outs) f = theano.function([m, v, m_, v_], all_outs)
f(mval, vval, m_val, v_val) f(mval, vval, m_val, v_val)
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论