Merge pull request #1068 from goodfeli/fix_consider_constant

Fixes several issues with gradients and some other bugs

Merge pull request #1068 from goodfeli/fix_consider_constant
40bbb7da · David Warde-Farley · 87cd138e · 83781003 · 40bbb7da · 40bbb7da
--- a/doc/extending/op.txt
+++ b/doc/extending/op.txt
@@ -249,6 +249,8 @@ following methods:
  1) They must be Variable instances.
  2) When they are types that have dtypes, they must never have an integer dtype.
+  The output gradients passed *to* Op.grad will also obey these constraints.
  Integers are a tricky subject. Integers are the main reason for having DisconnectedType,
  NullType or zero gradient. When you have an integer as an argument to your grad method,
  recall the definition of a derivative to help you decide what value to return:

--- a/theano/compile/builders.py
+++ b/theano/compile/builders.py
@@ -55,9 +55,12 @@ class OpFromGraph(gof.Op):
        if grad_depth > 0:
            output_grads = [t() for t in self.output_types]
-            gd = G.grad_sources_inputs(zip(self.outputs, output_grads),
+            # OpFromGraph doesn't implement a connection_pattern, so for now we regard
-                    self.inputs)
+            # all inputs and outputs as connected. This will compute the right numerical
-            gs = map(gd.get, self.inputs)
+            # value for the gradients but could fail to raise the disconnected inputs error
+            # in some cases.
+            gs = G.grad(cost=None, known_grads=dict(zip(self.outputs, output_grads)),
+                    wrt=self.inputs, disconnected_inputs='ignore')
            self.grad_ops = []
            for g in gs:
                if g is None:

--- a/theano/gradient.py
+++ b/theano/gradient.py
--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -221,7 +221,8 @@ class Scan(PureOp):
                    'following error has been encountered: The '
                    '%s %s (argument number %d) has dtype '
                    '%s and %d dimension(s). The corresponding slice %s '
-                    'however has dtype %s and %d dimension(s). This '
+                    'however has dtype %s and %d dimension(s) (it should '
+                    'have the same dtype and one fewer dimensions). This '
                    'should never happen, please '
                    'report to theano-dev mailing list'
                   )
@@ -1261,11 +1262,9 @@ class Scan(PureOp):
                             if x in diff_inputs]
            for x in consider_inps:
                try:
-                    _gmp = gradient.grad_sources_inputs(
+                    gmp[x] = gradient.grad(cost=None,
-                        [(y, g_y)],
+                                           known_grads={y: g_y}, wrt=x)
-                        [x])
+                except gradient.NullTypeGradError:
-                    gmp[x] = _gmp[x]
-                except TypeError:
                    # It means the gradient is undefined (which implies
                    # is connected)
                    gmp[x] = x
@@ -1374,11 +1373,21 @@ class Scan(PureOp):
                        self.inner_nitsot_outs(self_outputs))
        def compute_gradient(y, g_y):
-            gmp = gradient.grad_sources_inputs(
+            if 'int' in str(g_y.dtype):
-                    [(y, g_y)],
+                raise TypeError("Gradients may never be integers but g_y "
-                    [x for x in theano.gof.graph.inputs([y])
+                        "has type "+str(g_y.type))
-                     if x in diff_inputs])
-            return [gmp.get(p, None) for p in diff_inputs]
+            wrt  = [x for x in theano.gof.graph.inputs([y])
+                    if x in diff_inputs]
+            grads =  gradient.grad(
+                    cost = None,
+                    known_grads = {y : g_y },
+                    wrt=wrt, consider_constant=wrt,
+                    disconnected_inputs='ignore',
+                    return_disconnected='None')
+            gmp = dict(zip(wrt, grads))
+            rval =  [gmp.get(p, None) for p in diff_inputs]
+            return rval
        dC_dinps_t = [None for inp in diff_inputs]
        disconnected_dC_dinps_t = [True for inp in diff_inputs]
        dC_dXts = []

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -462,13 +462,27 @@ def _allclose(a, b, rtol=None, atol=None):
    return numpy.allclose(a, b, atol=atol_, rtol=rtol_)
+class NotConstantError(TypeError):
+    """
+    Raised by get_constant_value if called on something that is
+    not constant.
+    For now it is a TypeError, to maintain the old interface
+    that get_constant_value should raise a TypeError in this
+    situation. However, this is unsafe because get_constant_value
+    could inadvertently raise a TypeError if it has a bug.
+    So we should eventually make NotConstantError derive
+    from Exception directly, and modify all code that uses
+    get_constant_value to catch this more specific exception.
+    """
+    pass
 def get_constant_value(v):
    """return the constant scalar(0-D) value underlying variable `v`
    If v is the output of dimshuffles, fills, allocs, rebroadcasts, cast
    this function digs through them.
-    If `v` is not some view of constant data, then raise a TypeError.
+    If `v` is not some view of constant data, then raise a NotConstantError.
    :note: There may be another function similar to this one in the
        code, but I'm not sure where it is.
@@ -488,7 +502,7 @@ def get_constant_value(v):
            numpy.complex(data)  # works for all numeric scalars
            return data
        except Exception:
-            raise TypeError(
+            raise NotConstantError(
                'v.data is non-numeric, non-scalar, or has more than one'
                ' unique value', v)
    if v.owner:
@@ -516,9 +530,17 @@ def get_constant_value(v):
            v.owner.op.perform(v.owner, [const], ret)
            return ret[0][0]
        if isinstance(v.owner.op, Subtensor) and v.ndim == 0:
-            if isinstance(v.owner.inputs[0], TensorConstant):
+            # This condition depends on Subtensor always embedding constant
+            # indices in the Op rather than making them inputs to the Apply node
+            if isinstance(v.owner.inputs[0], TensorConstant) and \
+                len(v.owner.inputs) == 1:
+                try:
                    return v.owner.inputs[0].data.__getitem__(
                    tuple(v.owner.op.idx_list))
+                except IndexError:
+                    raise IndexError(str(tuple(v.owner.op.idx_list))+" is not a valid index into " + \
+                            str(v.owner.inputs[0].data))
            # The index list 'idx_list' should have length the same
            # shape as the input.
@@ -3780,7 +3802,7 @@ class AdvancedIndexingError(TypeError):
 class Subtensor(Op):
    """Return a subtensor view
-    The inputs array is the tensor x, followed by scalar integer variables.
+    The inputs array is the tensor x, followed by scalar integer types.
    TODO: WRITEME: how are the scalar integer variables formatted?
    This class uses a relatively complex internal representation of the inputs
@@ -3789,7 +3811,7 @@ class Subtensor(Op):
    idx_list: instance variable TODO: WRITEME: is this a list or a tuple?
                                        (old docstring gives two conflicting
                                        descriptions)
-              elements are either integers, theano scalars, or slices.
+              elements are either integers, theano scalar types, or slices.
              one element per "explicitly named dimension"
                TODO: WRITEME: what is an "explicitly named dimension" ?
@@ -3798,7 +3820,11 @@ class Subtensor(Op):
              if slice:
                  start/stop/step members of each slice are integer indices
                  into the inputs array or None
-                  integer indices be actual integers or theano scalars
+                  integer indices may be actual integers or theano scalar types
+    Note that the idx_list defines the Op, so two Subtensor instances are
+    considered to be different Ops if they have different idx_list fields.
+    This means that the entries in it are theano Types, not theano Variables.
    @todo: add support for advanced tensor indexing (in Subtensor_dx too).
@@ -3816,6 +3842,17 @@ class Subtensor(Op):
    @staticmethod
    def collapse(idxs, cond):
+        """
+        idxs: a list of indices or slices.
+        cond: a callable that returns a bool
+        returns: idxs, with the slices flattened out into a list.
+                Entries for which cond returns True are not flattened.
+        """
        ret = []
        def helper(entry):
@@ -3828,10 +3865,20 @@ class Subtensor(Op):
        for idx in idxs:
            helper(idx)
        return ret
    @staticmethod
    def convert(entry, slice_ok=True):
+        """
+        The "idx_list" field is unique to each Subtensor instance.
+        It is not unique to each Apply node, so it should not refer to
+        specific Variables. This method changes references to Variables
+        into references to Types.
+        TODO: WRITEME: This method also accepts "entry" already being a Type;
+            when would that happen?
+        """
        invalid_scal_types = [scal.float64, scal.float32]
        scal_types = [scal.int64, scal.int32, scal.int16, scal.int8]
        tensor_types = [lscalar, iscalar, wscalar, bscalar]

--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -801,10 +801,9 @@ class ConvOp(OpenMPOp):
            # mimic what happens inside theano.grad: get the input gradient
            # of the final cost wrt all variables involved.
-            tmp_gmap = theano.gradient.grad_sources_inputs(
+            return theano.gradient.grad(cost=None,
-                [(node, gz)], [inputs, kerns])
+                    known_grads={node: gz}, wrt=[inputs, kerns])
-            return [tmp_gmap[inputs], tmp_gmap[kerns]]
        if self.dx not in (1, 2) or self.dy not in (1, 2):
            raise NotImplementedError(

--- a/theano/tests/test_gradient.py
+++ b/theano/tests/test_gradient.py
@@ -6,7 +6,6 @@ import unittest
 import theano
 from theano import gof
-from theano.gradient import grad_sources_inputs
 from theano import gradient
 from theano.tensor.nnet.Conv3D import conv3D
 from theano import config
@@ -16,6 +15,16 @@ from theano.gof.null_type import NullType
 one = theano.tensor.as_tensor_variable(1.)
+def grad_sources_inputs(sources, inputs):
+    """
+    This implements the old grad_sources_inputs function in terms of
+    the new interface so the tests don't need to be rewritten.
+    """
+    if inputs is None:
+        inputs = theano.gof.graph.inputs([source[0] for source in sources])
+    return dict(zip(inputs,theano.gradient.grad(cost=None, known_grads=dict(sources),
+        wrt=inputs, consider_constant=inputs)))
 class testgrad_sources_inputs(unittest.TestCase):
    def test_retNone1(self):
@@ -369,35 +378,6 @@ class test_grad(unittest.TestCase):
        # If we made it to here without an exception, then the
        # connection_pattern functionality worked correctly
-    def test_sum_disconnected(self):
-        # Tests that we can add DisconnectedType to other terms correctly
-        x = theano.tensor.scalar()
-        y = x * 2.
-        z = x + 1.
-        cost = y + z
-        theano.tensor.grad(cost, x, consider_constant=[y, z])
-        # In an earlier version of theano, the above line would have failed
-        # while trying to add two DisconnectedTypes
-    def test_output_grad_on_int(self):
-        # If the g_cost argument is specified when x has a discrete dtype,
-        # g_cost should be equivalent to 0.
-        x = theano.tensor.iscalar('x')
-        y = x * 2
-        # Should work:
-        c0 = theano.tensor.constant(0)
-        theano.grad(y, x, g_cost=c0)
-        theano.grad(y, x, g_cost=y.zeros_like())
-        theano.grad(y, x, g_cost=y.zeros_like().astype('float64'))
-        # Should raise ValueError
-        c1 = theano.tensor.constant(1)
-        self.assertRaises(ValueError, theano.grad, y, x, g_cost=c1)
-        s0 = theano.shared(np.zeros((), dtype='int8'))
-        self.assertRaises(ValueError, theano.grad, y, x, g_cost=s0)
    def test_downcast_dtype(self):
        # Test that the gradient of a cost wrt a float32 variable does not
        # get upcasted to float64.
@@ -418,6 +398,124 @@ class test_grad(unittest.TestCase):
        # be downcasted to float32, so dc_dx should also be float32
        assert dc_dx.dtype == 'float32'
+    def test_grad_constant(self):
+        # Test that the gradient handles Constants and consider_constant variables
+        # consistently
+        x = theano.tensor.scalar()
+        y = theano.tensor.scalar()
+        z_x = x + y
+        z_one = one + y
+        g_x = theano.tensor.grad(z_x, x, consider_constant=[x])
+        g_one = theano.tensor.grad(z_one, one)
+        f = theano.function([x, y],[g_x, g_one])
+        g_x, g_one = f(1, .5)
+        if not np.allclose(g_x, g_one):
+            raise AssertionError("Gradient using consider constant is " + str(g_x)\
+                    + " but gradient with respect to the same Constant is " + \
+                    str(g_one))
+def test_known_grads():
+    # Tests that calling grad with known_grads set to the gradients that
+    # grad itself computes for an intermediate layer of variables gives
+    # the same result as calling grad on the cost with no known_grads
+    full_range = theano.tensor.arange(10)
+    x = theano.tensor.scalar('x')
+    t = theano.tensor.iscalar('t')
+    ft = full_range[t]
+    ft.name = 'ft'
+    coeffs = theano.tensor.vector('c')
+    ct = coeffs[t]
+    ct.name = 'ct'
+    p = x ** ft
+    p.name = 'p'
+    y = ct * p
+    y.name = 'y'
+    cost = theano.tensor.sqr(y)
+    cost.name = 'cost'
+    layers = [
+            [cost],
+            [y],
+            [ct,p],
+            [ct, x, ft],
+            [coeffs, t, full_range, x]
+            ]
+    inputs = [coeffs, t, x]
+    rng = np.random.RandomState([2012, 11, 15])
+    values = [rng.randn(10), rng.randint(10), rng.randn() ]
+    values = [np.cast[ipt.dtype](value) for ipt, value in zip(inputs, values)]
+    true_grads = theano.tensor.grad(cost, inputs, disconnected_inputs='ignore')
+    true_grads = theano.function(inputs, true_grads)
+    true_grads = true_grads(*values)
+    for layer in layers:
+        print 'Testing by separately computing ',layer
+        first = theano.tensor.grad(cost, layer, disconnected_inputs='ignore')
+        known = dict(zip(layer, first))
+        full = theano.tensor.grad(cost=None,
+                known_grads=known,wrt=inputs, disconnected_inputs='ignore')
+        full = theano.function(inputs, full)
+        full = full(*values)
+        assert len(true_grads) == len(full)
+        for a, b, var in zip(true_grads, full, inputs):
+            if not np.allclose(a, b):
+                print 'Failure'
+                print a
+                print b
+                print var
+                print layer
+                for v in known:
+                    print v,':',theano.function(inputs,known[v])(*values)
+                assert False
+def test_dxdx():
+    # Tests that the gradient of a scalar with respect to itself is 1
+    # I use an integer in this case because people keep changing this
+    # gradient to be 0 on integers but according to our interpretation
+    # of the gradient as defined in the Op contract, it should be 1.
+    # If you feel the need to change this unit test you are probably
+    # modifying the Op contract and should definitely get the approval
+    # of multiple people on theano-dev.
+    x = theano.tensor.iscalar()
+    g = theano.tensor.grad(x, x)
+    g = g.eval({ x : 12 })
+    assert np.allclose(g,1.)
+def test_known_grads_integers():
+    # Tests that known_grads works on integers
+    x = theano.tensor.iscalar()
+    g_expected = theano.tensor.scalar()
+    g_grad = theano.gradient.grad(cost=None,
+            known_grads={x : g_expected},
+            wrt=x)
+    f = theano.function([g_expected],g_grad)
+    x = -3
+    gv = np.cast[theano.config.floatX](.6)
+    g_actual = f(gv)
+    assert np.allclose(g_actual, gv)
 if __name__ == '__main__':
    unittest.main()
--- a/theano/tests/test_rop.py
+++ b/theano/tests/test_rop.py
@@ -341,15 +341,9 @@ class test_RopLop(RopLop_checker):
        rop_out2 = tensor.Rop((m, v, m + v), [m, v], [m_, v_])
        assert isinstance(rop_out2, tuple)
        assert len(rop_out2) == 3
-        lop_out1 = tensor.Lop([m, v, m + v], (m, v), [m_, v_])
-        assert isinstance(lop_out1, tuple)
-        assert len(lop_out1) == 2
-        lop_out2 = tensor.Lop((m, v, m + v), [m, v], [m_, v_])
-        assert isinstance(lop_out2, list)
-        assert len(lop_out2) == 2
        all_outs = []
-        for o in rop_out1, rop_out2, lop_out1, lop_out2:
+        for o in rop_out1, rop_out2:
            all_outs.extend(o)
        f = theano.function([m, v, m_, v_], all_outs)
        f(mval, vval, m_val, v_val)