Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
40bbb7da
提交
40bbb7da
authored
11月 16, 2012
作者:
David Warde-Farley
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1068 from goodfeli/fix_consider_constant
Fixes several issues with gradients and some other bugs
上级
87cd138e
83781003
隐藏空白字符变更
内嵌
并排
正在显示
8 个修改的文件
包含
402 行增加
和
299 行删除
+402
-299
op.txt
doc/extending/op.txt
+2
-0
builders.py
theano/compile/builders.py
+6
-3
gradient.py
theano/gradient.py
+189
-238
scan_op.py
theano/scan_module/scan_op.py
+20
-11
basic.py
theano/tensor/basic.py
+54
-7
conv.py
theano/tensor/nnet/conv.py
+2
-3
test_gradient.py
theano/tests/test_gradient.py
+128
-30
test_rop.py
theano/tests/test_rop.py
+1
-7
没有找到文件。
doc/extending/op.txt
浏览文件 @
40bbb7da
...
...
@@ -249,6 +249,8 @@ following methods:
1) They must be Variable instances.
2) When they are types that have dtypes, they must never have an integer dtype.
The output gradients passed *to* Op.grad will also obey these constraints.
Integers are a tricky subject. Integers are the main reason for having DisconnectedType,
NullType or zero gradient. When you have an integer as an argument to your grad method,
recall the definition of a derivative to help you decide what value to return:
...
...
theano/compile/builders.py
浏览文件 @
40bbb7da
...
...
@@ -55,9 +55,12 @@ class OpFromGraph(gof.Op):
if
grad_depth
>
0
:
output_grads
=
[
t
()
for
t
in
self
.
output_types
]
gd
=
G
.
grad_sources_inputs
(
zip
(
self
.
outputs
,
output_grads
),
self
.
inputs
)
gs
=
map
(
gd
.
get
,
self
.
inputs
)
# OpFromGraph doesn't implement a connection_pattern, so for now we regard
# all inputs and outputs as connected. This will compute the right numerical
# value for the gradients but could fail to raise the disconnected inputs error
# in some cases.
gs
=
G
.
grad
(
cost
=
None
,
known_grads
=
dict
(
zip
(
self
.
outputs
,
output_grads
)),
wrt
=
self
.
inputs
,
disconnected_inputs
=
'ignore'
)
self
.
grad_ops
=
[]
for
g
in
gs
:
if
g
is
None
:
...
...
theano/gradient.py
浏览文件 @
40bbb7da
...
...
@@ -13,9 +13,11 @@ import warnings
_logger
=
logging
.
getLogger
(
'theano.gradient'
)
import
numpy
# for numeric_grad
np
=
numpy
import
theano
from
itertools
import
izip
from
theano
import
gof
from
theano.gof
import
Variable
from
theano.gof.python25
import
all
...
...
@@ -317,9 +319,6 @@ def Lop(f, wrt, eval_points, consider_constant=None,
coordinates of the tensor element in the last
If `f` is a list/tuple, then return a list/tuple with the results.
"""
if
consider_constant
is
None
:
consider_constant
=
[]
if
type
(
eval_points
)
not
in
(
list
,
tuple
):
eval_points
=
[
eval_points
]
...
...
@@ -333,50 +332,15 @@ def Lop(f, wrt, eval_points, consider_constant=None,
f
=
list
(
f
)
grads
=
list
(
eval_points
)
for
elem
in
consider_constant
:
assert
elem
not
in
f
f
.
append
(
elem
)
grads
.
append
(
elem
.
zeros_like
())
if
not
isinstance
(
wrt
,
(
list
,
tuple
)):
wrt
=
[
wrt
]
arg1
=
zip
(
f
,
eval_points
)
arg2
=
list
(
wrt
)
gmap
=
grad_sources_inputs
(
arg1
,
arg2
)
# Note : If p is not in gmap there can be several reasons, among which
# is the fact that p might not be part of the computational graph. A
# simple example is that for a+b for e.g. a[0] is not part of the graph,
# so Theano does not know how to compute TT.grad(TT.sum(a+b), a[0])
# such subtle cases can be fixed by a more careful implementation of the
# gradient, but for now Theano needs to throw an exception, and make the
# user aware that it does not know how to compute that gradient
ret
=
[]
for
p
in
wrt
:
if
p
in
gmap
:
ret
.
append
(
gmap
[
p
])
else
:
message
=
(
"Lop method was asked to compute the gradient "
"with respect to a variable that is not part of "
"the computational graph of the cost, or is used "
"only by a non-differentiable operator:
%
s"
%
p
)
if
disconnected_inputs
==
'ignore'
:
pass
elif
disconnected_inputs
==
'warn'
:
warnings
.
warn
(
message
,
stacklevel
=
1
)
elif
disconnected_inputs
==
'raise'
:
raise
ValueError
(
message
)
else
:
raise
ValueError
(
"Invalid value for keyword "
"'disconnected_inputs', valid values are "
"'ignore', 'warn' and 'raise'."
)
ret
.
append
(
p
.
zeros_like
())
assert
len
(
f
)
==
len
(
grads
)
known
=
dict
(
izip
(
f
,
grads
))
ret
=
grad
(
cost
=
None
,
known_grads
=
known
,
consider_constant
=
consider_constant
,
wrt
=
wrt
,
disconnected_inputs
=
disconnected_inputs
)
return
format_as
(
using_list
,
using_tuple
,
ret
)
...
...
@@ -386,9 +350,11 @@ def Lop(f, wrt, eval_points, consider_constant=None,
#########################
def
grad
(
cost
,
wrt
,
g_cost
=
None
,
consider_constant
=
None
,
disconnected_inputs
=
'raise'
,
add_names
=
True
):
disconnected_inputs
=
'raise'
,
add_names
=
True
,
known_grads
=
None
,
return_disconnected
=
'zero'
):
"""
:type cost: Scalar (0-dimensional) Variable.
May optionally be None if known_grads is provided.
:type wrt: Variable or list of Variables.
:type g_cost: Scalar Variable, or None.
:param g_cost: an expression for the gradient through cost. The default is
...
...
@@ -409,6 +375,20 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
(d<cost.name>/d<wrt.name>) provided that both cost and wrt have
names
:type known_grads: dict
:param known_grads: If not None, a dictionary mapping variables to their
gradients. This is useful in the case where you know the
gradient on some variables but do not know the original
cost.
:type return_disconnected: string
:param return_disconnected:
'zero' : If wrt[i] is disconnected, return value i will be
wrt[i].zeros_like()
'None' : If wrt[i] is disconnected, return value i will be
None
'Disconnected' : returns variables of type DisconnectedType
:rtype: Variable or list/tuple of Variables (depending upon `wrt`)
:return: symbolic expression of gradient of `cost` with respect to `wrt`.
...
...
@@ -422,29 +402,17 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
if
tensor
is
None
:
from
theano
import
tensor
if
isinstance
(
cost
.
type
,
NullType
):
if
cost
is
None
:
assert
known_grads
is
not
None
if
cost
is
not
None
and
isinstance
(
cost
.
type
,
NullType
):
raise
ValueError
(
"Can't differentiate a NaN cost."
"cost is NaN because "
+
\
cost
.
type
.
why_null
)
if
cost
.
ndim
!=
0
:
if
cost
is
not
None
and
cost
.
ndim
!=
0
:
raise
TypeError
(
"cost must be a scalar."
)
if
consider_constant
is
None
:
consider_constant
=
[]
else
:
# error checking on consider_constant: verify that it is a collection
# of theano variables
# this is important, if someone accidentally passes a nested data
# structure with theano variables at the leaves, only the root will
# be properly considered constant
if
not
hasattr
(
consider_constant
,
'__iter__'
):
raise
TypeError
(
'consider_constant must be an iterable collection,'
' got '
+
str
(
type
(
consider_constant
)))
for
elem
in
consider_constant
:
if
not
isinstance
(
elem
,
gof
.
Variable
):
raise
TypeError
(
'Elements of consider_constant must be '
'variables, but got '
+
str
(
type
(
elem
)))
if
isinstance
(
wrt
,
set
):
raise
TypeError
(
"wrt must not be a set. sets have no defined "
...
...
@@ -461,7 +429,14 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
raise
TypeError
(
"Expected Variable, got "
+
str
(
elem
)
+
" of type "
+
str
(
type
(
elem
)))
var_to_node_to_idx
=
_populate_var_to_node_to_idx
([
cost
],
wrt
)
outputs
=
[]
if
cost
is
not
None
:
outputs
.
append
(
cost
)
if
known_grads
is
not
None
:
outputs
.
extend
(
known_grads
.
keys
())
var_to_node_to_idx
=
_populate_var_to_node_to_idx
(
outputs
,
wrt
,
consider_constant
)
# build a dict mapping var to the gradient of cost with respect to var
grad_dict
=
{}
...
...
@@ -469,49 +444,57 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
# The gradient of the cost should default to 1 if the cost is of a
# continuous dtype (float, for the moment, as complex are unsupported),
# and should always be 0 if the cost is of discrete (integer) dtype.
if
getattr
(
cost
.
type
,
'dtype'
,
None
)
not
in
tensor
.
float_dtypes
:
if
cost
is
not
None
:
if
g_cost
is
None
:
g_cost
=
_float_ones_like
(
cost
)
# g_cost may be Disconnected or NullType. A creative use of the function,
# sure, but nonetheless one we can and should support. So before we try
# to cast it make sure it even has a dtype
if
hasattr
(
g_cost
.
type
,
'dtype'
)
and
cost
.
type
.
dtype
not
in
tensor
.
discrete_dtypes
:
# Here we enforce the constraint that floating point variables have
# the same dtype as their gradient.
g_cost
=
g_cost
.
astype
(
cost
.
type
.
dtype
)
# DO NOT enforce g_cost to be 0 if cost is an integer.
# This is to be enforced by the Op.grad method for the Op that outputs cost.
assert
g_cost
not
in
tensor
.
discrete_dtypes
grad_dict
[
cost
]
=
g_cost
else
:
if
g_cost
is
not
None
:
try
:
cval
=
theano
.
get_constant_value
(
g_cost
)
if
cval
==
0
:
g_cost_is_zero
=
True
else
:
g_cost_is_zero
=
False
except
TypeError
:
g_cost_is_zero
=
False
if
not
g_cost_is_zero
:
raise
ValueError
(
"The gradient of a cost of non-continuous "
"dtype (here,
%
s), if it is defined, should be 0. "
"However, a value of
%
s was provided in the 'g_cost' "
"argument of theano.grad(). To remove this error, "
"you can simply omit the 'g_cost' argument, or "
"give it the default value of None."
%
(
getattr
(
g_cost
.
type
,
'dtype'
,
'no dtype defined'
),
g_cost
))
g_cost
=
tensor
.
zeros_like
(
cost
)
elif
g_cost
is
None
:
# cost.type.dtype is in tensor.float_dtypes at that point
g_cost
=
tensor
.
ones_like
(
cost
)
raise
ValueError
(
"No cost node was specified, but a gradient"
" on it was."
)
else
:
# Cast the provided gradient so that it has the same dtype
# as the cost.
g_cost
=
g_cost
.
astype
(
cost
.
type
.
dtype
)
if
known_grads
is
not
None
:
for
var
in
known_grads
:
g_var
=
known_grads
[
var
]
if
not
hasattr
(
g_var
,
'type'
):
raise
TypeError
(
'output grads must be theano variables.'
'Ambiguous whether
%
s should be made into tensor'
' or sparse theano variable'
%
str
(
type
(
g_var
)))
if
g_var
.
type
not
in
[
NullType
,
DisconnectedType
]
and
'float'
\
not
in
str
(
g_var
.
type
.
dtype
):
raise
TypeError
(
"Gradients must always be NullType, "
"DisconnectedType, or continuous, but grad was "
"given a known_grad of type "
+
str
(
g_var
.
type
))
# DO NOT check that these gradients are equal to 0 if var is int
# The gradient is allowed to be non-zero on var in that case
# Ops outputing var should not backpropagate its gradient further
# but that is enforced elsewhere (grep for only_connected_to_int)
grad_dict
[
var
]
=
g_var
grad_dict
[
cost
]
=
g_cost
# the gradient of the constants is 0
for
const
in
consider_constant
:
grad_dict
[
const
]
=
DisconnectedType
()()
# variables that do not influence the cost have zero gradient.
# if wrt is such a variable, populate the grad_dict with this info
# so that wrt not being in var_to_node_to_idx won't cause an error below
# according to the flag, possibly raise an error if wrt is disconnected
for
elem
in
wrt
:
if
elem
not
in
var_to_node_to_idx
and
elem
is
not
cost
:
if
elem
not
in
var_to_node_to_idx
and
elem
is
not
cost
\
and
elem
not
in
grad_dict
:
message
=
(
"grad method was asked to compute the gradient "
"with respect to a variable that is not part of "
"the computational graph of the cost, or is used "
...
...
@@ -529,15 +512,15 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
grad_dict
[
elem
]
=
DisconnectedType
()()
cost_name
=
None
if
add_names
:
if
add_names
and
cost
is
not
None
:
cost_name
=
cost
.
name
# Make sure we didn't initialize the grad_dict with any ints
# for non-int outputs
# The gradient may NEVER be an int, even if the variable is an int.
# Read the Op contract and talk to Ian Goodfellow before changing this!
for
var
in
grad_dict
:
g
=
grad_dict
[
var
]
if
(
hasattr
(
g
.
type
,
'dtype'
)
and
getattr
(
var
.
type
,
'dtype'
,
''
)
in
tensor
.
float_dtypes
):
if
hasattr
(
g
.
type
,
'dtype'
):
assert
g
.
type
.
dtype
in
tensor
.
float_dtypes
rval
=
_populate_grad_dict
(
var_to_node_to_idx
,
...
...
@@ -545,7 +528,12 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
for
i
in
xrange
(
len
(
rval
)):
if
isinstance
(
rval
[
i
]
.
type
,
DisconnectedType
):
rval
[
i
]
=
_float_zeros_like
(
wrt
[
i
])
if
return_disconnected
==
'zero'
:
rval
[
i
]
=
_float_zeros_like
(
wrt
[
i
])
elif
return_disconnected
==
'None'
:
rval
[
i
]
=
None
else
:
assert
return_disconnected
==
'Disconnected'
if
using_tuple
:
rval
=
tuple
(
rval
)
...
...
@@ -592,15 +580,18 @@ def _node_to_pattern(node):
return
connection_pattern
def
_populate_var_to_node_to_idx
(
outputs
,
wrt
):
def
_populate_var_to_node_to_idx
(
outputs
,
wrt
,
consider_constant
):
"""
Common code shared between grad and grad_sources_inputs
Helper function for grad function.
outputs: a list of variables we want to take gradients of
wrt: a list of variables we want to take the gradient with
respect to.
consider_constant: a list of variables not to backpropagate
through.
returns:
var_to_app_to_idx:
...
...
@@ -622,8 +613,30 @@ def _populate_var_to_node_to_idx(outputs, wrt):
This set is exactly the set of variables that connect
the variables in wrt to the cost being differentiated.
(A variable in consider_constant is not a function of
anything)
"""
# Validate and format consider_constant
if
consider_constant
is
None
:
consider_constant
=
[]
else
:
# error checking on consider_constant: verify that it is a collection
# of theano variables
# this is important, if someone accidentally passes a nested data
# structure with theano variables at the leaves, only the root will
# be properly considered constant
try
:
iter
(
consider_constant
)
except
TypeError
:
raise
TypeError
(
'consider_constant must be an iterable collection,'
' got '
+
str
(
type
(
consider_constant
)))
for
elem
in
consider_constant
:
if
not
isinstance
(
elem
,
gof
.
Variable
):
raise
TypeError
(
'Elements of consider_constant must be '
'variables, but got '
+
str
(
type
(
elem
)))
# var_to_app_to_idx[var][node] = [i,j] means node has
# var as input at positions i and j
var_to_app_to_idx
=
{}
...
...
@@ -638,9 +651,17 @@ def _populate_var_to_node_to_idx(outputs, wrt):
accounted_for
=
set
([])
def
account_for
(
var
):
# Don't visit the same variable twice
if
var
in
accounted_for
:
return
accounted_for
.
add
(
var
)
# Constants are not a function of anything
if
var
in
consider_constant
:
return
# Recursively add the variables that this variable is
# a function of.
if
var
.
owner
is
not
None
:
app
=
var
.
owner
...
...
@@ -699,11 +720,16 @@ def _populate_var_to_node_to_idx(outputs, wrt):
return
var_to_app_to_idx
class
NullTypeGradError
(
TypeError
):
"""
Raised when grad encounters a NullType.
"""
pass
def
_populate_grad_dict
(
var_to_node_to_idx
,
grad_dict
,
wrt
,
cost_name
=
None
):
"""
Common code shared between grad_sources_inputs and grad
Helper function for grad function.
var_to_node_to_idx: a dictionary mapping a variable to
a second dictionary.
...
...
@@ -712,7 +738,7 @@ def _populate_grad_dict(var_to_node_to_idx,
node's input list
grad_dict: a dictionary mapping variables to their gradients
should be populated by grad
or grad_sources_inputs
should be populated by grad
function.
grad should set gradients to DisconnectedType()() for
variables to be considered constant, set the
...
...
@@ -779,38 +805,46 @@ def _populate_grad_dict(var_to_node_to_idx,
inputs
=
[
try_to_copy_if_needed
(
ipt
)
for
ipt
in
inputs
]
# Build a list of output gradients with the same dtype as
# the corresponding output variable.
# If an output is of a float dtype, we want to cast the
# output gradient into the same dtype, to avoid having a
# gradient graph with double precision (taking more memory,
# and more computation).
# If an output is of an integer dtype, then we ensure the
# output gradient is zero, and that zero can be represented
# in the same int dtype.
# If an output gradient is a NullType or DisconnectedType,
# then it will not have a dtype, and it will not be changed.
# If an output is of an integer dtype, then we just leave it
# alone.
# DO NOT force integer variables to have zero grad. This causes
# bugs where we fail to detect disconnected or undefined gradients.
# DO NOT force integer variables to have integer dtype. This is
# a violation of the op contract.
new_output_grads
=
[]
for
o
,
og
in
zip
(
node
.
outputs
,
output_grads
):
o_dt
=
getattr
(
o
.
type
,
'dtype'
,
None
)
og_dt
=
getattr
(
og
.
type
,
'dtype'
,
None
)
if
og_dt
and
o_dt
in
theano
.
tensor
.
discrete_dtypes
:
new_output_grads
.
append
(
o
.
zeros_like
())
elif
o_dt
and
og_dt
and
o_dt
!=
og_dt
:
if
o_dt
not
in
theano
.
tensor
.
discrete_dtypes
and
og_dt
and
o_dt
!=
og_dt
:
new_output_grads
.
append
(
og
.
astype
(
o_dt
))
else
:
new_output_grads
.
append
(
og
)
# Make sure that, if new_output_grads[i] has a dtype:
# - it is the same dtype as outputs[i]
# - if the dtype is an int, then new_output_grads[i] is 0.
# Make sure that, if new_output_grads[i] has a floating point dtype,
# it is the same dtype as outputs[i]
for
o
,
ng
in
zip
(
node
.
outputs
,
new_output_grads
):
o_dt
=
getattr
(
o
.
type
,
'dtype'
,
None
)
ng_dt
=
getattr
(
ng
.
type
,
'dtype'
,
None
)
if
ng_dt
:
if
ng_dt
is
not
None
and
o_dt
not
in
theano
.
tensor
.
discrete_dtypes
:
assert
ng_dt
==
o_dt
if
ng_dt
in
theano
.
tensor
.
discrete_dtypes
:
assert
theano
.
get_constant_value
(
ng
)
==
0
# Someone who had obviously not read the Op contract tried
# to modify this part of the function.
# If you ever think it is a good idea to make an integer
# valued gradient, please
# 1) Read the Op contract again
# 2) Talk to Ian Goodfellow
# (Both of these sources will tell you not to do it)
for
ng
in
new_output_grads
:
assert
getattr
(
ng
.
type
,
'dtype'
,
None
)
not
in
theano
.
tensor
.
discrete_dtypes
input_grads
=
node
.
op
.
grad
(
inputs
,
new_output_grads
)
...
...
@@ -863,6 +897,7 @@ def _populate_grad_dict(var_to_node_to_idx,
'the grad_undefined or grad_unimplemented helper '
'functions.'
)
%
node
.
op
)
if
not
isinstance
(
term
.
type
,
(
NullType
,
DisconnectedType
)):
if
term
.
type
.
dtype
not
in
theano
.
tensor
.
float_dtypes
:
...
...
@@ -875,14 +910,9 @@ def _populate_grad_dict(var_to_node_to_idx,
# it's not undefined or disconnected
# The only other valid thing it can be is 0
no_constant_value
=
True
try
:
constant_value
=
theano
.
get_constant_value
(
term
)
no_constant_value
=
False
except
TypeError
:
pass
if
no_constant_value
:
is_zero
=
_is_zero
(
term
)
assert
is_zero
in
[
'yes'
,
'no'
,
'maybe'
]
if
is_zero
==
'maybe'
:
msg
=
"
%
s.grad returned
%
s of type
%
s for input"
msg
+=
"
%
d. This input's only connections to "
msg
+=
"the cost through this op are via "
...
...
@@ -896,8 +926,7 @@ def _populate_grad_dict(var_to_node_to_idx,
msg
=
msg
%
(
str
(
node
.
op
),
str
(
term
),
str
(
type
(
term
)),
i
)
raise
ValueError
(
msg
)
if
constant_value
!=
0
:
if
is_zero
==
'no'
:
msg
=
"
%
s.grad returned
%
s of type
%
s for input"
msg
+=
"
%
d. Since this input is only connected "
msg
+=
"to integer-valued outputs, it should "
...
...
@@ -905,7 +934,7 @@ def _populate_grad_dict(var_to_node_to_idx,
msg
+=
"
%
s."
msg
%
(
str
(
node
.
op
),
str
(
term
),
str
(
type
(
term
)),
i
,
str
(
constant_value
))
i
,
str
(
theano
.
get_constant_value
(
term
)
))
raise
ValueError
(
msg
)
...
...
@@ -961,7 +990,7 @@ def _populate_grad_dict(var_to_node_to_idx,
type
(
term
)))
if
isinstance
(
term
.
type
,
NullType
):
raise
Type
Error
(
"tensor.grad "
raise
NullTypeGrad
Error
(
"tensor.grad "
"encountered a NaN. "
+
\
term
.
type
.
why_null
)
...
...
@@ -997,113 +1026,6 @@ def _populate_grad_dict(var_to_node_to_idx,
return
rval
def
grad_sources_inputs
(
sources
,
graph_inputs
):
"""
Used to compute the gradient of a cost with respect to all the
variables between graph_input and cost, but in the special
case where you don't know the cost, you only know its gradient
on a set of intermediate values.
A gradient source is a pair (``v``, ``g_v``), in which ``v`` is
a `Variable`, and ``g_v`` is a `Variable` that is a gradient wrt
``v``. More specifically, ``g_v`` is the gradient of an external
scalar cost, ``cost`` (that is not explicitly used), wrt ``v``.
This function traverses the graph backward from the ``r`` sources,
calling ``op.grad(...)`` for all ops with some non-None gradient
on an output, to compute gradients of ``cost`` wrt intermediate
variables and ``graph_inputs``.
The ``op.grad(...)`` functions are called like this:
.. code-block:: python
op.grad(op.inputs[:], [total_gradient(v) for v in op.outputs])
This call to ``op.grad`` should return a list or tuple: one symbolic
gradient per input. These gradients represent the gradients of
the same implicit ``cost`` mentionned above, wrt ``op.inputs``. Note
that this is **not** the same as the gradient of ``op.outputs`` wrt
``op.inputs``.
If ``op`` has a single input, then ``op.grad`` should return a list
or tuple of length 1.
For each input wrt to which ``op`` is not differentiable, it should
return ``None`` instead of a `Variable` instance.
If a source ``r`` receives a gradient from another source ``r2``,
then the effective gradient on ``r`` is the sum of both gradients.
:type sources: list of pairs of Variable: (v, gradient-on-v) to
initialize the total_gradient dictionary
:param sources: gradients to back-propagate using chain rule
:type graph_inputs: list of Variable
:param graph_inputs: variables considered to be constant
(do not backpropagate through them)
:rtype: dictionary whose keys and values are of type Variable
:return: mapping from each Variable encountered in the backward
traversal to the gradient with respect to that Variable.
It is assumed that there is some objective J shared between all members of
sources, so that for each v, gradient-on-v is the gradient of J with
respect to v
"""
outputs
,
output_grads
=
zip
(
*
sources
)
for
output_grad
in
output_grads
:
if
not
hasattr
(
output_grad
,
'type'
):
raise
TypeError
(
'output grads must be theano variables.'
'Ambiguous whether
%
s should be made into tensor'
' or sparse theano variable'
%
str
(
type
(
output_grad
)))
if
graph_inputs
is
None
:
graph_inputs
=
gof
.
graph
.
inputs
(
outputs
)
wrt
=
graph_inputs
var_to_node_to_idx
=
_populate_var_to_node_to_idx
(
outputs
,
wrt
)
# build a dict mapping var to the gradient of cost with respect to var
grad_dict
=
{}
for
output
,
output_grad
in
sources
:
# The gradient of the cost should always be 0 if the cost is of
# discrete (integer) dtype.
if
getattr
(
output
.
type
,
'dtype'
,
''
)
not
in
theano
.
tensor
.
float_dtypes
:
output_grad
=
output
.
zeros_like
()
else
:
# Cast the provided gradient so that it has the same dtype
# as the cost.
output_grad
=
output_grad
.
astype
(
output
.
type
.
dtype
)
grad_dict
[
output
]
=
output_grad
# variables that do not influence the cost have zero gradient.
# if wrt is such a variable, populate the grad_dict with this info
# so that wrt not being in var_to_node_to_idx won't cause an error below
# according to the flag, possibly raise an error if wrt is disconnected
for
elem
in
wrt
:
if
elem
not
in
var_to_node_to_idx
and
elem
not
in
outputs
:
grad_dict
[
elem
]
=
DisconnectedType
()()
_populate_grad_dict
(
var_to_node_to_idx
,
grad_dict
,
wrt
)
# post-process out the DisconnectedTypes
for
key
in
grad_dict
:
if
isinstance
(
grad_dict
[
key
]
.
type
,
DisconnectedType
):
if
hasattr
(
key
,
'zeros_like'
):
grad_dict
[
key
]
=
_float_zeros_like
(
key
)
return
grad_dict
def
_float_zeros_like
(
x
):
""" Like zeros_like, but forces the object to have a
a floating point dtype """
...
...
@@ -1634,3 +1556,32 @@ def hessian(cost, wrt, consider_constant=None,
"script that generated the error)"
)
hessians
.
append
(
hess
)
return
format_as
(
using_list
,
using_tuple
,
hessians
)
def
_is_zero
(
x
):
"""
Returns 'yes', 'no', or 'maybe' indicating whether x
is always 0.
'maybe' means that x is an expression that is complicated enough
that we can't tell that it simplifies to 0.
"""
if
not
hasattr
(
x
,
'type'
):
return
np
.
all
(
x
==
0.
)
if
isinstance
(
x
.
type
,
NullType
):
return
'no'
if
isinstance
(
x
.
type
,
DisconnectedType
):
return
'yes'
no_constant_value
=
True
try
:
constant_value
=
theano
.
get_constant_value
(
x
)
no_constant_value
=
False
except
TypeError
:
pass
if
no_constant_value
:
return
'maybe'
if
constant_value
!=
0.
:
return
'no'
return
'yes'
theano/scan_module/scan_op.py
浏览文件 @
40bbb7da
...
...
@@ -221,7 +221,8 @@ class Scan(PureOp):
'following error has been encountered: The '
'
%
s
%
s (argument number
%
d) has dtype '
'
%
s and
%
d dimension(s). The corresponding slice
%
s '
'however has dtype
%
s and
%
d dimension(s). This '
'however has dtype
%
s and
%
d dimension(s) (it should '
'have the same dtype and one fewer dimensions). This '
'should never happen, please '
'report to theano-dev mailing list'
)
...
...
@@ -1261,11 +1262,9 @@ class Scan(PureOp):
if
x
in
diff_inputs
]
for
x
in
consider_inps
:
try
:
_gmp
=
gradient
.
grad_sources_inputs
(
[(
y
,
g_y
)],
[
x
])
gmp
[
x
]
=
_gmp
[
x
]
except
TypeError
:
gmp
[
x
]
=
gradient
.
grad
(
cost
=
None
,
known_grads
=
{
y
:
g_y
},
wrt
=
x
)
except
gradient
.
NullTypeGradError
:
# It means the gradient is undefined (which implies
# is connected)
gmp
[
x
]
=
x
...
...
@@ -1374,11 +1373,21 @@ class Scan(PureOp):
self
.
inner_nitsot_outs
(
self_outputs
))
def
compute_gradient
(
y
,
g_y
):
gmp
=
gradient
.
grad_sources_inputs
(
[(
y
,
g_y
)],
[
x
for
x
in
theano
.
gof
.
graph
.
inputs
([
y
])
if
x
in
diff_inputs
])
return
[
gmp
.
get
(
p
,
None
)
for
p
in
diff_inputs
]
if
'int'
in
str
(
g_y
.
dtype
):
raise
TypeError
(
"Gradients may never be integers but g_y "
"has type "
+
str
(
g_y
.
type
))
wrt
=
[
x
for
x
in
theano
.
gof
.
graph
.
inputs
([
y
])
if
x
in
diff_inputs
]
grads
=
gradient
.
grad
(
cost
=
None
,
known_grads
=
{
y
:
g_y
},
wrt
=
wrt
,
consider_constant
=
wrt
,
disconnected_inputs
=
'ignore'
,
return_disconnected
=
'None'
)
gmp
=
dict
(
zip
(
wrt
,
grads
))
rval
=
[
gmp
.
get
(
p
,
None
)
for
p
in
diff_inputs
]
return
rval
dC_dinps_t
=
[
None
for
inp
in
diff_inputs
]
disconnected_dC_dinps_t
=
[
True
for
inp
in
diff_inputs
]
dC_dXts
=
[]
...
...
theano/tensor/basic.py
浏览文件 @
40bbb7da
...
...
@@ -462,13 +462,27 @@ def _allclose(a, b, rtol=None, atol=None):
return
numpy
.
allclose
(
a
,
b
,
atol
=
atol_
,
rtol
=
rtol_
)
class
NotConstantError
(
TypeError
):
"""
Raised by get_constant_value if called on something that is
not constant.
For now it is a TypeError, to maintain the old interface
that get_constant_value should raise a TypeError in this
situation. However, this is unsafe because get_constant_value
could inadvertently raise a TypeError if it has a bug.
So we should eventually make NotConstantError derive
from Exception directly, and modify all code that uses
get_constant_value to catch this more specific exception.
"""
pass
def
get_constant_value
(
v
):
"""return the constant scalar(0-D) value underlying variable `v`
If v is the output of dimshuffles, fills, allocs, rebroadcasts, cast
this function digs through them.
If `v` is not some view of constant data, then raise a
Type
Error.
If `v` is not some view of constant data, then raise a
NotConstant
Error.
:note: There may be another function similar to this one in the
code, but I'm not sure where it is.
...
...
@@ -488,7 +502,7 @@ def get_constant_value(v):
numpy
.
complex
(
data
)
# works for all numeric scalars
return
data
except
Exception
:
raise
Type
Error
(
raise
NotConstant
Error
(
'v.data is non-numeric, non-scalar, or has more than one'
' unique value'
,
v
)
if
v
.
owner
:
...
...
@@ -516,9 +530,17 @@ def get_constant_value(v):
v
.
owner
.
op
.
perform
(
v
.
owner
,
[
const
],
ret
)
return
ret
[
0
][
0
]
if
isinstance
(
v
.
owner
.
op
,
Subtensor
)
and
v
.
ndim
==
0
:
if
isinstance
(
v
.
owner
.
inputs
[
0
],
TensorConstant
):
return
v
.
owner
.
inputs
[
0
]
.
data
.
__getitem__
(
# This condition depends on Subtensor always embedding constant
# indices in the Op rather than making them inputs to the Apply node
if
isinstance
(
v
.
owner
.
inputs
[
0
],
TensorConstant
)
and
\
len
(
v
.
owner
.
inputs
)
==
1
:
try
:
return
v
.
owner
.
inputs
[
0
]
.
data
.
__getitem__
(
tuple
(
v
.
owner
.
op
.
idx_list
))
except
IndexError
:
raise
IndexError
(
str
(
tuple
(
v
.
owner
.
op
.
idx_list
))
+
" is not a valid index into "
+
\
str
(
v
.
owner
.
inputs
[
0
]
.
data
))
# The index list 'idx_list' should have length the same
# shape as the input.
...
...
@@ -3780,7 +3802,7 @@ class AdvancedIndexingError(TypeError):
class
Subtensor
(
Op
):
"""Return a subtensor view
The inputs array is the tensor x, followed by scalar integer
variabl
es.
The inputs array is the tensor x, followed by scalar integer
typ
es.
TODO: WRITEME: how are the scalar integer variables formatted?
This class uses a relatively complex internal representation of the inputs
...
...
@@ -3789,7 +3811,7 @@ class Subtensor(Op):
idx_list: instance variable TODO: WRITEME: is this a list or a tuple?
(old docstring gives two conflicting
descriptions)
elements are either integers, theano scalars, or slices.
elements are either integers, theano scalar
type
s, or slices.
one element per "explicitly named dimension"
TODO: WRITEME: what is an "explicitly named dimension" ?
...
...
@@ -3798,7 +3820,11 @@ class Subtensor(Op):
if slice:
start/stop/step members of each slice are integer indices
into the inputs array or None
integer indices be actual integers or theano scalars
integer indices be actual integers or theano scalar types
Note that the idx_list defines the Op, so two Subtensor instances are
considered to be different Ops if they have different idx_list fields.
This means that the entries in it are theano Types, not theano Variables.
@todo: add support for advanced tensor indexing (in Subtensor_dx too).
...
...
@@ -3816,6 +3842,17 @@ class Subtensor(Op):
@staticmethod
def
collapse
(
idxs
,
cond
):
"""
idxs: a list of indices or slices.
cond: a callable that returns a bool
returns: idxs, with the slices flattened out into a list.
if cond is true for an entry, does not flatten it.
"""
ret
=
[]
def
helper
(
entry
):
...
...
@@ -3828,10 +3865,20 @@ class Subtensor(Op):
for
idx
in
idxs
:
helper
(
idx
)
return
ret
@staticmethod
def
convert
(
entry
,
slice_ok
=
True
):
"""
The "idx_list" field is unique to each Subtensor instance.
It is not unique to each Apply node, so it should not refer to
specific Variables. This method changes references to Variables
into references to Types.
TODO: WRITEME: This method also accepts "entry" already being a Type;
when would that happen?
"""
invalid_scal_types
=
[
scal
.
float64
,
scal
.
float32
]
scal_types
=
[
scal
.
int64
,
scal
.
int32
,
scal
.
int16
,
scal
.
int8
]
tensor_types
=
[
lscalar
,
iscalar
,
wscalar
,
bscalar
]
...
...
theano/tensor/nnet/conv.py
浏览文件 @
40bbb7da
...
...
@@ -801,10 +801,9 @@ class ConvOp(OpenMPOp):
# mimic what happens inside theano.grad: get the input gradient
# of the final cost wrt all variables involved.
tmp_gmap
=
theano
.
gradient
.
grad_sources_inputs
(
[(
node
,
gz
)],
[
inputs
,
kerns
])
return
theano
.
gradient
.
grad
(
cost
=
None
,
known_grads
=
{
node
:
gz
},
wrt
=
[
inputs
,
kerns
])
return
[
tmp_gmap
[
inputs
],
tmp_gmap
[
kerns
]]
if
self
.
dx
not
in
(
1
,
2
)
or
self
.
dy
not
in
(
1
,
2
):
raise
NotImplementedError
(
...
...
theano/tests/test_gradient.py
浏览文件 @
40bbb7da
...
...
@@ -6,7 +6,6 @@ import unittest
import
theano
from
theano
import
gof
from
theano.gradient
import
grad_sources_inputs
from
theano
import
gradient
from
theano.tensor.nnet.Conv3D
import
conv3D
from
theano
import
config
...
...
@@ -16,6 +15,16 @@ from theano.gof.null_type import NullType
one
=
theano
.
tensor
.
as_tensor_variable
(
1.
)
def grad_sources_inputs(sources, inputs):
    """
    Compatibility wrapper: expresses the old ``grad_sources_inputs``
    interface in terms of ``theano.gradient.grad`` so the existing
    tests in this module do not need to be rewritten.

    sources: list of (variable, output_gradient) pairs.
    inputs: list of variables to differentiate with respect to, or
        None to use the graph inputs of the source variables.
    returns: dict mapping each input variable to its gradient.
    """
    if inputs is None:
        source_vars = [source[0] for source in sources]
        inputs = theano.gof.graph.inputs(source_vars)
    grads = theano.gradient.grad(cost=None,
                                 known_grads=dict(sources),
                                 wrt=inputs,
                                 consider_constant=inputs)
    return dict(zip(inputs, grads))
class
testgrad_sources_inputs
(
unittest
.
TestCase
):
def
test_retNone1
(
self
):
...
...
@@ -369,35 +378,6 @@ class test_grad(unittest.TestCase):
# If we made it to here without an exception, then the
# connection_pattern functionality worked correctly
def test_sum_disconnected(self):
    """Summing DisconnectedType gradients with other terms must not crash."""
    inp = theano.tensor.scalar()
    doubled = inp * 2.
    shifted = inp + 1.
    total = doubled + shifted
    # Both intermediate terms are held constant, so each contributes a
    # DisconnectedType gradient.  In an earlier version of theano the
    # call below failed while trying to add two DisconnectedTypes.
    theano.tensor.grad(total, inp, consider_constant=[doubled, shifted])
def test_output_grad_on_int(self):
    """When x has a discrete dtype, an explicit g_cost must be
    equivalent to 0; anything else should be rejected."""
    x = theano.tensor.iscalar('x')
    y = x * 2

    # Zero-valued output gradients are accepted, whatever their dtype.
    theano.grad(y, x, g_cost=theano.tensor.constant(0))
    theano.grad(y, x, g_cost=y.zeros_like())
    theano.grad(y, x, g_cost=y.zeros_like().astype('float64'))

    # A nonzero constant must raise ValueError ...
    self.assertRaises(ValueError, theano.grad, y, x,
                      g_cost=theano.tensor.constant(1))
    # ... and so must a shared variable, whose value is not known to be 0.
    self.assertRaises(ValueError, theano.grad, y, x,
                      g_cost=theano.shared(np.zeros((), dtype='int8')))
def
test_downcast_dtype
(
self
):
# Test that the gradient of a cost wrt a float32 variable does not
# get upcasted to float64.
...
...
@@ -418,6 +398,124 @@ class test_grad(unittest.TestCase):
# be downcasted to float32, so dc_dx should also be float32
assert
dc_dx
.
dtype
==
'float32'
def test_grad_constant(self):
    """The gradient must treat Constants and consider_constant
    variables consistently."""
    x = theano.tensor.scalar()
    y = theano.tensor.scalar()
    via_variable = x + y
    via_constant = one + y
    # Gradient wrt a variable that is explicitly held constant ...
    grad_cc = theano.tensor.grad(via_variable, x, consider_constant=[x])
    # ... should match the gradient wrt an actual Constant.
    grad_const = theano.tensor.grad(via_constant, one)
    f = theano.function([x, y], [grad_cc, grad_const])
    val_cc, val_const = f(1, .5)
    if not np.allclose(val_cc, val_const):
        raise AssertionError("Gradient using consider constant is "
                + str(val_cc)
                + " but gradient with respect to the same Constant is "
                + str(val_const))
def test_known_grads():
    # Tests that the grad method with no known_grads
    # matches what happens if you put its own known_grads
    # in for each variable

    # Build an expression graph with several distinct "cut sets" of
    # variables between the cost and the inputs.
    full_range = theano.tensor.arange(10)
    x = theano.tensor.scalar('x')
    t = theano.tensor.iscalar('t')
    ft = full_range[t]
    ft.name = 'ft'
    coeffs = theano.tensor.vector('c')
    ct = coeffs[t]
    ct.name = 'ct'
    p = x ** ft
    p.name = 'p'
    y = ct * p
    y.name = 'y'
    cost = theano.tensor.sqr(y)
    cost.name = 'cost'

    # Each entry is one cut of the graph, ordered from the cost down
    # toward the inputs.
    layers = [[cost], [y], [ct, p], [ct, x, ft], [coeffs, t, full_range, x]]

    inputs = [coeffs, t, x]
    # Fixed seed so the comparison below is deterministic.
    rng = np.random.RandomState([2012, 11, 15])
    values = [rng.randn(10), rng.randint(10), rng.randn()]
    # Cast each drawn value to the dtype its input variable expects.
    values = [np.cast[ipt.dtype](value)
              for ipt, value in zip(inputs, values)]

    # Reference gradients: one full backprop straight from the cost.
    true_grads = theano.tensor.grad(cost, inputs,
                                    disconnected_inputs='ignore')
    true_grads = theano.function(inputs, true_grads)
    true_grads = true_grads(*values)

    for layer in layers:
        print 'Testing by separately computing ', layer
        # First stop the backprop at this layer ...
        first = theano.tensor.grad(cost, layer, disconnected_inputs='ignore')
        known = dict(zip(layer, first))
        # ... then resume it from there via known_grads; the result
        # must match the single-pass reference gradients.
        full = theano.tensor.grad(cost=None, known_grads=known,
                                  wrt=inputs, disconnected_inputs='ignore')
        full = theano.function(inputs, full)
        full = full(*values)
        assert len(true_grads) == len(full)
        for a, b, var in zip(true_grads, full, inputs):
            if not np.allclose(a, b):
                # Dump everything useful before failing.
                print 'Failure'
                print a
                print b
                print var
                print layer
                for v in known:
                    print v, ':', theano.function(inputs, known[v])(*values)
                assert False
def test_dxdx():
    """The gradient of a scalar with respect to itself is 1.

    An integer variable is used here on purpose: people keep changing
    this gradient to be 0 on integers, but according to our
    interpretation of the gradient as defined in the Op contract it
    should be 1.  If you feel the need to change this unit test you
    are probably modifying the Op contract and should definitely get
    the approval of multiple people on theano-dev.
    """
    v = theano.tensor.iscalar()
    result = theano.tensor.grad(v, v).eval({v: 12})
    assert np.allclose(result, 1.)
def test_known_grads_integers():
    """known_grads must work when the wrt variable has an integer dtype.

    Backpropagating from x itself, the gradient wrt x is simply the
    output gradient that was fed in, so the compiled function should
    return its argument unchanged.
    """
    x = theano.tensor.iscalar()
    g_expected = theano.tensor.scalar()
    g_grad = theano.gradient.grad(cost=None,
                                  known_grads={x: g_expected},
                                  wrt=x)
    f = theano.function([g_expected], g_grad)

    # NOTE(review): the original code rebound the Python name `x` to -3
    # here; that assignment was dead (never read) and shadowed the
    # symbolic variable, so it has been removed.
    gv = np.cast[theano.config.floatX](.6)
    g_actual = f(gv)
    assert np.allclose(g_actual, gv)
if __name__ == '__main__':
    # When run as a script, discover and run the unittest.TestCase
    # classes defined in this module.
    unittest.main()
theano/tests/test_rop.py
浏览文件 @
40bbb7da
...
...
@@ -341,15 +341,9 @@ class test_RopLop(RopLop_checker):
rop_out2
=
tensor
.
Rop
((
m
,
v
,
m
+
v
),
[
m
,
v
],
[
m_
,
v_
])
assert
isinstance
(
rop_out2
,
tuple
)
assert
len
(
rop_out2
)
==
3
lop_out1
=
tensor
.
Lop
([
m
,
v
,
m
+
v
],
(
m
,
v
),
[
m_
,
v_
])
assert
isinstance
(
lop_out1
,
tuple
)
assert
len
(
lop_out1
)
==
2
lop_out2
=
tensor
.
Lop
((
m
,
v
,
m
+
v
),
[
m
,
v
],
[
m_
,
v_
])
assert
isinstance
(
lop_out2
,
list
)
assert
len
(
lop_out2
)
==
2
all_outs
=
[]
for
o
in
rop_out1
,
rop_out2
,
lop_out1
,
lop_out2
:
for
o
in
rop_out1
,
rop_out2
:
all_outs
.
extend
(
o
)
f
=
theano
.
function
([
m
,
v
,
m_
,
v_
],
all_outs
)
f
(
mval
,
vval
,
m_val
,
v_val
)
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论