Commit 3bd9ffde authored by Pascal Lamblin

Re-add part of the dtype constraint on out grads

In order to avoid expanding memory usage and computations in the part of the graph that computes gradients, I propose the following conventions, which re-instate some of the constraints that existed before on the dtype of gradients:

- When calling some_op.grad(inputs, output_grads), each variable in the "output_grads" list, if it is an actual numeric variable (and not, for instance, DisconnectedType or NullType), should have the same dtype as the corresponding output variable.
- Moreover, if one of the output variables is of a discrete dtype (int or uint), then the corresponding output gradient (if not a special case like NullType) should be zeros.

This is implemented in theano.grad, so an Op's grad method does not have to be changed, but it can now rely again on the fact that, if an output gradient has a dtype, that dtype will be the same as that of the corresponding output variable.
Parent 0e01edd8
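To make the convention concrete, here is a minimal sketch (not part of the commit, assuming the standard theano.tensor API; variable names are made up for illustration) of what an Op author can rely on again after this change:

import numpy as np
import theano
import theano.tensor as T

x = T.fvector('x')                      # float32 input
cost = T.sum(x ** 2)                    # float32 cost
g = theano.grad(cost, x)                # gradient graph keeps the cost's dtype

# Inside an Op, grad() may again assume that any numeric entry of
# output_grads has the same dtype as the matching output (special cases
# such as DisconnectedType or NullType excepted):
#
#     def grad(self, inputs, output_grads):
#         g_out, = output_grads         # same dtype as the Op's output
#         ...

f = theano.function([x], g)
print(f(np.asarray([1, 2, 3], dtype='float32')))   # [2. 4. 6.], float32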
@@ -465,9 +465,39 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
     # build a dict mapping var to the gradient of cost with respect to var
     grad_dict = {}

-    # by default, the gradient of the cost is 1
-    if g_cost is None:
-        g_cost = _float_ones_like(cost)
+    # The gradient of the cost should default to 1 if the cost is of a
+    # continuous dtype (float, for the moment, as complex are unsupported),
+    # and should always be 0 if the cost is of discrete (integer) dtype.
+    if getattr(cost.type, 'dtype', None) not in tensor.float_dtypes:
+        if g_cost is not None:
+            try:
+                cval = theano.get_constant_value(g_cost)
+                g_cost_is_zero = (cval == 0)
+            except TypeError:
+                g_cost_is_zero = False
+            if not g_cost_is_zero:
+                raise ValueError("The gradient of a cost of non-continuous "
+                        "dtype (here, %s), if it is defined, should be 0. "
+                        "However, a value of %s was provided in the 'g_cost' "
+                        "argument of theano.grad(). To remove this error, "
+                        "you can simply omit the 'g_cost' argument, or "
+                        "give it the default value of None." % (
+                            getattr(g_cost.type, 'dtype', 'no dtype defined'),
+                            g_cost))
+        g_cost = tensor.zeros_like(cost)
+    elif g_cost is None:
+        # cost.type.dtype is in tensor.float_dtypes at that point
+        g_cost = tensor.ones_like(cost)
+    else:
+        # Cast the provided gradient so that it has the same dtype
+        # as the cost.
+        g_cost = g_cost.astype(cost.type.dtype)
+
     grad_dict[cost] = g_cost

     # the gradient of the constants is 0
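For instance, a hedged sketch (not part of the diff, relying only on the theano.grad signature shown in the hunk header above) of the cast in the final branch: a float64 g_cost passed for a float32 cost is now cast down instead of upcasting the whole gradient graph.

import numpy as np
import theano
import theano.tensor as T

x = T.fvector('x')                          # float32
cost = T.sum(x ** 2)                        # float32 cost
g_cost = T.constant(np.float64(1.0))        # float64 scale on the cost

# With the cast above, g_cost is converted to the cost's dtype (float32),
# so the gradient graph does not silently switch to double precision.
g = theano.grad(cost, x, g_cost=g_cost)
print(theano.function([x], g)(np.ones(3, dtype='float32')))   # [2. 2. 2.]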
@@ -501,10 +531,12 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
     cost_name = cost.name

     # Make sure we didn't initialize the grad_dict with any ints
+    # for non-int outputs
     for var in grad_dict:
         g = grad_dict[var]
-        if hasattr(g.type, 'dtype'):
-            assert g.type.dtype.find('float') != -1
+        if (hasattr(g.type, 'dtype') and
+                getattr(var.type, 'dtype', '') in tensor.float_dtypes):
+            assert g.type.dtype in tensor.float_dtypes

     rval = _populate_grad_dict(var_to_node_to_idx,
             grad_dict, wrt, cost_name)
@@ -739,7 +771,30 @@ def _populate_grad_dict(var_to_node_to_idx,
             inputs = [try_to_copy_if_needed(ipt) for ipt in inputs]

-            input_grads = node.op.grad(inputs, output_grads)
+            # Build a list of output gradients with the same dtype as
+            # the corresponding output variable.
+            # If an output is of a float dtype, we want to cast the
+            # output gradient into the same dtype, to avoid having a
+            # gradient graph with double precision (taking more memory,
+            # and more computation).
+            # If an output is of an integer dtype, then we ensure the
+            # output gradient is zero, and that zero can be represented
+            # in the same int dtype.
+            # If an output gradient is a NullType or DisconnectedType,
+            # then it will not have a dtype, and it will not be changed.
+            new_output_grads = []
+            for o, og in zip(node.outputs, output_grads):
+                o_dt = getattr(o.type, 'dtype', None)
+                og_dt = getattr(og.type, 'dtype', None)
+                if o_dt and og_dt and o_dt != og_dt:
+                    if o_dt in theano.tensor.float_dtypes:
+                        new_output_grads.append(og.astype(o_dt))
+                    else:
+                        new_output_grads.append(o.zeros_like())
+                else:
+                    new_output_grads.append(og)
+
+            input_grads = node.op.grad(inputs, new_output_grads)

             if input_grads is None:
                 raise TypeError("%s.grad returned NoneType, "
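The rule applied to each (output, output_grad) pair above can be summarized outside of Theano as follows; this is a plain-NumPy sketch for illustration only, with made-up names (the real code works on symbolic variables and skips the adjustment when either side has no dtype).

import numpy as np

FLOAT_DTYPES = ('float32', 'float64')   # stand-in for tensor.float_dtypes

def adjust_output_grad(output, output_grad):
    # Float output: cast the gradient to the output's dtype.
    if output.dtype.name in FLOAT_DTYPES:
        return output_grad.astype(output.dtype)
    # Discrete (integer) output: the gradient is defined to be zero,
    # represented in the output's own dtype.
    return np.zeros_like(output)

out_f32 = np.ones(3, dtype='float32')
print(adjust_output_grad(out_f32, np.ones(3)).dtype)   # float32, not float64
out_int = np.arange(3, dtype='int64')
print(adjust_output_grad(out_int, np.ones(3)))         # [0 0 0], int64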
@@ -997,8 +1052,18 @@ def grad_sources_inputs(sources, graph_inputs):
     # build a dict mapping var to the gradient of cost with respect to var
     grad_dict = {}

-    # by default, the gradient of the cost is 1
     for output, output_grad in sources:
+        # The gradient of the cost should always be 0 if the cost is of
+        # discrete (integer) dtype.
+        if getattr(output.type, 'dtype', '') not in theano.tensor.float_dtypes:
+            output_grad = output.zeros_like()
+        else:
+            # Cast the provided gradient so that it has the same dtype
+            # as the cost.
+            output_grad = output_grad.astype(output.type.dtype)
+
         grad_dict[output] = output_grad

     # variables that do not influence the cost have zero gradient.