Commit 3bd9ffde authored by Pascal Lamblin

Re-add part of the dtype constraint on out grads

In order to avoid expanding memory usage and computations in the part of the graph that computes gradients, I propose the following conventions, which re-instate some of the constraints that existed before on the dtype of gradients:

- When calling some_op.grad(inputs, output_grads), each variable in the "output_grads" list, if it is an actual numeric variable (and not, for instance, DisconnectedType or NullType), should have the same dtype as the corresponding output variable.
- Moreover, if one of the output variables is of a discrete dtype (int or uint), then the corresponding output gradient (if not a special case like NullType) should be zeros.

This is implemented in theano.grad, so an Op's grad method does not have to be changed, but it can now rely again on the fact that, if an output gradient has a dtype, that dtype will be the same as that of the corresponding output variable.
Parent 0e01edd8
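To make the convention concrete, here is a minimal sketch (not part of the commit, assuming the standard theano.tensor API; variable names are made up for illustration) of what an Op author can rely on again after this change:

import numpy as np
import theano
import theano.tensor as T

x = T.fvector('x')                      # float32 input
cost = T.sum(x ** 2)                    # float32 cost
g = theano.grad(cost, x)                # gradient graph keeps the cost's dtype

# Inside an Op, grad() may again assume that any numeric entry of
# output_grads has the same dtype as the matching output (special cases
# such as DisconnectedType or NullType excepted):
#
#     def grad(self, inputs, output_grads):
#         g_out, = output_grads         # same dtype as the Op's output
#         ...

f = theano.function([x], g)
print(f(np.asarray([1, 2, 3], dtype='float32')))   # [2. 4. 6.], float32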
@@ -465,9 +465,39 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
     # build a dict mapping var to the gradient of cost with respect to var
     grad_dict = {}

-    # by default, the gradient of the cost is 1
-    if g_cost is None:
-        g_cost = _float_ones_like(cost)
+    # The gradient of the cost should default to 1 if the cost is of a
+    # continuous dtype (float, for the moment, as complex are unsupported),
+    # and should always be 0 if the cost is of discrete (integer) dtype.
+    if getattr(cost.type, 'dtype', None) not in tensor.float_dtypes:
+        if g_cost is not None:
+            try:
+                cval = theano.get_constant_value(g_cost)
+                g_cost_is_zero = (cval == 0)
+            except TypeError:
+                g_cost_is_zero = False
+            if not g_cost_is_zero:
+                raise ValueError("The gradient of a cost of non-continuous "
+                        "dtype (here, %s), if it is defined, should be 0. "
+                        "However, a value of %s was provided in the 'g_cost' "
+                        "argument of theano.grad(). To remove this error, "
+                        "you can simply omit the 'g_cost' argument, or "
+                        "give it the default value of None." % (
+                            getattr(g_cost.type, 'dtype', 'no dtype defined'),
+                            g_cost))
+        g_cost = tensor.zeros_like(cost)
+    elif g_cost is None:
+        # cost.type.dtype is in tensor.float_dtypes at that point
+        g_cost = tensor.ones_like(cost)
+    else:
+        # Cast the provided gradient so that it has the same dtype
+        # as the cost.
+        g_cost = g_cost.astype(cost.type.dtype)
+
     grad_dict[cost] = g_cost

     # the gradient of the constants is 0
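For instance, a hedged sketch (not part of the diff, relying only on the theano.grad signature shown in the hunk header above) of the cast in the final branch: a float64 g_cost passed for a float32 cost is now cast down instead of upcasting the whole gradient graph.

import numpy as np
import theano
import theano.tensor as T

x = T.fvector('x')                          # float32
cost = T.sum(x ** 2)                        # float32 cost
g_cost = T.constant(np.float64(1.0))        # float64 scale on the cost

# With the cast above, g_cost is converted to the cost's dtype (float32),
# so the gradient graph does not silently switch to double precision.
g = theano.grad(cost, x, g_cost=g_cost)
print(theano.function([x], g)(np.ones(3, dtype='float32')))   # [2. 2. 2.]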
@@ -501,10 +531,12 @@ def grad(cost, wrt, g_cost=None, consider_constant=None,
     cost_name = cost.name

     # Make sure we didn't initialize the grad_dict with any ints
+    # for non-int outputs
     for var in grad_dict:
         g = grad_dict[var]
-        if hasattr(g.type, 'dtype'):
-            assert g.type.dtype.find('float') != -1
+        if (hasattr(g.type, 'dtype') and
+                getattr(var.type, 'dtype', '') in tensor.float_dtypes):
+            assert g.type.dtype in tensor.float_dtypes

     rval = _populate_grad_dict(var_to_node_to_idx,
             grad_dict, wrt, cost_name)
@@ -739,7 +771,30 @@ def _populate_grad_dict(var_to_node_to_idx,
             inputs = [try_to_copy_if_needed(ipt) for ipt in inputs]

-            input_grads = node.op.grad(inputs, output_grads)
+            # Build a list of output gradients with the same dtype as
+            # the corresponding output variable.
+            # If an output is of a float dtype, we want to cast the
+            # output gradient into the same dtype, to avoid having a
+            # gradient graph with double precision (taking more memory,
+            # and more computation).
+            # If an output is of an integer dtype, then we ensure the
+            # output gradient is zero, and that zero can be represented
+            # in the same int dtype.
+            # If an output gradient is a NullType or DisconnectedType,
+            # then it will not have a dtype, and it will not be changed.
+            new_output_grads = []
+            for o, og in zip(node.outputs, output_grads):
+                o_dt = getattr(o.type, 'dtype', None)
+                og_dt = getattr(og.type, 'dtype', None)
+                if o_dt and og_dt and o_dt != og_dt:
+                    if o_dt in theano.tensor.float_dtypes:
+                        new_output_grads.append(og.astype(o_dt))
+                    else:
+                        new_output_grads.append(o.zeros_like())
+                else:
+                    new_output_grads.append(og)
+
+            input_grads = node.op.grad(inputs, new_output_grads)

             if input_grads is None:
                 raise TypeError("%s.grad returned NoneType, "
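The rule applied to each (output, output_grad) pair above can be summarized outside of Theano as follows; this is a plain-NumPy sketch for illustration only, with made-up names (the real code works on symbolic variables and skips the adjustment when either side has no dtype).

import numpy as np

FLOAT_DTYPES = ('float32', 'float64')   # stand-in for tensor.float_dtypes

def adjust_output_grad(output, output_grad):
    # Float output: cast the gradient to the output's dtype.
    if output.dtype.name in FLOAT_DTYPES:
        return output_grad.astype(output.dtype)
    # Discrete (integer) output: the gradient is defined to be zero,
    # represented in the output's own dtype.
    return np.zeros_like(output)

out_f32 = np.ones(3, dtype='float32')
print(adjust_output_grad(out_f32, np.ones(3)).dtype)   # float32, not float64
out_int = np.arange(3, dtype='int64')
print(adjust_output_grad(out_int, np.ones(3)))         # [0 0 0], int64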
@@ -997,8 +1052,18 @@ def grad_sources_inputs(sources, graph_inputs):
     # build a dict mapping var to the gradient of cost with respect to var
     grad_dict = {}

-    # by default, the gradient of the cost is 1
     for output, output_grad in sources:
+        # The gradient of the cost should always be 0 if the cost is of
+        # discrete (integer) dtype.
+        if getattr(output.type, 'dtype', '') not in theano.tensor.float_dtypes:
+            output_grad = output.zeros_like()
+        else:
+            # Cast the provided gradient so that it has the same dtype
+            # as the cost.
+            output_grad = output_grad.astype(output.type.dtype)
+
         grad_dict[output] = output_grad

     # variables that do not influence the cost have zero gradient.