Commit 16496197, authored by Frédéric Bastien, committed by GitHub

Merge pull request #5234 from ReyhaneAskari/fix_opt

Useless sum in grad removed and test added
...@@ -700,7 +700,7 @@ second dimension ...@@ -700,7 +700,7 @@ second dimension
# we can sum over them # we can sum over them
# todo: only count dimensions that were effectively broadcasted # todo: only count dimensions that were effectively broadcasted
to_sum = [j for j, bcast in enumerate(ipt.type.broadcastable) to_sum = [j for j, bcast in enumerate(ipt.type.broadcastable)
if bcast] if bcast and not outs[0].broadcastable[j]]
if to_sum: if to_sum:
shuffle = [] shuffle = []
......
...@@ -1729,21 +1729,6 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1729,21 +1729,6 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
else: else:
return return
# If the arg to softmax is a broadcasted vector, d_sm has the form:
# DimShuffle{x,0}(Sum{0}(...))
# we consider what's inside of the sum instead
vector_softmax = False
if d_sm.owner and isinstance(d_sm.owner.op, tensor.DimShuffle):
ds_op = d_sm.owner.op
if ds_op.input_broadcastable == (False,) and ds_op.new_order == ('x', 0):
maybe_sum = d_sm.owner.inputs[0]
if maybe_sum.owner and isinstance(maybe_sum.owner.op, tensor.Sum):
if sm.broadcastable == (True, False)\
and maybe_sum.owner.op.axis == (0,)\
and len(maybe_sum.owner.inputs) == 1:
vector_softmax = True
d_sm = maybe_sum.owner.inputs[0]
# Two cases are supported: # Two cases are supported:
# 1. AdvancedIncSubtensor( # 1. AdvancedIncSubtensor(
# zeros_like(softmax(x)), # zeros_like(softmax(x)),
...@@ -1886,8 +1871,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1886,8 +1871,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
# Check z is zeros_like(log(sm)) # Check z is zeros_like(log(sm))
if not _is_const(z, 0): if not _is_const(z, 0):
return return
if z.broadcastable != (False, False): if z.broadcastable not in [(False, False), (True, False)]:
if not (vector_softmax and z.broadcastable == (True, False)):
return return
# here we know that we are incrementing a matrix of zeros # here we know that we are incrementing a matrix of zeros
# (or a broadcasted vector). # (or a broadcasted vector).
......
...@@ -4608,7 +4608,6 @@ class Canonizer(gof.LocalOptimizer): ...@@ -4608,7 +4608,6 @@ class Canonizer(gof.LocalOptimizer):
| x * y * z -> ([x, y, z], []) | x * y * z -> ([x, y, z], [])
""" """
# This function is recursive. The idea is that there is a # This function is recursive. The idea is that there is a
# get_num_denum recursion in which the internal ops are all # get_num_denum recursion in which the internal ops are all
# one of (main, inverse, reciprocal, DimShuffle) and the # one of (main, inverse, reciprocal, DimShuffle) and the
...@@ -5551,11 +5550,7 @@ def local_useless_reduce(node): ...@@ -5551,11 +5550,7 @@ def local_useless_reduce(node):
return [summed] return [summed]
# Enabling this optimization at canonicalization step break this test: @register_canonicalize
# theano/tensor/tests/test_opt.py:T_local_reduce.test_local_reduce_broadcast_some_0
# see gh-790 issue.
#
# @register_canonicalize
@register_uncanonicalize @register_uncanonicalize
@register_specialize @register_specialize
@gof.local_optimizer(ALL_REDUCE) @gof.local_optimizer(ALL_REDUCE)
...@@ -6340,8 +6335,9 @@ def local_greedy_distributor(node): ...@@ -6340,8 +6335,9 @@ def local_greedy_distributor(node):
if candidate not in num: if candidate not in num:
continue continue
num.remove(candidate) num.remove(candidate)
_change, candidate, num, denum = attempt_distribution(candidate, _change, candidate, num, denum = attempt_distribution(
num, denum, out_type) candidate, num, denum, out_type,)
change |= _change change |= _change
new_num.append(candidate) new_num.append(candidate)
...@@ -6349,11 +6345,10 @@ def local_greedy_distributor(node): ...@@ -6349,11 +6345,10 @@ def local_greedy_distributor(node):
if candidate not in denum: if candidate not in denum:
continue continue
denum.remove(candidate) denum.remove(candidate)
_change, candidate, denum, num = attempt_distribution(candidate, _change, candidate, denum, num = attempt_distribution(
denum, num, out_type) candidate, denum, num, out_type)
change |= _change change |= _change
new_denum.append(candidate) new_denum.append(candidate)
if not change: if not change:
return False return False
......
...@@ -1242,6 +1242,36 @@ def test_clip_grad(): ...@@ -1242,6 +1242,36 @@ def test_clip_grad():
[numpy.asarray([-1., 0.5, 2.]), 0., 1.]) [numpy.asarray([-1., 0.5, 2.]), 0., 1.])
def test_grad_useless_sum():
    """Check that the gradient graph contains no useless Sum node.

    When an op (such as T.mul) combines a broadcastable vector with a
    matrix, the gradient in the backward pass is computed for the
    broadcasted vector, so a Sum is needed to revert it to a vector.
    When the operation involves only broadcastable vectors, that Sum
    should not be generated.  This test verifies its absence and that
    the gradient values are still correct.
    """
    mode = theano.compile.get_default_mode().including('canonicalize')
    x = tensor.TensorType(theano.config.floatX, (True,))('x')
    l = tensor.log(1.0 - tensor.nnet.sigmoid(x))[0]
    g = tensor.grad(l, x)

    # Inspect the symbolic graph before compilation: the fix removes
    # the Sum at gradient-construction time, not via an optimization.
    nodes = theano.gof.graph.ops([x], [g])
    assert not any(isinstance(node.op, theano.tensor.elemwise.Sum)
                   for node in nodes)

    f = theano.function([x], g, mode=mode)
    test_values = [-100, -1, 0, 1, 100]
    # Use floatX (not a hard-coded 'float32') so the input dtype always
    # matches the dtype x was declared with above.
    outputs = [f(numpy.array([v]).astype(theano.config.floatX))
               for v in test_values]
    # d/dx log(1 - sigmoid(x)) == -sigmoid(x)
    assert numpy.allclose(outputs, [[-3.72007598e-44],
                                    [-0.26894142],
                                    [-0.5],
                                    [-0.73105858],
                                    [-1.]])
def test_clip_grad_int(): def test_clip_grad_int():
# test that integers don't crash clip gradient # test that integers don't crash clip gradient
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment