Commit 16496197, authored by Frédéric Bastien, committed by GitHub

Merge pull request #5234 from ReyhaneAskari/fix_opt

Useless sum in grad removed and test added
...@@ -700,7 +700,7 @@ second dimension ...@@ -700,7 +700,7 @@ second dimension
# we can sum over them # we can sum over them
# todo: only count dimensions that were effectively broadcasted # todo: only count dimensions that were effectively broadcasted
to_sum = [j for j, bcast in enumerate(ipt.type.broadcastable) to_sum = [j for j, bcast in enumerate(ipt.type.broadcastable)
if bcast] if bcast and not outs[0].broadcastable[j]]
if to_sum: if to_sum:
shuffle = [] shuffle = []
......
...@@ -1729,21 +1729,6 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1729,21 +1729,6 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
else: else:
return return
# If the arg to softmax is a broadcasted vector, d_sm has the form:
# DimShuffle{x,0}(Sum{0}(...))
# we consider what's inside of the sum instead
vector_softmax = False
if d_sm.owner and isinstance(d_sm.owner.op, tensor.DimShuffle):
ds_op = d_sm.owner.op
if ds_op.input_broadcastable == (False,) and ds_op.new_order == ('x', 0):
maybe_sum = d_sm.owner.inputs[0]
if maybe_sum.owner and isinstance(maybe_sum.owner.op, tensor.Sum):
if sm.broadcastable == (True, False)\
and maybe_sum.owner.op.axis == (0,)\
and len(maybe_sum.owner.inputs) == 1:
vector_softmax = True
d_sm = maybe_sum.owner.inputs[0]
# Two cases are supported: # Two cases are supported:
# 1. AdvancedIncSubtensor( # 1. AdvancedIncSubtensor(
# zeros_like(softmax(x)), # zeros_like(softmax(x)),
...@@ -1886,8 +1871,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1886,8 +1871,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
# Check z is zeros_like(log(sm)) # Check z is zeros_like(log(sm))
if not _is_const(z, 0): if not _is_const(z, 0):
return return
if z.broadcastable != (False, False): if z.broadcastable not in [(False, False), (True, False)]:
if not (vector_softmax and z.broadcastable == (True, False)):
return return
# here we know that we are incrementing a matrix of zeros # here we know that we are incrementing a matrix of zeros
# (or a broadcasted vector). # (or a broadcasted vector).
......
...@@ -4608,7 +4608,6 @@ class Canonizer(gof.LocalOptimizer): ...@@ -4608,7 +4608,6 @@ class Canonizer(gof.LocalOptimizer):
| x * y * z -> ([x, y, z], []) | x * y * z -> ([x, y, z], [])
""" """
# This function is recursive. The idea is that there is a # This function is recursive. The idea is that there is a
# get_num_denum recursion in which the internal ops are all # get_num_denum recursion in which the internal ops are all
# one of (main, inverse, reciprocal, DimShuffle) and the # one of (main, inverse, reciprocal, DimShuffle) and the
...@@ -5551,11 +5550,7 @@ def local_useless_reduce(node): ...@@ -5551,11 +5550,7 @@ def local_useless_reduce(node):
return [summed] return [summed]
# Enabling this optimization at canonicalization step break this test: @register_canonicalize
# theano/tensor/tests/test_opt.py:T_local_reduce.test_local_reduce_broadcast_some_0
# see gh-790 issue.
#
# @register_canonicalize
@register_uncanonicalize @register_uncanonicalize
@register_specialize @register_specialize
@gof.local_optimizer(ALL_REDUCE) @gof.local_optimizer(ALL_REDUCE)
...@@ -6340,8 +6335,9 @@ def local_greedy_distributor(node): ...@@ -6340,8 +6335,9 @@ def local_greedy_distributor(node):
if candidate not in num: if candidate not in num:
continue continue
num.remove(candidate) num.remove(candidate)
_change, candidate, num, denum = attempt_distribution(candidate, _change, candidate, num, denum = attempt_distribution(
num, denum, out_type) candidate, num, denum, out_type,)
change |= _change change |= _change
new_num.append(candidate) new_num.append(candidate)
...@@ -6349,11 +6345,10 @@ def local_greedy_distributor(node): ...@@ -6349,11 +6345,10 @@ def local_greedy_distributor(node):
if candidate not in denum: if candidate not in denum:
continue continue
denum.remove(candidate) denum.remove(candidate)
_change, candidate, denum, num = attempt_distribution(candidate, _change, candidate, denum, num = attempt_distribution(
denum, num, out_type) candidate, denum, num, out_type)
change |= _change change |= _change
new_denum.append(candidate) new_denum.append(candidate)
if not change: if not change:
return False return False
......
...@@ -1242,6 +1242,36 @@ def test_clip_grad(): ...@@ -1242,6 +1242,36 @@ def test_clip_grad():
[numpy.asarray([-1., 0.5, 2.]), 0., 1.]) [numpy.asarray([-1., 0.5, 2.]), 0., 1.])
def test_grad_useless_sum():
    """Check that the gradient graph contains no useless Sum node.

    When an op (such as T.mul) combines a broadcastable vector with a
    matrix, the gradient in the backward pass is computed for the
    broadcasted vector, so a Sum is needed to revert it to a vector.
    When the operation involves only broadcastable vectors, that Sum
    should not be generated.  This test verifies its absence and that
    the gradient values are still correct.
    """
    mode = theano.compile.get_default_mode().including('canonicalize')
    x = tensor.TensorType(theano.config.floatX, (True,))('x')
    l = tensor.log(1.0 - tensor.nnet.sigmoid(x))[0]
    g = tensor.grad(l, x)

    # Inspect the symbolic graph before compilation: the fix removes
    # the Sum at gradient-construction time, not via an optimization.
    nodes = theano.gof.graph.ops([x], [g])
    assert not any(isinstance(node.op, theano.tensor.elemwise.Sum)
                   for node in nodes)

    f = theano.function([x], g, mode=mode)
    test_values = [-100, -1, 0, 1, 100]
    # Use floatX (not a hard-coded 'float32') so the input dtype always
    # matches the dtype x was declared with above.
    outputs = [f(numpy.array([v]).astype(theano.config.floatX))
               for v in test_values]
    # d/dx log(1 - sigmoid(x)) == -sigmoid(x)
    assert numpy.allclose(outputs, [[-3.72007598e-44],
                                    [-0.26894142],
                                    [-0.5],
                                    [-0.73105858],
                                    [-1.]])
def test_clip_grad_int(): def test_clip_grad_int():
# test that integers don't crash clip gradient # test that integers don't crash clip gradient
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment