testgroup / pytensor
Commit a414e5f3
Authored Jan 11, 2010 by Pascal Lamblin
Parent commit: b72de011

Optimization of automatically computed gradient of softmax+nll by
crossentropy_softmax_1hot_with_bias_dx

Showing 1 changed file with 175 additions and 0 deletions.

theano/tensor/nnet.py  +175 -0
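For context (not part of the commit): the substitution is justified by the
closed form of this gradient. With sm = softmax(x + b) and integer labels y,

    \[
    \mathrm{nll}_i = -\log \mathrm{sm}_{i, y_i},
    \qquad
    \frac{\partial}{\partial x_{ij}} \sum_i \mathrm{nll}_i
        = \mathrm{sm}_{ij} - \mathbb{1}[j = y_i],
    \]

so the AdvancedSubtensor / AdvancedIncSubtensor graphs that automatic
differentiation produces for either form of log-softmax indexing (the two
cases matched in the diff below) can be collapsed into the single op
crossentropy_softmax_1hot_with_bias_dx, which effectively computes
dy_i * (sm_ij - 1[j = y_i]); the incoming gradient dy passed below is
ones_like(sm[:, 0]).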
@@ -953,6 +953,181 @@ def local_advanced_indexing_crossentropy_onehot(node):
     if labels.ndim == 1 and x_var.ndim == 2:
         return [-crossentropy_softmax_argmax_1hot_with_bias(x_var, b_var, labels)[0]]
 
+
+@opt.register_specialize
+@gof.local_optimizer([softmax_grad])
+def local_advanced_indexing_crossentropy_onehot_grad(node):
+    if not (node.op == softmax_grad):
+        return
+
+    sm = None
+    try:
+        out_grad, sm = node.inputs
+    except:
+        return
+
+    if sm is not None and sm.owner and sm.owner.op == softmax:
+        x_var = sm.owner.inputs[0]
+    else:
+        return
+
+    # Two cases are supported:
+    # 1. AdvancedIncSubtensor(
+    #        zeros_like(softmax(x)),
+    #        -1. / AdvancedSubtensor(softmax(x), arange(y.shape[0]), y),
+    #        arange(y.shape[0]),
+    #        y)
+    #    which arises from the gradient of log(softmax(x)[arange(y.shape[0]), y])
+    #
+    # 2. AdvancedIncSubtensor(
+    #        zeros_like(log(softmax(x))),
+    #        -1. like (AdvancedSubtensor(log(softmax(x)), arange(y.shape[0]), y)),
+    #        arange(y.shape[0]),
+    #        y)
+    #    / softmax(x)
+    #    which arises from the gradient of log(softmax(x))[arange(y.shape[0]), y]
+    #
+    # In some cases, in case 2., instead of "-1. like (AdvancedSubtensor...)",
+    # we can have "-1. like ([-1] * AdvancedSubtensor...)". This case will be
+    # recognized too, but other variants, even with the same shape, might not
+    # (yet).
+
+    # First case.
+    # After the check for AdvancedIncSubtensor, if anything does not fit with
+    # the formula above, there is no way to fit it with the second case,
+    # so we return immediately.
+    if out_grad.owner and isinstance(out_grad.owner.op,
+                                     tensor.AdvancedIncSubtensor):
+        try:
+            z, incr, rows, labels = out_grad.owner.inputs
+        except:
+            return
+
+        # Check that z == zeros_like(softmax(x))
+        if z.owner and z.owner.op == tensor.fill:
+            model, value = z.owner.inputs
+            if not (model is sm and numpy.all(value.data == 0)):
+                return
+            # else: OK
+        else:
+            return
+
+        # Check that incr has the form -1. / sm[arange(len(y)), y]
+        if incr.owner and incr.owner.op == tensor.true_div:
+            num, denom = incr.owner.inputs
+            if not numpy.all(num.data == -1):
+                return
+            # else: OK
+            if denom.owner and isinstance(denom.owner.op,
+                                          tensor.AdvancedSubtensor):
+                try:
+                    maybe_sm, maybe_rows, maybe_labels = denom.owner.inputs
+                except:
+                    return
+                if not (maybe_sm is sm
+                        and maybe_rows is rows
+                        and maybe_labels is labels):
+                    return
+                # else: OK
+            else:
+                return
+        else:
+            return
+
+        # Check that rows is arange(labels.shape[0])
+        if not _check_rows_is_arange_len_labels(rows, labels):
+            return
+        # else, arguments of AdvancedIncSubtensor are OK,
+        # it was really case 1.
+
+    # Second case
+    elif out_grad.owner and out_grad.owner.op == tensor.true_div:
+        try:
+            num, denom = out_grad.owner.inputs
+        except:
+            return
+
+        # Check the numerator (AdvancedIncSubtensor)
+        if num.owner and isinstance(num.owner.op,
+                                    tensor.AdvancedIncSubtensor):
+            try:
+                z, incr, rows, labels = num.owner.inputs
+            except:
+                return
+
+            # Check that z is zeros_like(log(sm))
+            if z.owner and z.owner.op == tensor.fill:
+                model, value = z.owner.inputs
+                if model.owner and model.owner.op == tensor.log:
+                    if sm is model.owner.inputs[0]:
+                        log_sm = model
+                    else:
+                        return
+                    if not numpy.all(value.data == 0):
+                        return
+                    # else: OK
+                else:
+                    return
+            else:
+                return
+
+            # Check that incr is (-1.) like
+            # log(softmax(x))[arange(len(y)), y]
+            if incr.owner and incr.owner.op == tensor.fill:
+                model, value = incr.owner.inputs
+                adv_subtensor = None
+                if model.owner and isinstance(model.owner.op,
+                                              tensor.AdvancedSubtensor):
+                    adv_subtensor = model
+                else:
+                    if model.owner and isinstance(model.owner.op,
+                                                  tensor.Elemwise):
+                        # TODO: try them all, not just the first one
+                        for input in model.owner.inputs:
+                            if input.owner and isinstance(
+                                    input.owner.op,
+                                    tensor.AdvancedSubtensor):
+                                adv_subtensor = input
+                                break
+                    else:
+                        return
+                if adv_subtensor is not None:
+                    try:
+                        (maybe_log_sm, maybe_rows,
+                         maybe_labels) = adv_subtensor.owner.inputs
+                    except:
+                        return
+                    if not (maybe_log_sm is log_sm
+                            and maybe_rows is rows
+                            and maybe_labels is labels):
+                        return
+                    # else: OK
+                if not numpy.all(value.data == -1):
+                    return
+            else:
+                return
+
+            # Check that rows is arange(labels.shape[0])
+            if not _check_rows_is_arange_len_labels(rows, labels):
+                return
+            # else, arguments of AdvancedIncSubtensor are OK
+
+            # Check the denominator (sm)
+            if denom is not sm:
+                return
+            # else, numerator and denominator are OK,
+            # it was really case 2.
+        else:
+            return
+    else:
+        return
+
+    # Dimension check before substitution
+    if labels.ndim == 1 and x_var.ndim == 2:
+        print 'YAY!'
+        return [crossentropy_softmax_1hot_with_bias_dx(
+            tensor.ones_like(sm[:, 0]), sm, labels)]
+    else:
+        return
+
 
 def binary_crossentropy(output, target):
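A minimal sketch, not from the commit, of the kind of graph this optimizer
targets, written against the Theano API of the time (x, y, and f are
illustrative names):

    import theano
    import theano.tensor as tensor
    from theano.tensor.nnet import softmax

    x = tensor.matrix('x')   # (batch, n_classes) pre-activations
    y = tensor.lvector('y')  # integer class labels, shape (batch,)

    sm = softmax(x)
    # Indexing the log-softmax with arange(y.shape[0]) and y is case 2 above;
    # its automatically derived gradient contains the AdvancedIncSubtensor /
    # true_div pattern that the optimizer rewrites.
    nll = -tensor.log(sm)[tensor.arange(y.shape[0]), y]
    g = tensor.grad(nll.sum(), x)

    # After the specialization pass, g should be computed by
    # crossentropy_softmax_1hot_with_bias_dx instead of the indexing graph.
    f = theano.function([x, y], g)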
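A quick numeric check of the gradient identity above, in plain numpy and
independent of Theano (all names illustrative):

    import numpy

    def softmax(x):
        # Row-wise softmax, numerically stabilized.
        e = numpy.exp(x - x.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    def nll(x, y):
        # Summed negative log-likelihood of the true classes.
        sm = softmax(x)
        return -numpy.log(sm[numpy.arange(len(y)), y]).sum()

    rng = numpy.random.RandomState(0)
    x = rng.randn(4, 3)
    y = numpy.array([0, 2, 1, 2])

    # Analytic gradient: softmax(x) - onehot(y).
    sm = softmax(x)
    g = sm.copy()
    g[numpy.arange(len(y)), y] -= 1.0

    # Central finite-difference gradient for comparison.
    eps = 1e-6
    g_fd = numpy.zeros_like(x)
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            xp = x.copy()
            xp[i, j] += eps
            xm = x.copy()
            xm[i, j] -= eps
            g_fd[i, j] = (nll(xp, y) - nll(xm, y)) / (2 * eps)

    assert numpy.allclose(g, g_fd, atol=1e-5)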