Commit 8f038cc6, authored by Pascal Lamblin

Merge pull request #3404 from kelvinxu/stack_trace

Add stack trace for nnet
...@@ -21,8 +21,8 @@ from theano import gof ...@@ -21,8 +21,8 @@ from theano import gof
from theano import scalar from theano import scalar
from theano.tensor import basic as tensor from theano.tensor import basic as tensor
from theano.tensor import subtensor from theano.tensor import subtensor
from theano.tensor import elemwise
from theano.tensor import opt from theano.tensor import opt
from theano.tensor.opt import copy_stack_trace
from theano.compile import optdb from theano.compile import optdb
from theano.gof import Apply from theano.gof import Apply
...@@ -31,6 +31,7 @@ from theano.gradient import DisconnectedType ...@@ -31,6 +31,7 @@ from theano.gradient import DisconnectedType
from theano.gradient import grad_not_implemented from theano.gradient import grad_not_implemented
from theano.tensor.type import values_eq_approx_remove_nan from theano.tensor.type import values_eq_approx_remove_nan
############ ############
# #
# TENSOR OPS # TENSOR OPS
...@@ -113,7 +114,8 @@ class SoftmaxWithBias(gof.Op): ...@@ -113,7 +114,8 @@ class SoftmaxWithBias(gof.Op):
# TODO: set error messages for failures in this code # TODO: set error messages for failures in this code
# TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1] # TODO: use this to accept float32 and int32:
# node.inputs[0].type.dtype_specs()[1]
init_decl = """ init_decl = """
npy_intp* Nx = PyArray_DIMS(%(x)s); npy_intp* Nx = PyArray_DIMS(%(x)s);
npy_intp Sx = 0; npy_intp Sx = 0;
...@@ -634,16 +636,19 @@ def local_softmax_with_bias(node): ...@@ -634,16 +636,19 @@ def local_softmax_with_bias(node):
# we're in business... # we're in business...
if len(vectors) > 1: if len(vectors) > 1:
vector_sum = tensor.add(*vectors) vector_sum = tensor.add(*vectors)
copy_stack_trace(x_in, vector_sum)
else: else:
vector_sum = vectors[0] vector_sum = vectors[0]
if len(non_vectors) > 1: if len(non_vectors) > 1:
non_vector_sum = tensor.add(*non_vectors) non_vector_sum = tensor.add(*non_vectors)
copy_stack_trace(x_in, non_vector_sum)
else: else:
non_vector_sum = non_vectors[0] non_vector_sum = non_vectors[0]
try: try:
sm_bias = softmax_with_bias(non_vector_sum, vector_sum) sm_bias = softmax_with_bias(non_vector_sum, vector_sum)
copy_stack_trace(node.outputs[0], sm_bias)
except Exception: except Exception:
# if our arguments have the wrong types, then # if our arguments have the wrong types, then
# forget about it # forget about it
...@@ -692,114 +697,6 @@ def softmax_simplifier(numerators, denominators): ...@@ -692,114 +697,6 @@ def softmax_simplifier(numerators, denominators):
return numerators, denominators return numerators, denominators
opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier') opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
# NOTE(review): this entire optimizer is disabled by `if 0:` -- it is dead
# code kept for reference only. The decorators below never execute, so the
# optimization is never registered.
if 0:
    @opt.register_specialize
    @gof.local_optimizer([tensor.add])
    def local_softmax_grad(node):
        '''dy*sm - DimShuffle{0,'x'}(sum{1}(dy*sm))*sm -> softmax_grad(dy,sm)

        Pattern-matching graph optimizer: recognize the explicit symbolic
        form of the softmax gradient (an `add` of `dy*sm` and the negated,
        broadcast row-sum of `dy*sm` times `sm`) and rewrite it into a
        single fused `softmax_grad(dy, sm)` node, re-adding any remaining
        terms of the sum.

        Returns a one-element list with the replacement variable, or None
        (implicitly) when the pattern does not match.
        '''
        # TODO what if the signs are changed?
        # TODO and if a scalar is distributed before each of the terms?
        # TODO 'dy' could also be a product
        if node.op == tensor.add and node.out.ndim == 2:
            add_inputs = node.inputs
            # Trying to locate two nodes in the sum:
            # dy * sm, prod_term
            # - DimShuffle{0,'x'}(sum{1}(dy*sm))*sm
            prod_term = None
            other_terms = []
            # First, prod_term: the first `mul` of exactly two 2-d inputs
            # found among the addends. Everything else goes to other_terms.
            for add_in in add_inputs:
                if (add_in.owner and
                        add_in.owner.op == tensor.mul and
                        prod_term is None):
                    mul_inputs = add_in.owner.inputs
                    if (len(mul_inputs) == 2 and
                            all([mul_in.ndim == 2 for mul_in in mul_inputs])):
                        prod_term = add_in
                    else:
                        other_terms.append(add_in)
                else:
                    other_terms.append(add_in)
            if prod_term is None:
                # print 'no prod_term'
                return
            assert len(other_terms) == len(add_inputs) - 1
            # Second pass: among the remaining addends, find the term
            # -(DimShuffle{0,'x'}(Sum{axis=1}(prod_term)) * sm).
            ds_term = None
            rest = []
            for add_in in other_terms:
                if add_in.owner and add_in.owner.op == tensor.neg:
                    neg_input = add_in.owner.inputs[0]
                    if neg_input.owner and neg_input.owner.op == tensor.mul:
                        mul2_inputs = neg_input.owner.inputs
                        if len(mul2_inputs) != 2:
                            rest.append(add_in)
                            # print 'len(mul2_inputs) =', len(mul2_inputs)
                            continue
                        # Try and find DimShuffle(Sum)
                        maybe_ds = None
                        for i, mul2_in in enumerate(mul2_inputs):
                            if mul2_in.owner and isinstance(mul2_in.owner.op,
                                                            elemwise.DimShuffle):
                                maybe_ds = mul2_in
                                maybe_sm = mul2_inputs[1 - i]  # The other one
                        # If maybe_ds is None, short-circuit keeps the
                        # possibly-unbound maybe_sm from being evaluated.
                        if (maybe_ds is None or
                                maybe_ds.ndim != 2 or
                                maybe_sm.ndim != 2):
                            rest.append(add_in)
                            # print 'maybe_ds =', maybe_ds
                            # if maybe_ds:
                            #     print 'maybe_ds.ndim =', maybe_ds.ndim, ', maybe_sm.ndim =', maybe_sm.ndim
                            continue
                        # maybe_sm must be one of prod_term's two factors;
                        # the other factor is then taken to be dy.
                        if maybe_sm is mul_inputs[0]:
                            maybe_dy = mul_inputs[1]
                        elif maybe_sm is mul_inputs[1]:
                            maybe_dy = mul_inputs[0]
                        else:
                            rest.append(add_in)
                            # print 'maybe_sm, maybe_dy =', maybe_sm, maybe_dy
                            # print 'mul_inputs =', mul_inputs
                            continue
                        ds_order = maybe_ds.owner.op.new_order
                        ds_input = maybe_ds.owner.inputs[0]
                        axis = None
                        if ds_input.owner and isinstance(ds_input.owner.op,
                                                         elemwise.Sum):
                            axis = ds_input.owner.op.axis
                            sum_input = ds_input.owner.inputs[0]
                        # If axis stayed None, `axis != (1,)` short-circuits
                        # before the possibly-unbound sum_input is read.
                        if ((ds_order != (0, 'x')) or
                                (axis != (1,)) or
                                (sum_input is not prod_term)):
                            rest.append(add_in)
                            # print 'ds_order =', ds_order
                            # print 'axis =', axis
                            # if axis is not None:
                            #     print 'sum_input =', sum_input, ', prod_term =', prod_term
                            # else:
                            #     print 'ds_input.owner =', ds_input.owner
                            # print 'add_in =', add_in
                            continue
                        # NOTE(review): ds_term is recorded here, but
                        # maybe_dy/maybe_sm are plain loop locals -- a later
                        # iteration that matches partway and then falls into
                        # `rest` can rebind them before the final return
                        # below uses them. Latent bug; moot while `if 0:`.
                        ds_term = add_in
                    else:
                        # print 'neg_input.owner =', neg_input.owner
                        rest.append(add_in)
                else:
                    # print 'add_in.owner =', add_in.owner
                    rest.append(add_in)
            if ds_term is None:
                # print 'no ds_term'
                return
            # Full pattern matched: replace with the fused gradient op,
            # adding back any leftover addends that were not part of it.
            if len(rest) == 0:
                return [softmax_grad(maybe_dy, maybe_sm)]
            else:
                return [tensor.add(softmax_grad(maybe_dy, maybe_sm), *rest)]
class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
""" """
...@@ -1457,6 +1354,7 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node): ...@@ -1457,6 +1354,7 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
g_nll, coding_dist, true_one_of_n = g_coding_dist.owner.inputs g_nll, coding_dist, true_one_of_n = g_coding_dist.owner.inputs
dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, coding_dist, dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, coding_dist,
true_one_of_n) true_one_of_n)
copy_stack_trace(node.outputs[0], dx)
return [dx] return [dx]
...@@ -1485,13 +1383,18 @@ def local_argmax_pushdown(node): ...@@ -1485,13 +1383,18 @@ def local_argmax_pushdown(node):
if x.owner and x.owner.op in (softmax_op, softplus, tensor.exp, if x.owner and x.owner.op in (softmax_op, softplus, tensor.exp,
tensor.log, tensor.tanh, sigmoid): tensor.log, tensor.tanh, sigmoid):
pre_x, = x.owner.inputs pre_x, = x.owner.inputs
return tensor._max_and_argmax(pre_x, axis) ret = tensor._max_and_argmax(pre_x, axis)
copy_stack_trace(x_max, ret)
return ret
if x.owner and x.owner.op == softmax_with_bias: if x.owner and x.owner.op == softmax_with_bias:
pre_x, pre_bias = x.owner.inputs pre_x, pre_bias = x.owner.inputs
return tensor._max_and_argmax(pre_x + ret = tensor._max_and_argmax(pre_x +
tensor.DimShuffle( tensor.DimShuffle(
pre_bias.broadcastable, pre_bias.broadcastable,
('x', 0))(pre_bias), axis) ('x', 0))(pre_bias), axis)
# copy both stack traces
copy_stack_trace(x_max, ret)
return ret
# Utility function used by the two next optimizations # Utility function used by the two next optimizations
...@@ -1585,9 +1488,12 @@ def local_advanced_indexing_crossentropy_onehot(node): ...@@ -1585,9 +1488,12 @@ def local_advanced_indexing_crossentropy_onehot(node):
# Check that rows == arange(labels.shape[0]) # Check that rows == arange(labels.shape[0])
if _check_rows_is_arange_len_labels(rows, labels): if _check_rows_is_arange_len_labels(rows, labels):
if labels.ndim == 1 and x_var.ndim == 2: if labels.ndim == 1 and x_var.ndim == 2:
return [-crossentropy_softmax_argmax_1hot_with_bias(x_var, minus_ret = crossentropy_softmax_argmax_1hot_with_bias(x_var,
b_var, b_var,
labels)[0]] labels)[0]
ret = -minus_ret
copy_stack_trace(node.outputs[0], [minus_ret, ret])
return [ret]
@opt.register_specialize('fast_compile_gpu') @opt.register_specialize('fast_compile_gpu')
...@@ -1809,7 +1715,11 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1809,7 +1715,11 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
# Dimension check before substitution # Dimension check before substitution
if labels.ndim == 1 and x_var.ndim == 2: if labels.ndim == 1 and x_var.ndim == 2:
return [crossentropy_softmax_1hot_with_bias_dx(out_grad, sm, labels)] ret = crossentropy_softmax_1hot_with_bias_dx(out_grad, sm, labels)
# The stack trace is not added to output_grad, sm and labels at
# the moment but may need to be added at a future point
copy_stack_trace(node.outputs[0], ret)
return [ret]
else: else:
return return
...@@ -1825,6 +1735,7 @@ def graph_merge_softmax_with_crossentropy_softmax(node): ...@@ -1825,6 +1735,7 @@ def graph_merge_softmax_with_crossentropy_softmax(node):
if big_client in [b_client[0] for b_client in b.clients]: if big_client in [b_client[0] for b_client in b.clients]:
xx, bb, ll = big_client.inputs xx, bb, ll = big_client.inputs
mergeable_client = big_client.op(x, b, ll) mergeable_client = big_client.op(x, b, ll)
copy_stack_trace(node.outputs[0], mergeable_client[1])
return [mergeable_client[1]] return [mergeable_client[1]]
...@@ -1885,7 +1796,9 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node): ...@@ -1885,7 +1796,9 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
msg = '`sm` and `dy` do not have the same shape.' msg = '`sm` and `dy` do not have the same shape.'
dz = opt.Assert(msg)(dz, cond) dz = opt.Assert(msg)(dz, cond)
return [node.op(dz, sm, y_idx)] ret = node.op(dz, sm, y_idx)
copy_stack_trace(node.outputs[0], ret)
return [ret]
def binary_crossentropy(output, target): def binary_crossentropy(output, target):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论