Merge pull request #3206 from julianser/new_stacktrace_fix

Continued work on #3018: Stacktrace fix

Merge pull request #3206 from julianser/new_stacktrace_fix
805991f1 · Pascal Lamblin · 33c97605 · 84b21aa2 · 805991f1 · 805991f1
--- a/theano/compile/mode.py
+++ b/theano/compile/mode.py
@@ -199,7 +199,7 @@ optdb.register('merge1', gof.MergeOptimizer(),
 # rearranges elemwise expressions
 optdb.register('canonicalize', gof.EquilibriumDB(ignore_newtrees=False),
-               1, 'fast_run', 'fast_compile')
+               1, 'fast_run', 'fast_compile', 'canonicalize_db')
 # Register in the canonizer Equilibrium as a clean up opt the merge opt.
 # Without this, as the equilibrium have ignore_newtrees=False, we
 # won't merge all nodes if it is set as a global optimizer with

--- a/theano/gof/fg.py
+++ b/theano/gof/fg.py
@@ -450,7 +450,7 @@ class FunctionGraph(utils.object2):
                            assert path is not None
                            tr = getattr(r.tag, 'trace', [])
                            detailed_err_msg = ""
-                            if len(tr) > 0:
+                            if type(tr) is list and len(tr) > 0:
                                detailed_err_msg += "\nBacktrace when the variable is created:\n"
                                # Print separate message for each element in

--- a/theano/gof/link.py
+++ b/theano/gof/link.py
@@ -168,7 +168,7 @@ def raise_with_op(node, thunk=None, exc_info=None, storage_map=None):
    # Print node backtraces
    tr = getattr(node.outputs[0].tag, 'trace', [])
-    if len(tr) > 0:
+    if type(tr) is list and len(tr) > 0:
        detailed_err_msg += "\nBacktrace when the node is created:\n"
        # Print separate message for each element in the list of batcktraces

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -542,7 +542,7 @@ class PureOp(object):
                    "For compute_test_value, one input test value does not"
                    " have the requested type.\n")
                tr = getattr(v.tag, 'trace', [])
-                if len(tr) > 0:
+                if type(tr) is list and len(tr) > 0:
                    detailed_err_msg += (
                        " \nBacktrace when that variable is created:\n")
                    # Print separate message for each element in the list

--- a/theano/gof/utils.py
+++ b/theano/gof/utils.py
@@ -77,6 +77,7 @@ def add_tag_trace(thing, user_line=1):
    if limit == -1:
        limit = None
    tr = simple_extract_stack(limit=limit)[:-1]
    # Different python version use different sementic for
    # limit. python 2.7 include the call to extrack_stack. The -1 get
    # rid of it.
@@ -93,7 +94,11 @@ def add_tag_trace(thing, user_line=1):
                  "theano/sparse/", "theano\\sparse\\",
                  "theano/typed_list/", "theano\\typed_list\\",
                  ]:
-            if p in file_path:
+            # Julian: I added the 'tests' exception together with Arnaud.
+            # Otherwise, we'd lose the stack trace in our test cases
+            # (e.g. in test_opt.py). We're not sure this is the right way to
+            # do it though.
+            if p in file_path and 'tests' not in file_path:
                tr = tr[:-1]
                rm = True
                break

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -87,15 +87,13 @@ def copy_stack_trace(from_var, to_var):
    tr = []
    if type(from_var) is list:
        # If from_var is a list, store concatenated stack traces
-        if len(from_var) > 0:
+        for v in from_var:
-            for v in from_var:
+            tr += getattr(v.tag, 'trace', [])
-                if hasattr(v.tag, 'trace'):
-                    tr = tr + v.tag.trace
    else:
-        # If from_var is not a list, it must be a single tensor
+        # If from_var is not a list, it must be a single tensor variable,
-        # variable, so just store that particular stack trace
+        # so just store that particular stack trace
-        if hasattr(from_var.tag, 'trace'):
+        tr = getattr(from_var.tag, 'trace', [])
-            tr = from_var.tag.trace
    # Copy over stack traces to to_var
    if type(to_var) is list:
@@ -1853,6 +1851,7 @@ def local_subtensor_make_vector(node):
                    ret = [x.owner.inputs[v]]
                except IndexError:
                    raise NotScalarConstantError("Bad user graph!")
                return ret
            except NotScalarConstantError:
                pass
@@ -1872,7 +1871,10 @@ def local_subtensor_make_vector(node):
        try:
            const_slice = node.op.get_constant_idx(node.inputs,
                                                   allow_partial=False)[0]
-            return [make_vector(*x.owner.inputs[const_slice])]
+            ret = make_vector(*x.owner.inputs[const_slice])
+            # Copy over stack trace from previous outputs to new output
+            copy_stack_trace(node.outputs, ret)
+            return [ret]
        except NotScalarConstantError:
            pass
    else:
@@ -1995,13 +1997,14 @@ def local_alloc_unary(node):
            x = a.owner.inputs[0]
            shp = a.owner.inputs[1:]
            v = node.op(x)
+            # T.alloc does not preserve the stacktrace of v,
+            # so we need to copy it over from x.
            copy_stack_trace(node.outputs[0], v)
            ret = T.alloc(T.cast(v, node.outputs[0].dtype), *shp)
-            # Is it really necessary to copy over stack trace here?
+            # T.cast does not preserve the stacktrace of x,
-            # after all, T.alloc and T.cast should preserve the stack trace from x,
+            # so we need to copy it over to the output.
-            # but perhaps the trace is lost in "v = node.op(x)"?
+            copy_stack_trace([node.outputs[0], a], ret)
-            copy_stack_trace(node.outputs[0], ret)
            return [ret]
@@ -2293,6 +2296,9 @@ def local_upcast_elemwise_constant_inputs(node):
                    # As this is just to allow merging more case, if
                    # the upcast don't work, we can just skip it.
                    return
+                # Copy over output stacktrace from before upcasting
+                copy_stack_trace(node.outputs[0], rval)
                return rval
 ##################
@@ -2345,7 +2351,10 @@ def local_useless_inc_subtensor(node):
               for e in node.op.idx_list):
            # They are the same shape, so we can remore this IncSubtensor
            return [node.inputs[1]]
-        return [Subtensor(node.op.idx_list)(*node.inputs[1:])]
+        ret = Subtensor(node.op.idx_list)(*node.inputs[1:])
+        # Copy over previous output stacktrace
+        copy_stack_trace(node.outputs, ret)
+        return [ret]
 @register_canonicalize
@@ -2378,7 +2387,11 @@ def local_set_to_inc_subtensor(node):
        if (subn.inputs[1] != node.inputs[2] or
                subn.inputs[0] != node.inputs[0]):
            return
-        return [advanced_inc_subtensor1(node.inputs[0], other, node.inputs[2])]
+        ret = advanced_inc_subtensor1(node.inputs[0], other, node.inputs[2])
+        # Copy over previous output stacktrace
+        # Julian: I'm not sure about this at all...
+        copy_stack_trace(node.outputs, ret)
+        return [ret]
 @register_canonicalize
@@ -2404,7 +2417,8 @@ def local_useless_slice(node):
            sl_ins = Subtensor.collapse(slices[:last_slice],
                                        lambda x: isinstance(x, T.Variable))
            out = subtens(node.inputs[0], *sl_ins)
+            # Copy over previous output stacktrace
+            copy_stack_trace(node.outputs, out)
            return [out]
@@ -2522,6 +2536,8 @@ def local_useless_subtensor(node):
    else:
        return False
+    # We don't need to copy over any stacktrace here,
+    # because previous stacktrace should suffice.
    return [node.inputs[0]]
@@ -2546,7 +2562,13 @@ def local_subtensor_lift(node):
        if isinstance(u.owner.op, T.Elemwise) and len(u.owner.inputs) == 1:
            idx = node.inputs[1:]
            x_idx = node.op(u.owner.inputs[0], *idx)
-            return [u.owner.op(x_idx)]
+            # Copy over previous output stacktrace
+            copy_stack_trace(node.outputs, x_idx)
+            ret = u.owner.op(x_idx)
+            # Copy over previous output stacktrace
+            # and stacktrace from previous unary operation
+            copy_stack_trace([node.outputs[0], node.inputs[0]], ret)
+            return [ret]
        if isinstance(u.owner.op, T.Elemwise):
            new_inputs = []
@@ -2554,7 +2576,14 @@ def local_subtensor_lift(node):
                # There is no broadcastable in the inputs
                idx = node.inputs[1:]
                new_inputs = [node.op(i, *idx) for i in u.owner.inputs]
-                return [u.owner.op(*new_inputs)]
+                # Copy over previous output stacktrace
+                copy_stack_trace(node.outputs[0], new_inputs)
+                ret = u.owner.op(*new_inputs)
+                # Copy over previous output stacktrace
+                # and stacktrace from previous unary operation
+                copy_stack_trace([node.outputs[0], node.inputs[0]], ret)
+                return [ret]
            elif all([sum(i.type.broadcastable) in [i.ndim, 0]
                      for i in u.owner.inputs]):
                # There is no broadcastable in the inputs or it is scalar
@@ -2571,7 +2600,15 @@ def local_subtensor_lift(node):
                        else:
                            new_inputs.append(
                                i.dimshuffle(['x'] * node.outputs[0].ndim))
-                return [u.owner.op(*new_inputs)]
+                # Copy over previous output stacktrace
+                copy_stack_trace(node.outputs[0], new_inputs)
+                ret = u.owner.op(*new_inputs)
+                # Copy over previous output stacktrace
+                # and stacktrace from previous unary operation
+                copy_stack_trace([node.outputs[0], node.inputs[0]], ret)
+                return [ret]
        if isinstance(u.owner.op, T.Rebroadcast):
            # make sure that Rebroadcast has only 1 input
@@ -2597,7 +2634,13 @@ def local_subtensor_lift(node):
                j += 1
            subt_x = node.op(u.owner.inputs[0], *node.inputs[1:])
+            # Copy over previous output stacktrace
+            copy_stack_trace(node.outputs[0], subt_x)
            rbcast_subt_x = T.Rebroadcast(*new_axis)(subt_x)
+            # Copy over previous output stacktrace
+            # and stacktrace from previous unary operation
+            copy_stack_trace([node.outputs[0], node.inputs[0]], rbcast_subt_x)
            return [rbcast_subt_x]
@@ -2789,11 +2832,18 @@ def local_subtensor_merge(node):
            merged_slices = make_constant(merged_slices)
            subtens = Subtensor(merged_slices)
            sl_ins = Subtensor.collapse(
                merged_slices,
                lambda x: isinstance(x, T.Variable))
            # Do not call make_node for test_value
            out = subtens(x, *sl_ins)
+            # Copy over previous output stacktrace
+            # and stacktrace from previous slicing operation.
+            # Why? Because, the merged slicing operation could have failed
+            # because of either of the two original slicing operations
+            copy_stack_trace([node.outputs[0], node.inputs[0]], out)
            return [out]
@@ -2912,7 +2962,19 @@ def local_subtensor_of_dot(node):
    a_sub = a.__getitem__(tuple(a_indices))
    b_sub = b.__getitem__(tuple(b_indices)) if b_indices else b
-    return [T.dot(a_sub, b_sub)]
+    # Copy over previous output stacktrace to a_sub and b_sub,
+    # because an error in the subtensor operation (e.g. an index error)
+    # on either a or b must correspond to an error in the
+    # subtensor operation on their dot product.
+    copy_stack_trace(node.outputs[0], [a_sub, b_sub])
+    # Copy over previous output stacktrace and previous dot product stacktrace,
+    # because an error here may correspond to an error in either the original
+    # dot product, or in the dot product after the subtensor operation.
+    r = T.dot(a_sub, b_sub)
+    copy_stack_trace([node.outputs[0], node.inputs[0]], r)
+    return [r]
 @register_canonicalize
@@ -2968,12 +3030,21 @@ def local_IncSubtensor_serialize(node):
                          [mi.owner.inputs[0] for mi in movable_inputs])
            new_add = T.add(*new_inputs)
+            # Copy over stacktrace from original output, as an error
+            # (e.g. an index error) in this add operation should
+            # correspond to an error in the original add operation.
+            copy_stack_trace(node.outputs[0], new_add)
            # stack up the new incsubtensors
            tip = new_add
            for mi in movable_inputs:
                assert tip.type == o_type
                assert tip.type == mi.owner.inputs[0].type
                tip = mi.owner.op(tip, *mi.owner.inputs[1:])
+                # Copy over stacktrace from outputs of the original
+                # "movable" operation to the new operation.
+                copy_stack_trace(node.outputs + mi.owner.outputs, tip)
            return [tip]
        # print incsub_inputs, [id(i.owner.inputs[0]) for i in incsub_inputs]
@@ -3003,6 +3074,10 @@ def local_inplace_setsubtensor(node):
            set_instead_of_inc=node.op.set_instead_of_inc,
            destroyhandler_tolerate_aliased=dta)
        new_node = new_op(*node.inputs)
+        # Copy stacktrace from original outputs to new outputs.
+        # This is sensible, because the new operation is the
+        # same as the old one, but now with different attributes.
+        copy_stack_trace(node.outputs, new_node)
        return [new_node]
    return False
 compile.optdb.register('local_inplace_setsubtensor',
@@ -3021,6 +3096,11 @@ def local_inplace_incsubtensor1(node):
    if isinstance(node.op, AdvancedIncSubtensor1) and not node.op.inplace:
        new_op = node.op.clone_inplace()
        new_node = new_op(*node.inputs)
+        # Copy stacktrace from original outputs to new outputs.
+        # This is sensible, because the new operation is the
+        # same as the old one, but now with different attributes.
+        copy_stack_trace(node.outputs, new_node)
        return [new_node]
    return False
 compile.optdb.register('local_inplace_incsubtensor1',
@@ -3055,6 +3135,8 @@ def local_incsubtensor_of_zeros(node):
            pass
        if replace:
+            # No need to copy over the stacktrace,
+            # because x should already have a stacktrace
            return [x]
        else:
            return False
@@ -3089,6 +3171,9 @@ def local_setsubtensor_of_constants(node):
        if (replace_x is not None and
                replace_y is not None and
                replace_x == replace_y):
+            # No need to copy over the stacktrace,
+            # because x should already have a stacktrace
            return [x]
        else:
            return False
@@ -3135,7 +3220,13 @@ def local_adv_sub1_adv_inc_sub1(node):
        return [y]
    # It is possible that y is upcast or downcast to x.dtype.
    # In all case, as we set or add with 0, we can just cast y.
-    return [T.cast(y, node.outputs[0].dtype)]
+    r = T.cast(y, node.outputs[0].dtype)
+    # Copy over stacktrace from before casting, since
+    # we don't expect problems in the casting operation,
+    # and any problems in the indexing would have been spotted above.
+    copy_stack_trace(y, r)
+    return [r]
 @register_specialize
@@ -3238,7 +3329,14 @@ def local_useless_inc_subtensor_alloc(node):
                msg = '`x[i]` and `y` do not have the same shape.'
                z = Assert(msg)(z, *cond)
-            return [node.op(x, z, *i)]
+            r = node.op(x, z, *i)
+            # Copy over stacktrace from previous output, since
+            # we don't expect problems when removing the intermediate
+            # alloc operation and so we still want to point at the line
+            # of the inc_subtensor operation.
+            copy_stack_trace(node.outputs, r)
+            return [r]
 ####################
@@ -3257,6 +3355,8 @@ def local_useless_rebroadcast(node):
        x = node.inputs[0]
        if numpy.all(x.broadcastable == node.outputs[0].broadcastable):
            # No broadcastable flag was modified
+            # No need to copy over stack trace,
+            # because x should already have a stack trace.
            return [x]
        else:
            # Keep the flags that modify something
@@ -3268,7 +3368,10 @@ def local_useless_rebroadcast(node):
                # All flags are useful
                return
            else:
-                return [T.Rebroadcast(*list(new_axis.items()))(x)]
+                r = T.Rebroadcast(*list(new_axis.items()))(x)
+                # Copy over stacktrace from previous output
+                copy_stack_trace(node.outputs, r)
+                return [r]
 @register_canonicalize
@@ -3295,8 +3398,21 @@ def local_rebroadcast_lift(node):
        # by the `unbroadcast` function before we are in the actual function
        # compilation phase.
        if hasattr(input, 'clients') and len(input.clients) == 1:
-            rval = inode.op.make_node(T.Rebroadcast(*list(op.axis.items()))(
+            rebroadcasted = T.Rebroadcast(*list(op.axis.items()))(
-                inode.inputs[0])).outputs
+                inode.inputs[0])
+            # Copy over stacktrace from previous output (after rebroadcasting)
+            # to new output, because an error in the new graph right after
+            # rebroadcasting must have been caused by the previous rebroadcasting.
+            copy_stack_trace(node.outputs, rebroadcasted)
+            rval = inode.op.make_node(rebroadcasted).outputs
+            # Copy over stacktrace from previous output (after rebroadcasting)
+            # and input (after elemwise operation) to new output, because an
+            # error in the new graph could have been caused by either of the
+            # two ops.
+            copy_stack_trace(node.outputs + node.inputs, rval)
            return rval
    if inode and isinstance(inode.op, T.Rebroadcast):
        # the "axis" specification in the outer Rebroadcast overrides
@@ -3304,7 +3420,14 @@ def local_rebroadcast_lift(node):
        axis = inode.op.axis.copy()
        axis.update(op.axis)
        iinput = inode.inputs[0]
        rval = [T.Rebroadcast(*list(axis.items()))(iinput)]
+        # Copy over stacktrace from previous output (after second rebroadcast)
+        # and from previous input (after first rebroadcast op) because an error in
+        # the new graph could have been caused by either of the two
+        # rebroadcast ops.
+        copy_stack_trace(node.outputs + node.inputs, rval)
        return rval
@@ -3358,6 +3481,8 @@ def local_join_1(node):
        return
    tensors = node.inputs[1:]
    if len(tensors) == 1:
+        # We don't need to copy over any stacktrace here, because the
+        # input variable should already have its own stacktrace.
        return [tensors[0]]
@@ -3396,10 +3521,21 @@ def local_join_empty(node):
        if ret.dtype != o.dtype:
            # Join can upcast some inputs
            return
+        # Copy over stacktrace from previous output (after join op)
+        # to new output, because an error in the new op must be caused
+        # by an error in the old join op.
+        copy_stack_trace(node.outputs, ret)
        if ret.type != o.type:
            assert ret.dtype == o.dtype
            assert ret.ndim == o.ndim
            ret = T.patternbroadcast(ret, node.outputs[0].broadcastable)
+        # Copy over stacktrace from previous output
+        # (after patternbroadcast op) for same reasons as before.
+        copy_stack_trace(node.outputs, ret)
        return [ret]
@@ -3426,10 +3562,20 @@ def local_join_make_vector(node):
                inp.owner.op == new_inputs[-1].owner.op):
            inps = new_inputs[-1].owner.inputs + inp.owner.inputs
            new_inputs[-1] = inp.owner.op(*inps)
+            # Copy over stacktrace from previous output (after join op)
+            # to new intermediate output, because an error in the intermediate
+            # op must be caused by an error in the old join op.
+            copy_stack_trace(node.outputs, new_inputs[-1])
        else:
            new_inputs.append(inp)
    if len(new_inputs) < len(node.inputs) - 1:
        ret = T.join(node.inputs[0], *new_inputs)
+        # Copy over stacktrace from previous output (after join op)
+        # to new output, because an error in the new op must be caused
+        # by an error in the old join op.
+        copy_stack_trace(node.outputs, ret)
        return [ret]
@@ -3455,25 +3601,40 @@ def local_useless_switch(node):
        cond = T.extract_constant(node.inputs[0], elemwise=False)
        if type(cond) is numpy.ndarray and cond.ndim == 0:
            if cond == 0:
-                out = node.inputs[2]
+                correct_out = node.inputs[2]
            else:
-                out = node.inputs[1]
+                correct_out = node.inputs[1]
-            if out.ndim != node.outputs[0].ndim:
+            if correct_out.ndim != node.outputs[0].ndim:
                # TODO: broadcast?
                return False
-            if out.dtype != node.outputs[0].dtype:
+            if correct_out.dtype != node.outputs[0].dtype:
-                out = T.cast(out, node.outputs[0].dtype)
+                out = T.cast(correct_out, node.outputs[0].dtype)
+            else:
+                out = correct_out
            if out.type.broadcastable != node.outputs[0].type.broadcastable:
                # We need to copy data to the new dimensions during execution
                out = T.alloc(out, *[node.outputs[0].shape[i] for i
                                     in xrange(out.ndim)])
+            else:
+                out = out
+            # Copy over stacktrace from selected output to new output
+            copy_stack_trace(node.outputs + correct_out, out)
            return [out]
        # if left is right -> left
        if node.inputs[1] is node.inputs[2]:
+            # Note: No need to copy over stacktrace, because the input node
+            # already has its own stacktrace
            if cond.type == node.inputs[1].type:
                return [node.inputs[1]]
-            return [T.fill(cond, node.inputs[1])]
+            ret = T.fill(cond, node.inputs[1])
+            # Copy over stacktrace from switch output and correct branch
+            copy_stack_trace(node.outputs + node.inputs[1], ret)
+            return [ret]
        # This case happens with scan.
        # Elemwise{switch}(le(shape_i{id}(X), 0), 0, shape_i{id}(X)) -> shape_i{id}(X)
@@ -3489,6 +3650,8 @@ def local_useless_switch(node):
           T.extract_constant(left) == 0 and \
           right is cond_var.owner.inputs[0]:
            assert right.type == node.outputs[0].type
+            # No need to copy over stacktrace, because the right input node
+            # already has its own stacktrace
            return [right]
        return False
    return False
@@ -3529,9 +3692,24 @@ def local_mul_switch_sink(node):
                if (get_scalar_constant_value(
                        switch.inputs[1], only_process_constants=True) == 0.):
                    listmul = node.inputs[:idx] + node.inputs[idx + 1:]
+                    fmul = T.mul(*(listmul + [switch.inputs[2]]))
+                    # Copy over stacktrace for elementwise multiplication op
+                    # from previous elementwise multiplication op.
+                    # An error in the multiplication (e.g. errors due to
+                    # inconsistent shapes), will point to the
+                    # multiplication op.
+                    copy_stack_trace(node.outputs, fmul)
                    fct = [T.switch(switch.inputs[0], 0,
-                                    T.mul(*(listmul + [switch.inputs[2]])))]
+                                    fmul)]
                    fct[0].values_eq_approx = values_eq_approx_remove_nan
+                    # Copy over stacktrace for switch op from both previous
+                    # elementwise multiplication op and previous switch op,
+                    # because an error in this part can be caused by either
+                    # of the two previous ops.
+                    copy_stack_trace(node.outputs + switch.outputs, fct)
                    return fct
            except NotScalarConstantError:
                pass
@@ -3539,9 +3717,23 @@ def local_mul_switch_sink(node):
                if (get_scalar_constant_value(
                        switch.inputs[2], only_process_constants=True) == 0.):
                    listmul = node.inputs[:idx] + node.inputs[idx + 1:]
+                    fmul = T.mul(*(listmul + [switch.inputs[1]]))
+                    # Copy over stacktrace for elementwise multiplication op
+                    # from previous elementwise multiplication op.
+                    # An error in the multiplication (e.g. errors due to
+                    # inconsistent shapes), will point to the
+                    # multiplication op.
+                    copy_stack_trace(node.outputs, fmul)
                    fct = [T.switch(switch.inputs[0],
-                                    T.mul(*(listmul + [switch.inputs[1]])), 0)]
+                                    fmul, 0)]
                    fct[0].values_eq_approx = values_eq_approx_remove_nan
+                    # Copy over stacktrace for switch op from both previous
+                    # elementwise multiplication op and previous switch op,
+                    # because an error in this part can be caused by either
+                    # of the two previous ops.
+                    copy_stack_trace(node.outputs + switch.outputs, fct)
                    return fct
            except NotScalarConstantError:
                pass
@@ -3569,17 +3761,45 @@ def local_div_switch_sink(node):
        switch = node.inputs[0].owner
        try:
            if get_scalar_constant_value(switch.inputs[1]) == 0.:
+                fdiv = op(switch.inputs[2], node.inputs[1])
+                # Copy over stacktrace for elementwise division op
+                # from previous elementwise division op.
+                # An error in the division (e.g. errors due to
+                # inconsistent shapes or division by zero),
+                # will point to the new division op.
+                copy_stack_trace(node.outputs, fdiv)
                fct = [T.switch(switch.inputs[0], 0,
-                                op(switch.inputs[2], node.inputs[1]))]
+                                fdiv)]
                fct[0].values_eq_approx = values_eq_approx_remove_nan
+                # Copy over stacktrace for switch op from both previous
+                # elementwise division op and previous switch op,
+                # because an error in this part can be caused by either
+                # of the two previous ops.
+                copy_stack_trace(node.outputs + switch.outputs, fct)
                return fct
        except NotScalarConstantError:
            pass
        try:
            if get_scalar_constant_value(switch.inputs[2]) == 0.:
+                fdiv = op(switch.inputs[1], node.inputs[1])
+                # Copy over stacktrace for elementwise division op
+                # from previous elementwise division op.
+                # An error in the division (e.g. errors due to
+                # inconsistent shapes or division by zero),
+                # will point to the new division op.
+                copy_stack_trace(node.outputs, fdiv)
                fct = [T.switch(switch.inputs[0],
-                                op(switch.inputs[1], node.inputs[1]), 0)]
+                                fdiv, 0)]
                fct[0].values_eq_approx = values_eq_approx_remove_nan
+                # Copy over stacktrace for switch op from both previous
+                # elementwise division op and previous switch op,
+                # because an error in this part can be caused by either
+                # of the two previous ops.
+                copy_stack_trace(node.outputs + switch.outputs, fct)
                return fct
        except NotScalarConstantError:
            pass
@@ -3606,6 +3826,8 @@ def local_useless_tile(node):
                try:
                    l = T.get_vector_length(node.inputs[1])
                    if l == node.inputs[0].ndim:
+                        # No need to copy over any stacktrace as previous
+                        # input variable already has a stacktrace
                        return [node.inputs[0]]
                    elif l < node.inputs[0].ndim:
                        # The Op don't support that case, so we can't
@@ -3618,7 +3840,11 @@ def local_useless_tile(node):
                        return
                        x_nd = node.inputs[0].ndim
                        broad = ['x'] * (l - x_nd) + xrange(x_nd)
-                        return [node.inputs[0].dimshuffle(broad)]
+                        ret = node.inputs[0].dimshuffle(broad)
+                        # Copy over stacktrace from previous output node,
+                        # and from node before tiling operation.
+                        copy_stack_trace(node.outputs + node.inputs[0], ret)
+                        return [ret]
                except ValueError:
                    return
        except NotScalarConstantError:
@@ -3642,6 +3868,9 @@ def local_useless_split(node):
            x, axis, splits = node.inputs
            out = assert_op(x, T.eq(splits.shape[0], 1))
            out = assert_op(out, T.eq(x.shape[axis], splits[0]))
+            # Copy over stacktrace from previous output node.
+            copy_stack_trace(node.outputs, out)
            return [out]

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -108,6 +108,9 @@ class test_dimshuffle_lift(unittest.TestCase):
        self.assertTrue(str(g) == "[DimShuffle{1,0}(DimShuffle{1,0}(x))]")
        dimshuffle_lift.optimize(g)
        self.assertTrue(str(g) == "[x]")
+        # Check stacktrace was copied over correctly after opt was applied
+        self.assertTrue(hasattr(g.outputs[0].tag, 'trace'))
    def test_merge2(self):
        x, y, z = inputs()
@@ -118,6 +121,8 @@ class test_dimshuffle_lift(unittest.TestCase):
                str(g))
        dimshuffle_lift.optimize(g)
        self.assertTrue(str(g) == "[DimShuffle{0,1,x,x}(x)]", str(g))
+        # Check stacktrace was copied over correctly after opt was applied
+        self.assertTrue(hasattr(g.outputs[0].tag, 'trace'))
    def test_elim3(self):
        x, y, z = inputs()
@@ -129,6 +134,8 @@ class test_dimshuffle_lift(unittest.TestCase):
                str(g))
        dimshuffle_lift.optimize(g)
        self.assertTrue(str(g) == "[x]", str(g))
+        # Check stacktrace was copied over correctly after opt was applied
+        self.assertTrue(hasattr(g.outputs[0].tag, 'trace'))
    def test_lift(self):
        x, y, z = inputs([False] * 1, [False] * 2, [False] * 3)
@@ -155,6 +162,9 @@ class test_dimshuffle_lift(unittest.TestCase):
        dimshuffle_lift.optimize(g)
        self.assertTrue(str(g) in (opt_str_g_inplace, opt_str_g_noinplace),
                        str(g))
+        # Check stacktrace was copied over correctly after opt was applied
+        self.assertTrue(hasattr(g.outputs[0].tag, 'trace'))
    def test_recursive_lift(self):
        v = T.vector(dtype="float64")
@@ -169,6 +179,7 @@ class test_dimshuffle_lift(unittest.TestCase):
                      "(<TensorType(float64, matrix)>, "
                      "DimShuffle{x,x}(TensorConstant{84}))))]")
        self.assertTrue(str(g) == init_str_g)
        new_out = local_dimshuffle_lift.transform(g.outputs[0].owner)[0]
        new_g = FunctionGraph(g.inputs, [new_out])
        opt_str_g = ("[Elemwise{mul,no_inplace}(Elemwise{add,no_inplace}"
@@ -178,6 +189,8 @@ class test_dimshuffle_lift(unittest.TestCase):
                     "(<TensorType(float64, matrix)>), "
                     "DimShuffle{x,x}(TensorConstant{84})))]")
        self.assertTrue(str(new_g) == opt_str_g)
+        # Check stacktrace was copied over correctly after opt was applied
+        self.assertTrue(hasattr(new_g.outputs[0].tag, 'trace'))
 def test_add_canonizer_problem0():
@@ -1609,6 +1622,11 @@ def test_local_useless_slice():
    subtens = apply_node.op
    assert not any(isinstance(idx, slice) for idx in subtens.idx_list), "Slice should be gone"
+    # Now test that the stack trace is copied over properly,
+    # both before and after optimization.
+    assert hasattr(f_unopt.outputs[0].variable.tag, 'trace')
+    assert hasattr(f_opt.outputs[0].variable.tag, 'trace')
    # test a 4d tensor
    z = tensor.tensor4('z')
    o2 = z[1, :, :, 1]
@@ -1625,6 +1643,10 @@ def test_local_useless_slice():
    subtens = apply_node.op
    assert not any(isinstance(idx, slice) for idx in subtens.idx_list)
+    # Finally, test that the stack trace is copied over properly,
+    # before and after optimization.
+    assert hasattr(f_opt_check.outputs[0].variable.tag, 'trace')
+    assert hasattr(f_opt_check_apply.outputs[0].variable.tag, 'trace')
 def test_local_useless_inc_subtensor():
    x = tensor.matrix('x')
@@ -1835,13 +1857,40 @@ class test_local_subtensor_make_vector(unittest.TestCase):
        r = f(0, 1, 2)
        assert r[0] == 0 and r[1] == 2
+    def test_stacktrace(self):
+        """Check that the output keeps `tag.trace` after the optimization."""
+        x, y, z = tensor.lscalars('xyz')
+        vec = make_vector(x, y, z)
+        opt_name = "local_subtensor_make_vector"
+        mode_module = theano.compile.mode
+        # First pass: only the optimization under test, which requires us to
+        # add the 'canonicalize' phase.
+        only_opt = mode_module.Mode(optimizer=None).including(
+            'canonicalize_db').including(opt_name)
+        compiled = function([x, y, z], vec[0], mode=only_opt)
+        self.assertTrue(hasattr(compiled.outputs[0].variable.tag, 'trace'))
+        # Second pass: every FAST_COMPILE optimization plus the one under test.
+        fast_compile = mode_module.get_mode('FAST_COMPILE').including(opt_name)
+        compiled = function([x, y, z], vec[0], mode=fast_compile)
+        self.assertTrue(hasattr(compiled.outputs[0].variable.tag, 'trace'))
 class test_local_subtensor_lift(unittest.TestCase):
+    def _verify_stack_trace(self, f):
+        """Assert that each output variable of `f` has a `tag.trace`."""
+        for out in f.outputs:
+            self.assertTrue(hasattr(out.variable.tag, 'trace'))
    def test0(self):
        # basic test that the Op works
        x = tensor.matrix('x')
        f = function([x], tensor.exp(x)[0], mode=mode_opt)
+        self._verify_stack_trace(f)
        prog = f.maker.fgraph.toposort()
        assert isinstance(prog[0].op, tensor.Subtensor)  # first subtensor
        assert prog[1].op == tensor.exp
@@ -1854,6 +1903,8 @@ class test_local_subtensor_lift(unittest.TestCase):
        x = tensor.matrix('x')
        f = function([x], [tensor.exp(x)[0], tensor.exp(x)], mode=mode_opt)
+        self._verify_stack_trace(f)
        prog = f.maker.fgraph.toposort()
        assert prog[0].op == tensor.exp
        assert isinstance(prog[1].op, tensor.Subtensor)  # first subtensor
@@ -1868,6 +1919,8 @@ class test_local_subtensor_lift(unittest.TestCase):
        z = tensor.matrix('z')
        f = function([x, y, z], tensor.exp(x + y + z)[0], mode=mode_opt)
+        self._verify_stack_trace(f)
        prog = f.maker.fgraph.toposort()
        assert isinstance(prog[0].op, tensor.DimShuffle)
        assert isinstance(prog[1].op, tensor.Subtensor)  # first subtensor
@@ -1885,6 +1938,8 @@ class test_local_subtensor_lift(unittest.TestCase):
        z = tensor.matrix('z')
        f = function([x, y, z], tensor.exp(x + y + z)[0:2], mode=mode_opt)
+        self._verify_stack_trace(f)
        prog = f.maker.fgraph.toposort()
        assert isinstance(prog[0].op, tensor.DimShuffle)
        assert isinstance(prog[1].op, tensor.Subtensor)  # first subtensor
@@ -1901,6 +1956,8 @@ class test_local_subtensor_lift(unittest.TestCase):
        y = tensor.vector('y')
        f = function([y], tensor.exp(y.dimshuffle(0, 'x'))[0], mode=mode_opt)
+        self._verify_stack_trace(f)
        prog = f.maker.fgraph.toposort()
        assert isinstance(prog[0].op, tensor.DimShuffle)
        assert isinstance(prog[1].op, tensor.Subtensor)
@@ -1916,6 +1973,8 @@ class test_local_subtensor_lift(unittest.TestCase):
        y = tensor.vector('y')
        f = function([x, y], tensor.exp(x + y)[0], mode=mode_opt)
+        self._verify_stack_trace(f)
        prog = f.maker.fgraph.toposort()
        assert isinstance(prog[0].op, tensor.DimShuffle)
        assert prog[1].op == tensor.add
@@ -1932,6 +1991,8 @@ class test_local_subtensor_lift(unittest.TestCase):
        f = function([x, y], [tensor.exp(x + y)[0], tensor.exp(x + y) + x],
                     mode=mode_opt)
+        self._verify_stack_trace(f)
        prog = f.maker.fgraph.toposort()
        assert isinstance(prog[0].op, tensor.DimShuffle)
        assert isinstance(prog[1].op.scalar_op, theano.scalar.
@@ -1950,6 +2011,8 @@ class test_local_subtensor_lift(unittest.TestCase):
        y = tensor.scalar('y')
        f = function([x, y], tensor.exp(x + y)[0], mode=mode_opt)
+        self._verify_stack_trace(f)
        prog = f.maker.fgraph.toposort()
        assert isinstance(prog[0].op, tensor.Subtensor)
        # Composite{add,exp}
@@ -1969,6 +2032,7 @@ class test_local_subtensor_lift(unittest.TestCase):
        assert newx.broadcastable == (True, False)
        f1 = function([x], newx[:2, :5], mode=mode_opt)
+        self._verify_stack_trace(f1)
        prog = f1.maker.fgraph.toposort()
        assert isinstance(prog[0].op, tensor.Subtensor)
        assert isinstance(prog[1].op, tensor.Rebroadcast)
@@ -1982,6 +2046,7 @@ class test_local_subtensor_lift(unittest.TestCase):
        assert newy.broadcastable == (True, False, True, False)
        f2 = function([y], newy[:, 3, 0, :], mode=mode_opt)
+        self._verify_stack_trace(f2)
        prog = f2.maker.fgraph.toposort()
        assert isinstance(prog[0].op, tensor.Subtensor)
        assert isinstance(prog[1].op, tensor.Rebroadcast)
@@ -1989,6 +2054,7 @@ class test_local_subtensor_lift(unittest.TestCase):
        # corner case 2: subtensor idx_list is shorter than resulting broadcast pattern
        f3 = function([y], newy[:, 3, 0], mode=mode_opt)
+        self._verify_stack_trace(f3)
        prog = f3.maker.fgraph.toposort()
        assert isinstance(prog[0].op, tensor.Subtensor)
        assert isinstance(prog[1].op, tensor.Rebroadcast)
@@ -2003,6 +2069,7 @@ class test_local_subtensor_lift(unittest.TestCase):
        out = newz[:, 3, 0]
        f4 = function([z], newz[:, 3, 0], mode=mode_opt)
+        self._verify_stack_trace(f4)
        prog = f4.maker.fgraph.toposort()
        assert isinstance(prog[0].op, tensor.Subtensor)
        assert isinstance(prog[1].op, tensor.Rebroadcast)
@@ -2010,6 +2077,11 @@ class test_local_subtensor_lift(unittest.TestCase):
 class test_local_subtensor_merge(unittest.TestCase):
+    def _verify_stack_trace(self, f):
+        """Assert that each output variable of `f` has a `tag.trace`."""
+        for out in f.outputs:
+            self.assertTrue(hasattr(out.variable.tag, 'trace'))
    def setUp(self):
        utt.seed_rng()
        self.x_shapes = [(2, 2), (5, 3), (4, 1), (1, 2),
@@ -2024,6 +2096,8 @@ class test_local_subtensor_merge(unittest.TestCase):
            g = function([x], x[idx::][-1], mode=mode_opt.excluding(
                'local_subtensor_merge'))
+            self._verify_stack_trace(f)
            topo = f.maker.fgraph.toposort()
            assert len([t for t in topo
                        if isinstance(t.op, tensor.Subtensor)]) == 1
@@ -2050,6 +2124,8 @@ class test_local_subtensor_merge(unittest.TestCase):
                     mode=mode_opt.excluding('local_subtensor_merge'))
        #theano.printing.debugprint(f, print_type=True)
+        self._verify_stack_trace(f)
        topo = f.maker.fgraph.toposort()
        # print [t for t in topo if isinstance(t.op, tensor.Subtensor)]
        assert len([t for t in topo
@@ -2077,6 +2153,8 @@ class test_local_subtensor_merge(unittest.TestCase):
            g = function([x], x[::-1][idx],
                         mode=mode_opt.excluding('local_subtensor_merge'))
+            self._verify_stack_trace(f)
            #theano.printing.debugprint(f, print_type=True)
            topo = f.maker.fgraph.toposort()
            # print [t for t in topo if isinstance(t.op, tensor.Subtensor)]
@@ -2105,6 +2183,8 @@ class test_local_subtensor_merge(unittest.TestCase):
                     mode=mode_opt.excluding('local_subtensor_merge'))
        #theano.printing.debugprint(f, print_type=True)
+        self._verify_stack_trace(f)
        topo = f.maker.fgraph.toposort()
        # print [t for t in topo if isinstance(t.op, tensor.Subtensor)]
        assert len([t for t in topo
@@ -2127,6 +2207,8 @@ class test_local_subtensor_merge(unittest.TestCase):
        for idx in xrange(-9, 8):
            f = function([x], x[::-1][:idx], mode=mode_opt)
+            self._verify_stack_trace(f)
            #theano.printing.debugprint(f, print_type=True)
            topo = f.maker.fgraph.toposort()
            # print [t for t in topo if isinstance(t.op, tensor.Subtensor)]
@@ -2144,6 +2226,9 @@ class test_local_subtensor_merge(unittest.TestCase):
        x = tensor.matrix('x')
        y = tensor.iscalar('y')
        f = function([x, y], x[::-1][:y], mode=mode_opt)
+        self._verify_stack_trace(f)
        #theano.printing.debugprint(f, print_type=True)
        topo = f.maker.fgraph.toposort()
@@ -2165,6 +2250,8 @@ class test_local_subtensor_merge(unittest.TestCase):
            for idx2 in xrange(-7, 7):
                f = function([x], x[idx1:][:idx2], mode=mode_opt)
+                self._verify_stack_trace(f)
                #theano.printing.debugprint(f, print_type=True)
                topo = f.maker.fgraph.toposort()
                # print [t for t in topo if isinstance(t.op, tensor.Subtensor)]
@@ -2183,6 +2270,9 @@ class test_local_subtensor_merge(unittest.TestCase):
        y = tensor.iscalar('y')
        z = tensor.iscalar('y')
        f = function([x, y, z], x[y:][:z], mode=mode_opt)
+        self._verify_stack_trace(f)
        #theano.printing.debugprint(f, print_type=True)
        topo = f.maker.fgraph.toposort()
@@ -2212,6 +2302,9 @@ class test_local_subtensor_merge(unittest.TestCase):
            z = x[slice(*sl1)][slice(*sl2)]
            f = function([x], z, mode=mode_opt)
+            self._verify_stack_trace(f)
            x_val = self.rng.uniform(size=shape).astype(config.floatX)
            f(x_val)
@@ -2227,6 +2320,9 @@ class test_local_subtensor_merge(unittest.TestCase):
        s2 = tensor.iscalar('s2')
        f = function([x, b1, e1, s1, b2, e2, s2], x[b1:e1:s1][b2:e2:s2],
                     mode=mode_opt)
+        self._verify_stack_trace(f)
        #theano.printing.debugprint(f, print_type=True)
        topo = f.maker.fgraph.toposort()
@@ -2265,6 +2361,7 @@ class test_local_subtensor_merge(unittest.TestCase):
        t = theano.shared(numpy.int64(0))
        fun = theano.function([x], y[t])
        val = fun(data)
        assert val == data[7:1:-1][0]
@@ -2310,6 +2407,9 @@ class test_local_subtensor_merge(unittest.TestCase):
        s = tensor.iscalar('s')
        i = tensor.iscalar('i')
        f = function([x, b, e, s, i], x[b:e:s][i], mode=mode_opt)
+        self._verify_stack_trace(f)
        #theano.printing.debugprint(f, print_type=True)
        topo = f.maker.fgraph.toposort()
@@ -2401,6 +2501,9 @@ class test_local_subtensor_merge(unittest.TestCase):
            sub_x = x[slice1][slice2]
            f = theano.function([x] + input_vars, sub_x, mode=mode_opt)
+            self._verify_stack_trace(f)
            topo = f.maker.fgraph.toposort()
            # print [t for t in topo if isinstance(t.op, tensor.Subtensor)]
            assert len([t for t in topo if isinstance(t.op,
@@ -2458,6 +2561,8 @@ class test_local_subtensor_merge(unittest.TestCase):
            sub_x = x[symbol_slice][i]
            f = theano.function([x] + input_vars, sub_x, mode=mode_opt)
+            self._verify_stack_trace(f)
            topo = f.maker.fgraph.toposort()
            # print [t for t in topo if isinstance(t.op, tensor.Subtensor)]
            assert len([t for t in topo if isinstance(t.op,
@@ -2571,6 +2676,32 @@ class test_local_adv_sub1_adv_inc_sub1(unittest.TestCase):
            self.assertRaises((AssertionError, ValueError),
                              f, dx, dy, [1])
+    def test_stacktrace(self):
+        """Check the output keeps `tag.trace` after local_adv_sub1_adv_inc_sub1."""
+        x = tensor.matrix("x")
+        y = tensor.matrix("y")
+        idx = tensor.ivector()
+        # set_subtensor, then read back through the same index
+        inc = tensor.set_subtensor(x[idx], y)
+        o = inc[idx]
+        # Compile function using only the 'local_adv_sub1_adv_inc_sub1'
+        # optimization, which requires us to add the 'canonicalize' phase.
+        mode = theano.compile.mode.Mode(optimizer=None).including(
+            'canonicalize_db').including("local_adv_sub1_adv_inc_sub1")
+        f = theano.function([x, y, idx], o, mode=mode)
+        # Check stacktrace was copied over correctly after opt was applied
+        self.assertTrue(hasattr(f.outputs[0].variable.tag, 'trace'))
+        # Compile function using all optimizations in fast_compile mode,
+        # including the 'local_adv_sub1_adv_inc_sub1' optimization
+        mode = theano.compile.mode.get_mode('FAST_COMPILE').including(
+            "local_adv_sub1_adv_inc_sub1")
+        f = theano.function([x, y, idx], o, mode=mode)
+        # Check stacktrace was copied over correctly after opt was applied
+        self.assertTrue(hasattr(f.outputs[0].variable.tag, 'trace'))
 class Test_alloc_zero(unittest.TestCase):
    def setUp(self):
@@ -2771,7 +2902,11 @@ def test_local_IncSubtensor_serialize():
                                    tensor.AdvancedIncSubtensor1))
                        for inp in a.inputs])
+    # Now test that the stack trace is copied over properly,
+    # if we return the gradients. We need to use same mode as before.
+    f = theano.function([i, j, t], dW, mode=mode)
+    assert hasattr(f.outputs[0].variable.tag, 'trace')
 def test_local_set_to_inc_subtensor():
    v = theano.tensor.fmatrix()
    s = v[[2, 1]]
@@ -2800,7 +2935,12 @@ def test_local_set_to_inc_subtensor():
    utt.assert_allclose(r1, r2)
+    # Finally, test that the stack trace is copied over properly,
+    # before and after optimization.
+    assert hasattr(f1.outputs[0].variable.tag, 'trace')
+    assert hasattr(f2.outputs[0].variable.tag, 'trace')
 def test_local_subtensor_of_dot():
    m1 = theano.tensor.matrix()
    m2 = theano.tensor.matrix()
@@ -2832,10 +2972,16 @@ def test_local_subtensor_of_dot():
    f = theano.function([m1, m2, idx], theano.dot(m1, m2)[idx, 1:4, :, idx:], mode=mode)
    assert test_equality(f(d1, d2, 1), numpy.dot(d1, d2)[1, 1:4, :, 1:])
+    # Check that the stack trace is copied over properly.
+    assert hasattr(f.outputs[0].variable.tag, 'trace')
    f = theano.function([m1, m2, idx], theano.dot(m1, m2)[1:4, :, idx:, idx], mode=mode)
    assert test_equality(f(d1, d2, 1), numpy.dot(d1, d2)[1:4, :, 1:, 1])
+    # Now test that the stack trace is copied over properly
+    # for the second compiled function as well.
+    assert hasattr(f.outputs[0].variable.tag, 'trace')
 class Test_local_elemwise_alloc(unittest.TestCase):
    dtype = config.floatX
@@ -2881,6 +3027,11 @@ class Test_local_elemwise_alloc(unittest.TestCase):
                 if elem.op is not None]) == count
        )
+    def _verify_stack_trace(self, f):
+        """Assert that each output variable of `f` has a `tag.trace`."""
+        for out in f.outputs:
+            self.assertTrue(hasattr(out.variable.tag, 'trace'))
    def test_remove_alloc_wo_dimshuffle(self):
        # No optimization on alloc
        func = function(
@@ -2890,6 +3041,7 @@ class Test_local_elemwise_alloc(unittest.TestCase):
        )
        self._verify_alloc_count(func, 1)
        self._verify_assert_count(func, 0)
+        self._verify_stack_trace(func)
        # Optimization on alloc with assert
        func = function(
@@ -3332,6 +3484,11 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
 class Test_local_useless_alloc(unittest.TestCase):
+    def _verify_stack_trace(self, f):
+        """Assert that each output variable of `f` has a `tag.trace`."""
+        for out in f.outputs:
+            self.assertTrue(hasattr(out.variable.tag, 'trace'))
    def setUp(self):
        self.rng = numpy.random.RandomState(utt.fetch_seed())
@@ -3352,6 +3509,8 @@ class Test_local_useless_alloc(unittest.TestCase):
        if isinstance(mode_opt, compile.DebugMode):
            self.assertRaises(ValueError, f)
+        self._verify_stack_trace(f)
    def test1(self):
        # Test that alloc never gets instantiated during optimization
        mode = mode_opt.excluding('local_useless_alloc')
@@ -3365,6 +3524,8 @@ class Test_local_useless_alloc(unittest.TestCase):
        op_classes = [node.op.__class__ for node in f.maker.fgraph.toposort()]
        assert tensor.Alloc not in op_classes
+        self._verify_stack_trace(f)
    def test2(self):
        # Test that alloc never gets instantiated during optimization
        mode = mode_opt.excluding('local_useless_alloc')
@@ -3383,10 +3544,17 @@ class Test_local_useless_alloc(unittest.TestCase):
        # in op_classes and we have to change the assert.
        assert tensor.Alloc in op_classes
+        self._verify_stack_trace(f)
 class Test_local_useless_inc_subtensor_alloc(unittest.TestCase):
    opt_name = 'local_useless_inc_subtensor_alloc'
+    def _verify_stack_trace(self, f):
+        """Assert that each output variable of `f` has a `tag.trace`."""
+        for out in f.outputs:
+            self.assertTrue(hasattr(out.variable.tag, 'trace'))
    def setUp(self):
        # The optimization requires the shape feature so we need to compile in
        # FAST_RUN mode.
@@ -3423,6 +3591,10 @@ class Test_local_useless_inc_subtensor_alloc(unittest.TestCase):
        r2 = f2(x_value, i_value, y_value)
        utt.assert_allclose(r1, r2)
+        self._verify_stack_trace(f1)
+        self._verify_stack_trace(f2)
    def test_advanced_inc_subtensor1(self):
        if tensor.inplace_increment is None:
@@ -3452,6 +3624,9 @@ class Test_local_useless_inc_subtensor_alloc(unittest.TestCase):
        r2 = f2(x_value, i_value, y_value)
        utt.assert_allclose(r1, r2)
+        self._verify_stack_trace(f1)
+        self._verify_stack_trace(f2)
    def test_incsubtensor(self):
        x = tensor.vector('x')
@@ -3478,6 +3653,9 @@ class Test_local_useless_inc_subtensor_alloc(unittest.TestCase):
        r2 = f2(x_value, i_value, y_value)
        utt.assert_allclose(r1, r2)
+        self._verify_stack_trace(f1)
+        self._verify_stack_trace(f2)
 class test_shapeoptimizer(unittest.TestCase):
@@ -3792,7 +3970,6 @@ class test_assert(utt.InferShapeTester):
        self._compile_and_check([admat, adscal, bdscal], [out],
                                [admat_val, adscal_val, bdscal_val], Assert)
 def test_local_mul_specialize():
    mode = theano.config.mode
    if mode == 'FAST_COMPILE':
@@ -3849,6 +4026,10 @@ class T_Tile(unittest.TestCase):
                assert len(topo) == 1
                assert isinstance(topo[0].op, compile.DeepCopyOp)
                f(data)
+                # Check that stacktrace is copied over
+                self.assertTrue(hasattr(f.outputs[0].variable.tag, 'trace'))
+                self.assertTrue(len(f.outputs[0].variable.tag.trace)>0)
 def speed_local_pow_specialize_range():
@@ -3987,6 +4168,8 @@ class T_Rebroadcast(unittest.TestCase):
        e = f.maker.fgraph.toposort()
        assert len([n for n in e if isinstance(n.op, T.Rebroadcast)]) == 0
+        assert hasattr(f.outputs[0].variable.tag, 'trace')
    def test_rebroadcast_rebroadcast(self):
        mode = theano.compile.get_default_mode().including('canonicalize')
        m = T.matrix()
@@ -5532,6 +5715,7 @@ def test_local_join_empty():
                for n in e if isinstance(n.op, Join)])
    assert f.maker.fgraph.outputs[0].dtype == config.floatX
    # test for matrix join(1,a)
    empty_mat = numpy.asarray([[]], dtype=config.floatX)
    m = tensor.matrix('m')
@@ -5544,7 +5728,6 @@ def test_local_join_empty():
    assert all([not isinstance(n.op, Join) or len(n.inputs) == 4
                for n in e if isinstance(n.op, Join)])
    assert f.maker.fgraph.outputs[0].dtype == config.floatX
    # test for vector, vector, empty to matrix
    # We can't optimize this case.
    s = tensor.stack([a, a, empty_vec])
@@ -5556,7 +5739,6 @@ def test_local_join_empty():
    assert all([not isinstance(n.op, Join) or len(n.inputs) == 4
                for n in e if isinstance(n.op, Join)])
    assert f.maker.fgraph.outputs[0].dtype == config.floatX
    # test for matrix join(0,a)
    # We can't optimize this case.
    s = join(0, m, numpy.asarray([[2.]], dtype=config.floatX), m)
@@ -5585,6 +5767,9 @@ def test_local_join_make_vector():
                for n in e if isinstance(n.op, Join)])
    assert f.maker.fgraph.outputs[0].dtype == config.floatX
+    assert hasattr(f.outputs[0].variable, 'tag')
+    assert hasattr(f.outputs[0].variable.tag, 'trace')
 def test_local_add_specialize():
    # test of non-zero dimension
@@ -5685,6 +5870,12 @@ def test_local_useless_split():
    assert len(graph_nonopt)==1
    assert isinstance(graph_nonopt[0].op, tensor.Split)
+    # Check that stacktraces have been copied over properly
+    assert hasattr(f_opt.outputs[0].variable.tag, 'trace')
+    assert len(f_opt.outputs[0].variable.tag.trace) > 0
+    assert hasattr(f_nonopt.outputs[0].variable.tag, 'trace')
+    assert len(f_nonopt.outputs[0].variable.tag.trace) > 0
 def test_local_flatten_lift():
    for i in xrange(1, 4):
@@ -5751,6 +5942,8 @@ class Test_lift_transpose_through_dot(unittest.TestCase):
        g = self.simple_optimize(FunctionGraph([a, b], [tensor.dot(a, b).T]))
        sg = '[dot(DimShuffle{1,0}(b), DimShuffle{1,0}(a))]'
        assert str(g) == sg, (str(g), sg)
+        # Check stacktrace was copied over correctly after opt was applied
+        self.assertTrue(hasattr(g.outputs[0].tag, 'trace'))
    def test_row_matrix(self):
        a = vector('a')
@@ -5761,6 +5954,8 @@ class Test_lift_transpose_through_dot(unittest.TestCase):
            level='stabilize')
        sg = '[dot(DimShuffle{1,0}(b), DimShuffle{0,x}(a))]'
        assert str(g) == sg, (str(g), sg)
+        # Check stacktrace was copied over correctly after opt was applied
+        self.assertTrue(hasattr(g.outputs[0].tag, 'trace'))
    def test_matrix_col(self):
        a = vector('a')
@@ -5771,6 +5966,8 @@ class Test_lift_transpose_through_dot(unittest.TestCase):
            level='stabilize')
        sg = '[dot(DimShuffle{x,0}(a), DimShuffle{1,0}(b))]'
        assert str(g) == sg, (str(g), sg)
+        # Check stacktrace was copied over correctly after opt was applied
+        self.assertTrue(hasattr(g.outputs[0].tag, 'trace'))
 def test_local_upcast_elemwise_constant_inputs():